class TestApplicator(unittest.TestCase):
    """Tests for EADDir's file listing, characterize_dir and apply_function_to_dir.

    Every test works against the fixture directory configured in ``setUp``.
    """

    def setUp(self):
        # NOTE(review): machine-specific absolute path; the tests only pass
        # where this directory holds the three fixture EADs listed in
        # test_ead_list.
        self.a = EADDir(input_dir=r'C:\Users\wboyle\PycharmProjects\bentley_code\utilities')

    def _method_for_testing_write(self, ead):
        """Set the text of every element matched by ``//text`` to "yo"."""
        for text in ead.tree.xpath("//text"):
            text.text = "yo"

    def _method_for_testing_characterize(self, ead):
        """Return the text of every element matched by ``//text``, as a list."""
        return [text.text for text in ead.tree.xpath("//text")]

    def test_ead_list(self):
        """ead_files lists exactly the three fixture files."""
        self.assertEqual(self.a.ead_files, ['ead_appended.xml', 'ead_messy.xml', 'ead_pretty.xml'])

    def test_characterize_directory(self):
        """characterize_dir returns one result list per fixture file."""
        intended_results = [['text'], ['text'], ['text']]
        self.assertEqual(self.a.characterize_dir(function=self._method_for_testing_characterize), intended_results)

    def test_apply_to_directory(self):
        """apply_function_to_dir writes the modified EADs into output_dir.

        This method used to be defined three times in the class body; only
        the last definition was ever bound, so a single copy is kept.
        """
        output_dir = os.path.join(self.a.input_dir, "output")
        try:
            self.a.apply_function_to_dir(function=self._method_for_testing_write, output_dir=output_dir)

            # Re-read the written files and check the write function's effect.
            b = EADDir(output_dir)
            intended_results = [['yo'], ['yo'], ['yo']]
            self.assertEqual(b.characterize_dir(self._method_for_testing_characterize), intended_results)
        finally:
            # Remove the generated files even when the test fails, so that
            # leftovers cannot influence a later run.
            if os.path.isdir(output_dir):
                for ead in os.listdir(output_dir):
                    os.remove(os.path.join(output_dir, ead))
class TestApplicator(unittest.TestCase):
    """Tests for EADDir's file listing, characterize_dir and apply_function_to_dir.

    Every test works against the fixture directory configured in ``setUp``.
    """

    def setUp(self):
        # NOTE(review): machine-specific absolute path; the tests only pass
        # where this directory holds the three fixture EADs listed in
        # test_ead_list.
        self.a = EADDir(
            input_dir=r'C:\Users\wboyle\PycharmProjects\bentley_code\utilities'
        )

    def _method_for_testing_write(self, ead):
        """Set the text of every element matched by ``//text`` to "yo"."""
        for text in ead.tree.xpath("//text"):
            text.text = "yo"

    def _method_for_testing_characterize(self, ead):
        """Return the text of every element matched by ``//text``, as a list."""
        return [text.text for text in ead.tree.xpath("//text")]

    def test_ead_list(self):
        """ead_files lists exactly the three fixture files."""
        self.assertEqual(
            self.a.ead_files,
            ['ead_appended.xml', 'ead_messy.xml', 'ead_pretty.xml'])

    def test_characterize_directory(self):
        """characterize_dir returns one result list per fixture file."""
        intended_results = [['text'], ['text'], ['text']]
        self.assertEqual(
            self.a.characterize_dir(
                function=self._method_for_testing_characterize),
            intended_results)

    def test_apply_to_directory(self):
        """apply_function_to_dir writes the modified EADs into output_dir."""
        output_dir = os.path.join(self.a.input_dir, "output")
        try:
            self.a.apply_function_to_dir(
                function=self._method_for_testing_write,
                output_dir=output_dir)

            # Re-read the written files and check the write function's effect.
            b = EADDir(output_dir)
            intended_results = [['yo'], ['yo'], ['yo']]
            self.assertEqual(
                b.characterize_dir(self._method_for_testing_characterize),
                intended_results)
        finally:
            # Remove the generated files even when the test fails, so that
            # leftovers cannot influence a later run.
            if os.path.isdir(output_dir):
                for ead in os.listdir(output_dir):
                    os.remove(os.path.join(output_dir, ead))
示例#5
0
def get_all_agents(input_dir):
    """
    Collect the controlaccess agent terms found in a directory of EADs.

    Each file listed by ``EADDir(input_dir).ead_files`` is parsed and passed
    to ``get_agents_from_ead``; the per-file results are merged into one
    dictionary per agent type. When the same term appears in several files,
    the entry from the file processed last wins.

    :param input_dir: filepath to the input directory
    :return: a dictionary in the form {"corpname": {"Apple Computer": [authid, naming_source], etc.},
                                       "persname": {"Jane Doe (1900-1911)": [authid, naming_source], etc.},
                                       "famname": {"Adams family": [authid, naming_source], etc.}}
    """
    merged = {agent_type: {} for agent_type in ("corpname", "persname", "famname")}

    ead_dir = EADDir(input_dir=input_dir)
    progress = tqdm(ead_dir.ead_files, desc="grabbing all agents from eads")

    for filename in progress:
        ead_path = os.path.join(ead_dir.input_dir, filename)
        agents_in_file = get_agents_from_ead(etree.parse(ead_path))

        # Fold this file's terms into the running totals, type by type.
        for agent_type, terms in agents_in_file.items():
            merged[agent_type].update(terms)

    return merged
            for text in texts:
                if text[0] == tag.tag and text[1] == tag.text and tag.get("authfilenumber"):
                    index = dct["normalize to"]

                    # deleting authfilenumbers if the normalization index is "x"
                    if index == "x":
                        if tag.text == "University of Michigan.":
                            continue

                        del tag.attrib["authfilenumber"]
                        continue

                    # normalizing terms
                    normal_form = texts[int(index) - 1]
                    normal_tag = normal_form[0]
                    normal_text = normal_form[1]

                    tag.tag = normal_tag
                    tag.text = normal_text




if __name__ == "__main__":
    # Script entry point: hand replace_subjects to
    # EADDir.apply_function_to_dir for the directory below.
    # NOTE(review): machine-specific absolute path; adjust before running
    # on another machine.
    directory = r'C:\Users\wboyle\PycharmProjects\without-reservations\Real_Masters_all'
    ead_dir = EADDir(input_dir=directory)
    # `data` is not referenced again in this block; presumably
    # replace_subjects reads it as a module-level global -- TODO confirm.
    data = load_data("normalization data.csv")

    # output_dir is the same path as the input directory.
    # NOTE(review): this presumably overwrites the source EADs in place -- confirm.
    ead_dir.apply_function_to_dir(replace_subjects, output_dir=directory)
import csv

from lxml import etree

from utilities import EADDir

def find_multiple_physdescs(ead):
    """Report every <physdesc> in *ead* that has more than one <extent> child.

    :param ead: object exposing ``tree`` (supporting ``xpath`` and
                ``getpath``) and ``filename``
    :return: a list of rows, one per offending <physdesc>, in the form
             [filename, path of the physdesc, number of extents, serialized physdesc]
    """
    rows = []
    for physdesc in ead.tree.xpath("//physdesc"):
        extent_count = len(physdesc.xpath("extent"))
        if extent_count <= 1:
            continue

        rows.append([
            ead.filename,
            ead.tree.getpath(physdesc),
            extent_count,
            # Serialize, re-parse and serialize again, exactly as before.
            etree.tostring(etree.fromstring(etree.tostring(physdesc))),
        ])

    return rows

if __name__ == "__main__":
    # Script entry point: characterize every EAD in the directory below and
    # dump the rows produced by find_multiple_physdescs into a CSV file.
    input_dir = r'C:\Users\wboyle\PycharmProjects\vandura\Real_Masters_all'
    e = EADDir(input_dir)
    results = e.characterize_dir(find_multiple_physdescs)

    with open("eads_with_multiple_extents.csv", mode="wb") as f:
        writer = csv.writer(f)
        # Each entry of `results` is itself a list of rows; flatten them
        # lazily so they are written in the same order as before.
        writer.writerows(row for result in results for row in result)
 def setUp(self):
     """Create the EADDir under test and store it on ``self.a``."""
     # NOTE(review): machine-specific absolute path.
     fixture_dir = r'C:\Users\wboyle\PycharmProjects\bentley_code\utilities'
     self.a = EADDir(input_dir=fixture_dir)
 def setUp(self):
     """Build a fresh EADDir for each test and keep it as ``self.a``."""
     # NOTE(review): machine-specific absolute path.
     utilities_path = r'C:\Users\wboyle\PycharmProjects\bentley_code\utilities'
     self.a = EADDir(input_dir=utilities_path)