class TestApplicator(unittest.TestCase):
    """Tests for EADDir's file listing, characterize_dir and apply_function_to_dir.

    The tests read fixture EADs from the directory hard-coded in ``setUp``.
    """

    def setUp(self):
        """Point a fresh EADDir at the fixture directory before every test."""
        # NOTE(review): machine-specific absolute path; the tests only run
        # where this directory exists with the expected fixture files.
        self.a = EADDir(input_dir=r'C:\Users\wboyle\PycharmProjects\bentley_code\utilities')

    def _method_for_testing_write(self, ead):
        """Set the text of every ``<text>`` element in ``ead.tree`` to "yo"."""
        for text in ead.tree.xpath("//text"):
            text.text = "yo"

    def _method_for_testing_characterize(self, ead):
        """Return the text of every ``<text>`` element in ``ead.tree``, as a list."""
        return [text.text for text in ead.tree.xpath("//text")]

    def test_ead_list(self):
        """The fixture directory should list exactly the three sample EADs."""
        self.assertEqual(
            self.a.ead_files,
            ['ead_appended.xml', 'ead_messy.xml', 'ead_pretty.xml'])

    def test_characterize_directory(self):
        """characterize_dir should yield one result list per fixture EAD."""
        intended_results = [['text'], ['text'], ['text']]
        self.assertEqual(
            self.a.characterize_dir(
                function=self._method_for_testing_characterize),
            intended_results)

    def test_apply_to_directory(self):
        """apply_function_to_dir should write modified copies to output_dir."""
        output_dir = os.path.join(self.a.input_dir, "output")
        try:
            self.a.apply_function_to_dir(
                function=self._method_for_testing_write,
                output_dir=output_dir)
            b = EADDir(output_dir)
            intended_results = [['yo'], ['yo'], ['yo']]
            self.assertEqual(
                b.characterize_dir(self._method_for_testing_characterize),
                intended_results)
        finally:
            # Remove the generated files even when the assertion fails, so a
            # failed run does not leave output behind. The isdir guard keeps
            # the cleanup from masking an earlier error if the directory was
            # never created.
            if os.path.isdir(output_dir):
                for ead in os.listdir(output_dir):
                    os.remove(os.path.join(output_dir, ead))
def test_apply_to_directory(self):
    """apply_function_to_dir should write modified copies to output_dir."""
    output_dir = os.path.join(self.a.input_dir, "output")
    try:
        self.a.apply_function_to_dir(
            function=self._method_for_testing_write,
            output_dir=output_dir)
        b = EADDir(output_dir)
        intended_results = [['yo'], ['yo'], ['yo']]
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(
            b.characterize_dir(self._method_for_testing_characterize),
            intended_results)
    finally:
        # Remove the generated files even when the assertion fails, so a
        # failed run does not leave output behind. The isdir guard keeps the
        # cleanup from masking an earlier error if the directory was never
        # created.
        if os.path.isdir(output_dir):
            for ead in os.listdir(output_dir):
                os.remove(os.path.join(output_dir, ead))
def test_apply_to_directory(self):
    """Check that apply_function_to_dir writes rewritten EADs to output_dir."""
    output_dir = os.path.join(self.a.input_dir, "output")
    try:
        self.a.apply_function_to_dir(
            function=self._method_for_testing_write,
            output_dir=output_dir)
        b = EADDir(output_dir)
        intended_results = [['yo'], ['yo'], ['yo']]
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # assertEqual is the supported spelling.
        self.assertEqual(
            b.characterize_dir(self._method_for_testing_characterize),
            intended_results)
    finally:
        # Clean up in ``finally`` so the generated files are removed whether
        # or not the assertion passed; skip when the directory was never
        # created so the cleanup cannot hide the real failure.
        if os.path.isdir(output_dir):
            for ead in os.listdir(output_dir):
                os.remove(os.path.join(output_dir, ead))
class TestApplicator(unittest.TestCase):
    """Tests for EADDir's file listing, characterize_dir and apply_function_to_dir.

    The tests read fixture EADs from the directory hard-coded in ``setUp``.
    """

    def setUp(self):
        """Point a fresh EADDir at the fixture directory before every test."""
        # NOTE(review): machine-specific absolute path; the tests only run
        # where this directory exists with the expected fixture files.
        self.a = EADDir(
            input_dir=r'C:\Users\wboyle\PycharmProjects\bentley_code\utilities'
        )

    def _method_for_testing_write(self, ead):
        """Set the text of every ``<text>`` element in ``ead.tree`` to "yo"."""
        for text in ead.tree.xpath("//text"):
            text.text = "yo"

    def _method_for_testing_characterize(self, ead):
        """Return the text of every ``<text>`` element in ``ead.tree``, as a list."""
        return [text.text for text in ead.tree.xpath("//text")]

    def test_ead_list(self):
        """The fixture directory should list exactly the three sample EADs."""
        self.assertEqual(
            self.a.ead_files,
            ['ead_appended.xml', 'ead_messy.xml', 'ead_pretty.xml'])

    def test_characterize_directory(self):
        """characterize_dir should yield one result list per fixture EAD."""
        intended_results = [['text'], ['text'], ['text']]
        self.assertEqual(
            self.a.characterize_dir(
                function=self._method_for_testing_characterize),
            intended_results)

    def test_apply_to_directory(self):
        """apply_function_to_dir should write modified copies to output_dir."""
        output_dir = os.path.join(self.a.input_dir, "output")
        try:
            self.a.apply_function_to_dir(
                function=self._method_for_testing_write,
                output_dir=output_dir)
            b = EADDir(output_dir)
            intended_results = [['yo'], ['yo'], ['yo']]
            self.assertEqual(
                b.characterize_dir(self._method_for_testing_characterize),
                intended_results)
        finally:
            # Remove the generated files even when the assertion fails, so a
            # failed run does not leave output behind. The isdir guard keeps
            # the cleanup from masking an earlier error if the directory was
            # never created.
            if os.path.isdir(output_dir):
                for ead in os.listdir(output_dir):
                    os.remove(os.path.join(output_dir, ead))
def get_all_agents(input_dir):
    """
    Collect the agent controlaccess terms from every EAD in a directory.

    Each EAD file is parsed and handed to ``get_agents_from_ead``; the
    per-file results are merged into one dictionary keyed by agent type.

    :param input_dir: filepath to the input directory
    :return: a dictionary in the form
        {"corpname": {"Apple Computer": [authid, naming_source], etc.},
         "persname": {"Jane Doe (1900-1911)": [authid, naming_source], etc.},
         "famname": {"Adams family": [authid, naming_source], etc.}}
    """
    agents = {
        agent_type: {}
        for agent_type in ("corpname", "persname", "famname")
    }

    ead_dir = EADDir(input_dir=input_dir)
    progress = tqdm(ead_dir.ead_files, desc="grabbing all agents from eads")
    for filename in progress:
        ead_path = os.path.join(ead_dir.input_dir, filename)
        agents_in_ead = get_agents_from_ead(etree.parse(ead_path))
        # Later files overwrite earlier entries that share the same term.
        for agent_type, terms in agents_in_ead.items():
            agents[agent_type].update(terms)

    return agents
# NOTE(review): this loop is the tail of a function whose header is outside
# this view; `tag`, `dct` and `texts` are bound by code that is not visible
# here (presumably the body of replace_subjects -- confirm).
for text in texts:
    # Only act on a tag whose name and text match this (tag name, text) pair
    # and that carries an "authfilenumber" attribute.
    if text[0] == tag.tag and text[1] == tag.text and tag.get("authfilenumber"):
        index = dct["normalize to"]
        # deleting authfilenumbers if the normalization index is "x"
        if index == "x":
            # "University of Michigan." keeps its authfilenumber.
            if tag.text == "University of Michigan.":
                continue
            del tag.attrib["authfilenumber"]
            continue
        # normalizing terms
        # `index` is treated as a 1-based position into `texts`.
        normal_form = texts[int(index) - 1]
        normal_tag = normal_form[0]
        normal_text = normal_form[1]
        # Rewrite the tag in place with the normalized element name and text.
        tag.tag = normal_tag
        tag.text = normal_text

if __name__ == "__main__":
    directory = r'C:\Users\wboyle\PycharmProjects\without-reservations\Real_Masters_all'
    ead_dir = EADDir(input_dir=directory)
    # `data` is not passed to anything below; presumably replace_subjects
    # reads it as a module-level global -- TODO confirm.
    data = load_data("normalization data.csv")
    # output_dir is the same path as the input directory.
    ead_dir.apply_function_to_dir(replace_subjects, output_dir=directory)
import csv

from lxml import etree

from utilities import EADDir


def find_multiple_physdescs(ead):
    """Report every <physdesc> in the EAD that holds more than one <extent>.

    :param ead: object exposing ``tree`` (an lxml tree) and ``filename``
    :return: a list of rows, one per offending <physdesc>, each in the form
             [filename, path of the physdesc, number of extents,
             serialized physdesc]
    """
    rows = []
    for physdesc in ead.tree.xpath("//physdesc"):
        extent_count = len(physdesc.xpath("extent"))
        if extent_count <= 1:
            continue
        # Serialize, re-parse and serialize again, exactly as the original
        # expression did, to get the element as a standalone document.
        standalone = etree.fromstring(etree.tostring(physdesc))
        rows.append([
            ead.filename,
            ead.tree.getpath(physdesc),
            extent_count,
            etree.tostring(standalone),
        ])
    return rows


if __name__ == "__main__":
    input_dir = r'C:\Users\wboyle\PycharmProjects\vandura\Real_Masters_all'
    per_ead_rows = EADDir(input_dir).characterize_dir(find_multiple_physdescs)

    # NOTE(review): binary mode is the Python 2 csv convention; on Python 3
    # csv.writer needs a text-mode file opened with newline="".
    with open("eads_with_multiple_extents.csv", mode="wb") as f:
        writer = csv.writer(f)
        # Each entry is the list of rows produced for one EAD.
        for rows in per_ead_rows:
            writer.writerows(rows)
def setUp(self):
    """Create the EADDir under test from the hard-coded fixture directory."""
    fixture_dir = r'C:\Users\wboyle\PycharmProjects\bentley_code\utilities'
    self.a = EADDir(input_dir=fixture_dir)
def setUp(self):
    """Build a fresh EADDir over the fixture directory before each test."""
    # Machine-specific location of the sample EAD files used by the tests.
    fixture_dir = r'C:\Users\wboyle\PycharmProjects\bentley_code\utilities'
    self.a = EADDir(input_dir=fixture_dir)