def test_untlxml2py_non_UNTL_tag_raises_exception():
    """An element outside the UNTL dispatch makes untlxml2py raise."""
    document = (
        b'<?xml version="1.0" encoding="UTF-8"?>\n'
        b'<metadata>\n'
        b' <dog>Bezos</dog>\n'
        b'</metadata>\n'
    )
    stream = BytesIO(document)

    with pytest.raises(untldoc.PyuntlException) as err:
        untldoc.untlxml2py(stream)

    # The first exception argument names the offending element.
    message = err.value.args[0]
    assert 'Element "dog" not in UNTL dispatch.' == message
def test_create_pyuntl_from_xml_string(self):
    """Build a pyuntl tree from an in-memory copy of the complete record.

    The fixture next to this test module is read as bytes and handed to
    ``untlxml2py`` wrapped in a ``BytesIO``, so the parser receives a
    file-like object rather than a path. The resulting root is stored on
    ``self.root_element`` and must be a ``Metadata`` instance.
    """
    fixture_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'metadc_complete.untl.xml')
    # Read through a context manager so the fixture's file handle is
    # closed deterministically instead of being left to the garbage
    # collector (which emits ResourceWarning on Python 3).
    with open(fixture_path, 'rb') as fixture:
        xml_bytes = fixture.read()

    self.root_element = untlxml2py(BytesIO(xml_bytes))
    self.assertIsInstance(self.root_element, Metadata)
def test_write_xml_file_from_pyuntl_ascii_is_now_utf8(tmpdir):
    """Show XML character references are written back as UTF-8.

    Files with XML character references inserted by `xmlcharrefreplace`
    when characters could not be encoded properly are now encoded in
    UTF-8. For example, a 'dramáticas' whose 'á' is stored as a numeric
    character reference in the file read in is written out with the
    literal 'á'.
    """
    utf8_word = 'dramáticas'
    # Derive the escaped spelling with the same error handler the docstring
    # refers to, so the two spellings under test can never silently
    # collapse into the same literal.
    charref_word = utf8_word.encode(
        'ascii', 'xmlcharrefreplace').decode('ascii')
    assert charref_word != utf8_word

    # Load data from XML file into a UNTLElement tree.
    ascii_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              'metadc_ascii.untl.xml')
    ascii_record = untlxml2py(ascii_path)

    # Write the loaded data back to a new XML file.
    new_path = os.path.join(tmpdir, 'xml_test_output.xml')
    ascii_record.create_xml_file(new_path)

    # Read back the new file and original files into ElementTrees.
    new_tree = ET.parse(new_path)
    ascii_tree = ET.parse(ascii_path)

    # Compare initial XML to generated XML.
    # In ElementTree form the data is equal, but on disk it is different.
    assert ET.tostring(ascii_tree.getroot()) == ET.tostring(
        new_tree.getroot())

    # Decode explicitly as UTF-8 so the comparison does not depend on the
    # platform's default text encoding.
    with open(ascii_path, encoding='utf-8') as ascii_f, \
            open(new_path, encoding='utf-8') as new_f:
        ascii_text = ascii_f.read()
        new_text = new_f.read()

    # The original holds only the escaped form; the rewrite only the
    # literal UTF-8 form.
    assert charref_word in ascii_text
    assert charref_word not in new_text
    assert utf8_word not in ascii_text
    assert utf8_word in new_text
def test_create_pyuntl_from_xml_string(self):
    """Build a pyuntl tree from an in-memory copy of the complete record.

    The fixture next to this test module is read as text and handed to
    ``untlxml2py`` wrapped in a ``StringIO.StringIO``, so the parser
    receives a file-like object rather than a path. The resulting root is
    stored on ``self.root_element`` and must be a ``Metadata`` instance.
    """
    fixture_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'metadc_complete.untl.xml')
    # Read through a context manager so the fixture's file handle is
    # closed as soon as its contents are in memory.
    with open(fixture_path, 'r') as fixture:
        xml_text = fixture.read()

    self.root_element = untlxml2py(StringIO.StringIO(xml_text))
    self.assertIsInstance(self.root_element, Metadata)
def test_write_xml_file_from_pyuntl_is_identical(original_file, tmpdir):
    """Round-trip a fixture through pyuntl and expect identical file text.

    ``original_file`` is a fixture file name resolved relative to this
    test module; the regenerated XML is written under ``tmpdir``.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    original_path = os.path.join(here, original_file)

    # Parse the fixture into a UNTLElement tree ...
    well_formed_record = untlxml2py(original_path)

    # ... and serialize that tree to a fresh file.
    new_path = os.path.join(tmpdir, 'xml_test_output.xml')
    well_formed_record.create_xml_file(new_path)

    # The text on disk must match what we started with.
    with open(original_path) as original_f, open(new_path) as new_f:
        expected_text = original_f.read()
        written_text = new_f.read()
    assert expected_text == written_text
def test_untlxml2py_namespace():
    """Namespace is removed from element.tag when building py object.

    Declaring xmlns in the document makes the parser prefix every tag with
    "{http://purl.org/dc/elements/1.1/}"; the tags on the resulting pyuntl
    objects are expected to be the bare names.
    """
    document = (
        b'<?xml version="1.0" encoding="UTF-8"?>\n'
        b'<metadata xmlns="http://purl.org/dc/elements/1.1/">\n'
        b' <title qualifier="officialtitle">Tres Actos</title>\n'
        b'</metadata>\n'
    )

    root = untldoc.untlxml2py(BytesIO(document))

    assert isinstance(root, us.UNTLElement)
    assert root.tag == 'metadata'
    first_child = root.children[0]
    assert first_child.tag == 'title'
def test_untlxml2py():
    """Verify children are correctly added from the UNTL XML."""
    document = (
        b'<?xml version="1.0" encoding="UTF-8"?>\n'
        b'<metadata>\n'
        b' <title qualifier="officialtitle">Tres Actos</title>\n'
        b' <creator qualifier="aut">\n'
        b' <name>Last, Furston, 1807-1865.</name>\n'
        b' </creator>\n'
        b'</metadata>\n'
    )

    root = untldoc.untlxml2py(BytesIO(document))

    # Root element.
    assert isinstance(root, us.UNTLElement)
    assert root.tag == 'metadata'
    assert len(root.children) == 2

    # First child: a qualified title carrying text content.
    title = root.children[0]
    assert title.tag == 'title'
    assert title.content == 'Tres Actos'
    assert title.qualifier == 'officialtitle'

    # Second child: a qualified creator with a nested name element.
    creator = root.children[1]
    assert creator.tag == 'creator'
    assert creator.qualifier == 'aut'
    name = creator.children[0]
    assert name.tag == 'name'
    assert name.content == 'Last, Furston, 1807-1865.'
def testCircularEquality(self):
    """dict -> XML string -> pyuntl -> dict gives back the starting dict."""
    xml_stream = BytesIO(pydict2xmlstring(UNTL_DICT))
    round_tripped = py2dict(untlxml2py(xml_stream))
    self.assertEqual(round_tripped, UNTL_DICT)
def test_legacy_defaults_record(self):
    """Parsing the legacy-defaults fixture yields a Metadata root."""
    tests_dir = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(tests_dir, 'metadc_legacy_defaults.untl.xml')
    # The parsed root is kept on the test instance.
    self.root_element = untlxml2py(fixture_path)
    is_metadata = isinstance(self.root_element, Metadata)
    self.assertTrue(is_metadata)
def test_blank_description_record(self):
    """Parsing the blank-description fixture yields a Metadata root."""
    tests_dir = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(
        tests_dir, 'metadc_blank_description.untl.xml')
    # The parsed root is kept on the test instance.
    self.root_element = untlxml2py(fixture_path)
    is_metadata = isinstance(self.root_element, Metadata)
    self.assertTrue(is_metadata)
def testCircularEquality(self):
    """dict -> XML string -> pyuntl -> dict gives back the starting dict."""
    xml_stream = StringIO.StringIO(pydict2xmlstring(UNTL_DICT))
    round_tripped = py2dict(untlxml2py(xml_stream))
    self.assertEqual(round_tripped, UNTL_DICT)
# The content is not a legacy placeholder. if content not in COMMON_DEFAULT_ATTRIBUTE_VALUES and not match: # Only consider <meta qualifier="system"> records. if i.tag == 'meta': if i.qualifier == 'system': completeness_dict['%s' % i.tag]['present'] = True else: completeness_dict['%s' % i.tag]['present'] = True # Get total score of the pyuntl object. for k, v in completeness_dict.items(): # If presence was toggled true, adjust score based on weight. if v['present']: py_untl_object_score += completeness_dict[k]['weight'] # Calculate the float score completeness. completeness = py_untl_object_score / total_points return completeness if __name__ == '__main__': import glob import os from pyuntl import untldoc path = os.getcwd() for infile in glob.glob(os.path.join(path, '../tests/*.untl.xml')): py_untl = untldoc.untlxml2py(infile) completeness = determine_completeness(py_untl) print('|||||| %s' % infile.split('/')[-1]) print('completeness score: %s\n' % completeness)
def test_legacy_defaults_record(self):
    """The legacy-defaults fixture parses into a Metadata root element."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    record_path = os.path.join(module_dir, 'metadc_legacy_defaults.untl.xml')
    # Store the parsed tree on the instance, then check its type.
    self.root_element = untlxml2py(record_path)
    parsed_as_metadata = isinstance(self.root_element, Metadata)
    self.assertTrue(parsed_as_metadata)
def test_blank_description_record(self):
    """The blank-description fixture parses into a Metadata root element."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    record_path = os.path.join(
        module_dir, 'metadc_blank_description.untl.xml')
    # Store the parsed tree on the instance, then check its type.
    self.root_element = untlxml2py(record_path)
    parsed_as_metadata = isinstance(self.root_element, Metadata)
    self.assertTrue(parsed_as_metadata)
ark = meta["content"] return ark metadata_fields = ["title", "creator", "contributor", "publisher", "date", "language", "description", "subject", "primarySource", "coverage", "source", "citation", "relation", "collection", "institution", "rights", "resourceType", "format", "identifier", "degree", "note", "meta"] header = True for filename in sys.stdin: try: rd = {} untl = untlxml2py(filename.strip()) rd["completeness"] = untl.completeness rd["record_length"] = untl.record_length rd["record_content_length"] = untl.record_content_length untl_dict = untlpy2dict(untl) for field in metadata_fields: rd[field] = len(untl_dict.get(field, [])) hash_name = "%s_hash" % field rd[hash_name] = hashlib.md5(str(untl_dict.get(field, []))).hexdigest() rd["hidden"] = get_is_hidden(untl_dict) rd["metadata_creator"] = get_metadata_creator(untl_dict) rd["metadata_editor"] = get_metadata_editor(untl_dict) rd["metadata_creation_date"] = get_metadata_creation_date(untl_dict) rd["metadata_edit_date"] = get_metadata_edit_date(untl_dict)
return ark metadata_fields = [ "title", "creator", "contributor", "publisher", "date", "language", "description", "subject", "primarySource", "coverage", "source", "citation", "relation", "collection", "institution", "rights", "resourceType", "format", "identifier", "degree", "note", "meta" ] header = True for filename in sys.stdin: try: rd = {} untl = untlxml2py(filename.strip()) rd["completeness"] = untl.completeness rd["record_length"] = untl.record_length rd["record_content_length"] = untl.record_content_length untl_dict = untlpy2dict(untl) for field in metadata_fields: rd[field] = len(untl_dict.get(field, [])) hash_name = "%s_hash" % field rd[hash_name] = hashlib.md5(str(untl_dict.get(field, []))).hexdigest() rd["hidden"] = get_is_hidden(untl_dict) rd["metadata_creator"] = get_metadata_creator(untl_dict) rd["metadata_editor"] = get_metadata_editor(untl_dict) rd["metadata_creation_date"] = get_metadata_creation_date(untl_dict)