예제 #1
0
def test_untlxml2py_non_UNTL_tag_raises_exception():
    """An element outside the UNTL dispatch makes untlxml2py raise."""
    document = b''.join([
        b'<?xml version="1.0" encoding="UTF-8"?>\n',
        b'<metadata>\n',
        b'  <dog>Bezos</dog>\n',
        b'</metadata>\n',
    ])
    with pytest.raises(untldoc.PyuntlException) as excinfo:
        untldoc.untlxml2py(BytesIO(document))
    # The first exception argument carries the human-readable message.
    message = excinfo.value.args[0]
    assert 'Element "dog" not in UNTL dispatch.' == message
예제 #2
0
 def test_create_pyuntl_from_xml_string(self):
     """Build a pyuntl tree from XML bytes held in an in-memory buffer.

     The fixture file next to this test module is read fully into memory
     and handed to untlxml2py as a BytesIO; the result must be a Metadata.
     """
     xml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'metadc_complete.untl.xml')
     # Use a context manager so the file handle is closed deterministically
     # instead of being left open until garbage collection.
     with open(xml_path, 'rb') as xml_file:
         xml_bytes = xml_file.read()
     self.root_element = untlxml2py(BytesIO(xml_bytes))
     self.assertTrue(isinstance(self.root_element, Metadata))
예제 #3
0
def test_write_xml_file_from_pyuntl_ascii_is_now_utf8(tmpdir):
    """Show XML character references are written back as UTF-8.

    Files with XML character references inserted by `xmlcharrefreplace` when
    characters could not be encoded properly are now encoded in UTF-8.
    For example, 'dram&#225;ticas' read in is now written out 'dramáticas'.
    """
    # Load data from XML file into a UNTLElement tree.
    ascii_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              'metadc_ascii.untl.xml')
    ascii_record = untlxml2py(ascii_path)
    # Write the loaded data back to a new XML file.
    new_path = os.path.join(tmpdir, 'xml_test_output.xml')
    ascii_record.create_xml_file(new_path)
    # Read back the new file and original files into ElementTrees.
    # Separate names are used so the pyuntl record is not rebound to a
    # different type.
    new_tree = ET.parse(new_path)
    ascii_tree = ET.parse(ascii_path)

    # Compare initial XML to generated XML.
    # In ElementTree form the data is equal, but on disk it is different.
    assert ET.tostring(ascii_tree.getroot()) == ET.tostring(
        new_tree.getroot())

    # Decode explicitly as UTF-8: the generated file is UTF-8 and the check
    # below looks for a non-ASCII character, so relying on the locale's
    # default encoding would make the test platform-dependent.
    with open(ascii_path, encoding='utf-8') as ascii_f:
        ascii_text = ascii_f.read()
    with open(new_path, encoding='utf-8') as new_f:
        new_text = new_f.read()

    assert 'dram&#225;ticas' in ascii_text
    assert 'dram&#225;ticas' not in new_text
    assert 'dramáticas' not in ascii_text
    assert 'dramáticas' in new_text
예제 #4
0
 def test_create_pyuntl_from_xml_string(self):
     """Build a pyuntl tree from XML text held in an in-memory buffer.

     The fixture file next to this test module is read fully into memory
     and handed to untlxml2py as a StringIO; the result must be a Metadata.
     """
     xml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'metadc_complete.untl.xml')
     # Use a context manager so the file handle is closed deterministically
     # instead of being left open until garbage collection.
     with open(xml_path, 'r') as xml_file:
         xml_text = xml_file.read()
     self.root_element = untlxml2py(StringIO.StringIO(xml_text))
     self.assertTrue(isinstance(self.root_element, Metadata))
예제 #5
0
def test_write_xml_file_from_pyuntl_is_identical(original_file, tmpdir):
    """Round-trip a UNTL XML file through pyuntl and expect identical text."""
    tests_dir = os.path.dirname(os.path.realpath(__file__))
    source_path = os.path.join(tests_dir, original_file)
    output_path = os.path.join(tmpdir, 'xml_test_output.xml')

    # Parse the source file into a UNTLElement tree, then serialize it again.
    untlxml2py(source_path).create_xml_file(output_path)

    # The regenerated file must match the source text exactly.
    with open(source_path) as source_f:
        expected_text = source_f.read()
    with open(output_path) as output_f:
        actual_text = output_f.read()
    assert expected_text == actual_text
예제 #6
0
def test_untlxml2py_namespace():
    """Tags come back without the namespace prefix.

    Declaring xmlns on the root makes the parser report tags prefixed with
    "{http://purl.org/dc/elements/1.1/}"; the objects built by untlxml2py
    are expected to carry the bare tag names instead.
    """
    document = b''.join([
        b'<?xml version="1.0" encoding="UTF-8"?>\n',
        b'<metadata xmlns="http://purl.org/dc/elements/1.1/">\n',
        b'  <title qualifier="officialtitle">Tres Actos</title>\n',
        b'</metadata>\n',
    ])
    root = untldoc.untlxml2py(BytesIO(document))
    assert isinstance(root, us.UNTLElement)
    assert root.tag == 'metadata'
    first_child = root.children[0]
    assert first_child.tag == 'title'
예제 #7
0
def test_untlxml2py():
    """Check the element tree built from a small UNTL XML document."""
    document = b''.join([
        b'<?xml version="1.0" encoding="UTF-8"?>\n',
        b'<metadata>\n',
        b'  <title qualifier="officialtitle">Tres Actos</title>\n',
        b'  <creator qualifier="aut">\n',
        b'    <name>Last, Furston, 1807-1865.</name>\n',
        b'  </creator>\n',
        b'</metadata>\n',
    ])
    root = untldoc.untlxml2py(BytesIO(document))

    # Root element.
    assert isinstance(root, us.UNTLElement)
    assert root.tag == 'metadata'
    assert len(root.children) == 2

    # First child: the title with its text and qualifier.
    title = root.children[0]
    assert title.tag == 'title'
    assert title.content == 'Tres Actos'
    assert title.qualifier == 'officialtitle'

    # Second child: the creator, which nests a name element.
    creator = root.children[1]
    assert creator.tag == 'creator'
    assert creator.qualifier == 'aut'
    name = creator.children[0]
    assert name.tag == 'name'
    assert name.content == 'Last, Furston, 1807-1865.'
예제 #8
0
 def testCircularEquality(self):
     """UNTL_DICT survives dict -> XML string -> pyuntl -> dict unchanged."""
     xml_bytes = pydict2xmlstring(UNTL_DICT)
     root = untlxml2py(BytesIO(xml_bytes))
     round_tripped = py2dict(root)
     self.assertEqual(round_tripped, UNTL_DICT)
예제 #9
0
 def test_legacy_defaults_record(self):
     """Parsing the legacy-defaults fixture yields a Metadata root."""
     here = os.path.dirname(os.path.realpath(__file__))
     fixture = os.path.join(here, 'metadc_legacy_defaults.untl.xml')
     self.root_element = untlxml2py(fixture)
     self.assertTrue(isinstance(self.root_element, Metadata))
예제 #10
0
 def test_blank_description_record(self):
     """Parsing the blank-description fixture yields a Metadata root."""
     here = os.path.dirname(os.path.realpath(__file__))
     fixture = os.path.join(here, 'metadc_blank_description.untl.xml')
     self.root_element = untlxml2py(fixture)
     self.assertTrue(isinstance(self.root_element, Metadata))
예제 #11
0
 def testCircularEquality(self):
     """UNTL_DICT survives dict -> XML string -> pyuntl -> dict unchanged."""
     xml_text = pydict2xmlstring(UNTL_DICT)
     root = untlxml2py(StringIO.StringIO(xml_text))
     round_tripped = py2dict(root)
     self.assertEqual(round_tripped, UNTL_DICT)
예제 #12
0
                # The content is not a legacy placeholder.
                if content not in COMMON_DEFAULT_ATTRIBUTE_VALUES and not match:
                    # Only consider <meta qualifier="system"> records.
                    if i.tag == 'meta':
                        if i.qualifier == 'system':
                            completeness_dict['%s' % i.tag]['present'] = True
                    else:
                        completeness_dict['%s' % i.tag]['present'] = True
    # Get total score of the pyuntl object.
    for k, v in completeness_dict.items():
        # If presence was toggled true, adjust score based on weight.
        if v['present']:
            py_untl_object_score += completeness_dict[k]['weight']
    # Calculate the float score completeness.
    completeness = py_untl_object_score / total_points
    return completeness


if __name__ == '__main__':
    # Ad-hoc driver: print the completeness score of every *.untl.xml file
    # found in ../tests, resolved relative to the current working directory.
    import glob
    import os

    from pyuntl import untldoc

    path = os.getcwd()
    for infile in glob.glob(os.path.join(path, '../tests/*.untl.xml')):
        py_untl = untldoc.untlxml2py(infile)
        completeness = determine_completeness(py_untl)
        # os.path.basename honours the platform's path separators, whereas
        # splitting on a hard-coded '/' leaves directory parts in the name
        # when glob returns backslash-separated paths.
        print('|||||| %s' % os.path.basename(infile))
        print('completeness score: %s\n' % completeness)
예제 #13
0
 def test_legacy_defaults_record(self):
     """The legacy-defaults fixture parses into a Metadata root element."""
     fixture_dir = os.path.dirname(os.path.realpath(__file__))
     fixture_path = os.path.join(fixture_dir,
                                 'metadc_legacy_defaults.untl.xml')
     self.root_element = untlxml2py(fixture_path)
     is_metadata = isinstance(self.root_element, Metadata)
     self.assertTrue(is_metadata)
예제 #14
0
 def test_blank_description_record(self):
     """The blank-description fixture parses into a Metadata root element."""
     fixture_dir = os.path.dirname(os.path.realpath(__file__))
     fixture_path = os.path.join(fixture_dir,
                                 'metadc_blank_description.untl.xml')
     self.root_element = untlxml2py(fixture_path)
     is_metadata = isinstance(self.root_element, Metadata)
     self.assertTrue(is_metadata)
            ark = meta["content"]
    return ark


# Field names looked up in each record's dict form; the loop below records a
# value count and a hash for every one of them.
metadata_fields = [
    "title", "creator", "contributor", "publisher",
    "date", "language", "description", "subject",
    "primarySource", "coverage", "source", "citation",
    "relation", "collection", "institution", "rights",
    "resourceType", "format", "identifier", "degree",
    "note", "meta",
]

# NOTE(review): presumably flags that a header row still has to be emitted;
# its use is outside this excerpt -- confirm.
header = True

for filename in sys.stdin:
    try:
        rd = {}
        untl = untlxml2py(filename.strip())
        rd["completeness"] = untl.completeness
        rd["record_length"] = untl.record_length
        rd["record_content_length"] = untl.record_content_length

        untl_dict = untlpy2dict(untl)
        for field in metadata_fields:
            rd[field] = len(untl_dict.get(field, []))
            hash_name = "%s_hash" % field
            rd[hash_name] = hashlib.md5(str(untl_dict.get(field, []))).hexdigest()

        rd["hidden"] = get_is_hidden(untl_dict)
        rd["metadata_creator"] = get_metadata_creator(untl_dict)
        rd["metadata_editor"] = get_metadata_editor(untl_dict)
        rd["metadata_creation_date"] = get_metadata_creation_date(untl_dict)
        rd["metadata_edit_date"] = get_metadata_edit_date(untl_dict)
예제 #16
0
    return ark


# Field names looked up in each record's dict form; the loop below records a
# value count and a hash for every one of them.
metadata_fields = ["title", "creator", "contributor", "publisher", "date",
                   "language", "description", "subject", "primarySource",
                   "coverage", "source", "citation", "relation",
                   "collection", "institution", "rights", "resourceType",
                   "format", "identifier", "degree", "note", "meta"]

# NOTE(review): presumably flags that a header row still has to be emitted;
# its use is outside this excerpt -- confirm.
header = True

for filename in sys.stdin:
    try:
        rd = {}
        untl = untlxml2py(filename.strip())
        rd["completeness"] = untl.completeness
        rd["record_length"] = untl.record_length
        rd["record_content_length"] = untl.record_content_length

        untl_dict = untlpy2dict(untl)
        for field in metadata_fields:
            rd[field] = len(untl_dict.get(field, []))
            hash_name = "%s_hash" % field
            rd[hash_name] = hashlib.md5(str(untl_dict.get(field,
                                                          []))).hexdigest()

        rd["hidden"] = get_is_hidden(untl_dict)
        rd["metadata_creator"] = get_metadata_creator(untl_dict)
        rd["metadata_editor"] = get_metadata_editor(untl_dict)
        rd["metadata_creation_date"] = get_metadata_creation_date(untl_dict)