示例#1
0
def merge_records_xml(marcxml_obj):
    """Merge the different flavors of each record of a parsed MARCXML document.

    ``marcxml_obj`` is a libxml2 object that is turned into bibrecord
    structures by ``create_record_from_libxml_obj``. Every item of that result
    (the "record" flavors grouped under one "collection" tag) is passed to
    ``merge_multiple_records``. Merged records are accumulated in
    ``merged_records``; a group whose merge raises is logged and remembered in
    ``records_with_merging_probl`` as a ``(bibcode, error description)`` pair,
    so one failing group does not stop the processing of the others.

    NOTE(review): no ``return`` is visible after the loop in this part of the
    file -- confirm how the two lists reach the caller.
    """
    logger.info(' Merger started.')
    # Convert the libxml2 object into bibrecord structures.
    all_records = create_record_from_libxml_obj(marcxml_obj, logger)
    merged_records = []
    records_with_merging_probl = []
    for records in all_records:
        # Best-effort lookup of the bibcode: it is only used in log messages
        # and error reports, so any lookup failure falls back to 'Unknown'.
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit propagating.
        try:
            system_number_fields = records[0][FIELD_TO_MARC['system number']]
            bibcode = bibrecord.field_get_subfield_values(system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
        except Exception:
            bibcode = 'Unknown'
        logger.warn(' Merging bibcode "%s".' % bibcode)
        # Merge this group; a failure is recorded instead of aborting the run.
        try:
            merged = merge_multiple_records(records)
        except Exception as error:
            str_error_to_print = type(error).__name__ + '\t' + str(error) + ' (Merger error)'
            logger.error(' Impossible to merge the record "%s" \t %s' % (bibcode, str_error_to_print))
            records_with_merging_probl.append((bibcode, str_error_to_print))
        else:
            merged_records.append(merged)
    def test_01_merge_two_records_one_field(self):
        """
        PRIORITY: 2 records, 1 field, 2 origins.
        """
        marcxml = """<collections><collection>
  <record>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">10</subfield>
      <subfield code="7">A&amp;A</subfield>
    </datafield>
    <datafield tag="980" ind1="" ind2="">
        <subfield code="a">ASTRONOMY</subfield>
        <subfield code="7">ADS metadata</subfield>
    </datafield>
  </record>
  <record>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">15</subfield>
      <subfield code="7">NED</subfield>
    </datafield>
  </record>
</collection></collections>"""
        expected = """<collections><collection><record>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">10</subfield>
    <subfield code="7">A&amp;A</subfield>
  </datafield>
  <datafield tag="980" ind1="" ind2="">
        <subfield code="a">ASTRONOMY</subfield>
        <subfield code="7">ADS metadata</subfield>
    </datafield>
</record></collection></collections>"""
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertEqual(merged_record, create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0])
    def test_04_merge_three_records_two_fields(self):
        """
        3 records, 6 fields, 6 origins.
        """
        marcxml = """<collections><collection>
  <record>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">10</subfield>
      <subfield code="7">A&amp;A</subfield>
    </datafield>
    <datafield tag="773" ind1=" " ind2=" ">
      <subfield code="a">Libération</subfield>
      <subfield code="7">STI</subfield>
    </datafield>
    <datafield tag="980" ind1="" ind2="">
      <subfield code="a">ASTRONOMY</subfield>
      <subfield code="7">ADS metadata</subfield>
    </datafield>
  </record>
  <record>
    <datafield tag="773" ind1=" " ind2=" ">
      <subfield code="a">Le Monde</subfield>
      <subfield code="7">AAS</subfield>
    </datafield>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">15</subfield>
      <subfield code="7">NED</subfield>
    </datafield>
  </record>
  <record>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">5</subfield>
      <subfield code="7">ADS metadata</subfield>
    </datafield>
    <datafield tag="773" ind1=" " ind2=" ">
      <subfield code="a">L'Express</subfield>
      <subfield code="7">OCR</subfield>
    </datafield>
  </record>
</collection></collections>"""
        expected = """<collections><collection><record>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">5</subfield>
    <subfield code="7">ADS metadata</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="a">Le Monde</subfield>
    <subfield code="7">AAS</subfield>
  </datafield>
  <datafield tag="980" ind1="" ind2="">
    <subfield code="a">ASTRONOMY</subfield>
    <subfield code="7">ADS metadata</subfield>
  </datafield>
</record></collection></collections>"""
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertEqual(merged_record, create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0])
def get_result_invenio_xmltransformer(xmlstring):
    """Transform *xmlstring* with the AdsXML2MarcXML stylesheet and parse the
    output in two different ways.

    Returns a 2-tuple ``(result_xml_transformer, result_invenio)``:

    * the value of ``x.create_record_from_libxml_obj`` applied to the
      transformed libxml2 document;
    * a list with one entry per ``<collection>...</collection>`` chunk of the
      serialized output, each entry holding the first element of every item
      returned by ``bibrecord.create_records`` for that chunk.
    """
    source_doc = libxml2.parseDoc(xmlstring)
    # The stylesheet path is relative, so it depends on the directory the
    # caller is running from.
    stylesheet_doc = libxml2.parseFile('../misc/AdsXML2MarcXML_v2.xsl')
    transformer = libxslt.parseStylesheetDoc(stylesheet_doc)
    transformed_doc = transformer.applyStylesheet(source_doc, None)
    serialized = transformed_doc.serialize(encoding='utf-8')

    # Records as produced by the internal converter.
    internal_records = x.create_record_from_libxml_obj(transformed_doc, logger)

    # Records as produced by Invenio: every collection is parsed on its own
    # (DOTALL lets a collection span several lines).
    collection_chunks = re.findall('<collection>.*?</collection>', serialized, re.DOTALL)
    invenio_records = []
    for chunk in collection_chunks:
        invenio_records.append([parsed[0] for parsed in bibrecord.create_records(chunk)])

    return (internal_records, invenio_records)
    def test_02_merge_two_records_additional_subfield(self):
        """
        AUTHORS: 2 records, 1 additional subfield.
        """
        marcxml = """<collections><collection>
  <record>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Di Milia, Giovanni</subfield>
      <subfield code="b">Di Milia, G</subfield>
      <subfield code="7">A&amp;A</subfield>
    </datafield>
    <datafield tag="980" ind1="" ind2="">
      <subfield code="a">ASTRONOMY</subfield>
      <subfield code="7">ADS metadata</subfield>
    </datafield>
  </record>
  <record>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Di Milia, Giancarlo</subfield>
      <subfield code="b">Di Milia, G</subfield>
      <subfield code="u">Center for astrophysics</subfield>
      <subfield code="7">ARXIV</subfield>
    </datafield>
  </record>
</collection></collections>"""
        expected = """<collections><collection><record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Di Milia, Giovanni</subfield>
    <subfield code="b">Di Milia, G</subfield>
    <subfield code="u">Center for astrophysics</subfield>
    <subfield code="7">A&amp;A</subfield>
  </datafield>
  <datafield tag="980" ind1="" ind2="">
    <subfield code="a">ASTRONOMY</subfield>
    <subfield code="7">ADS metadata</subfield>
  </datafield>
</record></collection></collections>"""
        #records = b.create_records(marcxml)
        expected_record = create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0]
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertTrue(b._compare_fields(merged_record[0]['100'][0], expected_record[0]['100'][0], strict=False))
    def test_01_merge_two_records_one_field(self):
        """
        AUTHORS: 2 records, priority.
        """
        marcxml = """<collections><collection>
  <record>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Di Milia, Giovanni</subfield>
      <subfield code="b">Di Milia, G</subfield>
      <subfield code="7">A&amp;A</subfield>
    </datafield>
    <datafield tag="700" ind1=" " ind2=" ">
      <subfield code="a">Luker, Jay</subfield>
      <subfield code="b">Luker, J</subfield>
      <subfield code="7">A&amp;A</subfield>
    </datafield>
    <datafield tag="700" ind1=" " ind2=" ">
      <subfield code="a">Henneken, Edwin</subfield>
      <subfield code="b">Henneken, E</subfield>
      <subfield code="7">A&amp;A</subfield>
    </datafield>
    <datafield tag="980" ind1="" ind2="">
      <subfield code="a">ASTRONOMY</subfield>
      <subfield code="7">ADS metadata</subfield>
    </datafield>
  </record>
  <record>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Dimilia, Giovanni</subfield>
      <subfield code="b">Dimilia, G</subfield>
      <subfield code="7">ARXIV</subfield>
    </datafield>
    <datafield tag="700" ind1=" " ind2=" ">
      <subfield code="a">Luker, Jay</subfield>
      <subfield code="b">Luker, J</subfield>
      <subfield code="7">ARXIV</subfield>
    </datafield>
    <datafield tag="700" ind1=" " ind2=" ">
      <subfield code="a">Henneken, Edwin</subfield>
      <subfield code="b">Henneken, E</subfield>
      <subfield code="7">ARXIV</subfield>
    </datafield>
  </record>
</collection></collections>"""
        expected = """<collections><collection><record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Di Milia, Giovanni</subfield>
    <subfield code="b">Di Milia, G</subfield>
    <subfield code="7">A&amp;A</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Luker, Jay</subfield>
    <subfield code="b">Luker, J</subfield>
    <subfield code="7">A&amp;A</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Henneken, Edwin</subfield>
    <subfield code="b">Henneken, E</subfield>
    <subfield code="7">A&amp;A</subfield>
  </datafield>
  <datafield tag="980" ind1="" ind2="">
    <subfield code="a">ASTRONOMY</subfield>
    <subfield code="7">ADS metadata</subfield>
  </datafield>
</record></collection></collections>"""
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertEqual(merged_record, create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0])