Пример #1
0
def take_all_no_checks(fields1, fields2, tag):
    """Merge two lists of fields into one list without duplicated fields.

    Every field of ``fields1 + fields2`` is compared with the fields already
    collected. Two fields are considered "the same" when they match once the
    origin subfield and the temporary subfields are ignored. In that case
    only one of them is kept: the one already collected if they also match
    with the origin included, otherwise the one reported as trusted by
    ``get_trusted_and_untrusted_fields`` (falling back to
    ``_get_best_fields`` when the origins are equal).

    fields1, fields2 -- lists of fields to merge (neither list is modified)
    tag              -- tag passed through to the trust helpers

    Returns a new list with the selected fields.
    """
    merged = []
    for candidate in fields1 + fields2:
        for kept in merged:
            # Different content even without origin/temporary subfields:
            # this entry is unrelated, look at the next one.
            if not compare_fields_exclude_subfiels(candidate, kept, strict=False, exclude_subfields=[ORIGIN_SUBFIELD]+TEMP_SUBFIELDS_LIST):
                continue
            # Same content and same origin: the value is already in the list.
            if bibrecord._compare_fields(candidate, kept, strict=False):
                break
            # Same content but different origin: keep the most trusted one.
            try:
                trusted, _untrusted = get_trusted_and_untrusted_fields([candidate], [kept], tag)
            except EqualOrigins:
                try:
                    trusted, _untrusted = _get_best_fields([candidate], [kept], tag)
                except EqualFields:
                    # No way to choose: keep what is already in the list.
                    break
            # The trusted field is the one already collected: nothing to do.
            if trusted[0] == kept:
                break
            # Otherwise drop the collected field and store the new one
            # (appended at the end, so its position in the list changes).
            merged.remove(kept)
            merged.append(candidate)
            break
        else:
            # No collected field matched: this is a new value.
            merged.append(candidate)
    return merged
    def test_02_merge_two_records_additional_subfield(self):
        """
        AUTHORS: 2 records, 1 additional subfield.
        """
        marcxml = """<collections><collection>
  <record>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Di Milia, Giovanni</subfield>
      <subfield code="b">Di Milia, G</subfield>
      <subfield code="7">A&amp;A</subfield>
    </datafield>
    <datafield tag="980" ind1="" ind2="">
      <subfield code="a">ASTRONOMY</subfield>
      <subfield code="7">ADS metadata</subfield>
    </datafield>
  </record>
  <record>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Di Milia, Giancarlo</subfield>
      <subfield code="b">Di Milia, G</subfield>
      <subfield code="u">Center for astrophysics</subfield>
      <subfield code="7">ARXIV</subfield>
    </datafield>
  </record>
</collection></collections>"""
        expected = """<collections><collection><record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Di Milia, Giovanni</subfield>
    <subfield code="b">Di Milia, G</subfield>
    <subfield code="u">Center for astrophysics</subfield>
    <subfield code="7">A&amp;A</subfield>
  </datafield>
  <datafield tag="980" ind1="" ind2="">
    <subfield code="a">ASTRONOMY</subfield>
    <subfield code="7">ADS metadata</subfield>
  </datafield>
</record></collection></collections>"""
        #records = b.create_records(marcxml)
        expected_record = create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0]
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertTrue(b._compare_fields(merged_record[0]['100'][0], expected_record[0]['100'][0], strict=False))
Пример #3
0
def _get_best_fields(fields1, fields2, tag):
    """
    Choose the preferred set between two sets of fields, based on their content.

    Function that should be called ONLY if "get_trusted_and_untrusted_fields"
    raises an "EqualOrigins" exception.

    The checks below are applied in order; the first conclusive one decides:
      1.  the two sets are strictly identical            -> first set
      2a. identical ignoring the temporary subfields:
            - identical ignoring only creation/modification date -> first set
            - otherwise the set whose majority of fields is primary
              (if this is inconclusive the next checks are applied)
      2.  identical ignoring origin and temporary subfields -> first set
      3.  different number of fields                     -> longest set
      4.  different total number of subfields            -> set with more subfields
      5.  different total length of the subfield values
          (temporary subfields ignored)                  -> set with longer values
      6.  majority of fields flagged as primary          -> primary set
      7.  greatest creation/modification date value      -> most recent set
      8.  nothing discriminates (a warning is logged)    -> first set

    fields1, fields2 -- the two sets (lists) of fields to compare; each field
                        is indexed as field[0] for its subfields, and each
                        subfield as [0] for the code and [1] for the value
    tag              -- not used by this function

    Returns a tuple (preferred_fields, other_fields) made of the two input
    objects.
    """
    #first check: are the two set of fields exactly the same? if so I take the first one
    if len(fields1) == len(fields2) and all(bibrecord._compare_fields(field1, field2, strict=True) for field1, field2 in zip(fields1, fields2)):
        logger.info('      The two set of fields are exactly the same: picking the first one.')
        return (fields1, fields2)

    #second check alfa: are the two sets the same excluding the temporary fields?
    #if so I pick the one with primary=True or, if that does not decide, I go on
    if _same_fields_excluding(fields1, fields2, TEMP_SUBFIELDS_LIST):
        logger.info('      The two set of fields are the same (temporary fields excluded): proceeding with primary check')
        #if the two lists are the same even with the primary subfield, then I simply return the first one
        if _same_fields_excluding(fields1, fields2, [CREATION_DATE_TMP_SUBFIELD, MODIFICATION_DATE_TMP_SUBFIELD]):
            logger.info('        The two set of fields are the same (extraction and modification date excluded: picking the first one')
            return (fields1, fields2)
        #otherwise I check if there is a set of fields with a primary and return this one
        ordered = _order_by_primary_flag(fields1, fields2)
        if ordered is not None:
            logger.info('        One set of fields has priority set to True: returning this one')
            return ordered

    #second check: are them the same not considering the origin? If so I take the first one
    if _same_fields_excluding(fields1, fields2, [ORIGIN_SUBFIELD]+TEMP_SUBFIELDS_LIST):
        logger.info('      The two set of fields are the same (origin excluded): picking the first one.')
        return (fields1, fields2)

    #third check: which one has more fields? If there is one I return this one
    if len(fields1) != len(fields2):
        logger.info('      The two set of fields have different length: picking the longest one.')
        return (fields1, fields2) if len(fields1) > len(fields2) else (fields2, fields1)

    #fourth check: which one has more subfields? If there is one I return this one
    #(temporary subfields are counted here)
    subfields1 = _count_subfields(fields1)
    subfields2 = _count_subfields(fields2)
    if subfields1 != subfields2:
        logger.info('      The two set of fields have different number of subfields: picking the set with more subfields.')
        return (fields1, fields2) if subfields1 > subfields2 else (fields2, fields1)

    #fifth check: the sum of all the length of all the strings in the subfields
    #(temporary subfields are NOT counted here)
    subfields_strlen1 = _total_value_length(fields1)
    subfields_strlen2 = _total_value_length(fields2)
    if subfields_strlen1 != subfields_strlen2:
        logger.info('      The two set of fields have subfields with different length: picking the set with longer subfields.')
        return (fields1, fields2) if subfields_strlen1 > subfields_strlen2 else (fields2, fields1)

    #sixth check: if there is one set of field that has the subfield primary = true I take that one
    ordered = _order_by_primary_flag(fields1, fields2)
    if ordered is not None:
        logger.info('      One set of fields has priority set to True: returning this one')
        return ordered

    #seventh check: which is the newest file?
    try:
        all_dates1 = _collect_file_dates(fields1)
        all_dates2 = _collect_file_dates(fields2)
        newest1 = max(all_dates1)
        newest2 = max(all_dates2)
        if newest1 > newest2:
            logger.info('      One set of fields is coming from a more recent file: returning this one')
            return (fields1, fields2)
        if newest2 > newest1:
            logger.info('      One set of fields is coming from a more recent file: returning this one')
            return (fields2, fields1)
    except IndexError:
        #at least one field has no creation or modification date: the check is skipped
        pass

    #if all checks fail I reached a granularity of problem too small to make a difference, so I simply return the first one.
    #NOTE: raising EqualFields('Sets of fields too similar to have an automatic choice')
    #at this point is disabled; the first set is returned instead.
    logger.warning('      Set of fields too similar to have an automatic choice: choosing the first one.')
    return (fields1, fields2)


def _same_fields_excluding(fields1, fields2, excluded_subfields):
    """Tell whether the two sets have the same length and match pairwise (non strict) ignoring the given subfields."""
    if len(fields1) != len(fields2):
        return False
    return all(compare_fields_exclude_subfiels(field1, field2, strict=False, exclude_subfields=excluded_subfields) for field1, field2 in zip(fields1, fields2))


def _majority_is_primary(fields):
    """Tell whether more fields have the primary subfield set to 'True' than to 'False'.

    Raises IndexError if a field has no value for the primary subfield.
    """
    flags = [bibrecord.field_get_subfield_values(field, PRIMARY_METADATA_SUBFIELD)[0] for field in fields]
    return flags.count('True') > flags.count('False')


def _order_by_primary_flag(fields1, fields2):
    """Return (preferred, other) when exactly one set is primary by majority, None when this does not decide."""
    try:
        primary1 = _majority_is_primary(fields1)
        primary2 = _majority_is_primary(fields2)
    except IndexError:
        #a field without the primary subfield makes the check not applicable
        return None
    if primary1 and not primary2:
        return (fields1, fields2)
    if primary2 and not primary1:
        return (fields2, fields1)
    return None


def _count_subfields(fields):
    """Return the total number of subfields of all the fields in the set."""
    return sum(len(field[0]) for field in fields)


def _total_value_length(fields):
    """Return the summed length of all the subfield values, temporary subfields excluded."""
    return sum(len(subfield[1]) for field in fields for subfield in field[0] if subfield[0] not in TEMP_SUBFIELDS_LIST)


def _collect_file_dates(fields):
    """Return the creation dates followed by the modification dates of all the fields.

    Raises IndexError if a field lacks one of the two date subfields.
    """
    creation_dates = [bibrecord.field_get_subfield_values(field, CREATION_DATE_TMP_SUBFIELD)[0] for field in fields]
    modification_dates = [bibrecord.field_get_subfield_values(field, MODIFICATION_DATE_TMP_SUBFIELD)[0] for field in fields]
    return creation_dates + modification_dates