def take_all_no_checks(fields1, fields2, tag):
    """Merge two lists of fields into a single list without duplicates.

    Every field of ``fields1 + fields2`` is compared with the fields already
    kept.  Two fields are treated as the same entry when
    ``compare_fields_exclude_subfiels`` matches them while ignoring the
    origin subfield and the temporary subfields.  For such a pair:

    * if they also match with ``bibrecord._compare_fields`` (non strict),
      the incoming field is a plain duplicate and is dropped;
    * otherwise the more trusted of the two is kept, as decided by
      ``get_trusted_and_untrusted_fields`` and, when that raises
      ``EqualOrigins``, by ``_get_best_fields``.

    A field that matches none of the kept fields is appended.

    :param fields1: first list of fields.
    :param fields2: second list of fields (concatenated to ``fields1``,
        so it must be the same sequence type).
    :param tag: tag of the fields, forwarded to the trust helpers.
    :return: a new list with the merged fields; the inputs are not modified.
    """
    all_fields = []
    for field1 in fields1 + fields2:
        for position, field2 in enumerate(all_fields):
            # Are the two fields the same once origin and temporary
            # subfields are ignored?  If not, try the next kept field.
            if not compare_fields_exclude_subfiels(
                    field1, field2, strict=False,
                    exclude_subfields=[ORIGIN_SUBFIELD] + TEMP_SUBFIELDS_LIST):
                continue

            # Same content and same origin: the value is already in the list.
            if bibrecord._compare_fields(field1, field2, strict=False):
                break

            # Same content but different origin: find the most trusted one.
            try:
                trusted, _ = get_trusted_and_untrusted_fields(
                    [field1], [field2], tag)
            except EqualOrigins:
                try:
                    trusted, _ = _get_best_fields([field1], [field2], tag)
                except EqualFields:
                    # No way to choose: keep what is already in the list.
                    break

            # The trusted field is the one already kept: nothing to do.
            if trusted[0] == field2:
                break

            # Otherwise replace the kept field with the incoming one.
            # Deleting by position avoids a second, equality based lookup
            # of the entry the loop is already looking at.
            del all_fields[position]
            all_fields.append(field1)
            break
        else:
            # The inner loop ended without a match: this is a new field.
            all_fields.append(field1)
    return all_fields
def test_02_merge_two_records_additional_subfield(self):
    """AUTHORS: 2 records, 1 additional subfield.

    The second record carries a 100 field with an extra "u" subfield.
    The merged 100 field is expected to keep the "a" and "7" values of the
    first record and to gain the "u" subfield of the second one.

    The ampersand of the "A&A" value is written as the ``&amp;`` entity:
    a bare ``&`` is not well-formed XML, so the fixture could not be
    parsed otherwise.  The parsed text is still "A&A".
    """
    marcxml = (
        '<collections><collection> '
        '<record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Di Milia, Giovanni</subfield> '
        '<subfield code="b">Di Milia, G</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record> '
        '<record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Di Milia, Giancarlo</subfield> '
        '<subfield code="b">Di Milia, G</subfield> '
        '<subfield code="u">Center for astrophysics</subfield> '
        '<subfield code="7">ARXIV</subfield> '
        '</datafield> '
        '</record> '
        '</collection></collections>'
    )
    expected = (
        '<collections><collection><record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Di Milia, Giovanni</subfield> '
        '<subfield code="b">Di Milia, G</subfield> '
        '<subfield code="u">Center for astrophysics</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record></collection></collections>'
    )
    expected_record = create_record_from_libxml_obj(
        libxml2.parseDoc(expected), logger)[0]
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    # Only the first 100 field of the two records is compared.
    self.assertTrue(b._compare_fields(merged_record[0]['100'][0],
                                      expected_record[0]['100'][0],
                                      strict=False))
def _pick_fields_by_primary_flag(fields1, fields2):
    """Choose between two sets of fields using their "primary" subfield.

    A set counts as primary when more of its fields have the value 'True'
    than the value 'False' in PRIMARY_METADATA_SUBFIELD.

    :return: ``(chosen, discarded)`` when exactly one set is primary,
        ``None`` when both or neither are, or when at least one field has
        no PRIMARY_METADATA_SUBFIELD value at all.
    """
    try:
        # First value of the primary subfield for every field; a field
        # without that subfield makes the whole check not applicable.
        flags1 = [bibrecord.field_get_subfield_values(field, PRIMARY_METADATA_SUBFIELD)[0]
                  for field in fields1]
        flags2 = [bibrecord.field_get_subfield_values(field, PRIMARY_METADATA_SUBFIELD)[0]
                  for field in fields2]
    except IndexError:
        return None

    # A set is primary only if the majority of its fields says so.
    primary1 = flags1.count('True') > flags1.count('False')
    primary2 = flags2.count('True') > flags2.count('False')

    if primary1 and not primary2:
        logger.info(' One set of fields has priority set to True: returning this one')
        return (fields1, fields2)
    if primary2 and not primary1:
        logger.info(' One set of fields has priority set to True: returning this one')
        return (fields2, fields1)
    return None


def _get_best_fields(fields1, fields2, tag):
    """Pick the most trusted of two sets of fields that share the same origin.

    Function that should be called ONLY if "get_trusted_and_untrusted_fields"
    raises an "EqualOrigins" exception. If so this function decides the most
    trusted set based on the actual content of the two sets of fields.

    The checks are applied in order and the first conclusive one wins:

    1. the sets are exactly the same (strict comparison): first set;
    2. the sets are the same excluding the temporary subfields: first set
       if they also match excluding only the creation/modification date
       subfields, otherwise the set flagged as primary, if any;
    3. the sets are the same excluding origin and temporary subfields:
       first set;
    4. the set with more fields;
    5. the set with more subfields;
    6. the set whose non temporary subfield values are longer in total;
    7. the set flagged as primary;
    8. the set with the most recent creation/modification date;
    9. otherwise the first set, with a warning.

    :param fields1: first list of fields.
    :param fields2: second list of fields.
    :param tag: tag of the fields (not used by the checks).
    :return: tuple ``(chosen_fields, discarded_fields)``.
    """
    same_length = len(fields1) == len(fields2)

    # first check: are the two sets of fields exactly the same?
    # if so I take the first one
    if same_length and all(bibrecord._compare_fields(field1, field2, strict=True)
                           for field1, field2 in zip(fields1, fields2)):
        logger.info(' The two set of fields are exactly the same: picking the first one.')
        return (fields1, fields2)

    # second check alfa: are the two sets the same excluding the temporary
    # subfields? if so I pick the one with primary=True, if there is one
    if same_length and all(compare_fields_exclude_subfiels(field1, field2, strict=False,
                                                           exclude_subfields=TEMP_SUBFIELDS_LIST)
                           for field1, field2 in zip(fields1, fields2)):
        logger.info(' The two set of fields are the same (temporary fields excluded): proceeding with primary check')
        # if the two lists are the same even with the primary subfield,
        # then I simply return the first one (the lengths are already
        # known to be equal here)
        if all(compare_fields_exclude_subfiels(field1, field2, strict=False,
                                               exclude_subfields=[CREATION_DATE_TMP_SUBFIELD,
                                                                  MODIFICATION_DATE_TMP_SUBFIELD])
               for field1, field2 in zip(fields1, fields2)):
            logger.info(' The two set of fields are the same (extraction and modification date excluded: picking the first one')
            return (fields1, fields2)
        # otherwise I check if one of the sets is the primary one
        choice = _pick_fields_by_primary_flag(fields1, fields2)
        if choice is not None:
            return choice

    # second check: are they the same not considering the origin?
    # if so I take the first one
    if same_length and all(compare_fields_exclude_subfiels(field1, field2, strict=False,
                                                           exclude_subfields=[ORIGIN_SUBFIELD] + TEMP_SUBFIELDS_LIST)
                           for field1, field2 in zip(fields1, fields2)):
        logger.info(' The two set of fields are the same (origin excluded): picking the first one.')
        return (fields1, fields2)

    # third check: which one has more fields?
    if not same_length:
        logger.info(' The two set of fields have different length: picking the longest one.')
        return (fields1, fields2) if len(fields1) > len(fields2) else (fields2, fields1)

    # fourth check: which one has more subfields?
    # (field[0] is used as the list of subfields of a field)
    subfields1 = sum(len(field[0]) for field in fields1)
    subfields2 = sum(len(field[0]) for field in fields2)
    if subfields1 != subfields2:
        logger.info(' The two set of fields have different number of subfields: picking the set with more subfields.')
        return (fields1, fields2) if subfields1 > subfields2 else (fields2, fields1)

    # fifth check: the sum of the lengths of all the strings in the
    # subfields, temporary subfields excluded
    # (subfield[0] is used as the code, subfield[1] as the value)
    subfields_strlen1 = sum(len(subfield[1])
                            for field in fields1
                            for subfield in field[0]
                            if subfield[0] not in TEMP_SUBFIELDS_LIST)
    subfields_strlen2 = sum(len(subfield[1])
                            for field in fields2
                            for subfield in field[0]
                            if subfield[0] not in TEMP_SUBFIELDS_LIST)
    if subfields_strlen1 != subfields_strlen2:
        logger.info(' The two set of fields have subfields with different length: picking the set with longer subfields.')
        return (fields1, fields2) if subfields_strlen1 > subfields_strlen2 else (fields2, fields1)

    # sixth check: if one set of fields is the primary one I take it
    choice = _pick_fields_by_primary_flag(fields1, fields2)
    if choice is not None:
        return choice

    # seventh check: which is the newest file?
    date_subfields = (CREATION_DATE_TMP_SUBFIELD, MODIFICATION_DATE_TMP_SUBFIELD)
    try:
        # creation dates of all the fields followed by modification dates;
        # a field without one of them makes the check not applicable
        all_dates1 = [bibrecord.field_get_subfield_values(field, code)[0]
                      for code in date_subfields for field in fields1]
        all_dates2 = [bibrecord.field_get_subfield_values(field, code)[0]
                      for code in date_subfields for field in fields2]
    except IndexError:
        pass
    else:
        newest1 = max(all_dates1)
        newest2 = max(all_dates2)
        if newest1 > newest2:
            logger.info(' One set of fields is coming from a more recent file: returning this one')
            return (fields1, fields2)
        if newest2 > newest1:
            logger.info(' One set of fields is coming from a more recent file: returning this one')
            return (fields2, fields1)

    # if all checks fail I reached a granularity of problem too small to
    # make a difference, so I simply return the first one.
    # NOTE: this function never raises EqualFields; sets that are too
    # similar for an automatic choice resolve to the first one instead.
    logger.warning(' Set of fields too similar to have an automatic choice: choosing the first one.')
    return (fields1, fields2)