def compare_fieldvalues_authorname(field_comparisons, threshold, matches_needed):
    """
    Validate author-name fields by counting how many fields contain at least
    one pair of values whose name-comparison score reaches the threshold.

    Every field contributes at most one match: as soon as one value pair of a
    field scores C{>= threshold} (as reported by C{compare_names}), the field
    is counted and the remaining pairs of that field are skipped. The function
    returns early once C{matches_needed} fields have matched.

    If all fields are processed without reaching C{matches_needed}, the result
    is still positive when the ratio C{matches_found / matches_needed} is
    strictly greater than C{threshold}.

    @param field_comparisons: list of comparisons, one entry per field, each
        being an iterable of (value, other_value) pairs to compare.
    @type field_comparisons: list

    @param threshold: minimum score a name comparison must reach (C{>=}) to
        count as a match; also reused as the lower bound (C{>}) for the
        fallback match ratio.
    @type threshold: float

    @param matches_needed: number of matching fields required for an
        immediate positive result.
    @type matches_needed: int

    @return: tuple of (result, number of matched fields).
    @rtype: tuple
    """
    matched_fields = 0
    for field_pairs in field_comparisons:
        field_matched = False
        for left_value, right_value in field_pairs:
            # Build the reversed variants of both names (e.g. 'Doe,J' and
            # 'J,Doe') and pair them up. Only the first group of pairs
            # returned by get_paired_comparisons is examined.
            left_variants = get_reversed_string_variants(left_value)
            right_variants = get_reversed_string_variants(right_value)
            name_pairs = list(get_paired_comparisons(left_variants,
                                                     right_variants))[0]
            for left_name, right_name in name_pairs:
                # Author-name comparison; the first pair reaching the
                # threshold settles this field.
                if compare_names(left_name, right_name) >= threshold:
                    field_matched = True
                    break
            if field_matched:
                # No need to look at further value pairs of this field.
                break
        if field_matched:
            matched_fields += 1
        # Stop as soon as enough fields have matched.
        if matched_fields >= matches_needed:
            return True, matched_fields
    if matched_fields >= matches_needed:
        return True, matched_fields
    # Authors often do not match fully, so accept a slightly lower number of
    # matches when the matched ratio still exceeds the same threshold.
    ratio_accepted = matched_fields / float(matches_needed) > threshold
    return ratio_accepted, matched_fields
def _compare_names(bib1, bib2):
    """
    Compare the names attached to two bibrecrefs.

    Both names are looked up with get_name_by_bibrecref. When either lookup
    yields a falsy value the comparison is undecidable and '?' is returned;
    otherwise the result of compare_names(name1, name2, False) is returned
    unchanged.

    @param bib1: first reference, passed to get_name_by_bibrecref.
    @param bib2: second reference, passed to get_name_by_bibrecref.

    @return: the value of compare_names for the two names, or '?' when at
        least one name is missing.
    """
    first_name = get_name_by_bibrecref(bib1)
    second_name = get_name_by_bibrecref(bib2)
    if not (first_name and second_name):
        return '?'
    # NOTE(review): the meaning of the third positional argument (False) is
    # defined by compare_names, which is not visible here - confirm.
    return compare_names(first_name, second_name, False)
def _compare_names(bib1, bib2):
    """
    Compare the names attached to two bibrecrefs, reporting each step.

    Both names are looked up with get_name_by_bibrecref. Progress messages
    are emitted through metadata_comparison_print. When either lookup yields
    a falsy value '?' is returned; otherwise the result of
    compare_names(name1, name2, False) is reported and returned unchanged.

    @param bib1: first reference, passed to get_name_by_bibrecref.
    @param bib2: second reference, passed to get_name_by_bibrecref.

    @return: the value of compare_names for the two names, or '?' when at
        least one name is missing.
    """
    metadata_comparison_print("Comparing names.")
    first_name = get_name_by_bibrecref(bib1)
    second_name = get_name_by_bibrecref(bib2)
    found_message = " Found %s and %s" % (first_name, second_name)
    metadata_comparison_print(found_message)
    if not (first_name and second_name):
        # At least one name is missing: the comparison is undecidable.
        return '?'
    # NOTE(review): the meaning of the third positional argument (False) is
    # defined by compare_names, which is not visible here - confirm.
    score = compare_names(first_name, second_name, False)
    score_message = " cmp(%s,%s) = %s" % (first_name, second_name, str(score))
    metadata_comparison_print(score_message)
    return score
def compare_fieldvalues_authorname(field_comparisons, threshold, matches_needed):
    """
    Validate author-name fields by counting how many fields contain at least
    one pair of values whose name-comparison score reaches the threshold.

    Every field contributes at most one match: as soon as one value pair of a
    field scores C{>= threshold} (as reported by C{compare_names}), the field
    is counted and the remaining pairs of that field are skipped. The function
    returns early once C{matches_needed} fields have matched.

    If all fields are processed without reaching C{matches_needed}, the result
    is still positive when the ratio C{matches_found / matches_needed} is
    strictly greater than C{threshold}.

    @param field_comparisons: list of comparisons, one entry per field, each
        being an iterable of (value, other_value) pairs to compare.
    @type field_comparisons: list

    @param threshold: minimum score a name comparison must reach (C{>=}) to
        count as a match; also reused as the lower bound (C{>}) for the
        fallback match ratio.
    @type threshold: float

    @param matches_needed: number of matching fields required for an
        immediate positive result.
    @type matches_needed: int

    @return: tuple of (result, number of matched fields).
    @rtype: tuple
    """
    matched_fields = 0
    for field_pairs in field_comparisons:
        field_matched = False
        for left_value, right_value in field_pairs:
            # Build the reversed variants of both names (e.g. 'Doe,J' and
            # 'J,Doe') and pair them up. Only the first group of pairs
            # returned by get_paired_comparisons is examined.
            left_variants = get_reversed_string_variants(left_value)
            right_variants = get_reversed_string_variants(right_value)
            name_pairs = list(get_paired_comparisons(left_variants,
                                                     right_variants))[0]
            for left_name, right_name in name_pairs:
                # Author-name comparison; the first pair reaching the
                # threshold settles this field.
                if compare_names(left_name, right_name) >= threshold:
                    field_matched = True
                    break
            if field_matched:
                # No need to look at further value pairs of this field.
                break
        if field_matched:
            matched_fields += 1
        # Stop as soon as enough fields have matched.
        if matched_fields >= matches_needed:
            return True, matched_fields
    if matched_fields >= matches_needed:
        return True, matched_fields
    # Authors often do not match fully, so accept a slightly lower number of
    # matches when the matched ratio still exceeds the same threshold.
    ratio_accepted = matched_fields / float(matches_needed) > threshold
    return ratio_accepted, matched_fields