def full_names_are_synonymous(name1, name2, name_variations):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    The given names of both arguments are compared position by position
    (first with first, second with second, ...), up to the length of the
    shorter list. A position matches when both cleaned, lower-cased names
    occur in the same variation group or when they are equal.

    @param name1: Full Name string of the first name (w/ last name), or an
        already split name whose element at index 2 holds the given names
    @type name1: string or list
    @param name2: Full Name string of the second name (w/ last name), or an
        already split name whose element at index 2 holds the given names
    @type name2: string or list
    @param name_variations: name variations list
    @type name_variations: list of lists

    @return: are names synonymous. Note that an empty name_variations
        always yields False (even for identical names), and that names
        without any comparable given-name pair yield True as soon as
        name_variations is non-empty.
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    # Without variation groups the comparison loop never runs, so nothing
    # can be flagged as synonymous; skip the name normalisation entirely.
    if not name_variations:
        return False

    # Normalise every positional pair exactly once: the cleaned names do not
    # depend on the variation group being inspected. zip() stops at the
    # shorter list of given names.
    name_pairs = [
        (clean_name_string(first.lower(), "", False, True),
         clean_name_string(second.lower(), "", False, True))
        for first, second in zip(name1[2], name2[2])
    ]
    matches = [False] * len(name_pairs)

    for nvar in name_variations:
        for position, (oname, tname) in enumerate(name_pairs):
            if (oname in nvar and tname in nvar) or oname == tname:
                name_comparison_print(' ', oname, ' and ', tname, ' are synonyms!')
                matches[position] = True

        # Matches accumulate across variation groups; stop as soon as every
        # position has been matched by some group.
        if all(matches):
            return True

    return False
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names and compute a similarity score.

    Both names are split with split_name_parts; of each result, index 0 is
    used as the surname, index 1 as the list of initials and index 2 as the
    list of given names. The score is then built in stages:

      1. surname: 1.0 if the surnames are equal (also after stripping
         non-alphanumeric characters), otherwise
         max(0, 0.5 - distance / MAX_ALLOWED_SURNAME_DISTANCE)
      2. initials: proportional reduction based on positional mismatches,
         set overlap and edit distance of the initials
      3. given names: proportional reduction based on the maximum and the
         average relative edit distance of aligned given names
      4. bonuses for synonymous names, substring names and composite names
      5. penalties: division by 3 on gender mismatch, reset to 0.0 if the
         surname distance exceeds MAX_ALLOWED_SURNAME_DISTANCE, and a 10%
         cut when at least one side has no given names

    Every stage is reported through name_comparison_print.

    @param origin_name: first name to compare
    @type origin_name: string
    @param target_name: second name to compare
    @type target_name: string
    @param initials_penalty: apply the 10% initials-only penalty even when
        neither name has any given names
    @type initials_penalty: boolean

    @return: similarity score of the two names
    @rtype: float
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    # ---- Stage 1: surname -------------------------------------------------
    score = 0.0

    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    if surname_dist > 0:
        # Surnames that only differ by punctuation/whitespace artifacts are
        # treated as identical.
        artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = artifact_removal.sub("", no[0])
        fn2 = artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(0.0, 0.5 - (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one side has no given names at all.
    # only_initials_available: neither side has any given names.
    initials_only = (min(len(no[2]), len(nt[2])) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ', only_initials_available)

    # Only reported in the debug output; the value used for scoring is
    # computed from the split names further down (composits_eq).
    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    # ---- Stage 2: initials ------------------------------------------------
    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)

    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        # Intersection-over-union ratio of the two sets of initials.
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    # alo is the longer list of initials, alt the other one (nt wins ties).
    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Walk the longer list from its end; a mismatch at index
        # lo - 1 - i is weighted i + 1, so mismatches near the front of
        # the list weigh the most. Normalised by the maximum total weight
        # 1 + 2 + ... + max_n_initials.
        weighted_mismatches = sum(
            i + 1
            for i, k in enumerate(reversed(alo))
            if lo - 1 - i < lt and k != alt[lo - 1 - i])
        initials_screwup = weighted_mismatches / (max_n_initials * (max_n_initials + 1) / 2.0)
        # Force float division: dividing two ints would truncate the
        # normalised distance to 0 in almost every case under Python 2.
        initials_distance = float(initials_distance) / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    score = score - (0.75 * initials_screwup
                     + 0.10 * (1 - initials_c)
                     + 0.15 * initials_distance) * score
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    # ---- Stage 3: given names ---------------------------------------------
    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        # nalo is the longer list of given names, nalt the other one
        # (nt wins ties).
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # (edit distance, length of the longer name) for every index that
        # exists in both lists.
        names_screwup_list = [
            (distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i])))
            for i, k in enumerate(reversed(nalo))
            if nlo - 1 - i < nlt]
        relative_screwups = [float(dist) / length for dist, length in names_screwup_list]
        max_names_screwup = max(relative_screwups)
        avg_names_screwup = sum(relative_screwups) / len(relative_screwups)
    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    score = score - score * 0.75 * max_names_screwup - score * 0.25 * avg_names_screwup
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    # ---- Stage 4: bonuses -------------------------------------------------
    if vars_eq:
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)]
                  for i in no[2] for j in nt[2]]
        # Kept as an equality test against True to preserve the exact
        # filter of the original implementation.
        synmap = [entry for entry in synmap if entry[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        for entry in synmap:
            # Synonyms found at the same position close half of the gap to
            # 1.0, synonyms at different positions only 15% of it.
            if no[2].index(entry[0]) == nt[2].index(entry[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        ssmap = [[i, j, names_are_substrings(i, j)]
                 for i in no[2] for j in nt[2]]
        ssmap = [entry for entry in ssmap if entry[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for entry in ssmap:
            if no[2].index(entry[0]) == nt[2].index(entry[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")
    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    # ---- Stage 5: penalties -----------------------------------------------
    if not gender_eq:
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|-- no gender penalty")
    name_comparison_print("|-- gender score: ", score)

    # Surnames that are too far apart override everything gained above.
    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score, initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty", initials_only, only_initials_available)

    name_comparison_print("||- final score: ", score)

    return score
from bibauthorid_general_utils import name_comparison_print try: from invenio.config import CFG_ETCDIR NO_CFG_ETCDIR = False except ImportError: NO_CFG_ETCDIR = True try: from editdist import distance except ImportError: try: from Levenshtein import distance except ImportError: name_comparison_print("Levenshtein Module not available!") def distance(s1, s2): d = {} lenstr1 = len(s1) lenstr2 = len(s2) for i in xrange(-1, lenstr1 + 1): d[(i, -1)] = i + 1 for j in xrange(-1, lenstr2 + 1): d[(-1, j)] = j + 1 for i in xrange(0, lenstr1): for j in xrange(0, lenstr2): if s1[i] == s2[j]: cost = 0 else: cost = 1