def surname_compatibility(sa, sb):
    '''
    Rate how well two surnames match.

    Both surnames are passed through clean_name_string (empty replacement,
    whitespace not kept) and the edit distance between the cleaned forms is
    taken relative to the length of the longer cleaned surname.

    @param sa: first surname
    @type sa: string
    @param sb: second surname
    @type sb: string

    @return: 0.0 when both cleaned surnames are empty or when the relative
        distance exceeds 0.33; otherwise 1 minus the relative distance
    @rtype: float
    '''
    name_comparison_print('|-- Comparing surnames: %s %s'% (sa,sb))
    MAX_ALLOWED_SURNAME_DISTANCE_PERCENT = 0.33

    cleaned_a = clean_name_string(sa, replacement='', keep_whitespace=False,
                                  trim_whitespaces=True)
    cleaned_b = clean_name_string(sb, replacement='', keep_whitespace=False,
                                  trim_whitespaces=True)
    edit_dist = distance(cleaned_a, cleaned_b)
    longest = float(max(len(cleaned_a), len(cleaned_b)))
    name_comparison_print('|--- dist:%s, ml:%s' % (edit_dist, longest))

    # Nothing left to compare after cleaning: treat as incompatible.
    if longest == 0:
        return 0.0

    relative_dist = edit_dist / longest
    if relative_dist > MAX_ALLOWED_SURNAME_DISTANCE_PERCENT:
        return 0.0
    return 1. - relative_dist
def full_names_are_synonymous(name1, name2, name_variations, only_names=False):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    First names are compared pairwise by position, up to the length of the
    shorter first-name list. A pair matches when both (lower-cased, cleaned)
    names occur in the same name variation or when they are equal.

    @param name1: Full Name string of the first name (w/ last name), or an
        already split name (list whose element 2 holds the first names);
        with only_names=True, the list of first names itself
    @type name1: string or list
    @param name2: Full Name string of the second name (w/ last name), or an
        already split name; with only_names=True, the list of first names
    @type name2: string or list
    @param name_variations: name variations list
    @type name_variations: list of lists
    @param only_names: if True, name1 and name2 are taken as lists of first
        names and are not split
    @type only_names: boolean

    @return: are names synonymous
    @rtype: boolean
    '''
    if only_names:
        first_names_1 = name1
        first_names_2 = name2
    else:
        if not isinstance(name1, list):
            name1 = split_name_parts(name1)
        if not isinstance(name2, list):
            name2 = split_name_parts(name2)
        first_names_1 = name1[2]
        first_names_2 = name2[2]

    # zip() stops at the shorter list, i.e. at min(len, len) pairs.
    # The cleaned names do not depend on the variation being inspected,
    # so they are computed once instead of once per variation.
    cleaned_pairs = [(clean_name_string(first.lower(), "", False, True),
                      clean_name_string(second.lower(), "", False, True))
                     for first, second in zip(first_names_1, first_names_2)]
    matches = [False] * len(cleaned_pairs)

    for nvar in name_variations:
        for i, (oname, tname) in enumerate(cleaned_pairs):
            if (oname in nvar and tname in nvar) or oname == tname:
                name_comparison_print(' ', oname, ' and ', tname, ' are synonyms!')
                matches[i] = True

        # Matches accumulate across variations; stop as soon as every
        # position has been matched.
        if all(matches):
            return True

    return False
def initials_compatibility(ia, ib):
    '''
    Rate how well two sequences of initials match.

    The result is a weighted sum of three terms: 0.8 for the overlap of the
    two sets of initials (intersection size over union size), 0.1 for the
    edit distance of the joined initials relative to the longer sequence,
    and 0.1 for position-wise mismatches, where a mismatch nearer the front
    weighs more.

    @param ia: first sequence of initials
    @param ib: second sequence of initials

    @return: compatibility score, never below 0.0
    @rtype: float
    '''
    longest = max(len(ia), len(ib))
    set_a = set(ia)
    set_b = set(ib)
    n_shared = len(set_a & set_b)
    n_total = len(set_a | set_b)
    initials_distance = distance("".join(ia), "".join(ib))

    name_comparison_print('|-- Comparing initials, %s %s' % (ia, ib))
    name_comparison_print('|--- initials distance %s' % (initials_distance))

    initials_c = float(n_shared) / float(n_total) if n_total > 0 else 1
    name_comparison_print('|--- initials c %s' % (initials_c))

    # On equal lengths ib is treated as the "longer" sequence.
    if len(ia) > len(ib):
        longer, shorter = ia, ib
    else:
        longer, shorter = ib, ia
    n_longer = len(longer)

    if longest > 0:
        # A mismatch at position pos weighs (n_longer - pos); the total is
        # normalised by the largest possible sum 1 + 2 + ... + longest.
        mismatch_weight = sum(n_longer - pos
                              for pos, (first, second)
                              in enumerate(zip(longer, shorter))
                              if first != second)
        initials_screwup = mismatch_weight / (longest * (longest + 1) / 2.0)
        initials_distance = float(initials_distance) / longest
    else:
        initials_screwup = 0
        initials_distance = 0

    name_comparison_print('|--- initials screwup, %s ' % (initials_screwup))
    name_comparison_print('|--- initials distance, %s' % (initials_distance))

    combined = (0.8 * initials_c
                + 0.1 * (1 - initials_distance)
                + 0.1 * (1 - initials_screwup))
    return max(0.0, combined)
def initials_compatibility(ia, ib):
    '''
    Rate how well two sequences of initials match.

    Three terms are combined: the overlap of the two sets of initials
    (weight 0.8), the edit distance of the joined initials relative to the
    longer sequence (weight 0.1) and a position-wise mismatch term in which
    mismatches nearer the front weigh more (weight 0.1).

    @param ia: first sequence of initials
    @param ib: second sequence of initials

    @return: compatibility score, never below 0.0
    @rtype: float
    '''
    longest = max(len(ia), len(ib))
    set_a = set(ia)
    set_b = set(ib)
    n_shared = len(set_a & set_b)
    n_total = len(set_a | set_b)
    initials_distance = distance("".join(ia), "".join(ib))

    name_comparison_print('|-- Comparing initials, %s %s' % (ia, ib))
    name_comparison_print('|--- initials distance %s' % (initials_distance))

    if n_total > 0:
        initials_c = float(n_shared) / float(n_total)
    else:
        # Neither side has initials: count the sets as fully overlapping.
        initials_c = 1
    name_comparison_print('|--- initials c %s' % (initials_c))

    # On equal lengths ib is treated as the "longer" sequence.
    if len(ia) > len(ib):
        longer, shorter = ia, ib
    else:
        longer, shorter = ib, ia
    n_longer = len(longer)

    if longest > 0:
        # A mismatch at position pos weighs (n_longer - pos); the total is
        # normalised by the largest possible sum 1 + 2 + ... + longest.
        mismatch_weight = sum(n_longer - pos
                              for pos, (first, second)
                              in enumerate(zip(longer, shorter))
                              if first != second)
        initials_screwup = mismatch_weight / (longest * (longest + 1) / 2.0)
        initials_distance = float(initials_distance) / longest
    else:
        initials_screwup = 0
        initials_distance = 0

    name_comparison_print('|--- initials screwup, %s ' % (initials_screwup))
    name_comparison_print('|--- initials distance, %s' % (initials_distance))

    return max(0.0,
               0.8 * initials_c
               + 0.1 * (1 - initials_distance)
               + 0.1 * (1 - initials_screwup))
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names.

    A surname score is computed first; it is then reduced by initials and
    first-name mismatches, raised by synonym, substring and composite-name
    bonuses, and finally subjected to the gender, surname-distance and
    initials-only penalties.

    @param origin_name: first name to compare
    @type origin_name: string
    @param target_name: second name to compare
    @type target_name: string
    @param initials_penalty: if True, the initials-only penalty is also
        applied when neither name has any first name
    @type initials_penalty: boolean

    @return: the resulting score
    @rtype: float
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    score = 0.0

    # --- surname ---------------------------------------------------------
    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    if surname_dist > 0:
        # Surnames that differ only in non-alphanumeric characters are
        # treated as identical.
        l_artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = l_artifact_removal.sub("", no[0])
        fn2 = l_artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(0.0, 0.5 - (float(surname_dist)
                                    / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one of the names has no first names.
    # only_initials_available: neither of the names has first names.
    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ', only_initials_available)

    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(
            origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    # --- initials --------------------------------------------------------
    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)

    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Position-wise mismatches, weighted so that a mismatch nearer the
        # front counts more, normalised by 1 + 2 + ... + max_n_initials.
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                                if lo - 1 - i < lt and k != alt[lo - 1 - i]]) / \
                           float(float(max_n_initials * (max_n_initials + 1)) / 2)
        # float() is required: both operands are integers, and integer
        # division would floor the relative distance to 0 in Python 2.
        initials_distance = float(initials_distance) / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    score = score - (0.75 * initials_screwup + 0.10 * (1 - initials_c)
                     + 0.15 * initials_distance) * (score)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    # --- first names -----------------------------------------------------
    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # (edit distance, longer length) for the first names sharing an index.
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]),
                               max(len(k), len(nalt[nlo - 1 - i])))
                              for i, k in enumerate(reversed(nalo))
                              if nlo - 1 - i < nlt]
        max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list]) \
                            / len(names_screwup_list)
    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    score = score - score * 0.75 * max_names_screwup - score * 0.25 * avg_names_screwup
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    # --- bonuses ---------------------------------------------------------
    if vars_eq:
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)]
                  for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        for i in synmap:
            # Synonyms found at the same position earn the larger bonus.
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        ssmap = [[i, j, names_are_substrings(i, j)]
                 for i in no[2] for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")
    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    # --- penalties -------------------------------------------------------
    if not gender_eq:
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|-- no gender penalty")
    name_comparison_print("|-- gender score: ", score)

    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score,
                              initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty",
                              initials_only, only_initials_available)

    name_comparison_print("||- final score: ", score)

    return score
from invenio.bibauthorid_general_utils import name_comparison_print try: from invenio.config import CFG_ETCDIR NO_CFG_ETCDIR = False except ImportError: NO_CFG_ETCDIR = True try: from editdist import distance except ImportError: try: from Levenshtein import distance except ImportError: name_comparison_print("Levenshtein Module not available!") def distance(s1, s2): d = {} lenstr1 = len(s1) lenstr2 = len(s2) for i in xrange(-1, lenstr1 + 1): d[(i, -1)] = i + 1 for j in xrange(-1, lenstr2 + 1): d[(-1, j)] = j + 1 for i in xrange(0, lenstr1): for j in xrange(0, lenstr2): if s1[i] == s2[j]: cost = 0 else:
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names.

    A surname score is computed first; it is then reduced by initials and
    first-name mismatches, raised by synonym, substring and composite-name
    bonuses, and finally subjected to the gender, surname-distance and
    initials-only penalties.

    @param origin_name: first name to compare
    @type origin_name: string
    @param target_name: second name to compare
    @type target_name: string
    @param initials_penalty: if True, the initials-only penalty is also
        applied when neither name has any first name
    @type initials_penalty: boolean

    @return: the resulting score
    @rtype: float
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    score = 0.0

    # --- surname ---------------------------------------------------------
    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    if surname_dist > 0:
        # Surnames that differ only in non-alphanumeric characters are
        # treated as identical.
        l_artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = l_artifact_removal.sub("", no[0])
        fn2 = l_artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(0.0, 0.5 - (float(surname_dist)
                                    / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one of the names has no first names.
    # only_initials_available: neither of the names has first names.
    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ', only_initials_available)

    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(
            origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    # --- initials --------------------------------------------------------
    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)

    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Position-wise mismatches, weighted so that a mismatch nearer the
        # front counts more, normalised by 1 + 2 + ... + max_n_initials.
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                                if lo - 1 - i < lt and k != alt[lo - 1 - i]]) / \
                           float(float(max_n_initials * (max_n_initials + 1)) / 2)
        # float() is required: both operands are integers, and integer
        # division would floor the relative distance to 0 in Python 2.
        initials_distance = float(initials_distance) / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    score = score - (0.75 * initials_screwup + 0.10 * (1 - initials_c)
                     + 0.15 * initials_distance) * (score)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    # --- first names -----------------------------------------------------
    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # (edit distance, longer length) for the first names sharing an index.
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]),
                               max(len(k), len(nalt[nlo - 1 - i])))
                              for i, k in enumerate(reversed(nalo))
                              if nlo - 1 - i < nlt]
        max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list]) \
                            / len(names_screwup_list)
    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    score = score - score * 0.75 * max_names_screwup - score * 0.25 * avg_names_screwup
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    # --- bonuses ---------------------------------------------------------
    if vars_eq:
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)]
                  for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        for i in synmap:
            # Synonyms found at the same position earn the larger bonus.
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        ssmap = [[i, j, names_are_substrings(i, j)]
                 for i in no[2] for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")
    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    # --- penalties -------------------------------------------------------
    if not gender_eq:
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|-- no gender penalty")
    name_comparison_print("|-- gender score: ", score)

    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score,
                              initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty",
                              initials_only, only_initials_available)

    name_comparison_print("||- final score: ", score)

    return score
from invenio.bibauthorid_general_utils import name_comparison_print try: from invenio.config import CFG_ETCDIR NO_CFG_ETCDIR = False except ImportError: NO_CFG_ETCDIR = True try: from editdist import distance except ImportError: try: from Levenshtein import distance except ImportError: name_comparison_print("Levenshtein Module not available!") def distance(s1, s2): d = {} lenstr1 = len(s1) lenstr2 = len(s2) for i in xrange(-1, lenstr1 + 1): d[(i, -1)] = i + 1 for j in xrange(-1, lenstr2 + 1): d[(-1, j)] = j + 1 for i in xrange(0, lenstr1): for j in xrange(0, lenstr2): if s1[i] == s2[j]: cost = 0 else: cost = 1
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names

    Both names are converted with translate_to_ascii and split into parts.
    Surname, initials and first names are scored separately and the partial
    scores are merged pairwise with x*y / sqrt(x^2 + y^2) * SQRT2.

    @param origin_name: first name to compare
    @param target_name: second name to compare
    @param initials_penalty: not used by this implementation

    @return: the combined score (0.0 when both merged scores are zero)
    @rtype: float
    '''
    def _merge_scores(x, y):
        # Dividing by sqrt(0) raises ZeroDivisionError when both scores
        # are zero; the merged score is then defined as 0.0.
        try:
            return (x*y)/sqrt(x**2+y**2)*SQRT2
        except ZeroDivisionError:
            return 0.0

    name_comparison_print("\nComparing: " , origin_name, ' ', target_name)

    origin_name = translate_to_ascii(origin_name)[0]
    target_name = translate_to_ascii(target_name)[0]

    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)
    name_comparison_print("|- splitted no: %s"% no)
    name_comparison_print("|- splitted nt: %s"% nt)

    FS_surname_score = surname_compatibility(no[0], nt[0])
    assert 0 <= FS_surname_score <= 1, "Compare_names: Surname score out of range"
    name_comparison_print("|- surname score: %s"% FS_surname_score)

    # True when at least one of the names comes without first names.
    FS_initials_only = len(no[2]) == 0 or len(nt[2]) == 0
    FS_initials_score = initials_compatibility(no[1], nt[1])
    assert 0 <= FS_initials_score <= 1, "Compare_names: initials score out of range"
    name_comparison_print('|- initials only %s'% FS_initials_only)
    name_comparison_print('|- initials score %s'% FS_initials_score)

    FS_first_names_score = compare_first_names(no[2],nt[2])
    assert 0 <= FS_first_names_score <= 1, "Compare_names: firstname score out of range"
    name_comparison_print('|- names score %s'% FS_first_names_score )

    if FS_initials_only:
        # Without first names on both sides only the initials count, damped.
        FS_ns = FS_initials_score * 0.6
    else:
        FS_ns = _merge_scores(FS_initials_score, FS_first_names_score)

    name_comparison_print('|- final scores %s %s'% (FS_surname_score, FS_ns))

    final_score = _merge_scores(FS_surname_score, FS_ns)
    name_comparison_print("|- final score is... %s" % final_score)
    return final_score
def compare_first_names(fna, fnb):
    '''
    Rate how well two lists of first names match.

    A base score is derived from edit distances between the first names
    (position by position, and by greedy closest-match pairing). Bonuses
    are then added for substring/composite and synonym matches, and a
    penalty is applied when the genders differ. The score is kept within
    0.0 and 1.0 at every adjustment.

    @param fna: first names of the first name
    @type fna: list of strings
    @param fnb: first names of the second name
    @type fnb: list of strings

    @return: compatibility score
    @rtype: float
    '''
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    # True when at least one side has no first names at all.
    initials_only = ((min(len(fna), len(fnb))) == 0)
    name_comparison_print('|-- Comparing names %s %s' % (fna,fnb))

    if len(fna) > 0 and len(fnb) > 0:
        gender_eq = full_names_are_equal_gender(fna, fnb, gendernames, only_names=True)
    else:
        # None: gender was not compared, so no gender penalty applies.
        gender_eq = None
    name_comparison_print("|--- gender equal: %s" % gender_eq)

    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(fna,fnb, only_names=True)
    name_comparison_print("|--- equal composites: %s" % names_are_equal_composites)

    vars_eq = full_names_are_synonymous(fna, fnb, name_variations, only_names=True)
    substr_eq = full_names_are_substrings(fna, fnb, only_names=True)
    name_comparison_print("|--- synonims: %s" % vars_eq)
    name_comparison_print("|--- substrings: %s" % substr_eq)

    if not initials_only:
        # On equal lengths fnb is treated as the "longer" list.
        if len(fna) > len(fnb):
            nalo, nalt = fna, fnb
        else:
            nalo, nalt = fnb, fna

        # (edit distance, longer length) for the names sharing an index,
        # last shared index first.
        names_screwup_list = [(distance(first, second), max(len(first), len(second)))
                              for first, second in reversed(list(zip(nalo, nalt)))]

        def _min_names_screwup_list(longer, shorter):
            # Greedy pairing: each name of the longer list takes the closest
            # name still available in the shorter one, until it runs out.
            remaining = list(shorter)
            pairs = []
            for name in longer:
                longest = max(len(name), max((len(k) for k in remaining)))
                all_scr = [distance(name, k) for k in remaining]
                closest = min(all_scr)
                pairs.append((closest, longest))
                remaining.pop(all_scr.index(closest))
                if not remaining:
                    break
            return pairs

        # Relative distances, computed once and reused below.
        positional_ratios = [float(dist) / length
                             for dist, length in names_screwup_list]
        greedy_ratios = [float(dist) / length
                         for dist, length in _min_names_screwup_list(nalo, nalt)]

        max_names_screwup = max(positional_ratios)
        min_names_screwup = min(greedy_ratios)
        avg_names_screwup = (sum(positional_ratios) / len(positional_ratios)
                             + sum(greedy_ratios) / len(greedy_ratios)) / 2
    else:
        max_names_screwup = 0
        min_names_screwup = 0
        avg_names_screwup = 0

    name_comparison_print('|--- screwups min, max, avg: %s %s %s'
                          % (str(min_names_screwup), str(max_names_screwup),
                             str(avg_names_screwup)))

    # Kept for the substring bonus, which uses the unforced value.
    orig_max_names_screwup = max_names_screwup
    if max_names_screwup > 0.1:
        name_comparison_print("|--- forcing names screwup to one!")
        max_names_screwup = 1
        min_names_screwup = 1
        avg_names_screwup = 1

    name_comparison_print("|--- min screwup: %s" % min_names_screwup)
    name_comparison_print("|--- max screwup: %s" % max_names_screwup)
    name_comparison_print("|--- avg screwup: %s" % avg_names_screwup)

    compat_score = max(1 - (0.25 * max_names_screwup
                            + 0.5 * avg_names_screwup
                            + 0.25 * min_names_screwup), 0.0)
    name_comparison_print("|--- Name compatibility score: %s" % compat_score)

    if substr_eq:
        if names_are_equal_composites:
            bonus = 0.7
        else:
            bonus = max(0., (1 - orig_max_names_screwup) * 0.75)
        compat_score = min(1.0, compat_score + bonus)
    name_comparison_print("|--- names are equal composites and subtring bonus: %s"% compat_score)

    if vars_eq:
        compat_score = min(1.0, compat_score + 0.5)
    name_comparison_print("|--- synonims bonus: %s"% compat_score)

    if gender_eq is not None and not gender_eq:
        compat_score = max(0.0, compat_score * 0.25)
    name_comparison_print("|--- Different Gender penalty: %s"% compat_score)

    return compat_score
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names

    Both names are converted with translate_to_ascii and split into parts.
    Surname, initials and first names are scored separately and the partial
    scores are merged pairwise with x*y / sqrt(x^2 + y^2) * SQRT2.

    @param origin_name: first name to compare
    @param target_name: second name to compare
    @param initials_penalty: not used by this implementation

    @return: the combined score (0.0 when both merged scores are zero)
    @rtype: float
    '''
    def _merge_scores(x, y):
        # Dividing by sqrt(0) raises ZeroDivisionError when both scores
        # are zero; the merged score is then defined as 0.0.
        try:
            return (x * y) / sqrt(x**2 + y**2) * SQRT2
        except ZeroDivisionError:
            return 0.0

    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)

    origin_name = translate_to_ascii(origin_name)[0]
    target_name = translate_to_ascii(target_name)[0]

    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)
    name_comparison_print("|- splitted no: %s" % no)
    name_comparison_print("|- splitted nt: %s" % nt)

    FS_surname_score = surname_compatibility(no[0], nt[0])
    assert 0 <= FS_surname_score <= 1, "Compare_names: Surname score out of range"
    name_comparison_print("|- surname score: %s" % FS_surname_score)

    # True when at least one of the names comes without first names.
    FS_initials_only = len(no[2]) == 0 or len(nt[2]) == 0
    FS_initials_score = initials_compatibility(no[1], nt[1])
    assert 0 <= FS_initials_score <= 1, "Compare_names: initials score out of range"
    name_comparison_print('|- initials only %s' % FS_initials_only)
    name_comparison_print('|- initials score %s' % FS_initials_score)

    FS_first_names_score = compare_first_names(no[2], nt[2])
    assert 0 <= FS_first_names_score <= 1, "Compare_names: firstname score out of range"
    name_comparison_print('|- names score %s' % FS_first_names_score)

    if FS_initials_only:
        # Without first names on both sides only the initials count, damped.
        FS_ns = FS_initials_score * 0.6
    else:
        FS_ns = _merge_scores(FS_initials_score, FS_first_names_score)

    name_comparison_print('|- final scores %s %s' % (FS_surname_score, FS_ns))

    final_score = _merge_scores(FS_surname_score, FS_ns)
    name_comparison_print("|- final score is... %s" % final_score)
    return final_score
def compare_first_names(fna, fnb):
    '''
    Rate how well two lists of first names match.

    A base score is derived from edit distances between the first names
    (position by position, and by greedy closest-match pairing). Bonuses
    are then added for substring/composite and synonym matches, and a
    penalty is applied when the genders differ. The score is kept within
    0.0 and 1.0 at every adjustment.

    @param fna: first names of the first name
    @type fna: list of strings
    @param fnb: first names of the second name
    @type fnb: list of strings

    @return: compatibility score
    @rtype: float
    '''
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    # True when at least one side has no first names at all.
    initials_only = ((min(len(fna), len(fnb))) == 0)
    name_comparison_print('|-- Comparing names %s %s' % (fna, fnb))

    if len(fna) > 0 and len(fnb) > 0:
        gender_eq = full_names_are_equal_gender(fna, fnb, gendernames,
                                                only_names=True)
    else:
        # None: gender was not compared, so no gender penalty applies.
        gender_eq = None
    name_comparison_print("|--- gender equal: %s" % gender_eq)

    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(
            fna, fnb, only_names=True)
    name_comparison_print("|--- equal composites: %s" % names_are_equal_composites)

    vars_eq = full_names_are_synonymous(fna, fnb, name_variations,
                                        only_names=True)
    substr_eq = full_names_are_substrings(fna, fnb, only_names=True)
    name_comparison_print("|--- synonims: %s" % vars_eq)
    name_comparison_print("|--- substrings: %s" % substr_eq)

    if not initials_only:
        # On equal lengths fnb is treated as the "longer" list.
        if len(fna) > len(fnb):
            nalo, nalt = fna, fnb
        else:
            nalo, nalt = fnb, fna

        # (edit distance, longer length) for the names sharing an index,
        # last shared index first.
        names_screwup_list = [(distance(first, second),
                               max(len(first), len(second)))
                              for first, second
                              in reversed(list(zip(nalo, nalt)))]

        def _min_names_screwup_list(longer, shorter):
            # Greedy pairing: each name of the longer list takes the closest
            # name still available in the shorter one, until it runs out.
            remaining = list(shorter)
            pairs = []
            for name in longer:
                longest = max(len(name), max((len(k) for k in remaining)))
                all_scr = [distance(name, k) for k in remaining]
                closest = min(all_scr)
                pairs.append((closest, longest))
                remaining.pop(all_scr.index(closest))
                if not remaining:
                    break
            return pairs

        # Relative distances, computed once and reused below.
        positional_ratios = [float(dist) / length
                             for dist, length in names_screwup_list]
        greedy_ratios = [float(dist) / length
                         for dist, length
                         in _min_names_screwup_list(nalo, nalt)]

        max_names_screwup = max(positional_ratios)
        min_names_screwup = min(greedy_ratios)
        avg_names_screwup = (sum(positional_ratios) / len(positional_ratios)
                             + sum(greedy_ratios) / len(greedy_ratios)) / 2
    else:
        max_names_screwup = 0
        min_names_screwup = 0
        avg_names_screwup = 0

    name_comparison_print('|--- screwups min, max, avg: %s %s %s'
                          % (str(min_names_screwup), str(max_names_screwup),
                             str(avg_names_screwup)))

    # Kept for the substring bonus, which uses the unforced value.
    orig_max_names_screwup = max_names_screwup
    if max_names_screwup > 0.1:
        name_comparison_print("|--- forcing names screwup to one!")
        max_names_screwup = 1
        min_names_screwup = 1
        avg_names_screwup = 1

    name_comparison_print("|--- min screwup: %s" % min_names_screwup)
    name_comparison_print("|--- max screwup: %s" % max_names_screwup)
    name_comparison_print("|--- avg screwup: %s" % avg_names_screwup)

    compat_score = max(1 - (0.25 * max_names_screwup
                            + 0.5 * avg_names_screwup
                            + 0.25 * min_names_screwup), 0.0)
    name_comparison_print("|--- Name compatibility score: %s" % compat_score)

    if substr_eq:
        if names_are_equal_composites:
            bonus = 0.7
        else:
            bonus = max(0., (1 - orig_max_names_screwup) * 0.75)
        compat_score = min(1.0, compat_score + bonus)
    name_comparison_print(
        "|--- names are equal composites and subtring bonus: %s" % compat_score)

    if vars_eq:
        compat_score = min(1.0, compat_score + 0.5)
    name_comparison_print("|--- synonims bonus: %s" % compat_score)

    if gender_eq is not None and not gender_eq:
        compat_score = max(0.0, compat_score * 0.25)
    name_comparison_print("|--- Different Gender penalty: %s" % compat_score)

    return compat_score