Пример #1
0
def surname_compatibility(sa, sb):
    '''
    Score how well two surnames agree, based on their edit distance.

    Both surnames are first normalised with clean_name_string. The edit
    distance between the cleaned surnames is divided by the length of the
    longer one; when that ratio is above 0.33, or when both cleaned
    surnames are empty, the surnames are treated as incompatible.

    @param sa: first surname
    @type sa: string
    @param sb: second surname
    @type sb: string

    @return: 0.0 when incompatible, otherwise 1 - distance / longest length
    @rtype: float
    '''
    name_comparison_print('|-- Comparing surnames: %s %s' % (sa, sb))
    max_distance_ratio = 0.33

    cleaned_a = clean_name_string(sa, replacement='', keep_whitespace=False,
                                  trim_whitespaces=True)
    cleaned_b = clean_name_string(sb, replacement='', keep_whitespace=False,
                                  trim_whitespaces=True)

    edit_distance = distance(cleaned_a, cleaned_b)
    longest = float(max(len(cleaned_a), len(cleaned_b)))
    name_comparison_print('|--- dist:%s, ml:%s' % (edit_distance, longest))

    # Nothing left to compare after cleaning: no evidence of compatibility.
    if longest == 0:
        return 0.0

    ratio = edit_distance / longest
    if ratio > max_distance_ratio:
        return 0.0
    return 1. - ratio
def surname_compatibility(sa, sb):
    '''
    Return a compatibility score for two surnames.

    The surnames are cleaned with clean_name_string and compared by edit
    distance. The distance is normalised by the length of the longer
    cleaned surname. A normalised distance above 0.33 (or two empty cleaned
    surnames) yields 0.0; otherwise the score is one minus the normalised
    distance.

    @param sa: first surname
    @type sa: string
    @param sb: second surname
    @type sb: string

    @return: surname compatibility score
    @rtype: float
    '''
    name_comparison_print('|-- Comparing surnames: %s %s' % (sa, sb))
    threshold = 0.33

    first = clean_name_string(sa, replacement='', keep_whitespace=False,
                              trim_whitespaces=True)
    second = clean_name_string(sb, replacement='', keep_whitespace=False,
                               trim_whitespaces=True)
    changes = distance(first, second)
    longest_length = float(max(len(first), len(second)))
    name_comparison_print('|--- dist:%s, ml:%s' % (changes, longest_length))

    # The length check comes first so the division is never done on zero.
    too_different = longest_length == 0 or changes / longest_length > threshold
    if too_different:
        return 0.0
    return 1. - float(changes) / longest_length
Пример #3
0
def full_names_are_synonymous(name1, name2, name_variations, only_names=False):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    First names are compared position by position, up to the length of the
    shorter first-name list. A position matches when the two cleaned,
    lower-cased names are equal or when both appear in the same group of
    name_variations. The names are synonymous when every compared position
    matches; with no first names to compare the result is True.

    @param name1: full name string, or the list returned by
        split_name_parts (its element 2 is read as the first names); when
        only_names is set, the list of first names itself
    @type name1: string or list
    @param name2: second name, same conventions as name1
    @type name2: string or list
    @param name_variations: name variations list
    @type name_variations: list of lists
    @param only_names: name1 and name2 already are lists of first names
    @type only_names: boolean

    @return: are names synonymous
    @rtype: boolean
    '''
    if only_names:
        first_names1, first_names2 = name1, name2
    else:
        if not isinstance(name1, list):
            name1 = split_name_parts(name1)

        if not isinstance(name2, list):
            name2 = split_name_parts(name2)

        first_names1, first_names2 = name1[2], name2[2]

    # Clean every name once instead of once per variation group; zip()
    # stops at the shorter list, which bounds the number of comparisons.
    pairs = [(clean_name_string(oname.lower(), "", False, True),
              clean_name_string(tname.lower(), "", False, True))
             for oname, tname in zip(first_names1, first_names2)]

    for oname, tname in pairs:
        # Equality is checked outside the variations scan so that identical
        # names match even when name_variations is empty.
        if oname != tname:
            in_same_group = any(oname in nvar and tname in nvar
                                for nvar in name_variations)
            if not in_same_group:
                return False

        name_comparison_print('      ', oname, ' and ', tname,
                              ' are synonyms!')

    return True
Пример #4
0
def full_names_are_synonymous(name1, name2, name_variations, only_names=False):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    The first names of both names are paired by position (the longer list
    is cut to the length of the shorter one). A pair matches when the
    cleaned, lower-cased names are equal or share a group in
    name_variations. All pairs must match; if there are no pairs at all
    the result is True.

    @param name1: full name string, or the list returned by
        split_name_parts (its element 2 is read as the first names); when
        only_names is set, the list of first names itself
    @type name1: string or list
    @param name2: second name, same conventions as name1
    @type name2: string or list
    @param name_variations: name variations list
    @type name_variations: list of lists
    @param only_names: name1 and name2 already are lists of first names
    @type only_names: boolean

    @return: are names synonymous
    @rtype: boolean
    '''
    if not only_names:
        if not isinstance(name1, list):
            name1 = split_name_parts(name1)

        if not isinstance(name2, list):
            name2 = split_name_parts(name2)

        name1, name2 = name1[2], name2[2]

    for raw_oname, raw_tname in zip(name1, name2):
        # Cleaning does not depend on the variation group, so it is done
        # once per pair rather than once per group.
        oname = clean_name_string(raw_oname.lower(), "", False, True)
        tname = clean_name_string(raw_tname.lower(), "", False, True)

        # Identical names match on their own; this must not depend on
        # name_variations containing at least one group.
        matched = oname == tname
        if not matched:
            for nvar in name_variations:
                if oname in nvar and tname in nvar:
                    matched = True
                    break

        if not matched:
            return False

        name_comparison_print('      ', oname, ' and ', tname, ' are synonyms!')

    return True
Пример #5
0
def initials_compatibility(ia, ib):
    '''
    Score the agreement between two sequences of initials.

    Three measures are combined:
      - the share of distinct initials common to both sequences
        (intersection over union, 1 when both are empty), weight 0.8;
      - the edit distance between the joined initials, divided by the
        length of the longer sequence, weight 0.1;
      - a positional mismatch measure in which a mismatch at an earlier
        position counts more than one at a later position, weight 0.1.

    @param ia: first sequence of initials
    @type ia: list of strings
    @param ib: second sequence of initials
    @type ib: list of strings

    @return: compatibility score, never below 0.0
    @rtype: float
    '''
    longest = max(len(ia), len(ib))
    distinct_a = set(ia)
    distinct_b = set(ib)
    shared_count = len(distinct_a & distinct_b)
    total_count = len(distinct_a | distinct_b)
    edit_distance = distance("".join(ia), "".join(ib))

    name_comparison_print('|-- Comparing initials, %s %s' % (ia, ib))
    name_comparison_print('|--- initials distance %s' % edit_distance)

    if total_count > 0:
        overlap = float(shared_count) / float(total_count)
    else:
        overlap = 1

    name_comparison_print('|--- initials c %s' % overlap)

    # On equal lengths ib plays the role of the longer sequence.
    if len(ia) > len(ib):
        longer, shorter = ia, ib
    else:
        longer, shorter = ib, ia

    if longest > 0:
        # A mismatch at position p weighs len(longer) - p, so the first
        # initial carries the largest weight.
        weighted_mismatches = 0
        for position, initial in enumerate(shorter):
            if longer[position] != initial:
                weighted_mismatches += len(longer) - position
        # Normalise by the largest possible sum: 1 + 2 + ... + longest.
        screwup = weighted_mismatches / float(float(longest * (longest + 1)) / 2)
        normalised_distance = float(edit_distance) / longest
    else:
        screwup = 0
        normalised_distance = 0

    name_comparison_print('|--- initials screwup, %s ' % screwup)
    name_comparison_print('|--- initials distance, %s' % normalised_distance)

    return max(
        0.0,
        0.8 * overlap + 0.1 * (1 - normalised_distance) + 0.1 * (1 - screwup))
Пример #6
0
def initials_compatibility(ia, ib):
    '''
    Return a compatibility score for two sequences of initials.

    The score is 0.8 * (shared distinct initials / all distinct initials)
    + 0.1 * (1 - edit distance of the joined initials / longer length)
    + 0.1 * (1 - weighted positional mismatch), floored at 0.0. Two empty
    sequences get full marks on every component.

    @param ia: first sequence of initials
    @type ia: list of strings
    @param ib: second sequence of initials
    @type ib: list of strings

    @return: compatibility score, never below 0.0
    @rtype: float
    '''
    n_longest = max(len(ia), len(ib))
    n_common = len(set(ia).intersection(set(ib)))
    n_all = len(set(ia).union(set(ib)))
    raw_distance = distance("".join(ia), "".join(ib))

    name_comparison_print('|-- Comparing initials, %s %s' % (ia, ib))
    name_comparison_print('|--- initials distance %s' % raw_distance)

    jaccard = float(n_common) / float(n_all) if n_all > 0 else 1

    name_comparison_print('|--- initials c %s' % jaccard)

    # Stable sort by length: on a tie ia stays first and is the "shorter".
    shorter, longer = sorted((ia, ib), key=len)

    if n_longest > 0:
        # Pair initials by position; earlier positions weigh more.
        mismatch_weight = sum(len(longer) - index
                              for index, (left, right)
                              in enumerate(zip(longer, shorter))
                              if left != right)
        max_weight = float(float(n_longest * (n_longest + 1)) / 2)
        positional_penalty = mismatch_weight / max_weight
        scaled_distance = float(raw_distance) / n_longest
    else:
        positional_penalty = 0
        scaled_distance = 0

    name_comparison_print('|--- initials screwup, %s ' % positional_penalty)
    name_comparison_print('|--- initials distance, %s' % scaled_distance)

    combined = (0.8 * jaccard + 0.1 * (1 - scaled_distance)
                + 0.1 * (1 - positional_penalty))
    return max(0.0, combined)
Пример #7
0
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names and return a similarity score.

    The score starts from the surname comparison, is reduced by
    disagreements between the initials and between the first names, is
    raised by synonym, substring and composite-name matches, and is finally
    cut by the gender, surname-distance and initials-only penalties.

    @param origin_name: first name to compare (passed to split_name_parts)
    @type origin_name: string
    @param target_name: second name to compare (passed to split_name_parts)
    @type target_name: string
    @param initials_penalty: apply the initials-only penalty even when
        neither name carries a full first name
    @type initials_penalty: boolean

    @return: similarity score
    @rtype: float
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    # Index 0 is used as the surname, 1 as the initials, 2 as first names.
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    score = 0.0

    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    if surname_dist > 0:
        # Surnames differing only in non-alphanumeric characters count as
        # identical.
        l_artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = l_artifact_removal.sub("", no[0])
        fn2 = l_artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(
                0.0, 0.5 -
                (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one name has no full first name.
    # only_initials_available: neither name has one.
    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ',
                          only_initials_available)

    # This value is only reported; the score uses composits_eq below.
    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(
            origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)

    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Positional mismatches between the initials, weighted so that
        # earlier positions count more, normalised by 1 + 2 + ... + n.
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                            if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \
                            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        # float() forces true division: with two integers Python 2 floors
        # the quotient (1 / 2 == 0) and the distance penalty is lost.
        initials_distance = float(initials_distance) / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    score = score - (0.75 * initials_screwup + 0.10 * (1 - initials_c)\
            + 0.15 * initials_distance) * (score)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # (edit distance, longer length) for first names at equal positions.
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i])))
                             for i, k in enumerate(reversed(nalo)) \
                             if nlo - 1 - i < nlt]
        max_names_screwup = max(
            [float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list])\
                            / len(names_screwup_list)

    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    score = score - score * 0.75 * max_names_screwup - score * 0.25 * avg_names_screwup
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    if vars_eq:
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)]
                  for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        # Synonyms at the same position close half of the gap to 1.0,
        # synonyms at different positions only 15% of it.
        for i in synmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2]
                 for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")

    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    if not gender_eq:
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|--   no  gender penalty")

    name_comparison_print("|-- gender score: ", score)

    # Surnames further apart than the allowed distance veto the match,
    # whatever bonuses were collected above.
    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score,
                              initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty", initials_only,
                              only_initials_available)

    name_comparison_print("||- final score:  ", score)

    return score
Пример #8
0
from invenio.bibauthorid_general_utils import name_comparison_print

try:
    from invenio.config import CFG_ETCDIR
    NO_CFG_ETCDIR = False
except ImportError:
    NO_CFG_ETCDIR = True

try:
    from editdist import distance
except ImportError:
    try:
        from Levenshtein import distance
    except ImportError:
        name_comparison_print("Levenshtein Module not available!")

        def distance(s1, s2):
            d = {}
            lenstr1 = len(s1)
            lenstr2 = len(s2)
            for i in xrange(-1, lenstr1 + 1):
                d[(i, -1)] = i + 1
            for j in xrange(-1, lenstr2 + 1):
                d[(-1, j)] = j + 1

            for i in xrange(0, lenstr1):
                for j in xrange(0, lenstr2):
                    if s1[i] == s2[j]:
                        cost = 0
                    else:
Пример #9
0
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names and return a similarity score.

    The score starts from the surname comparison, is reduced by
    disagreements between the initials and between the first names, is
    raised by synonym, substring and composite-name matches, and is finally
    cut by the gender, surname-distance and initials-only penalties.

    @param origin_name: first name to compare (passed to split_name_parts)
    @type origin_name: string
    @param target_name: second name to compare (passed to split_name_parts)
    @type target_name: string
    @param initials_penalty: apply the initials-only penalty even when
        neither name carries a full first name
    @type initials_penalty: boolean

    @return: similarity score
    @rtype: float
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: " , origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    # Index 0 is used as the surname, 1 as the initials, 2 as first names.
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    score = 0.0

    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    if surname_dist > 0:
        # Surnames differing only in non-alphanumeric characters count as
        # identical.
        l_artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = l_artifact_removal.sub("", no[0])
        fn2 = l_artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(0.0, 0.5 - (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one name has no full first name.
    # only_initials_available: neither name has one.
    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ', only_initials_available)

    # This value is only reported; the score uses composits_eq below.
    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)


    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Positional mismatches between the initials, weighted so that
        # earlier positions count more, normalised by 1 + 2 + ... + n.
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                            if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \
                            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        # float() forces true division: with two integers Python 2 floors
        # the quotient (1 / 2 == 0) and the distance penalty is lost.
        initials_distance = float(initials_distance) / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    score = score - (0.75 * initials_screwup + 0.10 * (1 - initials_c)\
            + 0.15 * initials_distance) * (score)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # (edit distance, longer length) for first names at equal positions.
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i])))
                             for i, k in enumerate(reversed(nalo)) \
                             if nlo - 1 - i < nlt]
        max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list])\
                            / len(names_screwup_list)

    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    score = score - score * 0.75 * max_names_screwup - score * 0.25 * avg_names_screwup
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    if vars_eq:
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)] for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        # Synonyms at the same position close half of the gap to 1.0,
        # synonyms at different positions only 15% of it.
        for i in synmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2] for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")

    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    if not gender_eq:
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|--   no  gender penalty")

    name_comparison_print("|-- gender score: ", score)

    # Surnames further apart than the allowed distance veto the match,
    # whatever bonuses were collected above.
    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score, initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty", initials_only, only_initials_available)

    name_comparison_print("||- final score:  ", score)

    return score
Пример #10
0
from invenio.bibauthorid_general_utils import name_comparison_print

try:
    from invenio.config import CFG_ETCDIR
    NO_CFG_ETCDIR = False
except ImportError:
    NO_CFG_ETCDIR = True

try:
    from editdist import distance
except ImportError:
    try:
        from Levenshtein import distance
    except ImportError:
        name_comparison_print("Levenshtein Module not available!")
        def distance(s1, s2):
            d = {}
            lenstr1 = len(s1)
            lenstr2 = len(s2)
            for i in xrange(-1, lenstr1 + 1):
                d[(i, -1)] = i + 1
            for j in xrange(-1, lenstr2 + 1):
                d[(-1, j)] = j + 1

            for i in xrange(0, lenstr1):
                for j in xrange(0, lenstr2):
                    if s1[i] == s2[j]:
                        cost = 0
                    else:
                        cost = 1
Пример #11
0
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names and return a similarity score.

    Both names are converted with translate_to_ascii and split with
    split_name_parts. Surname, initials and first names are scored
    separately; the initials and first-name scores are merged into one
    "given names" score (or, when at least one name has no full first
    name, the initials score scaled by 0.6), which is then merged with the
    surname score. Two scores x and y are merged as
    x * y / sqrt(x^2 + y^2) * SQRT2, or 0.0 when both are zero.

    @param origin_name: first name to compare
    @type origin_name: string
    @param target_name: second name to compare
    @type target_name: string
    @param initials_penalty: not used by this implementation
    @type initials_penalty: boolean

    @return: similarity score
    @rtype: float
    '''

    def _merge(first, second):
        # Both inputs being zero makes the denominator zero.
        try:
            return (first * second) / sqrt(first**2 + second**2) * SQRT2
        except ZeroDivisionError:
            return 0.0

    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)

    origin_parts = split_name_parts(translate_to_ascii(origin_name)[0],
                                    True, "", True)
    target_parts = split_name_parts(translate_to_ascii(target_name)[0],
                                    True, "", True)

    name_comparison_print("|- splitted no: %s" % origin_parts)
    name_comparison_print("|- splitted nt: %s" % target_parts)

    surname_score = surname_compatibility(origin_parts[0], target_parts[0])
    assert 0 <= surname_score <= 1, "Compare_names: Surname score out of range"
    name_comparison_print("|- surname score: %s" % surname_score)

    # True when at least one of the names has no full first name.
    initials_only = min(len(origin_parts[2]), len(target_parts[2])) == 0
    initials_score = initials_compatibility(origin_parts[1], target_parts[1])
    assert 0 <= initials_score <= 1, "Compare_names: initials score out of range"

    name_comparison_print('|- initials only %s' % initials_only)
    name_comparison_print('|- initials score %s' % initials_score)

    first_names_score = compare_first_names(origin_parts[2], target_parts[2])
    assert 0 <= first_names_score <= 1, "Compare_names: firstname score out of range"
    name_comparison_print('|- names score %s' % first_names_score)

    if initials_only:
        given_names_score = initials_score * 0.6
    else:
        given_names_score = _merge(initials_score, first_names_score)

    name_comparison_print('|- final scores %s %s' % (surname_score, given_names_score))

    final_score = _merge(surname_score, given_names_score)

    name_comparison_print("|- final score is... %s" % final_score)
    return final_score
Пример #12
0
def compare_first_names(fna, fnb):
    '''
    Score the compatibility of two lists of first names.

    When at least one list is empty the base score is 1. Otherwise the
    score is built from normalised edit distances ("screwups"):
      - positional: names at the same position in both lists;
      - greedy: every name of the longer list paired with its closest
        still-unpaired name of the shorter list.
    A positional maximum above 0.1 forces all three measures to 1, i.e. a
    base score of 0. Bonuses are then added for substring and synonym
    matches (capped at 1.0) and the result is multiplied by 0.25 when the
    names are of different gender.

    @param fna: first list of first names
    @type fna: list of strings
    @param fnb: second list of first names
    @type fnb: list of strings

    @return: compatibility score between 0.0 and 1.0
    @rtype: float
    '''
    gender_table = GLOBAL_gendernames
    variations = GLOBAL_name_variations

    no_full_names = min(len(fna), len(fnb)) == 0

    name_comparison_print('|-- Comparing names %s %s' % (fna, fnb))

    # Gender can only be compared when both sides have a first name.
    same_gender = None
    if not no_full_names:
        same_gender = full_names_are_equal_gender(fna, fnb, gender_table,
                                                  only_names=True)
    name_comparison_print("|--- gender equal: %s" % same_gender)

    equal_composites = False
    if not no_full_names:
        equal_composites = full_names_are_equal_composites(fna, fnb,
                                                           only_names=True)
    name_comparison_print("|--- equal composites: %s" % equal_composites)

    synonymous = full_names_are_synonymous(fna, fnb, variations,
                                           only_names=True)
    substrings = full_names_are_substrings(fna, fnb, only_names=True)

    name_comparison_print("|--- synonims: %s" % synonymous)
    name_comparison_print("|--- substrings: %s" % substrings)

    if no_full_names:
        worst_screwup = 0
        best_screwup = 0
        mean_screwup = 0
    else:
        # On equal lengths fnb plays the role of the longer list.
        if len(fna) > len(fnb):
            longer, shorter = fna, fnb
        else:
            longer, shorter = fnb, fna

        # Positional pairs, walked from the last shared position to the
        # first: (edit distance, length of the longer name of the pair).
        positional = []
        for position in range(len(shorter) - 1, -1, -1):
            left = longer[position]
            right = shorter[position]
            positional.append((distance(left, right),
                               max(len(left), len(right))))

        # Greedy pairs: each name of the longer list takes the closest
        # name still available in the shorter list. The distance is
        # normalised by the longest name among the candidate and all
        # names still available.
        available = list(shorter)
        greedy = []
        for candidate in longer:
            longest_length = max([len(candidate)] +
                                 [len(other) for other in available])
            candidate_distances = [distance(candidate, other)
                                   for other in available]
            closest = min(candidate_distances)
            greedy.append((closest, longest_length))
            del available[candidate_distances.index(closest)]
            if not available:
                break

        positional_ratios = [float(dist) / length
                             for dist, length in positional]
        greedy_ratios = [float(dist) / length for dist, length in greedy]

        worst_screwup = max(positional_ratios)
        best_screwup = min(greedy_ratios)
        mean_screwup = (sum(positional_ratios) / len(positional) +
                        sum(greedy_ratios) / len(greedy)) / 2

    name_comparison_print('|--- screwups min, max, avg: %s %s %s' %
                    (str(best_screwup), str(worst_screwup), str(mean_screwup)))

    # Kept for the substring bonus, which uses the value before forcing.
    unforced_worst_screwup = worst_screwup

    if worst_screwup > 0.1:
        name_comparison_print("|--- forcing names screwup to one!")
        worst_screwup = 1
        best_screwup = 1
        mean_screwup = 1

    name_comparison_print("|--- min screwup: %s" % best_screwup)
    name_comparison_print("|--- max screwup: %s" % worst_screwup)
    name_comparison_print("|--- avg screwup: %s" % mean_screwup)

    compat_score = max(1 - (0.25 * worst_screwup + 0.5 * mean_screwup + 0.25 * best_screwup), 0.0)

    name_comparison_print("|--- Name compatibility score: %s" % compat_score)

    if substrings:
        if equal_composites:
            compat_score = min(1.0, compat_score + 0.7)
        else:
            compat_score = min(1.0, compat_score + max(0., (1 - unforced_worst_screwup) * 0.75))

    name_comparison_print("|--- names are equal composites and subtring bonus: %s" % compat_score)

    if synonymous:
        compat_score = min(1.0, compat_score + 0.5)

    name_comparison_print("|--- synonims bonus: %s" % compat_score)

    # same_gender is None when gender could not be compared: no penalty.
    if same_gender is not None and not same_gender:
        compat_score = max(0.0, compat_score * 0.25)

    name_comparison_print("|--- Different Gender penalty: %s" % compat_score)

    return compat_score
Пример #13
0
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names and return a similarity score.

    The names are passed through translate_to_ascii and split_name_parts;
    the surname, the initials and the first names are then scored by
    surname_compatibility, initials_compatibility and compare_first_names.
    Pairs of scores are combined as x * y / sqrt(x^2 + y^2) * SQRT2 (0.0
    when both are zero): first initials with first names, then the result
    with the surname score. If at least one name has no full first name,
    the initials score times 0.6 replaces the first combination.

    @param origin_name: first name to compare
    @type origin_name: string
    @param target_name: second name to compare
    @type target_name: string
    @param initials_penalty: not used by this implementation
    @type initials_penalty: boolean

    @return: similarity score
    @rtype: float
    '''
    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)

    ascii_origin = translate_to_ascii(origin_name)[0]
    ascii_target = translate_to_ascii(target_name)[0]

    parts_o = split_name_parts(ascii_origin, True, "", True)
    parts_t = split_name_parts(ascii_target, True, "", True)

    name_comparison_print("|- splitted no: %s" % parts_o)
    name_comparison_print("|- splitted nt: %s" % parts_t)

    score_surname = surname_compatibility(parts_o[0], parts_t[0])
    assert 0 <= score_surname <= 1, "Compare_names: Surname score out of range"
    name_comparison_print("|- surname score: %s" % score_surname)

    # At least one side has initials only, no full first name.
    lacks_first_names = min(len(parts_o[2]), len(parts_t[2])) == 0
    score_initials = initials_compatibility(parts_o[1], parts_t[1])
    assert 0 <= score_initials <= 1, "Compare_names: initials score out of range"

    name_comparison_print('|- initials only %s' % lacks_first_names)
    name_comparison_print('|- initials score %s' % score_initials)

    score_first_names = compare_first_names(parts_o[2], parts_t[2])
    assert 0 <= score_first_names <= 1, "Compare_names: firstname score out of range"
    name_comparison_print('|- names score %s' % score_first_names)

    if lacks_first_names:
        score_given = score_initials * 0.6
    else:
        a, b = score_initials, score_first_names
        try:
            score_given = (a * b) / sqrt(a**2 + b**2) * SQRT2
        except ZeroDivisionError:
            # Both scores are zero.
            score_given = 0.0

    name_comparison_print('|- final scores %s %s' % (score_surname, score_given))

    a, b = score_surname, score_given
    try:
        result = (a * b) / sqrt(a**2 + b**2) * SQRT2
    except ZeroDivisionError:
        # Both scores are zero.
        result = 0.0

    name_comparison_print("|- final score is... %s" % result)
    return result
Пример #14
0
def compare_first_names(fna, fnb):
    '''
    Return a compatibility score for two lists of first names.

    If either list is empty, the distance-based part of the score is 1.
    Otherwise normalised edit distances are computed in two ways: between
    names at the same position, and between each name of the longer list
    and its closest not-yet-used name of the shorter list. If the largest
    positional value exceeds 0.1 the distance-based part drops to 0.
    Substring and synonym matches add bonuses (capped at 1.0); a gender
    mismatch multiplies the score by 0.25.

    @param fna: first list of first names
    @type fna: list of strings
    @param fnb: second list of first names
    @type fnb: list of strings

    @return: compatibility score between 0.0 and 1.0
    @rtype: float
    '''
    genders = GLOBAL_gendernames
    synonyms = GLOBAL_name_variations

    some_list_empty = min(len(fna), len(fnb)) == 0

    name_comparison_print('|-- Comparing names %s %s' % (fna, fnb))

    if some_list_empty:
        # Nothing to compare the gender of.
        gender_match = None
    else:
        gender_match = full_names_are_equal_gender(fna, fnb, genders,
                                                   only_names=True)
    name_comparison_print("|--- gender equal: %s" % gender_match)

    if some_list_empty:
        composites_match = False
    else:
        composites_match = full_names_are_equal_composites(fna, fnb,
                                                           only_names=True)
    name_comparison_print("|--- equal composites: %s" % composites_match)

    synonyms_match = full_names_are_synonymous(fna, fnb, synonyms,
                                               only_names=True)
    substrings_match = full_names_are_substrings(fna, fnb, only_names=True)

    name_comparison_print("|--- synonims: %s" % synonyms_match)
    name_comparison_print("|--- substrings: %s" % substrings_match)

    max_screwup = 0
    min_screwup = 0
    avg_screwup = 0

    if not some_list_empty:
        # On equal lengths fnb is treated as the longer list.
        long_list, short_list = (fna, fnb) if len(fna) > len(fnb) else (fnb, fna)

        # Same-position comparison, from the last shared index down to 0.
        same_position = []
        index = len(short_list) - 1
        while index >= 0:
            name_l = long_list[index]
            name_s = short_list[index]
            same_position.append(
                (distance(name_l, name_s), max(len(name_l), len(name_s))))
            index -= 1

        # Closest-name comparison: a name of the short list is consumed
        # once it has been chosen as the closest one.
        unused = list(short_list)
        closest_name = []
        for name in long_list:
            normaliser = max(len(name), max(len(other) for other in unused))
            dists = [distance(name, other) for other in unused]
            smallest = min(dists)
            closest_name.append((smallest, normaliser))
            unused.pop(dists.index(smallest))
            if len(unused) < 1:
                break

        same_position_ratios = [float(pair[0]) / pair[1]
                                for pair in same_position]
        closest_name_ratios = [float(pair[0]) / pair[1]
                               for pair in closest_name]

        max_screwup = max(same_position_ratios)
        min_screwup = min(closest_name_ratios)
        avg_screwup = (
            sum(same_position_ratios) / len(same_position) +
            sum(closest_name_ratios) / len(closest_name)) / 2

    name_comparison_print('|--- screwups min, max, avg: %s %s %s' %
                          (str(min_screwup), str(max_screwup),
                           str(avg_screwup)))

    # The substring bonus below needs the value from before the forcing.
    max_screwup_before_forcing = max_screwup

    if max_screwup > 0.1:
        name_comparison_print("|--- forcing names screwup to one!")
        max_screwup = min_screwup = avg_screwup = 1

    name_comparison_print("|--- min screwup: %s" % min_screwup)
    name_comparison_print("|--- max screwup: %s" % max_screwup)
    name_comparison_print("|--- avg screwup: %s" % avg_screwup)

    compat_score = max(
        1 - (0.25 * max_screwup + 0.5 * avg_screwup + 0.25 * min_screwup),
        0.0)

    name_comparison_print("|--- Name compatibility score: %s" % compat_score)

    if substrings_match and composites_match:
        compat_score = min(1.0, compat_score + 0.7)
    elif substrings_match:
        bonus = max(0., (1 - max_screwup_before_forcing) * 0.75)
        compat_score = min(1.0, compat_score + bonus)

    name_comparison_print(
        "|--- names are equal composites and subtring bonus: %s" %
        compat_score)

    if synonyms_match:
        compat_score = min(1.0, compat_score + 0.5)

    name_comparison_print("|--- synonims bonus: %s" % compat_score)

    # gender_match is None when no gender comparison was made: no penalty.
    if gender_match is not None and not gender_match:
        compat_score = max(0.0, compat_score * 0.25)

    name_comparison_print("|--- Different Gender penalty: %s" % compat_score)

    return compat_score