Python split_name_parts示例，bibauthorid_utils.split_name_parts Python示例

示例#1

0

显示文件

def soft_compare_names(origin_name, target_name):
    '''
    Compute a soft similarity score between two author name strings,
    e.g. for use in a search engine.

    The score is the sum of two parts:
      - surname part: 0.6 if the cleaned surnames are identical; 0.4 if
        both surnames are longer than four characters and their
        Jaro-Winkler similarity is at least 0.95; 0.0 otherwise.
      - given-name part (only when both names carry initials): up to 0.4,
        proportional to how many of the origin's initials and full given
        names also occur in the target, relative to the larger initial
        and name counts of the two names.

    @param origin_name: The name to compare
    @type origin_name: string
    @param target_name: The name to compare against
    @type target_name: string

    @return: similarity score between 0.0 and 1.0
    @rtype: float
    '''
    # Prefer the C implementation; fall back to the local one if the
    # Levenshtein module is not installed.
    try:
        from Levenshtein import jaro_winkler as similarity
    except ImportError:
        similarity = jaro_winkler_str_similarity

    origin_copy = deepcopy(origin_name)
    target_copy = deepcopy(target_name)
    source = split_name_parts(origin_copy.lower())
    target = split_name_parts(target_copy.lower())

    # Index 0 holds the surname; strip it down before comparing.
    source[0] = clean_name_string(source[0], replacement="",
                                  keep_whitespace=False)
    target[0] = clean_name_string(target[0], replacement="",
                                  keep_whitespace=False)

    if source[0] == target[0]:
        score = 0.6
    elif (similarity(source[0].lower(), target[0].lower()) >= .95
          and min(len(source[0]), len(target[0])) > 4):
        # Short surnames are never accepted as "similar".
        score = 0.4
    else:
        score = 0.0

    # Index 1 holds the initials, index 2 the full given names.
    if source[1] and target[1]:
        initials_total = max(len(source[1]), len(target[1]))
        initials_hits = sum(1 for initial in source[1]
                            if initial in target[1])

        names_total = max(len(source[2]), len(target[2]))
        names_hits = 0
        if source[2] and target[2]:
            target_given = [
                clean_name_string(given, replacement="",
                                  keep_whitespace=False)
                for given in target[2]
            ]
            names_hits = sum(
                1 for given in source[2]
                if clean_name_string(given, replacement="",
                                     keep_whitespace=False) in target_given)

        score += ((initials_hits + names_hits) * 0.4
                  / (names_total + initials_total))

    return score

示例#2

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def names_are_substrings(name1, name2):
    '''
    Checks whether the given names of one person form the beginning of the
    given names of the other; e.g. "Christoph" vs. "Ch".
    Only the start of the names is compared.

    Either argument may already be a list as produced by split_name_parts;
    anything else is passed through split_name_parts first.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: True if one joined, cleaned given-name string starts with
        the other
    @rtype: boolean
    '''
    parts1 = name1
    if not isinstance(parts1, list):
        parts1 = split_name_parts(parts1)

    parts2 = name2
    if not isinstance(parts2, list):
        parts2 = split_name_parts(parts2)

    # Index 2 holds the given names; glue them together before cleaning.
    given1 = clean_name_string("".join(parts1[2]).lower(), "", False, True)
    given2 = clean_name_string("".join(parts2[2]).lower(), "", False, True)

    return given1.startswith(given2) or given2.startswith(given1)

示例#3

0

显示文件

def names_are_equal_composites(name1, name2):
    '''
    Checks if names are equal composites; e.g. "guangsheng" vs. "guang sheng"

    The given names of both persons are expanded with create_name_tuples
    (presumably into their possible groupings -- confirm against that
    helper) and the names count as equal composites as soon as one cleaned,
    lower-cased variation of the first name equals one of the second.

    Either argument may already be a list as produced by split_name_parts;
    anything else is passed through split_name_parts first.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: Are the names equal composites?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    oname_variations = create_name_tuples(name1[2])
    tname_variations = create_name_tuples(name2[2])

    # Clean every target variation exactly once instead of once per origin
    # variation; a set makes each lookup below constant time.
    cleaned_tnames = set([
        clean_name_string(tname_variation.lower(), "", False, True)
        for tname_variation in tname_variations
    ])

    for oname_variation in oname_variations:
        oname = clean_name_string(oname_variation.lower(), "", False, True)

        if oname in cleaned_tnames:
            # One match settles the question; stop scanning right away.
            return True

    return False

示例#4

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def names_are_equal_composites(name1, name2):
    '''
    Checks if names are equal composites; e.g. "guangsheng" vs. "guang sheng"

    The given names of both persons are expanded with create_name_tuples
    (presumably into their possible groupings -- confirm against that
    helper) and the names count as equal composites as soon as one cleaned,
    lower-cased variation of the first name equals one of the second.

    Either argument may already be a list as produced by split_name_parts;
    anything else is passed through split_name_parts first.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: Are the names equal composites?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    oname_variations = create_name_tuples(name1[2])
    tname_variations = create_name_tuples(name2[2])

    # Clean every target variation exactly once instead of once per origin
    # variation; a set makes each lookup below constant time.
    cleaned_tnames = set([
        clean_name_string(tname_variation.lower(), "", False, True)
        for tname_variation in tname_variations
    ])

    for oname_variation in oname_variations:
        oname = clean_name_string(oname_variation.lower(), "", False, True)

        if oname in cleaned_tnames:
            # One match settles the question; stop scanning right away.
            return True

    return False

示例#5

0

显示文件

def names_are_substrings(name1, name2):
    '''
    Checks whether the given names of one person form the beginning of the
    given names of the other; e.g. "Christoph" vs. "Ch".
    Only the start of the names is compared.

    Either argument may already be a list as produced by split_name_parts;
    anything else is passed through split_name_parts first.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: True if one joined, cleaned given-name string starts with
        the other
    @rtype: boolean
    '''
    parts1 = name1
    if not isinstance(parts1, list):
        parts1 = split_name_parts(parts1)

    parts2 = name2
    if not isinstance(parts2, list):
        parts2 = split_name_parts(parts2)

    # Index 2 holds the given names; glue them together before cleaning.
    given1 = clean_name_string("".join(parts1[2]).lower(), "", False, True)
    given2 = clean_name_string("".join(parts2[2]).lower(), "", False, True)

    return given1.startswith(given2) or given2.startswith(given1)

示例#6

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def soft_compare_names(origin_name, target_name):
    '''
    Compute a soft similarity score between two author name strings,
    e.g. for use in a search engine.

    The score is the sum of two parts:
      - surname part: 0.6 if the cleaned surnames are identical; 0.4 if
        both surnames are longer than four characters and their
        Jaro-Winkler similarity is at least 0.95; 0.0 otherwise.
      - given-name part (only when both names carry initials): up to 0.4,
        proportional to how many of the origin's initials and full given
        names also occur in the target, relative to the larger initial
        and name counts of the two names.

    @param origin_name: The name to compare
    @type origin_name: string
    @param target_name: The name to compare against
    @type target_name: string

    @return: similarity score between 0.0 and 1.0
    @rtype: float
    '''
    # Prefer the C implementation; fall back to the local one if the
    # Levenshtein module is not installed.
    try:
        from Levenshtein import jaro_winkler as similarity
    except ImportError:
        similarity = jaro_winkler_str_similarity

    origin_copy = deepcopy(origin_name)
    target_copy = deepcopy(target_name)
    source = split_name_parts(origin_copy.lower())
    target = split_name_parts(target_copy.lower())

    # Index 0 holds the surname; strip it down before comparing.
    source[0] = clean_name_string(source[0], replacement="",
                                  keep_whitespace=False)
    target[0] = clean_name_string(target[0], replacement="",
                                  keep_whitespace=False)

    if source[0] == target[0]:
        score = 0.6
    elif (similarity(source[0].lower(), target[0].lower()) >= .95
          and min(len(source[0]), len(target[0])) > 4):
        # Short surnames are never accepted as "similar".
        score = 0.4
    else:
        score = 0.0

    # Index 1 holds the initials, index 2 the full given names.
    if source[1] and target[1]:
        initials_total = max(len(source[1]), len(target[1]))
        initials_hits = sum(1 for initial in source[1]
                            if initial in target[1])

        names_total = max(len(source[2]), len(target[2]))
        names_hits = 0
        if source[2] and target[2]:
            target_given = [
                clean_name_string(given, replacement="",
                                  keep_whitespace=False)
                for given in target[2]
            ]
            names_hits = sum(
                1 for given in source[2]
                if clean_name_string(given, replacement="",
                                     keep_whitespace=False) in target_given)

        score += ((initials_hits + names_hits) * 0.4
                  / (names_total + initials_total))

    return score

示例#7

0

显示文件

def search_matching_names(authorname_string,
                          match_function=name_matching,
                          consider_surname_only=True):
    """
    Search the loaded author names for entries that match the given name
    according to a matching function.

    @warning: searching with consider_surname_only=False runs the matching
        function against every loaded author name and will be painfully
        slow! You've been warned.

    @warning: a query that contains no comma at all gets a comma appended
        before it is split into name parts.

    @param authorname_string: The author name string
    @type authorname_string: string
    @param match_function: The function to use for the name matching;
        called as match_function(authorname_string, candidate_name)
    @type match_function: function descriptor
    @param consider_surname_only: Decides if only names starting with the
        query's surname shall be considered or _all_ other names.
    @type consider_surname_only: boolean

    @return: one entry per matching name: [(id, name), split name parts]
    @rtype: list of lists

    @note: example:
        search_matching_names('einstein, albert')
        Out[7]: [[(962L, 'Einstein, Albert'), ['Einstein', ['A'], ['Albert']]],
                [(1128L, 'Einstein, A.'), ['Einstein', ['A'], []]]]
    """
    if ',' not in authorname_string:
        authorname_string += ','

    authorname = bibauthorid_utils.split_name_parts(authorname_string)

    if consider_surname_only:
        # Pre-filter on the surname prefix to keep the candidate set small.
        candidates = [
            row for row in dat.AUTHOR_NAMES
            if row['name'].startswith(authorname[0])
        ]
    else:
        candidates = list(dat.AUTHOR_NAMES)

    return [
        [(row['id'], row['name']),
         bibauthorid_utils.split_name_parts(row['name'])]
        for row in candidates
        if match_function(authorname_string, row['name'])
    ]

示例#8

0

显示文件

def names_are_equal_gender(name1, name2, gendernames):
    '''
    Checks on gender equality of two names baes on a word list

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param gendernames: dictionary of male/female names
    @type gendernames: dict

    @return: Are names gender-equal?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False
    names_are_equal_gender_b = True
    ogender = None
    tgender = None
    oname = name1[2][0].lower()
    tname = name2[2][0].lower()
    oname = clean_name_string(oname, "", False, True)
    tname = clean_name_string(tname, "", False, True)

    if oname in gendernames['boys']:
        ogender = 'Male'
    elif oname in gendernames['girls']:
        ogender = 'Female'

    if tname in gendernames['boys']:
        tgender = 'Male'
    elif tname in gendernames['girls']:
        tgender = 'Female'

    if print_debug:
        print '     Gender check: ', oname, ' is a ', ogender
        print '     Gender check: ', tname, ' is a ', tgender

    if ogender and tgender:
        if ogender != tgender:
            if print_debug:
                print '    Gender differs, force split!'

            names_are_equal_gender_b = False

    return names_are_equal_gender_b

示例#9

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def names_are_equal_gender(name1, name2, gendernames):
    '''
    Checks on gender equality of two names baes on a word list

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param gendernames: dictionary of male/female names
    @type gendernames: dict

    @return: Are names gender-equal?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False
    names_are_equal_gender_b = True
    ogender = None
    tgender = None
    oname = name1[2][0].lower()
    tname = name2[2][0].lower()
    oname = clean_name_string(oname, "", False, True)
    tname = clean_name_string(tname, "", False, True)

    if oname in gendernames['boys']:
        ogender = 'Male'
    elif oname in gendernames['girls']:
        ogender = 'Female'

    if tname in gendernames['boys']:
        tgender = 'Male'
    elif tname in gendernames['girls']:
        tgender = 'Female'

    if print_debug:
        print '     Gender check: ', oname, ' is a ', ogender
        print '     Gender check: ', tname, ' is a ', tgender

    if ogender and tgender:
        if ogender != tgender:
            if print_debug:
                print '    Gender differs, force split!'

            names_are_equal_gender_b = False

    return names_are_equal_gender_b

示例#10

0

显示文件

def get_va_ids_by_recid_lname(bibrec, lastname):
    '''
    Finds all the virtual author ids that belong to a certain record
    and hold a certain last name

    Works in two passes over dat.VIRTUALAUTHOR_DATA: first collect the
    virtual authors tagged with the record id, then keep those whose
    'orig_name_string' value splits into the requested last name.

    @param bibrec: bibrec id of a record
    @type bibrec: int
    @param lastname: The last name of a person
    @type lastname: string

    @return: list of virtual author ids (without duplicates)
    @rtype: list of int
    '''
    # Record ids are compared in their string form.
    bibrec_value = str(bibrec)

    # A set keeps the membership test in the second pass constant time;
    # with a list every row would trigger a linear scan.
    pot_va_ids = set([
        row['virtualauthorid'] for row in dat.VIRTUALAUTHOR_DATA
        if row['tag'] == 'bibrec_id' and row['value'] == bibrec_value
    ])

    if not pot_va_ids:
        # No virtual author on this record: nothing left to filter.
        return []

    va_ids = set()

    for row in dat.VIRTUALAUTHOR_DATA:
        if (row['virtualauthorid'] in pot_va_ids
                and row['tag'] == 'orig_name_string'
                and split_name_parts(row['value'])[0] == lastname):
            va_ids.add(row['virtualauthorid'])

    return list(va_ids)

示例#11

0

显示文件

文件： bibauthorid_realauthor_utils.py 项目： traviscb/cds-invenio

def find_and_process_updates(process_initials):
    """
    Drains the virtual author process queue and hands each queued virtual
    author id to add_virtualauthor, after removing its 'updated' record.

    If the queue is empty on entry, init_va_process_queue() is called
    first to fill it.

    @param process_initials: If True every queued virtual author is
        processed. If False only those whose original name string contains
        at least one full given name are processed; the others are taken
        off the queue but otherwise left untouched.
    @type process_initials: boolean
    """
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    # The queue is looked up through `dat` on every access on purpose:
    # whether the init call rebinds it is not visible from here.
    while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
        name_records = bibauthorid_virtualauthor_utils.get_virtualauthor_records(
            va_id, tag="orig_name_string")
        va_name = name_records[0]["value"]

        # Names consisting of initials only are skipped unless requested.
        if (not process_initials
                and not bibauthorid_utils.split_name_parts(va_name)[2]):
            continue

        bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
            va_id, "updated")
        bconfig.LOGGER.log(
            25, "|> Inserting VA: %s Orig. name: %s" % (va_id, va_name))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")

示例#12

0

显示文件

文件： bibauthorid_tests.py 项目： valkyriesavage/invenio

    def test_split_name_parts(self):
        """bibauthorid - test split name parts"""
        # (input name string, expected split result) pairs, checked in order.
        cases = (
            ('This, Isacorrect Fullname',
             ['This', ['I', 'F'], ['Isacorrect', 'Fullname'], [0, 1]]),
            # The empty string yields only three elements.
            ('',
             ['', [], []]),
            # Without a comma the last word is taken as the surname.
            ('full inverted name',
             ['name', ['F', 'I'], ['Full', 'Inverted'], [0, 1]]),
            ('Two Words, Surname Name',
             ['Two Words', ['S', 'N'], ['Surname', 'Name'], [0, 1]]),
            # Special characters in the surname are kept as they are.
            ('Strange+)*{ (=]&-$Char, Name',
             ['Strange+)*{ (=]&-$Char', ['N'], ['Name'], [0]]),
        )

        for raw_name, expected in cases:
            self.assertEqual(expected, baidu.split_name_parts(raw_name))

示例#13

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def search_matching_names(authorname_string, match_function=name_matching,
                          consider_surname_only=True):
    """
    Search the loaded author names for entries that match the given name
    according to a matching function.

    @warning: searching with consider_surname_only=False runs the matching
        function against every loaded author name and will be painfully
        slow! You've been warned.

    @warning: a query that contains no comma at all gets a comma appended
        before it is split into name parts.

    @param authorname_string: The author name string
    @type authorname_string: string
    @param match_function: The function to use for the name matching;
        called as match_function(authorname_string, candidate_name)
    @type match_function: function descriptor
    @param consider_surname_only: Decides if only names starting with the
        query's surname shall be considered or _all_ other names.
    @type consider_surname_only: boolean

    @return: one entry per matching name: [(id, name), split name parts]
    @rtype: list of lists

    @note: example:
        search_matching_names('einstein, albert')
        Out[7]: [[(962L, 'Einstein, Albert'), ['Einstein', ['A'], ['Albert']]],
                [(1128L, 'Einstein, A.'), ['Einstein', ['A'], []]]]
    """
    if ',' not in authorname_string:
        authorname_string += ','

    authorname = bibauthorid_utils.split_name_parts(authorname_string)

    if consider_surname_only:
        # Pre-filter on the surname prefix to keep the candidate set small.
        candidates = [
            row for row in dat.AUTHOR_NAMES
            if row['name'].startswith(authorname[0])
        ]
    else:
        candidates = list(dat.AUTHOR_NAMES)

    return [
        [(row['id'], row['name']),
         bibauthorid_utils.split_name_parts(row['name'])]
        for row in candidates
        if match_function(authorname_string, row['name'])
    ]

示例#14

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def names_are_synonymous(name1, name2, name_variations):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param name_variations: name variations list
    @type name_variations: list of lists

    @return: are names synonymous
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False
    names_are_synonymous_b = False
    max_matches = min(len(name1[2]), len(name2[2]))
    matches = []

    for i in xrange(max_matches):
        matches.append(False)

    for nvar in name_variations:
        for i in xrange(max_matches):
            oname = name1[2][i].lower()
            tname = name2[2][i].lower()
            oname = clean_name_string(oname, "", False, True)
            tname = clean_name_string(tname, "", False, True)

            if oname in nvar and tname in nvar:
                if print_debug:
                    print '      ', oname, ' and ', tname, ' are synonyms! Not splitting!'

                matches[i] = True

        if sum(matches) == max_matches:
            names_are_synonymous_b = True
            break

    return names_are_synonymous_b

示例#15

0

显示文件

def names_are_synonymous(name1, name2, name_variations):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param name_variations: name variations list
    @type name_variations: list of lists

    @return: are names synonymous
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False
    names_are_synonymous_b = False
    max_matches = min(len(name1[2]), len(name2[2]))
    matches = []

    for i in xrange(max_matches):
        matches.append(False)

    for nvar in name_variations:
        for i in xrange(max_matches):
            oname = name1[2][i].lower()
            tname = name2[2][i].lower()
            oname = clean_name_string(oname, "", False, True)
            tname = clean_name_string(tname, "", False, True)

            if oname in nvar and tname in nvar:
                if print_debug:
                    print '      ', oname, ' and ', tname, ' are synonyms! Not splitting!'

                matches[i] = True

        if sum(matches) == max_matches:
            names_are_synonymous_b = True
            break

    return names_are_synonymous_b

示例#16

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def names_minimum_levenshtein_distance(name1, name2):
    '''
    Determines the Levenshtein distance between the given names of two
    persons.

    All given names of each person are joined into one string, lower-cased
    and cleaned before the distance is computed.

    NOTE(review): earlier documentation listed examples such as
    D("guang", "guang sheng") = 0, which seem to describe a variant that
    compared only as many given names as the shorter name has. The current
    code always joins all given names -- confirm which behaviour is wanted.

    Either argument may already be a list as produced by split_name_parts;
    anything else is passed through split_name_parts first.

    @precondition: Names have been checked for composition equality.
    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: the Levenshtein distance between the joined given names, or
        -1 if the Levenshtein module cannot be imported
    @rtype: int
    '''
    try:
        from Levenshtein import distance as levenshtein_distance
    except ImportError:
        # Without the module there is no way to compute the distance.
        bconfig.LOGGER.exception("Levenshtein Module not available!")
        return -1

    parts1 = name1
    if not isinstance(parts1, list):
        parts1 = split_name_parts(parts1)

    parts2 = name2
    if not isinstance(parts2, list):
        parts2 = split_name_parts(parts2)

    # Index 2 holds the given names.
    given1 = clean_name_string("".join(parts1[2]).lower(), "", False, True)
    given2 = clean_name_string("".join(parts2[2]).lower(), "", False, True)

    return levenshtein_distance(given1, given2)

示例#17

0

显示文件

def names_minimum_levenshtein_distance(name1, name2):
    '''
    Determines the Levenshtein distance between the given names of two
    persons.

    All given names of each person are joined into one string, lower-cased
    and cleaned before the distance is computed.

    NOTE(review): earlier documentation listed examples such as
    D("guang", "guang sheng") = 0, which seem to describe a variant that
    compared only as many given names as the shorter name has. The current
    code always joins all given names -- confirm which behaviour is wanted.

    Either argument may already be a list as produced by split_name_parts;
    anything else is passed through split_name_parts first.

    @precondition: Names have been checked for composition equality.
    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: the Levenshtein distance between the joined given names, or
        -1 if the Levenshtein module cannot be imported
    @rtype: int
    '''
    try:
        from Levenshtein import distance as levenshtein_distance
    except ImportError:
        # Without the module there is no way to compute the distance.
        bconfig.LOGGER.exception("Levenshtein Module not available!")
        return -1

    parts1 = name1
    if not isinstance(parts1, list):
        parts1 = split_name_parts(parts1)

    parts2 = name2
    if not isinstance(parts2, list):
        parts2 = split_name_parts(parts2)

    # Index 2 holds the given names.
    given1 = clean_name_string("".join(parts1[2]).lower(), "", False, True)
    given2 = clean_name_string("".join(parts2[2]).lower(), "", False, True)

    return levenshtein_distance(given1, given2)

示例#18

0

显示文件

文件： bibauthorid_tests.py 项目： valkyriesavage/invenio

    def test_create_normalized_name(self):
        """bibauthorid - test creation of normalized name strings"""
        # (input name string, expected normalized name) pairs, in order.
        cases = (
            ('this, isa fullname', 'this, Isa Fullname'),
            # Without a comma the last word is taken as the surname.
            ('this isa fullname', 'fullname, This Isa'),
            # Special characters in the surname survive the round trip.
            ('Strange&][{}) ==}{$*]!, Name', 'Strange&][{}) ==}{$*]!, Name'),
            # The empty name normalizes to a lone comma.
            ('', ','),
        )

        for raw_name, expected in cases:
            name_parts = baidu.split_name_parts(raw_name)
            self.assertEqual(expected,
                             baidu.create_normalized_name(name_parts))

示例#19

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def name_matching(orig_name, target_name):
    """
    Checks the compatibility of the given names.

    Decision order:
      1. If either name has no initials, the names are compatible.
      2. If both names carry full given names but share none of them,
         the names are incompatible.
      3. Otherwise the names are compatible exactly when their first
         initials are equal.

    @param orig_name: The original name String
    @type orig_name: string
    @param target_name: The target name string
    @type target_name: string

    @return: true or false in respect to the compatibility of the given names
    @rtype: boolean
    """
    orig = bibauthorid_utils.split_name_parts(orig_name)
    targ = bibauthorid_utils.split_name_parts(target_name)

    # Index 1 holds the initials, index 2 the full given names.
    if not orig[1] or not targ[1]:
        return True

    orig_given = set(orig[2])
    targ_given = set(targ[2])
    shared_given = orig_given.intersection(targ_given)
    shared_initials = set(orig[1]).intersection(set(targ[1]))

    # Sharing a given name without sharing its initial is inconsistent;
    # it is only reported, the decision below is not affected by it.
    if not shared_initials and shared_given:
        bconfig.LOGGER.error("length of names intersection != 0..."
                             "This should never happen!")

    if not shared_given and targ_given and orig_given:
        return False

    return orig[1][0] == targ[1][0]

示例#20

0

显示文件

def name_matching(orig_name, target_name):
    """
    Checks the compatibility of the given names.

    Decision order:
      1. If either name has no initials, the names are compatible.
      2. If both names carry full given names but share none of them,
         the names are incompatible.
      3. Otherwise the names are compatible exactly when their first
         initials are equal.

    @param orig_name: The original name String
    @type orig_name: string
    @param target_name: The target name string
    @type target_name: string

    @return: true or false in respect to the compatibility of the given names
    @rtype: boolean
    """
    orig = bibauthorid_utils.split_name_parts(orig_name)
    targ = bibauthorid_utils.split_name_parts(target_name)

    # Index 1 holds the initials, index 2 the full given names.
    if not orig[1] or not targ[1]:
        return True

    orig_given = set(orig[2])
    targ_given = set(targ[2])
    shared_given = orig_given.intersection(targ_given)
    shared_initials = set(orig[1]).intersection(set(targ[1]))

    # Sharing a given name without sharing its initial is inconsistent;
    # it is only reported, the decision below is not affected by it.
    if not shared_initials and shared_given:
        bconfig.LOGGER.error("length of names intersection != 0..."
                             "This should never happen!")

    if not shared_given and targ_given and orig_given:
        return False

    return orig[1][0] == targ[1][0]

示例#21

0

显示文件

文件： bibauthorid_realauthor_utils.py 项目： metandrey/invenio-metandrey

def find_and_process_updates(process_initials):
    '''
    Drains the virtual author process queue and hands each queued virtual
    author id to add_virtualauthor, after removing its 'updated' record.

    If the queue is empty on entry, init_va_process_queue() is called
    first to fill it.

    @param process_initials: If True every queued virtual author is
        processed. If False only those whose original name string contains
        at least one full given name are processed; the others are taken
        off the queue but otherwise left untouched.
    @type process_initials: boolean
    '''
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    # The queue is looked up through `dat` on every access on purpose:
    # whether the init call rebinds it is not visible from here.
    while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
        name_records = bibauthorid_virtualauthor_utils.get_virtualauthor_records(
            va_id, tag='orig_name_string')
        va_name = name_records[0]['value']

        # Names consisting of initials only are skipped unless requested.
        if (not process_initials
                and not bibauthorid_utils.split_name_parts(va_name)[2]):
            continue

        bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
            va_id, 'updated')
        bconfig.LOGGER.log(
            25, "|> Inserting VA: %s Orig. name: %s" % (va_id, va_name))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")

示例#22

0

显示文件

文件： bibauthorid_virtualauthor_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def get_va_ids_by_recid_lname(bibrec, lastname):
    '''
    Finds all the virtual author ids that belong to a certain record
    and hold a certain last name

    Works in two passes over dat.VIRTUALAUTHOR_DATA: first collect the
    virtual authors tagged with the record id, then keep those whose
    'orig_name_string' value splits into the requested last name.

    @param bibrec: bibrec id of a record
    @type bibrec: int
    @param lastname: The last name of a person
    @type lastname: string

    @return: list of virtual author ids (without duplicates)
    @rtype: list of int
    '''
    # Record ids are compared in their string form.
    bibrec_value = str(bibrec)

    # A set keeps the membership test in the second pass constant time;
    # with a list every row would trigger a linear scan.
    pot_va_ids = set([
        row['virtualauthorid'] for row in dat.VIRTUALAUTHOR_DATA
        if row['tag'] == 'bibrec_id' and row['value'] == bibrec_value
    ])

    if not pot_va_ids:
        # No virtual author on this record: nothing left to filter.
        return []

    va_ids = set()

    for row in dat.VIRTUALAUTHOR_DATA:
        if (row['virtualauthorid'] in pot_va_ids
                and row['tag'] == 'orig_name_string'
                and split_name_parts(row['value'])[0] == lastname):
            va_ids.add(row['virtualauthorid'])

    return list(va_ids)

示例#23

0

显示文件

文件： bibauthorid_daemon.py 项目： metandrey/invenio-metandrey

def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
            - Update tables with results of the computation
        - Start personid update procedure

    The function returns early, after writing "Update authorid: Nothing to
    do", when no previous 'update_aid' log entry of the daemon exists or when
    no paper has been modified since that entry.
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        # Select the relevant documents once; the same selection serves the
        # log message and the loop below.
        docs_to_process = [row for row in dat.DOC_LIST
                           if row['bibrecid'] in bibrec_ids]

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                                "all loaded docs (%s)"
                                % (len(docs_to_process)))

        for docs in docs_to_process:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                # NOTE(review): the -1 default is truthy, so the
                # `elif author_name` branch below is only reached when the
                # first bibref itself is falsy -- confirm this is intended.
                refrec = -1

                if len(refrecs) > 1:
                    # Parenthesised single-argument form: prints the same
                    # text under Python 2 and also parses under Python 3.
                    print("SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!")
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)

    if not last_log:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    # select only the most recent papers
    recently_modified, last_update_time = get_papers_recently_modified(
                                                    date=last_log[0][2])
    # The new log entry is written before the emptiness check, i.e. also
    # when there turns out to be nothing to do.
    insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                comment='bibauthorid_daemon, update_authorid_universe',
                timestamp=last_update_time[0][0])
    bibtask.write_message("Update authorid will operate on %s records."
                          % (len(recently_modified)), stream=sys.stdout,
                          verbose=0)

    if not recently_modified:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    updated_records = []

    for rec in recently_modified:
        # The first element of each row is used as the record id.
        updated_records.append(rec[0])
        dat.update_log("rec_updates", rec[0])

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                                stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            # Entries already collected for this exact name string.
            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in author_in_list:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    total_lnames = len(author_last_names)

    # Each last name forms one cluster that is processed on its own.
    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames, author_last_name,
                                 total_authors), stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                # Report the records of the author being skipped. The loop
                # variable `rec` is stale at this point (left over from an
                # earlier loop) and could name an unrelated record.
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'],
                                        current_author['records']))
                continue

            try:
                authornamesid = int(authornamesid[0])
            except (IndexError, TypeError, ValueError):
                bconfig.LOGGER.error("Invalid authornames ID!")
                continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        # NOTE(review): this is whatever
                        # get_virtualauthor_records returns -- presumably a
                        # list of record rows rather than a bare id; if so
                        # the remove() below never finds it. Verify.
                        va_anames_id = get_virtualauthor_records(va_id,
                                                        "orig_authorname_id")

                        for an_list in [row['authornameids'] for row in
                                    dat.DOC_LIST if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        # Index 0 is used as the success flag, index 1 as the collection of
        # updated real authors.
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # The personid update is handed a list of one-element tuples.
    personid_ra_format = [(ra_id,) for ra_id in updated_ras]

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                       "changes visible also on the front end and to "
                       "create person IDs for %s newly created and changed "
                       "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                       "with bibauthorid!")

示例#24

0

显示文件

文件： bibauthorid_daemon.py 项目： metandrey/invenio-metandrey

def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
            - Update tables with results of the computation
        - Start personid update procedure

    The function returns early, after writing "Update authorid: Nothing to
    do", when no previous 'update_aid' log entry of the daemon exists or when
    no paper has been modified since that entry.
    '''
    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        # Select the relevant documents once; the same selection serves the
        # log message and the loop below.
        docs_to_process = [
            row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids
        ]

        bconfig.LOGGER.log(
            25, "Creating minimal virtual authors for "
            "all loaded docs (%s)" % (len(docs_to_process)))

        for docs in docs_to_process:
            for author_id in docs['authornameids']:
                author_name = [
                    an['name'] for an in dat.AUTHOR_NAMES
                    if an['id'] == author_id
                ]
                refrecs = [
                    ref[1] for ref in docs['authornameid_bibrefrec']
                    if ref[0] == author_id
                ]
                # NOTE(review): the -1 default is truthy, so the
                # `elif author_name` branch below is only reached when the
                # first bibref itself is falsy -- confirm this is intended.
                refrec = -1

                if len(refrecs) > 1:
                    # Parenthesised single-argument form: prints the same
                    # text under Python 2 and also parses under Python 3.
                    print("SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!")
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)

    if not last_log:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout,
                              verbose=0)
        return

    # select only the most recent papers
    recently_modified, last_update_time = get_papers_recently_modified(
        date=last_log[0][2])
    # The new log entry is written before the emptiness check, i.e. also
    # when there turns out to be nothing to do.
    insert_user_log('daemon',
                    '-1',
                    'update_aid',
                    'bibsched',
                    'status',
                    comment='bibauthorid_daemon, update_authorid_universe',
                    timestamp=last_update_time[0][0])
    bibtask.write_message("Update authorid will operate on %s records." %
                          (len(recently_modified)),
                          stream=sys.stdout,
                          verbose=0)

    if not recently_modified:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout,
                              verbose=0)
        return

    updated_records = []

    for rec in recently_modified:
        # The first element of each row is used as the record id.
        updated_records.append(rec[0])
        dat.update_log("rec_updates", rec[0])

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout,
                          verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'],
                                                    "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s" % (rec))
                continue

            # Entries already collected for this exact name string.
            author_in_list = [
                row for row in authors if row['db_name'] == rec_author
            ]

            if author_in_list:
                for upd in author_in_list:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({
                    'db_name': rec_author,
                    'records': [rec],
                    'last_name': last_name
                })

    total_lnames = len(author_last_names)

    # Each last name forms one cluster that is processed on its own.
    for status, author_last_name in enumerate(author_last_names):
        current_authors = [
            row for row in authors if row['last_name'] == author_last_name
        ]
        total_authors = len(current_authors)
        bibtask.task_update_progress(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors))
        bibtask.write_message(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors),
            stream=sys.stdout,
            verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(
            25, "-- Relevant data successfully read into memory"
            " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [
                row['id'] for row in dat.AUTHOR_NAMES
                if row['db_name'] == current_author['db_name']
            ]

            if not authornamesid:
                # Report the records of the author being skipped. The loop
                # variable `rec` is stale at this point (left over from an
                # earlier loop) and could name an unrelated record.
                bconfig.LOGGER.error(
                    "The author '%s' rec '%s' is not in authornames "
                    "and will be skipped. You might want "
                    "to run authornames update before?" %
                    (current_author['db_name'], current_author['records']))
                continue

            try:
                authornamesid = int(authornamesid[0])
            except (IndexError, TypeError, ValueError):
                bconfig.LOGGER.error("Invalid authornames ID!")
                continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped." %
                                     (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        # NOTE(review): this is whatever
                        # get_virtualauthor_records returns -- presumably a
                        # list of record rows rather than a bare id; if so
                        # the remove() below never finds it. Verify.
                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [
                                row['authornameids'] for row in dat.DOC_LIST
                                if row['bibrecid'] == rec
                        ]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(
            25, "-- Computation finished. Will write back to "
            "the database now.")
        # Index 0 is used as the success flag, index 1 as the collection of
        # updated real authors.
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # The personid update is handed a list of one-element tuples.
    personid_ra_format = [(ra_id, ) for ra_id in updated_ras]

    bconfig.LOGGER.log(
        25, "Will now run personid update to make the "
        "changes visible also on the front end and to "
        "create person IDs for %s newly created and changed "
        "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(
        25, "Done updating everything. Thanks for flying "
        "with bibauthorid!")

示例#25

0

显示文件

文件： bibauthorid_authorname_utils.py 项目： Markus-Goetz/CDS-Invenio-Authorlist

def compare_names(origin_name, target_name):
    """
    Estimate how likely it is that two name strings denote the same person.

    Both names are lower-cased and split with split_name_parts, e.g.

        In : bibauthorid.split_name_parts("Ellis, John R")
        Out: ['Ellis', ['J', 'R'], ['John']]

        Ellis, R. Keith        => [ [Ellis], [R, K], [Keith] ]
        Ellis, Richard Keith   => [ [Ellis], [R, K], [Richard, Keith] ]

    The last names decide whether a comparison takes place at all and which
    penalty is subtracted from the matching score:
        - equal before cleaning:                    no penalty
        - equal only after clean_name_string:       0.02
        - unequal but similar (Jaro-Winkler >= .95
          and both longer than 4 characters):       0.24
        - anything else:                            the result is 0.0

    The score itself is computed by _perform_matching. When both names carry
    first names and at least one of them carries more than one, every
    combination of the variations produced by create_name_tuples is scored
    and the best score is used.

    @param origin_name: The first author's name string
    @type origin_name: string
    @param target_name: The second author's name string
    @type target_name: string

    @return: a value that describes the likelihood of the names being the same
    @rtype: float
    """
    # Use the C implementation when python-Levenshtein is installed.
    try:
        from Levenshtein import jaro_winkler as similarity
    except ImportError:
        similarity = jaro_winkler_str_similarity

    orig_name = split_name_parts(deepcopy(origin_name).lower())
    targ_name = split_name_parts(deepcopy(target_name).lower())

    bconfig.LOGGER.info('|--> Comparing Names: "%s" and "%s"'
                        % (origin_name, target_name))

    # Remember the raw comparison: cleaning below may hide the difference.
    differ_before_cleaning = orig_name[0] != targ_name[0]

    orig_name[0] = clean_name_string(orig_name[0], replacement="",
                                     keep_whitespace=False)
    targ_name[0] = clean_name_string(targ_name[0], replacement="",
                                     keep_whitespace=False)

    if orig_name[0] == targ_name[0]:
        # Equal after cleaning: only a token penalty if they differed before.
        if differ_before_cleaning:
            lastname_modifier = 0.02
        else:
            lastname_modifier = 0.0
    elif (similarity(orig_name[0].lower(), targ_name[0].lower()) < .95
          or min(len(orig_name[0]), len(targ_name[0])) <= 4):
        bconfig.LOGGER.warn(
            "Unequal lastnames(%s vs. %s).Skipping Comparison"
            % (orig_name[0], targ_name[0]))
        return 0.0
    else:
        bconfig.LOGGER.log(25, "Last names are not equal; but similar "
                               "enough to continue the comparison")
        # Let it go through...however, reduce the final result a little.
        lastname_modifier = 0.24

    orig_first_names = orig_name[2]
    targ_first_names = targ_name[2]

    if (orig_first_names and targ_first_names
            and (len(orig_first_names) > 1 or len(targ_first_names) > 1)):
        oname_variations = create_name_tuples(orig_first_names)
        tname_variations = create_name_tuples(targ_first_names)
        # Score every pairing of first-name variations and keep the best.
        variation_ps = [
            _perform_matching(
                split_name_parts("%s, %s" % (orig_name[0], o_var)),
                split_name_parts("%s, %s" % (targ_name[0], t_var)))
            for o_var in oname_variations
            for t_var in tname_variations
        ]

        return max(variation_ps) - lastname_modifier

    return _perform_matching(orig_name, targ_name) - lastname_modifier

示例#26

0

显示文件

def compare_names(origin_name, target_name):
    """
    Estimate how likely it is that two name strings denote the same person.

    Both names are lower-cased and split with split_name_parts, e.g.

        In : bibauthorid.split_name_parts("Ellis, John R")
        Out: ['Ellis', ['J', 'R'], ['John']]

        Ellis, R. Keith        => [ [Ellis], [R, K], [Keith] ]
        Ellis, Richard Keith   => [ [Ellis], [R, K], [Richard, Keith] ]

    The last names decide whether a comparison takes place at all and which
    penalty is subtracted from the matching score:
        - equal before cleaning:                    no penalty
        - equal only after clean_name_string:       0.02
        - unequal but similar (Jaro-Winkler >= .95
          and both longer than 4 characters):       0.24
        - anything else:                            the result is 0.0

    The score itself is computed by _perform_matching. When both names carry
    first names and at least one of them carries more than one, every
    combination of the variations produced by create_name_tuples is scored
    and the best score is used.

    @param origin_name: The first author's name string
    @type origin_name: string
    @param target_name: The second author's name string
    @type target_name: string

    @return: a value that describes the likelihood of the names being the same
    @rtype: float
    """
    # Use the C implementation when python-Levenshtein is installed.
    try:
        from Levenshtein import jaro_winkler as similarity
    except ImportError:
        similarity = jaro_winkler_str_similarity

    orig_name = split_name_parts(deepcopy(origin_name).lower())
    targ_name = split_name_parts(deepcopy(target_name).lower())

    bconfig.LOGGER.info('|--> Comparing Names: "%s" and "%s"' %
                        (origin_name, target_name))

    # Remember the raw comparison: cleaning below may hide the difference.
    differ_before_cleaning = orig_name[0] != targ_name[0]

    orig_name[0] = clean_name_string(orig_name[0],
                                     replacement="",
                                     keep_whitespace=False)
    targ_name[0] = clean_name_string(targ_name[0],
                                     replacement="",
                                     keep_whitespace=False)

    if orig_name[0] == targ_name[0]:
        # Equal after cleaning: only a token penalty if they differed before.
        if differ_before_cleaning:
            lastname_modifier = 0.02
        else:
            lastname_modifier = 0.0
    elif (similarity(orig_name[0].lower(), targ_name[0].lower()) < .95
          or min(len(orig_name[0]), len(targ_name[0])) <= 4):
        bconfig.LOGGER.warn(
            "Unequal lastnames(%s vs. %s).Skipping Comparison" %
            (orig_name[0], targ_name[0]))
        return 0.0
    else:
        bconfig.LOGGER.log(
            25, "Last names are not equal; but similar "
            "enough to continue the comparison")
        # Let it go through...however, reduce the final result a little.
        lastname_modifier = 0.24

    orig_first_names = orig_name[2]
    targ_first_names = targ_name[2]

    if (orig_first_names and targ_first_names
            and (len(orig_first_names) > 1 or len(targ_first_names) > 1)):
        oname_variations = create_name_tuples(orig_first_names)
        tname_variations = create_name_tuples(targ_first_names)
        # Score every pairing of first-name variations and keep the best.
        variation_ps = [
            _perform_matching(
                split_name_parts("%s, %s" % (orig_name[0], o_var)),
                split_name_parts("%s, %s" % (targ_name[0], t_var)))
            for o_var in oname_variations
            for t_var in tname_variations
        ]

        return max(variation_ps) - lastname_modifier

    return _perform_matching(orig_name, targ_name) - lastname_modifier