def test_create_unified_name(self):
    """bibauthorid - test creation of unified name strings"""
    # (expected unified form, raw input) pairs; they are checked in the
    # order listed, so the first mismatch is the one that gets reported.
    expectations = (
        # "last, first names" input: last name kept, first names reduced
        # to initials.
        ('this, I. F. ', 'this, isa fullname'),
        # No comma in the input: the final token becomes the last name.
        ('fullname, T. I. ', 'this isa fullname'),
        # Empty input still yields the bare separator.
        (', ', ''),
        # Special characters in the last name must pass through untouched.
        ('Strange$![+{&]+)= Chars, T. ',
         'Strange$![+{&]+)= Chars, Twonames'),
    )

    for expected, raw_name in expectations:
        self.assertEqual(expected, baidu.create_unified_name(raw_name))
def hash_coauthor_set(coauthors):
    '''
    Builds one hash value that stands for a whole group of coauthors.

    Every name is converted with create_unified_name() and stripped of
    surrounding whitespace; the resulting strings are sorted so that the
    order of the input does not influence the result. The hash is taken
    from the string representation of that sorted list.

    Callers in this file use it for author groups larger than
    MAX_COAUTHORS, which are treated as an untagged collaboration.

    @param coauthors: a list of names
    @type coauthors: list of strings

    @return: the built-in hash() of the sorted, unified list's string form
    @rtype: int
    '''
    unified_names = sorted(create_unified_name(name).strip()
                           for name in coauthors)

    # NOTE(review): on Python 3 the built-in hash() of a str is salted per
    # process unless PYTHONHASHSEED is fixed -- confirm the interpreter in
    # use before relying on this value across separate runs.
    return hash(str(unified_names))
def hash_coauthor_set(coauthors):
    '''
    Builds one hash value that stands for a whole group of coauthors.

    Every name is converted with create_unified_name() and stripped of
    surrounding whitespace; the resulting strings are sorted so that the
    order of the input does not influence the result. The hash is taken
    from the string representation of that sorted list.

    Callers in this file use it for author groups larger than
    MAX_COAUTHORS, which are treated as an untagged collaboration.

    @param coauthors: a list of names
    @type coauthors: list of strings

    @return: the built-in hash() of the sorted, unified list's string form
    @rtype: int
    '''
    unified_names = sorted(create_unified_name(name).strip()
                           for name in coauthors)

    # NOTE(review): on Python 3 the built-in hash() of a str is salted per
    # process unless PYTHONHASHSEED is fixed -- confirm the interpreter in
    # use before relying on this value across separate runs.
    return hash(str(unified_names))
def get_clusterids_from_name(name, return_matching=False):
    '''
    Collects the cluster IDs that fit the parameter 'name'.

    The name is unified, cleaned and cut to at most 150 characters. The
    actual lookup is delegated to _get_clusterids_from_name(). When the
    initials part of the name splits on '.' into three to five pieces, the
    lookup is repeated for every ordering of the pieces after the first
    one and the results are merged; otherwise a single lookup is done with
    the search string as it is.

    @param name: The name to be on the lookout for.
    @type name: string
    @param return_matching: also return the reference name's matching cluster
    @type return_matching: boolean

    @return:
        if return_matching: list of 1) the cluster IDs 2) the cluster ID
            matching the name (from the last lookup performed)
        if not return_matching: the cluster IDs
        The cluster IDs are a set when several orderings were looked up,
        otherwise whatever _get_clusterids_from_name() handed back.
    @rtype:
        if return_matching: list of (collection of int, int)
        if not return_matching: collection of int
    '''
    search_string = clean_name_string(create_unified_name(name))[:150]

    # Pull the initials out of "Last, I. N. I." -- the final character is
    # ignored both for the comma test and for the split.
    initials = ""
    trimmed = search_string[:-1]

    if "," in trimmed:
        name_parts = trimmed.replace(' ', '').split(',')

        if name_parts[1]:
            initials = name_parts[1].split('.')

    # Common case: too few (or too many) initials to permute, one lookup.
    if not 2 < len(initials) <= 5:
        clusters = _get_clusterids_from_name(search_string, return_matching)

        if not return_matching:
            return clusters

        matching_cluster = clusters[1]
        return [clusters[0], matching_cluster]

    # The first initial stays in place; the remaining ones are tried in
    # every possible order.
    last_name = search_string.split(',')[0]
    name_prefix = "%s, %s." % (last_name, initials[0])
    found_ids = set()
    matching_cluster = -1

    for ordering in permutations(initials[1:]):
        candidate = "%s %s." % (name_prefix, ". ".join(ordering))
        clusters = _get_clusterids_from_name(candidate, return_matching)

        if return_matching:
            matching_cluster = clusters[1]
            found_ids.update(clusters[0])
        else:
            found_ids.update(clusters)

    if return_matching:
        return [found_ids, matching_cluster]

    return found_ids
# Example #5
# 0
def get_clusterids_from_name(name, return_matching=False):
    '''
    Collects the cluster IDs that fit the parameter 'name'.

    The name is unified, cleaned and cut to at most 150 characters. The
    actual lookup is delegated to _get_clusterids_from_name(). When the
    initials part of the name splits on '.' into three to five pieces, the
    lookup is repeated for every ordering of the pieces after the first
    one and the results are merged; otherwise a single lookup is done with
    the search string as it is.

    @param name: The name to be on the lookout for.
    @type name: string
    @param return_matching: also return the reference name's matching cluster
    @type return_matching: boolean

    @return:
        if return_matching: list of 1) the cluster IDs 2) the cluster ID
            matching the name (from the last lookup performed)
        if not return_matching: the cluster IDs
        The cluster IDs are a set when several orderings were looked up,
        otherwise whatever _get_clusterids_from_name() handed back.
    @rtype:
        if return_matching: list of (collection of int, int)
        if not return_matching: collection of int
    '''
    search_string = clean_name_string(create_unified_name(name))[:150]

    # Pull the initials out of "Last, I. N. I." -- the final character is
    # ignored both for the comma test and for the split.
    initials = ""
    trimmed = search_string[:-1]

    if "," in trimmed:
        name_parts = trimmed.replace(' ', '').split(',')

        if name_parts[1]:
            initials = name_parts[1].split('.')

    # Common case: too few (or too many) initials to permute, one lookup.
    if not 2 < len(initials) <= 5:
        clusters = _get_clusterids_from_name(search_string, return_matching)

        if not return_matching:
            return clusters

        matching_cluster = clusters[1]
        return [clusters[0], matching_cluster]

    # The first initial stays in place; the remaining ones are tried in
    # every possible order.
    last_name = search_string.split(',')[0]
    name_prefix = "%s, %s." % (last_name, initials[0])
    found_ids = set()
    matching_cluster = -1

    for ordering in permutations(initials[1:]):
        candidate = "%s %s." % (name_prefix, ". ".join(ordering))
        clusters = _get_clusterids_from_name(candidate, return_matching)

        if return_matching:
            matching_cluster = clusters[1]
            found_ids.update(clusters[0])
        else:
            found_ids.update(clusters)

    if return_matching:
        return [found_ids, matching_cluster]

    return found_ids
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the coauthors/collaboration attachment
    of a virtual author from the data set.

    With a real author ID (ra_id > -1) the information is stored on that
    real author as "coauthor" entries of the form "<author name>;;<value>".
    The value is the unified collaboration name if a collaboration was
    found, otherwise one unified coauthor name per entry, or -- when there
    are more than MAX_COAUTHORS coauthors -- a single hash of the whole
    coauthor set. With the default ra_id of '-1' nothing is stored and the
    values found are returned instead.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True if ra_id > -1; otherwise the collaboration values if any
        were found, else the coauthor values
    @rtype: True, or the collection returned by
        get_field_values_on_condition
    '''
    bibrec_id = ""
    authorname_id = -1

    # Only two of the virtual author's tag/value records matter here.
    for va_data_item in get_virtualauthor_records(va_id):
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)

    bconfig.LOGGER.info("| Reading coauthors for va %s: %s recid %s" %
                        (va_id, authorname_strings["name"], bibrec_id))

    # Author fields of the record, excluding the author's own db name.
    coauthors = get_field_values_on_condition(bibrec_id, ['100', '700'], 'a',
                                              'a',
                                              authorname_strings["db_name"],
                                              "!=")

    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    # -1 is the "no real author" sentinel; any ID above it means "store".
    store_on_realauthor = ra_id > -1

    if (not coauthors) and (not collaboration):
        bconfig.LOGGER.info("|-> No coauthors and no collaboration found "
                            "for this author on this record")
    elif not store_on_realauthor:
        # This used to test "not ra_id", which is False for the default -1
        # and True only for the valid ID 0, so the summary was never logged
        # in read-only mode. It now mirrors the store condition above.
        if collaboration:
            bconfig.LOGGER.info("|-> Collaboration found: %s" %
                                (list(collaboration)[0]))
        else:
            bconfig.LOGGER.info("|-> Coauthors found: %s" % (len(coauthors)))

    if not store_on_realauthor:
        return collaboration or coauthors

    max_coauthors = MAX_COAUTHORS

    def store_coauthor(value):
        # Every stored entry shares the "<author name>;;<value>" layout.
        set_realauthor_data(
            ra_id, "coauthor",
            "%s;;%s" % (authorname_strings["name"], value))

    if collaboration:
        # Only the first collaboration value is stored.
        cname = list(collaboration)[0]
        store_coauthor(create_unified_name(cname.lower()))
    elif len(coauthors) <= max_coauthors:
        for coauthor in coauthors:
            store_coauthor(create_unified_name(coauthor.lower()))
    else:
        # Too many coauthors to store individually: keep one hash instead.
        hashvalue = hash_coauthor_set(coauthors)
        bconfig.LOGGER.info("|--> Coauthor # > %s. To preserve"
                            " information, a hash will be stored: %s" %
                            (max_coauthors, hashvalue))
        store_coauthor(hashvalue)

    return True
def compare_va_to_ra(va_id, ra_id):
    '''
    Scores how well the coauthors found for a virtual author agree with
    the "coauthor" entries stored for a real author.

    Outcomes, in the order they are checked:
      - both sides empty: 0
      - more virtual author coauthors than MAX_COAUTHORS: the hash of the
        coauthor set is looked for among the stored values; 1.0 if it is
        there, 0 if not
      - a shared name containing "ollaboration": 1.0
      - otherwise 1 - exp(-0.8 * n ** 0.7), n being the number of shared
        names (0 if the virtual author has no coauthors)

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real author
    @rtype: float (or the int 0)
    '''
    bconfig.LOGGER.info(
        "|-> Start of coauthorship comparison (va %s : ra %s)"
        % (va_id, ra_id))

    ra_entries = get_realauthor_data(ra_id, "coauthor")
    # Called without a real author ID, so this only reads the names.
    va_names = get_information_from_dataset(va_id)
    coauthor_limit = MAX_COAUTHORS

    if not (len(ra_entries) or len(va_names)):
        bconfig.LOGGER.info("|-> End of coauthorship comparison (Sets empty)")
        return 0

    if len(va_names) > coauthor_limit:
        # Large groups are stored as one hash, so compare hashes instead
        # of individual names.
        bconfig.LOGGER.info("|--> Many coauthors found. Will try hash"
                            " values for collaboration testing.")
        hashed = str(hash_coauthor_set(va_names))

        if any(entry['value'].split(";;")[1] == hashed
               for entry in ra_entries):
            bconfig.LOGGER.info("|---> Hash found! Assuming "
                                "collaboration attachment.")
            return 1.0

        bconfig.LOGGER.info("|---> Hash NOT found. Skipping metric.")
        return 0

    # Stored values look like "<author name>;;<coauthor>"; keep the part
    # after the separator.
    ra_names = set(entry['value'].split(";;")[1] for entry in ra_entries)
    va_names_unified = set(create_unified_name(va_name.lower())
                           for va_name in va_names)

    shared = ra_names.intersection(va_names_unified)

    for shared_name in shared:
        if "ollaboration" in shared_name:
            bconfig.LOGGER.info("|--> Found matching collaboration: %s" %
                                (shared_name))
            return 1.0

    certainty = (1 - exp(-.8 * pow(len(shared), .7))
                 if len(va_names) > 0 else 0)

    bconfig.LOGGER.info("|--> Found %s matching coauthors out of %s "
                        "on the paper. Result: %s%% similarity" %
                        (len(shared), len(va_names), certainty))

    return certainty
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the coauthors/collaboration attachment
    of a virtual author from the data set.

    With a real author ID (ra_id > -1) the information is stored on that
    real author as "coauthor" entries of the form "<author name>;;<value>".
    The value is the unified collaboration name if a collaboration was
    found, otherwise one unified coauthor name per entry, or -- when there
    are more than MAX_COAUTHORS coauthors -- a single hash of the whole
    coauthor set. With the default ra_id of '-1' nothing is stored and the
    values found are returned instead.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True if ra_id > -1; otherwise the collaboration values if any
        were found, else the coauthor values
    @rtype: True, or the collection returned by
        get_field_values_on_condition
    '''
    bibrec_id = ""
    authorname_id = -1

    # Only two of the virtual author's tag/value records matter here.
    for va_data_item in get_virtualauthor_records(va_id):
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)

    bconfig.LOGGER.info("| Reading coauthors for va %s: %s recid %s"
                        % (va_id, authorname_strings["name"], bibrec_id))

    # Author fields of the record, excluding the author's own db name.
    coauthors = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")

    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    # -1 is the "no real author" sentinel; any ID above it means "store".
    store_on_realauthor = ra_id > -1

    if (not coauthors) and (not collaboration):
        bconfig.LOGGER.info("|-> No coauthors and no collaboration found "
                            "for this author on this record")
    elif not store_on_realauthor:
        # This used to test "not ra_id", which is False for the default -1
        # and True only for the valid ID 0, so the summary was never logged
        # in read-only mode. It now mirrors the store condition above.
        if collaboration:
            bconfig.LOGGER.info("|-> Collaboration found: %s"
                                % (list(collaboration)[0]))
        else:
            bconfig.LOGGER.info("|-> Coauthors found: %s" % (len(coauthors)))

    if not store_on_realauthor:
        return collaboration or coauthors

    max_coauthors = MAX_COAUTHORS

    def store_coauthor(value):
        # Every stored entry shares the "<author name>;;<value>" layout.
        set_realauthor_data(ra_id, "coauthor", "%s;;%s"
                            % (authorname_strings["name"], value))

    if collaboration:
        # Only the first collaboration value is stored.
        cname = list(collaboration)[0]
        store_coauthor(create_unified_name(cname.lower()))
    elif len(coauthors) <= max_coauthors:
        for coauthor in coauthors:
            store_coauthor(create_unified_name(coauthor.lower()))
    else:
        # Too many coauthors to store individually: keep one hash instead.
        hashvalue = hash_coauthor_set(coauthors)
        bconfig.LOGGER.info("|--> Coauthor # > %s. To preserve"
                            " information, a hash will be stored: %s"
                            % (max_coauthors, hashvalue))
        store_coauthor(hashvalue)

    return True
def compare_va_to_ra(va_id, ra_id):
    '''
    Scores how well the coauthors found for a virtual author agree with
    the "coauthor" entries stored for a real author.

    Outcomes, in the order they are checked:
      - both sides empty: 0
      - more virtual author coauthors than MAX_COAUTHORS: the hash of the
        coauthor set is looked for among the stored values; 1.0 if it is
        there, 0 if not
      - a shared name containing "ollaboration": 1.0
      - otherwise 1 - exp(-0.8 * n ** 0.7), n being the number of shared
        names (0 if the virtual author has no coauthors)

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real author
    @rtype: float (or the int 0)
    '''
    bconfig.LOGGER.info(
        "|-> Start of coauthorship comparison (va %s : ra %s)"
        % (va_id, ra_id))

    ra_entries = get_realauthor_data(ra_id, "coauthor")
    # Called without a real author ID, so this only reads the names.
    va_names = get_information_from_dataset(va_id)
    coauthor_limit = MAX_COAUTHORS

    if not (len(ra_entries) or len(va_names)):
        bconfig.LOGGER.info("|-> End of coauthorship comparison (Sets empty)")
        return 0

    if len(va_names) > coauthor_limit:
        # Large groups are stored as one hash, so compare hashes instead
        # of individual names.
        bconfig.LOGGER.info("|--> Many coauthors found. Will try hash"
                            " values for collaboration testing.")
        hashed = str(hash_coauthor_set(va_names))

        if any(entry['value'].split(";;")[1] == hashed
               for entry in ra_entries):
            bconfig.LOGGER.info("|---> Hash found! Assuming "
                                "collaboration attachment.")
            return 1.0

        bconfig.LOGGER.info("|---> Hash NOT found. Skipping metric.")
        return 0

    # Stored values look like "<author name>;;<coauthor>"; keep the part
    # after the separator.
    ra_names = set(entry['value'].split(";;")[1] for entry in ra_entries)
    va_names_unified = set(create_unified_name(va_name.lower())
                           for va_name in va_names)

    shared = ra_names.intersection(va_names_unified)

    for shared_name in shared:
        if "ollaboration" in shared_name:
            bconfig.LOGGER.info("|--> Found matching collaboration: %s"
                                % (shared_name))
            return 1.0

    certainty = (1 - exp(-.8 * pow(len(shared), .7))
                 if len(va_names) > 0 else 0)

    bconfig.LOGGER.info("|--> Found %s matching coauthors out of %s "
                        "on the paper. Result: %s%% similarity"
                        % (len(shared), len(va_names), certainty))

    return certainty