Пример #1
0
from invenio.textutils import translate_to_ascii
from invenio.intbitset import intbitset
from invenio.bibauthorid_name_utils import create_indexable_name, split_name_parts
from invenio.bibauthorid_dbinterface import get_confirmed_name_to_authors_mapping, \
    get_author_to_name_and_occurrence_mapping, \
    populate_table, set_inverted_lists_ready, \
    set_dense_index_ready, search_engine_is_operating, \
    get_indexed_strings, get_author_groups_from_string_ids, \
    get_name_variants_for_authors, get_inverted_lists

from invenio.bibauthorid_general_utils import memoized

indexable_name_re = re.compile("[^a-zA-Z,\s]")

# Memoize some functions
translate_to_ascii = memoized(translate_to_ascii)
split_name_parts = memoized(split_name_parts)


def get_qgrams_from_string(s, q):
    '''
    It decomposes the given string to its qgrams. The qgrams of a string are
    its substrings of length q. For example the 2-grams (q=2) of the string
    'cathey' are (ca,at,th,he,ey).

    @param string: string to be decomposed
    @type string: str
    @param q: length of the grams
    @type q: int

    @return: the string qgrams ordered accordingly to the position they withhold in the string
Пример #2
0
bibauthorid_name_utils
    Bibauthorid utilities used by many parts of the framework
'''

import re
import invenio.bibauthorid_config as bconfig
from copy import deepcopy

from invenio.bibauthorid_general_utils import memoized

from invenio.bibauthorid_logutils import Logger

from math import sqrt
from invenio.textutils import translate_to_ascii as original_translate_to_ascii

translate_to_ascii = memoized(original_translate_to_ascii)
SQRT2 = sqrt(2)

try:
    from invenio.config import CFG_ETCDIR
    NO_CFG_ETCDIR = False
except ImportError:
    NO_CFG_ETCDIR = True

from Levenshtein import distance

logger = Logger("name comparison",
                verbose=bconfig.DEBUG_NAME_COMPARISON_OUTPUT)

artifact_removal = re.compile("[^a-zA-Z0-9]")
surname_cleaning = re.compile("-([a-z])")
Пример #3
0
def rabbit(bibrecs=None,
           check_invalid_papers=False,
           personids_to_update_extids=None,
           verbose=False):

    logger = Logger("Rabbit")

    if verbose:
        logger.verbose = True

    if not bibrecs:
        logger.log("Running on all records")
    else:
        logger.log("Running on %s " % (str(bibrecs)))

    populate_mnames_pids_cache()

    global M_NAME_PIDS_CACHE

    memoized_compare_names = memoized(comp_names)
    compare_names = lambda x, y: memoized_compare_names(*sorted((x, y)))

    def find_pids_by_matchable_name_with_cache(matchable_name):
        try:
            matched_pids = [M_NAME_PIDS_CACHE[matchable_name]]
        except KeyError:
            matched_pids = get_authors_by_name(matchable_name,
                                               use_matchable_name=True)
            if matched_pids:
                M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0]
        return matched_pids

    if USE_EXT_IDS:

        def get_matched_pids_by_external_ids(sig, rec, pids_having_rec):
            '''
            This function returns all the matched pids after iterating
            through all available external IDs of the system.
            '''
            for get_external_id_of_signature in external_id_getters:
                external_id = get_external_id_of_signature(sig + (rec, ))
                if external_id:
                    matched_pids = list(
                        get_author_by_external_id(external_id[0]))
                    if matched_pids and int(
                            matched_pids[0][0]) in pids_having_rec:
                        matched_pids = list()
                    return matched_pids

    threshold = 0.8

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    bibrecs = list(bibrecs)
    for idx, rec in enumerate(bibrecs):

        logger.log("Considering %s" % str(rec))

        if idx % 100 == 0:
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if idx % 1000 == 0:
            destroy_partial_marc_caches()
            populate_partial_marc_caches(bibrecs[idx:idx + 1000])

            logger.log(
                float(idx) / len(bibrecs), "%d/%d" % (idx, len(bibrecs)))

        if rec in deleted:
            remove_papers([rec])
            continue

        author_refs = get_author_refs_of_paper(rec)
        coauthor_refs = get_coauthor_refs_of_paper(rec)

        markrefs = frozenset(
            chain(izip(cycle([100]), imap(itemgetter(0), author_refs)),
                  izip(cycle([700]), imap(itemgetter(0), coauthor_refs))))

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new, get_name_by_bibref(new)) for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logger.log(" - Deleted signatures: %s" % str(old_signatures))
        logger.log(" - Added signatures: %s" % str(new_signatures))
        logger.log(" - Matrix: %s" % str(matrix))

        #[new_signatures, old_signatures]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logger.log(" - Best match: %s " % str(best_match))

        for new, old in best_match:
            logger.log("  -  -  Moving signature: %s on %s to %s as %s" %
                       (old, rec, new, new_signatures_names[new]))
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)
        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        remaining_personid_rows = ([
            x for x in personid_rows if x[1:3] in old_signatures
        ])

        pids_having_rec = set([int(row[0]) for row in remaining_personid_rows])
        logger.log(" - Not matched: %s" % str(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matchable_name = create_matchable_name(name)
            matched_pids = list()
            if USE_EXT_IDS:
                matched_pids = get_matched_pids_by_external_ids(
                    sig, rec, pids_having_rec)

                if matched_pids:
                    add_signature(list(sig) + [rec],
                                  name,
                                  matched_pids[0][0],
                                  m_name=matchable_name)
                    M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0][0]
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_matchable_name_with_cache(
                matchable_name)
            if not matched_pids:
                for matching_function in M_NAME_FUNCTIONS[1:]:
                    matchable_name = matching_function(name)
                    matched_pids = find_pids_by_matchable_name_with_cache(
                        matchable_name)
                    if matched_pids:
                        break

            matched_pids = [p for p in matched_pids if int(p) not in used_pids]

            best_matched_pid = None
            for matched_pid in matched_pids:
                # Because of the wrongly labeled data in the db, all
                # of the possible choices have to be checked. If one of the
                # coauthors, who had his signature already considered, claimed
                # in the past one of the signatures of currently considered
                # author, the algorithm will think that two signatures belong
                # to the same person, and, will create an unnecessary new
                # profile.
                if not int(matched_pid) in pids_having_rec:
                    best_matched_pid = matched_pid
                    break

            if not best_matched_pid:
                new_pid = new_person_from_signature(
                    list(sig) + [rec], name, matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = new_pid
                used_pids.add(new_pid)
                updated_pids.add(new_pid)
            else:
                add_signature(list(sig) + [rec],
                              name,
                              best_matched_pid,
                              m_name=matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = best_matched_pid
                used_pids.add(best_matched_pid)
                updated_pids.add(best_matched_pid)
                pids_having_rec.add(best_matched_pid)

        logger.log('Finished with %s' % str(rec))

    logger.update_status_final()

    destroy_partial_marc_caches()

    if personids_to_update_extids:
        updated_pids |= set(personids_to_update_extids)
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS,
            force_cache_tables=True)

    destroy_partial_marc_caches()
    destroy_mnames_pids_cache()

    remove_empty_authors()

    task_update_progress("Done!")
Пример #4
0
    return tuple(signatures)


def get_signatures_with_inspireID_cache(inspireid):
    return _get_signatures_with_tag_value_cache(inspireid, '__i')


def get_signatures_with_orcid_cache(orcid):
    return _get_signatures_with_tag_value_cache(orcid, '__j')


def get_all_recids_in_hepnames():
    return set(perform_request_search(p='', cc='HepNames', rg=0))


get_all_recids_in_hepnames = memoized(get_all_recids_in_hepnames)


def get_inspireID_from_hepnames(pid):
    """return inspireID of a pid by searching the hepnames

    Arguments:
    pid -- the pid of the author to search in the hepnames dataset
    """
    author_canonical_name = get_canonical_name_of_author(pid)
    hepnames_recids = get_all_recids_in_hepnames()
    try:
        recid = set(
            search_unit_in_bibxxx(p=author_canonical_name[0][0],
                                  f='035__',
                                  type='='))