Пример #1
0
def crossref_normalize_name(record):
    """
    Rewrite the author names of a record, in place, into the unified
    format produced by the bibauthor_name_utils tools.

    Every '100' (main author) and '700' (additional author) field instance
    is processed: the value read from field[0][0][1] is passed through
    split_name_parts / create_normalized_name, the whitespace between
    consecutive initials is dropped ("A. B. C." becomes "A.B.C."), and the
    result is written to subfield 'a' at subfield position 0.

    @param record: record structure understood by record_get_field_instances
        and record_modify_subfield; it is modified in place
    @return: None (the record passed in is the one that gets changed)
    """
    # Raw string: the regex escapes (\., \s) must reach the re module
    # untouched by Python's own string-escape handling.
    pattern_initials = re.compile(r'([A-Z]\.)\s([A-Z]\.)')

    # Main author first, then the additional authors.
    for tag in ('100', '700'):
        for field in record_get_field_instances(record, tag):
            author = field[0][0][1]
            new_author = create_normalized_name(split_name_parts(author))
            # Matches cannot overlap: one pass over "A. B. C." only joins
            # "A. B.", so a second pass is needed to join the remainder.
            for _ in range(2):
                new_author = pattern_initials.sub(r'\g<1>\g<2>', new_author)
            # field[4] is handed over as the field's global position.
            record_modify_subfield(rec=record, tag=tag, subfield_code='a',
                                   value=new_author, subfield_position=0,
                                   field_position_global=field[4])
Пример #2
0
def crossref_normalize_name(record):
    """
    Bring the author names held in '100' and '700' fields of the record
    into the unified form built by the bibauthor_name_utils tools.

    The record is updated through record_modify_subfield; nothing is
    returned.

    @param record: the record whose author subfields are rewritten
    """
    # matches two initials separated by one whitespace character
    initials_gap = '([A-Z]\\.)\\s([A-Z]\\.)'

    def unified(raw_name):
        # normalize first, then glue neighbouring initials together
        result = create_normalized_name(split_name_parts(raw_name))
        passes_left = 2  # a single pass leaves every other gap untouched
        while passes_left:
            result = re.sub(initials_gap, r'\g<1>\g<2>', result)
            passes_left -= 1
        return result

    # '100' holds the main author, '700' the additional ones
    for marc_tag in ('100', '700'):
        for instance in record_get_field_instances(record, marc_tag):
            replacement = unified(instance[0][0][1])
            global_position = instance[4]
            record_modify_subfield(
                rec=record,
                tag=marc_tag,
                subfield_code='a',
                value=replacement,
                subfield_position=0,
                field_position_global=global_position)
def convert_personid():
    """
    Copy the content of the `aidPERSONID` table into the
    `aidPERSONIDPAPERS` and `aidPERSONIDDATA` tables.

    Rows tagged 'paper' carry "table:ref,rec" in their data column; they are
    split, the author name is looked up through get_name_by_bibref and
    normalized, and the result goes to `aidPERSONIDPAPERS`. Rows tagged
    'gathered_name' are dropped. Every other row is copied unchanged to
    `aidPERSONIDDATA`. Inserts are batched: a batch is flushed as soon as
    more than `chunk` SQL arguments have piled up.

    @return: None
    """
    from invenio.dbquery import run_sql # oh come on, the whole function will be removed soon
    from itertools import repeat
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
        # args is a flat list, seven values per row. Floor division keeps the
        # row count an int; repeat() rejects the float that true division
        # would produce.
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) // 7))
                , tuple(args))

    def flush_data(args):
        # Same as flush_papers, with five values per row.
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s)", len(args) // 5))
               , tuple(args))

    paper_args = []
    data_args = []
    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')
            try:
                name = get_name_by_bibref((int(tab), int(ref), int(rec)))
            except Exception:
                # Best effort: a signature whose name cannot be resolved is
                # skipped. KeyboardInterrupt / SystemExit still propagate.
                continue
            name = split_name_parts(name)
            name = create_normalized_name(name)
            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]
            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif row[1] == 'gathered_name':
            continue
        else:
            data_args += list(row)
            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    # write whatever is left over from the last incomplete batches
    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
Пример #4
0
def convert_personid():
    """
    Migrate the rows of `aidPERSONID` to `aidPERSONIDPAPERS` and
    `aidPERSONIDDATA`.

    - 'paper' rows: the data column ("table:ref,rec") is split, the name is
      fetched with get_name_by_bibrecref and normalized, and one row is
      queued for `aidPERSONIDPAPERS`. Rows whose name lookup raises are
      skipped.
    - 'gathered_name' rows are ignored.
    - all remaining rows are queued as they are for `aidPERSONIDDATA`.

    Queued values are inserted in batches, each flushed once it holds more
    than `chunk` SQL arguments.

    @return: None
    """
    from dbquery import run_sql # oh come on, the whole function will be removed soon
    from itertools import repeat
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def placeholders(width, args):
        # One "(%s, ..., %s)" group per row. The row count is obtained with
        # floor division so that repeat() always receives an int.
        group = "(" + ", ".join(repeat("%s", width)) + ")"
        return " , ".join(repeat(group, len(args) // width))

    def flush_papers(args):
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + placeholders(7, args)
                , tuple(args))

    def flush_data(args):
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + placeholders(5, args)
               , tuple(args))

    paper_args = []
    data_args = []
    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')
            try:
                name = get_name_by_bibrecref((int(tab), int(ref), int(rec)))
            except Exception:
                # Deliberately best effort, but no longer a bare except:
                # interpreter-exit exceptions are not swallowed.
                continue
            name = create_normalized_name(split_name_parts(name))
            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]
            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif row[1] == 'gathered_name':
            continue
        else:
            data_args += list(row)
            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    # flush the incomplete final batches
    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
Пример #5
0
def rabbit(bibrecs,
           check_invalid_papers=False,
           personids_to_update_extids=None,
           verbose=False):
    '''
    Bring the signatures stored for each paper in line with the author
    references currently found on that paper.

    For every record, the (tag, ref) pairs found on the paper are compared
    with the signatures already stored for it. Stored signatures that are no
    longer on the paper are paired with new ones by name similarity; pairs
    scoring above the threshold are rewritten with modify_signature, after
    which every old signature is handed to remove_signatures. New signatures
    left without a pair are attached to an author found by external id (when
    enabled), else to an author with exactly the same name, else to a newly
    created author.

    @param bibrecs: an iterable full of bibrecs; when empty, all valid
        papers are processed. len() is called on it.
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when true, filter_bibrecs_outside is called
        with the list of all valid papers before the records are processed
    @param personids_to_update_extids: optional set of author ids merged
        (with |=) into the set of authors whose canonical names and external
        ids are refreshed at the end
    @param verbose: when true, every log message is written at verbosity 1
        instead of 9
    @return: none
    '''
    # The log file only ever receives this header line, so it is closed right
    # away instead of leaking the handle for the rest of the run.
    with open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w') as logfile:
        logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        # errors (and everything in verbose mode) get verbosity level 1
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        # name -> set of author ids, kept up to date by the wrappers below
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                # zip() over a single iterable wraps each pid in a 1-tuple,
                # which is the shape the callers index with [0]
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) >
            bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        # sleep check and progress report only every 200 records
        if idx % 200 == 0:
            task_sleep_now_if_required(True)

            update_status(
                float(idx) / len(bibrecs),
                "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(
                " - Record was deleted, removing from pid and continuing with next record",
                True)
            remove_papers([rec])
            continue

        # (100, ref) for author refs and (700, ref) for coauthor refs
        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_author_refs_of_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        # [pid, table, ref, name] for each stored signature of the paper
        # NOTE(review): relies on map() returning a list (Python 2)
        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        # on the paper but not stored yet / stored but gone from the paper
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new,
             create_normalized_name(split_name_parts(get_name_by_bibref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures),
                 bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures),
                 bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(
                " - - Moving signature: %s on %s to %s as %s" %
                (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        # re-read after the changes above: authors that hold this record now
        pids_having_rec = set(
            [int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        # only needed (and only read) by the loop below
        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_author_by_external_id(inspire_id[0]))
                        # discard the match if that author already holds a
                        # signature on this record
                        if matched_pids and int(
                                matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [
                p for p in matched_pids if int(p[0]) not in used_pids
            ]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
Пример #6
0
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    Bring the signatures stored for each paper in line with the author
    references currently found on that paper.

    For every record, the (tag, ref) pairs found on the paper are compared
    with the signatures already stored for it. Old signatures are paired
    with new ones by name similarity and pairs scoring above the threshold
    are rewritten with modify_signature; every old signature is then handed
    to remove_sigs. New signatures left without a pair are attached to a
    person found by external id (when enabled), else to a person with
    exactly the same name, else to a newly created person.

    @param bibrecs: an iterable full of bibrecs; when empty, all valid
        bibrecs are processed. len() is called on it.
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when true, filter_bibrecs_outside is called
        with the list of all valid bibrecs before the records are processed
    @param personids_to_update_extids: optional set of person ids merged
        (with |=) into the set of persons whose canonical names and external
        ids are refreshed at the end
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        # name -> set of person ids, kept up to date by the wrappers below
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                # zip() over a single iterable wraps each pid in a 1-tuple,
                # which is the shape the callers index with [0]
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        # sleep check and status report on every record
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        # (100, ref) for author refs and (700, ref) for coauthor refs
        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        # [pid, table, ref, name] for each stored signature of the record
        # NOTE(review): relies on map() returning a list (Python 2)
        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        # on the paper but not stored yet / stored but gone from the paper
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        # every old signature is passed on, including the ones just modified
        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        # only needed (and only read) by the loop below
        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = []
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_person_with_extid(inspire_id[0]))
                # an external-id match wins over the name-based lookup
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    continue

            # fall back to persons carrying exactly this name, skipping the
            # ones already used on this record
            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
Пример #7
0
def arxiv_login(req, picked_profile=None):
    '''
    Log in through arXiv and queue claim tickets for the user's arXiv papers.

    The user's name and arXiv paper ids are read from the user info
    (external_firstname, external_familyname, external_arxivids) and the
    papers are searched by their 037 field. If the user is already
    associated to a personid, that personid is used. If not, and no profile
    was picked, the candidate profiles are returned so that the caller can
    let the user choose; with a picked profile, its availability is checked.
    Finally one 'confirm' entry per found paper that is not among the
    person's claimed papers is put in the session ticket.
    !!! the user will find the open ticket, which will require him to go
    through the final review before getting committed.

    @param req: Apache request object
    @type req: Apache request object
    @param picked_profile: personid chosen by the user, or None when no
        choice has been made yet
    @type picked_profile: int or None

    @return: a (status, value) pair, one of
        ("top5_list", candidate list) - no personid known, none picked
        ("chosen pid not available", pid) - another pid than the picked one
        ("pid assigned by user", pid) - the picked pid was assigned
        ("pid", pid) - no profile picked, or picked_profile is -1
    @rtype: tuple
    '''
    def session_bareinit(req):
        # make sure session['personinfo']['ticket'] exists
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.dirty = True

    session_bareinit(req)
    session = get_session(req)

    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = False

    try:
        name = uinfo['external_firstname']
    except KeyError:
        name = ''
    try:
        surname = uinfo['external_familyname']
    except KeyError:
        surname = ''

    if surname:
        session['personinfo']['arxiv_name'] = nameapi.create_normalized_name(
                                          nameapi.split_name_parts(surname + ', ' + name))
    else:
        session['personinfo']['arxiv_name'] = ''

    session.dirty = True

    try:
        arxiv_p_ids = uinfo['external_arxivids'].split(';')
    except KeyError:
        arxiv_p_ids = []

    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',

    try:
        found_bibrecs = set(reduce(add, [perform_request_search(p='037:' + str(arx), of='id', rg=0)
                                         for arx in arxiv_p_ids]))
    except (IndexError, TypeError):
        # reduce() raises TypeError on an empty list: no arXiv ids, no records
        found_bibrecs = set()

    uid = getUid(req)
    pid, pid_found = dbapi.get_personid_from_uid([[uid]])

    if pid_found:
        pid = pid[0]
    else:
        if picked_profile is None:
            # nothing to go on yet: hand the candidates back to the caller
            top5_list = dbapi.find_top5_personid_for_new_arXiv_user(found_bibrecs,
                nameapi.create_normalized_name(nameapi.split_name_parts(surname + ', ' + name)))
            return ("top5_list", top5_list)
        else:
            pid = dbapi.check_personids_availability(picked_profile, uid)

    pid_bibrecs = set([i[0] for i in dbapi.get_all_personids_recs(pid, claimed_only=True)])
    missing_bibrecs = found_bibrecs - pid_bibrecs

    #now we have to open the tickets...
    tempticket = []
    for bibrec in missing_bibrecs:
        tempticket.append({'pid':pid, 'bibref':str(bibrec), 'action':'confirm'})

    #check if ticket targets (bibref for pid) are already in ticket
    for t in list(tempticket):
        # iterate over a copy: entries are removed from ticket in place
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)

    session.dirty = True

    # -1 is treated like "no profile picked" when reporting the status
    if picked_profile is None or picked_profile == -1:
        return ("pid", pid)
    if picked_profile != pid:
        return ("chosen pid not available", pid)
    return ("pid assigned by user", pid)
Пример #8
0
def arxiv_login(req, picked_profile=None):
    '''
    Log in through arXiv and open claim tickets for the papers arXiv lists
    for the user.

    Name and arXiv paper ids come from the user info (external_firstname,
    external_familyname, external_arxivids); the papers are looked up through
    their 037 field. A user already associated to a personid keeps it. For
    other users the candidate profiles are returned when no profile was
    picked, otherwise the availability of the picked profile is checked.
    Each found paper that is not among the person's claimed papers is added
    to the session ticket as a 'confirm' entry.
    !!! the user will find the open ticket, which will require him to go
    through the final review before getting committed.

    @param req: Apache request object
    @type req: Apache request object
    @param picked_profile: personid chosen by the user, or None when no
        choice has been made yet
    @type picked_profile: int or None

    @return: (status, value), where status tells what value holds:
        "top5_list" - value is the list of candidate profiles
        "chosen pid not available" - value is the pid used instead
        "pid assigned by user" - value is the picked pid
        "pid" - value is the pid (nothing picked, or picked_profile is -1)
    @rtype: tuple
    '''
    def session_bareinit(req):
        # guarantee that session['personinfo'] exists and has a 'ticket' list
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.dirty = True

    session_bareinit(req)
    session = get_session(req)

    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = False

    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',
    try:
        name = uinfo['external_firstname']
    except KeyError:
        name = ''
    try:
        surname = uinfo['external_familyname']
    except KeyError:
        surname = ''

    if surname:
        session['personinfo']['arxiv_name'] = nameapi.create_normalized_name(
                                          nameapi.split_name_parts(surname + ', ' + name))
    else:
        session['personinfo']['arxiv_name'] = ''

    session.dirty = True

    try:
        arxiv_p_ids = uinfo['external_arxivids'].split(';')
    except KeyError:
        arxiv_p_ids = []

    try:
        per_id_results = [perform_request_search(p='037:' + str(arx), of='id', rg=0)
                          for arx in arxiv_p_ids]
        found_bibrecs = set(reduce(add, per_id_results))
    except (IndexError, TypeError):
        # an empty result list makes reduce() raise TypeError
        found_bibrecs = set()

    uid = getUid(req)
    pid, pid_found = dbapi.get_personid_from_uid([[uid]])

    if pid_found:
        pid = pid[0]
    elif picked_profile is None:
        # unknown user and no choice made: let the caller offer candidates
        top5_list = dbapi.find_top5_personid_for_new_arXiv_user(found_bibrecs,
            nameapi.create_normalized_name(nameapi.split_name_parts(surname + ', ' + name)))
        return ("top5_list", top5_list)
    else:
        pid = dbapi.check_personids_availability(picked_profile, uid)

    pid_bibrecs = set([i[0] for i in dbapi.get_all_personids_recs(pid, claimed_only=True)])
    missing_bibrecs = found_bibrecs - pid_bibrecs

    #now we have to open the tickets...
    tempticket = [{'pid':pid, 'bibref':str(bibrec), 'action':'confirm'}
                  for bibrec in missing_bibrecs]

    #check if ticket targets (bibref for pid) are already in ticket
    for t in tempticket:
        # a copy is iterated because matching entries are removed in place
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)

    session.dirty = True

    # a picked_profile of -1 is reported like "nothing picked"
    if picked_profile is None or picked_profile == -1:
        return ("pid", pid)
    if picked_profile == pid:
        return ("pid assigned by user", pid)
    return ("chosen pid not available", pid)
Пример #9
0
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False):
    '''
    Synchronize the stored signatures of each paper with the author
    references currently present on the paper.

    Per record: the (tag, ref) pairs on the paper are compared with the
    stored signatures. Old and new signatures are paired by name similarity;
    pairs scoring above the threshold are rewritten with modify_signature and
    all old signatures are then passed to remove_signatures. Each remaining
    new signature goes to an author found by external id (when enabled), else
    to an author with exactly the same name, else to a newly created author.

    @param bibrecs: an iterable full of bibrecs; when empty, all valid
        papers are processed. len() is called on it.
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when true, filter_bibrecs_outside is called
        with the list of all valid papers before the records are processed
    @param personids_to_update_extids: optional set of author ids merged
        (with |=) into the set of authors whose canonical names and external
        ids are refreshed at the end
    @param verbose: when true, every log message is written at verbosity 1
        instead of 9
    @return: none
    '''
    # Only this header line is ever written to the log file; the with-block
    # closes the handle immediately instead of leaving it open.
    with open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w') as logfile:
        logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        # errors (and everything in verbose mode) get verbosity level 1
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        # name -> set of author ids, maintained by the wrappers below
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                # zip() over one iterable yields 1-tuples, the shape the
                # callers index with [0]
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        # sleep check and progress report only every 200 records
        if idx % 200 == 0:
            task_sleep_now_if_required(True)

            update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(" - Record was deleted, removing from pid and continuing with next record", True)
            remove_papers([rec])
            continue

        # (100, ref) for author refs and (700, ref) for coauthor refs
        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        # [pid, table, ref, name] for each stored signature of the paper
        # NOTE(review): relies on map() returning a list (Python 2)
        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        # on the paper but not stored yet / stored but gone from the paper
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        # re-read after the changes above: authors that hold this record now
        pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        # only needed (and only read) by the loop below
        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_author_by_external_id(inspire_id[0]))
                        # discard the match if that author already holds a
                        # signature on this record
                        if matched_pids and int(matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids: # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
Пример #10
0
def rabbit(bibrecs,
           check_invalid_papers=False,
           personids_to_update_extids=None):
    '''
    Synchronizes the stored author signatures of the given papers with the
    author references currently found on those papers.

    For every record the signatures found on the paper (tags 100 and 700)
    are compared with the signatures returned by get_signatures_from_rec:
    stored signatures whose name is similar enough to a new signature are
    re-pointed to it, the stored signatures that are no longer on the paper
    are handed to remove_sigs, and the remaining new signatures are attached
    to an existing person (by external id, then by exact name) or to a
    freshly created one.

    @param bibrecs: an iterable full of bibrecs; when empty or None every
        record returned by get_all_valid_bibrecs() is processed. len() is
        called on it, so it has to be a sized container.
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when true, filter_bibrecs_outside() is
        called with all the valid bibrecs before processing starts
    @type check_invalid_papers: bool
    @param personids_to_update_extids: extra person ids merged (with |=)
        into the set of persons whose canonical names and external ids are
        refreshed at the end, so it has to be a set when given
    @type personids_to_update_extids: set or None
    @return: none
    '''
    # Pick the helpers used below: either the plain ones or wrappers that
    # keep an in-memory name -> person ids dictionary in sync.
    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            # zip() over a single iterable wraps every pid in a 1-tuple, so
            # callers can keep indexing results as p[0].
            # NOTE(review): presumably this mirrors the row shape returned by
            # _find_pids_by_exact_name -- confirm.
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            # Record the pid under the name in the cache first, then store
            # the signature through the plain helper.
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            # Allocate a new person id and attach the signature to it via
            # the cache-aware helper.
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    # NOTE(review): cached_sym presumably memoizes comp_names treating the
    # two arguments symmetrically -- confirm against its definition.
    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold: a stored signature is only re-pointed to a new
    # one when their name comparison score is strictly above this value
    threshold = 0.80

    # The list of all valid bibrecs is needed both as the default work list
    # and as the argument of the invalid papers check.
    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    # For large enough runs the partial marc caches are populated up front;
    # the flag remembers that they have to be destroyed at the end.
    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) >
            bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    # person ids touched during this run
    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(
            float(idx) / len(bibrecs),
            "%d/%d current: %d" % (idx, len(bibrecs), rec))
        # papers reported as deleted are removed and need no further work
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        # signatures currently on the paper, as (tag, ref) pairs: tag 100
        # for the authors and tag 700 for the coauthors
        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_authors_from_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        # stored rows reduced to [int, int, int, row[4]]; column 3 of the
        # original row is dropped
        # NOTE(review): presumably (personid, table, ref, name) -- confirm
        # against get_signatures_from_rec.
        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_from_rec(rec)
        ]
        # stored signature (row[1], row[2]) -> stored value row[3]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        # on the paper but not stored yet
        new_signatures = list(markrefs - personidrefs)
        # stored but no longer on the paper
        old_signatures = list(personidrefs - markrefs)

        # normalized name of every new signature
        new_signatures_names = dict(
            (new,
             create_normalized_name(
                 split_name_parts(get_name_by_bibrecref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        # maximized_mapping yields (new index, old index, score) triples;
        # only pairs scoring above the threshold are kept
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        # every old signature is passed on for removal, including the ones
        # handled by modify_signature just above
        # NOTE(review): presumably the modified ones no longer exist under
        # their old key, making their removal a no-op -- confirm.
        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        # new signatures that did not take over a stored one
        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        # used_pids is only needed (and only defined) when the loop below
        # has something to iterate over
        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = []
            # first choice: the person carrying the inspire id found for
            # this signature
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_person_with_extid(inspire_id[0]))
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    continue

            # second choice: a person with exactly this name that is not
            # already in used_pids for this paper
            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [
                p for p in matched_pids if int(p[0]) not in used_pids
            ]

            if not matched_pids:
                # nobody suitable: create a new person for the signature
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
def main():
    """
    Reads import file and verfies the md5 hash.
    For each line in the import file:
        find new record from bibcode, find new ref from name on record
        find old row in personid tables
        copy row with new authorref (tab:bibref,rec) to temp table
    overwrite personid tables w/ temp table
    """
    ## create temporary tables...
    print "Creating temporary tables..."
    create_temp_pid_sql_table()
    create_temp_piddata_sql_table()
    create_temp_user_input_log_sql_table()
    
    ## fill temp tables w/ static values...
    print "Filling temporary tables with static, unchanged content"
    copy_unaltered_piddata_rows_to_temp()
    copy_unaltered_user_input_log_table_rows_to_temp()
    ## compile regexp for line break removal
    nlr = re.compile('[\n\r]+')

    #verify file integrity
    print ("Verifying file integrity of %s with"
           " MD5 checksum from %s" % (IMPORT_FILE_NAME, IMPORT_MD5_FILE_NAME))
    fp = open(IMPORT_FILE_NAME, "rb")
    fmd5 = md5_for_file(fp)
    fp.close()

    fp = open(IMPORT_MD5_FILE_NAME, "r")
    vmd5 = fp.read()
    fp.close()
    
    if not fmd5 == vmd5:
        print "WARNING: Detected a disturbance in the file. Will exit here."
        return

    total_lines = file_len()
    fp = open(IMPORT_FILE_NAME, "r")
    print "Processing file %s..." % IMPORT_FILE_NAME

    for index, line in enumerate(fp.readlines()):
#        if index == 100:
#            break
        if index % 5000 == 0:
            percent = float(index) / float(total_lines)
            update_status(percent, "%s of %s lines processed in %s" % (index, total_lines, IMPORT_FILE_NAME))

        new_ref = None
        tab1, old_ref, old_rec, tab2, enname, bibcode = line.split("    ")
        
        assert tab1 == tab2

        if tab1 == "table":
            continue

        name = base64.b64decode(enname)
#        name = nq.sub("", name)
        bibcode = nlr.sub("", bibcode)
        new_rec = get_bibrec_from_bibcode(bibcode)

        for ref in get_authorrefs_and_names_from_bibrec(new_rec):
#            refname = create_normalized_name(split_name_parts(ref[2]))
            refname = ref[2]

            if refname == name and str(ref[0]) == tab1:
                #MySQL equivalent: col_name COLLATE utf8_bin = 'Case SenSitive name'
                new_ref = ref[1]
        
        if not new_ref:
            print "WARN: Authorref not found for name %s on new record %s?!" % (name, new_rec)
            continue

        # get personid, flag, lcul and last_updated from old aidPERSONIDPAPERS
        old_data = find_old_pidtable_row(tab1, old_ref, old_rec)

        if old_data:
            ## prepare data in temporary tables...
            pid, flag, lcul, lupdate = old_data
            old_authorref = "%s:%s,%s" % (tab1, old_ref, old_rec)
            new_authorref = "%s:%s,%s" % (tab1, new_ref, new_rec)
            ## Transform the name into a more consistent form
            inname = create_normalized_name(split_name_parts(name))
            ## Insert transformed data into temp tables...
            insert_into_temp_table(pid, tab1, new_ref, new_rec, inname, flag, lcul, lupdate)
            update_temp_piddata_table(old_authorref, new_authorref)
            update_temp_user_input_log_table(old_authorref, new_authorref)
        else:
            print "WARN: %s does not exist in db!" % ([tab1, old_ref, old_rec])

        # The following is true only if applied on the same data set
        # Commented out by default. For testing/debug uses only
        try:
            if RUN_IN_TEST_MODE:
                assert str(old_rec) == str(new_rec)
                assert str(old_ref) == str(new_ref)
                pass
        except AssertionError, e:
            print "ERROR: ", e
            print "%s:%s,%s vs. %s:%s,%s on %s:%s" % (tab1, old_ref, old_rec, tab1, new_ref, new_rec, bibcode, name)
Пример #12
0
 def test_create_normalized_name(self):
     """Every key of self.tc must normalize to the value stored under it."""
     for raw_name, expected in self.tc.items():
         normalized = create_normalized_name(split_name_parts(raw_name))
         self.assertEqual(normalized, expected)
 def test_create_normalized_name(self):
     # self.tc holds the test cases: the key is the input name, the value
     # is the normalized name expected for it.
     cases = self.tc
     for raw in cases.keys():
         parts = split_name_parts(raw)
         self.assertEqual(create_normalized_name(parts), cases[raw])