Example #1
def compare_fieldvalues_authorname(field_comparisons, threshold, matches_needed):
    """
    Performs field validation given an list of field comparisons using a technique
    that is meant for author-names taking into account initials vs. full-name,
    using matching techniques available from BibAuthorId.

    Each comparison is done according to given threshold which the result must
    be equal or above to match.

    During validation the fields are compared and matches are counted per
    field, up to the given amount of matches needed is met, causing the
    function to return True. If validation ends before this threshold is met
    it will return False.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list

    @param threshold: number describing the match threshold a comparison must
        exceed to become a positive match.
    @type threshold: float

    @param matches_needed: number of positive field matches needed for the entire
        comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found, False if not,
        and number of matches.
    @rtype: tuple
    """
    matches_found = 0
    # Loop over all possible comparisons field by field; if a match is found,
    # we are done with this field and break out to try to match the next field.
    for comparisons in field_comparisons:
        for value, other_value in comparisons:
            # Grab both permutations of a name (before, after and after, before)
            # and compare to each unique commutative combination. Ex:
            # Doe,J vs. Smith,J -> [(('Smith,J', 'Doe,J'), ('Smith,J', 'J,Doe')),
            #                       (('J,Smith', 'Doe,J'), ('J,Smith', 'J,Doe'))]
            author_comparisons = [pair for pair in get_paired_comparisons(
                get_reversed_string_variants(value),
                get_reversed_string_variants(other_value))][0]
            for str1, str2 in author_comparisons:
                # Author-name comparison - using BibAuthorid function
                diff = soft_compare_names(str1, str2)
                if diff >= threshold:
                    matches_found += 1
                    break
            else:
                # We continue as no match was found
                continue
            # We break out as a match was found
            break
        # If we already have found required number of matches, we return immediately
        if matches_found >= matches_needed:
            return True, matches_found
    # Often author names do not match fully, so let's allow the number of
    # matches to be a little lower, using the same threshold.
    result = matches_found >= matches_needed or matches_found / float(matches_needed) > threshold
    return result, matches_found
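A minimal usage sketch for Example #1 (the name pairs and parameter values below are illustrative, and the call assumes the helpers used inside the function are importable alongside it):

# Illustrative data: each inner list holds the value pairs for one field.
field_comparisons = [
    [('Ellis, J.', 'Ellis, John')],   # field 1
    [('Smith, J.', 'Doe, Jane')],     # field 2
]
matched, count = compare_fieldvalues_authorname(field_comparisons,
                                                threshold=0.8,
                                                matches_needed=2)
# If only the first field reaches the threshold, count is 1 and matched is
# False, since 1 / 2.0 = 0.5 does not exceed 0.8.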
Example #2
def fallback_find_personids_by_name_string(target):
    '''
    Search engine to find persons matching the given string.
    The matching is done on the surname first, and on the given names if
    present. An ordered list (ordered by compatibility) of pids and found
    names is returned.

    @param target: name string, 'surname, names I.'
    @type target: string
    @return: pid list of lists
        [pid, [[name string, occur count, compatibility]]]
    '''
    family = get_surname(target)
    ascii_family = get_surname(translate_to_ascii(target)[0])
    clean_family = get_surname(clean_string(target))

    #SANITY: avoid empty queries
    if not family:
        return list()

    levels = (  # target + '%', #this introduces a weird problem: different results for mele, salvatore and salvatore mele
        family + '%', '%' + family + ',%', '%' + family[1:-1] + '%')

    if len(family) <= 4:
        levels = [levels[0], levels[2]]

    names = list(set().union(
        *map(get_authors_by_name_regexp, (family + ',%', ascii_family + ',%',
                                          clean_family + ',%'))))

    if not names:
        for lev in levels:
            names = dbinter.get_authors_by_name_regexp(lev)
            if names:
                break

    is_canonical = False
    if not names:
        names = dbinter.get_authors_by_canonical_name_regexp(target)
        is_canonical = True

    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)),
              soft_compare_names(target, key[1])) for key, data in names]
    names = groupby(names, itemgetter(0))
    names = [(key,
              sorted([(d[1], d[2], d[3])
                      for d in data if (d[3] > 0.5 or is_canonical)],
                     key=itemgetter(2),
                     reverse=True)) for key, data in names]
    names = [name for name in names if name[1]]
    names = sorted(names,
                   key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]),
                   reverse=True)

    return names
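A hedged sketch of how the result of Example #2 might be consumed; the query string is illustrative, and the structure follows the list built above: one (pid, [(name string, occurrence count, compatibility), ...]) entry per person, best-matching persons first.

# Illustrative query; any 'surname, names' string works the same way.
results = fallback_find_personids_by_name_string('Mele, Salvatore')
for pid, name_variants in results:
    # name_variants is sorted by compatibility, best first.
    best_name, occurrences, compatibility = name_variants[0]
    print "pid %s: %s (seen %d times, compatibility %.2f)" % (
        pid, best_name, occurrences, compatibility)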
Example #3
def find_personIDs_by_name_string(target):
    '''
    Search engine to find persons matching the given string.
    The matching is done on the surname first, and on the given names if
    present. An ordered list (ordered by compatibility) of pids and found
    names is returned.

    @param target: name string, 'surname, names I.'
    @type target: string
    @return: pid list of lists
        [pid, [[name string, occur count, compatibility]]]
    '''
    splitted_name = split_name_parts(target)
    family = splitted_name[0]

    levels = (  #target + '%', #this introduces a weird problem: different results for mele, salvatore and salvatore mele
        family + ',%', family[:-2] + '%', '%' + family + ',%',
        '%' + family[1:-1] + '%')

    if len(family) <= 4:
        levels = [levels[0], levels[2]]

    for lev in levels:
        names = dbinter.get_all_personids_by_name(lev)
        if names:
            break

    is_canonical = False
    if not names:
        names = dbinter.get_personids_by_canonical_name(target)
        is_canonical = True

    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)),
              soft_compare_names(target, key[1])) for key, data in names]
    names = groupby(names, itemgetter(0))
    names = [(key,
              sorted([(d[1], d[2], d[3])
                      for d in data if (d[3] > 0.5 or is_canonical)],
                     key=itemgetter(2),
                     reverse=True)) for key, data in names]
    names = [name for name in names if name[1]]
    names = sorted(names,
                   key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]),
                   reverse=True)

    return names
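The levels tuple in Example #3 defines progressively looser wildcard patterns that are tried in order (presumably matched SQL LIKE-style, given the % wildcards). A small sketch of what they expand to for an illustrative surname:

# Illustrative expansion of the fallback patterns for family = 'Ellis':
#   'Ellis,%'   exact surname followed by a comma
#   'Ell%'      surname with the last two characters dropped
#   '%Ellis,%'  surname anywhere, followed by a comma
#   '%lli%'     inner part of the surname anywhere
family = 'Ellis'
levels = (family + ',%', family[:-2] + '%',
          '%' + family + ',%', '%' + family[1:-1] + '%')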
Example #4
def fallback_find_personids_by_name_string(target):
    '''
    Search engine to find persons matching the given string.
    The matching is done on the surname first, and on the given names if
    present. An ordered list (ordered by compatibility) of pids and found
    names is returned.

    @param target: name string, 'surname, names I.'
    @type target: string
    @return: pid list of lists
        [pid, [[name string, occur count, compatibility]]]
    '''
    splitted_name = split_name_parts(target)
    family = splitted_name[0]

    levels = (# target + '%', #this introduces a weird problem: different results for mele, salvatore and salvatore mele
              family + ',%',
              family[:-2] + '%',
              '%' + family + ',%',
              '%' + family[1:-1] + '%')

    if len(family) <= 4:
        levels = [levels[0], levels[2]]

    for lev in levels:
        names = dbinter.get_authors_by_name_regexp(lev)
        if names:
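            # Report which wildcard pattern produced results.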
            print "%s" % lev
            break

    is_canonical = False
    if not names:
        names = dbinter.get_authors_by_canonical_name_regexp(target)
        is_canonical = True

    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)), soft_compare_names(target, key[1])) for key, data in names]
    names = groupby(names, itemgetter(0))
    names = [(key, sorted([(d[1], d[2], d[3]) for d in data if (d[3] > 0.5 or is_canonical)],
             key=itemgetter(2), reverse=True)) for key, data in names]
    names = [name for name in names if name[1]]
    names = sorted(names, key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]), reverse=True)

    return names
Example #5
def compare_fieldvalues_authorname(field_comparisons, threshold,
                                   matches_needed):
    """
    Performs field validation given an list of field comparisons using a technique
    that is meant for author-names taking into account initials vs. full-name,
    using matching techniques available from BibAuthorId.

    Each comparison is done according to given threshold which the result must
    be equal or above to match.

    During validation the fields are compared and matches are counted per
    field, up to the given amount of matches needed is met, causing the
    function to return True. If validation ends before this threshold is met
    it will return False.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list

    @param threshold: number describing the match threshold a comparison must
        exceed to become a positive match.
    @type threshold: float

    @param matches_needed: number of positive field matches needed for the entire
        comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found, False if not,
        and number of matches.
    @rtype: tuple
    """
    matches_found = 0
    # Loop over all possible comparisons field by field; if a match is found,
    # we are done with this field and break out to try to match the next field.
    for comparisons in field_comparisons:
        for value, other_value in comparisons:
            # Grab both permutations of a name (before, after and after, before)
            # and compare to each unique commutative combination. Ex:
            # Doe,J vs. Smith,J -> [(('Smith,J', 'Doe,J'), ('Smith,J', 'J,Doe')),
            #                       (('J,Smith', 'Doe,J'), ('J,Smith', 'J,Doe'))]
            author_comparisons = [pair for pair in get_paired_comparisons(
                get_reversed_string_variants(value),
                get_reversed_string_variants(other_value))][0]
            for str1, str2 in author_comparisons:
                # Author-name comparison - using BibAuthorid function
                diff = soft_compare_names(str1, str2)
                if diff >= threshold:
                    matches_found += 1
                    break
            else:
                # We continue as no match was found
                continue
            # We break out as a match was found
            break
        # If we already have found required number of matches, we return immediately
        if matches_found >= matches_needed:
            return True, matches_found
    # Often author names do not match fully, so let's allow the number of
    # matches to be a little lower, using the same threshold.
    result = matches_found >= matches_needed or matches_found / float(
        matches_needed) > threshold
    return result, matches_found
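A short worked example of the relaxed acceptance rule at the end of Example #5 (the numbers are illustrative): with matches_needed = 3 and threshold = 0.6, finding only 2 matches still yields a positive result, because 2 / 3.0 ≈ 0.67 is above 0.6; with threshold = 0.7 the same 2 matches would fail.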
Example #6
def arxiv_login(req):
    '''
    Log in through arXiv. If the user is already associated with a personid,
    returns that personid. If the user has no pid, tries to guess which
    personid to associate based on the surname and the papers coming from
    arXiv. If no compatible person is found, creates a new person.
    At the end of the process a ticket is opened for the user, claiming the
    papers from arXiv.
    !!! The user will find the open ticket and will have to go through the
    final review before it gets committed.

    @param req: Apache request object
    @type req: Apache request object

    @return: the pid resulting from the process
    @rtype: int
    '''
    def session_bareinit(req):
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.save()

    session_bareinit(req)
    session = get_session(req)
    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = True
    session.save()

    arxiv_p_ids = []
    name = ''
    surname = ''
    try:
        for i in uinfo['external_arxivids'].split(';'):
            arxiv_p_ids.append(i)
        name = uinfo['external_firstname']
        surname = uinfo['external_familyname']
    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',
    except KeyError:
        pass

    found_bibrecs = []
    for arx in arxiv_p_ids:
        t = search_engine.perform_request_search(p='037:' + str(arx), of='id')
        for i in t:
            found_bibrecs.append(i)
    #found_bibrecs = [567700, 567744]

    uid = getUid(req)
    pid = dbapi.get_personid_from_uid([[uid]])
    if pid[1]:
        pid_bibrecs = dbapi.get_all_personids_recs(pid[0][0])
        pid_bibrecs = set(pid_bibrecs)
        missing_bibrecs = [bib for bib in found_bibrecs if int(bib) not in pid_bibrecs]
        found_bibrecs = [bib for bib in found_bibrecs if int(bib) in pid_bibrecs]
    else:
        missing_bibrecs = []

    bibrec_names = []
    for b in found_bibrecs + missing_bibrecs:
        bibrec_names.append([b, get_field_values_on_condition(b, source='API', get_table=['100', '700'], get_tag='a')])

    for n in list(bibrec_names):
        for i in list(n[1]):
            if nameapi.soft_compare_names(surname, i.encode('utf-8')) < 0.45:
                n[1].remove(i)
    #bibrec_names = [[78, set([u'M\xfcck, W'])]]

    #Only the suitable names for each record are left at this point.
    bibrefrecs = []

    for bibrec in bibrec_names:
        for name in bibrec[1]:
            bibrefs = dbapi.get_bibrefs_from_name_string(name.encode('utf-8'))
            if len(bibrefs) < 1:
                continue
            for bibref in bibrefs[0][0].split(','):
                bibrefrecs.append(str(bibref) + ',' + str(bibrec[0]))
    #bibrefrecs = ['100:116,78', '700:505,78']

    person_papers = []
    if not pid[1]:
        brr = [[i] for i in bibrefrecs]
        possible_persons = dbapi.get_possible_personids_from_paperlist(brr)
        #[[0L, ['700:316,10']]]
        possible_persons = sorted(possible_persons, key=lambda k: len(k[1]))

        if len(possible_persons) > 1:
            for pp in possible_persons:
                pid = dbapi.assign_person_to_uid(uid, pp[0])
                person_papers = pp[1]
                if pid != -1:
                    break
            if pid == -1:
                pid = dbapi.assign_person_to_uid(uid, -1)
        elif len(possible_persons) == 1:
            pid = dbapi.assign_person_to_uid(uid, possible_persons[0][0])
            person_papers = possible_persons[0][1]
        else:
            pid = dbapi.assign_person_to_uid(uid, -1)
    else:
        pid = long(pid[0][0])

    tempticket = []
    #Now we have to open the tickets...
    #person_papers contains the papers which are already assigned to the
    #person and which came from arXiv; they can be claimed regardless.
    for bibref in person_papers:
        tempticket.append({'pid':pid, 'bibref':bibref, 'action':'confirm'})

    done_bibrecs = set(b.split(',')[1] for b in person_papers)
    for b in found_bibrecs + missing_bibrecs:
        if str(b) not in done_bibrecs:
            tempticket.append({'pid':pid, 'bibref':str(b), 'action':'confirm'})

    #Check whether ticket targets (bibref for pid) are already in the ticket;
    #if so, drop the old entries before appending the new ones.
    for t in list(tempticket):
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)
    session.save()
    return pid
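The central soft_compare_names use in Example #6 is the filtering loop over bibrec_names: candidate author names from each record (MARC tags 100/700, subfield a) are dropped when they are not compatible enough with the arXiv surname. A standalone sketch of that step, with illustrative values (the 0.45 cutoff is the one used above):

# Illustrative data; in the function above the names come from MARC 100/700.
surname = 'Weiler'
candidate_names = [u'Weiler, H', u'Mueller, K']
# Keep only names compatible enough with the given surname.
compatible = [n for n in candidate_names
              if nameapi.soft_compare_names(surname, n.encode('utf-8')) >= 0.45]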
Example #7
def arxiv_login(req):
    '''
    Log in through arXiv. If the user is already associated with a personid,
    returns that personid. If the user has no pid, tries to guess which
    personid to associate based on the surname and the papers coming from
    arXiv. If no compatible person is found, creates a new person.
    At the end of the process a ticket is opened for the user, claiming the
    papers from arXiv.
    !!! The user will find the open ticket and will have to go through the
    final review before it gets committed.

    @param req: Apache request object
    @type req: Apache request object

    @return: the pid resulting from the process
    @rtype: int
    '''
    def session_bareinit(req):
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.save()

    session_bareinit(req)
    session = get_session(req)
    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = True
    session.save()

    arxiv_p_ids = []
    name = ''
    surname = ''
    try:
        for i in uinfo['external_arxivids'].split(';'):
            arxiv_p_ids.append(i)
        name = uinfo['external_firstname']
        surname = uinfo['external_familyname']
    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',
    except KeyError:
        pass

    found_bibrecs = []
    for arx in arxiv_p_ids:
        t = search_engine.perform_request_search(p='037:' + str(arx), of='id')
        for i in t:
            found_bibrecs.append(i)
    #found_bibrecs = [567700, 567744]

    uid = getUid(req)
    pid = dbapi.get_personid_from_uid([[uid]])
    if pid[1]:
        pid_bibrecs = dbapi.get_all_personids_recs(pid[0][0])
        pid_bibrecs = set(pid_bibrecs)
        missing_bibrecs = [
            bib for bib in found_bibrecs if int(bib) not in pid_bibrecs
        ]
        found_bibrecs = [
            bib for bib in found_bibrecs if int(bib) in pid_bibrecs
        ]
    else:
        missing_bibrecs = []

    bibrec_names = []
    for b in found_bibrecs + missing_bibrecs:
        bibrec_names.append([
            b,
            get_field_values_on_condition(b,
                                          source='API',
                                          get_table=['100', '700'],
                                          get_tag='a')
        ])

    for n in list(bibrec_names):
        for i in list(n[1]):
            if nameapi.soft_compare_names(surname, i.encode('utf-8')) < 0.45:
                n[1].remove(i)
    #bibrec_names = [[78, set([u'M\xfcck, W'])]]

    #Only the suitable names for each record are left at this point.
    bibrefrecs = []

    for bibrec in bibrec_names:
        for name in bibrec[1]:
            bibrefs = dbapi.get_bibrefs_from_name_string(name.encode('utf-8'))
            if len(bibrefs) < 1:
                continue
            for bibref in bibrefs[0][0].split(','):
                bibrefrecs.append(str(bibref) + ',' + str(bibrec[0]))
    #bibrefrecs = ['100:116,78', '700:505,78']

    person_papers = []
    if not pid[1]:
        brr = [[i] for i in bibrefrecs]
        possible_persons = dbapi.get_possible_personids_from_paperlist(brr)
        #[[0L, ['700:316,10']]]
        possible_persons = sorted(possible_persons, key=lambda k: len(k[1]))

        if len(possible_persons) > 1:
            for pp in possible_persons:
                pid = dbapi.assign_person_to_uid(uid, pp[0])
                person_papers = pp[1]
                if pid != -1:
                    break
            if pid == -1:
                pid = dbapi.assign_person_to_uid(uid, -1)
        elif len(possible_persons) == 1:
            pid = dbapi.assign_person_to_uid(uid, possible_persons[0][0])
            person_papers = possible_persons[0][1]
        else:
            pid = dbapi.assign_person_to_uid(uid, -1)
    else:
        pid = long(pid[0][0])

    tempticket = []
    #Now we have to open the tickets...
    #person_papers contains the papers which are already assigned to the
    #person and which came from arXiv; they can be claimed regardless.
    for bibref in person_papers:
        tempticket.append({'pid': pid, 'bibref': bibref, 'action': 'confirm'})

    done_bibrecs = set(b.split(',')[1] for b in person_papers)
    for b in found_bibrecs + missing_bibrecs:
        if str(b) not in done_bibrecs:
            tempticket.append({
                'pid': pid,
                'bibref': str(b),
                'action': 'confirm'
            })

    #Check whether ticket targets (bibref for pid) are already in the ticket;
    #if so, drop the old entries before appending the new ones.
    for t in list(tempticket):
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)
    session.save()
    return pid