def compare_fieldvalues_authorname(field_comparisons, threshold, matches_needed):
    """
    Validate a list of field comparisons with an author-name aware matcher.

    Name pairs are compared with BibAuthorId's soft_compare_names() on every
    commutative permutation of each name ("Last, First" vs. "First, Last"),
    so initials and full names can still match. A field counts as matched as
    soon as one of its value pairs scores at or above the threshold.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list
    @param threshold: number describing the match threshold a comparison must
        exceed to become a positive match.
    @type threshold: float
    @param matches_needed: number of positive field matches needed for the
        entire comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found,
        False if not, and number of matches.
    @rtype: tuple
    """
    hits = 0
    for comparisons in field_comparisons:
        for left_value, right_value in comparisons:
            # Build every unique commutative pairing of the two name
            # permutations. Ex: Doe,J vs. Smith,J ->
            #   [(('Smith,J', 'Doe,J'), ('Smith,J', 'J,Doe')),
            #    (('J,Smith', 'Doe,J'), (('J,Smith', 'J,Doe'))]
            pairings = list(get_paired_comparisons(
                get_reversed_string_variants(left_value),
                get_reversed_string_variants(right_value)))[0]
            pair_matched = False
            for first_name, second_name in pairings:
                # Author-name comparison - using BibAuthorid function
                if soft_compare_names(first_name, second_name) >= threshold:
                    hits += 1
                    pair_matched = True
                    break
            if pair_matched:
                # This field is done; move on to the next field.
                break
        # Return immediately once enough fields have matched.
        if hits >= matches_needed:
            return True, hits
    # Authors often do not all match; accept a slightly lower match count
    # by reusing the threshold as a ratio of matches found vs. needed.
    verdict = hits >= matches_needed \
        or hits / float(matches_needed) > threshold
    return verdict, hits
def fallback_find_personids_by_name_string(target):
    '''
    Search engine to find persons matching the given string.

    The matching is done on the surname first, and names if present.
    An ordered list (per compatibility) of pids and found names is returned.

    @param target: string name, 'surname, names I.'
    @type target: string

    @return: pid list of lists
        [pid, [[name string, occur count, compatibility]]]
    '''
    # Three surname spellings are tried: as given, ASCII-transliterated,
    # and cleaned of special characters.
    family = get_surname(target)
    ascii_family = get_surname(translate_to_ascii(target)[0])
    clean_family = get_surname(clean_string(target))
    #SANITY: avoid empty queries
    if not family:
        return list()
    # Progressively looser SQL-LIKE patterns, tried in order until one hits.
    levels = (
        # target + '%', #this introduces a weird problem: different results
        # for mele, salvatore and salvatore mele
        family + '%',
        '%' + family + ',%',
        '%' + family[1:-1] + '%')
    if len(family) <= 4:
        # Short surnames: skip the middle pattern, it would be too noisy.
        levels = [levels[0], levels[2]]
    # First attempt: union of exact "surname,%" hits over all three spellings.
    # NOTE(review): this calls a bare get_authors_by_name_regexp while the
    # loop below uses dbinter.get_authors_by_name_regexp — presumably the
    # same helper imported directly; verify against the module imports.
    names = list(set().union(*map(get_authors_by_name_regexp,
                                  (family + ',%',
                                   ascii_family + ',%',
                                   clean_family + ',%'))))
    if not names:
        # Fall back to the looser patterns, stopping at the first match.
        for lev in levels:
            names = dbinter.get_authors_by_name_regexp(lev)
            if names:
                break
    is_canonical = False
    if not names:
        # Last resort: treat the target as a canonical name string.
        names = dbinter.get_authors_by_canonical_name_regexp(target)
        is_canonical = True
    # Collapse duplicate (pid, name) rows, counting occurrences and scoring
    # each name's compatibility against the target.
    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)),
              soft_compare_names(target, key[1]))
             for key, data in names]
    # Regroup per pid; keep only names above 0.5 compatibility (canonical
    # lookups skip the cutoff), sorted by occurrence count.
    names = groupby(names, itemgetter(0))
    names = [(key, sorted([(d[1], d[2], d[3]) for d in data
                           if (d[3] > 0.5 or is_canonical)],
                          key=itemgetter(2), reverse=True))
             for key, data in names]
    # Drop pids whose every name was filtered out, then order pids by the
    # (compatibility, name, count) of their best name.
    names = [name for name in names if name[1]]
    names = sorted(names, key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]),
                   reverse=True)
    return names
def find_personIDs_by_name_string(target):
    '''
    Search engine to find persons matching the given string.

    The matching is done on the surname first, and names if present.
    An ordered list (per compatibility) of pids and found names is returned.

    @param target: string name, 'surname, names I.'
    @type target: string

    @return: pid list of lists
        [pid, [[name string, occur count, compatibility]]]
    '''
    # split_name_parts puts the surname first in the returned list.
    splitted_name = split_name_parts(target)
    family = splitted_name[0]
    # Progressively looser SQL-LIKE patterns, tried in order until one hits.
    levels = (
        #target + '%', #this introduces a weird problem: different results
        #for mele, salvatore and salvatore mele
        family + ',%',
        family[:-2] + '%',
        '%' + family + ',%',
        '%' + family[1:-1] + '%')
    if len(family) <= 4:
        # Short surnames: only the exact "surname,%" and the substring
        # "%surname,%" patterns — the truncated ones would be too noisy.
        levels = [levels[0], levels[2]]
    for lev in levels:
        names = dbinter.get_all_personids_by_name(lev)
        if names:
            break
    is_canonical = False
    if not names:
        # Fall back to canonical-name lookup when no name pattern matched.
        names = dbinter.get_personids_by_canonical_name(target)
        is_canonical = True
    # Collapse duplicate (pid, name) rows, counting occurrences and scoring
    # each name's compatibility against the target.
    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)),
              soft_compare_names(target, key[1]))
             for key, data in names]
    # Regroup per pid; keep only names above 0.5 compatibility (canonical
    # lookups skip the cutoff), sorted by occurrence count.
    names = groupby(names, itemgetter(0))
    names = [(key, sorted([(d[1], d[2], d[3]) for d in data
                           if (d[3] > 0.5 or is_canonical)],
                          key=itemgetter(2), reverse=True))
             for key, data in names]
    # Drop pids whose every name was filtered out, then order pids by the
    # (compatibility, name, count) of their best name.
    names = [name for name in names if name[1]]
    names = sorted(names, key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]),
                   reverse=True)
    return names
def fallback_find_personids_by_name_string(target):
    '''
    Search engine to find persons matching the given string.

    The matching is done on the surname first, and names if present.
    An ordered list (per compatibility) of pids and found names is returned.

    @param target: string name, 'surname, names I.'
    @type target: string

    @return: pid list of lists
        [pid, [[name string, occur count, compatibility]]]
    '''
    # split_name_parts puts the surname first in the returned list.
    splitted_name = split_name_parts(target)
    family = splitted_name[0]
    # Progressively looser SQL-LIKE patterns, tried in order until one hits.
    levels = (
        # target + '%', #this introduces a weird problem: different results
        # for mele, salvatore and salvatore mele
        family + ',%',
        family[:-2] + '%',
        '%' + family + ',%',
        '%' + family[1:-1] + '%')
    if len(family) <= 4:
        # Short surnames: only the exact "surname,%" and the substring
        # "%surname,%" patterns — the truncated ones would be too noisy.
        levels = [levels[0], levels[2]]
    for lev in levels:
        names = dbinter.get_authors_by_name_regexp(lev)
        if names:
            # Fix: removed leftover debug statement (print "%s" % lev) that
            # wrote the matching pattern to stdout on every lookup.
            break
    is_canonical = False
    if not names:
        # Fall back to canonical-name lookup when no name pattern matched.
        names = dbinter.get_authors_by_canonical_name_regexp(target)
        is_canonical = True
    # Collapse duplicate (pid, name) rows, counting occurrences and scoring
    # each name's compatibility against the target.
    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)),
              soft_compare_names(target, key[1]))
             for key, data in names]
    # Regroup per pid; keep only names above 0.5 compatibility (canonical
    # lookups skip the cutoff), sorted by occurrence count.
    names = groupby(names, itemgetter(0))
    names = [(key, sorted([(d[1], d[2], d[3]) for d in data
                           if (d[3] > 0.5 or is_canonical)],
                          key=itemgetter(2), reverse=True))
             for key, data in names]
    # Drop pids whose every name was filtered out, then order pids by the
    # (compatibility, name, count) of their best name.
    names = [name for name in names if name[1]]
    names = sorted(names, key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]),
                   reverse=True)
    return names
def compare_fieldvalues_authorname(field_comparisons, threshold, matches_needed):
    """
    Performs field validation over author-name comparisons, tolerant of
    initials vs. full names by using BibAuthorId matching techniques.

    Every commutative permutation pair of the two names ("Last, First" and
    "First, Last" variants) is scored with soft_compare_names(); a field
    matches when any pair scores at or above the threshold. The function
    returns True as soon as enough fields have matched, and otherwise falls
    back to a ratio check so nearly-complete author lists still validate.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list
    @param threshold: number describing the match threshold a comparison must
        exceed to become a positive match.
    @type threshold: float
    @param matches_needed: number of positive field matches needed for the
        entire comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found,
        False if not, and number of matches.
    @rtype: tuple
    """
    def _values_match(value, other_value):
        # Compare each unique commutative combination of the two names'
        # permutations. Ex: Doe,J vs. Smith,J ->
        #   [(('Smith,J', 'Doe,J'), ('Smith,J', 'J,Doe')),
        #    (('J,Smith', 'Doe,J'), ('J,Smith', 'J,Doe'))]
        candidate_pairs = [pair for pair in get_paired_comparisons(
            get_reversed_string_variants(value),
            get_reversed_string_variants(other_value))][0]
        # Author-name comparison - using BibAuthorid function; any() stops
        # at the first pair reaching the threshold.
        return any(soft_compare_names(str1, str2) >= threshold
                   for str1, str2 in candidate_pairs)

    matches_found = 0
    for comparisons in field_comparisons:
        # One positive value pair is enough to count this field as matched.
        if any(_values_match(value, other_value)
               for value, other_value in comparisons):
            matches_found += 1
        # If we already have found required number of matches, return now.
        if matches_found >= matches_needed:
            return True, matches_found
    # Often authors are not matching fully, so allow the number of matches
    # to be a little lower, reusing the threshold as a match ratio.
    result = matches_found >= matches_needed or matches_found / float(
        matches_needed) > threshold
    return result, matches_found
def arxiv_login(req):
    '''
    Log in through arxive.
    If user already associated to a personid, returns the personid.
    If user has no pid, try to guess which personid to associate based on
    surname and papers from arxiv. If no compatible person is found, creates
    a new person.
    At the end of the process opens a ticket for the user claiming the papers
    from arxiv.
    !!! the user will find the open ticket, which will require him to go
    through the final review before getting committed.

    @param req: Apache request object
    @type req: Apache request object

    @return: Returns the pid resulting in the process
    @rtype: int
    '''
    def session_bareinit(req):
        # Make sure the session holds a 'personinfo' dict with a 'ticket'
        # list, creating both if missing.
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.save()

    session_bareinit(req)
    session = get_session(req)
    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = True
    session.save()

    # Collect the arXiv paper ids and the user's name from the external
    # login info; silently proceed with empty values if they are absent.
    arxiv_p_ids = []
    name = ''
    surname = ''
    try:
        for i in uinfo['external_arxivids'].split(';'):
            arxiv_p_ids.append(i)
        name = uinfo['external_firstname']
        surname = uinfo['external_familyname']
    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',
    except KeyError:
        pass

    # Resolve each arXiv id to local record ids via the report-number field.
    found_bibrecs = []
    for arx in arxiv_p_ids:
        t = search_engine.perform_request_search(p='037:' + str(arx), of='id')
        for i in t:
            found_bibrecs.append(i)
    #found_bibrecs = [567700, 567744]

    uid = getUid(req)
    pid = dbapi.get_personid_from_uid([[uid]])
    # NOTE(review): pid appears to be ([person_id], found_flag) — pid[1]
    # truthy means the uid already has a person; confirm against dbapi.

    if pid[1]:
        # Split the arXiv records into those already attached to the person
        # and those still missing.
        pid_bibrecs = dbapi.get_all_personids_recs(pid[0][0])
        pid_bibrecs = set(pid_bibrecs)
        missing_bibrecs = [bib for bib in found_bibrecs
                           if int(bib) not in pid_bibrecs]
        found_bibrecs = [bib for bib in found_bibrecs
                         if int(bib) in pid_bibrecs]
    else:
        missing_bibrecs = []

    # Pair each record with its author names from MARC fields 100/700 $a.
    bibrec_names = []
    for b in found_bibrecs + missing_bibrecs:
        bibrec_names.append([b,
            get_field_values_on_condition(b, source='API',
                                          get_table=['100', '700'],
                                          get_tag='a')])

    # Discard author names that are not compatible with the user's surname;
    # iterate over copies since we mutate n[1] in place.
    for n in list(bibrec_names):
        for i in list(n[1]):
            if nameapi.soft_compare_names(surname, i.encode('utf-8')) < 0.45:
                n[1].remove(i)
    #bibrec_names = [[78, set([u'M\xfcck, W'])]]

    #what is left are only suitable names for each record.
    # Build 'table:ref,recid' strings for every surviving (name, record).
    bibrefrecs = []
    for bibrec in bibrec_names:
        for name in bibrec[1]:
            bibrefs = dbapi.get_bibrefs_from_name_string(name.encode('utf-8'))
            if len(bibrefs) < 1:
                continue
            for bibref in bibrefs[0][0].split(','):
                bibrefrecs.append(str(bibref) + ',' + str(bibrec[0]))
    #bibrefrecs = ['100:116,78', '700:505,78']

    person_papers = []
    if not pid[1]:
        # No person yet: try candidate persons (fewest papers first) until
        # one can be assigned to this uid; otherwise create a new person
        # by assigning -1.
        brr = [[i] for i in bibrefrecs]
        possible_persons = dbapi.get_possible_personids_from_paperlist(brr)
        #[[0L, ['700:316,10']]]
        possible_persons = sorted(possible_persons, key=lambda k: len(k[1]))

        if len(possible_persons) > 1:
            for pp in possible_persons:
                pid = dbapi.assign_person_to_uid(uid, pp[0])
                person_papers = pp[1]
                if pid != -1:
                    break
            if pid == -1:
                pid = dbapi.assign_person_to_uid(uid, -1)
        elif len(possible_persons) == 1:
            pid = dbapi.assign_person_to_uid(uid, possible_persons[0][0])
            person_papers = possible_persons[0][1]
        else:
            pid = dbapi.assign_person_to_uid(uid, -1)
    else:
        # Python 2 long(); the uid already maps to this person.
        pid = long(pid[0][0])

    tempticket = []
    #now we have to open the tickets...
    #person_papers contains the papers which are already assigned to the
    #person and came from arxive, they can be claimed regardless
    for bibref in person_papers:
        tempticket.append({'pid':pid, 'bibref':bibref, 'action':'confirm'})
    # For records not covered above, open a confirm ticket on the bare recid.
    done_bibrecs = set(b.split(',')[1] for b in person_papers)
    for b in found_bibrecs + missing_bibrecs:
        if str(b) not in done_bibrecs:
            tempticket.append({'pid':pid, 'bibref':str(b),
                               'action':'confirm'})

    #check if ticket targets (bibref for pid) are already in ticket
    # Replace any pre-existing ticket entry for the same (pid, bibref).
    for t in list(tempticket):
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)
    session.save()
    return pid
def arxiv_login(req):
    '''
    Log in through arxive.
    If user already associated to a personid, returns the personid.
    If user has no pid, try to guess which personid to associate based on
    surname and papers from arxiv. If no compatible person is found, creates
    a new person.
    At the end of the process opens a ticket for the user claiming the papers
    from arxiv.
    !!! the user will find the open ticket, which will require him to go
    through the final review before getting committed.

    @param req: Apache request object
    @type req: Apache request object

    @return: Returns the pid resulting in the process
    @rtype: int
    '''
    def session_bareinit(req):
        # Ensure the session carries a 'personinfo' dict containing a
        # 'ticket' list, creating either when absent.
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.save()

    session_bareinit(req)
    session = get_session(req)
    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = True
    session.save()

    # Gather arXiv ids and the user's name from the external login info;
    # missing keys simply leave the defaults in place.
    arxiv_p_ids = []
    name = ''
    surname = ''
    try:
        for i in uinfo['external_arxivids'].split(';'):
            arxiv_p_ids.append(i)
        name = uinfo['external_firstname']
        surname = uinfo['external_familyname']
    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',
    except KeyError:
        pass

    # Map each arXiv id to local record ids via the report-number field.
    found_bibrecs = []
    for arx in arxiv_p_ids:
        t = search_engine.perform_request_search(p='037:' + str(arx), of='id')
        for i in t:
            found_bibrecs.append(i)
    #found_bibrecs = [567700, 567744]

    uid = getUid(req)
    pid = dbapi.get_personid_from_uid([[uid]])
    # NOTE(review): pid looks like ([person_id], found_flag); pid[1] truthy
    # means the uid already has a person — verify against dbapi.

    if pid[1]:
        # Partition the arXiv records into already-attached vs. missing.
        pid_bibrecs = dbapi.get_all_personids_recs(pid[0][0])
        pid_bibrecs = set(pid_bibrecs)
        missing_bibrecs = [
            bib for bib in found_bibrecs if int(bib) not in pid_bibrecs
        ]
        found_bibrecs = [
            bib for bib in found_bibrecs if int(bib) in pid_bibrecs
        ]
    else:
        missing_bibrecs = []

    # Attach the author names (MARC 100/700 $a) to every record.
    bibrec_names = []
    for b in found_bibrecs + missing_bibrecs:
        bibrec_names.append([
            b,
            get_field_values_on_condition(b, source='API',
                                          get_table=['100', '700'],
                                          get_tag='a')
        ])

    # Drop names incompatible with the user's surname; iterate over copies
    # because n[1] is mutated in place.
    for n in list(bibrec_names):
        for i in list(n[1]):
            if nameapi.soft_compare_names(surname, i.encode('utf-8')) < 0.45:
                n[1].remove(i)
    #bibrec_names = [[78, set([u'M\xfcck, W'])]]

    #what is left are only suitable names for each record.
    # Build 'table:ref,recid' strings for every surviving (name, record).
    bibrefrecs = []
    for bibrec in bibrec_names:
        for name in bibrec[1]:
            bibrefs = dbapi.get_bibrefs_from_name_string(name.encode('utf-8'))
            if len(bibrefs) < 1:
                continue
            for bibref in bibrefs[0][0].split(','):
                bibrefrecs.append(str(bibref) + ',' + str(bibrec[0]))
    #bibrefrecs = ['100:116,78', '700:505,78']

    person_papers = []
    if not pid[1]:
        # No person yet: try candidate persons (fewest papers first) until
        # an assignment succeeds; otherwise create a fresh person via -1.
        brr = [[i] for i in bibrefrecs]
        possible_persons = dbapi.get_possible_personids_from_paperlist(brr)
        #[[0L, ['700:316,10']]]
        possible_persons = sorted(possible_persons, key=lambda k: len(k[1]))

        if len(possible_persons) > 1:
            for pp in possible_persons:
                pid = dbapi.assign_person_to_uid(uid, pp[0])
                person_papers = pp[1]
                if pid != -1:
                    break
            if pid == -1:
                pid = dbapi.assign_person_to_uid(uid, -1)
        elif len(possible_persons) == 1:
            pid = dbapi.assign_person_to_uid(uid, possible_persons[0][0])
            person_papers = possible_persons[0][1]
        else:
            pid = dbapi.assign_person_to_uid(uid, -1)
    else:
        # Python 2 long(); the uid already maps to this person.
        pid = long(pid[0][0])

    tempticket = []
    #now we have to open the tickets...
    #person_papers contains the papers which are already assigned to the
    #person and came from arxive, they can be claimed regardless
    for bibref in person_papers:
        tempticket.append({'pid': pid, 'bibref': bibref, 'action': 'confirm'})
    # For records not covered above, open a confirm ticket on the bare recid.
    done_bibrecs = set(b.split(',')[1] for b in person_papers)
    for b in found_bibrecs + missing_bibrecs:
        if str(b) not in done_bibrecs:
            tempticket.append({
                'pid': pid,
                'bibref': str(b),
                'action': 'confirm'
            })

    #check if ticket targets (bibref for pid) are already in ticket
    # Replace any pre-existing ticket entry for the same (pid, bibref).
    for t in list(tempticket):
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)
    session.save()
    return pid