def name_signature(first, last):
    """
    Build a compact signature string for a (first, last) name.

    The last name is stripped of surrounding whitespace, normalised with
    ``iunaccent`` and then cleaned with the module-level regexes: matches of
    ``nn_escaping_chars_re`` and ``nn_final_nontext_re`` are removed, and
    matches of ``nn_nontext_re`` are replaced by ``'-'``. When ``first`` is
    non-empty, its normalised first character and a dash are prepended.

    :param first: first name (may be an empty string)
    :param last: last name
    :returns: the signature string
    """
    signature = iunaccent(last.strip())
    # Apply the clean-up substitutions in their fixed order.
    for pattern, replacement in (
            (nn_escaping_chars_re, ''),
            (nn_final_nontext_re, ''),
            (nn_nontext_re, '-')):
        signature = pattern.sub(replacement, signature)
    if len(first) == 0:
        return signature
    initial = iunaccent(first[0])
    return initial + '-' + signature
def name_signature(first, last):
    """
    Return a short signature for a name: the cleaned last name, optionally
    prefixed by the first initial.

    The stripped last name goes through ``iunaccent``; text matched by
    ``nn_escaping_chars_re`` and by ``nn_final_nontext_re`` is dropped, and
    text matched by ``nn_nontext_re`` becomes a dash. If ``first`` is not
    empty, ``iunaccent(first[0]) + '-'`` is put in front of the result.

    :param first: first name (may be an empty string)
    :param last: last name
    :returns: the signature string
    """
    cleaned = nn_escaping_chars_re.sub('', iunaccent(last.strip()))
    cleaned = nn_final_nontext_re.sub('', cleaned)
    dashed = nn_nontext_re.sub('-', cleaned)
    # Only names with a first name get the "<initial>-" prefix.
    prefix = iunaccent(first[0]) + '-' if len(first) else ''
    return prefix + dashed
def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Both arguments are ``(first, last)`` pairs.

    :returns: ``False`` when an argument is empty or is not a 2-item pair;
        ``0.`` when the normalised last names differ, when the first-name
        words do not match pairwise (in order or reversed), or when each
        name has a spelled-out word where the other has not; otherwise the
        accumulated score clamped to the range [0, 1].
    """
    if not (a and b):
        return False
    if len(a) != 2 or len(b) != 2:
        return False

    first_a, last_a = a
    first_b, last_b = b
    first_a = iunaccent(first_a)
    first_b = iunaccent(first_b)
    if iunaccent(last_a) != iunaccent(last_b):
        return 0.

    words_a, _ = split_name_words(first_a)
    words_b, _ = split_name_words(first_b)
    pairs = list(zip(words_a, words_b))
    if not all(match_first_names(pair) for pair in pairs):
        # Try to match in reverse
        words_a.reverse()
        words_b.reverse()
        pairs = list(zip(words_a, words_b))
        if not all(match_first_names(pair) for pair in pairs):
            return 0.

    score = 0
    # One (A is spelled out, B is spelled out) flag pair per word position.
    spelled_out = []
    for word_a, word_b in pairs:
        score += weight_first_names((word_a, word_b))
        spelled_out.append((len(word_a) > 1, len(word_b) > 1))

    # Words without a counterpart in the other name lower the score.
    paired = len(pairs)
    for word in words_a[paired:]:
        score -= 0.25 * weight_first_name(word)
        spelled_out.append((len(word) > 1, False))
    for word in words_b[paired:]:
        score -= 0.25 * weight_first_name(word)
        spelled_out.append((False, len(word) > 1))

    # Make sure expanded first names of A are included in those of B,
    # or those of B are included in those of A.
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching.
    only_b_somewhere = any(in_b and not in_a for in_a, in_b in spelled_out)
    only_a_somewhere = any(in_a and not in_b for in_a, in_b in spelled_out)
    if only_b_somewhere and only_a_somewhere:
        return 0.

    return max(min(score, 1), 0)
def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Both arguments are ``(first, last)`` pairs.

    Examples:
        name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')) == 0.8
        name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')) == 0.4
        name_similarity(('R.', 'Ryder'),('R.', 'Ryder')) == 0.4
        name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')) == 0.3
        name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')) == 0.8
        name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')) == 0.3
        name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')) == 0.7
        name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')) == 0.7
        name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')) == 0
        name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')) == 0

    :returns: ``False`` when an argument is empty or is not a 2-item pair;
        ``0.`` when the normalised last names differ or the first-name words
        do not match pairwise (in order or reversed); otherwise the
        accumulated score clamped to the range [0, 1].
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.

    partsA, sepsA = split_name_words(firstA)
    partsB, sepsB = split_name_words(firstB)
    # The pairs are traversed by all() and then measured with len() and
    # indexed below, so they must be a real list: on Python 3 a bare zip()
    # is a one-shot iterator supporting neither len() nor indexing.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
        elif i < len(partsA):
            # Word of A without counterpart in B: penalty.
            sumscores -= 0.25 * weight_first_name(partsA[i])
        else:
            # Word of B without counterpart in A: penalty.
            sumscores -= 0.25 * weight_first_name(partsB[i])
    sumscores = max(min(sumscores, 1), 0)
    return sumscores
def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Both arguments are ``(first, last)`` pairs.

    Examples:
        name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')) == 0.8
        name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')) == 0.4
        name_similarity(('R.', 'Ryder'),('R.', 'Ryder')) == 0.4
        name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')) == 0.3
        name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')) == 0.8
        name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')) == 0.3
        name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')) == 0.7
        name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')) == 0.7
        name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')) == 0
        name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')) == 0

    :returns: ``False`` when an argument is empty or is not a 2-item pair;
        ``0.`` when the normalised last names differ or the first-name words
        do not match pairwise (in order or reversed); otherwise the
        accumulated score clamped to the range [0, 1].
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.

    partsA, sepsA = split_name_words(firstA)
    partsB, sepsB = split_name_words(firstB)
    # Build a list, not a bare zip(): the pairs are consumed by all() and
    # afterwards passed to len() and indexed, which a Python 3 zip iterator
    # does not support.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
        elif i < len(partsA):
            # Extra word present only in A: penalty.
            sumscores -= 0.25 * weight_first_name(partsA[i])
        else:
            # Extra word present only in B: penalty.
            sumscores -= 0.25 * weight_first_name(partsB[i])
    sumscores = max(min(sumscores, 1), 0)
    return sumscores
def populate_identifiers(apps, se):
    """
    Fill the ``identifiers`` field of every ``papers.Institution`` row.

    Each institution gets a list starting with its existing ``identifier``;
    when both ``country`` and ``name`` are set, a second entry of the form
    ``<country>:<iunaccent(name)>`` is added. Only the ``identifiers`` field
    is written back.

    :param apps: registry used to look up the ``Institution`` model
    :param se: unused here (presumably the migration schema editor -- TODO confirm)
    """
    institution_model = apps.get_model('papers', 'Institution')
    for institution in institution_model.objects.all():
        identifiers = [institution.identifier]
        if institution.country and institution.name:
            identifiers.append(
                institution.country + ':' + iunaccent(institution.name))
        institution.identifiers = identifiers
        institution.save(update_fields=['identifiers'])
def populate_identifiers(apps, se):
    """
    Populate ``identifiers`` on all ``papers.Institution`` objects.

    The list always contains the institution's ``identifier``. If the
    institution has both a ``country`` and a ``name``, the string
    ``country + ':' + iunaccent(name)`` is added after it. Each object is
    saved with ``update_fields=['identifiers']``.

    :param apps: registry used to look up the ``Institution`` model
    :param se: unused here (presumably the migration schema editor -- TODO confirm)
    """
    model = apps.get_model('papers', 'Institution')
    for inst in model.objects.all():
        has_country_and_name = inst.country and inst.name
        if not has_country_and_name:
            inst.identifiers = [inst.identifier]
        else:
            country_key = inst.country + ':' + iunaccent(inst.name)
            inst.identifiers = [inst.identifier, country_key]
        inst.save(update_fields=['identifiers'])
def create(cls, first, last):
    """
    Build an instance of ``cls`` from a first and a last name, without
    saving it.

    Useful for name lookups where we are not sure we want to keep the
    name in the model.

    Each name is truncated to ``MAX_NAME_LENGTH`` characters, stripped and
    passed through ``sanitize_html``; ``full`` is ``iunaccent`` applied to
    the two cleaned names joined by a single space.

    :param first: raw first name
    :param last: raw last name
    :returns: the new, unsaved instance
    """
    name = cls()
    truncated_first = first[:MAX_NAME_LENGTH]
    name.first = sanitize_html(truncated_first.strip())
    truncated_last = last[:MAX_NAME_LENGTH]
    name.last = sanitize_html(truncated_last.strip())
    display = name.first + ' ' + name.last
    name.full = iunaccent(display)
    return name
def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.
    (in the next function)

    Both arguments are ``(first, last)`` pairs.

    :returns: ``False`` when an argument is empty, is not a 2-item pair, or
        has a last name without any word; ``0.`` when the first-name
        initials do not match pairwise (in order or reversed); otherwise
        the last-name word overlap ratio scaled by
        ``(matched + 1) / (longest + 1)``.
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    if not wordsA or not wordsB:
        return False
    # Share of last-name words common to both names (intersection / union).
    ratio = float(len(wordsA & wordsB)) / len(wordsA | wordsB)

    # Matching first names, compared on their first character only
    partsA, sepsA = split_name_words(firstA)
    partsB, sepsB = split_name_words(firstB)
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]
    # A list is required: the pairs are consumed by all() and then measured
    # with len(), which a Python 3 zip iterator does not support.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    return ratio * (len(parts) + 1) / (maxlen + 1)
def shallower_name_similarity(a, b):
    """
    Variant of name_similarity that tolerates differences in the last names.
    This heuristic is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.
    (in the next function)

    Both arguments are ``(first, last)`` pairs.

    :returns: ``False`` when an argument is empty, is not a 2-item pair, or
        has a last name without any word; ``0.`` when the first-name
        initials do not match pairwise (in order or reversed); otherwise
        the last-name word overlap ratio scaled by
        ``(matched + 1) / (longest + 1)``.
    """
    if not (a and b):
        return False
    if len(a) != 2 or len(b) != 2:
        return False
    first_a, last_a = a
    first_b, last_b = b

    # Last names: compare the sets of words they are made of
    last_a = iunaccent(last_a)
    last_b = iunaccent(last_b)
    last_words_a, _ = split_name_words(last_a)
    last_words_b, _ = split_name_words(last_b)
    last_words_a = set(last_words_a)
    last_words_b = set(last_words_b)
    if not (last_words_a and last_words_b):
        return False
    shared = last_words_a.intersection(last_words_b)
    combined = last_words_a.union(last_words_b)
    ratio = float(len(shared)) / len(combined)

    # First names: compare initials only
    first_words_a, _ = split_name_words(first_a)
    first_words_b, _ = split_name_words(first_b)
    initials_a = [word[0] for word in first_words_a]
    initials_b = [word[0] for word in first_words_b]
    pairs = list(zip(initials_a, initials_b))
    if not all(match_first_names(pair) for pair in pairs):
        # Try to match in reverse
        initials_a = initials_a[::-1]
        initials_b = initials_b[::-1]
        pairs = list(zip(initials_a, initials_b))
        if not all(match_first_names(pair) for pair in pairs):
            return 0.

    longest = max(len(initials_a), len(initials_b))
    return ratio * (len(pairs) + 1) / (longest + 1)
def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    Both arguments are ``(first, last)`` pairs.

    :returns: ``False`` when an argument is empty or is not a 2-item pair;
        ``0.`` when neither last name contains any word, when the
        first-name initials do not match pairwise (in order or reversed),
        or when there is no first-name word at all; otherwise the last-name
        word overlap ratio scaled by ``matched / longest``.
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, sepA = split_name_words(lastA)
    wordsB, sepB = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    all_words = wordsA | wordsB
    if not all_words:
        # No last-name word on either side: nothing to compare, and the
        # ratio below would divide by zero.
        return 0.
    ratio = float(len(wordsA & wordsB)) / len(all_words)

    # Matching first names, compared on their first character only
    partsA, sepsA = split_name_words(firstA)
    partsB, sepsB = split_name_words(firstB)
    # Real lists are needed: they are reversed in place and measured with
    # len() below, which Python 3 map()/zip() iterators do not support.
    partsA = [word[0] for word in partsA]
    partsB = [word[0] for word in partsB]
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    if maxlen > 0:
        return ratio * len(parts) / maxlen
    return 0.
def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    Both arguments are ``(first, last)`` pairs.

    :returns: ``False`` when an argument is empty or is not a 2-item pair;
        ``0.`` when neither last name contains any word, when the
        first-name initials do not match pairwise (in order or reversed),
        or when there is no first-name word at all; otherwise the last-name
        word overlap ratio scaled by ``matched / longest``.
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, sepA = split_name_words(lastA)
    wordsB, sepB = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    union = wordsA | wordsB
    if not union:
        # Both last names are wordless: avoid dividing by zero below.
        return 0.
    ratio = float(len(wordsA & wordsB)) / len(union)

    # Matching first names, compared on their first character only
    partsA, sepsA = split_name_words(firstA)
    partsB, sepsB = split_name_words(firstB)
    # Use lists rather than map()/zip() objects: the code below calls
    # .reverse() and len() on them, which Python 3 iterators lack.
    partsA = [word[0] for word in partsA]
    partsB = [word[0] for word in partsB]
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    if maxlen > 0:
        return ratio * len(parts) / maxlen
    return 0.
def test_iunaccent(self):
    """Check ``iunaccent`` on an accented, upper-case sample string."""
    sample = 'BÉPO forever'
    expected = 'bepo forever'
    self.assertEqual(iunaccent(sample), expected)
def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Both arguments are ``(first, last)`` pairs. ``False`` is returned when
    an argument is empty or is not a 2-item pair.

    Examples:

    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')))
    8
    >>> int(10*name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('R.', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')))
    8
    >>> int(10*name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')))
    7
    >>> int(10*name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')))
    7
    >>> int(10*name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')))
    0
    >>> int(10*name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')))
    0
    >>> int(10*name_similarity(('Amanda P.','Brown'),('Patrick','Brown')))
    0
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.

    partsA, sepsA = split_name_words(firstA)
    partsB, sepsB = split_name_words(firstB)
    # The pairs are traversed by all() and then measured with len() and
    # indexed below, so they must be a real list: on Python 3 a bare zip()
    # is a one-shot iterator supporting neither len() nor indexing.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    # One (A is spelled out, B is spelled out) flag pair per word position.
    expanded = []
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
            expanded.append((len(partsA[i]) > 1, len(partsB[i]) > 1))
        elif i < len(partsA):
            sumscores -= 0.25 * weight_first_name(partsA[i])
            expanded.append((len(partsA[i]) > 1, False))
        else:
            sumscores -= 0.25 * weight_first_name(partsB[i])
            expanded.append((False, len(partsB[i]) > 1))

    # Make sure expanded first names of A are included in those of B,
    # or those of B are included in those of A.
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching.
    # (Loop variables are deliberately not named a/b, so the parameters are
    # never shadowed or overwritten.)
    if not (all(in_a or not in_b for in_a, in_b in expanded) or
            all(in_b or not in_a for in_a, in_b in expanded)):
        return 0.

    sumscores = max(min(sumscores, 1), 0)
    return sumscores
def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Both arguments are ``(first, last)`` pairs. ``False`` is returned when
    an argument is empty or is not a 2-item pair.

    Examples:

    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')))
    8
    >>> int(10*name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('R.', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')))
    8
    >>> int(10*name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')))
    7
    >>> int(10*name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')))
    7
    >>> int(10*name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')))
    0
    >>> int(10*name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')))
    0
    >>> int(10*name_similarity(('Amanda P.','Brown'),('Patrick','Brown')))
    0
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.

    partsA, sepsA = split_name_words(firstA)
    partsB, sepsB = split_name_words(firstB)
    # Build a list, not a bare zip(): the pairs are consumed by all() and
    # afterwards passed to len() and indexed, which a Python 3 zip iterator
    # does not support.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    # For every word position: is the word spelled out (longer than one
    # character) in A, and in B?
    expanded = []
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
            expanded.append((len(partsA[i]) > 1, len(partsB[i]) > 1))
        elif i < len(partsA):
            sumscores -= 0.25 * weight_first_name(partsA[i])
            expanded.append((len(partsA[i]) > 1, False))
        else:
            sumscores -= 0.25 * weight_first_name(partsB[i])
            expanded.append((False, len(partsB[i]) > 1))

    # Make sure expanded first names of A are included in those of B,
    # or those of B are included in those of A.
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching.
    # (The loop variables avoid the names a/b so that the parameters are
    # never shadowed or overwritten.)
    if not (all(full_a or not full_b for full_a, full_b in expanded) or
            all(full_b or not full_a for full_a, full_b in expanded)):
        return 0.

    sumscores = max(min(sumscores, 1), 0)
    return sumscores