def _test_match_score():
    """Print a side-by-side comparison of author-matching strategies.

    Debugging helper: takes fixture number 8 from ``corresponding_authors``
    (a ``(corresponding, expected, candidates)`` triple), scores every
    candidate against the corresponding author in several ways and prints
    the winner of each strategy:

    * ``Score``       -- highest ``Author.match_score``
    * ``Leven``       -- lowest normalised squared Levenshtein distance
      between the space-joined, sorted chunk names
    * ``Mixed``       -- highest ``match_score - 10 * leven``
    * ``Implemented`` -- lowest ``Author.distance``
    * ``Diff``        -- ``difflib.get_close_matches`` on the raw names

    Output is written with single-argument ``print(...)`` calls so the
    function produces the same text on Python 2 and Python 3.
    """
    import difflib

    corr, _expected, cand = corresponding_authors[8]
    corr = Author(corr)
    print('')
    print('{} {}'.format(corr, corr.chunks))
    print('')

    # The corresponding author's key does not depend on the candidate, so
    # build it once instead of on every loop iteration.
    corr_key = ' '.join([c.name for c in sorted(corr.chunks)])

    scored_can = []
    for a in cand:
        # One Author instance per candidate is reused for every measure.
        candidate = Author(a)
        cand_key = ' '.join([c.name for c in sorted(candidate.chunks)])
        l = levenshtein(corr_key, cand_key)
        # Square the edit distance and normalise by both raw name lengths.
        l = float(l ** 2) / len(a) / len(corr.name)
        c = corr.match_score(candidate)
        scored_can.append((
            candidate, c, l, c - 10 * l, corr.distance(candidate)
        ))

    for a, c, l, c2, d in scored_can:
        print('{} {} {} {} {}'.format(a.chunks, c, l, c2, d))
    print('')
    print('Score: {}'.format(max(scored_can, key=lambda c: c[1])))
    print('Leven: {}'.format(min(scored_can, key=lambda c: c[2])))
    print('Mixed: {}'.format(max(scored_can, key=lambda c: c[3])))
    print('Implemented: {}'.format(min(scored_can, key=lambda c: c[4])))
    print('Diff: {}'.format(difflib.get_close_matches(corr.name, cand, 1)))
def process_record(self, record):
    """Attach parsed authors, institutions and corresponding author.

    Reads ``record['authors_with_affiliations']``, ``record['AF']``,
    ``record['RP']`` and ``record['title']`` and writes:

    * ``record['institutions']`` -- list of institution strings
    * ``record['authors']`` -- list of ``(Author, institution_index)``
    * ``record['corresponding_author']`` -- index into ``authors``
    * ``record['ambiguous_affiliations']`` -- set to ``True`` only when
      the affiliations were ambiguous and
      ``self.include_ambiguous_affiliations`` is enabled

    ``self.splitter.findall`` is expected to yield ``(authors,
    institution)`` pairs, where ``authors`` is a ``'; '``-separated list.

    Returns the mutated ``record``, or ``None`` when the record is
    rejected: either the author/affiliation pairing is ambiguous and
    ambiguous records are not included, or the reprint author cannot be
    matched to any listed author.
    """
    record['institutions'] = []
    record['authors'] = []

    affiliations = self.splitter.findall(
        record['authors_with_affiliations'].strip())
    if not affiliations:
        # The splitter found no structure: pair the plain author list with
        # the plain affiliation list.
        aut = record['AF'].split('; ')
        aff = record['authors_with_affiliations'].split('; ')
        if len(aut) == len(aff):
            affiliations = list(zip(aut, aff))
        elif len(aff) == 1:
            affiliations = [(a, aff[0]) for a in aut]
        else:
            self.pipeline.inc_metric('ambiguous_author_affiliations')
            print('-' * 80)
            print(u'Ambiguous author affiliations for "{title}":'.format(
                **record))
            print(' Authors:')
            for a in aut:
                print(' * {}'.format(a))
            print(' Affiliations:')
            for a in aff:
                print(' * {}'.format(a))
            print('-' * 80)
            if not self.include_ambiguous_affiliations:
                return None
            # Set all author affiliations to the first institution in
            # the list, and set the ambiguous flag...
            record['ambiguous_affiliations'] = True
            affiliations = [(a, aff[0]) for a in aut]

    # TODO: Some authors could have two affiliations! This should be
    # checked here and a warning raised.
    for i, (authors, institution) in enumerate(affiliations):
        record['institutions'].append(institution)
        for a in authors.split('; '):
            record['authors'].append((Author(a), i))

    if record['RP']:
        reprint = record['RP']
        marker = ' (reprint author)'
        # partition() instead of find()+slicing: find() returns -1 when the
        # marker is missing, which used to chop the last character off the
        # name and produce a garbage institution.  Without the marker the
        # whole field is the name; without "marker, " the institution is
        # empty.
        corresponding = Author(reprint.partition(marker)[0].strip())
        institution = reprint.partition(marker + ', ')[2]

        match = corresponding.find_best_match(
            [a[0] for a in record['authors']])
        if not match:
            self.pipeline.inc_metric('corresponding_author_unmatched')
            print('-' * 80)
            print('No corresponding author match found for:')
            print(' {!r}/{!r}'.format(record['title'], corresponding.name))
            names = (a[0].name for a in record['authors'])
            pprint.pprint((corresponding.name, 0, tuple(names)),
                          self.unmatched_authors)
            print('-' * 80)
            return None

        for i, (a, institution_id) in enumerate(record['authors']):
            if a is match:
                record['corresponding_author'] = i
                curr_inst = record['institutions'][institution_id]
                # The reprint address overrides the listed affiliation,
                # but only when one was actually parsed.
                if institution and institution != curr_inst:
                    record['institutions'].append(institution)
                    record['authors'][i] = (
                        a, len(record['institutions']) - 1)
                break
    else:
        # No reprint-author field: fall back to the first listed author.
        self.pipeline.inc_metric('corresponding_author_undefined')
        record['corresponding_author'] = 0
        print(u'Undefined corresponding author for "{}", selecting "{}"'
              .format(record['title'], record['authors'][0][0].name))
    return record