def ident(self, name):
    """Identify the stored entity that best matches `name`.

    Every result of ``self.entity_type.query`` is mapped to a
    ``(uri, given, stand)`` triple by the entity type's result mapper and
    scored against the standardized form of `name` using
    ``jaccard_ngram_dist(stand, name, 3)``.  The ``self.max_matches``
    lowest-scoring candidates are kept; every kept candidate whose score is
    relatively close to the best one (see below) counts as a possible match.

    Args:
        name: The name to identify, as originally given.  It is passed
            unmodified to ``self.prompt_possibles`` when disambiguation is
            needed.

    Returns:
        The ``(given, uri)`` pair of the single possible match, or whatever
        ``self.prompt_possibles(name, possibles)`` returns when more than
        one candidate is close enough to the best score.

    Raises:
        IndexError: If there are no candidates at all (no query results,
            or ``self.max_matches`` leaves none).  The exception type is the
            one the previous implementation raised implicitly; it now
            carries a message naming the unmatched name.
    """
    # Imported locally because the module's import block is not part of
    # this section of the file.
    import heapq

    results = self.entity_type.query.get_results()
    mapper = self.entity_type.result_mapper
    target = mapper.stand(name)

    # Lazily score each candidate as ((given, uri), distance).
    scored = (
        ((given, uri), jaccard_ngram_dist(stand, target, 3))
        for uri, given, stand in mapper.map(results)
    )

    # nsmallest is stable (ties keep their input order), so this yields the
    # same ordering as sorting everything by distance and slicing, without
    # re-sorting the whole list while scanning.
    # NOTE(review): assumes self.max_matches is an integer -- confirm.
    best_matches = heapq.nsmallest(
        self.max_matches, scored, key=lambda pair: pair[1]
    )
    if not best_matches:
        raise IndexError("no candidate matches found for %r" % (name,))

    best_match, best_dist = best_matches[0]
    possibles = [best_match]
    for match, dist in best_matches[1:]:
        total = dist + best_dist
        # Difference relative to the mean of the two distances; defined as
        # 0 when both distances are 0 to avoid dividing by zero.
        percent_diff = (dist - best_dist) * 2 / float(total) if total else 0
        if percent_diff < self.max_percent_diff:
            possibles.append(match)

    if len(possibles) > 1:
        # Several candidates are about equally good: defer the choice.
        return self.prompt_possibles(name, possibles)
    return possibles[0]
'adam smith', 'bob smith', 'carl smith', 'dale jones', 'ernest kirstein' ] import ngrams to_compare = [ 'tom smith', 'john smith', 'tom' ] #to_compare = ['rj smith', 'rj', 'cs smith'] def run_comps(f, to_compare): for i,s1 in enumerate(to_compare): for s2 in to_compare[i+1:]: print f(s1,s2), repr(s1), repr(s2) print "Without Info:" d = lambda a,b: ngrams.jaccard_ngram_dist(a,b,3) run_comps(d, to_compare) print "\nSample Set:" print '\t'+'\n\t'.join(names) model = ngram_model(names, 3) print "\nWith Info:" d = lambda a,b: info_ngram_dist(3,a,b,model) run_comps(d, to_compare)