def contributor_name_similarity(c1, c2):
    '''
    Similarity score for the two contributions' raw, unprocessed contributor
    name strings. Computed as the Jaccard similarity of their 3-shingle sets.
    Returns a float in [0, 1]; 1.0 means the cleaned names are identical.
    '''
    shingles_a = shingle(clean_str(c1.contributor_name), 3)
    shingles_b = shingle(clean_str(c2.contributor_name), 3)
    return jaccard_sim(shingles_a, shingles_b)
def employer_similarity(c1, c2):
    '''
    Similarity score for the two contributions' employer strings, computed as
    the Jaccard similarity of their 3-shingle sets. Returns a float in [0, 1];
    1.0 means the cleaned employer strings are identical.
    '''
    shingles_a = shingle(clean_str(c1.employer), 3)
    shingles_b = shingle(clean_str(c2.employer), 3)
    return jaccard_sim(shingles_a, shingles_b)
def occupation_similarity(c1, c2):
    '''
    Similarity score for the two contributions' occupation strings, computed as
    the Jaccard similarity of their 3-shingle sets. Returns a float in [0, 1];
    1.0 means the cleaned occupation strings are identical.
    '''
    shingles_a = shingle(clean_str(c1.occupation), 3)
    shingles_b = shingle(clean_str(c2.occupation), 3)
    return jaccard_sim(shingles_a, shingles_b)
def first_name_similarity(c1, c2):
    '''
    Similarity score for the two contributions' first names, computed as the
    Jaccard similarity of their 3-shingle sets. Returns a float in [0, 1];
    1.0 means the cleaned first names are identical.
    '''
    shingles_a = shingle(clean_str(c1.first_name), 3)
    shingles_b = shingle(clean_str(c2.first_name), 3)
    return jaccard_sim(shingles_a, shingles_b)
def group_by_lsh():
    '''
    Groups all of our contribution data by the output of a locality sensitive
    hashing function. The LSH implementation is stored in utils/lsh. You can
    read more about it here:
    http://en.wikipedia.org/wiki/Locality-sensitive_hashing

    Side effects: creates one Group per non-empty LSH cluster (named
    'LSH: <cluster index>') and assigns every Contribution whose last name
    falls in that cluster to it. Returns None.
    '''
    # First step is to create the actual LSH clusters, based on 1-shingles of the names
    cluster = Cluster(threshold=1.0)
    for ln in Contribution.objects.values('last_name').distinct():
        name = ln['last_name']
        if not name:
            continue  # If last name isn't filled out for some reason
        cluster.add_set(shingle(name, 1), name)

    # Next step is to iterate through those clusters and assign the contributions
    # associated with each set of last names to a shared group.
    for idx, name_set in enumerate(cluster.get_sets()):
        if not name_set:
            continue
        # The group depends only on the cluster index, so fetch/create it once
        # per cluster rather than once per name (fewer database round-trips).
        g, created = Group.objects.get_or_create(name='LSH: %s' % idx)
        for name in name_set:
            Contribution.objects.filter(last_name=name).update(group=g)
if __name__ == '__main__': # First do the initial groupings print 'Forming initial groups ...' group_by_last_name() # In this case, we're using the last name function from above # Now loop through the groups we just created and start putting together potential matches print 'Preprocessing matches ...' for g in Group.objects.all(): tocreate = [] # For any given last name, split up the contributions into every possible combibation of pairs for c in itertools.combinations(g.contribution_set.all(), 2): compstring1 = '%s %s %s' % (c[0].first_name, c[0].city, c[0].state) compstring2 = '%s %s %s' % (c[1].first_name, c[1].city, c[1].state) # Check to see if the two donors in a given pair are even remotely similar. If they're not, ignore. if jaccard_sim(shingle(compstring1.lower(), 2), shingle(compstring2.lower(), 2)) >= INITIAL_SIM: # But if they are, create a feature vector describing the dimensions of their similarity for the # machine learning algorithm to use later. c1, c2 = c[0], c[1] featurevector = str(create_featurevector(c1, c2)) # Save that feature vector and other information into a match object match = Match(c1=c1, c2=c2, features=featurevector) match.same = False # If the two contributions in the pair are regarded as coming from the same donor by the ground-truth # CRP data, mark them as a match so we can use them for testing and training the classifier. if (c1.donor_id and c2.donor_id) and (c1.donor_id == c2.donor_id): match.same = True tocreate.append(match) # Again, we're bulk creating to cut down on database transactions. Match.objects.bulk_create(tocreate)