/
binning.py
166 lines (133 loc) · 5.55 KB
/
binning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import itertools
import csv
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from matplotlib import pylab
from collections import Counter
import time
from timer import Timer
from jellyfish import damerau_levenshtein_distance as ld
# This function finds the bins in binArray that are "close" to binVec, as
# defined by the threshold. It then returns those bins, and the strings that are
# in those bins. (binVec and binArray need to be numpy arrays)
def compareBins(binVec, binArray, binDict, threshold=3):
    """Return the bins close to ``binVec`` and the strings stored in them.

    "Close" means the sum of absolute per-bin differences (L1 distance)
    between a row of ``binArray`` and ``binVec`` is at most ``threshold``.

    Args:
        binVec: 1-D sequence or array of per-bin counts.
        binArray: 2-D sequence or array with one candidate bin vector per
            row. An empty collection yields no matches; a single 1-D vector
            is treated as one row.
        binDict: dict mapping a bin vector (as a tuple) to the list of
            strings that fall into that bin.
        threshold: largest L1 distance that still counts as a match.

    Returns:
        Tuple ``(matchBins, matchStrings)``: the matching rows of
        ``binArray`` as a list of 1-D arrays, and the concatenated strings
        that ``binDict`` holds for those rows. A matching row with no entry
        in ``binDict`` contributes no strings.
    """
    binArray = np.asarray(binArray)
    if binArray.size == 0:
        # Nothing to compare against; summing over axis 1 would fail here.
        return ([], [])
    binArray = np.atleast_2d(binArray)
    distances = np.abs(binArray - np.asarray(binVec)).sum(axis=1)
    matchBins = list(binArray[distances <= threshold])
    # .tolist() converts numpy scalars to plain Python numbers so the tuple
    # is an ordinary dictionary key.
    matchStrings = list(itertools.chain.from_iterable(
        binDict.get(tuple(row.tolist()), ()) for row in matchBins))
    return (matchBins, matchStrings)
# Adds up the number of bits that are "on".
# Replaces the clear-lowest-bit loop from
# https://wiki.python.org/moin/BitManipulation, which never terminates when
# given a negative integer.
def bitCount(int_type):
    """Return the number of set bits in the binary form of ``int_type``.

    Args:
        int_type: an integer of any size. For a negative value the bits of
            its magnitude are counted.

    Returns:
        The count of 1 bits as an int.
    """
    return bin(int_type).count("1")
def makeBinDict(stringList):
    """Group strings by their bin vector.

    Args:
        stringList: iterable of strings to group.

    Returns:
        dict mapping each bin vector (the tuple returned by ``binCount`` for
        the module-level ``bins``) to the list of strings that produce it,
        in the order they appear in ``stringList``.
    """
    binDict = {}
    for name in stringList:
        # Compute the key once, and outside any try block, so a KeyError
        # raised while computing it cannot be mistaken for a missing bin.
        binDict.setdefault(binCount(name, bins), []).append(name)
    return binDict
def removeDuplicates(stringList):
    """Return the distinct items of ``stringList`` as a list.

    Items keep the order of their first occurrence, so the result is the
    same on every run (a plain ``list(set(...))`` gives an arbitrary order).

    Args:
        stringList: iterable of hashable items.

    Returns:
        A new list with each item appearing once.
    """
    seen = set()
    unique = []
    for item in stringList:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def makeBit(word, bitDict):
    """Add up the ``bitDict`` values of the distinct letters in ``word``.

    Each letter is counted once no matter how often it occurs. A letter
    that is not a key of ``bitDict`` raises KeyError.
    """
    total = 0
    for letter in set(word):
        total += bitDict[letter]
    return total
def binCount(string, bins, letters=None):
    """Count how many characters of ``string`` fall into each letter bin.

    Args:
        string: the text whose characters are counted.
        bins: iterable of ``(start, stop)`` index pairs; each pair selects
            the slice ``letters[start:stop]`` as one bin.
        letters: ordered sequence of letters the bin indices refer to.
            Defaults to the module-level ``LETTERS``.

    Returns:
        Tuple with one count per bin, in the order of ``bins``. Characters
        that are not in any bin's slice are not counted.
    """
    if letters is None:
        letters = LETTERS
    stringCounter = Counter(string)
    return tuple(
        sum(stringCounter[ch] for ch in letters[start:stop])
        for start, stop in bins
    )
# Read in the two input files
fn = "bank_data.csv"
fn2 = "call_report_names_unique.csv"
# Number of consecutive entries of LETTERS grouped into one bin
BIN_WIDTH = 7
with Timer() as t:
    # 'rb' is the mode the Python 2 csv module expects.
    with open(fn, 'rb') as f:
        # Names come from column 2. Skip the "Bank_Name" header row and any
        # row too short to have that column (e.g. a blank line).
        banknamesShort = [line[2].lower() for line in csv.reader(f)
                          if len(line) > 2 and line[2] != "Bank_Name"]
    with open(fn2, 'rb') as f:
        # NOTE(review): the name is taken from column 1 while the header row
        # is detected through column 2 ("rssd9348") -- confirm this matches
        # the layout of the call-report file.
        banknamesLong = [line[1].lower() for line in csv.reader(f)
                         if len(line) > 2 and line[2] != "rssd9348"]
    banknamesLong = removeDuplicates(banknamesLong)
    banknamesShort = removeDuplicates(banknamesShort)
    # Set of all letters used in strings
    LETTERS = list(set("".join(banknamesLong + banknamesShort)))
    # Pick threshold
    threshold = 3
    # Consecutive (start, stop) index ranges into LETTERS, BIN_WIDTH wide.
    # The last range may run past the end of LETTERS; slicing tolerates that.
    bins = [(start, start + BIN_WIDTH)
            for start in range(0, len(LETTERS), BIN_WIDTH)]
    # Give every letter its own bit: 1, 2, 4, ...
    bitDict = {}
    for position, letter in enumerate(LETTERS):
        bitDict[letter] = 1 << position
print("Startup Stage: %s seconds" % t.secs)
# Bin stage: group both name lists by bin vector, then find, for every bin
# of short names, the bins of long names that lie close to it.
with Timer() as t:
    binDictLong = makeBinDict(banknamesLong)
    binDictShort = makeBinDict(banknamesShort)
    # One row per distinct bin vector of the long names
    binDictLongKeys = np.array(list(binDictLong))
    binMatchesDict = {}
    nameMatchesDict = {}
    for keyShort in binDictShort:
        # Store the matching bins under binMatchesDict; the previous target
        # name ("binMatches") was never defined and raised NameError.
        (binMatchesDict[keyShort], nameMatchesDict[keyShort]) = compareBins(
            np.array(keyShort), binDictLongKeys, binDictLong)
print("---------------------")
print("Bin Stage: %s seconds" % t.secs)
# Use bitwise comparison to compare strings
counter = 0          # every (short word, long word) pair examined
successCounter = 0   # pairs whose letter sets differ in <= threshold letters
compareDict = {}     # short word -> long words that passed the bit test
with Timer() as t:
    for keyShort in binDictShort:
        # Gets all words from the long list in the bins that
        # are in the ball close to the keyShort tuple.
        compareWords = nameMatchesDict[keyShort]
        # Make the comparison words into bit strings
        compareWordsBits = [makeBit(word2, bitDict) for word2 in compareWords]
        for shortWord in binDictShort[keyShort]:
            shortWordBits = makeBit(shortWord, bitDict)
            for longWord, longWordBits in zip(compareWords, compareWordsBits):
                counter += 1
                # XOR leaves a bit set for each letter present in exactly
                # one of the two words.
                if bitCount(shortWordBits ^ longWordBits) <= threshold:
                    successCounter += 1
                    compareDict.setdefault(shortWord, []).append(longWord)
print("---------------------")
print("Bit Stage: %s seconds" % t.secs)
print("Number of Short Words: %d" % len(banknamesShort))
print("Number of Long Words: %d" % len(banknamesLong))
print("Theoretical Number of Matches (short x long): %d" % (len(banknamesShort) * len(banknamesLong)))
print("Number of Total Comparisons: %d" % counter)
# LD stage: run the edit-distance function `ld` on the pairs that survived
# the bit stage and keep, per short word, the candidates at the smallest
# distance.
matchCount = []  # per matched short word: number of candidates tied for best
with Timer() as t:
    for shortWord, compareWordsList in compareDict.items():
        compareVals = [ld(shortWord, longWord) for longWord in compareWordsList]
        minVal = min(compareVals)
        if minVal <= threshold:
            matches = [word for dist, word in zip(compareVals, compareWordsList) if dist == minVal]
            matchCount.append(len(matches))
# Mean number of tied best matches per matched short word. This used to be
# len(matchCount) / len(compareDict), which is a fraction of words rather
# than the average the label below announces. Both ratios fall back to 0.0
# instead of dividing by zero when there is nothing to average over.
avgMatches = float(sum(matchCount)) / len(matchCount) if matchCount else 0.0
fracMatched = float(len(matchCount)) / len(banknamesShort) if banknamesShort else 0.0
print("---------------------")
print("LD Stage: %s seconds" % t.secs)
print("Number of Comparisons: %d" % successCounter)
print("Average Number of Matches: %f" % avgMatches)
print("Fraction of Short Words w Match: %f" % fracMatched)