def simhash_test():
    """Run a small Simhash / SimhashIndex demo over three sample sentences.

    Prints, in order:
      * each sample id with the result of ``get_phrases`` for its text,
      * each sample id with the ``value`` of its ``Simhash``,
      * the bucket size of a ``SimhashIndex`` built with ``k=3``,
      * the near-duplicates reported for a probe sentence, before and after
        the probe itself is added to the index under the id ``'4'``.

    Uses ``print()`` calls (not Python 2 print statements) so the function
    parses on Python 3; multi-argument prints keep the space-separated output.
    """
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }

    for k, v in data.items():
        print(k, get_phrases(v))

    for k, v in data.items():
        print(k, Simhash(get_phrases(v)).value)

    # Index ids are strings, so the integer keys are converted up front.
    objs = [(str(k), Simhash(get_phrases(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print(index.bucket_size())

    s1 = Simhash(get_phrases(u'How are you i am fine. blar blar blar blar blar thank'))
    print(index.get_near_dups(s1))

    # After adding the probe under its own id, the second lookup shows how
    # the result changes once the probe is part of the index.
    index.add('4', s1)
    print(index.get_near_dups(s1))
def use_simhash_index():
    """Build a SimhashIndex from three sample sentences and query it.

    Prints the index bucket size, then the near-duplicates reported for a
    probe sentence before and after the probe is added under the id "4".
    """
    samples = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }

    # Pair each stringified id with the Simhash of its text features.
    entries = []
    for doc_id, text in samples.items():
        entries.append((str(doc_id), Simhash(get_features(text))))

    sim_index = SimhashIndex(entries, k=3)
    print(sim_index.bucket_size())

    probe = Simhash(get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(sim_index.get_near_dups(probe))

    sim_index.add("4", probe)
    print(sim_index.get_near_dups(probe))
def use_simhash_index():
    """Demonstrate near-duplicate lookup with SimhashIndex.

    Three sample sentences are indexed with ``k=3``. The function prints the
    bucket size, the near-duplicates of a probe sentence, and the
    near-duplicates again once the probe has been added under the id "4".
    """
    corpus = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    fingerprints = [
        (str(key), Simhash(get_features(sentence)))
        for key, sentence in corpus.items()
    ]
    lookup = SimhashIndex(fingerprints, k=3)
    print(lookup.bucket_size())

    query_text = u"How are you i am fine. blar blar blar blar blar thank"
    query = Simhash(get_features(query_text))

    matches_before = lookup.get_near_dups(query)
    print(matches_before)

    lookup.add("4", query)
    matches_after = lookup.get_near_dups(query)
    print(matches_after)
class TestSimhashIndex(TestCase):
    """Tests for SimhashIndex bucket counting and near-duplicate lookup."""

    def setUp(self):
        # Index three sample sentences using the default SimhashIndex settings.
        samples = {
            1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
            2: u'How are you i am fine. blar blar blar blar blar than',
            3: u'This is simhash test.',
        }
        entries = []
        for ident, text in samples.items():
            entries.append((str(ident), Simhash(text)))
        self.index = SimhashIndex(entries)

    def test_bucket_size(self):
        # The fixture built in setUp is expected to occupy 6 buckets.
        bucket_count = self.index.bucket_size()
        self.assertEqual(bucket_count, 6)

    def test_get_near_dup(self):
        # The probe sentence is expected to match two indexed entries.
        probe = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
        matches = self.index.get_near_dups(probe)
        self.assertEqual(len(matches), 2)
def get_features(s):
    """Return the overlapping 3-character slices of the normalised text.

    The input is lower-cased and every run of non-word characters is removed
    before slicing. When fewer than three characters remain, the result is a
    one-element list holding whatever is left (possibly an empty string).
    """
    width = 3
    cleaned = re.sub(r'[^\w]+', '', s.lower())
    slice_count = max(len(cleaned) - width + 1, 1)
    shingles = []
    for start in range(slice_count):
        shingles.append(cleaned[start:start + width])
    return shingles


# Module-level demo: index three sample sentences, then look up a probe
# sentence before and after inserting it into the index under the id '4'.
data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = [
    (str(k), Simhash(get_features(v)))
    for k, v in data.items()
]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())

s1 = Simhash(
    get_features(u'How are you i am fine. blar blar blar blar blar thank')
)
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))


def main():
    """Placeholder entry point; intentionally does nothing."""


if __name__ == '__main__':
    main()
import re

from simhash import Simhash, SimhashIndex


def get_features(s):
    """Split text into overlapping 3-character features.

    The text is lower-cased and stripped of all non-word characters first.
    At least one feature is always returned, so text shorter than three
    characters after stripping comes back as a single (possibly empty) item.
    """
    width = 3
    normalized = re.sub(r'[^\w]+', '', s.lower())
    feature_count = max(len(normalized) - width + 1, 1)
    return [normalized[pos:pos + width] for pos in range(feature_count)]


# Sample documents keyed by integer id; ids are stringified for the index.
data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}
objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]

index = SimhashIndex(objs, k=3)
print(index.bucket_size())

# Look up a probe sentence, add it under the id '4', then look it up again.
s1 = Simhash(
    get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))
index = SimhashIndex(objs, k=3) if os.path.isfile(args.db): print 'MatchMeta.Info Database Located' print 'Patience...Loading Index...' conn = sqlite3.connect(args.db) meta = conn.execute( "SELECT path FROM MatchMeta WHERE path NOT LIKE '%winsxs%'") count = 1 for line in meta: item = Simhash(get_features(unicode(line[0]))) count = count + 1 index.add(count, item) print index.bucket_size() print 'Excluding the WINSXS Directory' print '---------------------------------' print ' MatchMeta.Info Database Loaded' print '---------------------------------' conn.close() else: print 'MatchMeta.Info Database -- FAILED' sys.exit() elif (args.near.upper() == 'N'): print 'Skipping MatchMeta.Info Database Fuzzing' else: print 'Please use only Y or N' #########################################################################################################################################
#duplicate detection keys = fourgram.keys() f1 = open('rezFinalNoDuplicates.txt', 'w') objs = [] for k in fourgram: try: objs.append((k, Simhash(fourgram[k]))) except Exception as e: print e #objs = [(k, Simhash(fourgram[k])) for k in fourgram] index = SimhashIndex(objs, k=3) print "bucket_size", index.bucket_size() for key in keys: s1 = Simhash(fourgram[key]) duplicates = ", ".join(index.get_near_dups(s1)) f1.write(key + "\t" + duplicates+"\n") print key, duplicates ''' while len(keys) > 0: key = keys.pop() keysJ = list(keys) f1.write(key + '\t' + text[key]) while len(keysJ) > 0: j = keysJ.pop()
objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()] index = SimhashIndex(objs, k=3) if os.path.isfile(args.db): print 'MatchMeta.Info Database Located' print 'Patience...Loading Index...' conn = sqlite3.connect(args.db) meta = conn.execute("SELECT path FROM MatchMeta WHERE path NOT LIKE '%winsxs%'") count = 1 for line in meta: item = Simhash(get_features(unicode(line[0]))) count = count+1 index.add(count,item) print index.bucket_size() print 'Excluding the WINSXS Directory' print '---------------------------------' print ' MatchMeta.Info Database Loaded' print '---------------------------------' conn.close() else: print 'MatchMeta.Info Database -- FAILED' sys.exit() elif(args.near.upper() == 'N'): print 'Skipping MatchMeta.Info Database Fuzzing' else: print 'Please use only Y or N' #########################################################################################################################################
func = delayed(read_file) F = F[:cutoff_users] with Parallel(-1) as MP: for res in MP(func(x) for x in tqdm(F)): if res is None: continue ox, dx = res objs.extend(ox) data.update(dx) print "OBJS SIZE", len(objs) index = SimhashIndex(objs, f=f_dim, k=3) print "Bucket size", index.bucket_size() accounted_keys = set() dataset = [] C = collections.Counter() for key, val in tqdm(objs): # Skip if we've seen this pattern before if key in accounted_keys: continue dupes = index.get_near_dups(val) tweet = data[key] # Don't report self-matches if len(dupes) <= 1: