def extract_next_links(url, resp) -> "list()":
    defrag = urldefrag(url)[0]
    print(defrag)
    if resp.status == 200:
        print("Scanning")
        if defrag not in urls:
            content = resp.raw_response.text
            data = getVisibleText(content)
            simmed = Simhash(data)
            if simmed.value not in sims:
                index = SimhashIndex(objs, k=3)
                if len(index.get_near_dups(simmed)) == 0:
                    urls.add(defrag)
                    sims.add(simmed.value)
                    objs.append((url, simmed))
                    print(len(urls), len(sims), len(objs))
                    try:
                        file = open("data_dump.txt", "a", errors="ignore")
                        to_write = url + " \n " + data + "\n" + str(simmed.value) + "\n\n"
                        file.write(to_write)
                    except Exception as e:
                        raise e
                    finally:
                        file.close()
                    # urls[defrag].add(getVisibleText(content))
                    # print(urls[defrag])
        return getAllUrls(url, content)
    else:
        print("Cant scan")
        return []

def __init__(self, vocab_to_freq, f=64, k=32):
    self.vocab_to_freq = vocab_to_freq
    self.simhash_index = SimhashIndex([], f=f, k=k)
    self.f = f
    self.k = k
    simhash_index = self.simhash_index
    for w in vocab_to_freq:
        sh = Simhash(w, f=f)
        simhash_index.add(w, sh)

def clustering():
    fout = open('cluster.txt', 'w', encoding='UTF-8')
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, cluster, sim_count, link, simhash FROM entries where cluster=0'
    )
    entrylist = cursor.fetchall()
    objs = []
    entrydic = {}
    for item in entrylist:
        if not is_en(item[1]):
            if not item[4].startswith("https://weibo.com"):
                sim = Simhash(get_features_cn(item[1]))
                objs.append((str(item[0]), sim))
                entrydic[str(item[0])] = {
                    'title': item[1],
                    'cluster': 0,
                    'sim_count': 0,
                    'link': item[4],
                    'simhash': sim.value
                }
        else:
            sim = Simhash(get_features(item[1]))
            objs.append((str(item[0]), sim))
            entrydic[str(item[0])] = {
                'title': item[1],
                'cluster': 0,
                'sim_count': 0,
                'link': item[4],
                'simhash': sim.value
            }
    index = SimhashIndex(objs, k=tolerance)
    cluster_num = last_cluster_num
    for key in entrydic:
        if entrydic[key]['cluster'] == 0:
            sims = index.get_near_dups(
                Simhash(get_features_cn(entrydic[key]['title'])))
            for item in sims:
                entrydic[item]['cluster'] = cluster_num
                # if len(sims) > 1:
                entrydic[item]['sim_count'] = len(sims) - 1
                if len(sims) > 1:
                    fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' +
                               entrydic[item]['title'] + '\n')
                cursor.execute(
                    'UPDATE entries SET cluster=%s, sim_count=%s, simhash=%s where id = %s',
                    (entrydic[item]['cluster'], entrydic[item]['sim_count'],
                     str(entrydic[item]['simhash']), item))
                # conn.commit()
                # fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' + entrydic[item]['title'] + '\t' + entrydic[item]['link'] + '\n')
            cluster_num += 1
    # cursor.execute('UPDATE somevariables SET last_cluster=%s', (cluster_num,))
    # conn.commit()
    conn.close()

def main():
    # user_query = input()
    DOCID = 0
    numPartial = 1
    index = SimhashIndex([])
    totaldocs = 0
    docnum = 0
    validDocFile = open('validDocs2', 'w')
    for root, dirs, files in os.walk(DEVPATH):
        for fname in files:
            if not fname.endswith(".json"):
                continue
            totaldocs += 1
            h2t = html2text.HTML2Text()
            file = open(root + "/" + fname)
            pageDict = json.loads(file.read())
            # close file to get memory back
            file.close()
            # get html-formatted content
            htmlContent = pageDict['content']
            print(pageDict['url'])
            plainContent = h2t.handle(htmlContent)
            feat = get_features(plainContent)
            sim = Simhash(feat)
            if len(index.get_near_dups(sim)) > 0:
                continue
            print(docnum, totaldocs)
            index.add(str(docnum), sim)
            validDocFile.write(root + "/" + fname + "\n")
            docnum += 1
    validDocFile.close()

class SpellingCorrector(object):
    def __init__(self, vocab_to_freq, f=64, k=32):
        self.vocab_to_freq = vocab_to_freq
        self.simhash_index = SimhashIndex([], f=f, k=k)
        self.f = f
        self.k = k
        simhash_index = self.simhash_index
        for w in vocab_to_freq:
            sh = Simhash(w, f=f)
            simhash_index.add(w, sh)

    def add_valid_word(self, word):
        if word not in self.vocab_to_freq:
            sh = Simhash(word, self.f)
            self.simhash_index.add(word, sh)
        self.vocab_to_freq[word] = self.vocab_to_freq.get(word, 0) + 1

    def correct_word(self, word):
        if word in self.vocab_to_freq:
            return word
        # Edit distance between
        sh = Simhash(word, f=self.f)
        candidates = self.simhash_index.get_near_dups(sh)
        if not candidates:
            # No near dups. Oh well. This word will go as it is.
            print 'no candidates'
            return word
        if len(candidates) == 1:
            # Only one candidate, so assume this is the correction
            return candidates[0]
        lev_dist_gen = ((other_w, levenshtein(other_w, word)) for other_w in candidates)
        closest_words, dists = zip(*all_min_or_max(lev_dist_gen, min, lambda item: item[1]))
        if len(closest_words) == 1:
            # One of the candidates had the best edit distance. Return that.
            return closest_words[0]
        # OK, there are multiple closest words. Rely on word frequency to choose the right one.
        vocab_to_freq = self.vocab_to_freq
        word_freq_gen = ((other_w, vocab_to_freq[other_w]) for other_w in closest_words)
        most_freq_words, freqs = zip(*all_min_or_max(word_freq_gen, max, lambda item: item[1]))
        # Using choice because at this point there's no other way to narrow it down, unless we
        # track higher order ngrams.
        return choice(most_freq_words)

def get_near_dups(query_simhash, candidates_simhash, k):
    res = [0] * len(candidates_simhash)
    query = Simhash(value=query_simhash)
    for i in range(len(candidates_simhash)):
        candidates_simhash[i] = (str(i), Simhash(value=candidates_simhash[i]))
        i = i + 1
    index = SimhashIndex(candidates_simhash, k=k)
    near_dups = index.get_near_dups(query)
    for dup in near_dups:
        res[int(dup)] = 1
    return res

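# Usage sketch for the helper above (illustrative, not from the original source):
# the 64-bit fingerprints are assumed to have been computed elsewhere via
# Simhash(...).value; the sample sentences are made up.
from simhash import Simhash

query = Simhash(u'How are you i am fine. blar blar blar thank').value
candidates = [
    Simhash(u'How are you, i am fine blar blar blar thank').value,  # punctuation-only difference
    Simhash(u'This is simhash test.').value,                        # unrelated text
]
print(get_near_dups(query, candidates, k=3))  # typically [1, 0]
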
def sim_merge(finaldb_cut, simdb):
    d = {}
    index_list = []
    hashurl2sim = {}
    max_distance = 10
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl title author images links text pub_time
            #    1      2     3      4     5     6      7
            # jushi shouji zujin dizhi ditie url crawl_time source ext
            #   8      9     10    11    12   13     14       15   16
            array = line.rstrip('\r\n').split('\t')
            hashurl = array[0]    # string, key
            title = array[1]      # string
            text = array[5]       # string
            pub_time = array[6]   # string
            url = array[12]       # string
            s = Simhash((title + text).decode('utf-8'))
            d.update({hashurl: (title, url, pub_time)})
            sim = Simhash((title + text).decode('utf-8'))
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl: sim})
    index = SimhashIndex(index_list, k=max_distance)
    merged = {}
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if d.has_key(h):
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join([buf_list[0][0], json.dumps(buf_list[1:])]))

def get_simHashindex(hash_list):
    """
    Purpose: build a Simhash index.
    Parameter: a list of (key, Simhash) pairs.
    Returns: the SimhashIndex.
    """
    return SimhashIndex(hash_list, k=5)  # build the index

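# Usage sketch for get_simHashindex above (illustrative; the document texts and
# keys are made up):
from simhash import Simhash

docs = {
    "a": "simhash near-duplicate detection",
    "b": "simhash near duplicate detection",
}
hash_list = [(key, Simhash(text)) for key, text in docs.items()]
index = get_simHashindex(hash_list)
# Both keys should come back, since the two texts are identical after tokenization;
# order is not guaranteed.
print(index.get_near_dups(Simhash("simhash near-duplicate detection")))
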
def simhash_clustering(
    signatures: List[int],
    hamming_distance: int = 3,
    # num_blocks: Optional[int] = 5,
) -> List[List[int]]:
    index = SimhashIndex(
        [(i, Simhash(value=signature)) for i, signature in enumerate(signatures)],
        k=hamming_distance,
    )
    neighbors: List[List[int]] = []
    for signature in signatures:
        neighbors.append(
            list(map(int, index.get_near_dups(Simhash(value=signature)))))
    return neighbors

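# Usage sketch for simhash_clustering above (illustrative): the signatures are
# assumed to be 64-bit Simhash values computed beforehand; the sample texts are
# made up.
from simhash import Simhash

texts = [
    "the cat sat on the mat.",
    "the cat sat on the mat",   # punctuation-only difference from the first
    "a completely different sentence",
]
signatures = [Simhash(t).value for t in texts]
print(simhash_clustering(signatures, hamming_distance=3))
# Typically [[0, 1], [0, 1], [2]]; the order inside each inner list may vary.
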
def _build_index():
    global _INDEX
    index_list = []
    for domain in _TRAIN.keys():
        sim = Simhash(domain)
        index_list.append((domain, sim))
    _INDEX = SimhashIndex(index_list, k=100)

def setUp(self):
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }
    objs = [(str(k), Simhash(v)) for k, v in data.items()]
    self.index = SimhashIndex(objs)

def process_graph(self, project_id):
    visits = defaultdict(list)
    p = 0
    hashtags_db = Hashtag.objects.filter(project_id=project_id)
    logger.info("Total hashtags to process " + str(len(hashtags_db)))
    for hashtag_entry in hashtags_db:
        visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
        p += 1
    logger.info("Hashtag read")
    logger.info("Hashtag processed " + str(p))
    logger.info("Visits count " + str(len(visits)))
    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, hashtags in visits.iteritems():
        if len(hashtags) > MIN_HASHTAG_PER_USER:
            simhash = Simhash(hashtags, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" % (cant_processed, len(visits)))
    cant_processed = 0
    for user, hashtags in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                hashtag_near_dups = visits[user_near_dups]
                intersect = set(hashtags).intersection(hashtag_near_dups)
                ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                if ratio >= 0.1:
                    hashtag_graph = HashtagGraph(user_oid_i=user,
                                                 user_oid_j=user_near_dups,
                                                 ratio=ratio)
                    hashtag_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)

def sim_merge(finaldb_cut, simdb):
    d = {}
    index_list = []
    hashurl2sim = {}
    max_distance = 10
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl title author images links text pub_time
            #    1      2     3      4     5     6      7
            # jushi shouji zujin dizhi ditie url crawl_time source ext
            #   8      9     10    11    12   13     14       15   16
            array = line.rstrip('\r\n').split('\t')
            hashurl = array[0]    # string, key
            title = array[1]      # string
            text = array[5]       # string
            pub_time = array[6]   # string
            url = array[12]       # string
            s = Simhash((title + text).decode('utf-8'))
            d.update({hashurl: (title, url, pub_time)})
            sim = Simhash((title + text).decode('utf-8'))
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl: sim})
    index = SimhashIndex(index_list, k=max_distance)
    merged = {}
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if d.has_key(h):
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join([buf_list[0][0], json.dumps(buf_list[1:])]))

def process_graph(self, project_id):
    visits = defaultdict(list)
    processed = 0
    urls_db = Urls.objects.filter(project_id=project_id)
    logger.info("Total urls to process " + str(len(urls_db)))
    for url_entry in urls_db:
        visits[url_entry.user_id].append(url_entry.url)
        processed += 1
    logger.info("Urls read")
    logger.info("Urls processed " + str(processed))
    logger.info("Visits count " + str(len(visits)))
    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, urls in visits.iteritems():
        if len(urls) > MIN_URLS_PER_USER:
            simhash = Simhash(urls, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" % (cant_processed, len(visits)))
    cant_processed = 0
    for user, urls in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(urls, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                urls_near_dups = visits[user_near_dups]
                intersect = set(urls).intersection(urls_near_dups)
                ratio = len(intersect) * 1.0 / len(urls_near_dups)
                if ratio >= 0.1:
                    url_graph = UrlsGraph(user_oid_i=user,
                                          user_oid_j=user_near_dups,
                                          ratio=ratio)
                    url_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)

def simhashsort(datadic, entryset):
    objs = [(id, Simhash(sent)) for id, sent in datadic.items()]
    index = SimhashIndex(objs, k=tolerance)  # k is the tolerance: the larger k is, the more similar texts are retrieved
    kind = 1  # cluster label
    sorted = set()
    for id in datadic:
        if str(id) in sorted:  # do not classify the same entry twice
            continue
        # find the set of near-duplicates
        similiarlist = index.get_near_dups(Simhash(datadic[id]))
        similiarlist.append(str(id))
        # write the cluster information back into entryset
        for id in similiarlist:
            sorted.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["cluster"] = kind
        kind += 1

class TestSimhashIndex(TestCase):
    def setUp(self):
        data = {
            1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
            2: u'How are you i am fine. blar blar blar blar blar than',
            3: u'This is simhash test.',
        }
        objs = [(str(k), Simhash(v)) for k, v in data.items()]
        self.index = SimhashIndex(objs)

    def test_bucket_size(self):
        self.assertEqual(self.index.bucket_size(), 6)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

def save_duplicates(save_path, text2hash_dict, k=5):
    """Group similar docs' title"""
    # Construct SimhashIndex object for similar docs detection. k is tolerance.
    index = SimhashIndex(text2hash_dict, k=k)
    done = list()
    with tqdm(total=len(text2hash_dict)) as pbar:
        with open(save_path, 'w', encoding='utf8') as file:
            for i in range(len(text2hash_dict) - 1):
                # get near duplicates
                near_dups = index.get_near_dups(text2hash_dict[i][1])
                # near dups includes origin title, len > 1 requested
                if len(near_dups) > 1 and text2hash_dict[i][0] not in done:
                    for title in near_dups:
                        file.write(title)
                        file.write('\n')
                    file.write('#' * 5 + '\n')
                    done.extend(near_dups)
                pbar.update()

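# Sketch of how the text2hash_dict argument above might be assembled before calling
# save_duplicates (illustrative; the titles and output path are made up, and the
# function's own dependencies such as tqdm are assumed to be importable):
from simhash import Simhash

titles = [
    "Breaking: markets rally today",
    "Breaking - markets rally today",   # near-duplicate of the first title
    "Local weather update",
]
text2hash_dict = [(title, Simhash(title)) for title in titles]
save_duplicates("duplicates.txt", text2hash_dict, k=5)
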
def process_graph(self, project_id):
    visits = defaultdict(list)
    p = 0
    hashtags_db = Hashtag.objects.filter(project_id=project_id)
    logger.info("Total hashtags to process " + str(len(hashtags_db)))
    for hashtag_entry in hashtags_db:
        visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
        p += 1
    logger.info("Hashtag read")
    logger.info("Hashtag processed " + str(p))
    logger.info("Visits count " + str(len(visits)))
    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, hashtags in visits.iteritems():
        if len(hashtags) > MIN_HASHTAG_PER_USER:
            simhash = Simhash(hashtags, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" % (cant_processed, len(visits)))
    cant_processed = 0
    for user, hashtags in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                hashtag_near_dups = visits[user_near_dups]
                intersect = set(hashtags).intersection(hashtag_near_dups)
                ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                if ratio >= 0.1:
                    hashtag_graph = HashtagGraph(user_oid_i=user,
                                                 user_oid_j=user_near_dups,
                                                 ratio=ratio)
                    hashtag_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)

def __init__(self, hash_size=64, hash_tol=3, num_words_to_complete=10):
    """
    Params:
        hash_size : The number of output bits of the hash function used in SimHash.
            Higher values -> able to handle more noise.
        hash_tol : The number of bits that can differ for a candidate near-match in Simhash
        num_words_to_complete : The number of words to complete given a context when a new
            document is encountered in get_best_match
    """
    self.num_words_to_complete = num_words_to_complete
    self.hash_size = hash_size
    self.hash_tol = hash_tol
    # This implementation of simhash stores the index in RAM, but it could easily be
    # put on disk.
    self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
    self.author_identifier = LanguageModelAuthorIdentifier()
    self.author_semantic_models = SemanticLanguageModels()

def simhashSort2(datadic, entryset):
    objs = []
    for entry in datadic:
        objs.append((entry[0], Simhash(entry[1])))
    index = SimhashIndex(objs, k=tolerance)  # k is the tolerance: the larger k is, the more similar texts are retrieved
    kind = 1  # cluster label
    sorted = set()
    for item in datadic:
        if str(item[0]) in sorted:  # do not classify the same entry twice
            continue
        # find the set of near-duplicates
        similiarlist = index.get_near_dups(Simhash(item[1]))
        similiarlist.append(str(item[0]))  # append this item's own id (the original appended item[1], the text)
        # write the cluster information back into entryset
        for id in similiarlist:
            sorted.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["sim_count"] = kind
        kind += 1

def create_test_data():
    """For 1 million records, it takes 5 minutes.
    """
    complexity = 1000**2
    print("create data ...")
    data = [rand_str(8) for i in range(complexity)]
    print("calculate simhash ...")
    objs = [(i, Simhash(item)) for i, item in enumerate(data)]
    print("create index ...")
    index = SimhashIndex(objs, k=3)
    safe_dump_pk(data, datafile)
    safe_dump_pk(index, indexfile)

def add_to_redis(CONN, hashes):
    logger.info(type(hashes))
    objs = []
    for k, v in hashes.iteritems():
        a = Simhash('a')
        a.value = int(k)
        objs.append((v, a))
    logger.info(objs[0])
    logger.info('Number of objects: {}'.format(len(objs)))
    index = SimhashIndex(CONN, objs, k=3)
    return index

def simhash_1(labels, targets, query, query_url, dataset, k=2, width=5):
    dictionary = dict(zip(labels, targets))
    objs = [(str(k), Simhash(get_features(v, width))) for k, v in dictionary.items()]
    index = SimhashIndex(objs, k=k)
    query_simhash = Simhash(get_features(query, width))
    near_dups = index.get_near_dups(query_simhash)
    # Save fingerprints for future use
    appendToFingerprints(
        dataset, './dataset/fingerprints.csv', {
            "query": str(query_simhash.value),
            "duplicates": ' '.join([str(obj[1].value) for obj in objs])
        })
    # print("QUERY: {}".format(query_url))
    # pp(near_dups)
    return {
        "dataset": dataset,
        "query": query_url,
        "duplicates": ' '.join(near_dups)
    }

def simhash_test():
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }
    for k, v in data.items():
        print k, get_phrases(v)
    for k, v in data.items():
        print k, Simhash(get_phrases(v)).value
    objs = [(str(k), Simhash(get_phrases(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print index.bucket_size()
    s1 = Simhash(get_phrases(u'How are you i am fine. blar blar blar blar blar thank'))
    print index.get_near_dups(s1)
    index.add('4', s1)
    print index.get_near_dups(s1)

async def gen_simhash_index(conf):
    m = 0
    n = 0
    objs = []
    simhash_answer_index = {}
    for items in conf.DEMO_QUESTION:
        for item in items:
            objs.append((n, Simhash(await _tokenization(conf, item))))
            simhash_answer_index[n] = m
            n += 1
        m += 1
    simhash_index = SimhashIndex(objs, k=6)
    return simhash_index, simhash_answer_index

def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex
    WIDTH = 3

    def gg():
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()
    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print time.time() - start
    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print time.time() - start
    print len(d1), len(d2)
    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print a[1] == b[1], '\t', a, '\t', b

def find_near_matches(session, collection, index_size, probability_index_near_match):
    from simhash import Simhash, SimhashIndex
    logging.getLogger().setLevel(logging.CRITICAL)
    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter', 'simhash']])
        .where(model.Tweet.collection == collection)
    )
    simhash_index = SimhashIndex([], k=7)
    insert_relation_stmt = pg.insert(model.relation)
    # insert_tweet_near_matches_stmt = insert_tweet_near_matches_stmt.on_conflict_do_update(
    #     index_elements=['tweet_id', 'collection'],
    #     set_={
    #         'earliest_near_match_id': insert_tweet_near_matches_stmt.excluded.earliest_near_match_id
    #     }
    # )
    indexed_tweet_ids = []
    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):
        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()
        simhash = Simhash(simhash_value)
        near_matches_ids = simhash_index.get_near_dups(simhash)
        if not near_matches_ids:
            simhash_index.add(tweet_id, simhash)
            indexed_tweet_ids.append((tweet_id, simhash))
            if len(indexed_tweet_ids) > index_size:
                simhash_index.delete(*indexed_tweet_ids.pop(0))
        if near_matches_ids:
            near_match_id = min(near_matches_ids)
            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)
            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )
    session.commit()

def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print(index.bucket_size())
    s1 = Simhash(
        get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))
    index.add("4", s1)
    print(index.get_near_dups(s1))

def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print(index.bucket_size())
    s1 = Simhash(get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))
    index.add("4", s1)
    print(index.get_near_dups(s1))

def console_test():
    from simhash import Simhash, SimhashIndex
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }
    objs = [(str(k), Simhash(v)) for k, v in data.items()]
    index = SimhashIndex(objs, k=10)
    s1 = Simhash(
        u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
    )
    dups = index.get_near_dups(s1)
    dups = index.get_near_dups2(s1, 5)
    index.remove(s1)

def __init__(self, config, worker=None):
    self.config = config
    self.host, self.port = config.cache_server
    # self.robots = list of banned paths
    self.robots = {}
    self.simhashes = SimhashIndex([])
    self.link = 1
    self.worker = worker
    self.maxWords = ("", 0)  # maxWords[0] is the URL, maxWords[1] is the number of words in it
    self.wordCounter = Counter()  # a dictionary that keeps track of the # of words
    self.stopWords = [
        '1', 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an',
        'and', 'any', 'are', 'are', "aren't", 'as', 'at', 'b', 'be', 'because', 'been',
        'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'can', "can't",
        'cannot', 'could', "couldn't", 'd', 'did', "didn't", 'do', 'does', "doesn't",
        'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had',
        "hadn't", 'has', 'has', "hasn't", "hasn't", 'have', "haven't", 'having', 'he',
        "he'd", "he'll", "he's", 'her', 'herself', 'him', 'himself', 'his', 'how', "how's",
        'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's",
        'its', 'itself', "let's", "ll", 'm', 'may', 'me', 'more', 'most', "mustn't", 'my',
        'myself', 'next', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'once', 'one',
        'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
        's', 'same', 'say', 'says', "shan't", 'she', "she'd", "she'll", "she's", 'should',
        "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that's", 'the', 'their',
        'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they',
        "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too',
        'under', 'under', 'until', 'until', 'up', 've', 'very', 'was', "wasn't", 'we',
        "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when',
        "when's", 'where', 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'will',
        'with', "won't", 'would', "wouldn't", 'x', 'y', 'you', "you'd", "you'll", "you're",
        "you've", 'your', 'yourself', 'yourselves'
    ]

def main(path):
    corpuses = readFiles.normalize(path)
    results = []
    for corpus in corpuses:
        hashset = {}
        listofitems = []
        for item in corpus.keys():
            if item == 'desc':
                continue
            z = Simhash(corpus[item])
            hashset[item] = z
            listofitems += [(item, z)]
        l = SimhashIndex(listofitems)
        # print(l.get_near_dups(hashset['../corpus/bbc/tech1/001.txt']))
        hashlist = {}
        for i, item1 in enumerate(hashset.keys()):
            hashlist[item1] = []
            for j, item2 in enumerate(hashset.keys()):
                if j < i:
                    hashlist[item1] += [' ']
                    continue
                hashlist[item1] += [hashset[item1].distance(hashset[item2])]
                # print item1, item2, hashset[item1].distance(hashset[item2])
        results += [[hashset, hashlist, corpus['desc']]]
    with open('results.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='{')
        for hashset, hashlist, desc in results:
            writer.writerow([" "])
            writer.writerow([i for i in desc.split()])
            record = []
            record += [['Table'] + [key for key in hashset.keys()]]
            for k in hashset.keys():
                record += [[k] + hashlist[k]]
            for item in record:
                writer.writerow(item)

def clear(self):
    self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)

s3 = 'This is simhash test.'.decode('utf-8', 'ignore')
# print get_features(s1)
#
# print Simhash(get_features('How are you? I am fine. Thanks.')).value
sh1 = Simhash(s1)
sh2 = Simhash(s2)
sh3 = Simhash(s3)
# print sh.value
# print sh1.distance(sh2)
shIndex = SimhashIndex([], k=3)
shIndex.add('1', sh1)
shIndex.add('2', sh2)
# shIndex.add('3', sh3)
if shIndex.get_near_dups(sh3):
    print 'YES'
else:
    print 'NO'
# print shIndex.get_near_dups(sh2)

def init_index(url, initial_data):
    data[url] = initial_data
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    global index
    index = SimhashIndex(objs, k=3)

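# Usage sketch for init_index above (illustrative): it assumes the module-level
# `data` dict and `index` global that the function mutates, plus a `get_features`
# helper like the one used elsewhere in this collection. The URL and text are made up.
import re
from simhash import Simhash, SimhashIndex

data = {}
index = None

def get_features(s, width=3):
    s = re.sub(r'[^\w]+', '', s.lower())
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

init_index("http://example.com/a", "near-duplicate detection with simhash")
# The stored page differs from the query only in punctuation, so it should be returned.
print(index.get_near_dups(Simhash(get_features("near duplicate detection with simhash"))))
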
    f_stop.close()
    f_stop_seg_list = f_stop_text.split('\n')
    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordlist.append(myword)
    return ''.join(mywordlist)


# data.head()['content'].apply(lambda x: jiebaclearText(str(x)))
data['content'] = data['content'].apply(lambda x: jiebaclearText(str(x)))
data['simhash'] = data['content'].apply(lambda x: Simhash(x).value)
train = data.loc[data['source'] == 'train']
test = data.loc[data['source'] == 'test']
train.drop('source', axis=1, inplace=True)
test.drop(['source'], axis=1, inplace=True)
objs = [(row["id"], Simhash(row["content"])) for index, row in train.iterrows()]
index = SimhashIndex(objs, k=12)
test['result'] = test['content'].apply(
    lambda x: index.get_near_dups(Simhash(x)))
sub['result'] = test['result']
sub.to_csv('../output/simhash.csv', index=False)

    sys.exit(-1)

logger.info('connect mongo ok.')
try:
    logger.info('{create_time:{$gte:%ld,$lt:%ld} }' % (lasttimestamp, curtimestamp))
    status_count = weibocollection.find(
        {'create_time': {'$gte': lasttimestamp, '$lt': curtimestamp}}).count()
    logger.info('status_count: %d' % status_count)
    if status_count < 10:
        connection.close()
        mylogger.close()
        sys.exit(0)
    stopwords = loadstopwords(stopwordsfilename)
    fdoc = open(docfile, 'w')
    fcut = open(cutfile, 'w')
    num = 0
    simnum = 0
    cutnum = 0
    # simhash
    index = SimhashIndex({})
    for one in weibocollection.find({'create_time': {'$gte': lasttimestamp, '$lt': curtimestamp}}):
        weibo_id = str(one['_id'])
        weibo_text = one['data']['text'].strip()
        text_sh = Simhash(weibo_text)
        if len(index.get_near_dups(text_sh)) == 0:
            # no similar text found, so segment it
            text_seg = jieba.cut(weibo_text)
            text_result = list(set(text_seg) - stopwords)
            content = ' 1 '.join(text_result)
            if content != '':
                fdoc.write(weibo_id + '\t' + weibo_text.encode('utf-8') + '\n')
                fcut.write(content.encode('utf-8') + ' 1\n')
                cutnum += 1
            simnum += 1
        num += 1
        index.add(num, text_sh)

print(Simhash('aa').distance(Simhash('aa')))


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}
objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())
s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))
index.add('4', s1)
print(index.get_near_dups(s1))


def main():
    pass


if __name__ == '__main__':
    main()

def setUp(self):
    objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
    self.index = SimhashIndex(objs, k=10)

class TestSimhashIndex(TestCase):
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)

class DocCollection(object):
    def __init__(self, hash_size=64, hash_tol=3, num_words_to_complete=10):
        """
        Params:
            hash_size : The number of output bits of the hash function used in SimHash.
                Higher values -> able to handle more noise.
            hash_tol : The number of bits that can differ for a candidate near-match in Simhash
            num_words_to_complete : The number of words to complete given a context when a new
                document is encountered in get_best_match
        """
        self.num_words_to_complete = num_words_to_complete
        self.hash_size = hash_size
        self.hash_tol = hash_tol
        # This implementation of simhash stores the index in RAM, but it could easily be
        # put on disk.
        self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
        self.author_identifier = LanguageModelAuthorIdentifier()
        self.author_semantic_models = SemanticLanguageModels()

    def generate_simhash(self, tokens):
        # Generate a Simhash from Spacy tokens.
        sh = Simhash(u'', f=self.hash_size)  # silly interface...
        sh.build_by_features(tokens)
        return sh

    def add(self, doc, title, author):
        add_to_index = self.simhash_index.add
        # Index each paragraph in the document into the simhash index
        paras = extract_paragraphs(doc)
        # Update the word shape language model for this author
        para_toks = [tokenize(p) for p in paras]
        flat_tokens = [item for sublist in para_toks for item in sublist]
        self.author_semantic_models.add_doc(flat_tokens, author)
        # Update the semantic model for this author
        self.author_identifier.add_doc(flat_tokens, author)
        # Add each paragraph to the simhash index
        for para_num, tokens in enumerate(para_toks, 1):
            if not tokens:
                continue
            sh = self.generate_simhash(tokens)
            self.simhash_index.add((tokens, title, author, para_num), sh)

    def get_best_match(self, snippet):
        get_near_dups = self.simhash_index.get_near_dups
        generate_simhash = self.generate_simhash
        title_author_to_count = {}
        paras = extract_paragraphs(snippet)
        # evenly distribute the corrupted paragraphs
        # shuffle(paras)
        # For each paragraph, get the closest matching previously encountered paragraphs.
        # If multiple matches, prune via edit distance.
        # The work of art that matches the most paragraphs is the winner (if it matches enough)
        paras_done = 0
        for para in paras:
            tokens = tokenize(para)
            if not tokens:
                continue
            paras_done += 1
            sh = generate_simhash(tokens)
            candidates = [make_tuple(match) for match in get_near_dups(sh)]
            # Increment the count of these works
            for candidate in candidates:
                _, title, author, para_num = candidate
                k = (title, author)
                title_author_to_count[k] = title_author_to_count.get(k, 0) + 1
        if title_author_to_count:
            # OK, what work was the most frequent, and what was that frequency?
            (title, author), f = max(title_author_to_count.iteritems(), key=lambda item: item[1])
            score = 1. * f / paras_done
            if score >= 0.1:
                return {'title': title, 'author': author, 'score': score,
                        'author_score': None, 'completion': None}
        # This is either so corrupt that we can't tell what it is, or is a new work.
        # Guess the author
        tokens = [item for sublist in [tokenize(p) for p in paras] for item in sublist]
        author_guess, author_score = self.author_identifier.predict_author(tokens)
        completion = self.author_semantic_models.complete(author_guess, tokens,
                                                          self.num_words_to_complete, 1)
        return {'title': None, 'author': author_guess, 'score': None,
                'author_score': author_score, 'completion': completion}

    def clear(self):
        self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)

test_index = [(u[0], Simhash(u[0])) for u in urls]

# simhash_results_a.txt : k=20 (subset)
# simhash_results_b.txt
with open('testdata/solr_20150320/simhash_results_k10.txt', 'w') as f:
    f.write('')

start_time = time.time()
for index, (test_url, test_simhash) in enumerate(test_index):
    i_start_time = time.time()
    if index % 50 == 0:
        print 'completed {0} of {1}'.format(index, len(urls))
    duplicates = []
    for i in xrange(0, len(test_index), 300):
        index = SimhashIndex(test_index[i:i + 300], k=10)
        dupes = index.get_near_dups(test_simhash)
        if len(dupes) > 0:
            duplicates += dupes
    print '\t{0} takes {1}'.format(len(duplicates), time.time() - i_start_time)
    with open('testdata/solr_20150320/simhash_results_k10.txt', 'a') as f:
        f.write(json.dumps({test_url: duplicates}) + '\n')
print 'takes:', time.time() - start_time

class NearDuplicate:
    def __init__(self, filenames, k=2, metadata_dictionary=None):
        self.filenames = filenames
        self.simhash_index = None
        self.image_dictionary = {}
        self.metadata_dictionary = metadata_dictionary
        self.k = k
        # Need to store the image hashes in some fashion
        # Possibly cluster the hashes (k-means)

    def tika_metadata(self, filename):
        """Use the tika-py module to grab metadata for a file"""
        parsed = parser.from_file(filename)
        return parsed.get("metadata", {})

    def exifread_metadata(self, filename):
        """Use the exifread module to grab metadata for a file"""
        f = open(filename, 'rb')
        tags = exifread.process_file(f)
        return tags

    def generate_features_from_dict(self, filename):
        """Use this function when we provide json metadata information
        from the tika java module"""
        # Find the metadata object from the json metadata file for the image_file named 'filename'
        metadata = self.metadata_dictionary.get(filename, {})
        # The tags or type of metadata we want
        feature_tags = ["Image Height", "Image Width", "File Size", "Content-Type",
                        "Image Bytes", "File Name Suffix"]
        # Create a feature array using these metadata values
        features = []
        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Files Size": 2,
            "Content-Type": 3,
            "Image Bytes": 6,
            "File Name Suffix": 2
        }
        # Grab the bytes of the entire file
        image_bytes = "NONE"
        try:
            image_bytes = open(filename, 'rb').read()
        except OSError:
            image_bytes = "NONE"
        # Get the central bytes
        image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
        byte_offset = len(image_bytes_str) // 4
        filename_suffix = filename[-10:]
        modified_metadata = {
            "Image Height": metadata.get("Image Height", "NONE"),
            "Image Width": metadata.get("Image Width", "NONE"),
            "File Size": metadata.get("File Size", "NONE"),
            "Content-Type": metadata.get("Content-Type", "NONE"),
            "Image Bytes": image_bytes_str[byte_offset:-byte_offset],
            "File Name Suffix": filename_suffix
        }
        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(),
                                                                 modified_metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))
        return features

    def generate_features(self, filename):
        """Given an image generate a feature vector"""
        """ Since Tika-Py requires a server call (i.e. slower)
        Do native image metadata grabbing, and fallback on tika
        if the image can't be opened (i.e., it's an svg or gif)
        """
        im, use_tika = None, False
        try:
            im = Image.open(filename)
            use_tika = False
        except IOError:
            use_tika = True
        # Grab the metadata for the image
        metadata = {}
        # We'll store features to use for simhash in a tuple array [(token, weight)]
        features = []
        if use_tika:
            # Use only metadata from tika
            # The image file can't be opened using PIL.Image, so that means
            # a diff type of image besides jpg, png
            metadata = self.tika_metadata(filename)
            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            # image_bytes_str = str(image_bytes)
            byte_offset = len(image_bytes_str) // 4
            metadata["Image Bytes"] = image_bytes_str[byte_offset:-byte_offset]
            feature_tags = ["Image Height", "Image Width", "File Size", "Content-Type",
                            "Image Bytes"]
            features = [tag + ":" + metadata.get(tag, "NONE") for tag in feature_tags]
            return features
        """ FEATURES
        We'll resize the image so all images are normalized to a certain size
        Also make sure to retain aspect ratio
        Features to use (in order of importance)
        - center region bytes
        - color histogram
        - content type
        - image width
        - image height
        We can take subregions of the image, and hash those
        """
        # Resize the image so all images are normalized
        width = im.size[0]
        height = im.size[1]
        resize_width = 30
        resize_height = resize_width * height / width
        resize_im = None
        histogram_bytes, histogram_weight = "", 0
        center_region_bytes, center_region_weight = "", 5
        extension = ""
        try:
            resize_im = im.resize((resize_width, resize_height), Image.ANTIALIAS)
            # Crop sub regions
            height_padding, width_padding = resize_height / 5, resize_width / 5
            box = (width_padding, height_padding, resize_width - width_padding,
                   resize_height - height_padding)
            sub_region = resize_im.crop(box)
            # Generate a histogram
            histogram_bytes, histogram_weight = str(resize_im.histogram()), 4
            center_region_bytes, center_region_weight = str(list(sub_region.getdata())), 3
        except OSError:
            # Couldn't resize the image. Let's
            print >> sys.stderr, "Couldn't resize the image. Prob an eps or svg"
            resize_im = im
            resize_width = im.size[0]
            resize_height = im.size[1]
            sub_region = im
            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            # image_bytes_str = str(image_bytes)
            histogram_bytes = "NONE"
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            byte_offset = len(image_bytes_str) // 4
            center_region_bytes = image_bytes_str[byte_offset:-byte_offset]
        extension = resize_im.format if resize_im.format != None else os.path.splitext(filename)[1]
        # Figure out the content type (png, jpg, etc.)
        content_type = "image/" + str(extension.lower())
        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Image Histogram": histogram_weight,
            "Content-Type": 5,
            "Center Region Bytes": center_region_weight
        }
        metadata = {
            "Image Height": str(width),
            "Image Width": str(height),
            "Image Histogram": histogram_bytes,
            "Content-Type": content_type,
            "Center Region Bytes": center_region_bytes
        }
        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(),
                                                                 metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))
        return features

    def merge_near_duplicate_dictionaries(self, nd):
        """Merge the current near duplicate instance with another near duplicate instance"""
        smaller_nd = self if len(self.image_dictionary) <= len(nd.image_dictionary) else nd
        larger_nd = self if len(self.image_dictionary) > len(nd.image_dictionary) else nd
        final_dict = larger_nd.image_dictionary
        # Iterate over the smaller near duplicate instance
        for key in smaller_nd.image_dictionary.keys():
            # If an exact duplicate exists, just grab it and merge them
            if larger_nd.image_dictionary.get(key, None) != None:
                arr = smaller_nd.image_dictionary.get(key, []) + \
                      larger_nd.image_dictionary.get(key, [])
                final_dict[key] = arr
                continue
            # Find the closest near duplicate in the larger dictionary by
            # using its index
            simhash_obj = smaller_nd.image_dictionary[key][0]["hash_object"]
            near_duplicates_keys = larger_nd.simhash_index.get_near_dups(simhash_obj)
            # If a near duplicate exists
            if len(near_duplicates_keys) > 0:
                # grab the array of images at that key in the larger dictionary
                # Merge it with the array of images in the smaller dictionary
                near_dup_key = near_duplicates_keys[0]
                arr = smaller_nd.image_dictionary.get(key, []) + \
                      larger_nd.image_dictionary.get(near_dup_key, [])
                # create an entry in the new dictionary
                final_dict[near_dup_key] = arr
                continue
            # Otherwise we should just add this key-object from the dictionary
            # to this array
            final_dict[key] = smaller_nd.image_dictionary[key]
            # Add this simhash to the Index for efficient searching
            larger_nd.simhash_index.add(key, simhash_obj)
        self.image_dictionary = final_dict
        self.simhash_index = larger_nd.simhash_index
        nd.image_dicionary = final_dict
        nd.simhash_index = larger_nd.simhash_index
        # Now simply return this final dict
        return final_dict

    def simhash_value_to_key(self, simhash):
        """Given a simhash object, convert its value to a hexadecimal key
        This key will be used in our image_file dictionary
        """
        return str(hex(simhash.value))

    def deduplicate_images(self):
        """Given a list of image files "self.filenames",
        deduplicate the images using near deduplication
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary != None:
                # Will use a java tika program to generate metadata
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)
            # Simhash this list of features
            sHash = Simhash(feature_array)
            if self.simhash_index == None:
                # First image, so we create the index, add it to the dictionary
                # And move on to next iteration
                key = self.simhash_value_to_key(sHash)
                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
                continue
            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)
            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary
                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0]
                # Get the key for this current image
                current_simhash_key = self.simhash_value_to_key(sHash)
                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename": image_file,
                    "hash_key": current_simhash_key,
                    "hash_object": sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)
                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)
                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]

def deduplicate_images(self):
    """Given a list of image files "self.filenames",
    deduplicate the images using near deduplication
    """
    # Iterate through our files
    for image_file in self.filenames:
        feature_array = []
        if self.metadata_dictionary != None:
            # Will use a java tika program to generate metadata
            # Metadata will be a json file with {filename : metadata} objects
            feature_array = self.generate_features_from_dict(image_file)
        else:
            # Use our own function for grabbing metadata
            # Create a list of features
            feature_array = self.generate_features(image_file)
        # Simhash this list of features
        sHash = Simhash(feature_array)
        if self.simhash_index == None:
            # First image, so we create the index, add it to the dictionary
            # And move on to next iteration
            key = self.simhash_value_to_key(sHash)
            # We will use this index to speed up the process for finding
            # nearby simhashes
            self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
            self.image_dictionary[key] = [{
                "filename": image_file,
                "hash_key": key,
                "hash_object": sHash
            }]
            continue
        near_duplicates_keys = self.simhash_index.get_near_dups(sHash)
        if len(near_duplicates_keys) > 0:
            # There are duplicates, so we should add them to the corresponding entry
            # in the file dictionary
            # Get the key for the nearest duplicate image
            near_dup_simhash_key = near_duplicates_keys[0]
            # Get the key for this current image
            current_simhash_key = self.simhash_value_to_key(sHash)
            # Create an object comprised of the image filename and key
            # We'll store this in a dictionary to be used in our merge step
            current_simhash_object = {
                "filename": image_file,
                "hash_key": current_simhash_key,
                "hash_object": sHash
            }
            self.image_dictionary[near_dup_simhash_key].append(current_simhash_object)
        else:
            # No duplicates, so let's create an entry in our image filename dictionary
            key = self.simhash_value_to_key(sHash)
            # Add this simhash to the Index for efficient searching
            self.simhash_index.add(key, sHash)
            # Create an object in our image file dictionary
            self.image_dictionary[key] = [{
                "filename": image_file,
                "hash_key": key,
                "hash_object": sHash
            }]

class TestSimhashIndex(TestCase):
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        for j in range(n, n + 4):
            fourgram[url].append(words[j])

# duplicate detection
keys = fourgram.keys()
f1 = open('rezFinalNoDuplicates.txt', 'w')
objs = []
for k in fourgram:
    try:
        objs.append((k, Simhash(fourgram[k])))
    except Exception as e:
        print e
# objs = [(k, Simhash(fourgram[k])) for k in fourgram]
index = SimhashIndex(objs, k=3)
print "bucket_size", index.bucket_size()
for key in keys:
    s1 = Simhash(fourgram[key])
    duplicates = ", ".join(index.get_near_dups(s1))
    f1.write(key + "\t" + duplicates + "\n")
    print key, duplicates
'''
while len(keys) > 0:
    key = keys.pop()
    keysJ = list(keys)
    f1.write(key + '\t' + text[key])

#########################################################################################################################################
if (args.near.upper() == 'Y'):
    print '---------------------------------'
    print ' MatchMeta.Info Database Fuzzing'
    print '---------------------------------'

    def get_features(s):
        width = 3
        s = s.lower()
        s = re.sub(r'[^\w]+', '', s)
        return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

    data = {}
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    if os.path.isfile(args.db):
        print 'MatchMeta.Info Database Located'
        print 'Patience...Loading Index...'
        conn = sqlite3.connect(args.db)
        meta = conn.execute("SELECT path FROM MatchMeta WHERE path NOT LIKE '%winsxs%'")
        count = 1
        for line in meta:
            item = Simhash(get_features(unicode(line[0])))
            count = count + 1
            index.add(count, item)
        print index.bucket_size()
        print 'Excluding the WINSXS Directory'