class TestSimhashIndex(TestCase):
    # Documents 1 and 2 are near-duplicates of each other; 3 is unrelated.
    data = {
        1: u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.',
        2: u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than',
        3: u'This is a different one.',
    }

    def setUp(self):
        """Rebuild a fresh index over the corpus before every test."""
        entries = [(str(key), Simhash(text)) for key, text in self.data.items()]
        self.index = SimhashIndex(entries)

    def test_get_near_dup(self):
        """delete/add are idempotent; each changes the near-dup count once."""
        probe = Simhash(
            u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
        )
        self.assertEqual(len(self.index.get_near_dups(probe)), 2)
        # First delete removes doc 1; the repeated delete is a no-op.
        for expected in (1, 1):
            self.index.delete('1', Simhash(self.data[1]))
            self.assertEqual(len(self.index.get_near_dups(probe)), expected)
        # First add restores doc 1; the repeated add is a no-op.
        for expected in (2, 2):
            self.index.add('1', Simhash(self.data[1]))
            self.assertEqual(len(self.index.get_near_dups(probe)), expected)
class TestSimhashIndex(TestCase):
    # Documents 1, 2 and 4 are near-duplicates; 3 is unrelated.
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        """Rebuild the index with tolerance k=10 before each test."""
        entries = [(str(key), Simhash(text)) for key, text in self.data.items()]
        self.index = SimhashIndex(entries, k=10)

    def test_get_near_dup(self):
        """delete/add are idempotent: repeating them leaves the count unchanged."""
        probe = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        self.assertEqual(len(self.index.get_near_dups(probe)), 3)
        # First delete removes doc 1; the second is a no-op.
        for expected in (2, 2):
            self.index.delete('1', Simhash(self.data[1]))
            self.assertEqual(len(self.index.get_near_dups(probe)), expected)
        # First add restores doc 1; the second is a no-op.
        for expected in (3, 3):
            self.index.add('1', Simhash(self.data[1]))
            self.assertEqual(len(self.index.get_near_dups(probe)), expected)
class TestSimhashIndex(TestCase):
    # Docs 1, 2 and 4 are near-duplicates of each other; doc 3 is not.
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        """Fresh index (tolerance k=10) for every test."""
        self.index = SimhashIndex(
            [(str(key), Simhash(text)) for key, text in self.data.items()],
            k=10,
        )

    def _dup_count(self, simhash):
        # Number of indexed documents within k bits of `simhash`.
        return len(self.index.get_near_dups(simhash))

    def test_get_near_dup(self):
        """Deleting/adding doc 1 is idempotent and shifts the count by one."""
        probe = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        self.assertEqual(3, self._dup_count(probe))
        self.index.delete('1', Simhash(self.data[1]))
        self.assertEqual(2, self._dup_count(probe))
        self.index.delete('1', Simhash(self.data[1]))
        self.assertEqual(2, self._dup_count(probe))
        self.index.add('1', Simhash(self.data[1]))
        self.assertEqual(3, self._dup_count(probe))
        self.index.add('1', Simhash(self.data[1]))
        self.assertEqual(3, self._dup_count(probe))
def extract_next_links(url, resp) -> "list()":
    """Collect outgoing links from a 200 response, skipping near-duplicate pages.

    Uses the module-level accumulators ``urls`` (seen defragmented URLs),
    ``sims`` (seen simhash values) and ``objs`` ((url, Simhash) pairs), plus a
    SimhashIndex with k=3, to drop pages whose visible text is a
    near-duplicate of an already-crawled page.

    Returns the page's outgoing links, or [] for non-200 responses and
    already-seen URLs.
    """
    defrag = urldefrag(url)[0]
    print(defrag)
    if resp.status != 200:
        print("Cant scan")
        return []
    print("Scanning")
    if defrag in urls:
        # BUG FIX: the original fell through to `getAllUrls(url, content)`
        # with `content` unbound (NameError) when the URL was already seen.
        return []
    content = resp.raw_response.text
    data = getVisibleText(content)
    simmed = Simhash(data)
    if simmed.value not in sims:
        index = SimhashIndex(objs, k=3)
        if len(index.get_near_dups(simmed)) == 0:
            urls.add(defrag)
            sims.add(simmed.value)
            objs.append((url, simmed))
            print(len(urls), len(sims), len(objs))
            # BUG FIX: the original opened the dump file before the try
            # block, so a failed open() left `file` unbound in `finally`
            # (and shadowed the builtin).  A context manager closes the
            # handle on every path.
            with open("data_dump.txt", "a", errors="ignore") as dump:
                dump.write(url + " \n " + data + "\n" + str(simmed.value) + "\n\n")
    return getAllUrls(url, content)
def use_simhash_index():
    """Demonstrate SimhashIndex: build, query, add, re-query."""
    corpus = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    entries = []
    for doc_id, text in corpus.items():
        entries.append((str(doc_id), Simhash(get_features(text))))
    # k=3: tolerate up to three differing fingerprint bits.
    index = SimhashIndex(entries, k=3)
    print(index.bucket_size())
    probe = Simhash(get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(probe))
    index.add("4", probe)
    print(index.get_near_dups(probe))
def use_simhash_index():
    """Build a three-document simhash index and print near-dup lookups."""
    docs = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    index = SimhashIndex(
        [(str(num), Simhash(get_features(text))) for num, text in docs.items()],
        k=3,  # tolerance: max differing bits for a near-duplicate
    )
    print(index.bucket_size())
    query = Simhash(
        get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(query))
    index.add("4", query)
    print(index.get_near_dups(query))
def simhash_test():
    """Smoke-test Simhash/SimhashIndex on three short documents (Python 2).

    Documents 1 and 2 are near-duplicates; document 3 is unrelated.  Prints
    the extracted phrases, the raw fingerprints, the index bucket size, and
    the near-duplicates of a probe string before and after the probe itself
    is added to the index.
    """
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }
    # Show the feature extraction for each document.
    for k, v in data.items():
        print k, get_phrases(v)
    # Show the resulting fingerprint values.
    for k, v in data.items():
        print k, Simhash(get_phrases(v)).value
    objs = [(str(k), Simhash(get_phrases(v))) for k, v in data.items()]
    # k=3: tolerate up to 3 differing bits when looking up near-duplicates.
    index = SimhashIndex(objs, k=3)
    print index.bucket_size()
    s1 = Simhash(get_phrases(u'How are you i am fine. blar blar blar blar blar thank'))
    print index.get_near_dups(s1)
    index.add('4', s1)
    print index.get_near_dups(s1)
def clustering():
    """Cluster un-clustered feed entries by near-duplicate titles.

    Reads all rows with cluster=0, fingerprints each title with Simhash
    (CJK and English titles use different feature extractors), then assigns
    one cluster number per group of near-duplicate titles and writes the
    results back to the `entries` table.

    Relies on module-level `conn` (DB connection), `tolerance`
    (SimhashIndex k) and `last_cluster_num` (first cluster id to assign).
    """
    fout = open('cluster.txt', 'w', encoding='UTF-8')
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, cluster, sim_count, link, simhash FROM entries where cluster=0'
    )
    entrylist = cursor.fetchall()
    objs = []      # (id, Simhash) pairs for the index
    entrydic = {}  # id -> working copy of the entry's fields
    for item in entrylist:
        # item: (id, title, cluster, sim_count, link, simhash)
        if not is_en(item[1]):
            # NOTE(review): non-English weibo.com entries are skipped entirely
            # (never indexed, never clustered) — confirm this is intended.
            if not item[4].startswith("https://weibo.com"):
                sim = Simhash(get_features_cn(item[1]))
                objs.append((str(item[0]), sim))
                entrydic[str(item[0])] = {
                    'title': item[1],
                    'cluster': 0,
                    'sim_count': 0,
                    'link': item[4],
                    'simhash': sim.value
                }
        else:
            sim = Simhash(get_features(item[1]))
            objs.append((str(item[0]), sim))
            entrydic[str(item[0])] = {
                'title': item[1],
                'cluster': 0,
                'sim_count': 0,
                'link': item[4],
                'simhash': sim.value
            }
    index = SimhashIndex(objs, k=tolerance)
    cluster_num = last_cluster_num
    for key in entrydic:
        if entrydic[key]['cluster'] == 0:
            # NOTE(review): lookups always use get_features_cn, even for
            # English titles that were indexed with get_features — confirm.
            sims = index.get_near_dups(
                Simhash(get_features_cn(entrydic[key]['title'])))
            for item in sims:
                entrydic[item]['cluster'] = cluster_num
                # if len(sims) > 1:
                entrydic[item]['sim_count'] = len(sims) - 1
                if len(sims) > 1:
                    fout.write(item + '\t' + str(entrydic[item]['cluster']) +
                               '\t' + entrydic[item]['title'] + '\n')
                cursor.execute(
                    'UPDATE entries SET cluster=%s, sim_count=%s, simhash=%s where id = %s',
                    (entrydic[item]['cluster'], entrydic[item]['sim_count'],
                     str(entrydic[item]['simhash']), item))
                # conn.commit()
                # fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' + entrydic[item]['title'] + '\t' + entrydic[item]['link'] + '\n')
            cluster_num += 1
    # cursor.execute('UPDATE somevariables SET last_cluster=%s', (cluster_num,))
    # conn.commit()
    conn.close()
def main():
    """Walk DEVPATH and record the paths of non-near-duplicate documents.

    Every *.json crawl record is converted to plain text, fingerprinted with
    Simhash, and its path written to 'validDocs2' unless the index already
    holds a near-duplicate.  (Removed the unused DOCID/numPartial locals.)
    """
    # user_query = input()
    index = SimhashIndex([])
    totaldocs = 0
    docnum = 0
    # FIX: context managers replace the bare open()/close() pairs, so file
    # handles are released even when json parsing or text extraction raises
    # (and the builtin name `file` is no longer shadowed).
    with open('validDocs2', 'w') as validDocFile:
        for root, dirs, files in os.walk(DEVPATH):
            for fname in files:
                if not fname.endswith(".json"):
                    continue
                totaldocs += 1
                # A fresh converter per document, as in the original —
                # HTML2Text instances keep internal state across handle() calls.
                h2t = html2text.HTML2Text()
                with open(root + "/" + fname) as docfile:
                    pageDict = json.loads(docfile.read())
                # get html formated content
                htmlContent = pageDict['content']
                print(pageDict['url'])
                plainContent = h2t.handle(htmlContent)
                sim = Simhash(get_features(plainContent))
                # Skip documents similar to one already accepted.
                if len(index.get_near_dups(sim)) > 0:
                    continue
                print(docnum, totaldocs)
                index.add(str(docnum), sim)
                validDocFile.write(root + "/" + fname + "\n")
                docnum += 1
class SpellingCorrector(object):
    """Spelling corrector backed by a simhash near-duplicate index (Python 2).

    Unknown words are corrected to the known vocabulary word with the
    smallest Levenshtein distance among the simhash near-duplicates,
    breaking ties by corpus frequency and finally by random choice.
    """

    def __init__(self, vocab_to_freq, f=64, k=32):
        # vocab_to_freq: {word: corpus frequency} for the known vocabulary.
        # f: fingerprint width in bits; k: max Hamming distance for candidates.
        self.vocab_to_freq = vocab_to_freq
        self.simhash_index = SimhashIndex([], f=f, k=k)
        self.f = f
        self.k = k
        simhash_index = self.simhash_index  # local alias for the insert loop
        for w in vocab_to_freq:
            sh = Simhash(w, f=f)
            simhash_index.add(w, sh)

    def add_valid_word(self, word):
        """Register `word` as valid (indexing it on first sight) and count it."""
        if word not in self.vocab_to_freq:
            sh = Simhash(word, self.f)
            self.simhash_index.add(word, sh)
        self.vocab_to_freq[word] = self.vocab_to_freq.get(word, 0) + 1

    def correct_word(self, word):
        """Return `word` if known, otherwise the best correction candidate."""
        if word in self.vocab_to_freq:
            return word
        # Edit distance between the word and its simhash near-duplicates.
        sh = Simhash(word, f=self.f)
        candidates = self.simhash_index.get_near_dups(sh)
        if not candidates:
            # No near dups. Oh well. This word will go as it is.
            print 'no candidates'
            return word
        if len(candidates) == 1:
            # Only one candidate, so assume this is the correction.
            return candidates[0]
        # Keep only the candidates at minimal Levenshtein distance.
        lev_dist_gen = ((other_w, levenshtein(other_w, word)) for other_w in candidates)
        closest_words, dists = zip(*all_min_or_max(lev_dist_gen, min, lambda item: item[1]))
        if len(closest_words) == 1:
            # One of the candidates had the best edit distance. Return that.
            return closest_words[0]
        # OK, there are multiple closest words. Rely on word frequency to choose the right one.
        vocab_to_freq = self.vocab_to_freq
        word_freq_gen = ((other_w, vocab_to_freq[other_w]) for other_w in closest_words)
        most_freq_words, freqs = zip(*all_min_or_max(word_freq_gen, max, lambda item: item[1]))
        # Using choice because at this point there's no other way to narrow it down, unless we
        # track higher order ngrams.
        return choice(most_freq_words)
def get_near_dups(query_simhash, candidates_simhash, k):
    """Mark which candidates are within k bits of the query simhash.

    Args:
        query_simhash: integer simhash value of the query document.
        candidates_simhash: list of integer simhash values.
        k: maximum Hamming distance for a near-duplicate.

    Returns:
        list[int]: res[i] == 1 iff candidate i is a near-duplicate, else 0.
    """
    res = [0] * len(candidates_simhash)
    query = Simhash(value=query_simhash)
    # FIX: the original overwrote the caller's list in place (surprising
    # side effect) and incremented the loop variable to no effect.
    indexed = [(str(i), Simhash(value=value))
               for i, value in enumerate(candidates_simhash)]
    index = SimhashIndex(indexed, k=k)
    for dup in index.get_near_dups(query):
        res[int(dup)] = 1
    return res
def sim_merge(finaldb_cut, simdb):
    """Merge near-duplicate listings from `finaldb_cut` into `simdb` (Python 2).

    Each input line is a tab-separated record.  Records whose title+text
    simhashes lie within Hamming distance 10 are grouped; each group is
    sorted by pub_time descending and stored as one simdb row: the newest
    record's hashurl plus the JSON-encoded rest of the group.
    """
    d = {}             # hashurl -> (title, url, pub_time), not yet grouped
    index_list = []    # (hashurl, Simhash) pairs for the index
    hashurl2sim = {}   # hashurl -> Simhash, for later lookups
    max_distance = 10  # simhash tolerance k
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl title author images links text pub_time
            #   1      2     3      4     5     6      7
            # jushi shouji zujin dizhi ditie url crawl_time source ext
            #   8     9      10    11    12   13     14       15   16
            array = line.rstrip('\r\n').split('\t')
            hashurl = array[0]   # string, key
            title = array[1]     # string
            text = array[5]      # string
            pub_time = array[6]  # string
            url = array[12]      # string
            d.update({hashurl: (title, url, pub_time)})
            # FIX: the original computed this Simhash twice per line and
            # discarded the first result (dead variable `s`).
            sim = Simhash((title + text).decode('utf-8'))
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl: sim})
    index = SimhashIndex(index_list, k=max_distance)
    merged = {}  # hashurl -> record already assigned to a group
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if d.has_key(h):
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            # Newest first; store the newest hashurl with the rest as JSON.
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join([buf_list[0][0], json.dumps(buf_list[1:])]))
def find_near_matches(session, collection, index_size, probability_index_near_match):
    """Link each tweet in `collection` to its earliest simhash near-match.

    Streams (tweet_id, simhash value) pairs for the collection, keeps a
    sliding SimhashIndex of at most `index_size` unique tweets (k=7), and
    inserts a 'near_match' relation row pointing each duplicate tweet at
    the smallest matching tweet id.  Commits every 100k rows.

    NOTE(review): `probability_index_near_match` is accepted but never used
    in this body — confirm whether it is still needed.
    """
    from simhash import Simhash, SimhashIndex
    # Silence all sub-CRITICAL logging from libraries during the scan.
    logging.getLogger().setLevel(logging.CRITICAL)
    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter','simhash']])
        .where(model.Tweet.collection == collection)
    )
    simhash_index = SimhashIndex([], k=7)
    insert_relation_stmt = pg.insert(model.relation)
    # insert_tweet_near_matches_stmt = insert_tweet_near_matches_stmt.on_conflict_do_update(
    #     index_elements=['tweet_id', 'collection'],
    #     set_={
    #         'earliest_near_match_id': insert_tweet_near_matches_stmt.excluded.earliest_near_match_id
    #     }
    # )
    indexed_tweet_ids = []  # FIFO of (tweet_id, Simhash) currently in the index
    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):
        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()
        simhash = Simhash(simhash_value)
        near_matches_ids = simhash_index.get_near_dups(simhash)
        if not near_matches_ids:
            # New (non-duplicate) tweet: index it and age out the oldest
            # entry once the sliding window exceeds index_size.
            simhash_index.add(tweet_id, simhash)
            indexed_tweet_ids.append((tweet_id, simhash))
            if len(indexed_tweet_ids) > index_size:
                simhash_index.delete(*indexed_tweet_ids.pop(0))
        if near_matches_ids:
            near_match_id = min(near_matches_ids)
            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)
            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )
    session.commit()
def console_test():
    """Ad-hoc console exercise of SimhashIndex with tolerance k=10.

    NOTE(review): `get_near_dups2` and `remove` are not part of the
    published simhash package API — presumably a local fork; verify
    before relying on this snippet.
    """
    from simhash import Simhash, SimhashIndex
    # Docs 1, 2 and 4 are near-duplicates of each other; 3 is unrelated.
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }
    objs = [(str(k), Simhash(v)) for k, v in data.items()]
    index = SimhashIndex(objs, k=10)
    s1 = Simhash(
        u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
    )
    # Results are discarded; the calls are smoke tests of each API entry point.
    dups = index.get_near_dups(s1)
    dups = index.get_near_dups2(s1, 5)
    index.remove(s1)
def simhash_clustering(
    signatures: List[int],
    hamming_distance: int = 3,
) -> List[List[int]]:
    """For each signature, list the indices of all near-duplicate signatures.

    Args:
        signatures: precomputed integer simhash values.
        hamming_distance: maximum differing bits for two signatures to be
            considered neighbours (SimhashIndex k).

    Returns:
        One list of neighbour indices per input signature (each signature
        is its own neighbour).
    """
    entries = [(idx, Simhash(value=sig)) for idx, sig in enumerate(signatures)]
    index = SimhashIndex(entries, k=hamming_distance)
    return [
        [int(dup) for dup in index.get_near_dups(Simhash(value=sig))]
        for sig in signatures
    ]
def process_graph(self, project_id):
    """Build the hashtag-similarity graph for one project (Python 2).

    Groups hashtags by user, indexes each qualifying user's hashtag list
    with a Simhash (module-level f1/k1 parameters), then links pairs of
    users whose hashtag sets are simhash near-duplicates and overlap by at
    least 10%, persisting each edge as a HashtagGraph row.
    """
    visits = defaultdict(list)  # user_id -> list of that user's hashtags
    p = 0  # total hashtag rows read
    hashtags_db = Hashtag.objects.filter(project_id=project_id)
    logger.info("Total hashtags to process " + str(len(hashtags_db)))
    for hashtag_entry in hashtags_db:
        visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
        p += 1
    logger.info("Hashtag read")
    logger.info("Hashtag processed " + str(p))
    logger.info("Visits count " + str(len(visits)))
    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    # Index only users with enough hashtags for a meaningful fingerprint.
    for user, hashtags in visits.iteritems():
        if len(hashtags) > MIN_HASHTAG_PER_USER:
            simhash = Simhash(hashtags, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simash index build for %i out of %i users" %
                (cant_processed, len(visits)))
    cant_processed = 0
    for user, hashtags in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                hashtag_near_dups = visits[user_near_dups]
                intersect = set(hashtags).intersection(hashtag_near_dups)
                # Fraction of the neighbour's hashtags shared with this user.
                ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                if ratio >= 0.1:
                    hashtag_graph = HashtagGraph(user_oid_i=user,
                                                 user_oid_j=user_near_dups,
                                                 ratio=ratio)
                    hashtag_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
def sim_merge(finaldb_cut, simdb):
    """Merge near-duplicate listings from `finaldb_cut` into `simdb` (Python 2).

    Each input line is a tab-separated record.  Records whose title+text
    simhashes lie within Hamming distance 10 are grouped; each group is
    sorted by pub_time descending and stored as one simdb row: the newest
    record's hashurl plus the JSON-encoded rest of the group.
    """
    d = {}             # hashurl -> (title, url, pub_time), not yet grouped
    index_list = []    # (hashurl, Simhash) pairs for the index
    hashurl2sim = {}   # hashurl -> Simhash, for later lookups
    max_distance = 10  # simhash tolerance k
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl title author images links text pub_time
            #   1      2     3      4     5     6      7
            # jushi shouji zujin dizhi ditie url crawl_time source ext
            #   8     9      10    11    12   13     14       15   16
            array = line.rstrip('\r\n').split('\t')
            hashurl = array[0]   # string, key
            title = array[1]     # string
            text = array[5]      # string
            pub_time = array[6]  # string
            url = array[12]      # string
            d.update({hashurl: (title, url, pub_time)})
            # FIX: the original computed this Simhash twice per input line
            # and discarded the first result (dead variable `s`).
            sim = Simhash((title + text).decode('utf-8'))
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl: sim})
    index = SimhashIndex(index_list, k=max_distance)
    merged = {}  # hashurl -> record already assigned to a group
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if d.has_key(h):
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            # Newest first; store the newest hashurl with the rest as JSON.
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join([buf_list[0][0], json.dumps(buf_list[1:])]))
def process_graph(self, project_id):
    """Build the URL-similarity graph for one project (Python 2).

    Groups visited URLs by user, indexes each qualifying user's URL list
    with a Simhash (module-level f1/k1 parameters), then links pairs of
    users whose URL sets are simhash near-duplicates and overlap by at
    least 10%, persisting each edge as a UrlsGraph row.
    """
    visits = defaultdict(list)  # user_id -> list of that user's URLs
    processed = 0  # total URL rows read
    urls_db = Urls.objects.filter(project_id=project_id)
    logger.info("Total urls to process " + str(len(urls_db)))
    for url_entry in urls_db:
        visits[url_entry.user_id].append(url_entry.url)
        processed += 1
    logger.info("Urls read")
    logger.info("Urls processed " + str(processed))
    logger.info("Visits count " + str(len(visits)))
    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    # Index only users with enough URLs for a meaningful fingerprint.
    for user, urls in visits.iteritems():
        if len(urls) > MIN_URLS_PER_USER:
            simhash = Simhash(urls, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simash index build for %i out of %i users" %
                (cant_processed, len(visits)))
    cant_processed = 0
    for user, urls in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(urls, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                urls_near_dups = visits[user_near_dups]
                intersect = set(urls).intersection(urls_near_dups)
                # Fraction of the neighbour's URLs shared with this user.
                ratio = len(intersect) * 1.0 / len(urls_near_dups)
                if ratio >= 0.1:
                    url_graph = UrlsGraph(user_oid_i=user,
                                          user_oid_j=user_near_dups,
                                          ratio=ratio)
                    url_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
class TestSimhashIndex(TestCase):

    def setUp(self):
        """Index three short documents; docs 1 and 2 are near-duplicates."""
        corpus = {
            1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
            2: u'How are you i am fine. blar blar blar blar blar than',
            3: u'This is simhash test.',
        }
        self.index = SimhashIndex(
            [(str(doc_id), Simhash(text)) for doc_id, text in corpus.items()]
        )

    def test_bucket_size(self):
        # Three documents spread over the default number of buckets.
        self.assertEqual(6, self.index.bucket_size())

    def test_get_near_dup(self):
        # The probe is close to docs 1 and 2 but not to doc 3.
        probe = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
        self.assertEqual(2, len(self.index.get_near_dups(probe)))
def simhashsort(datadic, entryset):
    """Assign a "cluster" number to each entry based on simhash similarity.

    Args:
        datadic: {id: sentence} mapping of documents to cluster.
        entryset: iterable of dicts with an "id" key; each gets a "cluster"
            number, with near-duplicate documents sharing a number.

    Depends on the module-level `tolerance` (the SimhashIndex k:
    larger k retrieves more similar texts).
    """
    objs = [(doc_id, Simhash(sent)) for doc_id, sent in datadic.items()]
    index = SimhashIndex(objs, k=tolerance)
    kind = 1  # next cluster number to assign
    # FIX: renamed `sorted`/`id` locals that shadowed the builtins.
    classified = set()  # ids already placed in a cluster
    for doc_id in datadic:
        if str(doc_id) in classified:  # never re-classify a document
            continue
        # All near-duplicates of this document, plus the document itself.
        similar = index.get_near_dups(Simhash(datadic[doc_id]))
        similar.append(str(doc_id))
        for sim_id in similar:
            classified.add(sim_id)
        # FIX: single pass over entryset (the original rescanned the whole
        # entryset once per similar id, writing the same value repeatedly).
        for entry in entryset:
            if str(entry["id"]) in similar:
                entry["cluster"] = kind
        kind += 1
def process_graph(self, project_id):
    """Build the hashtag-similarity graph for one project (Python 2).

    Groups hashtags by user, indexes each qualifying user's hashtag list
    with a Simhash (module-level f1/k1 parameters), then links pairs of
    users whose hashtag sets are simhash near-duplicates and overlap by at
    least 10%, persisting each edge as a HashtagGraph row.
    """
    visits = defaultdict(list)  # user_id -> list of that user's hashtags
    p = 0;  # total hashtag rows read
    hashtags_db = Hashtag.objects.filter(project_id=project_id)
    logger.info("Total hashtags to process "+str(len(hashtags_db)))
    for hashtag_entry in hashtags_db:
        visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
        p +=1
    logger.info("Hashtag read")
    logger.info("Hashtag processed " + str(p))
    logger.info("Visits count " + str(len(visits)))
    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    # Index only users with enough hashtags for a meaningful fingerprint.
    for user, hashtags in visits.iteritems():
        if len(hashtags) > MIN_HASHTAG_PER_USER:
            simhash = Simhash(hashtags, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simash index build for %i out of %i users" %
                (cant_processed, len(visits)))
    cant_processed = 0
    for user, hashtags in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                hashtag_near_dups = visits[user_near_dups]
                intersect = set(hashtags).intersection(hashtag_near_dups)
                # Fraction of the neighbour's hashtags shared with this user.
                ratio = len(intersect)*1.0/len(hashtag_near_dups)
                if ratio >= 0.1:
                    hashtag_graph = HashtagGraph(user_oid_i=user,
                                                 user_oid_j=user_near_dups,
                                                 ratio=ratio)
                    hashtag_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
def save_duplicates(save_path, text2hash_dict, k=5):
    """Group similar docs' title.

    Args:
        save_path: output file; each group of near-duplicate titles is
            written followed by a '#####' separator line.
        text2hash_dict: sequence of (title, Simhash) pairs.
        k: SimhashIndex tolerance (max differing fingerprint bits).
    """
    # Construct SimhashIndex object for similar docs detection. k is tolerance.
    index = SimhashIndex(text2hash_dict, k=k)
    # FIX: a set instead of a list — membership tests were O(n) per doc.
    done = set()
    with tqdm(total=len(text2hash_dict)) as pbar:
        with open(save_path, 'w', encoding='utf8') as file:
            # BUG FIX: the original iterated range(len(...) - 1), so the
            # final document was never examined (and tqdm never completed).
            for i in range(len(text2hash_dict)):
                # get near duplicates
                near_dups = index.get_near_dups(text2hash_dict[i][1])
                # near dups includes origin title, len > 1 requested
                if len(near_dups) > 1 and text2hash_dict[i][0] not in done:
                    for title in near_dups:
                        file.write(title)
                        file.write('\n')
                    file.write('#' * 5 + '\n')
                    done.update(near_dups)
                pbar.update()
def simhashSort2(datadic, entryset):
    """Assign a similarity-cluster number ("sim_count") to each entry.

    Args:
        datadic: iterable of (id, text) pairs.
        entryset: iterable of dicts with an "id" key; near-duplicate
            documents get the same "sim_count" value.

    Depends on the module-level `tolerance` (the SimhashIndex k:
    larger k retrieves more similar texts).
    """
    objs = [(entry[0], Simhash(entry[1])) for entry in datadic]
    index = SimhashIndex(objs, k=tolerance)
    kind = 1  # next cluster number to assign
    # FIX: renamed `sorted` local that shadowed the builtin.
    classified = set()  # ids already placed in a cluster
    for item in datadic:
        if str(item[0]) in classified:  # never re-classify a document
            continue
        similar = index.get_near_dups(Simhash(item[1]))
        # BUG FIX: the original appended str(item[1]) — the document *text* —
        # to a list of ids, so the current document was never marked as
        # classified and id-membership checks below could not match it.
        similar.append(str(item[0]))
        for sim_id in similar:
            classified.add(sim_id)
        for entry in entryset:
            if str(entry["id"]) in similar:
                entry["sim_count"] = kind
        kind += 1
def simhash_1(labels, targets, query, query_url, dataset, k=2, width=5):
    """Find near-duplicates of `query` among `targets` via simhash.

    Args:
        labels, targets: parallel sequences of document ids and texts.
        query: text to search for; query_url: its identifier in the result.
        dataset: dataset name, echoed into the result and fingerprint file.
        k: SimhashIndex tolerance; width: shingle width for get_features.

    Returns:
        dict with the dataset name, the query url and a space-separated
        string of duplicate labels.  Also appends the fingerprints to
        ./dataset/fingerprints.csv for future use.
    """
    dictionary = dict(zip(labels, targets))
    # BUG FIX: the comprehension variable was named `k`, shadowing the
    # tolerance parameter `k` (and clobbering it outright under Python 2,
    # where comprehension variables leak into the enclosing scope).
    objs = [(str(label), Simhash(get_features(text, width)))
            for label, text in dictionary.items()]
    index = SimhashIndex(objs, k=k)
    query_simhash = Simhash(get_features(query, width))
    near_dups = index.get_near_dups(query_simhash)
    # Save fingerprints for future use
    appendToFingerprints(
        dataset, './dataset/fingerprints.csv', {
            "query": str(query_simhash.value),
            "duplicates": ' '.join([str(obj[1].value) for obj in objs])
        })
    # print("QUERY: {}".format(query_url))
    # pp(near_dups)
    return {
        "dataset": dataset,
        "query": query_url,
        "duplicates": ' '.join(near_dups)
    }
def test(n):
    """Compare brute-force Hamming search against SimhashIndex lookup (Python 2).

    Generates `n` random 36-char hex strings, times (1) exhaustive
    Hamming-distance comparison of a fresh probe against every string and
    (2) a SimhashIndex (k=5) near-dup query, then prints the two result
    sets side by side for manual inspection.
    """
    import time
    import distance
    from simhash import Simhash, SimhashIndex
    WIDTH = 3

    def gg():
        # Return a random hex string together with its Simhash fingerprint.
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()
    # Brute force: Hamming distance to every generated string.
    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print time.time() - start
    # Indexed: only strings within k=5 fingerprint bits are compared.
    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print time.time() - start
    print len(d1), len(d2)
    # Side-by-side: do the top brute-force hits agree with the index hits?
    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print a[1] == b[1], '\t', a, '\t', b
keys = fourgram.keys() f1 = open('rezFinalNoDuplicates.txt', 'w') objs = [] for k in fourgram: try: objs.append((k, Simhash(fourgram[k]))) except Exception as e: print e #objs = [(k, Simhash(fourgram[k])) for k in fourgram] index = SimhashIndex(objs, k=3) print "bucket_size", index.bucket_size() for key in keys: s1 = Simhash(fourgram[key]) duplicates = ", ".join(index.get_near_dups(s1)) f1.write(key + "\t" + duplicates+"\n") print key, duplicates ''' while len(keys) > 0: key = keys.pop() keysJ = list(keys) f1.write(key + '\t' + text[key]) while len(keysJ) > 0: j = keysJ.pop() intersect = fourgram[key] & fourgram[j] #print "checking", text[j] #print "forgram ", fourgram[key], "fourgram", fourgram[j] #print "calculation", len(intersect) , len(fourgram[key]) / 2.0
import re
from simhash import Simhash, SimhashIndex


def get_features(s):
    """Split lowercased, punctuation-free text into overlapping 3-grams."""
    width = 3
    normalized = re.sub(r'[^\w]+', '', s.lower())
    return [normalized[i:i + width]
            for i in range(max(len(normalized) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

# Index the corpus; k=3 is the Hamming-distance tolerance.
objs = [(str(doc_id), Simhash(get_features(text))) for doc_id, text in data.items()]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())

s1 = Simhash(
    get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))
# Demo: raw Simhash fingerprints of three similar sentences (Python 2).
from simhash import Simhash

print '%x' % Simhash('How are you? I am fine. Thanks.').value
print '%x' % Simhash('How are u? I am fine. Thanks.').value
print '%x' % Simhash('How r you?I am fine. Thanks.').value

# Demo: SimhashIndex near-duplicate lookup with the default tolerance.
from simhash import Simhash, SimhashIndex

data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}
objs = [(str(k), Simhash(v)) for k, v in data.items()]
index = SimhashIndex(objs)
print index.bucket_size()

s1 = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
print index.get_near_dups(s1)

index.add('4', s1)
print index.get_near_dups(s1)

# build by features
print '%x' % Simhash('How').value
print '%x' % Simhash(['How']).value
print '%x' % Simhash(['How', 'are', 'you']).value
wrongos = list(set(match) & set(osArray)) print '---------------------------------' print ' Wrong Operating System' print '---------------------------------' print len(wrongos) ######################################################################################################################################### if (args.near.upper() == 'Y'): fuzzy = [] for line in unknown: try: fuzz = Simhash(get_features(unicode(line))) num = index.get_near_dups(fuzz) if len(num) != 0: fuzzy.append(line) except: pass print '---------------------------------' print ' Total Fuzzy Near Matches' print '---------------------------------' print len(fuzzy) ######################################################################################################################################### wrongpath = [] for line in unknown:
test_index = [(u[0], Simhash(u[0])) for u in urls] # simhash_results_a.txt : k=20 (subset) # simhash_results_b.txt with open('testdata/solr_20150320/simhash_results_k10.txt', 'w') as f: f.write('') start_time = time.time() for index, (test_url, test_simhash) in enumerate(test_index): i_start_time = time.time() if index % 50 == 0: print 'completed {0} of {1}'.format(index, len(urls)) duplicates = [] for i in xrange(0, len(test_index), 300): index = SimhashIndex(test_index[i:i + 300], k=10) dupes = index.get_near_dups(test_simhash) if len(dupes) > 0: duplicates += dupes print '\t{0} takes {1}'.format(len(duplicates), time.time() - i_start_time) with open('testdata/solr_20150320/simhash_results_k10.txt', 'a') as f: f.write(json.dumps({test_url: duplicates}) + '\n') print 'takes:', time.time() - start_time
logger.info('{create_time:{$gte:%ld,$lt:%ld} }' %(lasttimestamp,curtimestamp) ) status_count = weibocollection.find({'create_time':{'$gte':lasttimestamp,'$lt':curtimestamp} }).count() logger.info('status_count: %d' %status_count) if status_count < 10: connection.close();mylogger.close() sys.exit(0) stopwords = loadstopwords(stopwordsfilename) fdoc=open(docfile,'w');fcut=open(cutfile,'w') num=0;simnum=0;cutnum=0 #simhash index = SimhashIndex({}) for one in weibocollection.find({'create_time':{'$gte':lasttimestamp,'$lt':curtimestamp} }): weibo_id = str(one['_id']) weibo_text = one['data']['text'].strip() text_sh = Simhash(weibo_text ) if len(index.get_near_dups(text_sh) ) == 0: #not find sim #cut text_seg = jieba.cut(weibo_text) text_result = list(set(text_seg) - stopwords) content = ' 1 '.join(text_result) if content != '': fdoc.write(weibo_id+'\t'+weibo_text.encode('utf-8')+'\n');fcut.write(content.encode('utf-8')+' 1\n') cutnum += 1 simnum += 1 num += 1 index.add(num,text_sh) except pymongo.errors,e: logger.critical('mongo find error: %s' %e) sys.exit(-2) logger.info('simnum: %d ' %simnum);
class NearDuplicate:
    """Near-duplicate image detector built on Simhash.

    Each image is reduced to a weighted metadata/byte feature vector, hashed
    with Simhash, and bucketed in a SimhashIndex so that images within
    hamming distance ``k`` collapse into one ``image_dictionary`` entry.
    NOTE(review): Python 2 code (``unicode``, ``print >>``).
    """

    def __init__(self, filenames, k=2, metadata_dictionary=None):
        # filenames: image paths to deduplicate
        # k: max hamming distance for two hashes to count as near-duplicates
        # metadata_dictionary: optional {filename: metadata} mapping from an
        # external tika run; when present it replaces native extraction
        self.filenames = filenames
        self.simhash_index = None      # lazily created on first image
        self.image_dictionary = {}     # hash key -> [image record dicts]
        self.metadata_dictionary = metadata_dictionary
        self.k = k
        # Need to store the image hashes in some fashion
        # Possibly cluster the hashes (k-means)

    def tika_metadata(self, filename):
        """Use the tika-py module to grab metadata for a file"""
        parsed = parser.from_file(filename)
        return parsed.get("metadata", {})

    def exifread_metadata(self, filename):
        """Use the exifread module to grab metadata for a file"""
        # NOTE(review): file handle is never closed.
        f = open(filename, 'rb')
        tags = exifread.process_file(f)
        return tags

    def generate_features_from_dict(self, filename):
        """ Use this function when we provide json metadata information from the tika java module"""
        # Find the metadata object from the json metadata file for the image_file named 'filename'
        metadata = self.metadata_dictionary.get(filename, {})
        # The tags or type of metadata we want
        feature_tags = [
            "Image Height", "Image Width", "File Size", "Content-Type",
            "Image Bytes", "File Name Suffix"
        ]
        # Create a feature array using these metadata values
        features = []
        # NOTE(review): "Files Size" here vs "File Size" in modified_metadata
        # looks like a typo; combined with zipping two separate dicts' items()
        # below (ordering correspondence is not guaranteed, especially on
        # Python 2), weights may pair with the wrong tags -- confirm.
        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Files Size": 2,
            "Content-Type": 3,
            "Image Bytes": 6,
            "File Name Suffix": 2
        }
        # Grab the bytes of the entire file
        image_bytes = "NONE"
        try:
            image_bytes = open(filename, 'rb').read()
        except OSError:
            image_bytes = "NONE"
        # Get the central bytes
        image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
        byte_offset = len(image_bytes_str) // 4
        filename_suffix = filename[-10:]
        modified_metadata = {
            "Image Height": metadata.get("Image Height", "NONE"),
            "Image Width": metadata.get("Image Width", "NONE"),
            "File Size": metadata.get("File Size", "NONE"),
            "Content-Type": metadata.get("Content-Type", "NONE"),
            "Image Bytes": image_bytes_str[byte_offset:-byte_offset],
            "File Name Suffix": filename_suffix
        }
        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(), modified_metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))
        return features

    def generate_features(self, filename):
        """Given an image generate a feature vector"""
        """ Since Tika-Py requires a server call (i.e. slower)
        Do native image metadata grabbing, and fallback on tika
        if the image can't be opened (i.e., it's an svg or gif)
        """
        im, use_tika = None, False
        try:
            im = Image.open(filename)
            use_tika = False
        except IOError:
            use_tika = True
        # Grab the metadata for the image
        metadata = {}
        # We'll store features to use for simhash in a tuple array [(token, weight)]
        features = []
        if use_tika:
            # Use only metadata from tika
            # The image file can't be opened using PIL.Image, so that means
            # a diff type of image besides jpg, png
            metadata = self.tika_metadata(filename)
            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            #image_bytes_str = str(image_bytes)
            byte_offset = len(image_bytes_str) // 4
            metadata["Image Bytes"] = image_bytes_str[byte_offset:-byte_offset]
            feature_tags = [
                "Image Height", "Image Width", "File Size", "Content-Type",
                "Image Bytes"
            ]
            # NOTE(review): this branch returns unweighted plain strings,
            # unlike the (token, weight) tuples built in the PIL branch below.
            features = [
                tag + ":" + metadata.get(tag, "NONE") for tag in feature_tags
            ]
            return features
        """ FEATURES
        We'll resize the image so all images are normalized to a certain size
        Also make sure to retain aspect ratio
        Features to use (in order of importance)
        - center region bytes
        - color histogram
        - content type
        - image width
        - image height
        We can take subregions of the image, and hash those
        """
        # Resize the image so all images are normalized
        width = im.size[0]
        height = im.size[1]
        resize_width = 30
        resize_height = resize_width * height / width
        resize_im = None
        histogram_bytes, histogram_weight = "", 0
        center_region_bytes, center_region_weight = "", 5
        extension = ""
        try:
            resize_im = im.resize((resize_width, resize_height),
                                  Image.ANTIALIAS)
            # Crop sub regions
            height_padding, width_padding = resize_height / 5, resize_width / 5
            box = (width_padding, height_padding, resize_width - width_padding,
                   resize_height - height_padding)
            sub_region = resize_im.crop(box)
            # Generate a histogram
            histogram_bytes, histogram_weight = str(resize_im.histogram()), 4
            center_region_bytes, center_region_weight = str(
                list(sub_region.getdata())), 3
        except OSError:
            # Couldn't resize the image. Let's
            print >> sys.stderr, "Couldn't resize the image. Prob an eps or svg"
            resize_im = im
            resize_width = im.size[0]
            resize_height = im.size[1]
            sub_region = im
            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            #image_bytes_str = str(image_bytes)
            histogram_bytes = "NONE"
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            byte_offset = len(image_bytes_str) // 4
            center_region_bytes = image_bytes_str[byte_offset:-byte_offset]
        extension = resize_im.format if resize_im.format != None else os.path.splitext(
            filename)[1]
        # Figure out the content type (png, jpg, etc.)
        content_type = "image/" + str(extension.lower())
        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Image Histogram": histogram_weight,
            "Content-Type": 5,
            "Center Region Bytes": center_region_weight
        }
        # NOTE(review): "Image Height" maps to str(width) and "Image Width"
        # to str(height) -- looks swapped; confirm before fixing.
        metadata = {
            "Image Height": str(width),
            "Image Width": str(height),
            "Image Histogram": histogram_bytes,
            "Content-Type": content_type,
            "Center Region Bytes": center_region_bytes
        }
        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(), metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))
        return features

    def merge_near_duplicate_dictionaries(self, nd):
        """Merge the current near duplicate instance with another near duplicate instance"""
        # The smaller instance is folded into the larger one's dict/index.
        smaller_nd = self if len(self.image_dictionary) <= len(
            nd.image_dictionary) else nd
        larger_nd = self if len(self.image_dictionary) > len(
            nd.image_dictionary) else nd
        final_dict = larger_nd.image_dictionary
        # Iterate over the smaller near duplicate instance
        for key in smaller_nd.image_dictionary.keys():
            # If an exact duplicate exists, just grab it and merge them
            if larger_nd.image_dictionary.get(key, None) != None:
                arr = smaller_nd.image_dictionary.get(key, []) +\
                    larger_nd.image_dictionary.get(key, [])
                final_dict[key] = arr
                continue
            # Find the closest near duplicate in the larger dictionary by
            # using it's index
            simhash_obj = smaller_nd.image_dictionary[key][0]["hash_object"]
            near_duplicates_keys = larger_nd.simhash_index.get_near_dups(
                simhash_obj)
            # If a near duplicate exists
            if len(near_duplicates_keys) > 0:
                # grab the array of images at that key in the larger dictionary
                # Merge it the array of images in the smaller dictionary
                near_dup_key = near_duplicates_keys[0]
                arr = smaller_nd.image_dictionary.get(key, []) +\
                    larger_nd.image_dictionary.get(near_dup_key, [])
                # create an entry in the new dictionary
                final_dict[near_dup_key] = arr
                continue
            # Otherwise we should just add this key-object from the dictionary
            # to this array
            final_dict[key] = smaller_nd.image_dictionary[key]
            # Add this simhash to the Index for efficient searching
            larger_nd.simhash_index.add(key, simhash_obj)
        self.image_dictionary = final_dict
        self.simhash_index = larger_nd.simhash_index
        # NOTE(review): "image_dicionary" is a typo -- this creates a NEW
        # attribute on nd instead of updating nd.image_dictionary.
        nd.image_dicionary = final_dict
        nd.simhash_index = larger_nd.simhash_index
        # Now simply return this final dict
        return final_dict

    def simhash_value_to_key(self, simhash):
        """Given a simhash object, convert it's value to a hexadecimal key
        This key will be used in our image_file dictionary
        """
        return str(hex(simhash.value))

    def deduplicate_images(self):
        """ Given a list of image files "self.filenames", deduplicate the images
        using near deduplication
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary != None:
                # Will use a java tika program to generate metadata
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)
            # Simhash this list of features
            sHash = Simhash(feature_array)
            if self.simhash_index == None:
                # First image, so we create the index add it to the dictionary
                # And move on to next iteration
                key = self.simhash_value_to_key(sHash)
                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
                continue
            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)
            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary
                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0]
                # Get the key for this current image
                current_simhash_key = self.simhash_value_to_key(sHash)
                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename": image_file,
                    "hash_key": current_simhash_key,
                    "hash_object": sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(
                    current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)
                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)
                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
# NOTE(review): excerpt begins INSIDE a text-cleaning helper (apparently
# jiebaclearText); f_stop, f_stop_text, liststr and mywordlist come from the
# truncated part above, so the top-level `return` below is valid only in the
# original file.
f_stop.close()
f_stop_seg_list = f_stop_text.split('\n')
# Keep segmented words that are not stopwords and longer than one character.
for myword in liststr.split('/'):
    if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
        mywordlist.append(myword)
return ''.join(mywordlist)
#data.head()['content'].apply(lambda x:jiebaclearText(str(x)))
# Clean every document, then attach its simhash value as a feature column.
data['content'] = data['content'].apply(lambda x: jiebaclearText(str(x)))
data['simhash'] = data['content'].apply(lambda x: Simhash(x).value)
train = data.loc[data['source'] == 'train']
test = data.loc[data['source'] == 'test']
train.drop('source', axis=1, inplace=True)
test.drop([
    'source',
], axis=1, inplace=True)
# Index all training documents, then look up near-duplicates (k=12 bits)
# for every test document.
objs = [(row["id"], Simhash(row["content"]))
        for index, row in train.iterrows()]
index = SimhashIndex(objs, k=12)
test['result'] = test['content'].apply(
    lambda x: index.get_near_dups(Simhash(x)))
sub['result'] = test['result']
sub.to_csv('../output/simhash.csv', index=False)
# NOTE(review): Python 2 fragment; objs, dx, f_dim and tqdm come from outside
# this excerpt. Finds groups of near-identical tweets posted by DIFFERENT
# accounts.
data.update(dx)
print "OBJS SIZE", len(objs)
index = SimhashIndex(objs, f=f_dim, k=3)
print "Bucket size", index.bucket_size()
accounted_keys = set()
dataset = []
C = collections.Counter()
for key, val in tqdm(objs):
    # Skip if we've seen this pattern before
    if key in accounted_keys:
        continue
    dupes = index.get_near_dups(val)
    tweet = data[key]
    # Don't report self-matches
    if len(dupes) <= 1:
        continue
    # Don't report if person repeats only themself
    # Keys look like "<name>_<n>"; strip the trailing counter to recover the
    # account name -- TODO confirm key format against the producer.
    unique_names = set(['_'.join(x.split('_')[:-1]) for x in dupes])
    if len(unique_names) <= 1:
        continue
    accounted_keys.update(dupes)
    # Emit one record per pair of distinct accounts sharing the tweet.
    # NOTE(review): excerpt is truncated here -- the loop body is commented
    # out and whatever follows lies outside the visible chunk.
    for k1, k2 in itertools.combinations(unique_names, r=2):
        #dataset.append({"name1":k1,"name2":k2,"tweet":tweet})
# NOTE(review): Python 3 fragment; db_miams_eagle, all_similar_data,
# all_similar_set and get_features are defined outside this excerpt.
# Collapsed whitespace makes loop extents ambiguous -- the reconstruction
# below (collect first, index once afterwards) is the plausible reading;
# confirm against the original file.
mimas_cursor = db_miams_eagle.cursor()
sql = 'select id,content from api_tractate limit 100'
mimas_cursor.execute(sql)
data = list(mimas_cursor.fetchall())
# NOTE(review): `file` shadows the builtin, and the handle is never closed.
file = open("tractate.txt", "w", encoding="utf-8")
for one in range(0, len(data)):
    # NOTE(review): `begin` is reset every iteration, yet the final print
    # below reads it as if it were a total-elapsed start time -- verify.
    begin = time.time()
    # Rows are accessed dict-style -- assumes a dict cursor; TODO confirm.
    text1 = data[one].get("content", None)
    one_id = data[one].get("id", None)
    all_similar_data[one_id] = text1
# Index every document (k=6 differing bits tolerated).
objs = [(str(k), Simhash(get_features(v)))
        for k, v in all_similar_data.items()]
index = SimhashIndex(objs, k=6)
print(index.bucket_size())
# For each document, collect its near-duplicate group (sorted by numeric id)
# and keep only previously unseen groups.
for key, value in all_similar_data.items():
    s1 = Simhash(get_features(value))
    simi_list = index.get_near_dups(s1)
    simi_list.sort(key=lambda x: int(x), reverse=False)
    if len(simi_list) > 1 and simi_list not in all_similar_set:
        all_similar_set.append(simi_list)
for item in all_similar_set:
    file.write(str(item))
    file.write("\n")
print("100条数据计算的时间:%f" % (time.time() - begin))
# NOTE(review): Python 2 fragment of a larger scan-report script; wrongos,
# unknown, args, index and get_features are defined outside this excerpt.
print '---------------------------------'
print ' Wrong Operating System'
print '---------------------------------'
print len(wrongos)
#########################################################################################################################################
# Optional fuzzy (near-duplicate) pass over the still-unmatched lines.
if(args.near.upper() == 'Y'):
    fuzzy = []
    for line in unknown:
        try:
            # Simhash the line's features and probe the prebuilt index.
            fuzz = Simhash(get_features(unicode(line)))
            num = index.get_near_dups(fuzz)
            if len(num) != 0:
                fuzzy.append(line)
        except:
            # NOTE(review): bare except silently drops lines that fail to
            # decode/hash -- presumably intentional best-effort; confirm.
            pass
    print '---------------------------------'
    print ' Total Fuzzy Near Matches'
    print '---------------------------------'
    print len(fuzzy)
#########################################################################################################################################
wrongpath = []
# NOTE(review): excerpt is truncated here -- the loop body is outside the
# visible chunk.
for line in unknown:
# NOTE(review): fragment of a weibo deduplication job (Python 2 -- see the
# `except ..., e` syntax below). The `except` at the bottom has no visible
# `try:`; it starts before this excerpt, so indentation here is approximate.
fdoc = open(docfile, 'w')
fcut = open(cutfile, 'w')
num = 0      # total documents scanned
simnum = 0   # NOTE(review): incremented for NON-duplicates -- name misleads
cutnum = 0   # documents actually written after segmentation
#simhash
index = SimhashIndex({})
for one in weibocollection.find(
    {'create_time': {
        '$gte': lasttimestamp,
        '$lt': curtimestamp
    }}):
    weibo_id = str(one['_id'])
    weibo_text = one['data']['text'].strip()
    text_sh = Simhash(weibo_text)
    if len(index.get_near_dups(text_sh)) == 0:  #not find sim
        #cut
        # Segment with jieba, drop stopwords, join with ' 1 ' separators.
        text_seg = jieba.cut(weibo_text)
        text_result = list(set(text_seg) - stopwords)
        content = ' 1 '.join(text_result)
        if content != '':
            fdoc.write(weibo_id + '\t' + weibo_text.encode('utf-8') + '\n')
            fcut.write(content.encode('utf-8') + ' 1\n')
            cutnum += 1
        simnum += 1
    num += 1
    # Every document is indexed, keyed by its running sequence number.
    index.add(num, text_sh)
except pymongo.errors, e:
    logger.critical('mongo find error: %s' % e)
    sys.exit(-2)
def get_features(s):
    """Split *s* into overlapping 3-character shingles.

    The text is lowercased and stripped of non-word characters first; at
    least one shingle is always produced (the whole cleaned string when it
    is shorter than the window).
    """
    width = 3
    cleaned = re.sub(r'[^\w]+', '', s.lower())
    shingle_count = max(len(cleaned) - width + 1, 1)
    return [cleaned[pos:pos + width] for pos in range(shingle_count)]


# Corpus of sample sentences keyed by integer id.
data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

# Index every sample under its stringified id, tolerating k=3 differing bits.
objs = [(str(key), Simhash(get_features(text))) for key, text in data.items()]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())

# Probe with a near-duplicate of entry 2, then register it under a new id.
s1 = Simhash(
    get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))
index.add('4', s1)
print(index.get_near_dups(s1))


def main():
    """Placeholder entry point; the demo above runs at import time."""
    pass


if __name__ == '__main__':
    main()
# NOTE(review): Python 2 excerpt starting MID-STATEMENT -- the line below is
# the tail of `s1 = '...'.decode(` whose head lies outside this chunk.
'utf-8', 'ignore')
s2 = 'How are you i am fine. blar blar blar blar blar than'.decode(
    'utf-8', 'ignore')
s3 = 'This is simhash test.'.decode('utf-8', 'ignore')
# print get_features(s1)
#
# print Simhash(get_features('How are you? I am fine. Thanks.')).value
# Hash the raw unicode strings (character features, not word shingles).
sh1 = Simhash(s1)
sh2 = Simhash(s2)
sh3 = Simhash(s3)
# print sh.value
# print sh1.distance(sh2)
# Index the two similar sentences, then ask whether the unrelated one
# matches within k=3 bits.
shIndex = SimhashIndex([], k=3)
shIndex.add('1', sh1)
shIndex.add('2', sh2)
# shIndex.add('3', sh3)
if shIndex.get_near_dups(sh3):
    print 'YES'
else:
    print 'NO'
# print shIndex.get_near_dups(sh2)
s3 = 'This is simhash test.'.decode('utf-8', 'ignore') # print get_features(s1) # # print Simhash(get_features('How are you? I am fine. Thanks.')).value sh1 = Simhash(s1) sh2 = Simhash(s2) sh3 = Simhash(s3) # print sh.value # print sh1.distance(sh2) shIndex = SimhashIndex([], k=3) shIndex.add('1', sh1) shIndex.add('2', sh2) # shIndex.add('3', sh3) if shIndex.get_near_dups(sh3): print 'YES' else: print 'NO' # print shIndex.get_near_dups(sh2)
def is_valid(config, robot_cache_a, robot_cache_d, robot_url_cache, mem, mem2,
             longest_page, common_dict, ics_subdomains, url):
    """Decide whether a crawler should fetch *url*.

    Filters by scheme, file extension, allowed uci.edu subdomains and
    robots.txt, then fetches the page, rejects low-content and
    near-duplicate pages (Simhash over mem2), and records stats via check().

    mem = set() #memory cache of unique urls
    robot_cache_a = set() #memory cache of allowed urls
    robot_cache_d = set() #memory cache of disallowed urls
    robot_url_cache = set() #memory cache of crawled robots.txt stored as netloc
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in set(["http", "https"]):
            return False
        else:
            # Strip the fragment text from the url before deduplication.
            url = url.replace(parsed.fragment, "")
        # Reject urls whose path/query ends in (or whose path contains a
        # directory named after) a non-HTML resource extension.
        extbool = not re.match(
            r".*\.(css|js|bmp|gif|jpe?g|ico" + r"|png|tiff?|mid|mp2|mp3|mp4" +
            r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" +
            r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names" +
            r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso" +
            r"|epub|dll|cnf|tgz|sha1|sql" + r"|thmx|mso|arff|rtf|jar|csv" +
            r"|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower())
        extbool2 = not re.match(
            r".*\.(css|js|bmp|gif|jpe?g|ico" + r"|png|tiff?|mid|mp2|mp3|mp4" +
            r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" +
            r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names" +
            r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso" +
            r"|epub|dll|cnf|tgz|sha1|sql" + r"|thmx|mso|arff|rtf|jar|csv" +
            r"|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.query.lower())
        extbool3 = not re.match(
            r".*/(css|js|bmp|gif|jpe?g|ico" + r"|png|tiff?|mid|mp2|mp3|mp4" +
            r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" +
            r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names" +
            r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso" +
            r"|epub|dll|cnf|tgz|sha1|sql" + r"|thmx|mso|arff|rtf|jar|csv" +
            r"|rm|smil|wmv|swf|wma|zip|rar|gz)/.*", parsed.path.lower())
        ebool = extbool and extbool2 and extbool3
        # Allowed domains: *.ics/cs/informatics/stat.uci.edu, plus one
        # specific today.uci.edu path.
        # NOTE(review): "(www.)?" and the dots in "[-a-zA-Z0-9.]*" before
        # subdomain labels are unescaped regex dots -- slightly looser match
        # than intended; confirm before tightening.
        sub_bool = re.match(r"(www.)?[-a-zA-Z0-9.]*\.ics\.uci\.edu",
                            parsed.netloc)
        sub_bool2 = re.match(r"(www.)?[-a-zA-Z0-9.]*\.cs\.uci\.edu",
                             parsed.netloc)
        sub_bool3 = re.match(
            r"(www.)?[-a-zA-Z0-9.]*\.informatics\.uci\.edu", parsed.netloc)
        sub_bool4 = re.match(r"(www.)?[-a-zA-Z0-9.]*\.stat\.uci\.edu",
                             parsed.netloc)
        sub_bool5 = (re.match(r"(www.)?[-a-zA-Z0-9.]*today\.uci\.edu",
                              parsed.netloc)
                     and (parsed.path ==
                          "/department/information_computer_sciences/"))
        sbool = sub_bool or sub_bool2 or sub_bool3 or sub_bool4 or sub_bool5
        if (ebool and sbool):
            try:
                # Fetch and parse robots.txt once per netloc.
                if parsed.netloc not in robot_url_cache:
                    robot_url_cache.add(parsed.netloc)
                    robot_site = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
                    robot_resp = download.download(robot_site, config,
                                                   logger=None)
                    if robot_resp.status == 200:
                        robot_txt = robot_resp.raw_response.text
                        parse(parsed, robot_txt, robot_cache_a, robot_cache_d)
                if url not in mem:
                    site_resp = requests.get(url)
                    if site_resp.status_code == 200:
                        #simhash here
                        doc = site_resp.text
                        soup = BeautifulSoup(doc, 'html.parser')
                        #filter text from site
                        # Strip non-content tags in place (comprehension used
                        # purely for side effects).
                        [
                            s.extract() for s in soup([
                                'style', 'script', '[document]', 'head',
                                'title'
                            ])
                        ]
                        text_only = soup.getText()
                        filtered_text = text_only.split()
                        #LOW INFO CONTENT
                        if len(filtered_text) < 20:
                            return False
                        s = Simhash(filtered_text)
                        # NOTE(review): a fresh SimhashIndex is rebuilt from
                        # the whole mem2 list on EVERY call -- O(pages) per
                        # url; keeping one persistent index would be cheaper.
                        index = SimhashIndex(mem2)  #k=2
                        if index.get_near_dups(s) != []:
                            return False
                        else:
                            if url in robot_cache_a:
                                check(filtered_text, common_dict,
                                      longest_page, ics_subdomains, sub_bool,
                                      parsed.netloc, url)
                                mem.add(url)
                                mem2.append((str(url), s))
                                return True
                            elif url in robot_cache_d:
                                return False
                            else:
                                # Not covered by robots.txt either way:
                                # treat as allowed (same as the allow branch).
                                check(filtered_text, common_dict,
                                      longest_page, ics_subdomains, sub_bool,
                                      parsed.netloc, url)
                                mem.add(url)
                                mem2.append((str(url), s))
                                return True
                    else:
                        return False
                else:
                    return False
            except socket.gaierror:
                return False
            except requests.exceptions.Timeout:
                return False
            except requests.exceptions.TooManyRedirects:
                return False
            except requests.exceptions.ConnectionError:
                return False
            except requests.exceptions.RequestException:
                return False
        else:
            return False
    except TypeError:
        #print ("TypeError for ", parsed)
        return False
class NearDuplicate:
    """Detect near-duplicate images via Simhash over weighted features.

    Images are reduced to weighted metadata/byte features, hashed, and
    grouped through a SimhashIndex so images within hamming distance ``k``
    share one ``image_dictionary`` entry.
    NOTE(review): Python 2 code (``unicode``, ``print >>``).
    """

    def __init__(self, filenames, k=2, metadata_dictionary=None):
        # filenames: image paths to deduplicate
        # k: max hamming distance for near-duplicate matching
        # metadata_dictionary: optional external {filename: metadata} mapping
        self.filenames = filenames
        self.simhash_index = None      # built lazily on the first image
        self.image_dictionary = {}     # hash key -> [image record dicts]
        self.metadata_dictionary = metadata_dictionary
        self.k = k
        # Need to store the image hashes in some fashion
        # Possibly cluster the hashes (k-means)

    def tika_metadata(self, filename):
        """Use the tika-py module to grab metadata for a file"""
        parsed = parser.from_file(filename)
        return parsed.get("metadata", {})

    def exifread_metadata(self, filename):
        """Use the exifread module to grab metadata for a file"""
        # NOTE(review): file handle is never closed.
        f = open(filename, 'rb')
        tags = exifread.process_file(f)
        return tags

    def generate_features_from_dict(self, filename):
        """ Use this function when we provide json metadata information from the tika java module"""
        # Find the metadata object from the json metadata file for the image_file named 'filename'
        metadata = self.metadata_dictionary.get(filename, {})
        # The tags or type of metadata we want
        feature_tags = ["Image Height", "Image Width", "File Size",
                        "Content-Type", "Image Bytes", "File Name Suffix"]
        # Create a feature array using these metadata values
        features = []
        # NOTE(review): "Files Size" vs "File Size" below looks like a typo;
        # combined with zipping two dicts' items() (ordering correspondence
        # is not guaranteed on Python 2), weights may pair with the wrong
        # tags -- confirm before relying on the weighting.
        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Files Size": 2,
            "Content-Type": 3,
            "Image Bytes": 6,
            "File Name Suffix": 2
        }
        # Grab the bytes of the entire file
        image_bytes = "NONE"
        try:
            image_bytes = open(filename, 'rb').read()
        except OSError:
            image_bytes = "NONE"
        # Get the central bytes
        image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
        byte_offset = len(image_bytes_str) // 4
        filename_suffix = filename[-10:]
        modified_metadata = {
            "Image Height": metadata.get("Image Height", "NONE"),
            "Image Width": metadata.get("Image Width", "NONE"),
            "File Size": metadata.get("File Size", "NONE"),
            "Content-Type": metadata.get("Content-Type", "NONE"),
            "Image Bytes": image_bytes_str[byte_offset:-byte_offset],
            "File Name Suffix": filename_suffix
        }
        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(), modified_metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))
        return features

    def generate_features(self, filename):
        """Given an image generate a feature vector"""
        """ Since Tika-Py requires a server call (i.e. slower)
        Do native image metadata grabbing, and fallback on tika
        if the image can't be opened (i.e., it's an svg or gif)
        """
        im, use_tika = None, False
        try:
            im = Image.open(filename)
            use_tika = False
        except IOError:
            use_tika = True
        # Grab the metadata for the image
        metadata = {}
        # We'll store features to use for simhash in a tuple array [(token, weight)]
        features = []
        if use_tika:
            # Use only metadata from tika
            # The image file can't be opened using PIL.Image, so that means
            # a diff type of image besides jpg, png
            metadata = self.tika_metadata(filename)
            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            #image_bytes_str = str(image_bytes)
            byte_offset = len(image_bytes_str) // 4
            metadata["Image Bytes"] = image_bytes_str[byte_offset:-byte_offset]
            feature_tags = ["Image Height", "Image Width", "File Size",
                            "Content-Type", "Image Bytes"]
            # NOTE(review): this branch returns unweighted plain strings,
            # unlike the (token, weight) tuples built in the PIL branch.
            features = [tag + ":" + metadata.get(tag, "NONE")
                        for tag in feature_tags]
            return features
        """ FEATURES
        We'll resize the image so all images are normalized to a certain size
        Also make sure to retain aspect ratio
        Features to use (in order of importance)
        - center region bytes
        - color histogram
        - content type
        - image width
        - image height
        We can take subregions of the image, and hash those
        """
        # Resize the image so all images are normalized
        width = im.size[0]
        height = im.size[1]
        resize_width = 30
        resize_height = resize_width * height / width
        resize_im = None
        histogram_bytes, histogram_weight = "", 0
        center_region_bytes, center_region_weight = "", 5
        extension = ""
        try:
            resize_im = im.resize((resize_width, resize_height),
                                  Image.ANTIALIAS)
            # Crop sub regions
            height_padding, width_padding = resize_height / 5, resize_width / 5
            box = (width_padding, height_padding, resize_width - width_padding,
                   resize_height - height_padding)
            sub_region = resize_im.crop(box)
            # Generate a histogram
            histogram_bytes, histogram_weight = str(resize_im.histogram()), 4
            center_region_bytes, center_region_weight = str(
                list(sub_region.getdata())), 3
        except OSError:
            # Couldn't resize the image. Let's
            print >> sys.stderr, "Couldn't resize the image. Prob an eps or svg"
            resize_im = im
            resize_width = im.size[0]
            resize_height = im.size[1]
            sub_region = im
            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            #image_bytes_str = str(image_bytes)
            histogram_bytes = "NONE"
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            byte_offset = len(image_bytes_str) // 4
            center_region_bytes = image_bytes_str[byte_offset:-byte_offset]
        extension = resize_im.format if resize_im.format != None else os.path.splitext(filename)[1]
        # Figure out the content type (png, jpg, etc.)
        content_type = "image/" + str(extension.lower())
        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Image Histogram": histogram_weight,
            "Content-Type": 5,
            "Center Region Bytes": center_region_weight
        }
        # NOTE(review): "Image Height" maps to str(width) and "Image Width"
        # to str(height) -- looks swapped; confirm before fixing.
        metadata = {
            "Image Height": str(width),
            "Image Width": str(height),
            "Image Histogram": histogram_bytes,
            "Content-Type": content_type,
            "Center Region Bytes": center_region_bytes
        }
        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(), metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))
        return features

    def merge_near_duplicate_dictionaries(self, nd):
        """Merge the current near duplicate instance with another near duplicate instance"""
        # Fold the smaller instance into the larger one's dict/index.
        smaller_nd = self if len(self.image_dictionary) <= len(nd.image_dictionary) else nd
        larger_nd = self if len(self.image_dictionary) > len(nd.image_dictionary) else nd
        final_dict = larger_nd.image_dictionary
        # Iterate over the smaller near duplicate instance
        for key in smaller_nd.image_dictionary.keys():
            # If an exact duplicate exists, just grab it and merge them
            if larger_nd.image_dictionary.get(key, None) != None:
                arr = smaller_nd.image_dictionary.get(key, []) +\
                    larger_nd.image_dictionary.get(key, [])
                final_dict[key] = arr
                continue
            # Find the closest near duplicate in the larger dictionary by
            # using it's index
            simhash_obj = smaller_nd.image_dictionary[key][0]["hash_object"]
            near_duplicates_keys = larger_nd.simhash_index.get_near_dups(simhash_obj)
            # If a near duplicate exists
            if len(near_duplicates_keys) > 0:
                # grab the array of images at that key in the larger dictionary
                # Merge it the array of images in the smaller dictionary
                near_dup_key = near_duplicates_keys[0]
                arr = smaller_nd.image_dictionary.get(key, []) +\
                    larger_nd.image_dictionary.get(near_dup_key, [])
                # create an entry in the new dictionary
                final_dict[near_dup_key] = arr
                continue
            # Otherwise we should just add this key-object from the dictionary
            # to this array
            final_dict[key] = smaller_nd.image_dictionary[key]
            # Add this simhash to the Index for efficient searching
            larger_nd.simhash_index.add(key, simhash_obj)
        self.image_dictionary = final_dict
        self.simhash_index = larger_nd.simhash_index
        # NOTE(review): "image_dicionary" is a typo -- this creates a NEW
        # attribute on nd instead of updating nd.image_dictionary.
        nd.image_dicionary = final_dict
        nd.simhash_index = larger_nd.simhash_index
        # Now simply return this final dict
        return final_dict

    def simhash_value_to_key(self, simhash):
        """Given a simhash object, convert it's value to a hexadecimal key
        This key will be used in our image_file dictionary
        """
        return str(hex(simhash.value))

    def deduplicate_images(self):
        """ Given a list of image files "self.filenames", deduplicate the images
        using near deduplication
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary != None:
                # Will use a java tika program to generate metadata
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)
            # Simhash this list of features
            sHash = Simhash(feature_array)
            if self.simhash_index == None:
                # First image, so we create the index add it to the dictionary
                # And move on to next iteration
                key = self.simhash_value_to_key(sHash)
                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
                continue
            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)
            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary
                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0]
                # Get the key for this current image
                current_simhash_key = self.simhash_value_to_key(sHash)
                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename": image_file,
                    "hash_key": current_simhash_key,
                    "hash_object": sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)
                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)
                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
import re from simhash import Simhash, SimhashIndex def get_features(s): width = 3 s = s.lower() s = re.sub(r'[^\w]+', '', s) return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))] data = {1:u'2018-02-17 0:00:00,2018-02-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00', 2:u'2018-02-16 0:00:00,60125170993,2018-02-16 0:00:00,86000300257742,2018-01-26 0:00:00', 3:u'2018-02-15 0:00:00,60125170993,2018-02-15 0:00:00,86011600116290,2018-01-26 0:00:00', 4:u'2018-02-14 0:00:00,60125170993,2018-02-14 0:00:00,86008501214219,2018-01-26 0:00:00', 5:u'2018-02-13 0:00:00,60125170993,2018-02-13 0:00:00,86000300420496,2018-01-26 0:00:00', 6:u'2018-02-12 0:00:00,60125170993,2018-02-12 0:00:00,86000300656419,2018-01-26 0:00:00', 7:u'2018-02-11 0:00:00,60125170993,2018-02-11 0:00:00,86553802671042,2018-01-26 0:00:00' } objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()] index = SimhashIndex(objs, k=3) print index.bucket_size() s1 = Simhash(get_features(u'2018-02-17 0:00:00,2018-03-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00')) print index.get_near_dups(s1) index.add('0', s1) print index.get_near_dups(s1)