from unittest import TestCase

from simhash import Simhash, SimhashIndex


class TestSimhashIndex(TestCase):
    data = {
        1: u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.',
        2: u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than',
        3: u'This is a different one.',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs)

    def test_get_near_dup(self):
        s1 = Simhash(
            u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
        )
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 1)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 1)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)
from unittest import TestCase

from simhash import Simhash, SimhashIndex


class TestSimhashIndex(TestCase):
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))
def main():
    # user_query = input()
    DOCID = 0
    numPartial = 1
    index = SimhashIndex([])
    totaldocs = 0
    docnum = 0
    validDocFile = open('validDocs2', 'w')
    for root, dirs, files in os.walk(DEVPATH):
        for fname in files:
            if not fname.endswith(".json"):
                continue
            totaldocs += 1
            h2t = html2text.HTML2Text()
            file = open(root + "/" + fname)
            pageDict = json.loads(file.read())
            # close file to get memory back
            file.close()
            # get html formatted content
            htmlContent = pageDict['content']
            print(pageDict['url'])
            plainContent = h2t.handle(htmlContent)
            feat = get_features(plainContent)
            sim = Simhash(feat)
            if len(index.get_near_dups(sim)) > 0:
                continue
            print(docnum, totaldocs)
            index.add(str(docnum), sim)
            validDocFile.write(root + "/" + fname + "\n")
            docnum += 1
    validDocFile.close()
class SpellingCorrector(object):

    def __init__(self, vocab_to_freq, f=64, k=32):
        self.vocab_to_freq = vocab_to_freq
        self.simhash_index = SimhashIndex([], f=f, k=k)
        self.f = f
        self.k = k
        simhash_index = self.simhash_index
        for w in vocab_to_freq:
            sh = Simhash(w, f=f)
            simhash_index.add(w, sh)

    def add_valid_word(self, word):
        if word not in self.vocab_to_freq:
            sh = Simhash(word, self.f)
            self.simhash_index.add(word, sh)
        self.vocab_to_freq[word] = self.vocab_to_freq.get(word, 0) + 1

    def correct_word(self, word):
        if word in self.vocab_to_freq:
            return word

        # Edit distance between
        sh = Simhash(word, f=self.f)
        candidates = self.simhash_index.get_near_dups(sh)
        if not candidates:
            # No near dups. Oh well. This word will go as it is.
            print 'no candidates'
            return word
        if len(candidates) == 1:
            # Only one candidate, so assume this is the correction
            return candidates[0]

        lev_dist_gen = ((other_w, levenshtein(other_w, word)) for other_w in candidates)
        closest_words, dists = zip(*all_min_or_max(lev_dist_gen, min, lambda item: item[1]))
        if len(closest_words) == 1:
            # One of the candidates had the best edit distance. Return that.
            return closest_words[0]

        # OK, there are multiple closest words. Rely on word frequency to choose the right one.
        vocab_to_freq = self.vocab_to_freq
        word_freq_gen = ((other_w, vocab_to_freq[other_w]) for other_w in closest_words)
        most_freq_words, freqs = zip(*all_min_or_max(word_freq_gen, max, lambda item: item[1]))

        # Using choice because at this point there's no other way to narrow it down, unless we
        # track higher order ngrams.
        return choice(most_freq_words)
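The class above relies on three names that are not defined in this snippet: levenshtein, all_min_or_max, and choice. A minimal sketch of what they are assumed to be (choice is presumably random.choice; the other two are hypothetical reconstructions inferred from how correct_word uses them):

# Hypothetical reconstructions of the helpers assumed by SpellingCorrector above.
from random import choice  # assumed: used by correct_word to break remaining ties


def levenshtein(a, b):
    # Plain dynamic-programming edit distance between two strings.
    if len(a) < len(b):
        a, b = b, a
    previous = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                 # deletion
                               current[j - 1] + 1,              # insertion
                               previous[j - 1] + (ca != cb)))   # substitution
        previous = current
    return previous[-1]


def all_min_or_max(pairs, min_or_max, key):
    # Assumed contract: return every (item, score) pair tied for the extreme
    # score, so that zip(*...) yields the tied items and their scores.
    pairs = list(pairs)
    best = min_or_max(key(pair) for pair in pairs)
    return [pair for pair in pairs if key(pair) == best]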
def find_near_matches(session, collection, index_size, probability_index_near_match):
    from simhash import Simhash, SimhashIndex

    logging.getLogger().setLevel(logging.CRITICAL)

    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter', 'simhash']])
        .where(model.Tweet.collection == collection)
    )

    simhash_index = SimhashIndex([], k=7)

    insert_relation_stmt = pg.insert(model.relation)
    # insert_tweet_near_matches_stmt = insert_tweet_near_matches_stmt.on_conflict_do_update(
    #     index_elements=['tweet_id', 'collection'],
    #     set_={
    #         'earliest_near_match_id': insert_tweet_near_matches_stmt.excluded.earliest_near_match_id
    #     }
    # )

    indexed_tweet_ids = []
    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):
        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()

        simhash = Simhash(simhash_value)
        near_matches_ids = simhash_index.get_near_dups(simhash)

        if not near_matches_ids:
            simhash_index.add(tweet_id, simhash)
            indexed_tweet_ids.append((tweet_id, simhash))
            if len(indexed_tweet_ids) > index_size:
                simhash_index.delete(*indexed_tweet_ids.pop(0))

        if near_matches_ids:
            near_match_id = min(near_matches_ids)
            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)
            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )

    session.commit()
def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print(index.bucket_size())

    s1 = Simhash(get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))

    index.add("4", s1)
    print(index.get_near_dups(s1))
def process_graph(self, project_id):
    visits = defaultdict(list)
    p = 0
    hashtags_db = Hashtag.objects.filter(project_id=project_id)
    logger.info("Total hashtags to process " + str(len(hashtags_db)))
    for hashtag_entry in hashtags_db:
        visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
        p += 1
    logger.info("Hashtag read")
    logger.info("Hashtag processed " + str(p))
    logger.info("Visits count " + str(len(visits)))

    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, hashtags in visits.iteritems():
        if len(hashtags) > MIN_HASHTAG_PER_USER:
            simhash = Simhash(hashtags, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" % (cant_processed, len(visits)))

    cant_processed = 0
    for user, hashtags in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                hashtag_near_dups = visits[user_near_dups]
                intersect = set(hashtags).intersection(hashtag_near_dups)
                ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                if ratio >= 0.1:
                    hashtag_graph = HashtagGraph(user_oid_i=user, user_oid_j=user_near_dups, ratio=ratio)
                    hashtag_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
def process_graph(self, project_id):
    visits = defaultdict(list)
    processed = 0
    urls_db = Urls.objects.filter(project_id=project_id)
    logger.info("Total urls to process " + str(len(urls_db)))
    for url_entry in urls_db:
        visits[url_entry.user_id].append(url_entry.url)
        processed += 1
    logger.info("Urls read")
    logger.info("Urls processed " + str(processed))
    logger.info("Visits count " + str(len(visits)))

    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, urls in visits.iteritems():
        if len(urls) > MIN_URLS_PER_USER:
            simhash = Simhash(urls, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" % (cant_processed, len(visits)))

    cant_processed = 0
    for user, urls in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(urls, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                urls_near_dups = visits[user_near_dups]
                intersect = set(urls).intersection(urls_near_dups)
                ratio = len(intersect) * 1.0 / len(urls_near_dups)
                if ratio >= 0.1:
                    url_graph = UrlsGraph(user_oid_i=user, user_oid_j=user_near_dups, ratio=ratio)
                    url_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
def simhash_test():
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }

    for k, v in data.items():
        print k, get_phrases(v)

    for k, v in data.items():
        print k, Simhash(get_phrases(v)).value

    objs = [(str(k), Simhash(get_phrases(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print index.bucket_size()

    s1 = Simhash(get_phrases(u'How are you i am fine. blar blar blar blar blar thank'))
    print index.get_near_dups(s1)

    index.add('4', s1)
    print index.get_near_dups(s1)
index = SimhashIndex({})
try:
    for one in weibocollection.find({'create_time': {'$gte': lasttimestamp, '$lt': curtimestamp}}):
        weibo_id = str(one['_id'])
        weibo_text = one['data']['text'].strip()
        text_sh = Simhash(weibo_text)
        if len(index.get_near_dups(text_sh)) == 0:
            # no similar text found; cut (segment) the text
            text_seg = jieba.cut(weibo_text)
            text_result = list(set(text_seg) - stopwords)
            content = ' 1 '.join(text_result)
            if content != '':
                fdoc.write(weibo_id + '\t' + weibo_text.encode('utf-8') + '\n')
                fcut.write(content.encode('utf-8') + ' 1\n')
                cutnum += 1
            simnum += 1
        num += 1
        index.add(num, text_sh)
except pymongo.errors, e:
    logger.critical('mongo find error: %s' % e)
    sys.exit(-2)

logger.info('simnum: %d ' % simnum)
logger.info('cutnum: %d ' % cutnum)

connection.close()
fdoc.close()
fcut.close()


def main():
    curtimestamp = 0
    lasttimestamp = 0
    if len(sys.argv) == 4 and sys.argv[1] == '-BETime':
        lasttimestamp = long(sys.argv[2])
        curtimestamp = long(sys.argv[3])
    elif len(sys.argv) == 2 and sys.argv[1] == '-SYSTime':
import re

from simhash import Simhash, SimhashIndex


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))
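As a side note, two fingerprints can also be compared directly, without an index: Simhash.distance() returns the Hamming distance between the two values, which is the same comparison SimhashIndex applies with tolerance k. A small sketch reusing get_features from the example above:

a = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
b = Simhash(get_features(u'How are you? I Am fine. blar blar blar blar blar Thanks.'))

print(a.distance(b))         # Hamming distance between the two 64-bit fingerprints
print(a.distance(b) <= 3)    # same tolerance as SimhashIndex(objs, k=3)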
if (args.near.upper() == 'Y'):
    data = {}
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    if os.path.isfile(args.db):
        print 'MatchMeta.Info Database Located'
        print 'Patience...Loading Index...'
        conn = sqlite3.connect(args.db)
        meta = conn.execute(
            "SELECT path FROM MatchMeta WHERE path NOT LIKE '%winsxs%'")
        count = 1
        for line in meta:
            item = Simhash(get_features(unicode(line[0])))
            count = count + 1
            index.add(count, item)
        print index.bucket_size()
        print 'Excluding the WINSXS Directory'
        print '---------------------------------'
        print ' MatchMeta.Info Database Loaded'
        print '---------------------------------'
        conn.close()
    else:
        print 'MatchMeta.Info Database -- FAILED'
        sys.exit()
elif (args.near.upper() == 'N'):
    print 'Skipping MatchMeta.Info Database Fuzzing'
else:
    print 'Please use only Y or N'
    'utf-8', 'ignore')
s2 = 'How are you i am fine. blar blar blar blar blar than'.decode(
    'utf-8', 'ignore')
s3 = 'This is simhash test.'.decode('utf-8', 'ignore')

# print get_features(s1)
#
# print Simhash(get_features('How are you? I am fine. Thanks.')).value

sh1 = Simhash(s1)
sh2 = Simhash(s2)
sh3 = Simhash(s3)
# print sh.value
# print sh1.distance(sh2)

shIndex = SimhashIndex([], k=3)
shIndex.add('1', sh1)
shIndex.add('2', sh2)
# shIndex.add('3', sh3)

if shIndex.get_near_dups(sh3):
    print 'YES'
else:
    print 'NO'

# print shIndex.get_near_dups(sh2)
class NearDuplicate:

    def __init__(self, filenames, k=2, metadata_dictionary=None):
        self.filenames = filenames
        self.simhash_index = None
        self.image_dictionary = {}
        self.metadata_dictionary = metadata_dictionary
        self.k = k
        # Need to store the image hashes in some fashion
        # Possibly cluster the hashes (k-means)

    def tika_metadata(self, filename):
        """Use the tika-py module to grab metadata for a file"""
        parsed = parser.from_file(filename)
        return parsed.get("metadata", {})

    def exifread_metadata(self, filename):
        """Use the exifread module to grab metadata for a file"""
        f = open(filename, 'rb')
        tags = exifread.process_file(f)
        return tags

    def generate_features_from_dict(self, filename):
        """Use this function when we provide json metadata information
        from the tika java module"""
        # Find the metadata object from the json metadata file for the image_file named 'filename'
        metadata = self.metadata_dictionary.get(filename, {})

        # The tags or type of metadata we want
        feature_tags = [
            "Image Height", "Image Width", "File Size", "Content-Type",
            "Image Bytes", "File Name Suffix"
        ]

        # Create a feature array using these metadata values
        features = []
        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "File Size": 2,
            "Content-Type": 3,
            "Image Bytes": 6,
            "File Name Suffix": 2
        }

        # Grab the bytes of the entire file
        image_bytes = "NONE"
        try:
            image_bytes = open(filename, 'rb').read()
        except OSError:
            image_bytes = "NONE"

        # Get the central bytes
        image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
        byte_offset = len(image_bytes_str) // 4
        filename_suffix = filename[-10:]

        modified_metadata = {
            "Image Height": metadata.get("Image Height", "NONE"),
            "Image Width": metadata.get("Image Width", "NONE"),
            "File Size": metadata.get("File Size", "NONE"),
            "Content-Type": metadata.get("Content-Type", "NONE"),
            "Image Bytes": image_bytes_str[byte_offset:-byte_offset],
            "File Name Suffix": filename_suffix
        }

        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(),
                                                                 modified_metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))

        return features

    def generate_features(self, filename):
        """Given an image generate a feature vector.

        Since Tika-Py requires a server call (i.e. slower), do native image
        metadata grabbing, and fall back on tika if the image can't be opened
        (i.e., it's an svg or gif).
        """
        im, use_tika = None, False
        try:
            im = Image.open(filename)
            use_tika = False
        except IOError:
            use_tika = True

        # Grab the metadata for the image
        metadata = {}
        # We'll store features to use for simhash in a tuple array [(token, weight)]
        features = []

        if use_tika:
            # Use only metadata from tika
            # The image file can't be opened using PIL.Image, so that means
            # a diff type of image besides jpg, png
            metadata = self.tika_metadata(filename)

            # Grab the bytes of the entire file
            image_bytes = open(filename).read()

            # Get the central bytes
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            #image_bytes_str = str(image_bytes)
            byte_offset = len(image_bytes_str) // 4
            metadata["Image Bytes"] = image_bytes_str[byte_offset:-byte_offset]

            feature_tags = [
                "Image Height", "Image Width", "File Size", "Content-Type",
                "Image Bytes"
            ]
            features = [
                tag + ":" + metadata.get(tag, "NONE") for tag in feature_tags
            ]
            return features

        """
        FEATURES
        We'll resize the image so all images are normalized to a certain size
        Also make sure to retain aspect ratio
        Features to use (in order of importance)
            - center region bytes
            - color histogram
            - content type
            - image width
            - image height
        We can take subregions of the image, and hash those
        """
        # Resize the image so all images are normalized
        width = im.size[0]
        height = im.size[1]
        resize_width = 30
        resize_height = resize_width * height / width
        resize_im = None
        histogram_bytes, histogram_weight = "", 0
        center_region_bytes, center_region_weight = "", 5
        extension = ""
        try:
            resize_im = im.resize((resize_width, resize_height), Image.ANTIALIAS)
            # Crop sub regions
            height_padding, width_padding = resize_height / 5, resize_width / 5
            box = (width_padding, height_padding, resize_width - width_padding,
                   resize_height - height_padding)
            sub_region = resize_im.crop(box)
            # Generate a histogram
            histogram_bytes, histogram_weight = str(resize_im.histogram()), 4
            center_region_bytes, center_region_weight = str(
                list(sub_region.getdata())), 3
        except OSError:
            # Couldn't resize the image.
            print >> sys.stderr, "Couldn't resize the image. Prob an eps or svg"
            resize_im = im
            resize_width = im.size[0]
            resize_height = im.size[1]
            sub_region = im
            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            #image_bytes_str = str(image_bytes)
            histogram_bytes = "NONE"
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            byte_offset = len(image_bytes_str) // 4
            center_region_bytes = image_bytes_str[byte_offset:-byte_offset]

        extension = resize_im.format if resize_im.format != None else os.path.splitext(
            filename)[1]
        # Figure out the content type (png, jpg, etc.)
        content_type = "image/" + str(extension.lower())

        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Image Histogram": histogram_weight,
            "Content-Type": 5,
            "Center Region Bytes": center_region_weight
        }
        metadata = {
            "Image Height": str(width),
            "Image Width": str(height),
            "Image Histogram": histogram_bytes,
            "Content-Type": content_type,
            "Center Region Bytes": center_region_bytes
        }

        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(),
                                                                 metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))

        return features

    def merge_near_duplicate_dictionaries(self, nd):
        """Merge the current near duplicate instance with another near duplicate instance"""
        smaller_nd = self if len(self.image_dictionary) <= len(nd.image_dictionary) else nd
        larger_nd = self if len(self.image_dictionary) > len(nd.image_dictionary) else nd
        final_dict = larger_nd.image_dictionary

        # Iterate over the smaller near duplicate instance
        for key in smaller_nd.image_dictionary.keys():
            # If an exact duplicate exists, just grab it and merge them
            if larger_nd.image_dictionary.get(key, None) != None:
                arr = smaller_nd.image_dictionary.get(key, []) +\
                    larger_nd.image_dictionary.get(key, [])
                final_dict[key] = arr
                continue

            # Find the closest near duplicate in the larger dictionary by
            # using its index
            simhash_obj = smaller_nd.image_dictionary[key][0]["hash_object"]
            near_duplicates_keys = larger_nd.simhash_index.get_near_dups(simhash_obj)

            # If a near duplicate exists
            if len(near_duplicates_keys) > 0:
                # Grab the array of images at that key in the larger dictionary
                # and merge it with the array of images in the smaller dictionary
                near_dup_key = near_duplicates_keys[0]
                arr = smaller_nd.image_dictionary.get(key, []) +\
                    larger_nd.image_dictionary.get(near_dup_key, [])
                # create an entry in the new dictionary
                final_dict[near_dup_key] = arr
                continue

            # Otherwise we should just add this key-object from the dictionary
            # to this array
            final_dict[key] = smaller_nd.image_dictionary[key]
            # Add this simhash to the Index for efficient searching
            larger_nd.simhash_index.add(key, simhash_obj)

        self.image_dictionary = final_dict
        self.simhash_index = larger_nd.simhash_index
        nd.image_dictionary = final_dict
        nd.simhash_index = larger_nd.simhash_index

        # Now simply return this final dict
        return final_dict

    def simhash_value_to_key(self, simhash):
        """Given a simhash object, convert its value to a hexadecimal key.
        This key will be used in our image_file dictionary
        """
        return str(hex(simhash.value))

    def deduplicate_images(self):
        """Given a list of image files "self.filenames", deduplicate the images
        using near deduplication
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary != None:
                # Will use a java tika program to generate metadata
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)

            # Simhash this list of features
            sHash = Simhash(feature_array)

            if self.simhash_index == None:
                # First image, so we create the index, add it to the dictionary
                # And move on to next iteration
                key = self.simhash_value_to_key(sHash)
                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
                continue

            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)
            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary
                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0]
                # Get the key for this current image
                current_simhash_key = self.simhash_value_to_key(sHash)
                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename": image_file,
                    "hash_key": current_simhash_key,
                    "hash_object": sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)
                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)
                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
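A hypothetical usage sketch of the NearDuplicate class above; the file names are invented placeholders, and PIL/tika/exifread are assumed to be importable as in the original module:

# Hypothetical usage; file names below are placeholders, not paths from the project.
filenames = ["photos/a.jpg", "photos/a_copy.jpg", "photos/b.png"]

nd = NearDuplicate(filenames, k=2)
nd.deduplicate_images()

# Each key is a hex simhash value; each value lists the images grouped under it.
for key, group in nd.image_dictionary.items():
    print key, [entry["filename"] for entry in group]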
import re

from simhash import Simhash, SimhashIndex


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
print(objs)

index = SimhashIndex(objs, k=3)  # k: Hamming distance tolerance
print(index.bucket_size())

s1 = Simhash(
    get_features(u'How are you i am fine. blar blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)  # equivalent to adding s1 as a fourth entry of data and comparing against it
print(index.get_near_dups(s1))
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = []
# objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(s1.value)
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))

s2 = Simhash(7604580641891645972)
print(s2.value)
index.add('5', s2)
index.add('3', s2)
print(index.get_near_dups(s2))
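The Simhash(7604580641891645972) call above rebuilds a fingerprint from a plain integer, which is how a previously computed value can be restored (for example after persisting it to a database). A small sketch of that round trip, reusing get_features from the snippet above:

original = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
stored_value = original.value          # a plain int, safe to persist

restored = Simhash(stored_value)       # rebuild without re-hashing the text
assert restored.value == stored_value
assert original.distance(restored) == 0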
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))


def main():
    pass


if __name__ == '__main__':
    main()
class DocCollection(object):

    def __init__(self, hash_size=64, hash_tol=3, num_words_to_complete=10):
        """
        Params:
          hash_size : The number of output bits of the hash function used in SimHash.
                      Higher values -> able to handle more noise.
          hash_tol : The number of bits that can differ for a candidate near-match in Simhash
          num_words_to_complete : The number of words to complete given a context when a new
                      document is encountered in get_best_match
        """
        self.num_words_to_complete = num_words_to_complete
        self.hash_size = hash_size
        self.hash_tol = hash_tol

        # This implementation of simhash stores the index in RAM, but it could easily be
        # put on disk.
        self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
        self.author_identifier = LanguageModelAuthorIdentifier()
        self.author_semantic_models = SemanticLanguageModels()

    def generate_simhash(self, tokens):
        # Generate a Simhash from Spacy tokens.
        sh = Simhash(u'', f=self.hash_size)  # silly interface...
        sh.build_by_features(tokens)
        return sh

    def add(self, doc, title, author):
        add_to_index = self.simhash_index.add

        # Index each paragraph in the document into the simhash index
        paras = extract_paragraphs(doc)

        # Update the word shape language model for this author
        para_toks = [tokenize(p) for p in paras]
        flat_tokens = [item for sublist in para_toks for item in sublist]
        self.author_semantic_models.add_doc(flat_tokens, author)

        # Update the semantic model for this author
        self.author_identifier.add_doc(flat_tokens, author)

        # Add each paragraph to the simhash index
        for para_num, tokens in enumerate(para_toks, 1):
            if not tokens:
                continue
            sh = self.generate_simhash(tokens)
            self.simhash_index.add((tokens, title, author, para_num), sh)

    def get_best_match(self, snippet):
        get_near_dups = self.simhash_index.get_near_dups
        generate_simhash = self.generate_simhash
        title_author_to_count = {}

        paras = extract_paragraphs(snippet)
        # evenly distribute the corrupted paragraphs
        #shuffle(paras)

        # For each paragraph, get the closest matching previously encountered paragraphs.
        # If multiple matches, prune via edit distance.
        # The work of art that matches the most paragraphs is the winner (if it matches enough)
        paras_done = 0
        for para in paras:
            tokens = tokenize(para)
            if not tokens:
                continue
            paras_done += 1
            sh = generate_simhash(tokens)
            candidates = [make_tuple(match) for match in get_near_dups(sh)]
            # Increment the count of these works
            for candidate in candidates:
                _, title, author, para_num = candidate
                k = (title, author)
                title_author_to_count[k] = title_author_to_count.get(k, 0) + 1

        if title_author_to_count:
            # OK, what work was the most frequent, and what was that frequency?
            (title, author), f = max(title_author_to_count.iteritems(), key=lambda item: item[1])
            score = 1. * f / paras_done
            if score >= 0.1:
                return {'title': title, 'author': author, 'score': score,
                        'author_score': None, 'completion': None}

        # This is either so corrupt that we can't tell what it is, or is a new work.
        # Guess the author
        tokens = [item for sublist in [tokenize(p) for p in paras] for item in sublist]
        author_guess, author_score = self.author_identifier.predict_author(tokens)
        completion = self.author_semantic_models.complete(author_guess, tokens,
                                                          self.num_words_to_complete, 1)
        return {'title': None, 'author': author_guess, 'score': None,
                'author_score': author_score, 'completion': completion}

    def clear(self):
        self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
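A hypothetical usage sketch of DocCollection above; the texts are invented placeholders, and the supporting helpers (extract_paragraphs, tokenize, the author models) are assumed to come from the surrounding project:

# Hypothetical usage; the documents below are invented placeholders.
collection = DocCollection(hash_size=64, hash_tol=3)

collection.add(doc=u'Call me Ishmael. Some years ago...', title='Moby-Dick', author='Melville')
collection.add(doc=u'Emma Woodhouse, handsome, clever, and rich...', title='Emma', author='Austen')

result = collection.get_best_match(u'Call me Ishmael. Some years ago - never mind how long precisely...')
print result  # e.g. {'title': ..., 'author': ..., 'score': ..., ...}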
import re

from simhash import Simhash, SimhashIndex


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'2018-02-17 0:00:00,2018-02-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00',
    2: u'2018-02-16 0:00:00,60125170993,2018-02-16 0:00:00,86000300257742,2018-01-26 0:00:00',
    3: u'2018-02-15 0:00:00,60125170993,2018-02-15 0:00:00,86011600116290,2018-01-26 0:00:00',
    4: u'2018-02-14 0:00:00,60125170993,2018-02-14 0:00:00,86008501214219,2018-01-26 0:00:00',
    5: u'2018-02-13 0:00:00,60125170993,2018-02-13 0:00:00,86000300420496,2018-01-26 0:00:00',
    6: u'2018-02-12 0:00:00,60125170993,2018-02-12 0:00:00,86000300656419,2018-01-26 0:00:00',
    7: u'2018-02-11 0:00:00,60125170993,2018-02-11 0:00:00,86553802671042,2018-01-26 0:00:00',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)
print index.bucket_size()

s1 = Simhash(get_features(u'2018-02-17 0:00:00,2018-03-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00'))
print index.get_near_dups(s1)

index.add('0', s1)
print index.get_near_dups(s1)