def fit(self, X):
    self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
    for i, x in enumerate(X):
        m = MinHash(num_perm=self._n_perm)
        for e in x:
            # MinHash.update expects bytes, so encode the stringified element
            m.update(str(e).encode('utf8'))
        self._index.add(str(i), m)
    self._index.index()
def find_relation_class_name_matchings(network, kr_handlers):
    # Retrieve relation names
    st = time.time()
    names = []
    seen_sources = []
    for (db_name, source_name, _, _) in network.iterate_values():
        original_source_name = source_name
        if source_name not in seen_sources:
            seen_sources.append(source_name)  # seen already
            source_name = nlp.camelcase_to_snakecase(source_name)
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            m = MinHash(num_perm=32)
            for token in source_name.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('relation', (db_name, original_source_name), m))

    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            m = MinHash(num_perm=32)
            for token in cl.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))

    # Index all the minhashes
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)
    for idx in range(len(names)):
        lsh_index.insert(idx, names[idx][2])

    matchings = []
    for idx in range(0, num_relations_inserted):  # Compare only with classes
        N = lsh_index.query(names[idx][2])
        for n in N:
            kind_q = names[idx][0]
            kind_n = names[n][0]
            if kind_n != kind_q:
                # match format is (db_name, source_name, field_name) -> class_name
                match = ((names[idx][1][0], names[idx][1][1], "_"), names[n][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
def embed(self, corpus: List[str]) -> np.ndarray:
    signatures: List[np.ndarray] = []
    for doc in corpus:
        m = MinHash(num_perm=self.num_perm)
        for ngram in ngrams(doc, self.ngram_size):
            m.update(" ".join(ngram).encode("utf-8"))
        signatures.append(m.hashvalues)
    return np.asarray(signatures)
def calculate_jaccard(text1, text2):
    # Estimate the Jaccard similarity of two lines of text
    minhash1, minhash2 = MinHash(), MinHash()
    for word in text1:
        minhash1.update(word.encode('utf-8'))
    for word in text2:
        minhash2.update(word.encode('utf-8'))
    return minhash1.jaccard(minhash2)
def hashing():
    m = []
    for index in df.index:
        t = create_shingles(df.at[index, 'urlDrugName'])
        mh = MinHash(num_perm=256)
        for d in t:
            mh.update(d.encode('utf8'))
        m.append(mh)
        del mh
    return m
def compute_minhash(column):
    permutations = config.MINHASH_PARAMS['num_permutations']
    encoding = config.MINHASH_PARAMS['encoding']
    minhash = MinHash(num_perm=permutations)
    for elem in column:
        minhash.update(elem.encode(encoding))
    return minhash
def get_min_hash(shingles: set) -> MinHash:
    """
    Given a set of shingles, creates a MinHash object updated with those shingles.

    :param shingles: a set of track shingles.
    :return: a MinHash object updated with the given shingles.
    """
    track_min_hash = MinHash(num_perm=128)
    for shin in shingles:
        track_min_hash.update(str(shin).encode('utf-8'))
    return track_min_hash
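# A minimal usage sketch, not part of the original snippet: it shows how the
# MinHash returned by get_min_hash() could be indexed in a MinHashLSH to find
# similar tracks. The toy shingle sets and the 0.6 threshold are illustrative
# assumptions; num_perm must match the 128 used in get_min_hash().
from datasketch import MinHashLSH

lsh = MinHashLSH(threshold=0.6, num_perm=128)
lsh.insert("track_1", get_min_hash({"dancing", "queen", "abba"}))
lsh.insert("track_2", get_min_hash({"dancing", "queen", "abba", "live"}))
similar_tracks = lsh.query(get_min_hash({"dancing", "queen", "abba"}))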
def DIDsampling(dataset, BF, username, userid, attra_id, beta, clustdict):
    # dataset = Cora_labeled.objects.all()
    # clustdict = dextrapreclustering.minhashPreClustering(dataset)
    cluster_membership = {}
    # values = models.sigirCoraAttrValue.objects.filter(attr_id=attra_id)
    # attrasynonyms = models.sigirCoraValueSynonym.objects.filter(value_id__in=[value.id for value in values])
    # record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username, attrsynonym_id__in=[syn.id for syn in attrasynonyms])
    record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username, attrsynonym__value__attr_id=attra_id)
    record_noAttra = dataset.exclude(id__in=[item.cora_id for item in record_hasAttra])
    for k, v in clustdict.items():
        for d in v:
            cluster_membership[d] = k
    cluster_size_sum = 0.000001
    for record in BF:
        cluster_size_sum = cluster_size_sum + len(clustdict[cluster_membership[record.id]])
    for record in BF:
        # AC
        cora2ae = models.sigirCoraToAttrEntity.objects.filter(cora_id=record.id, user=username)
        if cora2ae:
            attr_ids = [item.attrsynonym.value.attr.id for item in cora2ae]
            if attra_id in attr_ids:
                record.orderscore = 0
                record.save()
                continue
            else:
                ac = 1 - len(attr_ids) / models.sigirCoraAttr.objects.filter(userid=userid).count()
        else:
            ac = 1
        # distribution on dataset
        k = cluster_membership[record.id]
        ic = len(clustdict[k]) / record_noAttra.count()
        record_minhash = MinHash(num_perm=128)
        s = set(record.cleantext.split(" "))
        for d in s:
            record_minhash.update(d.encode('utf8'))
        term2sum = 0
        for rr in BF:
            rr_minhash = MinHash(num_perm=128)
            ss = set(rr.cleantext.split(" "))
            for dd in ss:
                rr_minhash.update(dd.encode('utf8'))
            sim = record_minhash.jaccard(rr_minhash)
            sim = (sim / cluster_size_sum) ** beta
            term2sum = term2sum + sim
        did = ac * ic * term2sum
        record.orderscore = did
        record.save()
    return BF
def test_count(self):
    m = MinHash(hashobj=FakeHash)
    m.update(11)
    m.update(123)
    m.update(92)
    m.update(98)
    m.update(123218)
    m.update(32)
    lm = LeanMinHash(m)
    c = lm.count()
    self.assertGreaterEqual(c, 0)
def _get_min_hash(self, r):
    """
    :param r: The incoming row
    :return: resulting min hash
    """
    cset = self._get_shingle(r)
    m = MinHash(self.__num_perm)
    for c in cset:
        m.update(c.encode('utf-8'))
    return m
def similar_videos_from_forest(inferred_label_probabilities):
    inferred_labels_full = convert_inferred_labels_to_list(inferred_label_probabilities)
    minhash = MinHash(num_perm=128)
    for label in inferred_labels_full:
        minhash.update(label)
    if forest is None:
        load_forest()
    return forest.query(minhash, 10)
def mh_digest(data):
    """ create a MinHash digest """
    num_perm = 512
    m = MinHash(num_perm)
    for d in data:
        m.update(d.encode('utf8'))
    return m
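# A minimal usage sketch, not from the original source: compare two token
# iterables with mh_digest() and estimate their Jaccard similarity.
m1 = mh_digest(["red", "green", "blue"])
m2 = mh_digest(["red", "blue", "yellow"])
print("Estimated Jaccard:", m1.jaccard(m2))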
def get_codes(text_val, nbit=128):
    '''
    Converts the text data to a binary code.

    :param text_val: The text data to be converted to a minhash binary code.
    :param nbit: The size of the binary code to be returned.
    :return: A binary code representation of the text string.
    '''
    minhash = MinHash(num_perm=nbit)
    for token in text_val:
        # MinHash.update expects bytes, so encode the joined token
        minhash.update("".join(token).encode('utf-8'))
    return minhash
def minhash_implem(url_shingles_list):
    list_url_hash = []
    for url in range(len(url_shingles_list)):
        m = MinHash(num_perm=8)
        shingle_list = url_shingles_list[url][1]
        for shingle in shingle_list:
            m.update(shingle.encode('utf8'))
        list_url_hash.append(["{0}".format(url_shingles_list[url][0]), m.digest()])
    return list_url_hash
def test_count(self):
    m = MinHash(hashfunc=fake_hash_func)
    m.update(11)
    m.update(123)
    m.update(92)
    m.update(98)
    m.update(123218)
    m.update(32)
    lm = LeanMinHash(m)
    c = lm.count()
    self.assertGreaterEqual(c, 0)
def make_lsh(self, shingle_length=2, threshold=0.8):
    print(f'Making LSH with threshold of {threshold}, shingle length of {shingle_length}')
    sets = self.make_shingle_sets(self.indoc, shingle_length)
    self.minhashes = {}
    self.lsh = MinHashLSH(threshold=threshold, num_perm=128)
    for k in sets.keys():
        m = MinHash(num_perm=128)
        for item in sets[k]:
            m.update(item.encode('utf8'))
        self.minhashes[k] = m
        self.lsh.insert(k, m)
def apply_lsh(group, col):
    lsh = MinHashLSH(threshold=0.9, num_perm=256)
    minhashes = {}
    for idx, text in group[col].items():
        minhash = MinHash(num_perm=256)
        for d in ngrams(text, 3):
            minhash.update("".join(d).encode('utf-8'))
        index = group.loc[idx, 'productId']
        lsh.insert(key=index, minhash=minhash)
        minhashes[index] = minhash
    return lsh, minhashes
def get_hashbands(window):
    minhash = MinHash(num_perm=config['n_permutations'], seed=1)
    for ngram in set(ngrams(' '.join(window), 3)):
        minhash.update(''.join(ngram).encode('utf8'))
    hashband_vals = []
    for c, i in enumerate(minhash.hashvalues):
        hashband_vals.append(i)
        if len(hashband_vals) == config['hashband_length']:
            hashband = '.'.join([str(j) for j in hashband_vals])
            hashband_vals = []
            yield hashband
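# A minimal usage sketch, not from the original source: windows that share at
# least one hashband become candidate matches. It assumes this runs in the same
# module where get_hashbands() and its module-level config live; the config
# values and toy windows below are illustrative assumptions only.
from collections import defaultdict

config = {'n_permutations': 256, 'hashband_length': 4}
windows = {0: ["the", "quick", "brown", "fox"], 1: ["the", "quick", "brown", "cat"]}
buckets = defaultdict(set)
for win_id, window in windows.items():
    for hashband in get_hashbands(window):
        buckets[hashband].add(win_id)
candidate_pairs = {tuple(sorted(ids)) for ids in buckets.values() if len(ids) > 1}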
def get_jaccard_index(sequence1, sequence2, k):
    seq1_minHash, seq2_minHash = MinHash(), MinHash()
    seq1_kmers = count_kmers(sequence1, k)
    seq2_kmers = count_kmers(sequence2, k)
    for key in seq1_kmers.keys():
        seq1_minHash.update(key.encode('utf8'))
    for key in seq2_kmers.keys():
        seq2_minHash.update(key.encode('utf8'))
    return seq1_minHash.jaccard(seq2_minHash)
def calculate_lsh(text, mode, lsh_type):
    """Calculate LSH for the file"""
    min_hash = MinHash(num_perm=128)
    if mode == 'char':
        for d in ngrams(text, 3):
            min_hash.update("".join(d).encode('utf-8'))
    elif mode == 'word':
        for d in ngrams(text, 3):
            min_hash.update(" ".join(d).encode('utf-8'))
    result = lsh_type.query(min_hash)
    return result
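# A minimal usage sketch, not part of the original snippet: build a MinHashLSH
# index over documents with the same 128-perm character-trigram scheme that
# calculate_lsh() uses in 'char' mode, then pass it as lsh_type. The documents
# and the 0.5 threshold are illustrative assumptions.
from datasketch import MinHash, MinHashLSH
from nltk import ngrams

index = MinHashLSH(threshold=0.5, num_perm=128)
for doc_id, doc in [("a", "the quick brown fox"), ("b", "lorem ipsum dolor sit")]:
    mh = MinHash(num_perm=128)
    for d in ngrams(doc, 3):
        mh.update("".join(d).encode('utf-8'))
    index.insert(doc_id, mh)
print(calculate_lsh("the quick brown fox jumps", 'char', index))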
def predict(text, database, perms, num_results, forest):
    # get top results from the LSH forest
    text_preprocessed = preprocess(text)
    m = MinHash(num_perm=perms)
    for d in ngrams(text_preprocessed, 3):
        m.update("".join(d).encode('utf-8'))
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None  # if the query returns no results, return None
    result = database.iloc[idx_array]['wikidata_numeric_id'].astype(int)
    return result
def print_minhash_to_pickle(from_k, to_k, sequence_dict):
    for kk in range(from_k, to_k):
        print(kk)
        accession_number_dict = dict()
        for key, value in sequence_dict.items():
            minHash = MinHash()
            kmer = count_kmers(value[1], kk)
            for kmer_key in kmer.keys():
                minHash.update(kmer_key.encode('utf8'))
            accession_number_dict[key] = minHash
        filepath = get_minhash_pickle_filename(kk, basePath)
        with open(filepath, 'wb') as pickle_file:
            pickle.dump(accession_number_dict, pickle_file)
def minhash(x, y):
    m1, m2 = MinHash(), MinHash()
    s1 = extract_keywords(x)
    s2 = extract_keywords(y)
    for data in s1:
        m1.update(data.encode('utf8'))
    for data in s2:
        m2.update(data.encode('utf8'))
    return m1.jaccard(m2)
def test_jaccard(self):
    m1 = MinHash(4, 1, hashobj=FakeHash)
    m2 = MinHash(4, 1, hashobj=FakeHash)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    self.assertTrue(lm1.jaccard(lm2) == 1.0)
    m2.update(12)
    lm2 = LeanMinHash(m2)
    self.assertTrue(lm1.jaccard(lm2) == 0.0)
    m1.update(13)
    lm1 = LeanMinHash(m1)
    self.assertTrue(lm1.jaccard(lm2) < 1.0)
def create_LSH_Forest():
    global forest
    if os.path.isfile(LSH_FOREST_FILE):
        load_forest()
    else:
        forest = MinHashLSHForest(num_perm=128)
        train_records = glob.glob("dataset/train*.tfrecord")
        validate_records = glob.glob("dataset/validate*.tfrecord")
        all_records = train_records + validate_records
        dataset = tf.data.TFRecordDataset(all_records)
        iterator = dataset.make_one_shot_iterator()
        count = 0
        next_element = iterator.get_next()
        updated = False
        with tf.Session() as sess:
            try:
                while True:
                    if count % 10000 == 0:
                        print("[SimpleVideoSearch][{}] Processed {} records from the dataset so far".format(datetime.now(), count))
                    if updated and count % 100000 == 0:
                        with open(LSH_FOREST_FILE, 'wb') as forest_file:
                            forest.index()
                            pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
                        print("[SimpleVideoSearch][{}] Updated LSH Forest file".format(datetime.now()))
                    exampleBinaryString = sess.run(next_element)
                    example = tf.train.Example.FromString(exampleBinaryString)
                    count += 1
                    example_id = example.features.feature["id"].bytes_list.value[0]
                    if example_id not in forest:
                        if not updated:
                            updated = True
                            print('[SimpleVideoSearch][{}] First update at record {}'.format(datetime.now(), count))
                        dataset_labels_full = convert_dataset_labels_to_list(example.features.feature["labels"].int64_list.value)
                        minhash = MinHash(num_perm=128)
                        for label in dataset_labels_full:
                            minhash.update(label)
                        forest.add(example_id, minhash)
            except tf.errors.OutOfRangeError:
                print("[SimpleVideoSearch][{}] Done iterating through dataset".format(datetime.now()))
            finally:
                print("[SimpleVideoSearch][{}] Processed {} records from the dataset".format(datetime.now(), count))
                forest.index()
                with open(LSH_FOREST_FILE, 'wb') as forest_file:
                    pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
                print("[SimpleVideoSearch][{}] Finished creating LSH Forest file".format(datetime.now()))
def xiaoming(lines, length_of_f, i):
    first_user = lines[i]
    a = first_user.split()
    user_ID = a[0] + ' ' + '\n'
    print(a[1])
    hash_value = [0 for i in range(0, 51)]  # this list is used for writing to a file
    sort_value = [0 for i in range(0, 51)]  # this list holds the values being updated
    # Create a file for this user that stores the MinHash values between this user and the others
    # f_hash = open('user_minhash/' + a[0] + '.txt', mode='w', encoding='utf-8')
    # f_hash.write(a[0] + ' ' + '\n')  # write this user's ID
    for j in range(length_of_f - 1):
        if i == j:
            continue
        else:
            second_user = lines[j]
            b = second_user.split()
            data1 = a[2].split(',')
            data2 = b[2].split(',')
            # if the two lists have only 0 or 1 elements in common, discard the pair
            length_of_data1_and_data2 = len(data1) + len(data2)
            # deduplicate
            both_data1_2 = list(set(data1 + data2))
            if (length_of_data1_and_data2 - len(both_data1_2)) > 20:
                m1, m2 = MinHash(), MinHash()
                for d in data1:
                    m1.update(d.encode('utf8'))
                for d in data2:
                    m2.update(d.encode('utf8'))
                # print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
                if m1.jaccard(m2) <= 0.0078125:
                    continue
                else:
                    c = [b[0], m1.jaccard(m2)]
                    for m in range(49, -1, -1):
                        if sort_value[m] == 0:
                            sort_value[m] = c
                        else:
                            if c[1] > sort_value[m][1]:
                                sort_value[m + 1] = sort_value[m]
                                sort_value[m] = c
                            else:
                                break
            else:
                continue
            # new_value = b[0] + ' ' + str(m1.jaccard(m2)) + '\n'
            # hash_value.append(new_value)
    return sort_value
    # f_hash.write(b[0] + ' ' + str(m1.jaccard(m2)) + '\n')
    """
    for item in sort_value:
def calculateLSH(self, data, n):
    # Create MinHash objects
    print("Initializing the LSH for %i samples" % len(data))
    for c, i in enumerate(data):
        print("data" + str(i[0]))
        minhash = MinHash(num_perm=self.num_perm)
        for d in ngrams(i[1], n):
            minhash.update("".join(d).encode('utf-8'))
        self.lsh.insert(i[0], minhash)
        self.minhashes[i[0]] = minhash
    print(self.minhashes.keys())
    return self.minhashes, self.lsh
def get_min_hash(self, title, blurb):
    """ this method generates a min hash """
    signature = MinHash()
    filter_stopwords = lambda x: x not in self.stopwords
    contents = self.strip_puncs(title.lower() + " " + blurb.lower()).split()
    cleaned_contents = set(filter(filter_stopwords, contents))
    for current in cleaned_contents:
        signature.update(current.encode('utf-8'))
    return signature
def getTextHashValues(text, ngram=5):
    """
    Use datasketch and NLTK to obtain the hash values of a text.

    :param text: a string of text
    :param ngram: the n-gram size; 5 is recommended for short texts
    :return: numpy.ndarray of hash values computed by datasketch
    """
    assert type(text) == str, 'INPUT text should be str'
    m = MinHash()
    for i in nltk.ngrams(text, ngram):
        m.update(''.join(i).encode('utf8'))
    return m.hashvalues
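# A minimal usage sketch, not from the original source: the fraction of equal
# positions in two MinHash signatures is itself an estimate of the Jaccard
# similarity of the underlying n-gram sets.
import numpy as np

h1 = getTextHashValues("the quick brown fox", ngram=5)
h2 = getTextHashValues("the quick brown cat", ngram=5)
print("Estimated Jaccard:", float(np.mean(h1 == h2)))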
def minhash_from_string(self, input_string):
    """ Generates a MinHash object from a string """
    shingles = self.extract_shingles(input_string)
    shingle_set = set(shingles)
    m = MinHash(num_perm=self.num_perm)
    for i in shingle_set:
        m.update(i.encode('utf8'))
    return m
def test_jaccard(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    m2 = MinHash(4, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    self.assertTrue(lm1.jaccard(lm2) == 1.0)
    m2.update(12)
    lm2 = LeanMinHash(m2)
    self.assertTrue(lm1.jaccard(lm2) == 0.0)
    m1.update(13)
    lm1 = LeanMinHash(m1)
    self.assertTrue(lm1.jaccard(lm2) < 1.0)
def minhash_tweet(self, tweet_text):
    """Minhashing operation that allows caching of up to 1M tweets in order
    to speed up the checking procedure when the same tweet text recurs"""
    tweet_hash = MinHash(num_perm=self.permutations)
    for word in tweet_text.split():
        # strip punctuation first, then encode; running the regex sub on
        # already-encoded bytes would raise a TypeError
        tweet_hash.update(self.punct.sub("", word).encode('utf8'))
    return tweet_hash
def get_min_hash(self, x):
    """
    Create a MinHash object for the input example string using w-shingling.

    Parameters:
        x - A list of strings representing an example.

    Returns:
        A datasketch.MinHash object updated with the generated w-shingles.
    """
    min_hash = MinHash(num_perm=self.num_perm, seed=self.random_state)
    # we accumulate all shingles extracted from each string
    for x_str in x:
        # map string x_str to a set of shingles
        x_shingles = MinHashNearestNeighbor.get_w_shingles(x_str, self.w)
        for shingle in x_shingles:
            min_hash.update(shingle)
    return min_hash
def _hello_world():
    """
    This fragment was taken from the datasketch github page:
    https://github.com/ekzhu/datasketch
    """
    data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
             'estimating', 'the', 'similarity', 'between', 'datasets']
    data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
             'estimating', 'the', 'similarity', 'between', 'documents']
    m1, m2 = MinHash(), MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def query(self, v, n):
    m = MinHash(num_perm=self._n_perm)
    for e in v:
        # MinHash.update expects bytes, so encode the stringified element
        m.update(str(e).encode('utf8'))
    return map(int, self._index.query(m, n))
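# A minimal, self-contained sketch of the same fit/query pattern used by the
# two methods above (an illustration only, not the original wrapper class).
from datasketch import MinHash, MinHashLSHForest

forest = MinHashLSHForest(num_perm=128)
for i, item in enumerate([["a", "b", "c"], ["a", "b", "d"], ["x", "y", "z"]]):
    m = MinHash(num_perm=128)
    for e in item:
        m.update(str(e).encode('utf8'))
    forest.add(str(i), m)
forest.index()  # must be called before querying

q = MinHash(num_perm=128)
for e in ["a", "b", "c"]:
    q.update(str(e).encode('utf8'))
print(forest.query(q, 2))  # keys of the 2 most similar items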
def minhash_str(in_str, perms, gram_sz):
    minhash = MinHash(num_perm=perms)
    for d in ngrams(in_str, gram_sz):
        minhash.update("".join(d).encode('utf-8'))
    return minhash
newSentence = []
for i in range(num_sentences):
    newSentence.append(model.getSentence(word_to_index, index_to_word))
# print(len(newSentence))
# print(newSentence)

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

for sen in newSentence:
    data1 = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(sen) \
             if token.lower().strip(string.punctuation) not in stopwords]
    f = open('data/data.csv', 'r')
    for line in f:
        data2 = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(line) \
                 if token.lower().strip(string.punctuation) not in stopwords]
        m1, m2 = MinHash(), MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        # print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
        if actual_jaccard > 0.3:
            print("Actual Jaccard for data1 and data2 is", actual_jaccard)
            print(sen)
            print(line)