def fit(self, X):
    self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
    for i, x in enumerate(X):
        m = MinHash(num_perm=self._n_perm)
        for e in x:
            m.update(str(e).encode('utf8'))  # MinHash.update expects bytes
        self._index.add(str(i), m)
    self._index.index()
def estimateDistinctElements(items, num_perm):
    """Estimate the number of distinct elements in a list.

    The default number of hash permutations is 128, but I adjusted it after
    researching more: http://blog.cluster-text.com/tag/minhash/
    """
    h = MinHash(num_perm)  # num_perm sets the number of hash permutations
    for item in items:
        h.digest(sha1(item.encode('utf8')))  # older datasketch API: fold each hashed item into the signature
    print("Estimated number of elements: ", h.count())
def estimateDistinctElementParallel(listOfItems, num_perm):
    """Same as above, except with a nested loop to iterate through the lists
    within the list. This function also appends each estimation result to a
    list for use in the following accuracy function."""
    h = MinHash(num_perm)
    for item in listOfItems:
        for i in item:  # nested loop to iterate over lists within a list
            h.digest(sha1(i.encode('utf8')))
        estimate.append(h.count())
    print("Estimated number of elements: ", h.count())
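# Hedged sketch (not from the original source): the same cardinality estimate
# with the current datasketch API, which takes raw bytes via MinHash.update()
# instead of the legacy digest(sha1(...)) call used above. Sample data is made up.
from datasketch import MinHash

def estimate_distinct_modern(items, num_perm=128):
    h = MinHash(num_perm=num_perm)
    for item in items:
        h.update(item.encode('utf8'))
    return h.count()  # MinHash.count() estimates the number of distinct items

# e.g. estimate_distinct_modern(['a', 'b', 'b', 'c'] * 100) should be close to 3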
def minhash_tweet(self, tweet_text):
    """Minhash a tweet's text; results are cached (up to 1M tweets) to speed
    up the checking procedure when the same tweet text recurs."""
    tweet_hash = MinHash(num_perm=self.permutations)
    for word in tweet_text.split():
        # strip punctuation first, then encode: MinHash.update expects bytes
        tweet_hash.update(self.punct.sub("", word).encode('utf8'))
    return tweet_hash
def main():
    path = Path('C:/Data/Python/JobLoss')
    orig_data = []
    ind_map = []
    ind = 0
    with open(path / 'Processed.json') as f:
        data = json.load(f)
        for tweet in data:
            if tweet['type'] != 'retweet':
                orig_data.append(tweet['orig_text'])
                ind_map.append(ind)
            ind += 1
            # orig_data.append(tweet['orig_text'])
    markers = [0 for _ in range(len(orig_data))]
    lsh = MinHashLSH(threshold=0.5, num_perm=128)
    minhashes = {}
    for c, i in enumerate(orig_data):
        # print(c)
        minhash = MinHash(num_perm=128)
        for d in ngrams(i, 5):
            minhash.update(''.join(d).encode('utf-8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash
    for i in range(len(minhashes.keys())):
        result = lsh.query(minhashes[i])
        if markers[i] == 2:
            continue
        markers[i] = 1
        for j in result:
            if markers[j] != 1:
                markers[j] = 2
    doc_set = set()
    similar_removed = [
        data[ind_map[ind]] for ind, val in enumerate(markers) if val != 2
    ]
    final = []
    identicals = 0
    for line in similar_removed:
        doc = ' '.join(line['text'])
        if doc in doc_set:
            identicals += 1
            continue
        doc_set.add(doc)
        final.append(line)
    print(identicals)
    print(len(final))
    with open(path / 'ProcessedSimilarRemoved.json', 'w') as f:
        json.dump(final, f)
def DIDsamplingLittle(dataset, BF, username, userid, attra_id, beta, clustdict):
    # dataset = Cora_labeled.objects.all()
    # clustdict = dextrapreclustering.minhashPreClustering(dataset)
    cluster_membership = {}
    # values = models.sigirCoraAttrValue.objects.filter(attr_id=attra_id)
    # attrasynonyms = models.sigirCoraValueSynonym.objects.filter(value_id__in=[value.id for value in values])
    # record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username, attrsynonym_id__in=[syn.id for syn in attrasynonyms])
    record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username, attrsynonym__value__attr_id=attra_id)
    record_noAttra = dataset.exclude(id__in=[item.cora_id for item in record_hasAttra])
    for k, v in clustdict.items():
        for d in v:
            cluster_membership[d] = k
    total = dataset.count()  # renamed from `sum` to avoid shadowing the builtin
    for record in dataset:
        # AC
        cora2ae = models.sigirCoraToAttrEntity.objects.filter(cora_id=record.id, user=username)
        if cora2ae:
            attr_ids = [item.attrsynonym.value.attr.id for item in cora2ae]  # renamed from `list`
            if attra_id in attr_ids:
                record.orderscore = 0
                record.save()
                continue
            else:
                ac = 1 - len(attr_ids) / models.sigirCoraAttr.objects.filter(userid=userid).count()
        else:
            ac = 1
        # distribution on dataset
        k = cluster_membership[record.id]
        ic = len(clustdict[k]) / record_noAttra.count()
        record_minhash = MinHash(num_perm=128)
        s = set(record.cleantext.split(" "))
        for d in s:
            record_minhash.update(d.encode('utf8'))
        term2sum = 0
        for rr in BF:
            rr_minhash = MinHash(num_perm=128)
            ss = set(dataset.get(id=rr).cleantext.split(" "))
            for dd in ss:
                rr_minhash.update(dd.encode('utf8'))
            sim = record_minhash.jaccard(rr_minhash)
            sim = (sim / total) ** beta
            term2sum = term2sum + sim
        did = ac * ic * term2sum
        record.orderscore = did
        record.save()
    return dataset
def read_observations(self, ifp):
    rd = csv.DictReader(ifp)
    count = 0
    for row in rd:
        if row == rd._fieldnames:
            continue
        count += 1
        if count % 100000 == 0:
            progress(str(count))
        location = ObsvLocation(
            canon_url_syntax(row["url"]).geturl(),
            row["country"], row["timestamp"], row["as.owner"], row["vpn"])
        flags = row["flags"]
        pld = row["payload"]
        key = flags + "|" + pld
        if key in self.tok_payloads:
            tp = self.tok_payloads[key]
        else:
            self.tok_payloads[key] = tp = tokenize_payload(flags, pld)
        if key in self.discarded:
            continue
        discard, m_content, m_structure = self.mcp.match(tp, location)
        if discard:
            self.discarded.add(key)
            continue
        self.locations[tp.structure].append(location)
        self.locations[tp.content].append(location)
        if m_content:
            self.m_content_t.add(tp.content)
            for m in m_content:
                self.m_content[m].add(tp.content)
        elif tp.content not in self.hashes:
            ch = MinHash(num_perm=128)
            for ct in tp.content_t:
                ch.update(ct.encode('utf-8'))
            self.hashes[tp.content] = (ch, len(tp.content_t))
        if m_structure:
            self.m_structure_t.add(tp.structure)
            for m in m_structure:
                self.m_structure[m].add(tp.structure)
        elif tp.structure not in self.hashes:
            sh = MinHash(num_perm=128)
            for st in tp.structure_t:
                sh.update(st.encode('utf-8'))
            self.hashes[tp.structure] = (sh, len(tp.structure_t))
    progress(str(count))
def similarity(self, other_doc, metric='jaccard', hash_method='minhash'):
    """Compute the similarity between two documents. Only minhash Jaccard
    similarity is implemented.

    >>> doc1 = Doc('Sentence for computing the minhash')
    >>> doc2 = Doc('Sentence for computing the similarity')
    >>> doc1.similarity(doc2)
    0.7265625
    """
    if hash_method == 'minhash' and metric == 'jaccard':
        hash1 = MinHash(hashvalues=self.minhash)
        hash2 = MinHash(hashvalues=other_doc.minhash)
        return hash1.jaccard(hash2)
    else:
        raise NotImplementedError(f'Metric/hash method combination {metric}'
                                  f'/{hash_method} is not implemented as similarity metric')
def query_sim(in_dir):
    js = json.load(codecs.open(in_dir, "r"))
    line = js["content_p"]
    seg_list = jieba.cut(line, cut_all=False)
    no_list = []
    for word in seg_list:
        if word not in stopword:
            no_list.append(word)
    mh = MinHash(num_perm=128)
    for word in no_list:
        mh.update(word.encode('utf8'))
    result = forest.query(mh, 1)
    return mh.jaccard(forest[result[0]])
def calc_hash(self, bytez):
    # Create a MinHashLSH index optimized for Jaccard threshold 0.9,
    # accepting MinHash objects with 128 permutation functions (the default).
    lsh = MinHashLSH(threshold=0.9, num_perm=128)
    # Generate MinHash objects.
    minhashes = {}
    for c, i in enumerate(bytez):
        min_hash = MinHash()
        for d in ngrams(i, 3):
            min_hash.update("".join(str(d)).encode("utf-8"))
        lsh.insert(c, min_hash)
        minhashes[c] = min_hash
    return minhashes
def query_candidates(doc, topn):
    lsh = load_lsh()
    minhash = MinHash(num_perm=128)
    content = convert_text(doc['content'])
    ngram = ngrams_token(remove_punctuation(content), 3)
    for gram in ngram:
        minhash.update(gram.encode('utf-8'))
    result = lsh.query(minhash, topn)
    print(doc['title'])
    if result:
        for item in result:
            doc = docs_col.find_one({"_id": ObjectId(str(item))})
            print(doc['title'])
        print("=====================")
    return result
def convert_str_to_minhash(digest):
    """Convert a comma-separated string of 128 hash values into a MinHash.

    Ex. "13241234,213242134,22342234,23423423,...,21341234" (128 numbers)
    """
    data_array = np.array(digest.split(","), dtype=np.uint64)
    m1 = MinHash(hashvalues=data_array)
    return m1
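# Hedged round-trip sketch (assumed usage, not from the original source): dump a
# MinHash's 128 hash values to a comma-separated string, then rebuild an
# equivalent MinHash with convert_str_to_minhash().
from datasketch import MinHash

m = MinHash(num_perm=128)
for token in "example text to hash".split():
    m.update(token.encode('utf8'))

digest_str = ",".join(str(v) for v in m.hashvalues)
m_restored = convert_str_to_minhash(digest_str)
assert m.jaccard(m_restored) == 1.0  # identical hash values -> estimated Jaccard of 1.0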
def MinHashFunc(df, num_perm, stop_words):
    """Take in a dataframe with the relevant columns and MinHash each row
    based on those columns.

    Return: dictionary of MinHash objects for all rows
    """
    row_hash = {}
    stemmer = WordNetLemmatizer()
    # Iterate through the rows, MinHash each one, then return the dictionary
    # of row indices to MinHash objects.
    for i, row in df.iterrows():
        row_hash[i] = MinHash(num_perm=num_perm)
        split_name = re.sub('[^A-Za-z0-9]+', ' ', row['name'].lower()).split()
        stop_words_remove = [w for w in split_name if w not in stop_words]
        name_stem = [stemmer.lemmatize(w) for w in stop_words_remove]
        name_comb = [''.join(w) for w in list(combinations(name_stem, 2))]
        split_add = re.sub('[^A-Za-z0-9]+', ' ', row['street_address'].lower()).split()
        row_values = name_stem + name_comb + split_add + [row['phone']] * 2 + [row['postal_code']]
        # print(row_values)
        for j in row_values:
            try:
                row_hash[i].update(j.encode('utf8'))
            except AttributeError:
                continue
    return row_hash
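# Hedged follow-up sketch (assumption, not part of the original source): index
# the per-row MinHashes returned by MinHashFunc in a MinHashLSH to surface
# candidate duplicate rows; num_perm must match the value used in MinHashFunc.
from datasketch import MinHashLSH

def candidate_duplicates(row_hash, threshold=0.7, num_perm=128):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for i, m in row_hash.items():
        lsh.insert(i, m)
    pairs = set()
    for i, m in row_hash.items():
        for j in lsh.query(m):
            if i != j:
                pairs.add(tuple(sorted((i, j))))
    return pairs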
def refer_query(lsh_forest, lsh, reference, grams):
    minhash = MinHash(num_perm=4000)
    reference = reference.replace(' ', '')
    for d in ngrams(reference, 3):
        minhash.update(''.join(d).encode('utf-8'))  # MinHash.update expects bytes
    query_result = lsh_forest.query(minhash, 1)
    query_result_thr = lsh.query(minhash)
    if query_result and query_result_thr:
        result = grams[query_result[0]]
        result_similar = [grams[item] for item in query_result_thr]
        if result in result_similar:
            return result
        else:
            return False
def test_deserialize_byteorder(self):
    for byteorder in "@=<>!":
        m1 = MinHash(10, 1, hashfunc=fake_hash_func)
        m1.update(123)
        lm1 = LeanMinHash(m1)
        buf = bytearray(lm1.bytesize(byteorder))
        lm1.serialize(buf, byteorder)

        # Test if we get back the exact same LeanMinHash objects after
        # deserializing from bytes
        lm1d = LeanMinHash.deserialize(buf, byteorder)
        self.assertEqual(lm1d.seed, lm1.seed)
        self.assertEqual(len(lm1d.hashvalues), len(lm1.hashvalues))
        self.assertTrue(
            all(hvd == hv for hv, hvd in zip(lm1.hashvalues, lm1d.hashvalues)))
def __init__(self, api_key=None, host_url=None, max_workers=1):
    """Initialize sixecho.

    Attributes:
        api_key(string)  - Optional : api_key generated from sixecho
        host_url(string) - Optional : the sixecho domain
    """
    self.api_key = api_key
    deepcut.tokenize("Welcome")  # warm up / load the tokenizer library
    if host_url is not None:
        if host_url.endswith("/"):
            host_url = host_url[:-1]
        self.host_url = host_url
    self.array_words = []
    self.min_hash = MinHash(num_perm=128)
    self.max_workers = max_workers
    self.sha256 = ""
def __init__(
    self,
    feature_length: int = None,
    config: Optional[PradoProjectorConfig] = None,
):
    super().__init__()
    if config is None:
        config = PradoProjectorConfig(feature_length=feature_length)
    self._config = copy.deepcopy(config)
    self._hashobj = MinHash(num_perm=self.n_permutations, hashfunc=farmhash.hash32)
    self._projection_operator = PradoProjectionOperator()
    self._vectorized_projection = np.vectorize(self.project, signature="()->(n)")
def lsh_predict_label(stems):
    '''
    Queries the LSH matcher and returns:
        0 if predicted spam
        1 if predicted ham
       -1 if parsing error
    '''
    minhash = MinHash(num_perm=128)
    if len(stems) < 2:
        return -1
    for s in stems:
        minhash.update(s.encode('utf-8'))
    matches = lsh.query(minhash)
    if matches:
        return 0
    else:
        return 1
def lsh_predict_label(stems, lsh):
    '''
    Return value of the LSH matcher:
        0  spam
        1  ham
       -1  error
    '''
    minhash = MinHash(num_perm=128)
    if len(stems) < 2:
        return -1
    for s in stems:
        minhash.update(s.encode('utf-8'))
    matches = lsh.query(minhash)
    if matches:
        return 0
    else:
        return 1
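# Hedged sketch (assumption, not from the original source) of how the `lsh`
# index queried by lsh_predict_label could be built: one MinHash per known-spam
# message, inserted into a MinHashLSH with the same num_perm (128).
from datasketch import MinHash, MinHashLSH

def build_spam_lsh(spam_stem_lists, threshold=0.8, num_perm=128):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for i, stems in enumerate(spam_stem_lists):
        m = MinHash(num_perm=num_perm)
        for s in stems:
            m.update(s.encode('utf-8'))
        lsh.insert('spam-{}'.format(i), m)
    return lsh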
def predict(tokens, database, perms, num_results, forest):
    start_time = time.time()
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf-8'))
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None
    # print(idx_array)
    # result = database[idx_array]
    print('took %s seconds to query forest' % (time.time() - start_time))
    return idx_array
def find_relation_class_name_matchings(network, kr_handlers):
    # Retrieve relation names
    st = time.time()
    names = []
    seen_sources = []
    for (db_name, source_name, _, _) in network.iterate_values():
        original_source_name = source_name
        if source_name not in seen_sources:
            seen_sources.append(source_name)  # mark source_name as seen
            source_name = nlp.camelcase_to_snakecase(source_name)
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            m = MinHash(num_perm=32)
            for token in source_name.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('relation', (db_name, original_source_name), m))
    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            m = MinHash(num_perm=32)
            for token in cl.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))

    # Index all the minhashes
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)
    for idx in range(len(names)):
        lsh_index.insert(idx, names[idx][2])

    matchings = []
    for idx in range(0, num_relations_inserted):  # compare only with classes
        N = lsh_index.query(names[idx][2])
        for n in N:
            kind_q = names[idx][0]
            kind_n = names[n][0]
            if kind_n != kind_q:
                # match format is (db_name, source_name, field_name) -> class_name
                match = ((names[idx][1][0], names[idx][1][1], "_"), names[n][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None  # if your query is empty, return None
    result = database.iloc[idx_array]
    print('It took %s seconds to query forest.' % (time.time() - start_time))
    return result
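# Hedged sketch (assumption, not from the original source) of the forest that
# predict() expects: one MinHash per document, keyed by its positional index so
# database.iloc[...] works, then indexed. preprocess() is the same tokenizer
# used by predict() above; text_column is an assumed column name.
from datasketch import MinHash, MinHashLSHForest

def build_forest(database, perms=128, text_column='text'):
    forest = MinHashLSHForest(num_perm=perms)
    for pos, text in enumerate(database[text_column]):
        m = MinHash(num_perm=perms)
        for token in preprocess(text):
            m.update(token.encode('utf8'))
        forest.add(pos, m)
    forest.index()
    return forest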
def _index_records(self, records):
    """Construct MinHash LSH buckets for a given set of records.

    Args:
        records (dict): dict of (record_id -> record_value)

    Returns:
        None
    """
    indexer = defaultdict(list)

    # Create minhashes
    minhashes = {}
    for rid in records:
        m = MinHash(num_perm=self._num_perm)
        for d in records[rid]:
            qgrams = set(self.nt.basic(d, 2))
            for gram in qgrams:
                m.update(gram.encode('utf-8'))
        minhashes[rid] = m

    # Create LSH instance and add min hashes
    if self._bands == MinHashLSHRecordDeduplication.BANDS and self._rows == MinHashLSHRecordDeduplication.ROWS:
        lsh = MinHashLSH(threshold=self._threshold, num_perm=self._num_perm)
    else:
        lsh = MinHashLSH(num_perm=self._num_perm, params=(self._bands, self._rows))

    max_blocks = []
    for rid in records:
        lsh.insert(rid, minhashes[rid])
        max_blocks.append(rid)

    # Generate blocks
    while len(max_blocks) > 0:
        key = max_blocks[0]
        bucket = lsh.query(minhashes[key])
        for rid in bucket:
            if rid in max_blocks:
                max_blocks.remove(rid)
                indexer["b" + str(self._block_index)].append(rid)
        self._block_index += 1

    self._write_indexer(indexer)
def retrieve_class_names(kr_handlers, num_perm=32):
    names = list()
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            m = MinHash(num_perm=num_perm)
            for token in cl.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))
    return names
def main() -> None:
    minhashes = []
    files = []
    for iterator in tqdm(range(config.COUNT_UNQ_MHS), desc="Generate minHashes:"):
        minhash = MinHash(num_perm=256)
        file = []
        for _ in range(200):
            rand_string = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(5))
            file.append(rand_string)
        files.append(file)
        minhash.update_batch([s.encode('utf-8') for s in file])
        minhashes.append(("key" + str(iterator), minhash))

    lsh = MinHashLSH(threshold=0.5, num_perm=256,
                     storage_config={
                         'type': 'cassandra',
                         'basename': b'perftest',
                         'cassandra': {
                             'seeds': ['127.0.0.1'],
                             'keyspace': config.KEY_SPACE,
                             'replication': {
                                 'class': 'SimpleStrategy',
                                 'replication_factor': '1',
                             },
                             'drop_keyspace': False,
                             'drop_tables': False,
                         }
                     })

    for _ in tqdm(range(1), desc="Insert 100 minHashes:"):
        with lsh.insertion_session(buffer_size=100) as session:
            for key, minhash in minhashes:
                session.insert(key, minhash)

    f_disc_mhs = open('minhashes.txt', 'w+')
    for minhash in tqdm(minhashes, desc="Log minHashes:"):
        log(f_disc_mhs, minhash[0], minhash[1].digest())
    f_disc_mhs.close()

    f_disc_files = open('files.txt', 'w+')
    for iterator in tqdm(range(len(files)), desc="Log files:"):
        log(f_disc_files, minhashes[iterator][0], files[iterator])
    f_disc_files.close()  # was f_disc_mhs.close(): close the files log, not the minhash log again
def __train_LSH(self, data):
    start_time = time.time()
    forest = MinHashLSHForest(num_perm=config.permutations)
    for item in tqdm(data, desc="MinHash Docs.."):
        tag = item['tag']
        tokens = item['data']
        if self.type == 'trigram':
            tokens = self.normalizer.generate_ngrams_char(tokens[0])
        m = MinHash(num_perm=config.permutations)
        for s in tokens:
            m.update(s.encode('utf8'))
        forest.add(tag, m)
    forest.index()
    print('It took %.2f seconds to build forest.' % (time.time() - start_time))
    return forest
def test_update(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    try:
        lm1 = LeanMinHash(m1)
        lm1.update(12)
    except TypeError:
        pass
    else:
        raise Exception
def tokenize_method(audio):
    feature = audio.features[feature_name]
    pace = 20
    offset = 70
    blocks = zip(*[feature[i:] for i in range(shingle_size)])
    prewords = [[
        chr(int(i * pace) + offset) for i in normalize(np.array(bl))
    ] for bl in blocks]
    a = []
    if use_minhash:
        for preword in prewords:
            m = MinHash(num_perm=min_hash_fns)
            m.update(' '.join(preword).encode('utf-8'))
            tx = ''.join([str(c) for c in m.hashvalues])
            a.append(hashlib.md5(tx.encode('utf-8')).hexdigest())
        return ' '.join(a)
    else:
        return ' '.join([''.join(p) for p in prewords])
def add_untopic_doc(in_path, file_id):
    js = json.load(codecs.open(in_path, "r"))
    line = js["content_full_text"]
    seg_list = basic_preprocess(line, "utf8", True, True)
    # seg_list = jieba.cut(line, cut_all=False)
    # seg_list = stemmer_by_porter(seg_list)
    no_list = []
    for word in seg_list:
        if (word not in stopword) and (len(word) > 1):
            no_list.append(word)
    mh = MinHash(num_perm=128)
    for word in no_list:
        mh.update(word.encode('utf8'))
    lsh.insert(file_id, mh)
def toBuildLSH(cleanSongs):
    '''
    :param cleanSongs:
    :return: forest, min_hash_list
    '''
    forest = MinHashLSHForest(num_perm=128)
    min_hash_list = []
    for songIndex, song in enumerate(cleanSongs):
        minhash = MinHash(num_perm=128)
        for word in song:
            # encode each word
            minhash.update(word.encode('utf8'))
        # add each song's minhash to the forest as well as min_hash_list
        forest.add(str(songIndex), minhash)
        min_hash_list.append(minhash)
    forest.index()
    return forest, min_hash_list
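# Hedged query sketch (assumption, not from the original source): look up the
# songs most similar to a new tokenized song in the forest built above.
from datasketch import MinHash

def querySimilarSongs(forest, querySongWords, topK=5):
    minhash = MinHash(num_perm=128)
    for word in querySongWords:
        minhash.update(word.encode('utf8'))
    return forest.query(minhash, topK)  # returns up to topK song indices (as strings)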
def test_update(self):
    m1 = MinHash(4, 1, hashobj=FakeHash)
    try:
        lm1 = LeanMinHash(m1)
        lm1.update(12)
    except TypeError:
        pass
    else:
        raise Exception
def _get_raw_class_matches(self, class_name, lsh):
    if not self._classes_signatures:
        self.get_classes_signatures()
    class_signatures = self._classes_signatures[class_name]
    if class_signatures:
        self._lsh_classes.update([class_name])
        m = MinHash(num_perm=config.LSH_PERM_NUM)
        for signature in class_signatures:
            m.update(signature.encode('utf8'))
        matches = lsh.query(m, len(class_signatures))
        return set(matches)
    else:
        return set()
def lsh_clustering(
    signatures: List[np.ndarray],
    threshold: float = 0.5,
    num_perm: int = 128,
):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    with lsh.insertion_session() as session:
        for key, minhash in enumerate(signatures):
            session.insert(f"id-{key}", MinHash(num_perm=num_perm, hashvalues=minhash))

    neighbors: List[List[int]] = []
    for key, minhash in enumerate(signatures):
        result = lsh.query(MinHash(num_perm=num_perm, hashvalues=minhash))
        neighbors.append([int(x.split("-")[1]) for x in result])

    return neighbors
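# Hedged usage sketch (sample documents made up): build one hashvalues array per
# document with MinHash, then group near-duplicates with lsh_clustering() above.
from datasketch import MinHash

docs = ["minhash lsh near duplicate detection",
        "minhash lsh near duplicate detector",
        "something else entirely"]
signatures = []
for doc in docs:
    m = MinHash(num_perm=128)
    for token in doc.split():
        m.update(token.encode('utf-8'))
    signatures.append(m.hashvalues)

print(lsh_clustering(signatures))  # e.g. [[0, 1], [0, 1], [2]]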
def extract_attribute(self, base_object: BDFunction) -> int:
    # Check if value already exists
    FunctionMinHashLSH_value = base_object.get_attribute_value('FunctionMinHashLSH')

    if not FunctionMinHashLSH_value:
        normalized_instr_set: set = set(base_object.get_attribute_value('FunctionNormalized'))

        # Create MinHash object
        minhash = MinHash(num_perm=Configuration.MINHASH_PERMUTATIONS,
                          seed=Configuration.MINHASH_SEED)
        for instr in normalized_instr_set:
            minhash.update(instr.encode('utf8'))

        base_object.add_attribute_value('FunctionMinHashLSH', {'function_lsh': minhash.digest()})
        FunctionMinHashLSH_value = base_object.get_attribute_value('FunctionMinHashLSH')

    return FunctionMinHashLSH_value['function_lsh'] if FunctionMinHashLSH_value else None
def get_min_hash(self, x):
    """Create a MinHash object for the input example string using w-shingling.

    Parameters:
        x - A list of strings representing an example.

    Returns:
        A datasketch.MinHash object updated with the generated w-shingles.
    """
    min_hash = MinHash(num_perm=self.num_perm, seed=self.random_state)
    # accumulate all shingles extracted from each string
    for x_str in x:
        # map string x_str to a set of shingles
        x_shingles = MinHashNearestNeighbor.get_w_shingles(x_str, self.w)
        for shingle in x_shingles:
            min_hash.update(shingle)
    return min_hash
def _hello_world():
    """This fragment was taken from the datasketch GitHub page:
    https://github.com/ekzhu/datasketch
    """
    data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
             'estimating', 'the', 'similarity', 'between', 'datasets']
    data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
             'estimating', 'the', 'similarity', 'between', 'documents']

    m1, m2 = MinHash(), MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def minhash_str(in_str, perms, gram_sz):
    minhash = MinHash(num_perm=perms)
    for d in ngrams(in_str, gram_sz):
        minhash.update("".join(d).encode('utf-8'))
    return minhash
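# Hedged usage sketch (example strings are made up): estimate the Jaccard
# similarity of two strings from their character 3-gram MinHashes.
from datasketch import MinHash
from nltk import ngrams

m_a = minhash_str("the quick brown fox", perms=128, gram_sz=3)
m_b = minhash_str("the quick brown dog", perms=128, gram_sz=3)
print(m_a.jaccard(m_b))  # approximates the true 3-gram Jaccard similarity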
if args.header:
    next(f)  # TODO test robustness
# mycorpus = [(i, set(line.encode('utf8', 'ignore').lower().split())) for i, line in enumerate(f)]
mycorpus = [(i, set(line.lower().split())) for i, line in enumerate(f)]
print(("--- %s seconds ---" % (time.time() - start_time)))

print('Calculate minhash signatures')
start_time = time.time()
# prepare dictionary of hashes
hashcorp = dict.fromkeys([tup[0] for tup in mycorpus])
# compute hashes
for key, doc in mycorpus:
    # compute minhash signature
    m = MinHash(num_perm=num_permutations)
    for token in doc:
        m.digest(sha1(token))
    hashcorp[key] = m
print(("--- %s seconds ---" % (time.time() - start_time)))

if num_processes > 1:
    if len(thresholds) < num_processes:
        num_processes = len(thresholds)
    p = Pool(num_processes)
    assignment = [(x,) for x in thresholds]
    p.map(compute_clusters, assignment)
else:
    for x in thresholds:
        compute_clusters((x,))
newSentence = []
for i in range(num_sentences):
    newSentence.append(model.getSentence(word_to_index, index_to_word))
# print(len(newSentence))
# print(newSentence)

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

for sen in newSentence:
    data1 = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(sen)
             if token.lower().strip(string.punctuation) not in stopwords]
    f = open('data/data.csv', 'r')  # open in text mode so tokenization works on str, not bytes
    for line in f:
        data2 = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(line)
                 if token.lower().strip(string.punctuation) not in stopwords]
        m1, m2 = MinHash(), MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        # print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
        if actual_jaccard > 0.3:
            print("Actual Jaccard for data1 and data2 is", actual_jaccard)
            print(sen)
            print(line)
def query(self, v, n):
    m = MinHash(num_perm=self._n_perm)
    for e in v:
        m.update(str(e).encode('utf8'))  # MinHash.update expects bytes
    return map(int, self._index.query(m, n))
def get_min_hash(text, too_common, num_perm=128):
    min_hash = MinHash(num_perm=num_perm)
    for shingle_h in shingle_hashes(text):
        if shingle_h.hexdigest() not in too_common:
            min_hash.digest(shingle_h)
    return min_hash