def _get_forest(self, data, perms):
    # START Time
    self.START_TIME = time.time()
    minhash_list = []
    for text in data['text']:
        min_hashtext = _create_hashtex(text=text, perms=perms, language=self.LANGUAGE)
        minhash_list.append(min_hashtext)
    forest = MinHashLSHForest(num_perm=perms)
    for item_index, list_item in enumerate(minhash_list):
        forest.add(item_index, list_item)
    forest.index()
    # END Time
    self.END_TIME = time.time()
    # TIMING LIST
    self.TIMING = [self.END_TIME, self.START_TIME]
    print('It took %s seconds to build forest.' % (calculate_duration(self.TIMING)))
    return forest
def mylshforest(corpus):
    # print(len(corpus))
    forest = MinHashLSHForest(num_perm=32)
    score_res = [0]
    mh = []
    for i in range(len(corpus) - 1):
        doc = corpus[i]
        doc2 = corpus[i + 1]
        m = MinHash(num_perm=32)
        for d in doc:
            m.update(d.encode('utf8'))
        forest.add(str(i), m)
        forest.index()
        mh.append(m)
        m2 = MinHash(num_perm=32)
        for d in doc2:
            m2.update(d.encode('utf8'))
        result = forest.query(m2, 10)
        score = 0.0
        for j in range(len(result)):
            score = score + m2.jaccard(mh[int(result[j])])
        if len(result) > 0:
            score = score / len(result)
        score_res.append(score)
    return score_res
class MinHas(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError(
                "Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        print(self._index.query(m, n))
        return map(int, self._index.query(m, n))
def search_lshforest_jaccard_topk(index_data, query_data, b, r, k):
    (index_sets, index_keys, index_minhashes) = index_data
    (query_sets, query_keys, query_minhashes) = query_data
    num_perm = b * r
    print("Building LSH Forest Index.")
    start = time.perf_counter()
    index = MinHashLSHForest(num_perm=num_perm, l=b)
    # Use the indices of the indexed sets as keys in LSH.
    for i in range(len(index_keys)):
        index.add(i, index_minhashes[num_perm][i])
    index.index()
    end = time.perf_counter()
    print("Indexing time: {:.3f}.".format(end - start))
    print("Querying.")
    times = []
    results = []
    for query_minhash, query_key, query_set in \
            zip(query_minhashes[num_perm], query_keys, query_sets):
        start = time.perf_counter()
        result = index.query(query_minhash, k * 2)
        # Recover the retrieved indexed sets and
        # compute the exact Jaccard similarities.
        result = [[index_keys[i], compute_jaccard(query_set, index_sets[i])]
                  for i in result]
        # Sort by similarity.
        result.sort(key=lambda x: x[1], reverse=True)
        # Take the top k.
        result = result[:k]
        duration = time.perf_counter() - start
        times.append(duration)
        results.append((query_key, result))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
def getMinhashforest2(minhashs):
    # Create a MinHash LSH Forest with the same num_perm parameter
    forest = MinHashLSHForest(num_perm=128)
    for i in range(len(minhashs)):
        # Add each precomputed MinHash into the index
        forest.add(i, minhashs[i])
    # IMPORTANT: must call index() otherwise the keys won't be searchable
    forest.index()
    return forest
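# A minimal usage sketch for getMinhashforest2 above. The toy documents and
# variable names here are illustrative assumptions, not part of the original.
from datasketch import MinHash

docs = [["minhash", "lsh", "forest"], ["lsh", "forest", "query"], ["totally", "different", "tokens"]]
minhashs = []
for doc in docs:
    m = MinHash(num_perm=128)  # must match the forest's num_perm
    for token in doc:
        m.update(token.encode('utf8'))
    minhashs.append(m)

forest = getMinhashforest2(minhashs)
# Query with the first document's MinHash; returns up to 3 indexed keys.
print("Top candidates:", forest.query(minhashs[0], 3))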
def build_lsh_forest_hash(game_data):
    forest = MinHashLSHForest(num_perm=_utils.HASH_REZ)
    for ind, row in game_data.iterrows():
        try:
            forest.add(f"{row['title']} (id:{row['id']})", row['_sim_hash'])
        except ValueError:
            print(f"{row['title']} already added")
        except:
            raise
    forest.index()
    return forest
def create_LSH_Forest():
    global forest
    if os.path.isfile(LSH_FOREST_FILE):
        load_forest()
    else:
        forest = MinHashLSHForest(num_perm=128)
        train_records = glob.glob("dataset/train*.tfrecord")
        validate_records = glob.glob("dataset/validate*.tfrecord")
        all_records = train_records + validate_records
        dataset = tf.data.TFRecordDataset(all_records)
        iterator = dataset.make_one_shot_iterator()
        count = 0
        next_element = iterator.get_next()
        updated = False
        with tf.Session() as sess:
            try:
                while True:
                    if count % 10000 == 0:
                        print "[SimpleVideoSearch][{}] Processed {} records from the dataset so far".format(
                            datetime.now(), count)
                    if updated and count % 100000 == 0:
                        with open(LSH_FOREST_FILE, 'wb') as forest_file:
                            forest.index()
                            pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
                            print "[SimpleVideoSearch][{}] Updated LSH Forest file".format(
                                datetime.now(), count)
                    exampleBinaryString = sess.run(next_element)
                    example = tf.train.Example.FromString(exampleBinaryString)
                    count += 1
                    example_id = example.features.feature["id"].bytes_list.value[0]
                    if example_id not in forest:
                        if not updated:
                            updated = True
                            print '[SimpleVideoSearch][{}] First update at record {}'.format(
                                datetime.now(), count)
                        dataset_labels_full = convert_dataset_labels_to_list(
                            example.features.feature["labels"].int64_list.value)
                        minhash = MinHash(num_perm=128)
                        for label in dataset_labels_full:
                            minhash.update(label)
                        forest.add(example_id, minhash)
            except tf.errors.OutOfRangeError:
                print "[SimpleVideoSearch][{}] Done iterating through dataset".format(
                    datetime.now())
            finally:
                print "[SimpleVideoSearch][{}] Processed {} records from the dataset".format(
                    datetime.now(), count)
                forest.index()
                with open(LSH_FOREST_FILE, 'wb') as forest_file:
                    pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
                print "[SimpleVideoSearch][{}] Finished creating LSH Forest file".format(
                    datetime.now(), count)
def construct_lsh(obj_dict):
    forest = MinHashLSHForest(num_perm=128)
    keys = list(obj_dict.keys())
    values = list(obj_dict.values())
    ms = []
    for i in range(len(keys)):
        temp = MinHash(num_perm=128)
        for d in values[i]:
            temp.update(d.encode('utf8'))
        ms.append(temp)
        forest.add(keys[i], temp)
    forest.index()
    return forest, keys, ms
def target_lsh(grams):
    lsh_forest = MinHashLSHForest(num_perm=4000, l=200)
    lsh = MinHashLSH(threshold=0.5, num_perm=4000)
    # minhashes = {}
    for c, i in enumerate(grams):
        minhash = MinHash(num_perm=4000)
        i = i.replace(' ', '')
        for d in ngrams(i, 3):
            minhash.update(''.join(d).encode('utf8'))
        lsh_forest.add(c, minhash)
        lsh_forest.index()
        lsh.insert(c, minhash)
    return lsh_forest, lsh
def store_lsh():
    forest = MinHashLSHForest(num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    for item in documents_en:
        minhash = MinHash(num_perm=128)
        ngrams = ngrams_token(remove_punctuation(item['content']), 3)
        for ngram in ngrams:
            minhash.update(ngram.encode("utf-8"))
        forest.add(str(item["_id"]), minhash)
    forest.index()
    ouf = open('pickle_ngram.txt', 'wb')
    cPickle.dump(forest, ouf)
    ouf.close()
    return forest
def build_lsh_forest(self, company_name_column_name):
    """
    Build the LSH forest data structure from the sets of parsed description words for each company

    Parameters:
        company_name_column_name - string; name of the company name column in the company corpus dataframe
    """
    # Note: num_perm is a tuning parameter, but has been abstracted away for simplicity.
    # 256 has been found to be a good amount. Increasing it may increase accuracy,
    # but will decrease speed and increase memory usage. Decreasing will decrease accuracy.
    lsh_forest = MinHashLSHForest(num_perm=256)
    iteration = 1
    self.company_name_column_name = company_name_column_name
    self.name_to_index_map = dict(
        zip(self.company_corpus.corpus.loc[:, company_name_column_name],
            self.company_corpus.corpus.index))
    self.index_to_name_map = dict(
        zip(self.company_corpus.corpus.index,
            self.company_corpus.corpus.loc[:, company_name_column_name]))
    sys.stdout.write("Performing LSH...")
    for company in self.company_corpus.corpus.iterrows():
        # Utilize the 'datasketch' library to minhash the company descriptions
        # and hash them into the LSH forest
        company_name = company[1][company_name_column_name]
        if company_name in self.dict_of_minhash_keys:
            continue
        mh = MinHash(num_perm=256)
        if type(company[1]['rare_words']) is float:
            mh.update(str(company[1]['rare_words']).encode('utf8'))
        else:
            for word in company[1]['rare_words']:
                mh.update(str(word).encode('utf8'))
        self.dict_of_minhash_keys[company_name] = mh
        lsh_forest.add(company_name, mh)
        iteration += 1
    sys.stdout.write('\n')
    sys.stdout.write("Done performing LSH!\n")
    # Need this line below to be able to query the LSH forest!
    # (See the datasketch docs on LSH Forest for reasoning)
    lsh_forest.index()
    self.lsh_forest = lsh_forest
def __train_LSH(self, data):
    start_time = time.time()
    forest = MinHashLSHForest(num_perm=config.permutations)
    for item in tqdm(data, desc="MinHash Docs.."):
        tag = item['tag']
        tokens = item['data']
        if self.type == 'trigram':
            tokens = self.normalizer.generate_ngrams_char(tokens[0])
        m = MinHash(num_perm=config.permutations)
        for s in tokens:
            m.update(s.encode('utf8'))
        forest.add(tag, m)
    forest.index()
    print('It took %.2f seconds to build forest.' % (time.time() - start_time))
    return forest
def build_lsh_forest(self, company_name_column_name):
    # Note: num_perm is a tuning parameter, but has been abstracted away for simplicity.
    # 256 has been found to be a good amount. Increasing it may increase accuracy,
    # but will decrease speed and increase memory usage. Decreasing will decrease accuracy.
    lsh_forest = MinHashLSHForest(num_perm=256)
    iteration = 0
    self.company_name_column_name = company_name_column_name
    self.name_to_index_map = dict(
        zip(self.company_corpus.corpus.loc[:, company_name_column_name],
            self.company_corpus.corpus.index))
    self.index_to_name_map = dict(
        zip(self.company_corpus.corpus.index,
            self.company_corpus.corpus.loc[:, company_name_column_name]))
    graph_size = self.company_corpus.corpus.shape[0]
    for company in self.company_corpus.corpus.iterrows():
        company_name = company[1][company_name_column_name]
        if company_name in self.dict_of_minhash_keys:
            continue
        mh = MinHash(num_perm=256)
        if type(company[1]['rare_words']) is float:
            mh.update(str(company[1]['rare_words']).encode('utf8'))
        else:
            for word in company[1]['rare_words']:
                mh.update(str(word).encode('utf8'))
        self.dict_of_minhash_keys[company_name] = mh
        lsh_forest.add(company_name, mh)
        if iteration % 10000 == 0 or (iteration + 1) == graph_size:
            if (iteration + 1) == graph_size:
                iteration += 1
            sys.stdout.write('\r')
            sys.stdout.write(
                "LSH Forest Build Percent Complete: {0:0.2f}%".format(
                    round((iteration / graph_size) * 100)))
            sys.stdout.flush()
        iteration += 1
    sys.stdout.write('\n')
    # Need this line below !!!!
    lsh_forest.index()
    self.lsh_forest = lsh_forest
def toBuildLSH(cleanSongs):
    '''
    :param cleanSongs
    :return: forest, min_hash_list
    '''
    forest = MinHashLSHForest(num_perm=128)
    min_hash_list = []
    for songIndex, song in enumerate(cleanSongs):
        minhash = MinHash(num_perm=128)
        for word in song:
            ### encoding each word
            minhash.update(word.encode('utf8'))
        ### add each song's minhash to the forest as well as min_hash_list
        forest.add(str(songIndex), minhash)
        min_hash_list.append(minhash)
    forest.index()
    return forest, min_hash_list
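# A hedged follow-up sketch for toBuildLSH above: query the forest for candidate
# songs, then re-rank the candidates by exact MinHash Jaccard using min_hash_list.
# The toy cleanSongs data and the query tokens are illustrative assumptions.
from datasketch import MinHash

cleanSongs = [
    ["love", "me", "do"],
    ["let", "it", "be"],
    ["hey", "jude", "refrain"],
]
forest, min_hash_list = toBuildLSH(cleanSongs)

query_minhash = MinHash(num_perm=128)  # must match the forest's num_perm
for word in ["let", "it", "go"]:
    query_minhash.update(word.encode('utf8'))

candidates = forest.query(query_minhash, 10)  # keys were stored as str(songIndex)
ranked = sorted(
    ((key, query_minhash.jaccard(min_hash_list[int(key)])) for key in candidates),
    key=lambda pair: pair[1],
    reverse=True,
)
print(ranked)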
def get_forest(self, data, perms):
    minhash = []
    for text in data['err']:
        tokens = self.preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
    forest = MinHashLSHForest(num_perm=perms)
    for i, m in enumerate(minhash):
        forest.add(i, m)
    forest.index()
    return forest
def form_lsh(self):
    minhash = []
    for s in self.__items:
        m = MinHash(num_perm=256)
        for q in s:
            m.update(q.encode('utf8'))
        minhash.append(m)
    forest = MinHashLSHForest(num_perm=256)
    for i, m in enumerate(minhash):
        forest.add(i, m)
    forest.index()
    self.__forest = forest
    self.__hashlist = minhash
    return forest
def benchmark_lshforest(num_perm, l, k, index_data, query_data):
    print("Building LSH Forest index")
    forest = MinHashLSHForest(num_perm=num_perm, l=l)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        forest.add(key, minhash)
    forest.index()
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()
        result = forest.query(minhash, k)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_jaccard(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1],
                   reverse=True))
    return times, results
def build_lsh_forest(columns, override=False):
    """
    Builds a MinHash LSH forest which can be used to query top-k columns
    with maximum Jaccard similarity.

    @param override: rebuild and re-pickle the forest even if a cached one exists
    @param columns: iterable of column dicts with 'table', 'column' and a serialized MinHash
    @return: the (possibly cached) MinHashLSHForest
    """
    file_path = f'{os.environ["WORKING_DIRECTORY"]}/results/forest.obj'
    if override or not os.path.isfile(file_path):
        forest = MinHashLSHForest(num_perm=NUM_PERM)
        for column in columns:
            forest.add(f'{column["table"]}.{column["column"]}', deserialize_minhash(column))
        forest.index()
        with open(file_path, 'wb') as file:
            pickle.dump(forest, file)
        return forest
    with open(file_path, 'rb') as file:
        forest = pickle.load(file)
    return forest
def get_forest(records, perms):
    start_time = time.time()
    minhash = []
    for record in records:
        for text in record:
            tokens = preprocess(text)
            m = MinHash(num_perm=perms)
            for s in tokens:
                m.update(s.encode('utf8'))
            minhash.append(m)
    forest = MinHashLSHForest(num_perm=perms)
    print(forest)
    for i, m in enumerate(minhash):
        forest.add(i, m)
    forest.index()
    print('It took %s seconds to build forest.' % (time.time() - start_time))
    return forest
def __datasketch_fit(self):
    if self.kwargs['create']:
        # Create a list of MinHash objects
        min_hash_obj_list = []
        forest = MinHashLSHForest(num_perm=self.kwargs['num_perm'])
        for i in range(len(self.features)):
            min_hash_obj_list.append(MinHash(num_perm=self.kwargs['num_perm']))
            for d in self.features[i]:
                min_hash_obj_list[i].update(d)
            forest.add(i, min_hash_obj_list[i])
        # IMPORTANT: must call index() otherwise the keys won't be searchable
        forest.index()
        with open(self.kwargs['file_path'], "wb") as f:
            pickle.dump(forest, f)
            pickle.dump(min_hash_obj_list, f)
        self.predictor = [forest, min_hash_obj_list]
    else:
        with open(self.kwargs['file_path'], "rb") as f:
            forest = pickle.load(f)
            min_hash_obj_list = pickle.load(f)
        self.predictor = [forest, min_hash_obj_list]
def get_forest(data, perms):
    start_time = time.time()
    minhash = []
    for text in data:
        tokens = p.preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf-8'))
        minhash.append(m)
    forest = MinHashLSHForest(num_perm=perms)
    for i, m in enumerate(minhash):
        forest.add(i, m)
    forest.index()
    print('time to build forest: ', (time.time() - start_time))
    return forest
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        return map(int, self._index.query(m, n))
'the', 'similarity', 'between', 'documents']

dataset = [[0., 0., 0.],
           [0., 0., 1.],
           [0., 1., 0.],
           [0., 1., 1.]]

# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

for i, data in enumerate(dataset):
    m = MinHash(num_perm=128)
    for d in data:
        m.update(str(d).encode('utf8'))
    forest.add(str(i), m)

pickle.dump(forest, open('forest.lsh', 'wb'))
del forest
forest = pickle.load(open('forest.lsh', 'rb'))
# IMPORTANT: must call index() otherwise the keys won't be searchable
forest.index()

# Check for membership using the key
print("1" in forest)
print("2" in forest)

m = MinHash(num_perm=128)
for d in dataset[0]:
    m.update(str(d).encode('utf8'))

# Using m as the query, retrieve up to the top 10 keys that have the highest Jaccard
result = forest.query(m, 10)
print("Top 10 candidates", result)
def main():
    corpus = {}
    with open('corpus_data/preprocessedf_corpus.json') as file:
        corpus = json.loads(file.read().encode('Utf-8'))

    def processLyrics(lyrics):
        authors = {}
        for author in lyrics:
            for song in lyrics[author]:
                lyric = re.sub(r'\[[^>]+\]', '', song["lyrics"])
                lyric = re.sub(r'\([^>]+\)', '', lyric)
                lyric = re.sub(r'\{[^>]+\}', '', lyric)
                lyric = lyric.split(r'\s')
                for line in lyric:
                    line = re.sub(r'\n', ' ', line)
                    if author not in authors:
                        authors[author] = line
                    else:
                        authors[author] += line
        return authors

    import nltk
    from nltk.corpus import stopwords
    from collections import defaultdict
    from collections import Counter
    nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    from nltk import word_tokenize

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [
            t for t in tokenized_text
            if re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)
        ]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [
                token for token in cleaned_text if token not in PROFANITY
            ]
        return cleaned_text

    STOPWORDS = set(stopwords.words('english'))
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
    PROFANITY = set(infile)

    corpus = processLyrics(corpus)
    for author, text in corpus.items():
        corpus[author] = clean_text(text, sys.argv[1])

    artist_shingle = defaultdict(list)
    for artist, lyrics in corpus.items():
        # tokens = [w for w in tokens if not w in sw]
        # shingle3 = set([tuple(tokens[i:i+3]) for i in range(len(tokens) - 3 + 1) if len(tokens[i]) < 10])
        # shingle2 = set([tuple(tokens[i:i+2]) for i in range(len(tokens) - 2 + 1) if len(tokens[i]) < 10])
        shingle1 = lyrics  # set([tokens[i] for i in range(len(tokens) - 1 + 1) if len(tokens[i]) < 4])
        artist_shingle[artist].append(shingle1)
        # artist_shingle[artist].append(shingle2)
        # artist_shingle[artist].append(shingle3)

    from datasketch import MinHashLSHForest, MinHash
    from sklearn.metrics import jaccard_similarity_score

    listlsh = []
    lsh = MinHashLSHForest(num_perm=128)
    for artist, sets in artist_shingle.items():
        a = MinHash(num_perm=128)
        for d in sets[0]:
            a.update(d.encode('utf8'))
        listlsh.append(a)
        lsh.add(artist, a)
    lsh.index()

    m1 = MinHash(num_perm=128)
    g = []
    with open(sys.argv[2]) as g:
        g = g.read()
        g = g.split()
    for d in g:
        m1.update(d.encode('utf8'))
    result = lsh.query(m1, 5)
    print(" (Up to) Top 5 candidates", result)
def printStats(json_filename):
    with open(json_filename) as json_data:
        d = json.load(json_data)

    # Query simple index: queryNum -> queryText
    queryIndex = {}
    # Index of queries as a LSH forest for top-k similar queries.
    queriesLSHIndex = MinHashLSHForest(num_perm=128)

    # You can grok the CSV from stdout by using cut, e.g.,
    #
    # $ python analyzer.py -i ../../data/queries_ASTs.json | grep "csv:" | cut -d':' -f2 > /tmp/out.csv
    print 'csv:"queryNum","numExplicitJoins","referencedTables","groupByColumns","numGroupByClauses"'

    for queryNum, entry in enumerate(d):
        print '\n=> Stats for query number \"%s:\"' % queryNum

        # Group by clauses.
        groupByColumns = jmespath.search(
            'ast.statement[*].group.expression[*].name[]', entry)
        print 'groupBy columns: %s' % groupByColumns

        # Base tables when the query has no joins.
        baseTables = jmespath.search(
            'ast.statement[?from.variant == \'table\'].from.name[]', entry)
        print 'baseTables: %s' % baseTables

        # Base tables when the query has joins.
        baseTables += jmespath.search(
            'ast.statement[?from.variant == \'join\'].from.source.name[]', entry)
        print 'baseTables (with joins): %s' % baseTables

        # Join tables.
        joinTables = jmespath.search(
            'ast.statement[?from.variant == \'join\'].from.map[*].source.name[]', entry)
        print 'joinTables: %s' % joinTables

        # All tables mentioned in the query.
        referencedTables = baseTables + joinTables

        # Joins.
        joinPathPrefix = 'ast.statement[*].from.map[*].constraint.on'
        joinsLeft = jmespath.search(joinPathPrefix + '.left.name', entry)
        joinsRight = jmespath.search(joinPathPrefix + '.right.name', entry)
        print 'explicit joins (left-hand side): %s' % joinsLeft
        print 'explicit joins (right-hand side): %s' % joinsRight

        # Text
        queryText = jmespath.search('queryText', entry)
        # Index it into an LSH forest for top-k textually similar queries.
        queryLSH = getQueryMinHash(queryText)
        queryIndex[queryNum] = {
            'queryText': queryText,
            'queryLSH': queryLSH
        }
        queriesLSHIndex.add(queryNum, queryLSH)

        # Sort for a prettier CSV dump.
        referencedTables.sort()
        groupByColumns.sort()

        # CSV header:
        # queryNum,numExplicitJoins,referencedTables,groupByColumns,numGroupByColumns
        print 'queryNum = %s' % queryNum
        print 'csv:"%s","%s","%s","%s","%s"' % (
            queryNum,
            len(joinsLeft[0]) if len(joinsLeft) > 0 else 0,
            ','.join(referencedTables),
            ','.join(groupByColumns),
            len(groupByColumns))

        # Populate a reverse index from table to script.
        tableToQuery = {}
        for referencedTable in referencedTables:
            if referencedTable not in tableToQuery:
                tableToQuery[referencedTable] = [queryNum]
            else:
                tableToQuery[referencedTable].append(queryNum)

    # Sample search on LSH forest index: top-3 most similar queries.
    queriesLSHIndex.index()
    k = 3
    queryNum = 10
    query = queryIndex[queryNum]
    print '\n\nTop %s queries similar to "%s":' % (k, query['queryText'])
    top_k = queriesLSHIndex.query(query['queryLSH'], k)
    for k in top_k:
        print '\n"%s"' % queryIndex[k]['queryText']
class LshSamplesEval:
    '''STUDENT_PATH = '../studentData/liftoff/'
    STANDARD_PATH = '../data/raw/liftoff/standard/'
    UNIFORM_PATH = '../data/raw/liftoff/uniform/'
    TEMPERED_PATH = '../data/raw/liftoff/tempered/'''

    def __init__(self, studentDataPath, sampledDataPath):
        print('Loading data...')
        self.studentDataPath = studentDataPath
        print(self.studentDataPath)
        self.sampledDataPath = sampledDataPath
        print(self.sampledDataPath)
        self.sampledData = self.loadSyntheticData()
        self.studentData = self.loadStudentData()

    def loadStudentData(self):
        path = self.studentDataPath + STUDENT_NAME
        datadict = pickle.load(open(path, "rb"))
        return list(datadict.keys())

    def loadSyntheticData(self):
        standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
        uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
        tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
        standardDict = pickle.load(open(standard_path, "rb"))
        uniformDict = pickle.load(open(uniform_path, "rb"))
        temperedDict = pickle.load(open(tempered_path, "rb"))
        # import pdb; pdb.set_trace()
        return list(standardDict.values()) + list(uniformDict.values()) + list(temperedDict.values())

    def computeLshNN(self):
        print('Processing sampled and student data...')
        sampledSets = self.processData(self.sampledData)
        studentSets = self.processData(self.studentData)
        print('Finding nearest neighbors from sampled data...')
        sampledScores = self.constructNNList(studentSets, sampledSets, self.studentData, self.sampledData)
        print('Found nearest neighbors for data!')
        # self.constructHistogram(sampledScores)
        return sampledScores

    # tokenize every sentence and return a list of sentences
    def processData(self, dataset):
        processed = []
        for datum in dataset:
            splitCode = datum.split()
            processed.append(splitCode)
        return processed

    # runs MinHashLsh
    def constructNNList(self, studentSets, sampledSets, studentData, sampledData):
        print('Creating min-hashes for student data')
        self.studentMinHashes = self.createMinHash(studentSets)
        print('Creating min-hashes for rubric data')
        self.sampledMinHashes = self.createMinHash(sampledSets)
        self.forest = MinHashLSHForest(num_perm=128)
        i = 0
        for minHash in self.sampledMinHashes:
            self.forest.add(str(i), minHash)
            i += 1
        self.forest.index()
        print("calculating nearest neighbor")
        scores = []
        for i, query in enumerate(tqdm(self.studentMinHashes)):
            result = self.forest.query(query, 1)
            indexMatch = int(result[0])
            # Uncomment these to print examples of
            # student code and their nearest neighbor!
            print(result)
            print('Student Code: \n')
            print(studentData[i])
            print('\n')
            print('Closest Sampled Code: \n')
            print(sampledData[indexMatch])
            print('\n')
            score = self.sampledMinHashes[indexMatch].jaccard(query)
            print('Score: \n')
            scores.append(score)
        return scores

    # create minHash objects for every dataset
    def createMinHash(self, dataset):
        minHashes = []
        for code in tqdm(dataset):
            minHash = MinHash(num_perm=128)
            for d in code:
                # TODO modify this for n-grams
                minHash.update("".join(d).encode('utf-8'))
            minHashes.append(minHash)
        return minHashes

    def constructHistogram(self, scores):
        plt.hist(scores)
        plt.xlabel('Jaccard Similarity Score')
        plt.ylabel('Counts')
        plt.show()
class HashJaccard(FilterProblem):
    """
    A class that does clustering based on hashes from the datasketch library.
    """

    @property
    def num_perm(self):
        return DATA_FILTERING["num_permutations"]

    @property
    def DataPointClass(self):
        return DataPoint

    # Find nearest medoid for a data point.
    def find_nearest_medoid(self, data_point, data_tag=""):
        nearest_medoid = self.forest.query(data_point.min_hash, 1)
        if not nearest_medoid:
            nearest_medoid = [
                random.randint(0, self.num_clusters[data_tag] - 1)
            ]
        return nearest_medoid[0]

    # Do the clustering of sources and targets.
    def clustering(self, data_tag):
        """
        Params:
            :data_tag: Whether it's source or target data.
        """

        # Create a min hash forest to quickly find nearest neighbours.
        self.forest = MinHashLSHForest(num_perm=self.num_perm)

        # Initialize clusters.
        medoids = random.sample(range(len(self.data_points[data_tag])),
                                self.num_clusters[data_tag])
        for i in range(self.num_clusters[data_tag]):
            cl = self.ClusterClass(self.data_points[data_tag][medoids[i]])
            self.clusters[data_tag].append(cl)

            # Put medoids in the forest.
            self.forest.add(i, self.clusters[data_tag][-1].medoid.min_hash)
        self.forest.index()

        # For each data_point find a cluster.
        self.cluster_points(data_tag)

        # These will be needed for the stopping criterion.
        cluster_names = [
            self.clusters[data_tag][i].medoid.string
            for i in range(self.num_clusters[data_tag])
        ]
        cluster_names_old = list(cluster_names)
        count = 0
        counts = []
        exit = False

        # Clustering loop.
        while not exit:
            count += 1

            # Find the point that minimizes the mean distance within a cluster.
            self.find_medoid(data_tag)

            # Create new forest.
            self.forest = MinHashLSHForest(num_perm=self.num_perm)
            for i in range(self.num_clusters[data_tag]):
                self.forest.add(i, self.clusters[data_tag][i].medoid.min_hash)
            self.forest.index()

            # Assign each point to the new medoids.
            self.cluster_points(data_tag)

            # Check stopping criteria.
            exit, cluster_names, cluster_names_old, counts = self.stop_clustering(
                data_tag, cluster_names, cluster_names_old, count, counts)
def saver(self, i, q, retq, matq, l):
    print_start = t.time()
    save_start = t.time()
    global_time = t.time()
    chunk_size = 100
    count = 0
    forest = MinHashLSHForest(num_perm=self.numperm)
    taxstr = ''
    if self.tax_filter is None:
        taxstr = 'NoFilter'
    if self.tax_mask is None:
        taxstr += 'NoMask'
    else:
        taxstr = str(self.tax_filter)
    dataset_name = self.saving_name + '_' + taxstr
    self.errorfile = self.saving_path + 'errors.txt'
    with open(self.errorfile, 'w') as hashes_error_files:
        with h5py.File(self.hashes_path, 'w', libver='latest') as h5hashes:
            datasets = {}
            if dataset_name not in h5hashes.keys():
                if self.verbose == True:
                    print('creating dataset')
                    print(dataset_name)
                    print('filtered at taxonomic level: ' + taxstr)
                h5hashes.create_dataset(dataset_name + '_' + taxstr,
                                        (chunk_size, 0),
                                        maxshape=(None, None),
                                        dtype='int32')
            datasets[dataset_name] = h5hashes[dataset_name + '_' + taxstr]
            if self.verbose == True:
                print(datasets)
            h5flush = h5hashes.flush
            print('saver init ' + str(i))
            while True:
                this_dataframe = retq.get()
                if this_dataframe is not None:
                    if not this_dataframe.empty:
                        hashes = this_dataframe['hash'].to_dict()
                        print(str(this_dataframe.Fam.max()) + 'fam num')
                        print(str(count) + ' done')
                        hashes = {fam: hashes[fam] for fam in hashes if hashes[fam]}
                        [forest.add(str(fam), hashes[fam]) for fam in hashes]
                        for fam in hashes:
                            if len(datasets[dataset_name]) < fam + 10:
                                datasets[dataset_name].resize(
                                    (fam + chunk_size,
                                     len(hashes[fam].hashvalues.ravel())))
                            datasets[dataset_name][fam, :] = hashes[fam].hashvalues.ravel()
                        count += 1
                        if t.time() - save_start > 200:
                            print(t.time() - global_time)
                            forest.index()
                            print(forest.query(hashes[fam], k=10))
                            h5flush()
                            save_start = t.time()
                            with open(self.lshforestpath, 'wb') as forestout:
                                forestout.write(pickle.dumps(forest, -1))
                            if self.verbose == True:
                                print('save done at' + str(t.time() - global_time))
                    else:
                        print(this_dataframe)
                else:
                    if self.verbose == True:
                        print('wrap it up')
                    with open(self.lshforestpath, 'wb') as forestout:
                        forestout.write(pickle.dumps(forest, -1))
                    h5flush()
                    if self.verbose == True:
                        print('DONE SAVER' + str(i))
                    break
class AutoTag():
    def __init__(self, num_permutation=60):
        self.__num_permutation = num_permutation
        self.__forest = MinHashLSHForest(self.__num_permutation)
        self.__lem = WordNetLemmatizer()
        stop_words = set(stopwords.words("english"))
        stop_words.add('—')
        stop_words.add('And')
        self.__stop_words = stop_words

    def fit(self, csv):
        df = pd.read_csv(csv)
        df.drop_duplicates(subset='webURL', keep=False, inplace=True)
        df.dropna(inplace=True)
        for index, row in df.iterrows():
            min_hash = self.make_min_hash(self.make_clean_words_list(row['Text']))
            self.__forest.add(row['webURL'], min_hash)
            if index % 100 == 0:
                print(index, end='\r', flush=True)
        self.__forest.index()

    def make_clean_words_list(self, text):
        text = re.sub('[^a-zA-Z]', ' ', text)
        # Convert to lowercase
        text = text.lower()
        # remove tags
        text = re.sub("</?.*?>", " <> ", text)
        # remove special characters and digits
        text = re.sub("(\\d|\\W)+", " ", text)
        # Lemmatisation
        text = text.split()
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text if not word in self.__stop_words]
        return text

    def predict(self, text, num_of_niebhors):
        # TODO: change results into tags
        query = self.make_min_hash(self.make_clean_words_list(text))
        return self.__forest.query(query, num_of_niebhors)

    def make_min_hash(self, words):
        min_hash = MinHash(self.__num_permutation)
        for word in words:
            min_hash.update(word.encode('utf8'))
        return min_hash

    def load_trained_model(self, trained_model_file_name, num_of_permutations):
        self.__forest = pickle.load(open(trained_model_file_name, 'rb'))
        self.__num_permutation = num_of_permutations

    def save_model(self, file_name):
        pickle.dump(self.__forest, open(file_name, 'wb'))
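# A hedged end-to-end sketch of driving the AutoTag class above. The CSV file
# name and its 'Text'/'webURL' columns are assumptions taken from fit(); the
# query text and pickle file name below are illustrative only.
tagger = AutoTag(num_permutation=60)
tagger.fit('articles.csv')               # assumed CSV with 'Text' and 'webURL' columns
tagger.save_model('autotag_forest.pkl')

# Later, reload the trained forest and look up similar articles for a new text.
tagger2 = AutoTag()
tagger2.load_trained_model('autotag_forest.pkl', num_of_permutations=60)
similar_urls = tagger2.predict("Some new article text to tag", num_of_niebhors=5)
print(similar_urls)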
class LshNN(ProgramNN):
    CACHE_DIR = 'cache/'

    def __init__(self, sampledDataPath, num_perm=128, top_k=1, evict_cache=False):
        """
        An agent class to find rubric sampled nearest neighbour of a given
        program by using a MinHash LSH forest.
        """
        self.sampledDataPath = sampledDataPath
        self.num_perm = num_perm
        self.top_k = top_k
        self.evict_cache = evict_cache
        self.rawProgramData, self.sampledData = self.loadSyntheticData()
        self.create_lsh_forest()

    def create_lsh_forest(self):
        cache_file = os.path.join(self.CACHE_DIR, 'lsh_forest.pkl')
        if not self.evict_cache and os.path.isfile(cache_file):
            # load precomputed
            print('Loading cached forest')
            self.forest = load_pickle(cache_file)
        else:
            sampledSets = self.processData(self.sampledData)
            self.sampledMinHashes = self.createMinHashSet(sampledSets)
            self.forest = MinHashLSHForest(num_perm=self.num_perm)
            for prog_idx, minHash in enumerate(self.sampledMinHashes):
                self.forest.add(prog_idx, minHash)
            self.forest.index()
            os.makedirs(self.CACHE_DIR, exist_ok=True)
            save_pickle(self.forest, cache_file)

    def minHash(self, code_tokens):
        minHash = MinHash(num_perm=self.num_perm)
        for d in code_tokens:
            # TODO modify this for n-grams
            minHash.update("".join(d).encode('utf-8'))
        return minHash

    # create minHash objects for every dataset
    def createMinHashSet(self, dataset):
        minHashes = []
        for code in tqdm(dataset):
            minHashes.append(self.minHash(code))
        return minHashes

    def multi_dict_get(self, key, all_dicts):
        for dic in all_dicts:
            if key in dic:
                return dic[key]
        raise ValueError('Key not in any of the dictionaries')

    def loadSyntheticData(self):
        cache_file = os.path.join(self.CACHE_DIR, 'lsh_programs.pkl')
        if not self.evict_cache and os.path.isfile(cache_file):
            data = load_json(cache_file)
            prog_items = data['raw_programs']
            anon_progs = data['anon_programs']
        else:
            standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
            uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
            tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
            standardDict = pickle.load(open(standard_path, "rb"))
            uniformDict = pickle.load(open(uniform_path, "rb"))
            temperedDict = pickle.load(open(tempered_path, "rb"))
            all_dicts = [standardDict, uniformDict, temperedDict]
            # this step is not stable across different runs if caching forest
            # so this needs to be cached too
            prog_items = list(standardDict.keys() | uniformDict.keys() | temperedDict.keys())
            anon_progs = [self.multi_dict_get(prog, all_dicts) for prog in prog_items]
            data = dict(raw_programs=prog_items, anon_programs=anon_progs)
            os.makedirs(self.CACHE_DIR, exist_ok=True)
            save_json(data, cache_file)
            # if we dont load cache here, we should regenerate forest too
            self.evict_cache = True
        return prog_items, anon_progs

    def transformCode(self, program):
        splitCode = program.split()
        return splitCode
        # return ngrams(splitCode, 3)

    # tokenize every sentence and return a list of sentences
    def processData(self, dataset):
        processed = []
        for datum in dataset:
            transformedCode = self.transformCode(datum)
            processed.append(transformedCode)
        return processed

    def findNearestNeighbours(self, studentProgram, **kwargs):
        minHash = self.minHash(self.transformCode(studentProgram))
        result = self.forest.query(minHash, self.top_k)
        top_k_programs_anon = [self.sampledData[idx] for idx in result]
        top_k_programs = [self.rawProgramData[idx] for idx in result]
        # return top_k_programs, top_k_programs_anon
        return top_k_programs
from datasketch import MinHashLSHForest, MinHash
from sklearn.metrics import jaccard_similarity_score

g = []
listlsh = []
lsh = MinHashLSHForest(num_perm=128)
for artist, sets in artist_shingle.items():
    a = MinHash(num_perm=128)
    for d in sets[0]:
        a.update(d.encode('utf8'))
    listlsh.append(a)
    lsh.add(artist, a)
lsh.index()

tester = {}
with open('tester.json') as file:
    tester = json.loads(file.read().encode('latin-1'))

numcorrect_1 = 0
numcorrect_5 = 0
numcorrect_10 = 0
total = 0
for artist, songlist in tester.items():
    for song in songlist:
        m1 = MinHash(num_perm=128)
        songp = clean_text(song['lyrics'])
        for d in songp:
            m1.update(d.encode('utf8'))
        result = lsh.query(m1, 10)
        if len(result):