class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                # MinHash.update expects bytes
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        return map(int, self._index.query(m, n))
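# A minimal usage sketch of the wrapper above (an assumption, not part of the original:
# it presumes the ann-benchmarks-style BaseANN harness and that X is an iterable of
# token sets; the printed result is only illustrative):
algo = DataSketch('jaccard', n_perm=128, n_rep=8)
algo.fit([{1, 2, 3}, {2, 3, 4}, {7, 8, 9}])
print(list(algo.query({2, 3, 5}, 2)))  # e.g. [0, 1] -- approximate neighbours by Jaccard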
def saver(self, i, q, retq, matq, l):
    print_start = t.time()
    save_start = t.time()
    global_time = t.time()
    chunk_size = 100
    count = 0
    forest = MinHashLSHForest(num_perm=self.numperm)
    taxstr = ''
    if self.tax_filter is None:
        taxstr = 'NoFilter'
    if self.tax_mask is None:
        taxstr += 'NoMask'
    else:
        taxstr = str(self.tax_filter)
    dataset_name = self.saving_name + '_' + taxstr
    self.errorfile = self.saving_path + 'errors.txt'
    with open(self.errorfile, 'w') as hashes_error_files:
        with h5py.File(self.hashes_path, 'w', libver='latest') as h5hashes:
            datasets = {}
            if dataset_name not in h5hashes.keys():
                if self.verbose == True:
                    print('creating dataset')
                    print(dataset_name)
                    print('filtered at taxonomic level: ' + taxstr)
                h5hashes.create_dataset(dataset_name + '_' + taxstr,
                                        (chunk_size, 0),
                                        maxshape=(None, None),
                                        dtype='int32')
            datasets[dataset_name] = h5hashes[dataset_name + '_' + taxstr]
            if self.verbose == True:
                print(datasets)
            h5flush = h5hashes.flush
            print('saver init ' + str(i))
            while True:
                this_dataframe = retq.get()
                if this_dataframe is not None:
                    if not this_dataframe.empty:
                        hashes = this_dataframe['hash'].to_dict()
                        print(str(this_dataframe.Fam.max()) + 'fam num')
                        print(str(count) + ' done')
                        hashes = {fam: hashes[fam] for fam in hashes if hashes[fam]}
                        [forest.add(str(fam), hashes[fam]) for fam in hashes]
                        for fam in hashes:
                            if len(datasets[dataset_name]) < fam + 10:
                                datasets[dataset_name].resize(
                                    (fam + chunk_size,
                                     len(hashes[fam].hashvalues.ravel())))
                            datasets[dataset_name][fam, :] = hashes[fam].hashvalues.ravel()
                            count += 1
                        if t.time() - save_start > 200:
                            print(t.time() - global_time)
                            forest.index()
                            print(forest.query(hashes[fam], k=10))
                            h5flush()
                            save_start = t.time()
                            with open(self.lshforestpath, 'wb') as forestout:
                                forestout.write(pickle.dumps(forest, -1))
                            if self.verbose == True:
                                print('save done at' + str(t.time() - global_time))
                    else:
                        print(this_dataframe)
                else:
                    if self.verbose == True:
                        print('wrap it up')
                    with open(self.lshforestpath, 'wb') as forestout:
                        forestout.write(pickle.dumps(forest, -1))
                    h5flush()
                    if self.verbose == True:
                        print('DONE SAVER' + str(i))
                    break
class AutoTag():
    def __init__(self, num_permutation=60):
        self.__num_permutation = num_permutation
        self.__forest = MinHashLSHForest(self.__num_permutation)
        self.__lem = WordNetLemmatizer()
        stop_words = set(stopwords.words("english"))
        stop_words.add('—')
        stop_words.add('And')
        self.__stop_words = stop_words

    def fit(self, csv):
        df = pd.read_csv(csv)
        df.drop_duplicates(subset='webURL', keep=False, inplace=True)
        df.dropna(inplace=True)
        for index, row in df.iterrows():
            min_hash = self.make_min_hash(self.make_clean_words_list(row['Text']))
            self.__forest.add(row['webURL'], min_hash)
            if index % 100 == 0:
                print(index, end='\r', flush=True)
        self.__forest.index()

    def make_clean_words_list(self, text):
        text = re.sub('[^a-zA-Z]', ' ', text)
        # Convert to lowercase
        text = text.lower()
        # Remove tags
        text = re.sub("</?.*?>", " <> ", text)
        # Remove special characters and digits
        text = re.sub("(\\d|\\W)+", " ", text)
        # Lemmatisation
        text = text.split()
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text if not word in self.__stop_words]
        return text

    def predict(self, text, num_of_neighbors):
        # TODO: change results into tags
        query = self.make_min_hash(self.make_clean_words_list(text))
        return self.__forest.query(query, num_of_neighbors)

    def make_min_hash(self, words):
        min_hash = MinHash(self.__num_permutation)
        for word in words:
            min_hash.update(word.encode('utf8'))
        return min_hash

    def load_trained_model(self, trained_model_file_name, num_of_permutations):
        self.__forest = pickle.load(open(trained_model_file_name, 'rb'))
        self.__num_permutation = num_of_permutations

    def save_model(self, file_name):
        pickle.dump(self.__forest, open(file_name, 'wb'))
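# Hypothetical usage sketch (the CSV file name and the article text are placeholders;
# fit() expects a CSV with 'webURL' and 'Text' columns, and predict() returns the
# stored URL keys of the most similar articles):
tagger = AutoTag(num_permutation=60)
tagger.fit('articles.csv')
similar_urls = tagger.predict('Some article text to look up related pages for.', 5)
print(similar_urls)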
class LshNN(ProgramNN):
    CACHE_DIR = 'cache/'

    def __init__(self, sampledDataPath, num_perm=128, top_k=1, evict_cache=False):
        """
        An agent class to find rubric sampled nearest neighbour of a given
        program by using a MinHash LSH forest.
        """
        self.sampledDataPath = sampledDataPath
        self.num_perm = num_perm
        self.top_k = top_k
        self.evict_cache = evict_cache
        self.rawProgramData, self.sampledData = self.loadSyntheticData()
        self.create_lsh_forest()

    def create_lsh_forest(self):
        cache_file = os.path.join(self.CACHE_DIR, 'lsh_forest.pkl')
        if not self.evict_cache and os.path.isfile(cache_file):
            # load precomputed
            print('Loading cached forest')
            self.forest = load_pickle(cache_file)
        else:
            sampledSets = self.processData(self.sampledData)
            self.sampledMinHashes = self.createMinHashSet(sampledSets)
            self.forest = MinHashLSHForest(num_perm=self.num_perm)
            for prog_idx, minHash in enumerate(self.sampledMinHashes):
                self.forest.add(prog_idx, minHash)
            self.forest.index()
            os.makedirs(self.CACHE_DIR, exist_ok=True)
            save_pickle(self.forest, cache_file)

    def minHash(self, code_tokens):
        minHash = MinHash(num_perm=self.num_perm)
        for d in code_tokens:
            # TODO modify this for n-grams
            minHash.update("".join(d).encode('utf-8'))
        return minHash

    # create minHash objects for every dataset
    def createMinHashSet(self, dataset):
        minHashes = []
        for code in tqdm(dataset):
            minHashes.append(self.minHash(code))
        return minHashes

    def multi_dict_get(self, key, all_dicts):
        for dic in all_dicts:
            if key in dic:
                return dic[key]
        raise ValueError('Key not in any of the dictionaries')

    def loadSyntheticData(self):
        cache_file = os.path.join(self.CACHE_DIR, 'lsh_programs.pkl')
        if not self.evict_cache and os.path.isfile(cache_file):
            data = load_json(cache_file)
            prog_items = data['raw_programs']
            anon_progs = data['anon_programs']
        else:
            standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
            uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
            tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
            standardDict = pickle.load(open(standard_path, "rb"))
            uniformDict = pickle.load(open(uniform_path, "rb"))
            temperedDict = pickle.load(open(tempered_path, "rb"))
            all_dicts = [standardDict, uniformDict, temperedDict]
            # this step is not stable across different runs if caching forest
            # so this needs to be cached too
            prog_items = list(standardDict.keys() | uniformDict.keys() | temperedDict.keys())
            anon_progs = [self.multi_dict_get(prog, all_dicts) for prog in prog_items]
            data = dict(raw_programs=prog_items, anon_programs=anon_progs)
            os.makedirs(self.CACHE_DIR, exist_ok=True)
            save_json(data, cache_file)
            # if we dont load cache here, we should regenerate forest too
            self.evict_cache = True
        return prog_items, anon_progs

    def transformCode(self, program):
        splitCode = program.split()
        return splitCode
        # return ngrams(splitCode, 3)

    # tokenize every sentence and return a list of sentences
    def processData(self, dataset):
        processed = []
        for datum in dataset:
            transformedCode = self.transformCode(datum)
            processed.append(transformedCode)
        return processed

    def findNearestNeighbours(self, studentProgram, **kwargs):
        minHash = self.minHash(self.transformCode(studentProgram))
        result = self.forest.query(minHash, self.top_k)
        top_k_programs_anon = [self.sampledData[idx] for idx in result]
        top_k_programs = [self.rawProgramData[idx] for idx in result]
        # return top_k_programs, top_k_programs_anon
        return top_k_programs
artist_shingle[artist].append(tokens)

from datasketch import MinHashLSHForest, MinHash
from sklearn.metrics import jaccard_similarity_score

g = []
listlsh = []
lsh = MinHashLSHForest(num_perm=128)
for artist, sets in artist_shingle.items():
    a = MinHash(num_perm=128)
    for d in sets[0]:
        a.update(d.encode('utf8'))
    listlsh.append(a)
    lsh.add(artist, a)
lsh.index()

tester = {}
with open('tester.json') as file:
    tester = json.loads(file.read().encode('latin-1'))

numcorrect_1 = 0
numcorrect_5 = 0
numcorrect_10 = 0
total = 0
for artist, songlist in tester.items():
    for song in songlist:
        m1 = MinHash(num_perm=128)
        songp = clean_text(song['lyrics'])
        for d in songp:
            m1.update(d.encode('utf8'))
# TODO: neither of these work well with puzzles...
df = pd.read_csv(
    '../chess-opening/csvs/lichess_db_standard_rated_2020-08_600+0.csv',
    nrows=1000000)


def create_min_hash(fens):
    min_hash = MinHash(num_perm=128)
    for fen in fens:
        min_hash.update(fen.encode('utf8'))
    return min_hash


user_df = df.groupby('username').agg({'fen': set, 'elo': 'mean'})
user_df['min_hash'] = user_df['fen'].apply(create_min_hash)

forest = MinHashLSHForest(num_perm=128)
for row in user_df.itertuples():
    forest.add(row.Index, row.min_hash)
forest.index()

for i in range(10):
    result = forest.query(user_df['min_hash'][i], 10)
    elos = []
    for username in result:
        elos.append(user_df.loc[username]['elo'])
    print(user_df['elo'][i], np.mean(elos))
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']

# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))

# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

# Add m2 and m3 into the index
forest.add("m2", m2)
forest.add("m3", m3)

# IMPORTANT: must call index() otherwise the keys won't be searchable
forest.index()

# Check for membership using the key
print("m2" in forest)
print("m3" in forest)

# Using m1 as the query, retrieve the top 2 keys that have the highest Jaccard
result = forest.query(m1, 2)
print("Top 2 candidates", result)
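# A common follow-up (not part of the original snippet): the forest only returns
# approximate top-k candidates, so they can be re-ranked by the exact MinHash
# Jaccard estimates before use.
candidates = forest.query(m1, 2)
ranked = sorted(candidates,
                key=lambda key: {"m2": m2, "m3": m3}[key].jaccard(m1),
                reverse=True)
print("Candidates re-ranked by estimated Jaccard", ranked)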
string2hash = {}
pbar = tqdm(total=len(string2label))
for program, _ in string2label.items():
    tokens = program.split()
    minhash = MinHash()
    for token in tokens:
        minhash.update(token.encode('utf-8'))
    string2hash[program] = minhash
    pbar.update()
pbar.close()

forest = MinHashLSHForest()
pbar = tqdm(total=len(string2hash))
for program, minhash in string2hash.items():
    forest.add(program, minhash)
    pbar.update()
pbar.close()
forest.index()

true_labels = []
pred_labels = []
zipf_labels = []
for program, label in real_data.items():
    zipf = real_zipf[program]
    try:
        tokens = program.split()
    except:
        continue
    minhash = MinHash()
    for token in tokens:
    return temp


# 3. Build the word-segmented documents
docment = []
for sentence in content_list:
    item_str = jieba_cut(sentence)
    docment.append(item_str)

# Build the MinHash structures
MinHashList = []
forest = MinHashLSHForest()
for i, line in enumerate(docment):
    hash_codes = get_minhash(line)
    MinHashList.append(hash_codes)
    forest.add(i, hash_codes)
# Index all keys so they can be searched
forest.index()

query = '国足输给叙利亚后,里皮坐不住了,直接辞职了'
print("query str :", query)
# 4. Segment the query text into words
item_str = jieba_cut(query)
# Get the MinHash of item_str
minhash_query = get_minhash(item_str)
# 5. Query the forest for the Top-K neighbors most similar to minhash_query
result = forest.query(minhash_query, 3)
for i in range(len(result)):
    print("vocab_id:", result[i], "jaccard :",
def main():
    corpus = {}
    with open('corpus_data/preprocessedf_corpus.json') as file:
        corpus = json.loads(file.read().encode('Utf-8'))

    def processLyrics(lyrics):
        authors = {}
        for author in lyrics:
            for song in lyrics[author]:
                lyric = re.sub(r'\[[^>]+\]', '', song["lyrics"])
                lyric = re.sub(r'\([^>]+\)', '', lyric)
                lyric = re.sub(r'\{[^>]+\}', '', lyric)
                lyric = lyric.split(r'\s')
                for line in lyric:
                    line = re.sub(r'\n', ' ', line)
                    if author not in authors:
                        authors[author] = line
                    else:
                        authors[author] += line
        return authors

    import nltk
    from nltk.corpus import stopwords
    from collections import defaultdict
    from collections import Counter
    nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    from nltk import word_tokenize

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [
            t for t in tokenized_text
            if re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)
        ]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [
                token for token in cleaned_text if token not in PROFANITY
            ]
        return cleaned_text

    STOPWORDS = set(stopwords.words('english'))
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
    PROFANITY = set(infile)

    corpus = processLyrics(corpus)
    for author, text in corpus.items():
        corpus[author] = clean_text(text, sys.argv[1])

    artist_shingle = defaultdict(list)
    for artist, lyrics in corpus.items():
        #tokens = [w for w in tokens if not w in sw]
        #shingle3 = set([tuple(tokens[i:i+3]) for i in range(len(tokens) - 3 + 1) if len(tokens[i]) < 10])
        #shingle2 = set([tuple(tokens[i:i+2]) for i in range(len(tokens) - 2 + 1) if len(tokens[i]) < 10])
        shingle1 = lyrics  # set([tokens[i] for i in range(len(tokens) - 1 + 1) if len(tokens[i]) < 4])
        artist_shingle[artist].append(shingle1)
        #artist_shingle[artist].append(shingle2)
        #artist_shingle[artist].append(shingle3)

    from datasketch import MinHashLSHForest, MinHash
    from sklearn.metrics import jaccard_similarity_score

    listlsh = []
    lsh = MinHashLSHForest(num_perm=128)
    for artist, sets in artist_shingle.items():
        a = MinHash(num_perm=128)
        for d in sets[0]:
            a.update(d.encode('utf8'))
        listlsh.append(a)
        lsh.add(artist, a)
    lsh.index()

    m1 = MinHash(num_perm=128)
    g = []
    with open(sys.argv[2]) as g:
        g = g.read()
    g = g.split()
    for d in g:
        m1.update(d.encode('utf8'))

    result = lsh.query(m1, 5)
    print(" (Up to) Top 5 candidates", result)
        print(doc['title'])
        # filename = "result_minHashLSHforest.txt"
        # myfile = open(filename, 'a+')
        # myfile.write(doc['title'] + '\n')
    if result:
        for item in result:
            doc = docs_col.find_one({"_id": ObjectId(str(item))})
            print(doc['title'])
            # myfile.write(doc['title'] + '\n')
    # myfile.write("================" + '\n')
    print("=====================")


if __name__ == '__main__':
    forest = MinHashLSHForest(num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    for item in documents_en:
        minhash = MinHash(num_perm=128)
        list_keyword = item["keyword"].split(",")
        for k in list_keyword:
            minhash.update(k.encode("utf-8"))
        forest.add(str(item["_id"]), minhash)
    forest.index()

    documents_vi = docs_col.find({"lang": 'vietnamese'})
    start_time = time.time()
    for doc in documents_vi:
        # pdb.set_trace()
        query_candidates(doc, 5)
    elapsed_time = time.time() - start_time
    print(elapsed_time)
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep, n_leaves):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._n_leaves = n_leaves
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d, n_leaves=%d)' % (
            n_perm, n_rep, n_leaves)

    def fit(self, X):
        self.index = numpy.empty([0, 32])
        self._index_minhash = []
        self._ball_index = []
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
            # self.index.append(m.digest())
            self.index = numpy.vstack((self.index, m.digest()))
            self._ball_index.append(m.digest())
            self._index_minhash.append(m)
        self._index.index()
        self._X = X
        self.tree = BallTree(self.index, leaf_size=self._n_leaves)
        # self._annoy = annoy.AnnoyIndex(X.shape[1], metric='euclidean')
        # for i, x in enumerate(X):
        #     self._annoy.add_item(i, x.tolist())
        # self._annoy.build(100)

    def query(self, v, n):
        print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        # for i in self._annoy.get_nns_by_vector(v.tolist(), n, 100):
        #     print(self._index_minhash[int(i)].jaccard(m))
        dist, ind = self.tree.query([m.digest()], k=n)
        for i in ind[0]:
            # print(i)
            print(self._index_minhash[int(i)].jaccard(m))
        print("=======================")
        brute_indices = self.query_with_distances(m.digest(), n)
        for i in brute_indices:
            print(self._index_minhash[int(i)].jaccard(m))
        print("-----------------------")
        ind2 = self._index.query(m, n)
        for i in ind2:
            print(self._index_minhash[int(i)].jaccard(m))
        # return map(int, ind[0])
        return self.query_with_distances(m.digest(), n)

    popcount = []
    for i in range(256):
        popcount.append(bin(i).count("1"))

    def query_with_distances(self, v, n):
        """Find indices of `n` most similar vectors from the index to query vector `v`."""
        if self._metric == 'jaccard':
            dists = numpy.array(
                [pd[self._metric]['distance'](v, e) for e in self.index])
        else:
            assert False, "invalid metric"  # shouldn't get past the constructor!
        # partition-sort by distance, get `n` closest
        nearest_indices = dists.argsort()[-n:][::-1]
        return nearest_indices
class duplicate_docs:
    def __init__(self):
        self.lsh = MinHashLSHForest(
            num_perm=config.LSH_CONFIG['num_permutation'])

    def load(self, model):
        print('loading %s ...' % (model))
        if os.path.isfile(model):
            return joblib.load(model)
        else:
            return None

    def save(self, model, path):
        print('saving %s ...' % (path))
        joblib.dump(model, path)
        return

    # load data from list documents
    def run(self, docs):
        count = 1
        for itemid, content in docs.items():
            try:
                doc = document(content)
                self.insert(doc, key=itemid)
                print('\rpushed %d items' % (count)),
                sys.stdout.flush()
                count += 1
            except:
                pass
        self.lsh.index()
        print('')

    def run_ex(self, itemid, content, call_index=True):
        try:
            doc = document(content)
            self.insert(doc, key=itemid)
            if call_index:
                self.lsh.index()
        except:
            pass

    def query(self, doc, topn=1000):
        try:
            unicodedata.normalize('NFKC', doc)
            doc = document(doc)
            minhash = doc.get_minhash(doc.k_shingles,
                                      config.MINHASH_CONFIG['num_permutation'])
            return self.lsh.query(minhash, topn)
        except:
            return []

    # insert a document object
    # output: key if document does not exist duplicate item
    # otherwise return alert duplication.
    def insert(self, doc, key=None):
        if key is None:
            key = utils.id_generator()
        minhash = doc.get_minhash(doc.k_shingles,
                                  config.MINHASH_CONFIG['num_permutation'])
        if len(doc.k_shingles) == 0:
            return u'Does not insert this document to database.\nDocument\'s shingle = 0.\nDocument need to contain at least %d word' \
                % (config.SHINGLE_CONFIG['k'])
        self.lsh.add(key, minhash)

    def load_model(self):
        self.lsh = self.load('model/lsh.pkl')
        self.docs = self.load('model/docs.pkl')
        self.docs_time = self.load('model/docs_time.pkl')
        if self.lsh != None and self.docs != None and self.docs_time != None:
            return True
        return False

    def save_model(self):
        utils.mkdir('model')
        self.save(self.lsh, 'model/lsh.pkl')
        self.save(self.docs, 'model/docs.pkl')
        self.save(self.docs_time, 'model/docs_time.pkl')
# create a minhash for each row (product) by calling the 'create_minhash' function
df_tfidf['Minhash'] = df_tfidf[0:].apply(lambda x: create_minhash(x), axis=1)

# create a list with all the Minhash signatures
minhash_list = df_tfidf['Minhash']

# create a MinHashLSHForest object with num_perm parameter equal to sample_size(=128)
# num_perm: the number of permutation functions
forest = MinHashLSHForest(num_perm=128)

# add each Minhash signature into the index
i = 0
for minhash in minhash_list:
    # Add minhash into the index
    forest.add("m" + str(i), minhash)
    i += 1

# call index() in order to make the keys searchable
forest.index()


# create the recommendations by retrieving the top 10 keys that have the highest Jaccard for each product
def make_recs(doc_id, n_recs):
    """
    This function takes the id of the target product and returns the top n_recs(=10)
    keys that have the highest Jaccard.
    :param doc_id: the id of the target product
    :param n_recs: the number of similar products to be returned
    :return: top n_recs keys that have the highest Jaccard for each product
    """
    query = minhash_list[doc_id]
class HashJaccard(FilterProblem):
    """
    A class that does clustering based on hashes from the datasketch library.
    """

    @property
    def num_perm(self):
        return DATA_FILTERING["num_permutations"]

    @property
    def DataPointClass(self):
        return DataPoint

    # Find nearest medoid for a data point.
    def find_nearest_medoid(self, data_point, data_tag=""):
        nearest_medoid = self.forest.query(data_point.min_hash, 1)
        if not nearest_medoid:
            nearest_medoid = [
                random.randint(0, self.num_clusters[data_tag] - 1)
            ]
        return nearest_medoid[0]

    # Do the clustering of sources and targets.
    def clustering(self, data_tag):
        """
        Params:
          :data_tag: Whether it's source or target data.
        """

        # Create a min hash forest to quickly find nearest neighbours.
        self.forest = MinHashLSHForest(num_perm=self.num_perm)

        # Initialize clusters.
        medoids = random.sample(range(len(self.data_points[data_tag])),
                                self.num_clusters[data_tag])
        for i in range(self.num_clusters[data_tag]):
            cl = self.ClusterClass(self.data_points[data_tag][medoids[i]])
            self.clusters[data_tag].append(cl)

            # Put medoids in the forest.
            self.forest.add(i, self.clusters[data_tag][-1].medoid.min_hash)
        self.forest.index()

        # For each data_point find a cluster.
        self.cluster_points(data_tag)

        # These will be needed for the stopping criterion.
        cluster_names = [
            self.clusters[data_tag][i].medoid.string
            for i in range(self.num_clusters[data_tag])
        ]
        cluster_names_old = list(cluster_names)
        count = 0
        counts = []
        exit = False

        # Clustering loop.
        while not exit:
            count += 1

            # Find the point that minimizes the mean distance within a cluster.
            self.find_medoid(data_tag)

            # Create new forest.
            self.forest = MinHashLSHForest(num_perm=self.num_perm)
            for i in range(self.num_clusters[data_tag]):
                self.forest.add(i, self.clusters[data_tag][i].medoid.min_hash)
            self.forest.index()

            # Assign each point to the new medoids.
            self.cluster_points(data_tag)

            # Check stopping criteria.
            exit, cluster_names, cluster_names_old, counts = self.stop_clustering(
                data_tag, cluster_names, cluster_names_old, count, counts)
stop = []

# Get the word-segmented documents
documents = []
for item_text in sentences:
    # Segment item_text into words
    item_str = get_item_str(item_text)
    documents.append(item_str)

# Create the LSH Forest and MinHash objects
minhash_list = []
forest = MinHashLSHForest()
for i in range(len(documents)):
    # Get the MinHash of documents[i]
    temp = get_minhash(documents[i])
    minhash_list.append(temp)
    forest.add(i, temp)
# Index all keys so they can be searched
forest.index()

query = '00:01:36,2019天猫双11总成交额超100亿元'
# Segment the query into words
item_str = get_item_str(query)
# Get the MinHash of item_str
minhash_query = get_minhash(item_str)

# Query the forest for the Top-K neighbors most similar to minhash_query
result = forest.query(minhash_query, 3)
for i in range(len(result)):
    print(result[i], minhash_query.jaccard(minhash_list[result[i]]),
          documents[result[i]].replace(' ', ''))
print("Top 3 neighbors", result)
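# The two helpers used above are defined elsewhere in the script and are not shown
# here. A plausible sketch, assuming jieba-based word segmentation (the names
# get_item_str and get_minhash come from the snippet; the bodies are assumptions):
import jieba
from datasketch import MinHash

def get_item_str(item_text):
    # segment the sentence and join the tokens with spaces
    return ' '.join(jieba.cut(item_text))

def get_minhash(item_str):
    # build a MinHash over the space-separated tokens
    m = MinHash()
    for word in item_str.split():
        m.update(word.encode('utf-8'))
    return m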
def printStats(json_filename):
    with open(json_filename) as json_data:
        d = json.load(json_data)

        # Query simple index: queryNum -> queryText
        queryIndex = {}

        # Index of queries as a LSH forest for top-k similar queries.
        queriesLSHIndex = MinHashLSHForest(num_perm=128)

        # You can grok the CSV from stdout by using cut, e.g.,
        #
        # $ python analyzer.py -i ../../data/queries_ASTs.json | grep "csv:" | cut -d':' -f2 > /tmp/out.csv
        print 'csv:"queryNum","numExplicitJoins","referencedTables","groupByColumns","numGroupByClauses"'
        for queryNum, entry in enumerate(d):
            print '\n=> Stats for query number \"%s:\"' % queryNum

            # Group by clauses.
            groupByColumns = jmespath.search(
                'ast.statement[*].group.expression[*].name[]', entry)
            print 'groupBy columns: %s' % groupByColumns

            # Base tables when the query has no joins.
            baseTables = jmespath.search(
                'ast.statement[?from.variant == \'table\'].from.name[]', entry)
            print 'baseTables: %s' % baseTables

            # Base tables when the query has joins.
            baseTables += jmespath.search(
                'ast.statement[?from.variant == \'join\'].from.source.name[]',
                entry)
            print 'baseTables (with joins): %s' % baseTables

            # Join tables.
            joinTables = jmespath.search(
                'ast.statement[?from.variant == \'join\'].from.map[*].source.name[]',
                entry)
            print 'joinTables: %s' % joinTables

            # All tables mentioned in the query
            referencedTables = baseTables + joinTables

            # Joins.
            joinPathPrefix = 'ast.statement[*].from.map[*].constraint.on'
            joinsLeft = jmespath.search(joinPathPrefix + '.left.name', entry)
            joinsRight = jmespath.search(joinPathPrefix + '.right.name', entry)
            print 'explicit joins (left-hand side): %s' % joinsLeft
            print 'explicit joins (right-hand side): %s' % joinsRight

            # Text
            queryText = jmespath.search('queryText', entry)

            # Index it into an LSH forest for top-k textually similar queries.
            queryLSH = getQueryMinHash(queryText)
            queryIndex[queryNum] = {
                'queryText': queryText,
                'queryLSH': queryLSH
            }
            queriesLSHIndex.add(queryNum, queryLSH)

            # Sort for a prettier CSV dump.
            referencedTables.sort()
            groupByColumns.sort()

            # CSV header:
            # queryNum,numExplicitJoins,referencedTables,groupByColumns,numGroupByColumns
            print 'queryNum = %s' % queryNum
            print 'csv:"%s","%s","%s","%s","%s"' % (
                queryNum, len(joinsLeft[0]) if len(joinsLeft) > 0 else 0,
                ','.join(referencedTables), ','.join(groupByColumns),
                len(groupByColumns))

            # Populate a reverse index from table to script.
            tableToQuery = {}
            for referencedTable in referencedTables:
                if referencedTable not in tableToQuery:
                    tableToQuery[referencedTable] = [queryNum]
                else:
                    tableToQuery[referencedTable].append(queryNum)

        # Sample search on LSH forest index: top-3 most similar queries.
        queriesLSHIndex.index()
        k = 3
        queryNum = 10
        query = queryIndex[queryNum]
        print '\n\nTop %s queries similar to "%s":' % (k, query['queryText'])
        top_k = queriesLSHIndex.query(query['queryLSH'], k)
        for k in top_k:
            print '\n"%s"' % queryIndex[k]['queryText']
         'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']

dataset = [[0., 0., 0.], [0., 0., 1.], [0., 1., 0.], [0., 1., 1.]]

# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

for i, data in enumerate(dataset):
    m = MinHash(num_perm=128)
    for d in data:
        m.update(str(d).encode('utf8'))
    forest.add(str(i), m)

# IMPORTANT: must call index() otherwise the keys won't be searchable
pickle.dump(forest, open('forest.lsh', 'wb'))
del forest

forest = pickle.load(open('forest.lsh', 'rb'))
forest.index()

# Check for membership using the key
print("1" in forest)
print("2" in forest)

m = MinHash(num_perm=128)
for d in dataset[0]:
class LshSamplesEval:
    '''STUDENT_PATH = '../studentData/liftoff/'
    STANDARD_PATH = '../data/raw/liftoff/standard/'
    UNIFORM_PATH = '../data/raw/liftoff/uniform/'
    TEMPERED_PATH = '../data/raw/liftoff/tempered/'''

    def __init__(self, studentDataPath, sampledDataPath):
        print('Loading data...')
        self.studentDataPath = studentDataPath
        print(self.studentDataPath)
        self.sampledDataPath = sampledDataPath
        print(self.sampledDataPath)
        self.sampledData = self.loadSyntheticData()
        self.studentData = self.loadStudentData()

    def loadStudentData(self):
        path = self.studentDataPath + STUDENT_NAME
        datadict = pickle.load(open(path, "rb"))
        return list(datadict.keys())

    def loadSyntheticData(self):
        standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
        uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
        tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
        standardDict = pickle.load(open(standard_path, "rb"))
        uniformDict = pickle.load(open(uniform_path, "rb"))
        temperedDict = pickle.load(open(tempered_path, "rb"))
        # import pdb; pdb.set_trace()
        return list(standardDict.values()) + list(uniformDict.values()) + list(temperedDict.values())

    def computeLshNN(self):
        print('Processing sampled and student data...')
        sampledSets = self.processData(self.sampledData)
        studentSets = self.processData(self.studentData)
        print('Finding nearest neighbors from sampled data...')
        sampledScores = self.constructNNList(studentSets, sampledSets,
                                             self.studentData, self.sampledData)
        print('Found nearest neighbors for data!')
        # self.constructHistogram(sampledScores)
        return sampledScores

    # tokenize every sentence and return a list of sentences
    def processData(self, dataset):
        processed = []
        for datum in dataset:
            splitCode = datum.split()
            processed.append(splitCode)
        return processed

    # runs MinHashLsh
    def constructNNList(self, studentSets, sampledSets, studentData, sampledData):
        print('Creating min-hashes for student data')
        self.studentMinHashes = self.createMinHash(studentSets)
        print('Creating min-hashes for rubric data')
        self.sampledMinHashes = self.createMinHash(sampledSets)

        self.forest = MinHashLSHForest(num_perm=128)
        i = 0
        for minHash in self.sampledMinHashes:
            self.forest.add(str(i), minHash)
            i += 1
        self.forest.index()

        print("calculating nearest neighbor")
        scores = []
        for i, query in enumerate(tqdm(self.studentMinHashes)):
            result = self.forest.query(query, 1)
            indexMatch = int(result[0])
            # Uncomment these to print examples of
            # student code and their nearest neighbor!
            print(result)
            print('Student Code: \n')
            print(studentData[i])
            print('\n')
            print('Closest Sampled Code: \n')
            print(sampledData[indexMatch])
            print('\n')
            score = self.sampledMinHashes[indexMatch].jaccard(query)
            print('Score: \n')
            scores.append(score)
        return scores

    # create minHash objects for every dataset
    def createMinHash(self, dataset):
        minHashes = []
        for code in tqdm(dataset):
            minHash = MinHash(num_perm=128)
            for d in code:
                # TODO modify this for n-grams
                minHash.update("".join(d).encode('utf-8'))
            minHashes.append(minHash)
        return minHashes

    def constructHistogram(self, scores):
        plt.hist(scores)
        plt.xlabel('Jaccard Similarity Score')
        plt.ylabel('Counts')
        plt.show()
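# Hypothetical usage sketch (the paths are placeholders taken from the class's own
# commented-out constants; STUDENT_NAME and SYNTH_NAME are module-level constants
# defined elsewhere in this project):
if __name__ == '__main__':
    evaluator = LshSamplesEval('../studentData/liftoff/', '../data/raw/liftoff')
    scores = evaluator.computeLshNN()
    evaluator.constructHistogram(scores)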