def searchResult(datalist, labelist, title, name, imgpath):
    querylist = []
    query = queryEmbedding.queryProcessing()
    query.image_path = imgpath
    query.artist = [name]
    query.title = [title]
    w = query.conectquery()
    # querylist.append(query.conectquery())
    OPlist = []
    record = []
    for each in getNamelist(labelist):
        if 'http' in str(each[3]):
            record.append('[ARC Database]' + str(each[1]) + ' by:' + str(each[2]) +
                          '————info.' + str(each[3]))
        else:
            record.append(
                '[WikiArt]you can check ID:' + str(each[0]) + ':' + str(each[1]) +
                ' by:' + str(each[2]) +
                ' in WikiArt, or search by using an image from WikiArt or by using keywords:' +
                str(each[3]))
    if len(str(imgpath).replace(' ', '')) < 1:
        # No image supplied: query with the text half of the embedding only.
        querylist.append(w[:1536])
        cp = ci.MultiClusterIndex(
            np.array(getNamelist(datalist))[:, :1536], record)
        wodTP3 = cp.search(querylist, k=5, k_clusters=50,
                           return_distance=False)[0]
        for each in wodTP3:
            OPlist.append(each)
    elif len(str(title).replace(' ', '')) < 1 and len(
            str(name).replace(' ', '')) < 1:
        # No title or artist supplied: query with the image half of the embedding only.
        querylist.append(w[1536:])
        cp = ci.MultiClusterIndex(
            np.array(getNamelist(datalist))[:, 1536:], record)
        imgTP3 = cp.search(querylist, k=5, k_clusters=50,
                           return_distance=False)[0]
        for each in imgTP3:
            OPlist.append(each)
    else:
        # Both image and text supplied: query with the full embedding.
        cp = ci.MultiClusterIndex(np.array(getNamelist(datalist)), record)
        imgTP3 = cp.search(w, k=5, k_clusters=50, return_distance=False)[0]
        for each in imgTP3:
            OPlist.append(each)
    # print(getNamelist(datalist)[record.index(r)])
    return OPlist
def test_search(self):
    """Semantic search. TypeError: expected dimension <= 2 array or matrix."""
    print('{} test_search {}'.format('-' * 15, '-' * 15))
    texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
    # Vectorize the texts with character embeddings.
    vocab = Vocabulary()
    for text in texts:
        vocab.add_word_lst(list(text))
    print(len(vocab))
    embed = StaticEmbedding(
        vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
    texts_to_id = [[vocab.to_index(word) for word in list(text)]
                   for text in texts]
    words = torch.LongTensor(texts_to_id)  # convert the texts to indices
    features_vec = embed(words)
    print(features_vec.shape)
    # build the search index!
    # Note: features_vec has shape (batch, seq_len, dim); MultiClusterIndex expects
    # a 2-D matrix, hence the TypeError noted in the docstring.
    cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)
    search_texts = ['朱日和站', '温都尔站', '国电站']
    for text in search_texts:
        texts_to_id = [[vocab.to_index(word) for word in list(text)]]
        words = torch.LongTensor(texts_to_id)  # convert the text to indices
        features_vec = embed(words)
        search_features_vec = features_vec.detach().numpy()
        search_result = cp.search(search_features_vec, k=2, k_clusters=2,
                                  return_distance=True)
        print('text:{}'.format(text))
        print('search_result:{}'.format(search_result))
def update_model(self):
    ### This part is called only once, when training is done, to define our kNN model.
    # We set k equal to the number of users who clicked on something, divided by 10.
    self.k = max(int(np.floor(sum(self.did_click) / 10)), 1)
    total_organic = np.sum(self.M_organic, 1)
    # Use a scaled version of the organic information to find neighbours.
    self.M_organic_scaled = self.M_organic / total_organic[:, None]
    # Only use those users which have actually clicked on something.
    self.M_organic_scaled_only_clicks = self.M_organic_scaled[self.did_click, :]
    self.M_bandit_attempts_only_clicks = self.M_bandit_attempts[self.did_click, :]
    self.M_bandit_clicks_only_clicks = self.M_bandit_clicks[self.did_click, :]
    self.M_bandit_CTR_only_clicks = self.div(
        self.M_bandit_clicks_only_clicks, self.M_bandit_attempts_only_clicks)
    # Compute the total CTR for all items (not being used now).
    self.total_CTR = np.sum(self.M_bandit_CTR_only_clicks, 0)
    # Define our kNN model.
    self.kNN_model = ci.MultiClusterIndex(
        self.M_organic_scaled_only_clicks,
        range(len(self.M_organic_scaled_only_clicks[:, 0])))
    return
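# A minimal sketch (not part of the original snippet) of how the kNN model built in
# update_model() might be queried at recommendation time: scale a new user's organic
# counts the same way, then look up the self.k nearest neighbours. `user_organic`
# (a 1-D count vector) and the k_clusters value are assumptions for illustration.
def find_neighbours(self, user_organic):
    user_scaled = user_organic / np.sum(user_organic)
    neighbour_ids = self.kNN_model.search(user_scaled.reshape(1, -1), k=self.k,
                                          k_clusters=2, return_distance=False)[0]
    return [int(i) for i in neighbour_ids]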
def build_index_new(data_path):
    qa_path = 'qa_final.json'
    with open(qa_path) as f:
        data = json.load(f)
    question = defaultdict(list)
    answer = defaultdict(list)
    all_question = []
    all_cp = {}
    for k, v in data.items():
        for pair in v:
            question[k].append(' '.join(pair[0]))
            # question[k].append(' '.join(pair[0]) + ' '.join(pair[1]))
            answer[k].append(' '.join(pair[1]))
            # all_question.append(' '.join(pair[0]) + ' '.join(pair[1]))
            all_question.append(' '.join(pair[0]))
    # print(question['信用卡'][0])
    # exit()
    tv = TfidfVectorizer()
    tv.fit(all_question)
    # Build one search index per category, mapping question vectors to answers.
    for k, v in question.items():
        features_vec = tv.transform(v)
        cp = ci.MultiClusterIndex(features_vec, answer[k])
        all_cp[k] = cp
    return tv, all_cp
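# A plausible query-side sketch for build_index_new(): route a segmented query to the
# per-category index and return the top answers. The function name, the `category`
# argument and the k/k_clusters values are assumptions made for illustration.
def query_index(tv, all_cp, category, query_tokens, k=3):
    query_vec = tv.transform([' '.join(query_tokens)])
    return all_cp[category].search(query_vec, k=k, k_clusters=2,
                                   return_distance=False)[0]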
def test_search(self):
    """Semantic search."""
    print('{} test_search {}'.format('-' * 15, '-' * 15))
    texts = [
        '温都尔站',
        '东乌广厦',
        '国电四郎',
        '阿尔善站',
        '朱日和基'
    ]
    # Vectorize the texts with character-bigram TF-IDF features.
    tv = TfidfVectorizer(analyzer='char', ngram_range=(2, 2))
    tv.fit(texts)
    features_vec = tv.transform(texts)
    # build the search index!
    cp = ci.MultiClusterIndex(features_vec, texts)
    search_texts = [
        '朱日和站',
        '温都尔站',
        '国电站'
    ]
    for text in search_texts:
        search_features_vec = tv.transform([text])
        search_result = cp.search(search_features_vec, k=2, k_clusters=2,
                                  return_distance=True)
        print('text:{}'.format(text))
        print('search_result:{}'.format(search_result))
def find_edges(input, test, K):
    print(f"building kNN classifier ... ", end=" ")
    st_time = time.time()
    if kNN_type <= 3:
        input, test = input.todense(), test.todense()
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=10)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("finding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    else:
        indices = tree.search(test, k=K + 1, k_clusters=100, return_distance=False)
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"done! .... time={time.time()-st_time:.3f}s")
    return edge_list
def fit(self, queries, queries_idx):
    """
    Args:
        queries: list of strings, each string a cut (word-segmented) query
        queries_idx: records (e.g. ids) returned by the index for each query

    Returns:
        self
    """
    self._tfidf.fit(queries)
    self._ci = ci.MultiClusterIndex(self._tfidf.transform(queries), queries_idx)
    return self
def _build_index(self):
    print("building search engine index")
    print("start train tfidf")
    self.tv = TfidfVectorizer(max_df=0.7, min_df=10)
    self.tv.fit(self.data)
    print("start transform tfidf")
    features_vec = self.tv.transform(self.data)
    # build the search index!
    print("start build index")
    self.cp = ci.MultiClusterIndex(features_vec, list(range(len(self.data))))
    print("build finished")
def pysparnn_cluster_um(um, knn, k_clusters=8, cf_type='user'):
    # For item-based CF, work on the transposed user-movie matrix.
    if cf_type == 'movie':
        um = um.T
    num_vectors = um.shape[0]
    k_clusters = int(np.sqrt(num_vectors))
    return_data = range(num_vectors)
    cp = ci.MultiClusterIndex(um, return_data)
    # Ask for knn + 1 neighbours because each vector is its own nearest neighbour.
    results = cp.search(um, k=knn + 1, k_clusters=k_clusters, return_distance=True)
    dist, ind = results_to_matrices(results)
    sim = 1.0 - dist
    # Drop the first column (the self-matches) before returning.
    return sim[:, 1:], ind[:, 1:]
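# results_to_matrices() is used above but not defined in this snippet. A minimal
# sketch, assuming pysparnn's search(..., return_distance=True) returns, for each
# query row, a list of (distance, record) pairs:
def results_to_matrices(results):
    dist = np.array([[float(d) for d, _ in row] for row in results])
    ind = np.array([[int(r) for _, r in row] for row in results])
    return dist, ind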
def prepar_recall_datas():
    qa_dict = json.load(open("./corpus/qa_dict.json", encoding="utf-8"))
    q_list = []
    q_cut = []
    for i in qa_dict:
        q_list.append(i)
        q_cut.append(" ".join(qa_dict[i]["cut"]))  # segmented questions: [sentence, sentence, ...]
    tfidf_vec = TfidfVectorizer()
    q_vector = tfidf_vec.fit_transform(q_cut)  # question vectors
    # Prepare the search index used for recall.
    cp = ci.MultiClusterIndex(q_vector, q_list)
    return tfidf_vec, cp, qa_dict
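# A minimal recall sketch built on prepar_recall_datas(); it assumes the incoming user
# question has already been segmented the same way as qa_dict[i]["cut"]. The function
# name and the k/k_clusters values are illustrative choices, not part of the original.
def recall(user_question_cut, k=10):
    tfidf_vec, cp, qa_dict = prepar_recall_datas()
    query_vec = tfidf_vec.transform([" ".join(user_question_cut)])
    candidates = cp.search(query_vec, k=k, k_clusters=10, return_distance=False)[0]
    return [(q, qa_dict[q]) for q in candidates]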
def gen_candidates():
    test_mentions = get_mention_docs("test")
    train_mentions = get_mention_docs("train")
    dev_mentions = get_mention_docs("dev")
    mentions = {}
    mentions.update({
        k: ' '.join(set(v["text"].split()) - en_stops)
        for k, v in train_mentions.items()
    })
    mentions.update({
        k: ' '.join(set(v["text"].split()) - en_stops)
        for k, v in test_mentions.items()
    })
    mentions.update({
        k: ' '.join(set(v["text"].split()) - en_stops)
        for k, v in dev_mentions.items()
    })
    mrconso = get_mrconso()
    aliases = {
        k: " ".join(set(v["alias"]["ENG"]) - en_stops)
        for k, v in mrconso.items() if "ENG" in v["alias"]
    }
    mention_ids = sorted(mentions)
    cuis = sorted(aliases)
    vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 5),
                                 max_features=100000)
    print(vectorizer)
    X_cui = vectorizer.fit_transform([aliases[cid] for cid in cuis])
    X_mention = vectorizer.transform([mentions[mid] for mid in mention_ids])
    print(X_cui.shape, X_mention.shape)
    print("indexing...")
    cp = ci.MultiClusterIndex(X_cui, cuis)
    print("searching...")
    ns = cp.search(X_mention, k=64, k_clusters=2, return_distance=False)
    with open('ns.pkl', 'wb') as fout:
        pickle.dump((ns, cuis, mention_ids), fout)
    with open('mm_tfidf_candidatessparsenn.json', 'w') as fout:
        for i, nbrs in enumerate(ns):
            mention_id = mention_ids[i]
            fout.write(
                json.dumps({
                    "mention_id": mention_id,
                    "tfidf_candidates": [nbr for nbr in nbrs]
                }))
            fout.write('\n')
def test_levels_multiindex(self):
    """Test multiple level indexes"""
    features = np.random.binomial(1, 0.01, size=(1000, 20000))
    features = csr_matrix(features)

    # build the search index!
    data_to_return = list(range(1000))

    # matrix size smaller - this forces the index to have multiple levels
    cluster_index = ci.MultiClusterIndex(features, data_to_return,
                                         matrix_size=10)

    ret = cluster_index.search(features[0:10], k=1, k_clusters=1,
                               return_distance=False)
    self.assertEqual([[x] for x in data_to_return[:10]], ret)
def test_large_k(self):
    """Test searching with a large k"""
    features = np.random.binomial(1, 0.01, size=(1000, 20000))
    features = csr_matrix(features)

    # build the search index!
    data_to_return = np.array(list(range(1000)), dtype=int)

    # matrix size smaller - this forces the index to have multiple levels
    cluster_index = ci.MultiClusterIndex(features, data_to_return,
                                         matrix_size=10)

    ret = cluster_index.search(features[0], k=100, k_clusters=1,
                               return_distance=False)
    self.assertEqual(100, len(ret[0]))
def get_pysprnn_model(self, fasttext):
    if os.path.exists(config.recall_pysparnn_cp_model_path):
        return pickle.load(open(config.recall_pysparnn_cp_model_path, "rb"))
    else:
        lines = open(config.recall_merged_q_path, "r").readlines()
        lines = [line.strip() for line in lines]
        questions_string_cut = [
            " ".join(sentence_process.cut_sentence_by_character(sentence))
            for sentence in lines
        ]
        # fastText sentence vectors for every question
        questions_vectors = [
            fasttext.get_sentence_vector(question_string_cut)
            for question_string_cut in questions_string_cut
        ]
        cp = ci.MultiClusterIndex(questions_vectors, questions_string_cut, num_indexes=3)
        pickle.dump(cp, open(config.recall_pysparnn_cp_model_path, "wb"))
        return cp
def __init__(self, df, threshold=0.9):
    self.df = df
    self.threshold = threshold
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
    self.cv = CountVectorizer(max_df=0.85, stop_words=stop_words,
                              max_features=10000)
    docs = self.df['normalized'].tolist()
    word_count_vector = self.cv.fit_transform(docs)
    self.tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    self.tfidf_transformer.fit(word_count_vector)
    features_vec = self.tfidf_transformer.transform(word_count_vector)
    # Index the TF-IDF vectors, returning row positions as records.
    self.index = ci.MultiClusterIndex(features_vec, np.arange(len(docs)))
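# A plausible companion lookup for the index built in __init__ above, assuming the
# class is meant to find near-duplicate rows of df: vectorize a text the same way,
# query with distances, and keep hits whose cosine similarity (1 - distance) clears
# self.threshold. The method name and the return format are assumptions.
def find_similar(self, text, k=5):
    vec = self.tfidf_transformer.transform(self.cv.transform([text]))
    hits = self.index.search(vec, k=k, k_clusters=2, return_distance=True)[0]
    return [(int(idx), 1.0 - float(dist))
            for dist, idx in hits
            if 1.0 - float(dist) >= self.threshold]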
def build_index(data_path):
    qa_path = 'qa.json'
    with open(qa_path) as f:
        data = json.load(f)
    question = []
    answer = []
    for d in data:
        # Index the concatenated question + answer text; return the answer as the record.
        question.append(' '.join(d[0]) + ' '.join(d[1]))
        answer.append(' '.join(d[1]))
    tv = TfidfVectorizer()
    tv.fit(question)
    features_vec = tv.transform(question)
    cp = ci.MultiClusterIndex(features_vec, answer)
    return tv, cp
def build_advanced_index(self) -> None:
    """Build the index using pysparnn `cluster_index` and store it in `multi_cluster_index`."""
    import pysparnn.cluster_index as ci
    import scipy

    if not self.index:
        raise ValueError(
            'Index is empty, please add data into the indexer using the `add` method.')
    keys = []
    indexed_vectors = []
    for key, vector in self.index.items():
        keys.append(key)
        indexed_vectors.append(vector)
    self.multi_cluster_index = ci.MultiClusterIndex(
        features=scipy.sparse.vstack(indexed_vectors),
        records_data=keys,
        distance_type=self.metric,
        num_indexes=self.num_indexes,
    )
def test_search(self):
    """Semantic search."""
    print('{} test_search {}'.format('-' * 15, '-' * 15))
    texts = [
        'hello world',
        'oh hello there',
        'Play it',
        'Play it again Sam'
    ]
    # Vectorize the texts with TF-IDF features.
    tv = TfidfVectorizer()
    tv.fit(texts)
    features_vec = tv.transform(texts)
    # build the search index!
    cp = ci.MultiClusterIndex(features_vec, texts)
    search_texts = ['Play it', 'oh there', 'Play it again Frank']
    for text in search_texts:
        search_features_vec = tv.transform([text])
        search_result = cp.search(search_features_vec, k=1, k_clusters=2,
                                  return_distance=True)
        print('text:{}'.format(text))
        print('search_result:{}'.format(search_result))
def build_index(self, vectors, data):
    """Build the search index and persist it to disk."""
    search_index = ci.MultiClusterIndex(vectors, data, num_indexes=2)
    pickle.dump(search_index, open(self.search_index_path, "wb"))
    return search_index
import pysparnn.cluster_index as ci
from sentiment_infer import normalize_text
import joblib

# Load the pre-computed TF-IDF matrix and its records, then build the index once at import time.
data_transform, data = joblib.load('./pre-trained/data.db')
data = [str(x[0]) + ':' + x[1] for x in data]
cp = ci.MultiClusterIndex(data_transform, data)


def tfidf_search(text, vectorizer, k=50):
    text = normalize_text(text)
    search_features_vec = vectorizer.transform([text])
    search_results = cp.search(search_features_vec, k_clusters=2, k=k,
                               return_distance=False)
    return search_results[0]
# Concatenate each question-answer pair and segment it with jieba.
corpus = []
for id, item in enumerate(qa):
    tmp = item['q'] + item['a']
    tmp = jieba.cut(tmp)
    tmp = ' '.join(tmp)
    corpus.append(tmp)

# Generate the bag of words.
# TfidfVectorizer is a combination of CountVectorizer and TfidfTransformer;
# here we use TfidfVectorizer.
tv = TfidfVectorizer()
# Fit on the corpus.
tv.fit(corpus)
# Get all words (the vocabulary).
words = tv.get_feature_names()
# Get the TF-IDF features of every QA pair.
tfidf = tv.transform(corpus)
# Build the index, using the QA positions as records.
cp = ci.MultiClusterIndex(tfidf, range(len(qa)))
# Save the vectorizer and the index.
pickle.dump(tv, open(tv_path, 'wb'))
pickle.dump(cp, open(cp_path, 'wb'))
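# A minimal query sketch for the artifacts pickled above: load the vectorizer and the
# index, segment the incoming question the same way as the corpus, and map the returned
# positions back to the qa list. The function name is an illustrative choice.
def ask(question, qa, tv_path, cp_path, k=1):
    tv = pickle.load(open(tv_path, 'rb'))
    cp = pickle.load(open(cp_path, 'rb'))
    query = ' '.join(jieba.cut(question))
    idx = cp.search(tv.transform([query]), k=k, k_clusters=2,
                    return_distance=False)[0]
    return [qa[int(i)] for i in idx]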
def find_edges(input, test, K, cluster_ids, query_ids):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads,
                             'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K + 1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                # Map local row positions back to the global query / cluster ids.
                edge_list.append((query_ids[index1], cluster_ids[index2]))
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list
def nearest_neighbours(data, query, k=5):
    n = range(data.shape[0])
    cp = ci.MultiClusterIndex(data, n)
    return cp.search(query, k=k, k_clusters=2, return_distance=True)
import pysparnn.cluster_index as ci
import numpy as np
from scipy.sparse import csr_matrix

features = np.random.binomial(1, 0.01, size=(1000, 20000))
features = csr_matrix(features)

# build the search index!
data_to_return = range(1000)
cp = ci.MultiClusterIndex(features, data_to_return)

t = cp.search(features[:5], k=1, return_distance=False)
print(t)
def bulit_q_vector():
    lines_cuted = [value['cuted'] for value in QA_dict.values()]
    # Instantiate and fit the TF-IDF vectorizer.
    tfidf = TfidfVectorizer()
    tfidf.fit(lines_cuted)
    features_vec = tfidf.transform(lines_cuted)
    return tfidf, features_vec, lines_cuted


tfidf, features_vec, lines_cuted = bulit_q_vector()
print('ljflsdfsdf')
print(tfidf.transform(['byebye']))

cp = ci.MultiClusterIndex(features_vec, lines_cuted)
ret = ['python 什么']
# Vectorize the user's input sentence.
search_vec = tfidf.transform(ret)
print(search_vec)
# Search for results, return the top 8 matches, then filter them using `main_entiry`.
cp_search_list = cp.search(search_vec, k=8, k_clusters=10, return_distance=True)
print(cp_search_list)
# MRR calculation
def mean_reciprocal_rank(rs):
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


eval_results = {'recalls': {}, 'mrr': {}, 'ndcg': {}}
doc_index = np.array(range(len(matrix_df)))
for k in [2, 5, 10, 15, 20, 50]:
    print(k)
    snn = ci.MultiClusterIndex(um_matrix, doc_index, num_indexes=k)
    results = snn.search(um_matrix, k=5, return_distance=True, k_clusters=1)
    results_dic = []
    for i in results:
        # Map each returned record id to its distance.
        results_dic.append(dict((int(y), x) for x, y in i))
    # For each user, average the distances of the movies that user rated
    # to retrieve the top-k movies to recommend.
    avg_dict = {}
    for index, user in enumerate(user_hists):
        user_dict = {}
        for movie in user:
            for m in results_dic[movie]:
                if m not in user_dict:
                    user_dict[m] = results_dic[movie][m]
                else:
                    user_dict[m] += results_dic[movie][m]  # accumulate for later averaging
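# A small worked example for mean_reciprocal_rank() defined above: each row is a binary
# relevance vector for one query. The first relevant item sits at rank 1, rank 3, and
# nowhere, so the MRR is (1/1 + 1/3 + 0) / 3 ≈ 0.444. The example data is illustrative only.
example_rs = [[1, 0, 0], [0, 0, 1], [0, 0, 0]]
print(mean_reciprocal_rank(example_rs))  # ≈ 0.4444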
def try_pysparnn(train_v, test_v, train_l, k_clstrs):
    cp = ci.MultiClusterIndex(train_v, train_l)
    return cp.search(test_v, k=1, k_clusters=k_clstrs, return_distance=False)
def serve(args):
    # serve_demo: Load saved embeddings, serve question model. Question in, results out.
    # serve_question: only serve question model. Question in, vector out.
    # serve_context: only serve context model. Context in, phrase-vector pairs out.
    # serve: serve all three.
    device = torch.device('cuda' if args.cuda else 'cpu')
    pprint(args.__dict__)

    interface = FileInterface(**args.__dict__)
    # use cache for metadata
    if args.cache:
        out = interface.cache(preprocess, args)
        processor = out['processor']
        processed_metadata = out['processed_metadata']
    else:
        processor = Processor(**args.__dict__)
        metadata = interface.load_metadata()
        processed_metadata = processor.process_metadata(metadata)

    model = Model(**args.__dict__).to(device)
    model.init(processed_metadata)
    interface.bind(processor, model)

    interface.load(args.iteration, session=args.load_dir)

    with torch.no_grad():
        model.eval()

        if args.mode == 'serve_demo':
            phrases = []
            paras = []
            results = []
            embs = []
            idxs = []
            iterator = interface.context_load(metadata=True, emb_type=args.emb_type)
            for _, (cur_phrases, each_emb, metadata) in zip(range(args.num_train_mats), iterator):
                embs.append(each_emb)
                phrases.extend(cur_phrases)
                for span in metadata['answer_spans']:
                    results.append([len(paras), span[0], span[1]])
                    idxs.append(len(idxs))
                paras.append(metadata['context'])

            if args.emb_type == 'dense':
                import faiss
                emb = np.concatenate(embs, 0)

                d = 4 * args.hidden_size * args.num_heads
                if args.metric == 'ip':
                    quantizer = faiss.IndexFlatIP(d)  # Exact Search
                elif args.metric == 'l2':
                    quantizer = faiss.IndexFlatL2(d)
                else:
                    raise ValueError()

                if args.nlist != args.nprobe:
                    # Approximate search. nlist > nprobe makes it faster and less accurate.
                    if args.bpv is None:
                        if args.metric == 'ip':
                            search_index = faiss.IndexIVFFlat(quantizer, d, args.nlist,
                                                              faiss.METRIC_INNER_PRODUCT)
                        elif args.metric == 'l2':
                            search_index = faiss.IndexIVFFlat(quantizer, d, args.nlist)
                        else:
                            raise ValueError()
                    else:
                        assert args.metric == 'l2'  # only l2 is supported for product quantization
                        search_index = faiss.IndexIVFPQ(quantizer, d, args.nlist, args.bpv, 8)
                    search_index.train(emb)
                else:
                    search_index = quantizer
                search_index.add(emb)

                for cur_phrases, each_emb, metadata in iterator:
                    phrases.extend(cur_phrases)
                    for span in metadata['answer_spans']:
                        results.append([len(paras), span[0], span[1]])
                    paras.append(metadata['context'])
                    search_index.add(each_emb)

                if args.nlist != args.nprobe:
                    search_index.nprobe = args.nprobe

                def search(emb, k):
                    D, I = search_index.search(emb, k)
                    return D[0], I[0]

            elif args.emb_type == 'sparse':
                assert args.metric == 'l2'  # currently only l2 is supported (couldn't find a good ip library)
                import pysparnn.cluster_index as ci
                cp = ci.MultiClusterIndex(embs, idxs)

                for cur_phrases, each_emb, metadata in iterator:
                    phrases.extend(cur_phrases)
                    for span in metadata['answer_spans']:
                        results.append([len(paras), span[0], span[1]])
                    paras.append(metadata['context'])
                    for each_vec in each_emb:
                        cp.insert(each_vec, len(idxs))
                        idxs.append(len(idxs))

                def search(emb, k):
                    return zip(*[each[0] for each in cp.search(emb, k=k)])
            else:
                raise ValueError()

            def retrieve(question, k):
                example = {'question': question, 'id': 'real', 'idx': 0}
                dataset = (processor.preprocess(example), )
                loader = DataLoader(dataset, batch_size=1, collate_fn=processor.collate)
                batch = next(iter(loader))
                question_output = model.get_question(**batch)
                question_results = processor.postprocess_question_batch(dataset, batch, question_output)
                id_, emb = question_results[0]
                D, I = search(emb, k)
                out = [(paras[results[i][0]], results[i][1], results[i][2], '%.4r' % d.item(),)
                       for d, i in zip(D, I)]
                return out

            if args.mem_info:
                import psutil
                import os
                pid = os.getpid()
                py = psutil.Process(pid)
                info = py.memory_info()[0] / 2. ** 30
                print('Memory Use: %.2f GB' % info)

            # Demo server. Requires flask and tornado.
            from flask import Flask, request, jsonify
            from flask_cors import CORS
            from tornado.wsgi import WSGIContainer
            from tornado.httpserver import HTTPServer
            from tornado.ioloop import IOLoop

            app = Flask(__name__, static_url_path='/static')
            app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False
            CORS(app)

            @app.route('/')
            def index():
                return app.send_static_file('index.html')

            @app.route('/files/<path:path>')
            def static_files(path):
                return app.send_static_file('files/' + path)

            @app.route('/api', methods=['GET'])
            def api():
                query = request.args['query']
                out = retrieve(query, 5)
                return jsonify(out)

            print('Starting server at %d' % args.port)
            http_server = HTTPServer(WSGIContainer(app))
            http_server.listen(args.port)
            IOLoop.instance().start()
import pandas as pd
import sqlite3
from vectorizerClass import StemmedTfidfVectorizer
from sklearn.externals import joblib
from nltk.corpus import stopwords
from pysparnn import cluster_index as ci

with sqlite3.connect("vk.sql") as con:
    data = pd.read_sql("""SELECT "index","question" FROM 'vk.sql'""", con)

questions_index = data["index"]
questions_data = data["question"]

stop_words = stopwords.words("russian")
vector = StemmedTfidfVectorizer(min_df=1, stop_words=stop_words,
                                decode_error="ignore")
vector_data = vector.fit_transform(questions_data)

# Build the kNN index over the question vectors and persist both artifacts.
knn_index = ci.MultiClusterIndex(vector_data, questions_index)

joblib.dump(vector, 'vector.pkl')
joblib.dump(knn_index, 'index_knn.pkl')
def find_edges(input, test, K):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC = 30, 100
        index_time_params = {'M': M, 'indexThreadQty': num_threads,
                             'efConstruction': efC, 'post': 0}
        # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_names = ['l2_sparse', 'cosinesimil_sparse']
        space_name = space_names[0]
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        '''
        def calc_zero_rows(i):
            if input[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows, range(input.shape[0])))
        print(f"# zero rows in input = {zero_row_num}", end=" ")
        '''
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K + 1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        '''
        def calc_zero_rows2(i):
            if test[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows2, range(test.shape[0])))
        print(f"# zero rows in test = {zero_row_num}")
        '''
        indices_ = tree.knnQueryBatch(test, k=K + 1, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        assert len(per) == K + 1, f"index1={index1} len(per)={len(per)} != K={K}"
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list