def searchResult(datalist, labelist, title, name, imgpath):
    """Search the artwork index with whatever query information is available
    (title/artist text, an image, or both) and return formatted result strings."""
    querylist = []
    query = queryEmbedding.queryProcessing()
    query.image_path = imgpath
    query.artist = [name]
    query.title = [title]
    w = query.conectquery()
    #querylist.append(query.conectquery())
    OPlist = []

    record = []

    for each in getNamelist(labelist):
        if 'http' in str(each[3]):
            record.append('[ARC Database]' + str(each[1]) + ' by:' +
                          str(each[2]) + '————info.' + str(each[3]))
        else:
            record.append(
                '[WikiArt] you can check ID:' + str(each[0]) + ': ' +
                str(each[1]) + ' by: ' + str(each[2]) +
                ' in WikiArt, or search by using the image from WikiArt or by using keywords: '
                + str(each[3]))

    if len(str(imgpath).replace(' ', '')) < 1:
        # No image given: search against the first 1536 dimensions (the text part of the embedding).
        querylist.append(w[:1536])
        cp = ci.MultiClusterIndex(
            np.array(getNamelist(datalist))[:, :1536], record)
        wodTP3 = cp.search(querylist,
                           k=5,
                           k_clusters=50,
                           return_distance=False)[0]
        for each in wodTP3:
            OPlist.append(each)
    elif len(str(title).replace(' ', '')) < 1 and len(
            str(name).replace(' ', '')) < 1:
        # No title/artist given: search against the remaining dimensions (the image part of the embedding).
        querylist.append(w[1536:])
        cp = ci.MultiClusterIndex(
            np.array(getNamelist(datalist))[:, 1536:], record)
        imgTP3 = cp.search(querylist,
                           k=5,
                           k_clusters=50,
                           return_distance=False)[0]
        for each in imgTP3:
            OPlist.append(each)
    else:
        # Both text and image information are available: search the full embedding.
        cp = ci.MultiClusterIndex(np.array(getNamelist(datalist)), record)
        imgTP3 = cp.search(w, k=5, k_clusters=50, return_distance=False)[0]
        for each in imgTP3:
            OPlist.append(each)
    #print(getNamelist(datalist)[record.index(r)])
    return OPlist
Example No. 2
 def test_search(self):
     """语义搜索.TypeError: expected dimension <= 2 array or matrix
     """
     print('{} test_search {}'.format('-' * 15, '-' * 15))
     texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
     # Vectorize the texts
     vocab = Vocabulary()
     for text in texts:
         vocab.add_word_lst(list(text))
     print(len(vocab))
     embed = StaticEmbedding(
         vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
     texts_to_id = [[vocab.to_index(word) for word in list(text)]
                    for text in texts]
     words = torch.LongTensor(texts_to_id)  # convert the texts to index tensors
     features_vec = embed(words)
     print(features_vec.shape)
     # build the search index!
     cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)
     search_texts = ['朱日和站', '温都尔站', '国电站']
     for text in search_texts:
         texts_to_id = [[vocab.to_index(word) for word in list(text)]]
         words = torch.LongTensor(texts_to_id)  # convert the text to index tensors
         features_vec = embed(words)
         search_features_vec = features_vec.detach().numpy()
         search_result = cp.search(search_features_vec,
                                   k=2,
                                   k_clusters=2,
                                   return_distance=True)
         print('text:{}'.format(text))
         print('search_result:{}'.format(search_result))
     """
    def update_model(self):
        ### This part is called only once, when training is done, to define our kNN model.
        # We set k to the number of users who clicked on something, divided by 10 (at least 1).
        self.k = max(int(np.floor(sum(self.did_click) / 10)), 1)
        total_organic = np.sum(self.M_organic, 1)
        # Use a scaled version of the organic information to find neighbours
        self.M_organic_scaled = self.M_organic / total_organic[:, None]

        # Only use those users who have actually clicked on something
        self.M_organic_scaled_only_clicks = self.M_organic_scaled[
            self.did_click, :]
        self.M_bandit_attempts_only_clicks = self.M_bandit_attempts[
            self.did_click, :]
        self.M_bandit_clicks_only_clicks = self.M_bandit_clicks[
            self.did_click, :]
        self.M_bandit_CTR_only_clicks = self.div(
            self.M_bandit_clicks_only_clicks,
            self.M_bandit_attempts_only_clicks)
        # Compute the total CTR for all items (not used at the moment).
        self.total_CTR = np.sum(self.M_bandit_CTR_only_clicks, 0)

        #Define our kNN model.
        self.kNN_model = ci.MultiClusterIndex(
            self.M_organic_scaled_only_clicks,
            range(len(self.M_organic_scaled_only_clicks[:, 0])))
        return
def build_index_new(data_path):
    qa_path = 'qa_final.json'

    with open(qa_path) as f:
        data = json.load(f)

    question = defaultdict(list)
    answer = defaultdict(list)
    all_question = []
    all_cp = {}
    for k, v in data.items():
        for pair in v:
            question[k].append(' '.join(pair[0]))
            # question[k].append(' '.join(pair[0]) + ' '.join(pair[1]))
            answer[k].append(' '.join(pair[1]))
            # all_question.append(' '.join(pair[0]) + ' '.join(pair[1]))
            all_question.append(' '.join(pair[0]))


    # print(question['信用卡'][0])
    # exit()

    tv = TfidfVectorizer()
    tv.fit(all_question)

    for k, v in question.items():
        features_vec = tv.transform(v)
        cp = ci.MultiClusterIndex(features_vec, answer[k])
        all_cp[k] = cp

    return tv, all_cp
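A minimal retrieval sketch for the per-category indexes returned above, assuming the same imports; the helper name recall_answers, the category key and the sample query are illustrative, not part of the original code.

def recall_answers(tv, all_cp, category, cut_query, k=3):
    # Vectorize the already-segmented query with the shared TF-IDF model ...
    query_vec = tv.transform([' '.join(cut_query)])
    # ... and search that category's index; the stored records are the answers.
    return all_cp[category].search(query_vec, k=k, k_clusters=2,
                                   return_distance=False)[0]

# Hypothetical usage:
# tv, all_cp = build_index_new('qa_final.json')
# print(recall_answers(tv, all_cp, '信用卡', ['信用卡', '怎么', '还款']))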
Example No. 5
 def test_search(self):
     """语义搜索.
     """
     print('{} test_search {}'.format('-'*15, '-'*15))
     texts = [
         '温都尔站',
         '东乌广厦',
         '国电四郎',
         '阿尔善站',
         '朱日和基'
     ]
     # Vectorize the texts
     tv = TfidfVectorizer(analyzer='char', ngram_range=(2, 2))
     tv.fit(texts)
     features_vec = tv.transform(texts)
     # build the search index!
     cp = ci.MultiClusterIndex(features_vec, texts)
     search_texts = [
         '朱日和站',
         '温都尔站',
         '国电站'
     ]
     for text in search_texts:
         search_features_vec = tv.transform([text])
         search_result = cp.search(search_features_vec, k=2, k_clusters=2, return_distance=True)
         print('text:{}'.format(text))
         print('search_result:{}'.format(search_result))
     """
Example No. 6
def find_edges(input, test, K):
    print(f"building kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type <= 3:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=10)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("finding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    else:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"done! .... time={time.time()-st_time:.3f}s")
    return edge_list
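A small, self-contained driver sketch for the function above, exercising the pysparnn branch; the value of the module-level kNN_type and the toy sparse features are assumptions made for illustration, and find_edges plus its module-level imports (time, tqdm) are expected to be in scope.

import numpy as np
from scipy.sparse import csr_matrix

kNN_type = 4  # select the pysparnn (MultiClusterIndex) branch above

rng = np.random.default_rng(0)
train_features = csr_matrix(rng.binomial(1, 0.05, size=(100, 500)))
query_features = csr_matrix(rng.binomial(1, 0.05, size=(10, 500)))

# Each query row is linked to (up to) K of its nearest training rows.
edges = find_edges(train_features, query_features, K=5)
print(len(edges), edges[:5])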
Example No. 7
    def fit(self, queries, queries_idx):
        """

        Args:
            queries:  list of string, string is a cut query
            queries_idx:

        Returns:
            self
        """
        self._tfidf.fit(queries)
        self._ci = ci.MultiClusterIndex(self._tfidf.transform(queries),
                                        queries_idx)
        return self
Example No. 8
    def _build_index(self):
        print("building search engine index")

        print("start train tfidf")
        self.tv = TfidfVectorizer(max_df=0.7, min_df=10)
        self.tv.fit(self.data)

        print("start transform tfidf")
        features_vec = self.tv.transform(self.data)

        # build the search index!
        print("start build index")
        self.cp = ci.MultiClusterIndex(features_vec,
                                       list(range(len(self.data))))
        print("build finished")
def pysparnn_cluster_um(um, knn, k_clusters=8, cf_type='user'):
    if cf_type == 'movie': um = um.T
    num_vectors = um.shape[0]

    # Note: this overrides the k_clusters argument with sqrt(num_vectors).
    k_clusters = int(np.sqrt(num_vectors))
    return_data = range(num_vectors)
    cp = ci.MultiClusterIndex(um, return_data)
    results = cp.search(um,
                        k=knn + 1,
                        k_clusters=k_clusters,
                        return_distance=True)

    dist, ind = results_to_matrices(results)
    sim = 1.0 - dist
    return sim[:, 1:], ind[:, 1:]
Example No. 10
def prepar_recall_datas():
    qa_dict = json.load(open("./corpus/qa_dict.json", encoding="utf-8"))
    q_list = []
    q_cut = []
    for i in qa_dict:
        q_list.append(i)
        q_cut.append(" ".join(
            qa_dict[i]["cut"]))  #分词之后的问题 [sentence,sentence,....]

    tfidf_vec = TfidfVectorizer()
    q_vector = tfidf_vec.fit_transform(q_cut)  # TF-IDF vectors of the questions

    # build the search index
    cp = ci.MultiClusterIndex(q_vector, q_list)

    return tfidf_vec, cp, qa_dict
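A minimal recall sketch on top of the objects returned above; the helper name and the example query are illustrative, and the query is assumed to be segmented the same way as qa_dict[i]["cut"].

def recall(tfidf_vec, cp, cut_query, k=5):
    # Vectorize the segmented query and return the k most similar stored questions.
    query_vec = tfidf_vec.transform([" ".join(cut_query)])
    return cp.search(query_vec, k=k, k_clusters=10, return_distance=True)[0]

# Hypothetical usage:
# tfidf_vec, cp, qa_dict = prepar_recall_datas()
# for distance, question in recall(tfidf_vec, cp, ["python", "是", "什么"]):
#     print(distance, question)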
Example No. 11
def gen_candidates():
    test_mentions = get_mention_docs("test")
    train_mentions = get_mention_docs("train")
    dev_mentions = get_mention_docs("dev")
    mentions = {}
    mentions.update({
        k: ' '.join(set(v["text"].split()) - en_stops)
        for k, v in train_mentions.items()
    })
    mentions.update({
        k: ' '.join(set(v["text"].split()) - en_stops)
        for k, v in test_mentions.items()
    })
    mentions.update({
        k: ' '.join(set(v["text"].split()) - en_stops)
        for k, v in dev_mentions.items()
    })

    mrconso = get_mrconso()
    aliases = {
        k: " ".join(set(v["alias"]["ENG"]) - en_stops)
        for k, v in mrconso.items() if "ENG" in v["alias"]
    }
    mention_ids = sorted(mentions)
    cuis = sorted(aliases)
    vectorizer = TfidfVectorizer(analyzer="char_wb",
                                 ngram_range=(1, 5),
                                 max_features=100000)
    print(vectorizer)
    X_cui = vectorizer.fit_transform([aliases[cid] for cid in cuis])
    X_mention = vectorizer.transform([mentions[mid] for mid in mention_ids])
    print(X_cui.shape, X_mention.shape)
    print("indexing...")
    cp = ci.MultiClusterIndex(X_cui, cuis)
    print("searching...")
    ns = cp.search(X_mention, k=64, k_clusters=2, return_distance=False)
    with open('ns.pkl', 'wb') as fout:
        pickle.dump((ns, cuis, mention_ids), fout)
    with open('mm_tfidf_candidatessparsenn.json', 'w') as fout:
        for i, nbrs in enumerate(ns):
            mention_id = mention_ids[i]
            fout.write(
                json.dumps({
                    "mention_id": mention_id,
                    "tfidf_candidates": [nbr for nbr in nbrs]
                }))
            fout.write('\n')
Example No. 12
    def test_levels_multiindex(self):
        """Test multiple level indexes"""
        features = np.random.binomial(1, 0.01, size=(1000, 20000))
        features = csr_matrix(features)

        # build the search index!
        data_to_return = list(range(1000))

        # matrix size smaller - this forces the index to have multiple levels
        cluster_index = ci.MultiClusterIndex(features,
                                             data_to_return,
                                             matrix_size=10)

        ret = cluster_index.search(features[0:10],
                                   k=1,
                                   k_clusters=1,
                                   return_distance=False)
        self.assertEqual([[x] for x in data_to_return[:10]], ret)
Example No. 13
    def test_large_k(self):
        """Test multiple level indexes"""
        features = np.random.binomial(1, 0.01, size=(1000, 20000))
        features = csr_matrix(features)

        # build the search index!
        data_to_return = np.array(list(range(1000)), dtype=int)

        # matrix size smaller - this forces the index to have multiple levels
        cluster_index = ci.MultiClusterIndex(features,
                                             data_to_return,
                                             matrix_size=10)

        ret = cluster_index.search(features[0],
                                   k=100,
                                   k_clusters=1,
                                   return_distance=False)
        self.assertEqual(100, len(ret[0]))
Example No. 14
    def get_pysprnn_model(self, fasttext):

        if os.path.exists(config.recall_pysparnn_cp_model_path):
            return pickle.load(open(config.recall_pysparnn_cp_model_path, "rb"))
        else:
            lines = open(config.recall_merged_q_path, "r").readlines()

            lines = [line.strip() for line in lines]

            quesions_string_cut = [" ".join(sentence_process.cut_sentence_by_character(setence)) for setence in lines]

            quesions_vectors = [fasttext.get_sentence_vector(quesion_string_cut) for quesion_string_cut in quesions_string_cut]

            # index the fasttext sentence vectors with pysparnn
            cp = ci.MultiClusterIndex(quesions_vectors, quesions_string_cut, num_indexes=3)

            pickle.dump(cp, open(config.recall_pysparnn_cp_model_path, "wb"))

            return cp
Example No. 15
    def __init__(self, df, threshold=0.9):
        self.df = df
        self.threshold = threshold

        nltk.download('stopwords', quiet=True)
        stop_words = set(stopwords.words('english'))

        self.cv = CountVectorizer(max_df=0.85,
                                  stop_words=stop_words,
                                  max_features=10000)
        docs = self.df['normalized'].tolist()
        word_count_vector = self.cv.fit_transform(docs)

        self.tfidf_transformer = TfidfTransformer(smooth_idf=True,
                                                  use_idf=True)
        self.tfidf_transformer.fit(word_count_vector)
        features_vec = self.tfidf_transformer.transform(word_count_vector)

        self.index = ci.MultiClusterIndex(features_vec, np.arange(len(docs)))
Example No. 16
def build_index(data_path):
    qa_path = 'qa.json'

    with open(qa_path) as f:
        data = json.load(f)

    question = []
    answer = []
    for d in data:
        question.append(' '.join(d[0]) + ' ' + ' '.join(d[1]))  # question text + answer text
        answer.append(' '.join(d[1]))

    tv = TfidfVectorizer()
    tv.fit(question)

    features_vec = tv.transform(question)
    cp = ci.MultiClusterIndex(features_vec, answer)

    return tv, cp
Example No. 17
    def build_advanced_index(self) -> None:
        """Build the index using pysparnn `cluster_index` and stores it in `multi_cluster_index` """

        import pysparnn.cluster_index as ci
        import scipy

        if not self.index:
            raise ValueError(
                'Index is empty, please add data into the indexer using `add` method.'
            )
        keys = []
        indexed_vectors = []
        for key, vector in self.index.items():
            keys.append(key)
            indexed_vectors.append(vector)

        self.multi_cluster_index = ci.MultiClusterIndex(
            features=scipy.sparse.vstack(indexed_vectors),
            records_data=keys,
            distance_type=self.metric,
            num_indexes=self.num_indexes,
        )
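A hedged sketch of a companion query method for the index built above; only multi_cluster_index comes from the snippet, while the method name, the hasattr guard and the query batching are assumptions.

    def query_advanced_index(self, query_vectors, top_k: int = 10):
        """Search the pysparnn index; `query_vectors` holds one row per query,
        in the same vector space as the indexed vectors."""
        import scipy.sparse

        if not hasattr(self, 'multi_cluster_index'):
            raise ValueError(
                'Index not built, please call `build_advanced_index` first.')
        return self.multi_cluster_index.search(
            scipy.sparse.csr_matrix(query_vectors),
            k=top_k,
            k_clusters=2,
            return_distance=True,
        )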
Example No. 18
 def test_search(self):
     """语义搜索.
     """
     print('{} test_search {}'.format('-' * 15, '-' * 15))
     texts = [
         'hello world', 'oh hello there', 'Play it', 'Play it again Sam'
     ]
     # Vectorize the texts
     tv = TfidfVectorizer()
     tv.fit(texts)
     features_vec = tv.transform(texts)
     # build the search index!
     cp = ci.MultiClusterIndex(features_vec, texts)
     search_texts = ['Play it', 'oh there', 'Play it again Frank']
     for text in search_texts:
         search_features_vec = tv.transform([text])
         search_result = cp.search(search_features_vec,
                                   k=1,
                                   k_clusters=2,
                                   return_distance=True)
         print('text:{}'.format(text))
         print('search_result:{}'.format(search_result))
     """
Example No. 19
 def build_index(self, vectors, data):
     """构建索引,并存储"""
     search_index = ci.MultiClusterIndex(vectors, data, num_indexes=2)
     pickle.dump(search_index, open(self.search_index_path, "wb"))
     return search_index
Example No. 20
import pysparnn.cluster_index as ci
from sentiment_infer import normalize_text
import joblib

data_transform, data = joblib.load('./pre-trained/data.db')
data = [str(x[0]) + ':' + x[1] for x in data]
cp = ci.MultiClusterIndex(data_transform, data)


def tfidf_search(text, vectorizer, k=50):
    text = normalize_text(text)
    search_features_vec = vectorizer.transform([text])
    search_results = cp.search(search_features_vec,
                               k_clusters=2,
                               k=k,
                               return_distance=False)

    return search_results[0]
Example No. 21
# Concatenate each QA pair and segment it with jieba
corpus = []
for id, item in enumerate(qa):
    tmp = item['q'] + item['a']
    tmp = jieba.cut(tmp)
    tmp = ' '.join(tmp)
    corpus.append(tmp)

# Generate bag of word
# TfidfVectorizer is a combination of CountVectorizer and TfidfTransformer
# Here we use TfidfVectorizer
tv = TfidfVectorizer()

# deal with corpus
tv.fit(corpus)

# get all words (the vocabulary)
words = tv.get_feature_names()

# get features: the TF-IDF vector of each QA pair
tfidf = tv.transform(corpus)

# build the search index
cp = ci.MultiClusterIndex(tfidf, range(len(qa)))

# save
pickle.dump(tv, open(tv_path, 'wb'))
pickle.dump(cp, open(cp_path, 'wb'))
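A minimal query sketch for the pickled artifacts above; tv_path, cp_path and the qa list come from the snippet, while the helper name and the sample flow are illustrative.

def answer(question, k=3):
    # Reload the fitted vectorizer and the pysparnn index.
    tv = pickle.load(open(tv_path, 'rb'))
    cp = pickle.load(open(cp_path, 'rb'))
    # Segment and vectorize the question exactly as the corpus was built.
    query_vec = tv.transform([' '.join(jieba.cut(question))])
    # The index stores positions into `qa`, so map the hits back to answers.
    hits = cp.search(query_vec, k=k, k_clusters=2, return_distance=False)[0]
    return [qa[int(i)]['a'] for i in hits]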
Example No. 22
def find_edges(input, test, K, cluster_ids, query_ids):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        
        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)

    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((query_ids[index1], cluster_ids[index2]))
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list
Example No. 23
def nearest_neighbours(data, query, k=5):
    n = range(data.shape[0])
    cp = ci.MultiClusterIndex(data, n)

    return cp.search(query, k=k, k_clusters=2, return_distance=True)
Example No. 24
import pysparnn.cluster_index as ci

import numpy as np
from scipy.sparse import csr_matrix

features = np.random.binomial(1, 0.01, size=(1000, 20000))
features = csr_matrix(features)

# build the search index!
data_to_return = range(1000)
cp = ci.MultiClusterIndex(features, data_to_return)

t = cp.search(features[:5], k=1, return_distance=False)
print(t)
Example No. 25
def bulit_q_vector():

    lines_cuted = [value['cuted'] for value in QA_dict.values()]
    # instantiate the TF-IDF vectorizer

    tfidf = TfidfVectorizer()
    tfidf.fit(lines_cuted)

    features_vec = tfidf.transform(lines_cuted)

    return tfidf, features_vec, lines_cuted


tfidf, features_vec, lines_cuted = bulit_q_vector()
print('ljflsdfsdf')
print(tfidf.transform(['byebye']))

cp = ci.MultiClusterIndex(features_vec, lines_cuted)
ret = ['python 什么']

# vectorize the user's input sentence
search_vec = tfidf.transform(ret)
print(search_vec)
# search for results: return the top 8 hits, then filter them by `main_entiry`
cp_search_list = cp.search(search_vec,
                           k=8,
                           k_clusters=10,
                           return_distance=True)
print(cp_search_list)

# MRR (mean reciprocal rank) calculation
def mean_reciprocal_rank(rs):
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


eval_results = {'recalls': {}, 'mrr': {}, 'ndcg': {}}

doc_index = np.array(range(len(matrix_df)))

for k in [2, 5, 10, 15, 20, 50]:
    # build the index with k parallel sub-indexes and retrieve the 5 nearest neighbours of every row
    print(k)
    snn = ci.MultiClusterIndex(um_matrix, doc_index, num_indexes=k)
    results = snn.search(um_matrix, k=5, return_distance=True, k_clusters=1)

    results_dic = []
    for i in results:
        results_dic.append(dict((int(y), x) for x, y in i))

    # for each user get average distance of the movies that user rated to retrieve top k movies to recommend
    avg_dict = {}
    for index, user in enumerate(user_hists):
        user_dict = {}
        for movie in user:
            for m in results_dic[movie]:
                if m not in user_dict:
                    user_dict[m] = results_dic[movie][m]
                else:
Example No. 27
def try_pysparnn(train_v, test_v, train_l, k_clstrs):
    cp = ci.MultiClusterIndex(train_v, train_l)
    return cp.search(test_v, k=1, k_clusters=k_clstrs, return_distance=False)
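A small driver sketch for the helper above, using TF-IDF features for a 1-nearest-neighbour label lookup; the toy documents and labels are illustrative.

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ['cheap flights to paris', 'best pizza recipe', 'flight deals to london']
train_labels = ['travel', 'food', 'travel']
test_docs = ['pizza dough recipe', 'paris flight offers']

tv = TfidfVectorizer()
train_v = tv.fit_transform(train_docs)
test_v = tv.transform(test_docs)

# Each test document gets the label of its single nearest training document.
print(try_pysparnn(train_v, test_v, train_labels, k_clstrs=2))  # e.g. [['food'], ['travel']]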
Example No. 28
def serve(args):
    # serve_demo: Load saved embeddings, serve question model. question in, results out.
    # serve_question: only serve question model. question in, vector out.
    # serve_context: only serve context model. context in, phrase-vector pairs out.
    # serve: serve all three.
    device = torch.device('cuda' if args.cuda else 'cpu')
    pprint(args.__dict__)

    interface = FileInterface(**args.__dict__)
    # use cache for metadata
    if args.cache:
        out = interface.cache(preprocess, args)
        processor = out['processor']
        processed_metadata = out['processed_metadata']
    else:
        processor = Processor(**args.__dict__)
        metadata = interface.load_metadata()
        processed_metadata = processor.process_metadata(metadata)

    model = Model(**args.__dict__).to(device)
    model.init(processed_metadata)
    interface.bind(processor, model)

    interface.load(args.iteration, session=args.load_dir)

    with torch.no_grad():
        model.eval()

        if args.mode == 'serve_demo':
            phrases = []
            paras = []
            results = []
            embs = []
            idxs = []
            iterator = interface.context_load(metadata=True, emb_type=args.emb_type)
            for _, (cur_phrases, each_emb, metadata) in zip(range(args.num_train_mats), iterator):
                embs.append(each_emb)
                phrases.extend(cur_phrases)
                for span in metadata['answer_spans']:
                    results.append([len(paras), span[0], span[1]])
                    idxs.append(len(idxs))
                paras.append(metadata['context'])
            if args.emb_type == 'dense':
                import faiss
                emb = np.concatenate(embs, 0)

                d = 4 * args.hidden_size * args.num_heads
                if args.metric == 'ip':
                    quantizer = faiss.IndexFlatIP(d)  # Exact Search
                elif args.metric == 'l2':
                    quantizer = faiss.IndexFlatL2(d)
                else:
                    raise ValueError()

                if args.nlist != args.nprobe:
                    # Approximate Search. nlist > nprobe makes it faster and less accurate
                    if args.bpv is None:
                        if args.metric == 'ip':
                            search_index = faiss.IndexIVFFlat(quantizer, d, args.nlist, faiss.METRIC_INNER_PRODUCT)
                        elif args.metric == 'l2':
                            search_index = faiss.IndexIVFFlat(quantizer, d, args.nlist)
                        else:
                            raise ValueError()
                    else:
                        assert args.metric == 'l2'  # only l2 is supported for product quantization
                        search_index = faiss.IndexIVFPQ(quantizer, d, args.nlist, args.bpv, 8)
                    search_index.train(emb)
                else:
                    search_index = quantizer

                search_index.add(emb)
                for cur_phrases, each_emb, metadata in iterator:
                    phrases.extend(cur_phrases)
                    for span in metadata['answer_spans']:
                        results.append([len(paras), span[0], span[1]])
                    paras.append(metadata['context'])
                    search_index.add(each_emb)

                if args.nlist != args.nprobe:
                    search_index.nprobe = args.nprobe

                def search(emb, k):
                    D, I = search_index.search(emb, k)
                    return D[0], I[0]

            elif args.emb_type == 'sparse':
                assert args.metric == 'l2'  # currently only l2 is supported (couldn't find a good ip library)
                import pysparnn.cluster_index as ci

                cp = ci.MultiClusterIndex(embs, idxs)

                for cur_phrases, each_emb, metadata in iterator:
                    phrases.extend(cur_phrases)
                    for span in metadata['answer_spans']:
                        results.append([len(paras), span[0], span[1]])
                    paras.append(metadata['context'])
                    for each_vec in each_emb:
                        cp.insert(each_vec, len(idxs))
                        idxs.append(len(idxs))

                def search(emb, k):
                    return zip(*[each[0] for each in cp.search(emb, k=k)])

            else:
                raise ValueError()

            def retrieve(question, k):
                example = {'question': question, 'id': 'real', 'idx': 0}
                dataset = (processor.preprocess(example), )
                loader = DataLoader(dataset, batch_size=1, collate_fn=processor.collate)
                batch = next(iter(loader))
                question_output = model.get_question(**batch)
                question_results = processor.postprocess_question_batch(dataset, batch, question_output)
                id_, emb = question_results[0]
                D, I = search(emb, k)
                out = [(paras[results[i][0]], results[i][1], results[i][2], '%.4r' % d.item(),)
                       for d, i in zip(D, I)]
                return out

            if args.mem_info:
                import psutil
                import os
                pid = os.getpid()
                py = psutil.Process(pid)
                info = py.memory_info()[0] / 2. ** 30
                print('Memory Use: %.2f GB' % info)

            # Demo server. Requires flask and tornado
            from flask import Flask, request, jsonify
            from flask_cors import CORS

            from tornado.wsgi import WSGIContainer
            from tornado.httpserver import HTTPServer
            from tornado.ioloop import IOLoop

            app = Flask(__name__, static_url_path='/static')

            app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False
            CORS(app)

            @app.route('/')
            def index():
                return app.send_static_file('index.html')

            @app.route('/files/<path:path>')
            def static_files(path):
                return app.send_static_file('files/' + path)

            @app.route('/api', methods=['GET'])
            def api():
                query = request.args['query']
                out = retrieve(query, 5)
                return jsonify(out)

            print('Starting server at %d' % args.port)
            http_server = HTTPServer(WSGIContainer(app))
            http_server.listen(args.port)
            IOLoop.instance().start()
Example No. 29
import pandas as pd
import sqlite3
from vectorizerClass import StemmedTfidfVectorizer
from sklearn.externals import joblib  # on newer scikit-learn, import joblib directly
from nltk.corpus import stopwords
from pysparnn import cluster_index as ci

with sqlite3.connect("vk.sql") as con:
    data = pd.read_sql("""SELECT "index","question" FROM 'vk.sql'""", con)

questions_index = data["index"]
questions_data = data["question"]
stop_words = stopwords.words("russian")
vector = StemmedTfidfVectorizer(min_df=1,
                                stop_words=stop_words,
                                decode_error="ignore")
vector_data = vector.fit_transform(questions_data)
knn_index = ci.MultiClusterIndex(vector_data, questions_index)

joblib.dump(vector, 'vector.pkl')
joblib.dump(knn_index, 'index_knn.pkl')
Example No. 30
def find_edges(input, test, K):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        
        space_names = ['l2_sparse', 'cosinesimil_sparse'] # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_name = space_names[0]
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        
        '''
        def calc_zero_rows(i):
            if input[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows, range(input.shape[0])))
        print(f"# zero rows in input = {zero_row_num}", end=" ")
        '''
        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        '''
        def calc_zero_rows2(i):
            if test[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows2, range(test.shape[0])))
        print(f"# zero rows in test = {zero_row_num}")
        '''

        indices_ = tree.knnQueryBatch(test, k=K+1, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        assert len(per) == K+1, f"index1={index1} len(per)={len(per)} != K+1={K + 1}"
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list