Example #1
import faiss
import torch
from transformers import AutoModel, AutoTokenizer


def create_index(words, index_path, vocab_path, cache_dir, batch_size=64):

    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
    model = AutoModel.from_pretrained("ai4bharat/indic-bert",
                                      cache_dir=cache_dir,
                                      return_dict=True)
    model.to('cuda')

    index = faiss.IndexFlatIP(model.config.hidden_size)
    i = 0
    while i < len(words):
        batch = words[i:i + batch_size]
        tokens = tokenizer(batch,
                           truncation=True,
                           padding=True,
                           max_length=10,
                           return_tensors="pt")
        tokens.to('cuda')
        outputs = model(**tokens)
        embeddings = torch.mean(outputs.last_hidden_state,
                                1).detach().cpu().numpy()
        faiss.normalize_L2(embeddings)
        index.add(embeddings)
        i += batch_size
        print("{} words done".format(index.ntotal))

    faiss.write_index(index, index_path)
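A minimal usage sketch (not part of the original snippet) for querying the index written above. It assumes the same indic-bert model and mean pooling as create_index, reuses the imports above, and the helper name search_word is hypothetical.

def search_word(query_word, words, index_path, cache_dir, top_k=5):
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
    model = AutoModel.from_pretrained("ai4bharat/indic-bert",
                                      cache_dir=cache_dir,
                                      return_dict=True).to('cuda')
    index = faiss.read_index(index_path)

    tokens = tokenizer([query_word], truncation=True, padding=True,
                       max_length=10, return_tensors="pt").to('cuda')
    with torch.no_grad():
        outputs = model(**tokens)
    query = torch.mean(outputs.last_hidden_state, 1).cpu().numpy()
    faiss.normalize_L2(query)

    scores, ids = index.search(query, top_k)
    # map FAISS row ids back to the original word list
    return [(words[i], float(s)) for i, s in zip(ids[0], scores[0])]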
Example #2
    def test_sparse_routines(self):
        """ the sparse assignment routine """
        ds = datasets.SyntheticDataset(1000, 2000, 0, 200)
        xt = ds.get_train().copy()
        faiss.normalize_L2(xt)

        mask = np.abs(xt) > 0.045
        # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros
        xt[np.logical_not(mask)] = 0

        centroids = ds.get_queries()
        assert len(centroids) == 200

        xsparse = scipy.sparse.csr_matrix(xt)

        Dref, Iref = faiss.knn(xsparse.todense(), centroids, 1)
        D, I = clustering.sparse_assign_to_dense(xsparse, centroids)

        np.testing.assert_array_equal(Iref.ravel(), I)
        np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4)

        D, I = clustering.sparse_assign_to_dense_blocks(xsparse,
                                                        centroids,
                                                        qbs=123,
                                                        bbs=33,
                                                        nt=4)

        np.testing.assert_array_equal(Iref.ravel(), I)
        np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4)
Example #3
def vector_search(query, model, index, num_results=10, threshold=0.75):
    """Transforms the query into a vector using a pretrained, sentence-level
    DistilBERT model and finds similar vectors using FAISS.
    Args:
        query (str): User query that should be more than a sentence long.
        model (sentence_transformers.SentenceTransformer.SentenceTransformer)
        index (faiss.Index): Deserialized FAISS index.
        num_results (int): Number of results to return.
        threshold (float): Minimum similarity used to filter D and I.
    Returns:
        D (:obj:`numpy.array` of `float`): Similarity between results and query.
        I (:obj:`numpy.array` of `int`): Paper ID of the results.

    """
    # Encode the query as a 2-D float32 array so FAISS can normalize and search it.
    vector = np.array([model.encode(query)]).astype("float32")
    normalize_L2(vector)
    D, I = index.search(vector, k=num_results)
    ids = I.flatten()
    distance = D.flatten()
    mask = distance > threshold
    if not mask.any():
        # No result above the threshold: fall back to the top 3 with a relaxed threshold.
        ids = ids[:3]
        distance = distance[:3]
        mask = distance > threshold / 2
    return distance[mask].tolist(), ids[mask].tolist()
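A hedged usage sketch for vector_search; the model name and index path below are illustrative assumptions, not part of the original.

from sentence_transformers import SentenceTransformer
import faiss

model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")  # assumed model
index = faiss.read_index("models/faiss_index.idx")                   # assumed path
distances, ids = vector_search("transfer learning for NLP", model, index, num_results=10)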
Example #4
def IndexIVFFlat():
    d = 2048                         # dimension
    nb = 1000050                     # database size
    np.random.seed(1234)             # make reproducible
    training_vectors = np.random.random((nb, d)).astype('float32') * 10

    faiss.normalize_L2(training_vectors)

    nlist = 1000  # number of coarse clusters (inverted lists)
    k = 50  # number of neighbours to retrieve
    quantizer = faiss.IndexFlatIP(d)  # the coarse quantizer; IndexIVFFlat builds on another index

    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
    # by default it performs inner-product search
    assert not index.is_trained
    t_tr = time.time()
    index.train(training_vectors)
    print('tr time:', time.time()-t_tr)
    assert index.is_trained
    index.nprobe = 300  # default nprobe is 1, try a few more
    t_s = time.time()
    index.add(training_vectors)  # add may be a bit slower as well
    print('add time:', time.time()-t_s)
    t1=time.time()
    D, I = index.search(training_vectors[:100], k)  # actual search
    t2 = time.time()
    print('faiss kmeans result times {}'.format(t2-t1))
    # print(D[:5])  # neighbors of the 5 first queries
    print(I[:5])
    topk = 5

    np.save('rank{}'.format(topk) + '.npy', I)
    np.save('similarity{}'.format(topk) + '.npy', D)
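A hedged follow-up sketch (not in the original) that compares the IVF results above against an exact flat search for a few nprobe values; the helper name, query count and recall computation are assumptions.

import numpy as np
import faiss

def sweep_nprobe(index, training_vectors, d, k, n_queries=100):
    # exact baseline with a flat inner-product index
    flat = faiss.IndexFlatIP(d)
    flat.add(training_vectors)
    _, I_exact = flat.search(training_vectors[:n_queries], k)

    for nprobe in (1, 10, 100, 300):
        index.nprobe = nprobe
        _, I_ivf = index.search(training_vectors[:n_queries], k)
        # recall@k: fraction of exact neighbours recovered by the IVF search
        recall = np.mean([len(set(a) & set(b)) / k
                          for a, b in zip(I_ivf, I_exact)])
        print('nprobe={}  recall@{}={:.3f}'.format(nprobe, k, recall))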
Example #5
    def __init__(
        self,
        embedding_space: EmbeddingSpaceType,
        embedding_space_dims: int,
        similarity_algorithm: SimilarityAlgorithm,
    ):
        super(GloVeWordEmbeddingIndex, self).__init__(
            faiss_index_name='faiss_index',
            index_np_name='index_np',
            embedding_space_dims_name='embedding_space_dims',
            similarity_algorithm_name='similarity_algorithm',
        )

        self.embedding_space = embedding_space
        self.embedding_space_dims = embedding_space_dims
        self.similarity_algorithm = similarity_algorithm
        self.index_np, self.word_to_index, self.index_to_word = (
            GloVeWordEmbeddingIndex.build_index(
                embedding_space,
                embedding_space_dims,
            ))

        # for FAISS we need float32 instead of float64
        self.index_np = self.index_np.astype('float32')

        self.faiss_index = faiss.IndexFlatIP(embedding_space_dims)
        if similarity_algorithm == SimilarityAlgorithm.CosineSimilarity:
            # normalize with L2 as a proxy for cosine search
            faiss.normalize_L2(self.index_np)
        self.faiss_index.add(self.index_np)
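A hedged sketch of a lookup method that could sit next to this __init__; most_similar is a hypothetical name, and it reuses the word_to_index, index_to_word, index_np and faiss_index attributes set above.

    def most_similar(self, word, top_k=10):
        # Hypothetical helper, not part of the original class.
        if word not in self.word_to_index:
            return []
        query = self.index_np[self.word_to_index[word]].reshape(1, -1).copy()
        if self.similarity_algorithm == SimilarityAlgorithm.CosineSimilarity:
            # index_np was normalized before add, so keep queries consistent
            faiss.normalize_L2(query)
        scores, ids = self.faiss_index.search(query, top_k)
        return [(self.index_to_word[int(i)], float(s))
                for i, s in zip(ids[0], scores[0])]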
Example #6
    def __init__(self, file, d, norm=True, file_str=None):
        self.vec = []
        self.txt = []

        if file.endswith('.gz'): 
            f = gzip.open(file, 'rt')
        else:
            f = io.open(file, 'r', encoding='utf-8', newline='\n', errors='ignore')

        for l in f:
            l = l.rstrip().split(' ')
            if len(l) != d:
                logging.error('found {} floats instead of {}'.format(len(l),d))
                sys.exit()
            self.vec.append(l)

        self.vec = np.array(self.vec).astype('float32')
        if norm:
            faiss.normalize_L2(self.vec)

        if file_str is None:
            return

        if file_str.endswith('.gz'): 
            f = gzip.open(file_str, 'rt')
        else:
            f = io.open(file_str, 'r', encoding='utf-8', newline='\n', errors='ignore')

        for l in f:
            self.txt.append(l.rstrip())

        if len(self.txt) != len(self.vec):
            logging.error('diff num of entries {} <> {} in files {} and {}'.format(len(self.vec),len(self.txt),file, file_str))
            sys.exit()
Example #7
def build_index(hidden_states):

    # Normalizing first makes the inner product equivalent to cosine similarity.
    d = hidden_states.shape[1]
    index = faiss.index_factory(d, "Flat", faiss.METRIC_INNER_PRODUCT)
    faiss.normalize_L2(hidden_states)
    index.add(hidden_states)
    return index
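A short usage sketch for build_index (shapes are illustrative assumptions); queries must be normalized the same way as the stored hidden states for the inner product to behave as cosine similarity.

import numpy as np
import faiss

hidden_states = np.random.rand(1000, 768).astype('float32')  # assumed shape
index = build_index(hidden_states)

query = np.random.rand(1, 768).astype('float32')
faiss.normalize_L2(query)              # normalize queries like the corpus
scores, ids = index.search(query, 5)   # cosine similarities and row ids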
Example #8
    def _createSenSetVecsNumpy(self):
        '''
        Load the sentence vectors for the whole corpus into memory.
        :return:
        '''
        print("\nStart generating sentence vectors for all sentences....")
        start_time = datetime.datetime.now()  # taken at the start of the run
        self.sens = []  # stores all sentences, tokenized and space-separated, e.g. "我 爱 你"
        senVecs = []  # stores all sentence vectors
        with open(self.sentencesfile, mode="r", encoding="utf-8") as fr:
            for line in fr:
                line = line.strip()
                if line != "":
                    tokens = line.split(" ")
                    tokens_ids = self._convertWords2ids(tokens)
                    self.sens.append(line)
                    senVecs.append(self._getSenVec(tokens_ids))

        # contiguous float32 numpy array, as required by faiss.normalize_L2
        self.senVecs = np.ascontiguousarray(senVecs, dtype="float32")

        print("\nRunning faiss.normalize_L2")
        faiss.normalize_L2(self.senVecs)
        print("\nFinished faiss.normalize_L2")

        print("\nFinished generating sentence vectors....")

        end_time = datetime.datetime.now()  # taken at the end of the run
        interval = (end_time - start_time).seconds  # elapsed time in seconds
        print("Sentence vector generation done, elapsed seconds:", interval)
Example #9
    def __init__(self, file, d=0, norm=True, max_vec=1000000):
        logging.info('Reading {}'.format(file))

        self.file = file
        self.d = d     ### will contain length of vectors
        self.vec = []  ### list with all vectors found in file
        self.max_vec = max_vec

        if self.file.endswith('.gz'): 
            f = gzip.open(self.file, 'rt')
        else:
            f = io.open(self.file, 'r', encoding='utf-8', newline='\n', errors='ignore')

        for l in f:
            l = l.rstrip().split(' ')
            if self.d == 0:
                self.d = len(l)
            if len(l) != self.d:
                logging.error('found a vector with {} cells instead of {} in line {} of file {}'.format(len(l),self.d,len(self.vec)+1,self.file))
                sys.exit()
            self.vec.append(l)

        if self.max_vec == 0:
            self.vecs = [self.vec]
        else:
            self.vecs = [self.vec[i: i+self.max_vec] for i in range(0, len(self.vec), self.max_vec)]
        logging.info('\t\tRead {} vectors into {} chunks ({} cells)'.format(len(self.vec),len(self.vecs),self.d))

        for i in range(len(self.vecs)):
            self.vecs[i] = np.array(self.vecs[i]).astype('float32')
            logging.info('\t\tBuilt float32 array for chunk {} with {} vectors'.format(i,len(self.vecs[i])))
            if norm:
                faiss.normalize_L2(self.vecs[i])
Example #10
    def get_intent(self, query, prefix, tasks, k_nearest=1):
        index = faiss.read_index(f"data/{prefix}_intent_index.idx")
        query_vector = np.array([self._get_embedding(query, prefix)
                                 ]).astype(np.float32)
        faiss.normalize_L2(query_vector)
        similarities, similarities_ids = index.search(query_vector, k_nearest)
        return similarities_ids[0][0], tasks['task'][similarities_ids[0][0]]
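A hedged companion sketch (not in the original) showing how the index file read by get_intent could be produced; the method name build_intent_index and the examples argument are assumptions.

    def build_intent_index(self, examples, prefix):
        # Hypothetical builder for data/{prefix}_intent_index.idx used by get_intent above.
        vectors = np.array([self._get_embedding(text, prefix)
                            for text in examples]).astype(np.float32)
        faiss.normalize_L2(vectors)
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        faiss.write_index(index, f"data/{prefix}_intent_index.idx")
        return index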
Example #11
    def __init__(self,
                 iterator=None,
                 filename=None,
                 embeddings=None,
                 shape=None,
                 device="cpu"):

        self.iterator = iterator

        if os.path.exists(filename):

            print(f'Index file {filename}')
            self.index = faiss.read_index(filename)

        else:

            self.index = faiss.index_factory(shape, "Flat",
                                             faiss.METRIC_INNER_PRODUCT)
            faiss.normalize_L2(embeddings)
            self.index.add(embeddings)
            faiss.write_index(self.index, filename)
            print(f'Index written at {filename}')

        if device == "cuda":
            print('Now running on CUDA')
            self.index = faiss.index_cpu_to_all_gpus(self.index)

        print(f'Index trained - {self.index.is_trained}')
Example #12
    def test_normalized(self):
        rs = np.random.RandomState(123)
        m = rs.rand(40, 20).astype('float32')
        faiss.normalize_L2(m)
        comments = faiss.MatrixStats(m).comments
        print(comments)
        assert 'vectors are normalized' in comments
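A standalone variant of the same check (a sketch, assuming the same faiss.MatrixStats API): the 'vectors are normalized' comment should only appear after normalize_L2.

import numpy as np
import faiss

m = np.random.RandomState(123).rand(40, 20).astype('float32')
print('vectors are normalized' in faiss.MatrixStats(m).comments)  # expected: False
faiss.normalize_L2(m)
print('vectors are normalized' in faiss.MatrixStats(m).comments)  # expected: True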
Example #13
def _knn_faiss(data_numpy, k, metric='euclidean', use_gpu=False):
    import faiss

    data_numpy = data_numpy.astype(np.float32)
    data_numpy = data_numpy.copy(order='C')
    data_numpy = np.ascontiguousarray(data_numpy, dtype=np.float32)

    if use_gpu:
        print('Using GPU for Faiss...')
        res = faiss.StandardGpuResources()
    else:
        print('Using CPU for Faiss...')

    if metric == 'euclidean':
        index = faiss.IndexFlatL2(data_numpy.shape[1])
    elif metric == 'manhattan':
        index = faiss.IndexFlat(data_numpy.shape[1], faiss.METRIC_L1)
    elif metric == 'cosine':
        index = faiss.IndexFlat(data_numpy.shape[1], faiss.METRIC_INNER_PRODUCT)
        faiss.normalize_L2(data_numpy)
    else:
        raise ValueError("unsupported metric: {}".format(metric))

    if use_gpu:
        index = faiss.index_cpu_to_gpu(res, 0, index)

    data_numpy = np.ascontiguousarray(data_numpy, dtype=np.float32)
    index.train(data_numpy)
    assert index.is_trained

    index.add(data_numpy)
    # nprobe is an IVF-index parameter; it has no effect on the flat indexes built here
    nprobe = data_numpy.shape[0]
    index.nprobe = nprobe
    distances, neighbors = index.search(data_numpy, k)

    return distances, neighbors
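A quick hedged usage example for _knn_faiss; the data shape and k below are illustrative.

import numpy as np

data = np.random.rand(500, 64)
distances, neighbors = _knn_faiss(data, k=10, metric='cosine', use_gpu=False)
print(neighbors.shape)  # (500, 10); column 0 is usually the point itself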
Example #14
    def evaluate(self):
        faiss.normalize_L2(self.embedding_vectors)
        self.index.add(self.embedding_vectors)
        self.D, self.I = self.index.search(self.embedding_vectors, self.k)
        self.I = self.I.astype(np.int32)
        np.savetxt(join(self.target_dir, "D.out"), self.D, delimiter=",")
        np.savetxt(join(self.target_dir, "I.out"),
                   self.I.astype(np.int),
                   delimiter=",",
                   fmt="%i")
        with open(join(self.target_dir, "filenames.txt"), "w") as f:
            f.writelines([x + "\n" for x in self.filepaths])
        with open(join(self.target_dir, "vectors.npz"), "wb") as f:
            np.save(f, self.embedding_vectors)
        #TODO: Calculate top-5 accuracy etc. from index matrix and class dictionaries
        self.A = self.vec_index_to_ad_id_func(idx=self.I)

        # Subtract the query id from all other columns.
        # If there is a match, the jth column will have a zero in it.
        self.A_proper = self.A.copy()
        self.A[:, 1:] -= self.A[:, 0][:, None]
        k_accuracies = np.zeros(self.k)
        for k in range(1, self.k + 1):
            k_accuracies[k - 1] = (np.mean(
                np.count_nonzero(self.A[:, 1:k] == 0, axis=1) >= 1))

        with open(join(self.target_dir, "k_accuracies.npz"), "wb") as f:
            np.save(f, k_accuracies)

        fig, ax = plt.subplots(1, 1)

        ax.plot(np.arange(1, self.k + 1), k_accuracies)
        plt.show()
Example #15
def vector_search(
    query_vector: Union[str, np.ndarray],
    data: List[str],
    encoded_data: Optional[np.ndarray] = None,
    embed: Optional[Callable] = None,  # embed_data
    index_: str = "",  # default to indexflatl2, or indexflatip
    sanity_check: Union[bool, int] = False,
    topk: int = 5,
) -> Optional[Tuple[np.ndarray, np.ndarray]]:
    """Search via faiss."""

    if embed is None:
        embed = fetch_embed

    if encoded_data is None:
        encoded_data = embed(data)

    if isinstance(query_vector, str):
        query_vector = fetch_embed(query_vector)

    if isinstance(query_vector, np.ndarray):
        try:
            assert query_vector.shape[1] == encoded_data.shape[1]
        except AssertionError as exc:
            raise SystemExit(
                "dimensions of the query vector and the vectors in "
                "the database do not match"
            ) from exc
    else:
        logger.info(
            "You probably need to embed (encode) the list of str first."
            "\n\t.e.g, embed(nameof(query_vector)). Exiting"
        )
        try:
            _ = fetch_embed(query_vector)
            # _ = np.array().astype("float16")
            assert _.shape[1] == encoded_data.shape[1]
            query_vector = _
        except Exception as exc:
            logger.error(exc)
            raise SystemExit(1) from exc

    if index_.lower() in [
        "indexflat_ip", "flat_ip", "flatip", "flat-ip", "indexflat-ip"
    ]:
        index = faiss_flat_ip(encoded_data)
        # faiss_flat_ip normalizes the stored vectors, so normalize the query as well
        faiss.normalize_L2(query_vector)
    else:  # index_.lower() in ["indexflatl2", "flat_l2", "flatl2"]
        index = faiss_flat_l2(encoded_data)

    _ = index.search(query_vector, topk)
    # _ = index.search(ed, topk)

    if sanity_check:
        # query_vector = encoded_data[:10]
        top_k = index.search(encoded_data[:10], topk)
        print([np.round(top_k[0], 2), top_k[1]])
        # return None

    return _
Example #16
    def _execute_one_chunk(cls, ctx, op):
        (inp, ), device_id, xp = as_same_device(
            [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True)

        with device(device_id):
            # create index
            index = faiss.index_factory(inp.shape[1], op.faiss_index,
                                        op.faiss_metric_type)
            # GPU
            if device_id >= 0:  # pragma: no cover
                res = faiss.StandardGpuResources()
                index = faiss.index_cpu_to_gpu(res, device_id, index)

            # train index
            if not index.is_trained:
                assert op.n_sample is not None
                sample_indices = xp.random.choice(inp.shape[0],
                                                  size=op.n_sample,
                                                  replace=False)
                sampled = inp[sample_indices]
                index.train(sampled)

            if op.metric == 'cosine':
                # faiss does not support cosine distance directly,
                # so the data needs to be normalized before adding it to the index,
                # refer to:
                # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
                faiss.normalize_L2(inp)
            # add vectors to index
            index.add(inp)

            ctx[op.outputs[0].key] = _store_index(ctx, op, index, device_id)
Example #17
    def predict(self, text: str) -> str:
        response = ""
        tag = ""

        # update
        ques_embedding_dict, response_cluster_dict = self.data_controller.update(
        )
        if self.ques_embedding_dict != ques_embedding_dict:
            self.ques_embedding_dict = ques_embedding_dict
            self.querys = self.ques_embedding_dict['sentences']
            self.querys_wo_space = [s.replace(" ", "") for s in self.querys]
            self.faiss_index, self.class_list = self._faiss_indexing()

        if self.response_cluster_dict != response_cluster_dict:
            self.response_cluster_dict = response_cluster_dict

        self.thres_prob = self.data_controller.threshold_dict[
            'scenario_similarity_threshold']
        self.thres_similar = self.data_controller.threshold_dict[
            'character_similarity_threshold']

        # 1) exact matching
        res_class = self._exact_matching(text)
        if res_class:
            response = self._generate_response(res_class)
            tag = "<Scenario>"

        # 2) similarity analysis
        else:
            # a) character similarity
            res_class = self._char_similarity_analysis(text)
            if res_class:
                response = self._generate_response(res_class)
                tag = "<Scenario>"

            # b) semantic similarity
            else:
                query_vec = self.inferencer.infer(text)
                query_vec = np.array(query_vec).astype(np.float32)

                if len(query_vec) == 0:
                    return ""

                normalize_L2(query_vec)

                D, I = self.faiss_index.search(query_vec, self.k)
                topk_class = [self.class_list[i] for i in I[0]]

                pred_counts = Counter(topk_class)
                # majority vote over the top-k neighbours
                res_class = pred_counts.most_common(1)[0][0]
                max_prob = D[0].max()

                if pred_counts[res_class] >= math.ceil(
                        self.k / 2) and max_prob >= self.thres_prob:
                    response = self._generate_response(res_class)
                    tag = "<Scenario-Semantic | Score: {}>".format(
                        str(round(max_prob, 2)))
        if tag:
            response = response + "\n" + tag
        return response
Example #18
def search_top_k(corp_emb, query_emb, embedding_dim, k, config):
    """
    Returns a tuple with ordered lists of lists of cosine distances between and top k matches in corpus_embeddings.
    Each list corresponds to one query.
    Needs GPU
    Available metrics = faiss.METRIC_INNER_PRODUCT, faiss.METRIC_L2, ...
    more here https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances

    return type = (List[List[int = cosine_distance]], List[List[int = index_of_corpus_embedding]]);
    type(cosine_distance) == Float
    type(index_of_corpus_embedding) == Int
    """
    config.logger.info(f"Preparing index and executing the search with embedding dim {embedding_dim}")
    index = faiss.index_factory(embedding_dim, "PCA384,Flat", faiss.METRIC_INNER_PRODUCT)  # Flat = exhaustive search
    faiss.normalize_L2(corp_emb)  # need to normalize query and corpus vectors for cosine distance
    faiss.normalize_L2(query_emb)
    if config.device != 'cpu':
        res = faiss.StandardGpuResources()
        if len(config.devices) > 1:
            dev_index = faiss.index_cpu_to_all_gpus(index)  # use gpu
        else:
            dev_index = faiss.index_cpu_to_gpu(res, 0, index)
    else:
        dev_index = index
    
    dev_index.train(corp_emb)
    dev_index.add(corp_emb)
    return dev_index.search(query_emb, k)  # return distances, indices matrices
Example #19
def LoadDataNLI(fn1,
                fn2,
                fn_lbl,
                dim=1024,
                bsize=32,
                shuffle=False,
                quiet=False):
    x = np.fromfile(fn1, dtype=np.float32, count=-1)
    x.resize(x.shape[0] // dim, dim)
    faiss.normalize_L2(x)

    y = np.fromfile(fn2, dtype=np.float32, count=-1)
    y.resize(y.shape[0] // dim, dim)
    faiss.normalize_L2(y)

    lbl = np.loadtxt(fn_lbl, dtype=np.int32)
    lbl.reshape(lbl.shape[0], 1)
    if not quiet:
        print(' - read {:d}x{:d} elements in {:s}'.format(
            x.shape[0], x.shape[1], fn1))
        print(' - read {:d}x{:d} elements in {:s}'.format(
            y.shape[0], y.shape[1], fn2))
        print(' - read {:d} labels [{:d},{:d}] in {:s}'.format(
            lbl.shape[0], lbl.min(), lbl.max(), fn_lbl))

    # nli = torch.cat((x, y, torch.abs(x - y), x * y), 1)
    if not quiet:
        print(' - combine premises and hyps')
    nli = np.concatenate((x, y, np.absolute(x - y), np.multiply(x, y)), axis=1)

    D = data_utils.TensorDataset(torch.from_numpy(nli), torch.from_numpy(lbl))
    loader = data_utils.DataLoader(D, batch_size=bsize, shuffle=shuffle)
    return loader
Example #20
    def index(self):
        """Creates a faiss index for similarity searches over the node embeddings.
        Simple implementation of a cached property.

        Returns
        -------
        a faiss index with input embeddings added and optionally trained"""

        if self._index is None:
            if not self._masks_set:
                self.set_masks()
            if self.distance_metric=='cosine':
                self._index  = faiss.IndexFlatIP(self.embedding_dim)
                embeddings = np.copy(self.embeddings[self.entity_mask])
                # normalize_L2 operates in place, so copy any views into a new array before using it
                faiss.normalize_L2(embeddings)
            elif self.distance_metric=='l2':
                self._index = faiss.IndexFlatL2(self.embedding_dim)
                embeddings = self.embeddings[self.entity_mask]
            
            if self.train_faiss:
                training_points = min(
                    len(self.node_ids)//FAISS_NODES_TO_CLUSTERS+1,
                    MAXIMUM_FAISS_CLUSTERS)
                self._index = faiss.IndexIVFFlat(self._index, self.embedding_dim, training_points)
                self._index.train(embeddings)

            self._index.add(embeddings)

            if self.faiss_gpu:
                GPU = faiss.StandardGpuResources()
                self._index = faiss.index_cpu_to_gpu(GPU, 0, self._index)


        return self._index
Example #21
def faiss_flat_ip(encoded_data):
    """Faiss flatip."""
    dim = encoded_data.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
    faiss.normalize_L2(encoded_data)
    index.add_with_ids(encoded_data, np.arange(len(encoded_data)))
    return index
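A hedged usage sketch for faiss_flat_ip; encoded_data is illustrative. Because the function wraps IndexFlatIP in IndexIDMap, the ids returned by search are the positions passed to add_with_ids.

import numpy as np
import faiss

encoded_data = np.random.rand(200, 384).astype('float32')  # assumed shape
index = faiss_flat_ip(encoded_data)

query = encoded_data[:1].copy()       # already normalized in place by faiss_flat_ip
scores, ids = index.search(query, 5)  # ids refer to rows of encoded_data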
Example #22
def annSearch(emds, uuid, uvec, test_label, ivec, iid, topk):

    indextree = faiss.IndexFlatIP(emds)

    faiss.normalize_L2(ivec)
    indextree.add(ivec)

    faiss.normalize_L2(uvec)

    #D,I = indextree.search(np.ascontiguousarray(uvec),topk)
    D, I = indextree.search(uvec, topk)
    score = []
    hit = 0
    # uuid: users in the test set; iid: items over the full dataset
    for i, uid in tqdm(enumerate(uuid)):
        try:
            pred = [iid.values[x] for x in I[i]]

            recall_score = len(set(pred[:topk])
                               & set(test_label)) * 1.0 / len(test_label)
            score.append(recall_score)

            if test_label[uid] in pred:
                hit += 1
        except:
            print(i)

    score_mean = np.mean(score)
    hit_rate = hit / len(uuid)

    return score_mean, hit_rate
Example #23
def cosine_similar():
    '''
    Cosine similarity with faiss: normalize the vectors, then use an
    inner-product index (IndexFlatIP) and compare against sklearn.

    :return:
    '''
    d = 64  # dimension
    nb = 105  # database size
    # deliberately unnormalized vectors, to test the effect of normalize_L2
    training_vectors = np.random.random((nb, d)).astype('float32') * 10
    print('just compare with sklearn')
    from sklearn.metrics.pairwise import cosine_similarity
    # reference result computed with sklearn
    ag = cosine_similarity(training_vectors)
    fe = np.sort(ag, axis=1)
    print('normalize_L2')
    faiss.normalize_L2(training_vectors)
    print('IndexFlatIP')
    index = faiss.IndexFlatIP(d)
    index.train(training_vectors)
    print(index)
    print('train')
    print(index.is_trained)
    print('add')
    print(index)
    index.add(training_vectors)
    print('search')
    D, I = index.search(training_vectors[:100], 5)
    print(I[:5])  # indices of the 5 nearest neighbours for the first queries
    print(D[:5])  # corresponding similarity values
Example #24
    def search(self, query, top=5, nprobe=1, ret_vec=0, index=None):
        D, I, V = [], [], []

        # number of inverted lists (cluster centroids) to probe, 1 by default
        self.index.nprobe = nprobe  #self.nprobe

        # if row indices are given, use the stored vectors at those indices 2020/9/10
        if index:
            query = self.xb[index, :]
        else:
            if query.dtype != 'float32':
                query = query.astype('float32')
            #print(query.shape)

            # for a single query, reshape the vector to 2-D
            if len(query.shape) == 1:
                query = query[np.newaxis, :]

            # normalize the query vectors
            if self.normalize:
                faiss.normalize_L2(query)

        # print('q,n:', (query, top) )
        # run the search
        D, I = self.index.search(query, top)
        # optionally also return the matched vectors 2020/9/7
        V = []
        if ret_vec:
            V = self.xb[I, :]

        return D, I, V
Example #25
def faiss_search(embeddings, uids, num_results):
    """Returns and index and query vector from FAISS Model based on given embeddings.

    Create a matrix to store article embeddings
    Assign dimension for the vector space
    Build the index
    IndexFlatIP: taking inner product of the vectors
    with normalized vectors, the inner product (IP, of IndexFlatIP)
    becomes cosine similarity
    Adding vectors to the index
    Prepare query vector
    """

    xb = np.ascontiguousarray(embeddings).astype(np.float32)
    d = xb.shape[1]
    index = faiss.IndexFlatIP(d)
    faiss.normalize_L2(xb)
    index.add(xb)

    query_vec = np.ascontiguousarray(embeddings.loc[uids]).reshape(
        1, -1).astype(np.float32)
    faiss.normalize_L2(query_vec)

    _, matches = index.search(query_vec, num_results)
    similar_embeddings = matches.tolist()[0]

    return [
        uid for uid in embeddings.iloc[similar_embeddings].index
        if uid not in uids
    ]
Example #26
def submit(valid_data, all_data, mapped_data):
    t1 = time.time()
    corpus = np.array(all_data['embedding'].values.tolist()).astype('float32')
    import faiss
    faiss.normalize_L2(corpus)
    index = faiss.IndexFlatIP(corpus.shape[1])
    index.train(corpus)
    index.add(corpus)
    query = np.array(valid_data['embedding'].values.tolist()).astype('float32')
    faiss.normalize_L2(query)
    D, I = index.search(query, len(corpus))
    res = []
    for i, d in enumerate(I):
        index_lst = I[i][:3]
        paper_id_lst = [
            all_data.loc[idx, 'paper_id'][0] for idx in index_lst
        ]
        description_id = valid_data.loc[i, 'description_id'][0]
        res.append({
            "description_id": description_id,
            "paper_id_lst": ",".join(paper_id_lst)
        })
    print("Time {:.02f}s".format(time.time() - t1))
    res = pd.DataFrame(res)
    res.to_csv("./result/submit.csv", index=False, header=False)
Example #27
    def _loadTextAndEmb(textF, encoding, embF, encoderDim, unify, verbose):
        inds, sents = TextLoadUnify(textF, encoding, unify, verbose)
        emb = EmbedLoad(embF, encoderDim, verbose=verbose)
        if unify:
            emb = unique_embeddings(emb, inds)
        faiss.normalize_L2(emb)
        return inds, sents, emb
Example #28
File: __init__.py Project: yk/jina-hub
    def build_advanced_index(self, vecs: 'np.ndarray'):
        """Load all vectors (in numpy ndarray) into Faiss indexers """
        import faiss

        metric = faiss.METRIC_L2
        if self.distance == 'inner_product':
            metric = faiss.METRIC_INNER_PRODUCT
        if self.distance not in {'inner_product', 'l2'}:
            self.logger.warning(
                'Invalid distance metric for Faiss index construction. Defaulting to l2 distance'
            )

        index = self.to_device(
            index=faiss.index_factory(self.num_dim, self.index_key, metric))
        if not self.is_trained and self.train_filepath:
            train_data = self._load_training_data(self.train_filepath)
            if train_data is None:
                self.logger.warning(
                    'loading training data failed. some faiss indexes require previous training.'
                )
            else:
                train_data = train_data.astype(np.float32)
                if self.normalize:
                    faiss.normalize_L2(train_data)
                self.train(index, train_data)

        self.build_partial_index(vecs, index)
        index.nprobe = self.nprobe
        return index
Example #29
def create_faiss_index(sequence_vectors):

    array = np.array(sequence_vectors).astype(np.float32)
    index = faiss.IndexFlatIP(array.shape[1])
    faiss.normalize_L2(array)
    index.add(array)

    return index
Example #30
File: __init__.py Project: yk/jina-hub
    def query(self, vecs: 'np.ndarray', top_k: int, *args,
              **kwargs) -> Tuple['np.ndarray', 'np.ndarray']:
        if self.normalize:
            from faiss import normalize_L2
            normalize_L2(vecs)
        dist, ids = self.query_handler.search(vecs, top_k)
        keys = self.int2ext_id[self.valid_indices][ids]
        return keys, dist
Example #31
    def manual_trans(x):
        x = x.copy()
        faiss.normalize_L2(x)
        x = pca.apply_py(x)
        faiss.normalize_L2(x)
        return x