Exemplo n.º 1
0
    def test_2level(self):
        """Check that 2-level clustering is not much worse than k-means.

        Both centroid sets are scored by the summed distance of every
        training vector to its nearest centroid.
        """
        dataset = datasets.SyntheticDataset(32, 10000, 0, 0)
        train = dataset.get_train()

        # Reference: plain k-means with 100 centroids.
        flat_km = faiss.Kmeans(dataset.d, 100)
        flat_km.train(train)
        flat_dis, _ = faiss.knn(train, flat_km.centroids, 1)
        flat_err = flat_dis.sum()

        # Candidate: two-level clustering called with (10, 10)
        # (presumably 10 x 10 = 100 centroids -- confirm in clustering).
        two_level_centroids, _ = clustering.two_level_clustering(
            train, 10, 10)
        two_level_dis, _ = faiss.knn(train, two_level_centroids, 1)
        two_level_err = two_level_dis.sum()

        # Tolerate at most a 10% higher error than the reference.
        self.assertLess(two_level_err, flat_err * 1.1)
Exemplo n.º 2
0
    def test_RCQ_knn(self):
        """Check the RCQ quantizer's centroid routines against brute force.

        The centroids are reconstructed from the trained index; the
        quantizer's norm computation and its inner-product / L2 centroid
        searches are then compared with NumPy and ``faiss.knn`` results.
        """
        dataset = datasets.SyntheticDataset(32, 1000, 0, 123)
        train = dataset.get_train()
        queries = dataset.get_queries()

        # RQ 3+4+5 = 12 bits = 4096 centroids
        rcq = faiss.index_factory(dataset.d, "RCQ1x3_1x4_1x5")
        rcq.train(train)

        quantizer = rcq.rq
        centroids = rcq.reconstruct_n(0, rcq.ntotal)
        ptr = faiss.swig_ptr
        nq = len(queries)

        # --- squared norms of the centroids ---
        expected_norms = (centroids ** 2).sum(1)
        norms = np.zeros(1 << quantizer.tot_bits, dtype="float32")
        quantizer.compute_centroid_norms(ptr(norms))
        np.testing.assert_array_almost_equal(
            norms, expected_norms, decimal=5)

        # --- inner-product search ---
        ip_dis_ref, ip_ids_ref = faiss.knn(
            queries, centroids, 10, metric=faiss.METRIC_INNER_PRODUCT)
        ip_dis = np.zeros_like(ip_dis_ref)
        ip_ids = np.zeros_like(ip_ids_ref)
        quantizer.knn_centroids_inner_product(
            nq, ptr(queries), 10, ptr(ip_dis), ptr(ip_ids))

        np.testing.assert_array_almost_equal(ip_dis_ref, ip_dis, decimal=5)
        np.testing.assert_array_equal(ip_ids_ref, ip_ids)

        # --- L2 search (takes the precomputed norms as an extra input) ---
        l2_dis_ref, l2_ids_ref = faiss.knn(
            queries, centroids, 10, metric=faiss.METRIC_L2)
        l2_dis = np.zeros_like(l2_dis_ref)
        l2_ids = np.zeros_like(l2_ids_ref)
        quantizer.knn_centroids_L2(
            nq, ptr(queries), 10, ptr(l2_dis), ptr(l2_ids), ptr(norms))

        np.testing.assert_array_equal(l2_ids_ref, l2_ids)
        np.testing.assert_array_almost_equal(l2_dis_ref, l2_dis, decimal=5)
Exemplo n.º 3
0
    def test_small_data(self):
        """Check that an IMI index probing every list matches exact search.

        The index is trained on fewer points (100) than it has inverted
        lists (256). With ``nprobe`` above the number of lists, both kNN
        and range search are expected to agree with brute-force results.
        """
        d = 20
        # nlist = (2^4)^2 = 256
        index = faiss.index_factory(d, 'IMI2x4,Flat')

        # When nprobe >= nlist, it is equivalent to an IndexFlat.
        rs = np.random.RandomState(123)
        xt = rs.rand(100, d).astype('float32')
        xb = rs.rand(1000, d).astype('float32')

        index.train(xt)
        index.add(xb)
        index.nprobe = 2048
        k = 5
        xq = rs.rand(10, d).astype('float32')

        # test kNN search
        ref_D, ref_I = index.search(xq, k)
        D, I = faiss.knn(xq, xb, k)
        assert np.all(D == ref_D)
        assert np.all(I == ref_I)

        # test range search
        thresh = 0.1  # *squared* distance
        ref_lims, ref_D, ref_I = index.range_search(xq, thresh)
        gt_index = faiss.IndexFlat(d)
        gt_index.add(xb)
        # Query the flat ground-truth index; querying `index` a second
        # time would only compare the IMI index with itself.
        lims, D, I = gt_index.range_search(xq, thresh)
        np.testing.assert_array_equal(lims, ref_lims)

        # Results inside one query's slice may come back in a different
        # order from the two index types, so compare them sorted by id.
        for lo, hi in zip(lims[:-1], lims[1:]):
            lo, hi = int(lo), int(hi)
            order = np.argsort(I[lo:hi])
            ref_order = np.argsort(ref_I[lo:hi])
            np.testing.assert_array_equal(
                I[lo:hi][order], ref_I[lo:hi][ref_order])
            np.testing.assert_allclose(
                D[lo:hi][order], ref_D[lo:hi][ref_order], rtol=1e-5)
Exemplo n.º 4
0
    def test_query_iterator(self, metric=faiss.METRIC_L2):
        """Check range_search_max_results when queries arrive in batches.

        First without a cap on the result count (must reproduce a plain
        ``range_search``), then with ``max_results`` set to half the
        reference result count (must respect the cap and match a plain
        ``range_search`` at the returned threshold).
        """
        dataset = datasets.SyntheticDataset(32, 0, 1000, 1000)
        queries = dataset.get_queries()
        database = dataset.get_database()

        # Use the mean distance to the 10th neighbor as search radius.
        knn_dis, _ = faiss.knn(queries, database, 10, metric=metric)
        threshold = float(knn_dis[:, -1].mean())
        print(threshold)

        index = faiss.IndexFlat(32, metric)
        index.add(database)
        ref_lims, ref_D, ref_I = index.range_search(queries, threshold)

        def batches(matrix, batch_size):
            # Yield consecutive row slices of at most batch_size rows.
            for start in range(0, matrix.shape[0], batch_size):
                yield matrix[start:start + batch_size]

        # check repro OK
        _, new_lims, new_D, new_I = range_search_max_results(
            index, batches(queries, 100), threshold)

        evaluation.test_ref_range_results(
            ref_lims, ref_D, ref_I, new_lims, new_D, new_I)

        # Now cap the total number of results at half the reference count.
        max_res = ref_lims[-1] // 2

        new_threshold, new_lims, new_D, new_I = range_search_max_results(
            index, batches(queries, 100), threshold, max_results=max_res)

        self.assertLessEqual(new_lims[-1], max_res)

        # The capped results must match a search at the new threshold.
        ref_lims, ref_D, ref_I = index.range_search(queries, new_threshold)

        evaluation.test_ref_range_results(
            ref_lims, ref_D, ref_I, new_lims, new_D, new_I)
Exemplo n.º 5
0
    def test_sparse_routines(self):
        """Check the sparse-to-dense assignment routines against faiss.knn.

        Both ``sparse_assign_to_dense`` and its blocked variant must give
        the same nearest centroid (and distance, to 4 decimals) as a dense
        brute-force search.
        """
        dataset = datasets.SyntheticDataset(1000, 2000, 0, 200)
        dense = dataset.get_train().copy()
        faiss.normalize_L2(dense)

        # Zero out small entries; the original author's note says this
        # leaves around 10% non-zeros.
        keep = np.abs(dense) > 0.045
        dense[~keep] = 0

        centroids = dataset.get_queries()
        assert len(centroids) == 200

        xsparse = scipy.sparse.csr_matrix(dense)

        ref_dis, ref_ids = faiss.knn(xsparse.todense(), centroids, 1)

        def check(dis, ids):
            # Same assignment, and distances equal to 4 decimals.
            np.testing.assert_array_equal(ref_ids.ravel(), ids)
            np.testing.assert_array_almost_equal(
                ref_dis.ravel(), dis, decimal=4)

        dis, ids = clustering.sparse_assign_to_dense(xsparse, centroids)
        check(dis, ids)

        dis, ids = clustering.sparse_assign_to_dense_blocks(
            xsparse, centroids, qbs=123, bbs=33, nt=4)
        check(dis, ids)
Exemplo n.º 6
0
    def subtest(self, d, K, metric):
        """Check the accuracy of the NN-Descent k-NN graph for one metric.

        Builds an ``IndexNNDescentFlat`` over 1000 vectors and compares
        its final graph with the exact neighbors from ``faiss.knn``.

        Args:
            d: vector dimension.
            K: number of neighbors per node in the graph.
            metric: one of the faiss metric constants in ``metric_names``.
        """
        metric_names = {faiss.METRIC_L1: 'L1',
                        faiss.METRIC_L2: 'L2',
                        faiss.METRIC_INNER_PRODUCT: 'IP'}

        nb = 1000
        _, xb, _ = get_dataset_2(d, 0, nb, 0)

        # Exact neighbors. K + 1 are requested and the first column is
        # dropped (presumably each vector's match with itself).
        _, knn = faiss.knn(xb, xb, K + 1, metric)
        knn = knn[:, 1:]

        index = faiss.IndexNNDescentFlat(d, K, metric)
        index.nndescent.S = 10
        index.nndescent.R = 32
        index.nndescent.L = K + 20
        index.nndescent.iter = 5
        index.verbose = True

        index.add(xb)
        graph = index.nndescent.final_graph
        graph = faiss.vector_to_array(graph)
        graph = graph.reshape(nb, K)

        # hits[i, j] is True when graph[i, j] appears anywhere in knn[i].
        # This counts exactly what the former triple Python loop counted,
        # but in a single vectorized comparison.
        hits = (graph[:, :, None] == knn[:, None, :]).any(axis=2)
        recall = float(hits.sum()) / (nb * K)
        print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall))
        assert recall > 0.99
Exemplo n.º 7
0
    def test_python_kmeans(self):
        """Check that the Python k-means is close to the faiss reference.

        Both are run with 100 centroids and 10 iterations; the Python
        version may have at most a 10% higher quantization error.
        """
        dataset = datasets.SyntheticDataset(32, 10000, 0, 0)
        source = dataset.get_train()

        # bad distribution to stress-test split code:
        # the first half of the training set is one repeated vector
        train = source[:10000].copy()
        train[:5000] = source[0]

        reference_km = faiss.Kmeans(dataset.d, 100, niter=10)
        reference_km.train(train)
        reference_dis, _ = faiss.knn(train, reference_km.centroids, 1)
        reference_err = reference_dis.sum()

        python_centroids = clustering.kmeans(
            100, clustering.DatasetAssign(train), 10)
        python_dis, _ = faiss.knn(train, python_centroids, 1)
        python_err = python_dis.sum()

        # 33517.645 and 33031.098
        self.assertLess(python_err, reference_err * 1.1)
Exemplo n.º 8
0
def eval_codec(q, xq, xb, gt):
    """Encode/decode with codec ``q`` and print timing and accuracy stats.

    Args:
        q: object providing ``compute_codes(x)`` and ``decode(codes)``.
        xq: query vectors, one per row.
        xb: database vectors, one per row.
        gt: ground-truth neighbor ids; only column 0 is used.

    Prints the encoding time of ``xb``, the mean squared reconstruction
    error, the 1-recall@1 of decoded queries against the decoded
    database, and the mean L2 norm of the reconstruction residual.
    """
    t0 = time.time()
    codes = q.compute_codes(xb)
    t1 = time.time()
    xb_decoded = q.decode(codes)
    residual = xb - xb_decoded
    recons_err = (residual ** 2).sum() / xb.shape[0]
    # for compatibility with the codec benchmarks
    err_compat = np.linalg.norm(residual, axis=1).mean()
    xq_decoded = q.decode(q.compute_codes(xq))
    _, I = faiss.knn(xq_decoded, xb_decoded, 1)
    # Normalize by the number of queries actually passed in, instead of
    # relying on a module-level `nq` that is not a parameter here.
    recall = (I[:, 0] == gt[:, 0]).sum() / xq.shape[0]
    print(
        f"\tencode time: {t1 - t0:.3f} reconstruction error: {recons_err:.3f} "
        f"1-recall@1: {recall:.4f} recons_err_compat {err_compat:.3f}")
Exemplo n.º 9
0
def _knn_search(queries, data, k):
    """
    Exact k-nearest-neighbor search (meant to be replaced by an
    approximate search later).

    Takes torch tensors and returns the faiss result pair
    ``(distances, indices)`` for the k nearest rows of ``data`` to each
    row of ``queries``.
    """
    if not torch.cuda.is_available():
        # CPU path: faiss.knn is fed NumPy arrays.
        queries_np = queries.detach().numpy()
        data_np = data.detach().numpy()
        return faiss.knn(queries_np, data_np, k)  # (distances, indices)

    # GPU path. Checking CUDA availability is not the best way to choose
    # it, but it signals that the GPU is being used.
    resources = faiss.StandardGpuResources()
    distances, indices = faiss.knn_gpu(resources, queries, data, k)
    return (distances.detach().cpu().numpy(),
            indices.detach().cpu().numpy())
Exemplo n.º 10
0
    def do_test_range(self, metric):
        """Check range_ground_truth against IndexFlat.range_search.

        The database is fed to ``range_ground_truth`` in batches of 100
        and the result is compared with a single flat-index range search
        for the given ``metric``.
        """
        dataset = datasets.SyntheticDataset(32, 0, 1000, 10)
        queries = dataset.get_queries()
        database = dataset.get_database()

        # Use the mean distance to the 10th neighbor as search radius.
        knn_dis, _ = faiss.knn(queries, database, 10, metric=metric)
        threshold = float(knn_dis[:, -1].mean())

        flat = faiss.IndexFlat(32, metric)
        flat.add(database)
        ref_lims, ref_D, ref_I = flat.range_search(queries, threshold)

        new_lims, new_D, new_I = range_ground_truth(
            queries,
            dataset.database_iterator(bs=100),
            threshold,
            metric_type=metric,
        )

        evaluation.test_ref_range_results(
            ref_lims, ref_D, ref_I, new_lims, new_D, new_I)
Exemplo n.º 11
0
 def test_rand_vector(self):
     """Check that rand_smooth_vectors output is reasonably compressible
     with a small PQ."""
     vectors = faiss.rand_smooth_vectors(1300, 32)
     # Split into 1000 training, 200 database and 100 query vectors.
     train, database, queries = (
         vectors[:1000], vectors[1000:1200], vectors[1200:])

     _, ground_truth = faiss.knn(queries, database, 10)

     pq_index = faiss.IndexPQ(32, 4, 4)
     pq_index.train(train)
     pq_index.add(database)
     _, found = pq_index.search(queries, 10)

     ninter = faiss.eval_intersection(found, ground_truth)
     # 445 for SyntheticDataset
     self.assertGreater(ninter, 420)
     self.assertLess(ninter, 460)
Exemplo n.º 12
0
    def do_test(self, metric):
        """Check IndexRefine on top of a PQ index for the given metric.

        Verifies that refined distances equal the exact distance to the
        returned vector, and that a 10-result refined search with
        ``k_factor = 10`` finds the true nearest neighbor exactly as
        often as a 100-result search on the PQ index alone.
        """
        d = 32
        xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)
        index1 = faiss.index_factory(d, "PQ4x4np", metric)
        _, Iref = faiss.knn(xq, xb, 10, metric)

        index1.train(xt)
        index1.add(xb)

        _, I1 = index1.search(xq, 100)

        # number of queries whose true nearest neighbor is in the top 100
        recall1 = (I1 == Iref[:, :1]).sum()

        # add refine index on top
        index_flat = faiss.IndexFlat(d, metric)
        index_flat.add(xb)

        index2 = faiss.IndexRefine(index1, index_flat)
        index2.k_factor = 10.0
        D2, I2 = index2.search(xq, 10)

        # check distance is computed properly (spot-check result rank 5)
        for query, ids, distances in zip(xq, I2, D2):
            neighbor = xb[ids[5]]
            if metric == faiss.METRIC_L2:
                dref = ((query - neighbor) ** 2).sum()
            else:
                dref = np.dot(query, neighbor)
            np.testing.assert_almost_equal(dref, distances[5], decimal=5)

        # check that with refinement, the recall@10 is the same as
        # the original recall@100
        recall2 = (I2 == Iref[:, :1]).sum()
        # assertEquals was a deprecated alias and is gone in Python 3.12.
        self.assertEqual(recall1, recall2)
Exemplo n.º 13
0
 def perform_search(self, centroids):
     """Return the ``faiss.knn`` result of searching ``self.x`` against
     ``centroids`` with k = 1."""
     nearest = faiss.knn(self.x, centroids, 1)
     return nearest
Exemplo n.º 14
0
 def get_groundtruth(self, k=100):
     """Return the ids of the ``k`` exact neighbors of ``self.xq`` in
     ``self.xb``.

     Uses the L2 metric when ``self.metric == 'L2'`` and inner product
     for any other value.
     """
     if self.metric == 'L2':
         metric_type = faiss.METRIC_L2
     else:
         metric_type = faiss.METRIC_INNER_PRODUCT
     result = faiss.knn(self.xq, self.xb, k, metric_type)
     # faiss.knn's result is indexed at 1 to keep only the neighbor ids.
     return result[1]