Example #1
    def do_read_callback(self, bsz):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)

        fd, fname = tempfile.mkstemp()
        os.close(fd)
        try:
            faiss.write_index(index, fname)

            with open(fname, 'rb') as f:
                reader = faiss.PyCallbackIOReader(f.read, 1234)

                if bsz > 0:
                    reader = faiss.BufferedIOReader(reader, bsz)

                index2 = faiss.read_index(reader)

            self.assertEqual(index.d, index2.d)
            np.testing.assert_array_equal(faiss.vector_to_array(index.xb),
                                          faiss.vector_to_array(index2.xb))

            # This is not a callable function: should raise an exception
            reader = faiss.PyCallbackIOReader("blabla")
            self.assertRaises(Exception, faiss.read_index, reader)
        finally:
            if os.path.exists(fname):
                os.unlink(fname)
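
The examples in this listing all revolve around faiss.vector_to_array, which copies a SWIG-wrapped C++ std::vector (Float32Vector, Int64Vector, UInt64Vector, ...) into a NumPy array. A minimal round-trip sketch, assuming only numpy and faiss are available (variable names are illustrative):

import numpy as np
import faiss

v = faiss.Float32Vector()                      # SWIG wrapper around std::vector<float>
faiss.copy_array_to_vector(np.arange(5, dtype='float32'), v)
a = faiss.vector_to_array(v)                   # copy back into a numpy array
assert np.array_equal(a, np.arange(5, dtype='float32'))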
Example #2
def run_kmeans(x, nmb_clusters, verbose=False, use_gpu=True):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    if use_gpu:
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = 0
        index = faiss.GpuIndexFlatL2(res, d, flat_config)
    else:
        index = faiss.IndexFlatL2(d)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    centroids = faiss.vector_to_array(clus.centroids).reshape(
        (nmb_clusters, d))  # Also return centroids!
    losses = faiss.vector_to_array(clus.obj)

    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1], centroids, index
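
A minimal call sketch for the helper above (data and cluster count are illustrative; the GPU branch assumes a faiss build with GPU support, so the CPU path is used here):

x = np.random.rand(10000, 128).astype('float32')
ids, loss, centroids, index = run_kmeans(x, nmb_clusters=100, use_gpu=False)
print(len(ids), loss, centroids.shape)    # 10000, final k-means objective, (100, 128)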
Example #3
    def test_8bit_equiv(self):
        rs = np.random.RandomState(123)
        for _it in range(20):
            for d in 13, 16, 24:
                x = np.floor(rs.rand(5, d) * 256).astype('float32')
                x[0] = 0
                x[1] = 255

                # make sure to test extreme cases
                x[2, 0] = 0
                x[3, 0] = 255
                x[2, 1] = 255
                x[3, 1] = 0

                ref_index = faiss.IndexScalarQuantizer(
                    d, faiss.ScalarQuantizer.QT_8bit)
                ref_index.train(x[:2])
                ref_index.add(x[2:3])

                index = faiss.IndexScalarQuantizer(
                    d, faiss.ScalarQuantizer.QT_8bit_direct)
                assert index.is_trained
                index.add(x[2:3])

                assert np.all(
                    faiss.vector_to_array(ref_index.codes) ==
                    faiss.vector_to_array(index.codes))

                # Note that distances are not the same because ref_index
                # reconstructs x as x + 0.5
                D, I = index.search(x[3:], 1)

                # assert D[0, 0] == Dref[0, 0]
                print(D[0, 0], ((x[3] - x[2]) ** 2).sum())
                assert D[0, 0] == ((x[3] - x[2]) ** 2).sum()
Example #4
def preprocess_features(npdata, pca=256, pca_info=None):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    if pca_info is None:
        # Apply PCA-whitening with Faiss
        pca_matrix = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
        pca_matrix.train(npdata)
        assert pca_matrix.is_trained
        npdata = pca_matrix.apply_py(npdata)

        pca_A = np.transpose(
            faiss.vector_to_array(pca_matrix.A).reshape((pca, ndim)))
        pca_b = faiss.vector_to_array(pca_matrix.b)
        pca_info = (pca_A, pca_b)
    else:
        npdata = np.dot(npdata, pca_info[0]) + pca_info[1]

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]

    return npdata, pca_info
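
A usage sketch for the helper above (feature arrays are hypothetical; the point is that the pca_info fitted on one batch can be reused so that later batches go through the same projection):

feats_a = np.random.rand(5000, 2048).astype('float32')
feats_b = np.random.rand(3000, 2048).astype('float32')

reduced_a, pca_info = preprocess_features(feats_a, pca=256)               # fits the PCA
reduced_b, _ = preprocess_features(feats_b, pca=256, pca_info=pca_info)   # reuses it
print(reduced_a.shape, reduced_b.shape)    # (5000, 256) (3000, 256)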
Example #5
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    index = faiss.IndexFlatL2(d)

    # perform the training
    clus.train(x, index)
    Dist, Ind = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    centers = faiss.vector_to_array(clus.centroids).reshape(nmb_clusters, -1)
    kmeans_trans_weight = -2 * centers
    kmeans_trans_bias = (centers**2).sum(axis=1)  # (K,)

    return [int(n[0]) for n in Ind], [float(n[0]) for n in Dist], losses[-1], clus, \
        [kmeans_trans_weight, kmeans_trans_bias], centers
Example #6
    def test_encoded(self):
        d = 32
        k = 5
        xt, xb, xq = get_dataset_2(d, 1000, 0, 0)

        # make sure that training on a compressed then decompressed
        # dataset gives the same result as decompressing on-the-fly

        codec = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_4bit)
        codec.train(xt)
        codes = codec.sa_encode(xt)

        xt2 = codec.sa_decode(codes)

        clus = faiss.Clustering(d, k)
        # clus.verbose = True
        clus.niter = 0
        index = faiss.IndexFlatL2(d)
        clus.train(xt2, index)
        ref_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d)

        _, ref_errs = index.search(xt2, 1)

        clus = faiss.Clustering(d, k)
        # clus.verbose = True
        clus.niter = 0
        clus.decode_block_size = 120
        index = faiss.IndexFlatL2(d)
        clus.train_encoded(codes, codec, index)
        new_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d)

        _, new_errs = index.search(xt2, 1)

        # It's the same operation, so the results should be bitwise identical
        self.assertTrue(np.all(ref_centroids == new_centroids))
Example #7
    def do_write_callback(self, bsz):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)

        f = io.BytesIO()
        # test with small block size
        writer = faiss.PyCallbackIOWriter(f.write, 1234)

        if bsz > 0:
            writer = faiss.BufferedIOWriter(writer, bsz)

        faiss.write_index(index, writer)
        del writer  # make sure all writes committed

        if sys.version_info[0] < 3:
            buf = f.getvalue()
        else:
            buf = f.getbuffer()

        index2 = faiss.deserialize_index(np.frombuffer(buf, dtype='uint8'))

        self.assertEqual(index.d, index2.d)
        self.assertTrue(
            np.all(
                faiss.vector_to_array(index.xb) == faiss.vector_to_array(
                    index2.xb)))

        # This is not a callable function: should raise an exception
        writer = faiss.PyCallbackIOWriter("blabla")
        self.assertRaises(Exception, faiss.write_index, index, writer)
Example #8
    def test_precomp(self):
        ds = datasets.SyntheticDataset(32, 1000, 1000, 0)

        # make sure it works with a varying number of bits
        nbits = faiss.UInt64Vector()
        nbits.push_back(5)
        nbits.push_back(6)
        nbits.push_back(7)

        rq = faiss.ResidualQuantizer(ds.d, nbits)
        rq.train_type = faiss.ResidualQuantizer.Train_default
        rq.train(ds.get_train())

        codebooks = get_additive_quantizer_codebooks(rq)
        precomp = precomp_codebooks(codebooks)
        codebook_cross_prods_ref, cent_norms_ref = precomp

        # check C++ precomp tables
        codebook_cross_prods_ref = np.hstack([
            np.vstack(c) for c in codebook_cross_prods_ref])

        rq.compute_codebook_tables()
        codebook_cross_prods = faiss.vector_to_array(
            rq.codebook_cross_products)
        codebook_cross_prods = codebook_cross_prods.reshape(
            rq.total_codebook_size, rq.total_codebook_size)
        cent_norms = faiss.vector_to_array(rq.cent_norms)

        np.testing.assert_array_almost_equal(
            codebook_cross_prods, codebook_cross_prods_ref, decimal=5)
        np.testing.assert_array_almost_equal(
            np.hstack(cent_norms_ref), cent_norms, decimal=5)

        # validate that the python tab-based encoding works
        xb = ds.get_database()
        ref_codes, _, _ = beam_search_encoding_ref(codebooks, xb, 7)
        new_codes, _ = beam_search_encoding_tab(codebooks, xb, 7, precomp)
        np.testing.assert_array_equal(ref_codes, new_codes)

        # validate the C++ beam_search_encode_step_tab function
        beam_search_encoding_tab(codebooks, xb, 7, precomp, implem="ref cpp")

        # check implem w/ residuals
        n = ref_codes.shape[0]
        sp = faiss.swig_ptr
        ref_codes_packed = np.zeros((n, rq.code_size), dtype='uint8')
        ref_codes_int32 = ref_codes.astype('int32')
        rq.pack_codes(
            n, sp(ref_codes_int32),
            sp(ref_codes_packed), rq.M * ref_codes.shape[1]
        )

        rq.max_beam_size = 7
        codes_ref_residuals = rq.compute_codes(xb)
        np.testing.assert_array_equal(ref_codes_packed, codes_ref_residuals)

        rq.use_beam_LUT = 1
        codes_new = rq.compute_codes(xb)
        np.testing.assert_array_equal(codes_ref_residuals, codes_new)
Example #9
    def do_test(self, d, dsub, nbit=8, metric=None):
        if metric is None:
            self.do_test(d, dsub, nbit, faiss.METRIC_INNER_PRODUCT)
            self.do_test(d, dsub, nbit, faiss.METRIC_L2)
            return
        # faiss.cvar.distance_compute_blas_threshold = 1000000

        M = d // dsub
        pq = faiss.ProductQuantizer(d, M, nbit)
        xt = faiss.randn((max(1000, pq.ksub * 50), d), 123)
        pq.cp.niter = 4  # to avoid timeouts in tests
        pq.train(xt)

        centroids = faiss.vector_to_array(pq.centroids)
        centroids = centroids.reshape(pq.M, pq.ksub, pq.dsub)

        nx = 100
        x = faiss.randn((nx, d), 555)

        ref_tab = np.zeros((nx, M, pq.ksub), "float32")

        # computation of tables in numpy
        for sq in range(M):
            i0, i1 = sq * dsub, (sq + 1) * dsub
            xsub = x[:, i0:i1]
            centsq = centroids[sq, :, :]
            if metric == faiss.METRIC_INNER_PRODUCT:
                ref_tab[:, sq, :] = xsub @ centsq.T
            elif metric == faiss.METRIC_L2:
                xsub3 = xsub.reshape(nx, 1, dsub)
                cent3 = centsq.reshape(1, pq.ksub, dsub)
                ref_tab[:, sq, :] = ((xsub3 - cent3)**2).sum(2)
            else:
                assert False

        sp = faiss.swig_ptr

        new_tab = np.zeros((nx, M, pq.ksub), "float32")
        if metric == faiss.METRIC_INNER_PRODUCT:
            pq.compute_inner_prod_tables(nx, sp(x), sp(new_tab))
        elif metric == faiss.METRIC_L2:
            pq.compute_distance_tables(nx, sp(x), sp(new_tab))
        else:
            assert False

        # compute sdc tables in numpy
        cent1 = np.expand_dims(centroids, axis=2)  # [M, ksub, 1, dsub]
        cent2 = np.expand_dims(centroids, axis=1)  # [M, 1, ksub, dsub]
        ref_sdc_tab = ((cent1 - cent2)**2).sum(3)

        pq.compute_sdc_table()
        new_sdc_tab = faiss.vector_to_array(pq.sdc_table)
        new_sdc_tab = new_sdc_tab.reshape(M, pq.ksub, pq.ksub)

        np.testing.assert_array_almost_equal(ref_tab, new_tab, decimal=5)
        np.testing.assert_array_almost_equal(ref_sdc_tab,
                                             new_sdc_tab,
                                             decimal=5)
Example #10
def run_kmeans(x, nmb_clusters):
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from one epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
Example #11
def run_kmeans(x, nmb_clusters, verbose=False, 
               seed=DEFAULT_KMEANS_SEED, gpu_device=0):
    """
    Runs kmeans on 1 GPU.
    
    Args:
    -----
    x: data
    nmb_clusters (int): number of clusters
    
    Returns:
    --------
    list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    clus.seed = seed
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = gpu_device
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
Example #12
def preprocess_features(npdata, pca=32, eps=1e-5):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): initial dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')
    npdata = npdata - np.mean(npdata, axis=0)

    # Apply PCA-whitening with Faiss
    mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained
    eigs = faiss.vector_to_array(mat.eigenvalues)
    pca = np.argwhere(
        np.cumsum(sorted(eigs / np.sum(eigs), reverse=True)) >= 0.95)[0, 0]
    mat = faiss.PCAMatrix(ndim, int(pca), eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained

    npdata = mat.apply_py(npdata)
    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / np.clip(row_sums[:, np.newaxis], eps, None)

    return npdata
Example #13
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        cluster ids of each data point, the final loss, and per-sample
        distances to the assigned centroid
    """
    n_data, d = x.shape
    # print(n_data, d)
    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from one epoch to another.
    clus.seed = np.random.randint(1234)
    clus.niter = 20
    # clus.min_points_per_centroid = 5
    clus.max_points_per_centroid = 100000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    # flat_config = faiss.GpuIndexIVFFlatConfig()   # IVF
    flat_config.useFloat16 = False
    flat_config.device = 0
    # index = faiss.GpuIndexIVFFlat(res, d, nmb_clusters, faiss.METRIC_L2, flat_config)  # faiss.Metric_INNER_PRODUCT,
    index = faiss.GpuIndexFlatL2(res, d, flat_config)
    # index = faiss.GpuIndexIP(res, d, flat_config)  # Inner product between samples
    # perform the training
    clus.train(x, index)
    D, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))
    return [int(n[0]) for n in I], losses[-1], np.array([(d[0]) for d in D])
Example #14
    def test_serialize_to_vector(self):
        d = 10
        nb = 1000
        nq = 200
        nt = 500
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        index = faiss.IndexFlatL2(d)
        index.add(xb)

        Dref, Iref = index.search(xq, 5)

        writer = faiss.VectorIOWriter()
        faiss.write_index(index, writer)

        ar_data = faiss.vector_to_array(writer.data)

        # direct transfer of vector
        reader = faiss.VectorIOReader()
        reader.data.swap(writer.data)

        index2 = faiss.read_index(reader)

        Dnew, Inew = index2.search(xq, 5)
        assert np.all(Dnew == Dref) and np.all(Inew == Iref)

        # from intermediate numpy array
        reader = faiss.VectorIOReader()
        faiss.copy_array_to_vector(ar_data, reader.data)

        index3 = faiss.read_index(reader)

        Dnew, Inew = index3.search(xq, 5)
        assert np.all(Dnew == Dref) and np.all(Inew == Iref)
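
For comparison, faiss also exposes serialize_index / deserialize_index, which effectively wrap the same VectorIOWriter / VectorIOReader machinery shown above; a short sketch with illustrative sizes:

index = faiss.IndexFlatL2(10)
index.add(np.random.rand(100, 10).astype('float32'))

buf = faiss.serialize_index(index)         # numpy array of dtype uint8
index2 = faiss.deserialize_index(buf)      # reconstructed index
assert index2.ntotal == index.ntotal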
Example #15
    def test_rq3(self):
        index = faiss.index_factory(5, "RQ2x16_3x8_6x4")

        np.testing.assert_array_equal(
            faiss.vector_to_array(index.rq.nbits),
            np.array([16, 16, 8, 8, 8, 4, 4, 4, 4, 4, 4])
        )
Example #16
    def test_lut(self):
        """test compute_LUT function"""
        ds = datasets.SyntheticDataset(16, 1000, 0, 100)

        xt = ds.get_train()
        xq = ds.get_queries()

        nsplits = 2
        Msub = 2
        nbits = 4
        nq, d = xq.shape
        dsub = d // nsplits

        plsq = faiss.ProductLocalSearchQuantizer(ds.d, nsplits, Msub, nbits)
        plsq.train(xt)

        subcodebook_size = Msub * (1 << nbits)
        codebook_size = nsplits * subcodebook_size
        lut = np.zeros((nq, codebook_size), dtype=np.float32)
        sp = faiss.swig_ptr  # shorthand used throughout these tests
        plsq.compute_LUT(nq, sp(xq), sp(lut))

        codebooks = faiss.vector_to_array(plsq.codebooks)
        codebooks = codebooks.reshape(nsplits, subcodebook_size, dsub)
        xq = xq.reshape(nq, nsplits, dsub)
        lut_ref = np.zeros((nq, nsplits, subcodebook_size), dtype=np.float32)
        for i in range(nsplits):
            lut_ref[:, i] = xq[:, i] @ codebooks[i].T
        lut_ref = lut_ref.reshape(nq, codebook_size)

        # max rtol observed on OSX: 2.87e-6
        np.testing.assert_allclose(lut, lut_ref, rtol=5e-06)
Example #17
    def test_int64(self):
        # see https://github.com/facebookresearch/faiss/issues/1529
        v = faiss.Int64Vector()

        for i in range(10):
            v.push_back(i)
        a = faiss.vector_to_array(v)
        assert a.dtype == 'int64'
        np.testing.assert_array_equal(a, np.arange(10, dtype='int64'))

        # check if it works in an IDMap
        idx = faiss.IndexIDMap(faiss.IndexFlatL2(32))
        idx.add_with_ids(
            np.random.rand(10, 32).astype('float32'),
            np.random.randint(1000, size=10, dtype='int64'))
        faiss.vector_to_array(idx.id_map)
Example #18
    def subtest(self, d, K, metric):
        metric_names = {faiss.METRIC_L1: 'L1',
                        faiss.METRIC_L2: 'L2',
                        faiss.METRIC_INNER_PRODUCT: 'IP'}

        nb = 1000
        _, xb, _ = get_dataset_2(d, 0, nb, 0)

        _, knn = faiss.knn(xb, xb, K + 1, metric)
        knn = knn[:, 1:]

        index = faiss.IndexNNDescentFlat(d, K, metric)
        index.nndescent.S = 10
        index.nndescent.R = 32
        index.nndescent.L = K + 20
        index.nndescent.iter = 5
        index.verbose = True

        index.add(xb)
        graph = index.nndescent.final_graph
        graph = faiss.vector_to_array(graph)
        graph = graph.reshape(nb, K)

        recalls = 0
        for i in range(nb):
            for j in range(K):
                for k in range(K):
                    if graph[i, j] == knn[i, k]:
                        recalls += 1
                        break
        recall = 1.0 * recalls / (nb * K)
        print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall))
        assert recall > 0.99
Example #19
def run_kmeans(x, num_clusters, temperature):
    """
    Args:
        x: data to be clustered
    """
    print('performing kmeans clustering')
    results = {'im2cluster': [], 'centroids': [], 'density': []}
    for seed, num_cluster in enumerate(num_clusters):
        # initialize faiss clustering parameters
        d = x.shape[1]
        k = int(num_cluster)
        clus = faiss.Clustering(d, k)
        clus.verbose = False
        clus.niter = 20
        clus.nredo = 5
        clus.seed = seed
        clus.max_points_per_centroid = 1000
        clus.min_points_per_centroid = 5
        res = faiss.StandardGpuResources()
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = 0
        index = faiss.GpuIndexFlatL2(res, d, cfg)
        clus.train(x, index)
        D, I = index.search(
            x, 1)  # for each sample, find cluster distance and assignments
        im2cluster = [int(n[0]) for n in I]
        # get cluster centroids
        centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)
        # sample-to-centroid distances for each cluster
        Dcluster = [[] for c in range(k)]
        for im, i in enumerate(im2cluster):
            Dcluster[i].append(D[im][0])
        # concentration estimation (phi)
        density = np.zeros(k)
        for i, dist in enumerate(Dcluster):
            if len(dist) > 1:
                d = (np.asarray(dist)**0.5).mean() / np.log(len(dist) + 10)
                density[i] = d
                # if cluster only has one point, use the max to estimate its concentration
        dmax = density.max()
        for i, dist in enumerate(Dcluster):
            if len(dist) <= 1:
                density[i] = dmax
        density = density.clip(np.percentile(density, 10),
                               np.percentile(
                                   density,
                                   90))  # clamp extreme values for stability
        density = temperature * density / density.mean(
        )  # scale the mean to temperature
        # convert to cuda Tensors for broadcast
        centroids = torch.Tensor(centroids).cuda()
        centroids = nn.functional.normalize(centroids, p=2, dim=1)
        im2cluster = torch.LongTensor(im2cluster).cuda()
        density = torch.Tensor(density).cuda()
        results['centroids'].append(centroids)
        results['density'].append(density)
        results['im2cluster'].append(im2cluster)
    return results
Example #20
    def test_redo(self):
        d = 64
        n = 1000

        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')

        clus = faiss.Clustering(d, 20)
        clus.nredo = 1
        clus.train(x, faiss.IndexFlatL2(d))
        obj1 = faiss.vector_to_array(clus.obj)

        clus = faiss.Clustering(d, 20)
        clus.nredo = 10
        clus.train(x, faiss.IndexFlatL2(d))
        obj10 = faiss.vector_to_array(clus.obj)

        self.assertGreater(obj1[-1], obj10[-1])
Example #21
    def test_redo(self):
        d = 64
        n = 1000

        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')

        clus = faiss.Clustering(d, 20)
        clus.nredo = 1
        clus.train(x, faiss.IndexFlatL2(d))
        obj1 = faiss.vector_to_array(clus.obj)

        clus = faiss.Clustering(d, 20)
        clus.nredo = 10
        clus.train(x, faiss.IndexFlatL2(d))
        obj10 = faiss.vector_to_array(clus.obj)

        self.assertGreater(obj1[-1], obj10[-1])
Example #22
        def train_and_get_centroids(override_kmeans_index):
            index = faiss.index_binary_factory(d, b"BIVF10")
            index.verbose = True

            if override_kmeans_index is not None:
                index.clustering_index = override_kmeans_index

            index.train(xt)

            centroids = faiss.downcast_IndexBinary(index.quantizer).xb
            return faiss.vector_to_array(centroids).reshape(-1, d // 8)
Example #23
    def test_read_buffer(self):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)

        _, fname = tempfile.mkstemp()
        try:
            faiss.write_index(index, fname)

            reader = faiss.BufferedIOReader(faiss.FileIOReader(fname), 1234)

            index2 = faiss.read_index(reader)

            self.assertEqual(index.d, index2.d)
            np.testing.assert_array_equal(faiss.vector_to_array(index.xb),
                                          faiss.vector_to_array(index2.xb))

        finally:
            if os.path.exists(fname):
                os.unlink(fname)
Example #24
    def test_equiv_rq(self):
        """
        make sure it is equivalent to search a RQ and to search an IVF
        with RCQ + RQ with the same codebooks.
        """
        ds = datasets.SyntheticDataset(32, 3000, 1000, 50)

        # make a flat RQ
        iflat = faiss.IndexResidualQuantizer(ds.d, 5, 4)
        iflat.rq.train_type = faiss.ResidualQuantizer.Train_default
        iflat.train(ds.get_train())
        iflat.add(ds.get_database())

        # ref search result
        Dref, Iref = iflat.search(ds.get_queries(), 10)

        # get its codebooks + encoded version of the dataset
        codebooks = get_additive_quantizer_codebooks(iflat.rq)
        codes = faiss.vector_to_array(iflat.codes).reshape(-1, iflat.code_size)

        # make an IVF with 2x4 + 3x4 = 5x4 bits
        ivf = faiss.index_factory(ds.d, "IVF256(RCQ2x4),RQ3x4")

        # initialize the codebooks
        rcq = faiss.downcast_index(ivf.quantizer)
        faiss.copy_array_to_vector(
            np.vstack(codebooks[:rcq.rq.M]).ravel(),
            rcq.rq.codebooks
        )
        rcq.rq.is_trained = True
        # translation of AdditiveCoarseQuantizer::train
        rcq.ntotal = 1 << rcq.rq.tot_bits
        rcq.centroid_norms.resize(rcq.ntotal)
        rcq.rq.compute_centroid_norms(rcq.centroid_norms.data())
        rcq.is_trained = True

        faiss.copy_array_to_vector(
            np.vstack(codebooks[rcq.rq.M:]).ravel(),
            ivf.rq.codebooks
        )
        ivf.rq.is_trained = True
        ivf.is_trained = True

        # add the codes (this works because 2x4 is a multiple of 8 bits)
        ivf.add_sa_codes(codes)

        # perform exhaustive search
        ivf.nprobe = ivf.nlist

        Dnew, Inew = ivf.search(ds.get_queries(), 10)

        np.testing.assert_array_equal(Iref, Inew)
        np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
Example #25
def main():
    parser = get_parser()
    args = parser.parse_args()

    print("Reading features")
    x = np.load(args.data, mmap_mode="r")

    print("Computing PCA")
    pca = faiss.PCAMatrix(x.shape[-1], args.dim, args.eigen_power)
    pca.train(x)
    b = faiss.vector_to_array(pca.b)
    A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)

    os.makedirs(args.output, exist_ok=True)

    prefix = str(args.dim)
    if args.eigen_power != 0:
        prefix += f"_{args.eigen_power}"

    np.save(osp.join(args.output, f"{prefix}_pca_A"), A.T)
    np.save(osp.join(args.output, f"{prefix}_pca_b"), b)
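
A hedged sketch of how the saved matrices might be consumed downstream (file names follow the prefix convention above and are illustrative; PCAMatrix applies y = A x + b, and since the transposed matrix is what gets saved, the projection reduces to a plain matmul):

A = np.load("256_pca_A.npy")               # shape (d_in, dim), the transposed PCA matrix
b = np.load("256_pca_b.npy")               # shape (dim,)
feats = np.load("features.npy")            # hypothetical (n, d_in) float32 features
projected = feats @ A + b                  # matches pca.apply_py(feats) up to float precision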
Example #26
    def __init__(self, ds, indexfile):
        self.d = ds.d
        self.metric = ds.metric
        self.nq = ds.nq
        self.xq = ds.get_queries()

        # get the xb set
        src_index = faiss.read_index(indexfile)
        src_quant = faiss.downcast_index(src_index.quantizer)
        centroids = faiss.vector_to_array(src_quant.xb)
        self.xb = centroids.reshape(-1, self.d)
        self.nb = self.nt = len(self.xb)
Example #27
    def subtest_cluster1d(self, n, k):
        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, 1)).astype('float32')

        clus = faiss.Clustering1D(k)
        clus.train_exact(x)
        centroids = faiss.vector_to_array(clus.centroids).reshape((-1, 1))
        obj = self.evaluate_obj(centroids, x)

        clus2 = faiss.Kmeans(1, k)
        clus2.train(x)
        obj2 = self.evaluate_obj(clus2.centroids, x)
        self.assertLessEqual(obj, obj2)
Example #28
def unpack_codes(rq, packed_codes):
    nbits = faiss.vector_to_array(rq.nbits)
    if np.all(nbits == 8):
        return packed_codes.astype("uint32")
    nbits = [int(x) for x in nbits]
    nb = len(nbits)
    n, code_size = packed_codes.shape
    codes = np.zeros((n, nb), dtype="uint32")
    for i in range(n):
        br = faiss.BitstringReader(faiss.swig_ptr(packed_codes[i]), code_size)
        for j, nbi in enumerate(nbits):
            codes[i, j] = br.read(nbi)
    return codes
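
A usage sketch for unpack_codes above (assuming a ResidualQuantizer trained with a mix of bit widths, so that the bit-unpacking path is exercised; sizes are illustrative):

d = 32
nbits = faiss.UInt64Vector()
for b in [6, 7, 8]:
    nbits.push_back(b)
rq = faiss.ResidualQuantizer(d, nbits)
rq.train(np.random.rand(2000, d).astype('float32'))

packed = rq.compute_codes(np.random.rand(10, d).astype('float32'))  # (10, rq.code_size) uint8
codes = unpack_codes(rq, packed)                                     # (10, 3) uint32, one column per step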
Example #29
    def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
        import faiss
        num_samples, num_dim = vecs.shape
        assert self.output_dim <= num_samples, 'training PCA requires at least %d points, but %d was given' % (
            self.output_dim, num_samples)
        assert self.output_dim < num_dim, 'PCA output dimension should be < data dimension, received (%d, %d)' % (
            self.output_dim, num_dim)

        pca = faiss.PCAMatrix(num_dim, self.output_dim)
        self.mean = np.mean(vecs, axis=0)  # 1 x 768
        pca.train(vecs)
        explained_variance_ratio = faiss.vector_to_array(
            pca.eigenvalues)[:self.output_dim]
        components = faiss.vector_to_array(pca.PCAMat).reshape(
            [-1, num_dim])[:self.output_dim]

        # permute eigenvectors according to explained variance
        opt_order = get_perm(explained_variance_ratio, self.num_locals)
        comp_tmp = np.reshape(components[opt_order],
                              [self.output_dim, num_dim])

        self.pca_components = np.transpose(comp_tmp)  # 768 x 200
Example #30
    def test_int64(self):
        # see https://github.com/facebookresearch/faiss/issues/1529
        sizeof_long = array.array("l").itemsize
        if sizeof_long == 4:
            v = faiss.LongLongVector()
        elif sizeof_long == 8:
            v = faiss.LongVector()
        else:
            raise AssertionError("weird long size")

        for i in range(10):
            v.push_back(i)
        a = faiss.vector_to_array(v)
        assert a.dtype == 'int64'
        np.testing.assert_array_equal(a, np.arange(10, dtype='int64'))

        # check if it works in an IDMap
        idx = faiss.IndexIDMap(faiss.IndexFlatL2(32))
        idx.add_with_ids(
            np.random.rand(10, 32).astype('float32'),
            np.random.randint(1000, size=10, dtype='int64'))
        faiss.vector_to_array(idx.id_map)
Example #31
    def subtest_add2col(self, xb, xq, index, qname):
        """Test with 2 additional dimensions to take also the non-SIMD
        codepath. We don't retrain anything but add 2 dims to the
        queries, the centroids and the trained ScalarQuantizer.
        """
        nb, d = xb.shape

        d2 = d + 2
        xb2 = self.add2columns(xb)
        xq2 = self.add2columns(xq)

        nlist = index.nlist
        quantizer = faiss.downcast_index(index.quantizer)
        quantizer2 = faiss.IndexFlat(d2, index.metric_type)
        centroids = faiss.vector_to_array(quantizer.xb).reshape(nlist, d)
        centroids2 = self.add2columns(centroids)
        quantizer2.add(centroids2)
        index2 = faiss.IndexIVFScalarQuantizer(
            quantizer2, d2, index.nlist, index.sq.qtype,
            index.metric_type)
        index2.nprobe = 4
        if qname in ('8bit', '4bit'):
            trained = faiss.vector_to_array(index.sq.trained).reshape(2, -1)
            nt = trained.shape[1]
            # 2 lines: vmins and vdiffs
            new_nt = int(nt * d2 / d)
            trained2 = np.hstack((
                trained,
                np.zeros((2, new_nt - nt), dtype='float32')
            ))
            trained2[1, nt:] = 1.0   # set vdiff to 1 to avoid div by 0
            faiss.copy_array_to_vector(trained2.ravel(), index2.sq.trained)
        else:
            index2.sq.trained = index.sq.trained

        index2.is_trained = True
        index2.add(xb2)
        return index2.search(xq2, 10)