예제 #1
0
파일: test_pq.py 프로젝트: matsui528/nanopq
 def test_instantiate(self):
     """Check that `code_dtype` is uint8 for Ks=256, uint16 for Ks=500 and uint32 for Ks=2**16 + 10."""
     expected_dtype_by_ks = [
         (256, np.uint8),
         (500, np.uint16),
         (2**16 + 10, np.uint32),
     ]
     # Build every codec first, then verify each one
     quantizers = [nanopq.PQ(M=4, Ks=ks) for ks, _ in expected_dtype_by_ks]
     for quantizer, (_, dtype) in zip(quantizers, expected_dtype_by_ks):
         self.assertEqual(quantizer.code_dtype, dtype)
예제 #2
0
파일: test_pq.py 프로젝트: matsui528/nanopq
    def test_fit(self):
        """Fit a PQ codec and check the sub-vector size and the codeword shape."""
        N, D, M, Ks = 100, 12, 4, 10
        X = np.random.random((N, D)).astype(np.float32)
        pq = nanopq.PQ(M=M, Ks=Ks)
        pq.fit(X)
        # Sizes and shapes are integers: use floor division rather than
        # comparing against the float that `D / M` yields on Python 3.
        Ds = D // M
        self.assertEqual(pq.Ds, Ds)
        self.assertEqual(pq.codewords.shape, (M, Ks, Ds))

        pq2 = nanopq.PQ(M=M, Ks=Ks).fit(X)  # Can be called as a chain
        self.assertTrue(np.allclose(pq.codewords, pq2.codewords))
예제 #3
0
    def fit(self, vecs, iter=20, seed=123):
        """Given training vectors, train a codec (PQ or OPQ instance).

        This should be called first and only once.

        Args:
            vecs (np.ndarray): Training vectors with shape=(Nt, D) and dtype=np.float32.
            iter (int): The number of iterations for k-means of PQ/OPQ
            seed (int): The seed for random process

        Returns:
            object: self

        Raises:
            AssertionError: If `fit` has already been called, or if `vecs`
                is not of dtype np.float32.
            ValueError: If `self.codec` is neither "pq" nor "opq".

        """
        assert self.fine_quantizer is None, "`fit` should be called only once"
        assert vecs.dtype == np.float32

        if self.codec == "pq":
            self.fine_quantizer = nanopq.PQ(M=self.M,
                                            Ks=self.Ks,
                                            verbose=self.verbose)
            self.fine_quantizer.fit(vecs=vecs, iter=iter, seed=seed)
        elif self.codec == "opq":
            self.fine_quantizer = nanopq.OPQ(M=self.M,
                                             Ks=self.Ks,
                                             verbose=self.verbose)
            # rotation_iter is currently fixed to 10
            self.fine_quantizer.fit(vecs=vecs,
                                    pq_iter=iter,
                                    rotation_iter=10,
                                    seed=seed)
        else:
            # Without this branch the quantizer would stay None and the
            # failure would surface below as an opaque AttributeError.
            raise ValueError(
                "Unsupported codec: {!r} (expected 'pq' or 'opq')".format(
                    self.codec))

        # Set trained codewords to cpp impl
        self.impl_cpp.set_codewords(self.fine_quantizer.codewords)

        return self
예제 #4
0
    def test_query_linear(self):
        """Check `impl_cpp.query_linear` with no target, all ids, and a small subset of ids."""
        M, Ks = 4, 20
        N, D = 1000, 40
        X = np.random.random((N, D)).astype(np.float32)
        e = rii.Rii(fine_quantizer=nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(
            vecs=X))
        e.add_configure(vecs=X, nlist=20)

        topk = 10
        # Target-id arrays are always built as int64, matching test_query_ivf,
        # so the test does not depend on the platform's default integer width.
        no_target = np.array([], dtype=np.int64)
        all_ids = np.arange(N, dtype=np.int64)
        S = np.array(
            [2, 24, 43, 55, 102, 139, 221, 542, 667, 873, 874, 899],
            dtype=np.int64)
        allowed_ids = set(S.tolist())

        for n, q in enumerate(X[:10]):
            ids1, dists1 = e.impl_cpp.query_linear(q, topk, no_target)
            self.assertIsInstance(ids1, list)
            self.assertIsInstance(ids1[0], int)
            self.assertIsInstance(dists1, list)
            self.assertIsInstance(dists1[0], float)
            self.assertEqual(len(ids1), topk)
            self.assertEqual(len(ids1), len(dists1))
            self.assertTrue(
                np.all(0 <= np.diff(dists1)))  # Make sure dists1 is sorted
            # The true NN is included in top 10 with high prob
            self.assertTrue(n in ids1)

            # Subset search w/ a full indices should be the same w/o target
            ids2, dists2 = e.impl_cpp.query_linear(q, topk, all_ids)
            self.assertListEqual(ids1, ids2)
            self.assertListEqual(dists1, dists2)

            # Subset search must only return ids from the subset
            ids3, _ = e.impl_cpp.query_linear(q, topk, S)
            self.assertTrue(all(found in allowed_ids for found in ids3))
예제 #5
0
    def test_nanopq_to_faiss(self):
        """Convert a trained nanopq.PQ with `nanopq_to_faiss` and compare codes and search results."""
        D, M, Ks = 32, 4, 256
        Nt, Nb, Nq = 2000, 10000, 100
        # Drawn in the order: training, base, query
        Xt, Xb, Xq = (
            np.random.rand(n, D).astype(np.float32) for n in (Nt, Nb, Nq)
        )
        pq_nanopq = nanopq.PQ(M=M, Ks=Ks)
        pq_nanopq.fit(vecs=Xt)

        # opq is not supported
        with self.assertRaises(AssertionError):
            nanopq.nanopq_to_faiss(nanopq.OPQ(M=M, Ks=Ks))

        pq_faiss = nanopq.nanopq_to_faiss(pq_nanopq)  # IndexPQ

        # Both codecs must produce the same codes for the base vectors
        codes_nanopq = pq_nanopq.encode(vecs=Xb)
        codes_faiss = pq_faiss.pq.compute_codes(x=Xb)  # ProductQuantizer in IndexPQ
        self.assertTrue(np.array_equal(codes_nanopq, codes_faiss))

        # Both codecs must rank the same top-k ids for every query
        topk = 10
        pq_faiss.add(Xb)
        _, ids1 = pq_faiss.search(x=Xq, k=topk)
        ranked = []
        for xq in Xq:
            dists = pq_nanopq.dtable(query=xq).adist(codes=codes_nanopq)
            ranked.append(np.argsort(dists)[:topk])
        ids2 = np.array(ranked)

        self.assertTrue(np.array_equal(ids1, ids2))
예제 #6
0
    def test_faiss_nanopq_compare_accuracy(self):
        """Train faiss and nanopq PQ codecs on the same data and compare their reconstruction errors."""
        D, M, Ks = 32, 4, 256
        Nt, Nb, Nq = 20000, 10000, 100
        nbits = int(np.log2(Ks))
        assert nbits == 8
        # Drawn in the order: training, base, query
        Xt, Xb, Xq = (
            np.random.rand(n, D).astype(np.float32) for n in (Nt, Nb, Nq)
        )

        def relative_error(reconstructed):
            # Squared reconstruction error normalised by the energy of Xb
            return ((Xb - reconstructed) ** 2).sum() / (Xb ** 2).sum()

        pq_faiss = faiss.IndexPQ(D, M, nbits)
        pq_faiss.train(x=Xt)
        recon_faiss = pq_faiss.pq.decode(pq_faiss.pq.compute_codes(Xb))

        pq_nanopq = nanopq.PQ(M=M, Ks=Ks)
        pq_nanopq.fit(vecs=Xt)
        recon_nanopq = pq_nanopq.decode(codes=pq_nanopq.encode(vecs=Xb))

        # Reconstruction error should be almost identical
        avg_relative_error_faiss = relative_error(recon_faiss)
        avg_relative_error_nanopq = relative_error(recon_nanopq)
        signed_gap = (
            avg_relative_error_faiss - avg_relative_error_nanopq
        ) / avg_relative_error_faiss
        diff_rel = np.sqrt(signed_gap ** 2)
        print("avg_rel_error_faiss:", avg_relative_error_faiss)
        print("avg_rel_error_nanopq:", avg_relative_error_nanopq)
        print("diff rel:", diff_rel)

        self.assertLess(diff_rel, 0.01)
예제 #7
0
def pq_search(source_dataset, query_vector):
    """Train a PQ codec (M=8) on `source_dataset`, encode that dataset, and
    print the asymmetric distances from `query_vector` to every code.
    """
    codec = nanopq.PQ(M=8).fit(source_dataset)
    codes = codec.encode(source_dataset)

    distance_table = codec.dtable(query_vector)
    print(distance_table.adist(codes))
예제 #8
0
 def test_construct(self):
     """Build a Rii engine from a trained PQ codec and check its exposed attributes."""
     M, Ks = 4, 20
     N, D = 1000, 40
     X = np.random.random((N, D)).astype(np.float32)
     e = rii.Rii(fine_quantizer=nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(
         vecs=X))
     # Shape entries are integers: use floor division rather than the float
     # that `D / M` yields on Python 3.
     self.assertEqual(e.fine_quantizer.codewords.shape, (M, Ks, D // M))
     self.assertEqual((e.M, e.Ks), (M, Ks))
     # `verbose` is readable and writable on the engine
     self.assertEqual(e.verbose, True)
     e.verbose = False
     self.assertEqual(e.verbose, False)
예제 #9
0
 def test_add_configure(self):
     """`add_configure` must match `add(update_posting_lists=False)` followed by `reconfigure`."""
     M, Ks = 4, 20
     N, D = 1000, 40
     X = np.random.random((N, D)).astype(np.float32)

     def new_engine():
         # Each engine gets its own freshly trained codec
         codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X)
         return rii.Rii(fine_quantizer=codec)

     one_step = new_engine()
     one_step.add_configure(vecs=X, nlist=20)

     two_steps = new_engine()
     two_steps.add(vecs=X, update_posting_lists=False)
     two_steps.reconfigure(nlist=20)

     # One-step and two-step construction should give the same index
     self.assertTrue(np.allclose(one_step.codes, two_steps.codes))
     self.assertListEqual(one_step.posting_lists, two_steps.posting_lists)

     # add_configure can also be chained directly onto the constructor
     chained = new_engine().add_configure(vecs=X, nlist=20)
     self.assertTrue(np.allclose(one_step.codes, chained.codes))
     self.assertListEqual(one_step.posting_lists, chained.posting_lists)
예제 #10
0
 def test_pickle(self):
     """A trained PQ codec must survive a pickle round trip unchanged."""
     import pickle
     N, D, M, Ks = 100, 12, 4, 10
     X = np.random.random((N, D)).astype(np.float32)
     pq = nanopq.PQ(M=M, Ks=Ks)
     pq.fit(X)

     pq2 = pickle.loads(pickle.dumps(pq))

     compared_attrs = ("M", "Ks", "verbose", "code_dtype", "Ds")
     before = tuple(getattr(pq, name) for name in compared_attrs)
     after = tuple(getattr(pq2, name) for name in compared_attrs)
     self.assertEqual(before, after)
     self.assertTrue(np.allclose(pq.codewords, pq2.codewords))
     self.assertTrue(pq == pq2)
예제 #11
0
    def test_eq(self):
        """Check `==` / `!=` between PQ codecs, both before and after training."""
        import copy
        N, D, M, Ks = 100, 12, 4, 10
        X = np.random.random((N, D)).astype(np.float32)
        pq1 = nanopq.PQ(M=M, Ks=Ks)
        pq2 = nanopq.PQ(M=M, Ks=Ks)
        pq3 = copy.deepcopy(pq1)
        pq4 = nanopq.PQ(M=M, Ks=2 * Ks)  # differs from pq1 in Ks

        # Untrained codecs
        for same in (pq1, pq2, pq3):
            self.assertTrue(pq1 == same)
        self.assertTrue(pq1 != pq4)

        # Trained codecs; pq3 is re-copied from the trained pq1
        for codec in (pq1, pq2):
            codec.fit(X)
        pq3 = copy.deepcopy(pq1)
        pq4.fit(X)
        for same in (pq1, pq2, pq3):
            self.assertTrue(pq1 == same)
        self.assertTrue(pq1 != pq4)
예제 #12
0
    def test_query_ivf(self):
        """Exercise `impl_cpp.query_ivf` and cross-check its results against `query_linear`."""
        M, Ks = 20, 256
        N, D = 1000, 40
        X = np.random.random((N, D)).astype(np.float32)
        codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X)
        e = rii.Rii(fine_quantizer=codec)
        e.add_configure(vecs=X, nlist=20)

        L, topk = 200, 10
        for n, q in enumerate(X[:10]):
            no_target = np.array([], dtype=np.int64)
            every_id = np.arange(N, dtype=np.int64)
            S = np.array(
                [2, 24, 43, 55, 102, 139, 221, 542, 667, 873, 874, 899],
                dtype=np.int64)

            # Unrestricted search: check result types, length and ordering
            ids1, dists1 = e.impl_cpp.query_ivf(q, topk, no_target, L)
            for result in (ids1, dists1):
                self.assertTrue(isinstance(result, list))
            self.assertTrue(isinstance(ids1[0], int))
            self.assertTrue(isinstance(dists1[0], float))
            self.assertEqual(len(ids1), topk)
            self.assertEqual(len(ids1), len(dists1))
            # Distances must come back in non-decreasing order
            self.assertTrue(np.all(0 <= np.diff(dists1)))
            # The true NN is included in top 10 with high prob
            # This might fail if the parameters are severe
            self.assertTrue(n in ids1)

            # Targeting every id must give the same answer as no target
            ids2, dists2 = e.impl_cpp.query_ivf(q, topk, every_id, L)
            self.assertListEqual(ids1, ids2)
            self.assertListEqual(dists1, dists2)

            # Subset search must only return ids taken from S
            ids3, dists3 = e.impl_cpp.query_ivf(q, topk, S, L)
            self.assertTrue(np.all([found in S for found in ids3]))

            # With every id targeted and L=N, ivf must equal a linear PQ scan
            ids4, dists4 = e.impl_cpp.query_ivf(q, topk, every_id, N)
            ids5, dists5 = e.impl_cpp.query_linear(q, topk, no_target)
            self.assertListEqual(ids4, ids5)
            self.assertListEqual(dists4, dists5)

            # With the subset S targeted, ivf and linear must agree as well
            ids6, dists6 = e.impl_cpp.query_ivf(q, topk, S, L)
            ids7, dists7 = e.impl_cpp.query_linear(q, topk, S)
            self.assertListEqual(ids6, ids7)
            self.assertListEqual(dists6, dists7)
예제 #13
0
 def fit(self, x):
     """Train a PQ model on `x` and store it, together with the codes of `x`, in `self.model`.

     Args:
         x: Training vectors; converted to a float32 array before use.

     Raises:
         RuntimeError: If `x` holds fewer than `self.minimum_required` vectors.
     """
     train_vecs = np.float32(np.asarray(x))
     n_vecs = len(train_vecs)
     if n_vecs < self.minimum_required:
         raise RuntimeError(
             f"Too less data to train, need at least {self.minimum_required} vectors"
         )

     # The codebook size is capped by both the data size and `self.ks_max`
     codebook_size = min(n_vecs - 1, self.ks_max)
     pq_model = nanopq.PQ(M=self.m, Ks=codebook_size)
     # A fresh random seed is drawn for every training run
     pq_model.fit(vecs=train_vecs,
                  iter=self.n_iter,
                  seed=random.randint(1, 10000))

     codes = pq_model.encode(vecs=train_vecs)
     self.model = PQModelHolder(pq_model=pq_model,
                                pq_codes=codes,
                                indexed_data=self.indexed_data)
예제 #14
0
파일: test_pq.py 프로젝트: matsui528/nanopq
 def test_encode_decode(self):
     """Encode then decode vectors and check code shape, dtype and reconstruction quality."""
     N, D, M, Ks = 100, 12, 4, 10
     X = np.random.random((N, D)).astype(np.float32)
     pq = nanopq.PQ(M=M, Ks=Ks)
     pq.fit(X)

     codes = pq.encode(X)
     self.assertEqual(codes.shape, (N, M))
     self.assertEqual(codes.dtype, np.uint8)

     reconstructed = pq.decode(codes)
     self.assertEqual(X.shape, reconstructed.shape)
     # The reconstruction should stay close to the original vectors
     relative_error = (np.linalg.norm(X - reconstructed)**2 /
                       np.linalg.norm(X)**2)
     self.assertTrue(relative_error < 0.1)
예제 #15
0
 def test_clear(self):
     """After `clear()`, the engine must report an empty state."""
     M, Ks = 4, 20
     N, D = 1000, 40
     X = np.random.random((N, D)).astype(np.float32)
     codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X)
     e = rii.Rii(fine_quantizer=codec)
     e.add_configure(vecs=X, nlist=20)

     e.clear()

     self.assertIsNone(e.threshold)
     for counter in ("N", "nlist"):
         self.assertEqual(getattr(e, counter), 0)
     for array_attr in ("coarse_centers", "codes"):
         self.assertEqual(getattr(e, array_attr), None)
     self.assertEqual(len(e.posting_lists), 0)
예제 #16
0
파일: test_pq.py 프로젝트: matsui528/nanopq
 def test_search(self):
     """Build a distance table for a query and check the asymmetric distances to all codes."""
     N, D, M, Ks = 100, 12, 4, 10
     X = np.random.random((N, D)).astype(np.float32)
     pq = nanopq.PQ(M=M, Ks=Ks)
     pq.fit(X)
     X_ = pq.encode(X)
     q = X[13]
     dtbl = pq.dtable(q)
     self.assertEqual(dtbl.dtable.shape, (M, Ks))
     dists = dtbl.adist(X_)
     self.assertEqual(len(dists), N)
     # The query is X[13] itself, so its own code should be the closest
     self.assertEqual(np.argmin(dists), 13)
     dists2 = pq.dtable(q).adist(X_)  # can be chained
     # assertAlmostEqual cannot compare lists approximately (it would raise
     # TypeError on any difference), so compare the arrays element-wise.
     self.assertTrue(np.allclose(dists, dists2))
예제 #17
0
 def test_simple_add_configure(self):
     """Add vectors in two batches, then reconfigure with several `nlist` values."""
     M, Ks = 4, 20
     N1, N2, D = 300, 700, 40
     X1 = np.random.random((N1, D)).astype(np.float32)
     X2 = np.random.random((N2, D)).astype(np.float32)
     codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X1)
     e = rii.Rii(fine_quantizer=codec)

     # The item count grows with every added batch
     total = 0
     for batch in (X1, X2):
         e.add(vecs=batch)
         total += len(batch)
         self.assertEqual(e.N, total)

     for nlist in (5, 100):
         e.reconfigure(nlist=nlist)
         self.assertEqual(e.nlist, nlist)
         self.assertEqual(e.coarse_centers.shape, (nlist, M))
         self.assertEqual(len(e.posting_lists), nlist)
         # Every added vector is counted across the posting lists
         n_posted = sum(len(plist) for plist in e.posting_lists)
         self.assertEqual(n_posted, N1 + N2)
예제 #18
0
    def test_pickle(self):
        """A configured Rii engine must survive a pickle round trip unchanged."""
        M, Ks = 10, 256
        N, D = 1000, 40
        X = np.random.random((N, D)).astype(np.float32)
        e1 = rii.Rii(fine_quantizer=nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(
            vecs=X))
        e1.add_configure(vecs=X, nlist=20)

        import pickle
        dumped = pickle.dumps(e1)
        e2 = pickle.loads(dumped)
        self.assertEqual((e1.M, e1.Ks, e1.threshold),
                         (e2.M, e2.Ks, e2.threshold))

        self.assertTrue(np.allclose(e1.coarse_centers, e2.coarse_centers))
        self.assertTrue(np.allclose(e1.codes, e2.codes))
        # zip() stops at the shorter input, so check the lengths first;
        # otherwise posting lists lost in the round trip would go unnoticed.
        self.assertEqual(len(e1.posting_lists), len(e2.posting_lists))
        for pl1, pl2 in zip(e1.posting_lists, e2.posting_lists):
            self.assertListEqual(pl1, pl2)
예제 #19
0
    def test_merge(self):
        """Merge two engines sharing one codec, covering each empty/non-empty combination."""
        from itertools import chain
        M, Ks, N1, N2, D = 4, 20, 1000, 500, 40
        X1 = np.random.random((N1, D)).astype(np.float32)
        X2 = np.random.random((N2, D)).astype(np.float32)
        codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X1)
        e1, e2 = (rii.Rii(fine_quantizer=codec) for _ in range(2))
        # nlist that the assertions below expect after add_configure(vecs=X1)
        nlist_for_x1 = int(np.sqrt(N1))

        # Case 1 -- both engines are empty
        e1.merge(e2)
        self.assertEqual((e1.N, e2.N), (0, 0))

        # Case 2 -- e1 holds vectors, e2 is empty
        e1.add_configure(vecs=X1)  # this creates e1's posting lists
        e1.merge(e2)
        self.assertEqual(e1.N, N1)
        self.assertEqual(e1.nlist, nlist_for_x1)  # Have posting lists
        e1.clear()

        # Case 3 -- e1 is empty, e2 holds vectors
        e2.add_configure(vecs=X2)
        e1.merge(e2)  # e1 had no posting lists before the merge
        self.assertEqual(e1.N, N2)
        self.assertEqual(e1.nlist, 0)  # No posting lists
        e1.clear()
        e2.clear()

        # Case 4 -- both engines hold vectors
        e1.add_configure(vecs=X1)
        e2.add_configure(vecs=X2)
        e1.merge(e2)
        self.assertEqual(e1.N, N1 + N2)
        # e1 keeps the number of posting lists it had before the merge
        self.assertEqual(e1.nlist, nlist_for_x1)

        # The merged codes equal the codes of X1 stacked on X2, and the
        # posting lists together contain every id exactly once.
        stacked = np.vstack((X1, X2))
        self.assertTrue(np.array_equal(e1.codes, codec.encode(stacked)))
        posted_ids = sorted(chain.from_iterable(e1.posting_lists))
        self.assertEqual(posted_ids, list(range(N1 + N2)))
예제 #20
0
def pq_dis():
    """Demo: train a PQ codec on random data and print the 5 nearest codes to a random query.

    Prints, in order: the 5 smallest asymmetric distances, the indices of
    those vectors, the distances looked up by index, and the elapsed search
    time in microseconds.
    """
    N, D = 10000, 128
    X = np.random.random((N, D)).astype(np.float32)  # 10,000 128-dim vectors
    query = np.random.random((D, )).astype(np.float32)  # a 128-dim vector

    # Instantiate with M=8 sub-spaces
    pq = nanopq.PQ(M=8, Ks=256)

    # Train with the top 1000 vectors
    pq.fit(X[:1000])

    # Encode to PQ-codes
    X_code = pq.encode(X)  # (10000, 8) with dtype=np.uint8
    time1 = datetime.datetime.now()
    # Results: create a distance table online, and compute Asymmetric Distance to each PQ-code
    dists = pq.dtable(query).adist(X_code)

    nsmallestList = heapq.nsmallest(5, dists)
    print(nsmallestList)
    # A stable argsort yields the same order as heapq.nsmallest, but returns
    # distinct indices even when several vectors share the same distance
    # (list.index would report the first match repeatedly).
    indexs = np.argsort(dists, kind="stable")[:5].tolist()
    print(indexs)
    print(dists[indexs])
    # `.microseconds` is only the sub-second field of a timedelta; floor
    # division by one microsecond gives the total elapsed microseconds.
    elapsed = datetime.datetime.now() - time1
    print("time", elapsed // datetime.timedelta(microseconds=1))
예제 #21
0
    def test_add_configure_small_number_of_vectors(self):
        """Adding ten vectors one by one must match adding them all at once."""
        import copy
        M, Ks = 4, 20
        N, D = 1000, 40
        X = np.random.random((N, D)).astype(np.float32)
        codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X)
        e1 = rii.Rii(fine_quantizer=codec)
        e2 = copy.deepcopy(e1)
        e3 = copy.deepcopy(e1)
        few = X[:10]

        # e1: add_configure with a single vector at a time
        for row in few:
            e1.add_configure(vecs=row.reshape(1, -1))
        self.assertEqual(e1.N, 10)

        # e2: add_configure with all ten vectors in one call
        e2.add_configure(vecs=few)
        self.assertTrue(np.allclose(e1.codes, e2.codes))
        self.assertEqual(e1.posting_lists, e2.posting_lists)

        # e3: add one vector at a time, then reconfigure once at the end
        for row in few:
            e3.add(row.reshape(1, -1))
        e3.reconfigure()
        self.assertTrue(np.allclose(e1.codes, e3.codes))
        self.assertEqual(e1.posting_lists, e3.posting_lists)
예제 #22
0
파일: run_sift1b.py 프로젝트: us03385/rii
Nt = 10000000  # Use top 10M vectors for training
# Append vectors read from the training file to Xt, stopping as soon as Xt
# holds Nt entries.
# NOTE(review): Xt and path_train are defined above this excerpt; Xt is
# presumably an empty list at this point -- confirm.
with path_train.open("rb") as f:
    for vec in texmex_python.reader.read_bvec_iter(f):
        Xt.append(vec)
        if len(Xt) == Nt:
            break
# Convert the collected vectors into a single float32 array
Xt = np.array(Xt, dtype=np.float32)
print("Xt.shape: {}, Xt.dtype: {}".format(Xt.shape, Xt.dtype))


# Train a PQ codec and cache it on disk, or reuse a previously cached codec
M = 8  # The number of subspaces
path_codec = p / 'cache/codec_m{}.pkl'.format(M)
if not path_codec.exists():
    print("Start to train a codec")
    codec = nanopq.PQ(M=M, verbose=True).fit(vecs=Xt)
    # Use a context manager so the file is flushed and closed right away
    # instead of leaking the handle until garbage collection.
    with path_codec.open("wb") as f:
        pickle.dump(codec, f)
    print("Dump the codec in {}".format(path_codec))
else:
    print("Read a codec from cache: {}".format(path_codec))
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that were written by this script.
    with path_codec.open("rb") as f:
        codec = pickle.load(f)


# Construct a search engine
path_engine = p / 'cache/engine_m{}.pkl'.format(M)
if not path_engine.exists():
    print("Start to construct a Rii engine")
    e = rii.Rii(fine_quantizer=codec)
    batch_size = 10000000
    with path_base.open("rb") as f:
        for n, batch in enumerate(more_itertools.chunked(texmex_python.reader.read_bvec_iter(f), batch_size)):
예제 #23
0
    datas = []
    for file in files:
        img_1 = cv2.imread(path + "/" + file, 0)
        img1 = cv2.resize(img_1, (65, 64), interpolation=cv2.INTER_LINEAR)
        dhash = dHash(img1, 64)
        data = list(map(int, dhash))
        datas.append(data)
    datas = np.asarray(datas, dtype=np.float32)
    N = len(datas)
    D = 64 * 64

    query = datas[0]
    # np.random.random((D,)).astype(np.float32)  # a 128-dim vector

    # Instantiate with M=8 sub-spaces
    pq = nanopq.PQ(M=8, Ks=48)

    # Train with the top 1000 vectors
    pq.fit(datas)

    # Encode to PQ-codes
    X_code = pq.encode(datas)  # (10000, 8) with dtype=np.uint8

    time1 = datetime.datetime.now()
    # Results: create a distance table online, and compute Asymmetric Distance to each PQ-code
    dists = pq.dtable(query).adist(X_code)

    nsmallestList = heapq.nsmallest(54, dists)
    print(nsmallestList)
    indexs = [dists.tolist().index(i) for i in nsmallestList]
    print(indexs)
예제 #24
0
 def train(self, vecs):
     """Fit a PQ codec with `self.M` sub-spaces on `vecs` and store a Rii index built on it in `self.index`."""
     quantizer = nanopq.PQ(M=self.M, verbose=False)
     trained_codec = quantizer.fit(vecs=vecs)
     self.index = rii.Rii(fine_quantizer=trained_codec)