def eval_quant_loss(self, by_residual, metric=faiss.METRIC_L2):
    """Check the accuracy lost when an IVFPQ index is run through fast-scan.

    An "IVF32,PQ16x4np" index is searched directly, then through an
    IndexIVFPQFastScan built from it (implem 2). The first three values
    returned by three_metrics for the two result sets must each reach at
    least 99.5% of the recorded reference for this (by_residual, metric)
    combination.
    """
    dataset = datasets.SyntheticDataset(32, 2000, 5000, 1000)
    queries = dataset.get_queries()

    baseline = faiss.index_factory(32, "IVF32,PQ16x4np", metric)
    baseline.train(dataset.get_train())
    baseline.add(dataset.get_database())
    baseline.nprobe = 4
    # NOTE: by_residual is set after train/add, exactly as in the
    # original test
    baseline.by_residual = by_residual
    Da, Ia = baseline.search(queries, 10)

    # loss due to int8 quantization of LUTs
    fastscan = faiss.IndexIVFPQFastScan(baseline)
    fastscan.implem = 2
    Db, Ib = fastscan.search(queries, 10)

    scores = three_metrics(Da, Ia, Db, Ib)

    # reference values, keyed by (by_residual, metric); the columns are
    # recall_at_1, recall_at_10, intersection_at_10
    expected_by_config = {
        (True, 1): [0.985, 1.0, 9.872],
        (True, 0): [0.987, 1.0, 9.914],
        (False, 1): [0.991, 1.0, 9.907],
        (False, 0): [0.986, 1.0, 9.917],
    }
    expected = expected_by_config[(by_residual, metric)]

    for pos in range(3):
        self.assertGreaterEqual(scores[pos], expected[pos] * 0.995)
def test_equiv_pq(self):
    """Compare a single-list IVFPQ index with a flat PQ index sharing its codes.

    The flat PQ index is assembled by hand from the IVF index's product
    quantizer and the codes of inverted list 0. Both must return identical
    results, and so must their fast-scan versions (implem 12).
    """
    dataset = datasets.SyntheticDataset(32, 2000, 200, 4)
    queries = dataset.get_queries()

    ivf_index = faiss.index_factory(32, "IVF1,PQ16x4np")
    ivf_index.by_residual = False
    # force coarse quantizer
    ivf_index.quantizer.add(np.zeros((1, 32), dtype='float32'))
    ivf_index.train(dataset.get_train())
    ivf_index.add(dataset.get_database())
    Dref, Iref = ivf_index.search(queries, 4)

    # flat PQ index that borrows the quantizer and codes of the IVF index
    flat_pq = faiss.index_factory(32, "PQ16x4np")
    flat_pq.pq = ivf_index.pq
    flat_pq.is_trained = True
    inverted_lists = faiss.downcast_InvertedLists(ivf_index.invlists)
    flat_pq.codes = inverted_lists.codes.at(0)
    flat_pq.ntotal = ivf_index.ntotal

    Dnew, Inew = flat_pq.search(queries, 4)
    np.testing.assert_array_equal(Iref, Inew)
    np.testing.assert_array_equal(Dref, Dnew)

    # same comparison between the two fast-scan variants
    flat_fastscan = faiss.IndexPQFastScan(flat_pq)
    flat_fastscan.implem = 12
    Dref, Iref = flat_fastscan.search(queries, 4)

    ivf_fastscan = faiss.IndexIVFPQFastScan(ivf_index)
    ivf_fastscan.implem = 12
    Dnew, Inew = ivf_fastscan.search(queries, 4)

    np.testing.assert_array_equal(Iref, Inew)
    np.testing.assert_array_equal(Dref, Dnew)
def test_clipping(self):
    """ verify that a clipped residual quantizer gives the same
    code prefix + suffix as the full RQ """
    ds = datasets.SyntheticDataset(32, 1000, 100, 0)

    rq = faiss.ResidualQuantizer(ds.d, 5, 4)
    rq.train_type = faiss.ResidualQuantizer.Train_default
    rq.max_beam_size = 5
    rq.train(ds.get_train())

    rq.max_beam_size = 1    # is not the same for a large beam size
    codes = rq.compute_codes(ds.get_database())

    # quantizer initialized from rq with 2 steps
    rq2 = faiss.ResidualQuantizer(ds.d, 2, 4)
    rq2.initialize_from(rq)
    self.assertEqual(rq2.M, 2)
    # verify that the beginning of the codes are the same
    codes2 = rq2.compute_codes(ds.get_database())

    # quantizer with 3 steps, initialized from rq with offset argument 2;
    # it encodes what is left after decoding the rq2 codes
    rq3 = faiss.ResidualQuantizer(ds.d, 3, 4)
    rq3.initialize_from(rq, 2)
    self.assertEqual(rq3.M, 3)
    codes3 = rq3.compute_codes(ds.get_database() - rq2.decode(codes2))

    # verify that prefixes are the same
    for i in range(ds.nb):
        br = faiss.BitstringReader(faiss.swig_ptr(codes[i]), rq.code_size)
        br2 = faiss.BitstringReader(
            faiss.swig_ptr(codes2[i]), rq2.code_size)
        self.assertEqual(br.read(rq2.tot_bits), br2.read(rq2.tot_bits))
        # second read on br: presumably continues right after the prefix
        # (assumes BitstringReader.read advances its position)
        br3 = faiss.BitstringReader(
            faiss.swig_ptr(codes3[i]), rq3.code_size)
        self.assertEqual(br.read(rq3.tot_bits), br3.read(rq3.tot_bits))
def test_sparse_routines(self):
    """Check the sparse-to-dense assignment routines against faiss.knn.

    Both clustering.sparse_assign_to_dense and its blocked variant must
    return the same nearest centroid as a dense 1-NN search, with distances
    equal to 4 decimals.
    """
    dataset = datasets.SyntheticDataset(1000, 2000, 0, 200)
    train = dataset.get_train().copy()
    faiss.normalize_L2(train)

    # zero out the small entries; around 10% non-zeros remain
    keep = np.abs(train) > 0.045
    train[~keep] = 0

    centroids = dataset.get_queries()
    assert len(centroids) == 200

    xsparse = scipy.sparse.csr_matrix(train)

    # dense reference assignment
    Dref, Iref = faiss.knn(xsparse.todense(), centroids, 1)
    expected_ids = Iref.ravel()
    expected_dists = Dref.ravel()

    D, I = clustering.sparse_assign_to_dense(xsparse, centroids)
    np.testing.assert_array_equal(expected_ids, I)
    np.testing.assert_array_almost_equal(expected_dists, D, decimal=4)

    D, I = clustering.sparse_assign_to_dense_blocks(
        xsparse, centroids, qbs=123, bbs=33, nt=4)
    np.testing.assert_array_equal(expected_ids, I)
    np.testing.assert_array_almost_equal(expected_dists, D, decimal=4)
def subtest_from_idxaq(self, implem, metric):
    """Compare an RQ index with the fast-scan index constructed from it.

    implem is assigned to the fast-scan index's `implem` field; metric is
    'L2' or anything else (treated as inner product). The recall@1 of the
    two indexes w.r.t. the ground truth must differ by less than 0.05.
    """
    use_l2 = metric == 'L2'
    metric_type = (
        faiss.METRIC_L2 if use_l2 else faiss.METRIC_INNER_PRODUCT)
    norm_suffix = '_Nrq2x4' if use_l2 else ''

    d = 16
    dataset = datasets.SyntheticDataset(d, 1000, 2000, 1000, metric=metric)
    gt = dataset.get_groundtruth(k=1)
    queries = dataset.get_queries()

    index = faiss.index_factory(d, 'RQ8x4' + norm_suffix, metric_type)
    index.train(dataset.get_train())
    index.add(dataset.get_database())
    index.nprobe = 16
    _, Iref = index.search(queries, 1)

    indexfs = faiss.IndexAdditiveQuantizerFastScan(index)
    indexfs.implem = implem
    _, I1 = indexfs.search(queries, 1)

    nq = Iref.shape[0]
    recall_ref = (Iref == gt).sum() / nq
    recall1 = (I1 == gt).sum() / nq

    print(recall_ref, recall1)
    assert abs(recall_ref - recall1) < 0.05
def do_test(self, by_residual, metric=faiss.METRIC_L2, d=32):
    """Compare fast-scan implem self.IMPLEM against implem 2 on an IVFPQ index.

    The 4-NN results are compared with verify_with_draws; the 1-NN results
    are then compared query by query, skipping queries whose two best
    reference distances are tied. Finally the IVF statistics must show
    that distances were computed.
    """
    dataset = datasets.SyntheticDataset(d, 2000, 5000, 200)
    queries = dataset.get_queries()

    factory_string = f"IVF32,PQ{d//2}x4np"
    index = faiss.index_factory(d, factory_string, metric)
    index.by_residual = by_residual
    index.train(dataset.get_train())
    index.add(dataset.get_database())
    index.nprobe = 4

    # reference results come from implem 2
    reference = faiss.IndexIVFPQFastScan(index)
    reference.implem = 2
    Dref, Iref = reference.search(queries, 4)

    candidate = faiss.IndexIVFPQFastScan(index)
    candidate.implem = self.IMPLEM
    Dnew, Inew = candidate.search(queries, 4)

    verify_with_draws(self, Dref, Iref, Dnew, Inew)

    stats = faiss.cvar.indexIVF_stats
    stats.reset()

    # also verify with single result
    Dnew, Inew = candidate.search(queries, 1)
    for q, ref_dists in enumerate(Dref):
        if ref_dists[1] == ref_dists[0]:
            # then we cannot conclude
            continue
        self.assertEqual(Iref[q, 0], Inew[q, 0])
        np.testing.assert_almost_equal(ref_dists[0], Dnew[q, 0], decimal=5)

    self.assertGreater(stats.ndis, 0)
def test_synthetic(self):
    """Check the shapes of the query and database matrices of a synthetic dataset."""
    dataset = datasets.SyntheticDataset(32, 1000, 2000, 10)

    expected_shapes = (
        (dataset.get_queries, (10, 32)),
        (dataset.get_database, (2000, 32)),
    )
    for getter, shape in expected_shapes:
        self.assertEqual(getter().shape, shape)

    dataset.check_sizes()
def test_lut(self):
    """test compute_LUT function

    The look-up table computed by ProductLocalSearchQuantizer.compute_LUT
    is compared with one built in numpy: for each split, the product of
    the query sub-vectors with the transposed sub-codebook.
    """
    dataset = datasets.SyntheticDataset(16, 1000, 0, 100)
    xq = dataset.get_queries()
    nq, d = xq.shape

    nsplits, Msub, nbits = 2, 2, 4
    dsub = d // nsplits
    subcodebook_size = Msub * (1 << nbits)
    codebook_size = nsplits * subcodebook_size

    plsq = faiss.ProductLocalSearchQuantizer(dataset.d, nsplits, Msub, nbits)
    plsq.train(dataset.get_train())

    # table computed by the library
    lut = np.zeros((nq, codebook_size), dtype=np.float32)
    plsq.compute_LUT(nq, sp(xq), sp(lut))

    # reference table
    codebooks = faiss.vector_to_array(plsq.codebooks).reshape(
        nsplits, subcodebook_size, dsub)
    xq_split = xq.reshape(nq, nsplits, dsub)
    lut_ref = np.zeros((nq, nsplits, subcodebook_size), dtype=np.float32)
    for split, sub_codebook in enumerate(codebooks):
        lut_ref[:, split] = xq_split[:, split] @ sub_codebook.T
    lut_ref = lut_ref.reshape(nq, codebook_size)

    # max rtol in OSX: 2.87e-6
    np.testing.assert_allclose(lut, lut_ref, rtol=5e-06)
def test_IndexLocalSearchQuantizer(self):
    """Check IndexLocalSearchQuantizer accuracy, search types and serialization.

    Verifies (1) the 10-NN intersection with the ground truth, (2) that an
    index using ST_norm_float with the same codebooks gives near-identical
    results, and (3) that a serialize/deserialize round trip preserves the
    search results exactly.
    """
    dataset = datasets.SyntheticDataset(32, 1000, 200, 100)
    gt = dataset.get_groundtruth(10)
    xt = dataset.get_train()
    xb = dataset.get_database()
    xq = dataset.get_queries()

    ir = faiss.IndexLocalSearchQuantizer(dataset.d, 4, 5)
    ir.train(xt)
    ir.add(xb)
    Dref, Iref = ir.search(xq, 10)
    inter_ref = faiss.eval_intersection(Iref, gt)

    # 467
    self.assertGreater(inter_ref, 460)

    norm_float = faiss.AdditiveQuantizer.ST_norm_float
    ir2 = faiss.IndexLocalSearchQuantizer(
        dataset.d, 4, 5, faiss.METRIC_L2, norm_float)
    ir2.train(xt)  # just to set flags properly
    ir2.lsq.codebooks = ir.lsq.codebooks
    ir2.add(xb)
    D2, I2 = ir2.search(xq, 10)

    np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
    mismatches = (Iref != I2).sum()
    self.assertLess(mismatches, Iref.size * 0.01)

    # test I/O
    ir3 = faiss.deserialize_index(faiss.serialize_index(ir))
    D3, I3 = ir3.search(xq, 10)
    np.testing.assert_array_equal(Iref, I3)
    np.testing.assert_array_equal(Dref, D3)
def test_constructor(self):
    """Check an IndexPQFastScan built with its constructor and its serialization.

    Its top-1 result must agree with a factory-built PQ index for more
    than 99% of the queries, and a serialize/deserialize round trip must
    keep `implem` and the search results unchanged.
    """
    d = 32
    dataset = datasets.SyntheticDataset(d, 2000, 5000, 200)
    xt = dataset.get_train()
    xb = dataset.get_database()
    xq = dataset.get_queries()

    reference = faiss.index_factory(d, f'PQ{d//2}x4np')
    reference.train(xt)
    reference.add(xb)
    _, Iref = reference.search(xq, 10)
    nq = Iref.shape[0]

    fastscan = faiss.IndexPQFastScan(d, d // 2, 4)
    fastscan.train(xt)
    fastscan.add(xb)
    Dnew, Inew = fastscan.search(xq, 10)

    top1_matches = (Iref[:, 0] == Inew[:, 0]).sum()
    recall_at_1 = top1_matches / nq
    self.assertGreater(recall_at_1, 0.99)

    # serialization round trip
    restored = faiss.deserialize_index(faiss.serialize_index(fastscan))
    self.assertEqual(fastscan.implem, restored.implem)

    D3, I3 = restored.search(xq, 10)
    np.testing.assert_array_equal(D3, Dnew)
    np.testing.assert_array_equal(I3, Inew)
def do_test(self, metric_type):
    """Check that range_search_max_results can recover a plain range search.

    A reference range search is run with a radius taken from the k-NN
    distances. range_search_max_results is then started from a very
    permissive radius and asked for at least as many results as the
    reference; both result tables must match.
    """
    dataset = datasets.SyntheticDataset(32, 0, 1000, 200)
    queries = dataset.get_queries()

    index = faiss.IndexFlat(dataset.d, metric_type)
    index.add(dataset.get_database())

    # find a reasonable radius
    knn_distances, _ = index.search(queries, 10)
    radius0 = float(np.median(knn_distances[:, -1]))

    # baseline = search with that radius
    lims_ref, Dref, Iref = index.range_search(queries, radius0)

    # now see if using just the total number of results, we can get back
    # the same result table
    query_iterator = exponential_query_iterator(queries)
    if metric_type == faiss.METRIC_L2:
        init_radius = 1e10
    else:
        init_radius = -1e10

    _, lims_new, Dnew, Inew = range_search_max_results(
        index, query_iterator, init_radius,
        min_results=Dref.size, clip_to_min=True
    )

    evaluation.test_ref_range_results(
        lims_ref, Dref, Iref,
        lims_new, Dnew, Inew
    )
def test_search_IP(self):
    """Compare inner-product search of two IndexResidualQuantizer variants.

    The second index uses the ST_LUT_nonorm search type and reuses the
    codebooks of the first one instead of being trained. Labels must be
    identical and distances equal to 5 decimals.
    """
    dataset = datasets.SyntheticDataset(32, 1000, 200, 100)
    xt = dataset.get_train()
    xb = dataset.get_database()
    xq = dataset.get_queries()
    ip = faiss.METRIC_INNER_PRODUCT

    ir = faiss.IndexResidualQuantizer(dataset.d, 3, 4, ip)
    ir.rq.train_type = faiss.ResidualQuantizer.Train_default
    ir.train(xt)
    ir.add(xb)
    Dref, Iref = ir.search(xq, 4)

    lut_nonorm = faiss.AdditiveQuantizer.ST_LUT_nonorm
    ir2 = faiss.IndexResidualQuantizer(dataset.d, 3, 4, ip, lut_nonorm)

    # fake training
    ir2.rq.codebooks = ir.rq.codebooks
    ir2.rq.is_trained = True
    ir2.is_trained = True
    ir2.add(xb)
    D2, I2 = ir2.search(xq, 4)

    np.testing.assert_array_equal(Iref, I2)
    np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
def subtest_accuracy(self, aq, st, implem, metric_type='L2'):
    """
    Compare IndexAdditiveQuantizerFastScan with IndexAQ (qint8)

    aq and st are spliced into the index_factory strings; implem is
    assigned to the fast-scan index. The recall@1 of the two indexes
    w.r.t. the ground truth must differ by less than 0.05.
    """
    d = 16
    dataset = datasets.SyntheticDataset(d, 1000, 1000, 500, metric_type)
    gt = dataset.get_groundtruth(k=1)
    xt = dataset.get_train()
    xb = dataset.get_database()
    xq = dataset.get_queries()

    # norm-encoding suffixes are only added for L2
    use_l2 = metric_type == 'L2'
    metric = faiss.METRIC_L2 if use_l2 else faiss.METRIC_INNER_PRODUCT
    postfix1 = '_Nqint8' if use_l2 else ''
    postfix2 = f'_N{st}2x4' if use_l2 else ''

    index = faiss.index_factory(d, f'{aq}3x4{postfix1}', metric)
    index.train(xt)
    index.add(xb)
    _, Iref = index.search(xq, 1)

    indexfs = faiss.index_factory(d, f'{aq}3x4fs_32{postfix2}', metric)
    indexfs.train(xt)
    indexfs.add(xb)
    indexfs.implem = implem
    _, Ia = indexfs.search(xq, 1)

    nq = Iref.shape[0]
    recall_ref = (Iref == gt).sum() / nq
    recall = (Ia == gt).sum() / nq

    print(aq, st, implem, metric_type, recall_ref, recall)
    assert abs(recall_ref - recall) < 0.05
def test_ivfsq(self):
    """Check that recall of an IVF index with an RCQ coarse quantizer grows with nprobe."""
    dataset = datasets.SyntheticDataset(32, 3000, 1000, 100)
    xt = dataset.get_train()
    xb = dataset.get_database()
    xq = dataset.get_queries()
    gt = dataset.get_groundtruth(1)

    # RQ 2x5 = 10 bits = 1024 centroids
    index = faiss.index_factory(dataset.d, "IVF1024(RCQ2x5),SQ8")
    coarse = faiss.downcast_index(index.quantizer)
    coarse.rq.train_type = faiss.ResidualQuantizer.Train_default
    index.train(xt)
    index.add(xb)

    def recall_with(nprobe):
        # number of 10-NN results equal to the top-1 ground truth, per query
        index.nprobe = nprobe
        _, ids = index.search(xq, 10)
        return (ids == gt[None, :]).sum() / dataset.nq

    # make sure that increasing the nprobe increases accuracy
    r10 = recall_with(10)
    r40 = recall_with(40)
    self.assertGreater(r40, r10)
def test_PQ4_speed(self):
    """Check that the fast-scan PQ search is more than 5x faster than plain PQ.

    Both 'PQ16x4' and 'PQ16x4fs' indexes are trained and filled with the
    same data, and a single 10-NN search over all queries is timed for
    each.

    Timing uses time.perf_counter(): unlike time.time(), it is monotonic
    and uses the highest-resolution clock available, so the measurement
    cannot be skewed by wall-clock adjustments.
    """
    ds = datasets.SyntheticDataset(32, 2000, 5000, 1000)
    xt = ds.get_train()
    xb = ds.get_database()
    xq = ds.get_queries()

    def timed_search(factory_key):
        # build the index outside the timed region, then time one search
        index = faiss.index_factory(32, factory_key)
        index.train(xt)
        index.add(xb)
        t0 = time.perf_counter()
        index.search(xq, 10)
        return time.perf_counter() - t0

    pq_t = timed_search('PQ16x4')
    print('PQ16x4 search time:', pq_t)

    pqfs_t = timed_search('PQ16x4fs')
    print('PQ16x4fs search time:', pqfs_t)

    self.assertLess(pqfs_t * 5, pq_t)
def test_training(self):
    """check that the error is in the same ballpark as PQ

    A ResidualQuantizer and a ProductQuantizer, both built with arguments
    (ds.d, 4, 6), are trained on the same data and scored on the database
    vectors with eval_codec. The RQ error must be below 1.2x the PQ error.
    """
    ds = datasets.SyntheticDataset(32, 3000, 1000, 0)

    xt = ds.get_train()
    xb = ds.get_database()

    rq = faiss.ResidualQuantizer(ds.d, 4, 6)
    rq.verbose = True
    rq.train_type = faiss.ResidualQuantizer.Train_default
    # rq.cp.verbose = True
    rq.train(xt)
    err_rq = eval_codec(rq, xb)

    pq = faiss.ProductQuantizer(ds.d, 4, 6)
    pq.train(xt)
    err_pq = eval_codec(pq, xb)

    # in practice RQ is often better than PQ but this is not the case
    # here, so just check that we are within some factor.
    print(err_pq, err_rq)

    self.assertLess(err_rq, err_pq * 1.2)
def test_query_iterator(self, metric=faiss.METRIC_L2):
    """Check range_search_max_results when queries are fed in batches.

    First, without a result cap, the batched search must reproduce a plain
    range search. Then, with max_results set to half the reference count,
    the number of results must respect the cap and match a plain range
    search run at the returned threshold.
    """
    dataset = datasets.SyntheticDataset(32, 0, 1000, 1000)
    xq = dataset.get_queries()
    xb = dataset.get_database()

    # threshold = mean distance to the 10th neighbor
    knn_dists, _ = faiss.knn(xq, xb, 10, metric=metric)
    threshold = float(knn_dists[:, -1].mean())
    print(threshold)

    index = faiss.IndexFlat(32, metric)
    index.add(xb)

    ref_lims, ref_D, ref_I = index.range_search(xq, threshold)

    def matrix_iterator(matrix, batch_size):
        # successive row slices of at most batch_size rows
        starts = range(0, matrix.shape[0], batch_size)
        return (matrix[begin:begin + batch_size] for begin in starts)

    # check repro OK
    _, new_lims, new_D, new_I = range_search_max_results(
        index, matrix_iterator(xq, 100), threshold)

    evaluation.test_ref_range_results(
        ref_lims, ref_D, ref_I,
        new_lims, new_D, new_I
    )

    # now cap the number of results
    max_res = ref_lims[-1] // 2

    new_threshold, new_lims, new_D, new_I = range_search_max_results(
        index, matrix_iterator(xq, 100), threshold, max_results=max_res)

    self.assertLessEqual(new_lims[-1], max_res)

    ref_lims, ref_D, ref_I = index.range_search(xq, new_threshold)

    evaluation.test_ref_range_results(
        ref_lims, ref_D, ref_I,
        new_lims, new_D, new_I
    )
def do_test_rounding(self, implem=4, metric=faiss.METRIC_L2):
    """Check IndexPQFastScan results against the PQ index it is built from.

    With implem 2 the fast-scan index must reproduce the PQ results
    exactly. With the given implem, the reference top-1 must be found at
    ranks 1 and 10 often enough, and the squared distance error must stay
    below 1e-4 of the squared norm of the implem-2 distances.
    """
    dataset = datasets.SyntheticDataset(32, 2000, 5000, 200)
    queries = dataset.get_queries()

    index = faiss.index_factory(32, 'PQ16x4', metric)
    index.train(dataset.get_train())
    index.add(dataset.get_database())
    Dref, Iref = index.search(queries, 10)
    nq = Iref.shape[0]

    fastscan = faiss.IndexPQFastScan(index)

    # simply repro normal search
    fastscan.implem = 2
    D2, I2 = fastscan.search(queries, 10)
    np.testing.assert_array_equal(I2, Iref)
    np.testing.assert_array_equal(D2, Dref)

    # rounded LUT with correction
    fastscan.implem = implem
    D4, I4 = fastscan.search(queries, 10)

    # check accuracy of indexes
    recalls = {
        rank: (Iref[:, :1] == I4[:, :rank]).sum() / nq
        for rank in (1, 10)
    }

    if metric == faiss.METRIC_INNER_PRODUCT:
        min_r1 = 0.98
    else:
        min_r1 = 0.99
    self.assertGreater(recalls[1], min_r1)
    self.assertGreater(recalls[10], 0.995)

    # check accuracy of distances
    squared_error = ((D4 - D2) ** 2).sum()
    squared_norm = (D2 ** 2).sum()
    self.assertLess(squared_error, squared_norm * 1e-4)
def eval_index_accuracy(self, factory_key):
    """Check that the 10-NN intersection never decreases as nprobe grows.

    factory_key is the index_factory string of the index under test. A
    serialize/deserialize round trip must also reproduce the results of
    the last search exactly.
    """
    # just do a single test, most search functions are already stress
    # tested in test_residual_quantizer.py
    dataset = datasets.SyntheticDataset(32, 3000, 1000, 100)
    queries = dataset.get_queries()

    index = faiss.index_factory(dataset.d, factory_key)
    index.train(dataset.get_train())
    index.add(dataset.get_database())

    intersections = []
    for nprobe in (1, 2, 5, 10, 20, 50):
        index.nprobe = nprobe
        D, I = index.search(queries, 10)
        intersections.append(
            faiss.eval_intersection(I, dataset.get_groundtruth(10)))
    intersections = np.array(intersections)

    # in fact the results should be the same for the decoding and the
    # reconstructing versions
    non_decreasing = np.all(intersections[1:] >= intersections[:-1])
    self.assertTrue(non_decreasing)

    # do a little I/O test
    restored = faiss.deserialize_index(faiss.serialize_index(index))
    D2, I2 = restored.search(queries, 10)
    np.testing.assert_array_equal(I2, I)
    np.testing.assert_array_equal(D2, D)
def test_precomp(self):
    """Check the precomputed codebook tables and the table-based encoders.

    Covers: the C++ cross-product and centroid-norm tables vs. the Python
    reference, the Python table-based beam search vs. the reference beam
    search, and rq.compute_codes with and without use_beam_LUT.
    """
    swig_ptr = faiss.swig_ptr
    dataset = datasets.SyntheticDataset(32, 1000, 1000, 0)

    # make sure it work with varying nb of bits
    nbits = faiss.UInt64Vector()
    for width in (5, 6, 7):
        nbits.push_back(width)

    rq = faiss.ResidualQuantizer(dataset.d, nbits)
    rq.train_type = faiss.ResidualQuantizer.Train_default
    rq.train(dataset.get_train())

    codebooks = get_additive_quantizer_codebooks(rq)
    precomp = precomp_codebooks(codebooks)
    cross_prod_blocks_ref, cent_norms_ref = precomp

    # check C++ precomp tables
    cross_prods_ref = np.hstack(
        [np.vstack(block) for block in cross_prod_blocks_ref])

    rq.compute_codebook_tables()
    table_size = rq.total_codebook_size
    cross_prods = faiss.vector_to_array(rq.codebook_cross_products)
    cross_prods = cross_prods.reshape(table_size, table_size)
    cent_norms = faiss.vector_to_array(rq.cent_norms)

    np.testing.assert_array_almost_equal(
        cross_prods, cross_prods_ref, decimal=5)
    np.testing.assert_array_almost_equal(
        np.hstack(cent_norms_ref), cent_norms, decimal=5)

    # validate that the python tab-based encoding works
    xb = dataset.get_database()
    ref_codes, _, _ = beam_search_encoding_ref(codebooks, xb, 7)
    new_codes, _ = beam_search_encoding_tab(codebooks, xb, 7, precomp)
    np.testing.assert_array_equal(ref_codes, new_codes)

    # validate the C++ beam_search_encode_step_tab function
    beam_search_encoding_tab(codebooks, xb, 7, precomp, implem="ref cpp")

    # check implem w/ residuals
    n = ref_codes.shape[0]
    ref_codes_packed = np.zeros((n, rq.code_size), dtype='uint8')
    ref_codes_int32 = ref_codes.astype('int32')
    rq.pack_codes(
        n,
        swig_ptr(ref_codes_int32),
        swig_ptr(ref_codes_packed),
        rq.M * ref_codes.shape[1],
    )

    rq.max_beam_size = 7
    codes_ref_residuals = rq.compute_codes(xb)
    np.testing.assert_array_equal(ref_codes_packed, codes_ref_residuals)

    rq.use_beam_LUT = 1
    codes_new = rq.compute_codes(xb)
    np.testing.assert_array_equal(codes_ref_residuals, codes_new)
def test_synthetic_iterator(self):
    """Check that stacking the database_iterator blocks yields the full database."""
    dataset = datasets.SyntheticDataset(32, 1000, 2000, 10)
    whole = dataset.get_database()
    stacked = np.vstack(list(dataset.database_iterator()))
    np.testing.assert_array_equal(whole, stacked)
def test_synthetic_ip(self):
    """Check the "IP" dataset ground truth against an IndexFlatIP search."""
    dataset = datasets.SyntheticDataset(32, 1000, 2000, 10, "IP")

    flat_ip = faiss.IndexFlatIP(32)
    flat_ip.add(dataset.get_database())

    expected = dataset.get_groundtruth(100)
    _, found = flat_ip.search(dataset.get_queries(), 100)
    np.testing.assert_array_equal(expected, found)
def do_test_accuracy(self, by_residual, st):
    """Check accuracy, serialization and flat-equivalence of IndexIVFResidualQuantizer.

    by_residual is assigned to the index's `by_residual` field and st is
    passed as the search type of both the IVF and the flat index.

    For increasing nprobe the 10-NN intersection with the ground truth is
    recorded, and a serialize/deserialize round trip must reproduce each
    search exactly. With nprobe = 100 (all lists) the results must be
    close to those of a flat IndexResidualQuantizer that uses the same
    codebooks.
    """
    ds = datasets.SyntheticDataset(32, 3000, 1000, 100)

    quantizer = faiss.IndexFlatL2(ds.d)

    index = faiss.IndexIVFResidualQuantizer(
        quantizer, ds.d, 100, 3, 4,
        faiss.METRIC_L2, st
    )
    index.by_residual = by_residual

    index.rq.train_type = faiss.ResidualQuantizer.Train_default
    index.rq.max_beam_size = 30

    index.train(ds.get_train())
    index.add(ds.get_database())

    inters = []
    for nprobe in 1, 2, 5, 10, 20, 50:
        index.nprobe = nprobe
        D, I = index.search(ds.get_queries(), 10)
        inter = faiss.eval_intersection(I, ds.get_groundtruth(10))
        inters.append(inter)

        # do a little I/O test
        index2 = faiss.deserialize_index(faiss.serialize_index(index))
        D2, I2 = index2.search(ds.get_queries(), 10)
        np.testing.assert_array_equal(I2, I)
        np.testing.assert_array_equal(D2, D)

    inters = np.array(inters)

    if by_residual:
        # check that we have increasing intersection measures with
        # nprobe
        self.assertTrue(np.all(inters[1:] >= inters[:-1]))
    else:
        # only the first steps are checked in this case
        self.assertTrue(np.all(inters[1:3] >= inters[:2]))

    # check that we have the same result as the flat residual quantizer
    iflat = faiss.IndexResidualQuantizer(
        ds.d, 3, 4, faiss.METRIC_L2, st)
    iflat.rq.train_type = faiss.ResidualQuantizer.Train_default
    iflat.rq.max_beam_size = 30
    iflat.train(ds.get_train())

    # replace the freshly trained codebooks with those of the IVF index
    iflat.rq.codebooks = index.rq.codebooks

    iflat.add(ds.get_database())
    Dref, Iref = iflat.search(ds.get_queries(), 10)

    index.nprobe = 100
    D2, I2 = index.search(ds.get_queries(), 10)
    np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
    # there are many ties because the codes are so short
    self.assertLess((Iref != I2).sum(), Iref.size * 0.2)
def test_binary(self):
    """Check preassigned add / search / range search on a binary IVF index.

    List assignments come from a k-means index trained on the first 20
    float dimensions instead of the index's own coarse quantizer.
    """
    dataset = datasets.SyntheticDataset(128, 2000, 2000, 200)
    d = dataset.d
    xt = dataset.get_train()
    xq = dataset.get_queries()
    xb = dataset.get_database()

    # define alternative quantizer on the 20 first dims of vectors
    # (will be in float)
    km = faiss.Kmeans(20, 50)
    km.train(xt[:, :20].copy())
    alt_quantizer = km.index

    def assign(vectors, nprobe):
        # nearest alternative centroids for each vector
        return alt_quantizer.search(vectors[:, :20].copy(), nprobe)[1]

    # binarize database and queries
    binarizer = faiss.index_factory(d, "ITQ,LSHt")
    binarizer.train(xt)
    xb_bin = binarizer.sa_encode(xb)
    xq_bin = binarizer.sa_encode(xq)

    index = faiss.index_binary_factory(d, "BIVF200")
    fake_centroids = np.zeros((index.nlist, index.d // 8), dtype="uint8")
    index.quantizer.add(fake_centroids)
    index.is_trained = True

    # add elements xb
    ivf_tools.add_preassigned(index, xb_bin, assign(xb, 1).ravel())

    # search elements xq, increase nprobe, check 4 first results
    # w/ groundtruth
    prev_inter_perf = 0
    for nprobe in (1, 10, 20):
        index.nprobe = nprobe
        lists = assign(xq, index.nprobe)
        D, I = ivf_tools.search_preassigned(index, xq_bin, 4, lists)
        inter_perf = (I == dataset.get_groundtruth()[:, :4]).sum() / I.size
        self.assertTrue(inter_perf >= prev_inter_perf)
        prev_inter_perf = inter_perf

    # test range search
    index.nprobe = 20
    lists = assign(xq, index.nprobe)

    # just to find a reasonable radius
    D, I = ivf_tools.search_preassigned(index, xq_bin, 4, lists)
    radius = int(D.max() + 1)

    lims, DR, IR = ivf_tools.range_search_preassigned(
        index, xq_bin, radius, lists)

    # with that radius the k-NN results are a subset of the range search
    # results
    for q in range(len(xq)):
        begin, end = lims[q], lims[q + 1]
        knn_ids = set(I[q])
        range_ids = set(IR[begin:end])
        self.assertTrue(knn_ids <= range_ids)
def test_float(self):
    """Check preassigned add / search / range search on a float IVFPQ index.

    List assignments come from a k-means index trained on the first 20
    dimensions instead of the index's own coarse quantizer.
    """
    dataset = datasets.SyntheticDataset(128, 2000, 2000, 200)
    d = dataset.d
    xt = dataset.get_train()
    xq = dataset.get_queries()
    xb = dataset.get_database()

    # define alternative quantizer on the 20 first dims of vectors
    km = faiss.Kmeans(20, 50)
    km.train(xt[:, :20].copy())
    alt_quantizer = km.index

    def assign(vectors, nprobe):
        # nearest alternative centroids for each vector
        return alt_quantizer.search(vectors[:, :20].copy(), nprobe)[1]

    index = faiss.index_factory(d, "IVF50,PQ16np")
    index.by_residual = False

    # (optional) fake coarse quantizer
    fake_centroids = np.zeros((index.nlist, index.d), dtype="float32")
    index.quantizer.add(fake_centroids)

    # train the PQ part
    index.train(xt)

    # add elements xb
    ivf_tools.add_preassigned(index, xb, assign(xb, 1).ravel())

    # search elements xq, increase nprobe, check 4 first results
    # w/ groundtruth
    prev_inter_perf = 0
    for nprobe in (1, 10, 20):
        index.nprobe = nprobe
        lists = assign(xq, index.nprobe)
        D, I = ivf_tools.search_preassigned(index, xq, 4, lists)
        inter_perf = faiss.eval_intersection(
            I, dataset.get_groundtruth()[:, :4])
        self.assertTrue(inter_perf >= prev_inter_perf)
        prev_inter_perf = inter_perf

    # test range search
    index.nprobe = 20
    lists = assign(xq, index.nprobe)

    # just to find a reasonable radius
    D, I = ivf_tools.search_preassigned(index, xq, 4, lists)
    radius = D.max() * 1.01

    lims, DR, IR = ivf_tools.range_search_preassigned(
        index, xq, radius, lists)

    # with that radius the k-NN results are a subset of the range search
    # results
    for q in range(len(xq)):
        begin, end = lims[q], lims[q + 1]
        knn_ids = set(I[q])
        range_ids = set(IR[begin:end])
        self.assertTrue(knn_ids <= range_ids)
def test_2level(self):
    """ verify that 2-level clustering is not too sub-optimal

    The summed nearest-centroid distance of the 10 x 10 two-level
    clustering must stay below 1.1x that of a regular 100-centroid
    k-means.
    """
    dataset = datasets.SyntheticDataset(32, 10000, 0, 0)
    train = dataset.get_train()

    def quantization_error(centroids):
        # sum of the distances from each training point to its nearest centroid
        nearest_dists, _ = faiss.knn(train, centroids, 1)
        return nearest_dists.sum()

    km_ref = faiss.Kmeans(dataset.d, 100)
    km_ref.train(train)
    err = quantization_error(km_ref.centroids)

    centroids2, _ = clustering.two_level_clustering(train, 10, 10)
    err2 = quantization_error(centroids2)

    self.assertLess(err2, err * 1.1)
def test_equiv_rq(self):
    """
    make sure it is equivalent to search a RQ and to search an IVF
    with RCQ + RQ with the same codebooks.
    """
    dataset = datasets.SyntheticDataset(32, 3000, 1000, 50)
    queries = dataset.get_queries()

    # make a flat RQ
    iflat = faiss.IndexResidualQuantizer(dataset.d, 5, 4)
    iflat.rq.train_type = faiss.ResidualQuantizer.Train_default
    iflat.train(dataset.get_train())
    iflat.add(dataset.get_database())

    # ref search result
    Dref, Iref = iflat.search(queries, 10)

    # get its codebooks + encoded version of the dataset
    codebooks = get_additive_quantizer_codebooks(iflat.rq)
    flat_codes = faiss.vector_to_array(iflat.codes)
    codes = flat_codes.reshape(-1, iflat.code_size)

    # make an IVF with 2x4 + 3x4 = 5x4 bits
    ivf = faiss.index_factory(dataset.d, "IVF256(RCQ2x4),RQ3x4")

    # initialize the codebooks: the first ones go to the coarse quantizer
    rcq = faiss.downcast_index(ivf.quantizer)
    coarse_codebooks = np.vstack(codebooks[:rcq.rq.M]).ravel()
    faiss.copy_array_to_vector(coarse_codebooks, rcq.rq.codebooks)
    rcq.rq.is_trained = True

    # translation of AdditiveCoarseQuantizer::train
    rcq.ntotal = 1 << rcq.rq.tot_bits
    rcq.centroid_norms.resize(rcq.ntotal)
    rcq.rq.compute_centroid_norms(rcq.centroid_norms.data())
    rcq.is_trained = True

    # the remaining codebooks go to the IVF's residual quantizer
    fine_codebooks = np.vstack(codebooks[rcq.rq.M:]).ravel()
    faiss.copy_array_to_vector(fine_codebooks, ivf.rq.codebooks)
    ivf.rq.is_trained = True
    ivf.is_trained = True

    # add the codes (this works because 2x4 is a multiple of 8 bits)
    ivf.add_sa_codes(codes)

    # perform exhaustive search
    ivf.nprobe = ivf.nlist
    Dnew, Inew = ivf.search(queries, 10)

    np.testing.assert_array_equal(Iref, Inew)
    np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
def test_aqfastscan(self):
    """Check that a fast-scan index built from an RQ index reconstructs the same vectors."""
    dataset = datasets.SyntheticDataset(20, 1000, 1000, 0)

    source = faiss.index_factory(20, 'RQ5x4_Nrq2x4')
    source.train(dataset.get_train())
    source.add(dataset.get_database())
    expected = source.reconstruct_n(0, source.ntotal)

    fastscan = faiss.IndexAdditiveQuantizerFastScan(source)
    actual = fastscan.reconstruct_n(0, source.ntotal)

    np.testing.assert_array_equal(expected, actual)
def test_RCQ_knn(self):
    """Check the centroid norm and k-NN routines of the RCQ's quantizer.

    The norms, inner-product search and L2 search computed by the
    quantizer are compared with numpy / faiss.knn run on the
    reconstructed centroids.
    """
    swig_ptr = faiss.swig_ptr
    dataset = datasets.SyntheticDataset(32, 1000, 0, 123)
    xt = dataset.get_train()
    xq = dataset.get_queries()
    nq = len(xq)

    # RQ 3+4+5 = 12 bits = 4096 centroids
    rcq = faiss.index_factory(dataset.d, "RCQ1x3_1x4_1x5")
    rcq.train(xt)

    aq = rcq.rq
    cents = rcq.reconstruct_n(0, rcq.ntotal)

    # test norms computation
    norms_ref = (cents ** 2).sum(1)
    norms = np.zeros(1 << aq.tot_bits, dtype="float32")
    aq.compute_centroid_norms(swig_ptr(norms))
    np.testing.assert_array_almost_equal(norms, norms_ref, decimal=5)

    # test IP search
    Dref, Iref = faiss.knn(
        xq, cents, 10, metric=faiss.METRIC_INNER_PRODUCT)
    Dnew = np.zeros_like(Dref)
    Inew = np.zeros_like(Iref)
    aq.knn_centroids_inner_product(
        nq, swig_ptr(xq), 10, swig_ptr(Dnew), swig_ptr(Inew))
    np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
    np.testing.assert_array_equal(Iref, Inew)

    # test L2 search
    Dref, Iref = faiss.knn(xq, cents, 10, metric=faiss.METRIC_L2)
    Dnew = np.zeros_like(Dref)
    Inew = np.zeros_like(Iref)
    aq.knn_centroids_L2(
        nq, swig_ptr(xq), 10,
        swig_ptr(Dnew), swig_ptr(Inew), swig_ptr(norms))
    np.testing.assert_array_equal(Iref, Inew)
    np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
def test_PQ4_accuracy(self):
    """Check that the top-1 result of a 'PQ16x4' index matches exact L2 search for over 60% of queries."""
    dataset = datasets.SyntheticDataset(32, 2000, 5000, 1000)
    xb = dataset.get_database()
    xq = dataset.get_queries()

    # exact search gives the reference neighbors
    exact = faiss.IndexFlatL2(32)
    exact.add(xb)
    _, Iref = exact.search(xq, 10)

    index = faiss.index_factory(32, 'PQ16x4')
    index.train(dataset.get_train())
    index.add(xb)
    _, Ia = index.search(xq, 10)

    nq = Iref.shape[0]
    top1_matches = (Iref[:, 0] == Ia[:, 0]).sum()
    recall_at_1 = top1_matches / nq
    assert recall_at_1 > 0.6