def subtest(self, mt):
    """Compare IndexIVFPQ results with exact search for one metric type.

    An IVFPQ index is built twice, with ``by_residual`` on and off. For each:

    * the top-10 overlap with an ``IndexFlat`` search must be at least the
      value stored in ``self.ref_results[mt, by_residual]`` minus 2;
    * searching with ``use_precomputed_table = 0`` must return the same ids;
    * when ``by_residual`` is on, a search with ``polysemous_ht = 20`` is
      also compared with ``self.ref_results`` (tolerance 4).

    Args:
        mt: metric type handed to ``faiss.IndexFlat`` and set on the index.
    """
    dim, n_lists, topk = 32, 64, 10
    xt, xb, xq = get_dataset_2(dim, 1000, 2000, 200)

    # Exact neighbours from a flat index are the reference.
    exact = faiss.IndexFlat(dim, mt)
    exact.add(xb)
    _, ref_ids = exact.search(xq, topk)

    # The same coarse quantizer object is passed to both IVFPQ indexes.
    coarse = faiss.IndexFlat(dim, mt)

    for by_residual in (True, False):
        ivfpq = faiss.IndexIVFPQ(coarse, dim, n_lists, 4, 8)
        ivfpq.metric_type = mt
        ivfpq.by_residual = by_residual

        if by_residual:
            # perform cheap polysemous training
            poly = faiss.PolysemousTraining()
            poly.n_iter = 50000
            poly.n_redo = 1
            ivfpq.do_polysemous_training = True
            ivfpq.polysemous_training = poly

        ivfpq.train(xt)
        ivfpq.add(xb)
        ivfpq.nprobe = 4

        _, ids = ivfpq.search(xq, topk)
        overlap = faiss.eval_intersection(ids, ref_ids)
        print('(%d, %s): %d, ' % (mt, by_residual, overlap))
        assert overlap >= self.ref_results[mt, by_residual] - 2

        # Disabling the precomputed table must not change the returned ids.
        ivfpq.use_precomputed_table = 0
        _, ids_no_table = ivfpq.search(xq, topk)
        assert np.all(ids == ids_no_table)

        if not by_residual:
            continue

        # Polysemous filtering is only exercised for the residual index.
        ivfpq.use_precomputed_table = 1
        ivfpq.polysemous_ht = 20
        _, ids = ivfpq.search(xq, topk)
        overlap = faiss.eval_intersection(ids, ref_ids)
        print('(%d, %s, %d): %d, ' % (mt, by_residual, ivfpq.polysemous_ht, overlap))

        # polysemous behaves bizarrely on ARM, hence the wider tolerance
        expected = self.ref_results[mt, by_residual, ivfpq.polysemous_ht]
        assert overlap >= expected - 4
def test_white(self):
    """Check that a PCA-whitening index recovers neighbours of skewed data.

    Normally distributed vectors are searched with a flat index to get
    reference neighbours. The same vectors are then scaled per dimension
    and multiplied by the Q factor of a QR decomposition, and searched
    again with a plain flat index and with a ``PCAW<d>,Flat`` index. The
    whitened results must overlap the reference more than twice as much
    as the plain results do.
    """
    d = 4
    nt, nb, nq = 1000, 200, 200
    k = 5
    n_all = nt + nb + nq

    # normally distributed data
    x = faiss.randn(n_all * d, 1234).reshape(n_all, d)
    xt, xb, xq = x[:nt], x[nt:-nq], x[-nq:]

    # reference: nearest neighbours on the undistorted data
    flat = faiss.index_factory(d, 'Flat')
    flat.add(xb)
    Do, Io = flat.search(xq, k)

    # skew the distribution: per-dimension scaling, then a matrix product
    x *= [10, 4, 1, 0.5]
    rr, _ = np.linalg.qr(faiss.randn(d * d).reshape(d, d))
    x = np.dot(x, rr).astype('float32')
    xt, xb, xq = x[:nt], x[nt:-nq], x[-nq:]

    # plain L2 search on the skewed data
    flat_skewed = faiss.index_factory(d, 'Flat')
    flat_skewed.add(xb)
    Dl2, Il2 = flat_skewed.search(xq, k)

    # whitening followed by L2 search on the skewed data
    whitened = faiss.index_factory(d, 'PCAW%d,Flat' % d)
    whitened.train(xt)
    whitened.add(xb)
    Dw, Iw = whitened.search(xq, k)

    # The whitened results should match the reference much better than
    # the plain L2 ones (original note: should be 961 vs. 264).
    overlap_whitened = faiss.eval_intersection(Io, Iw)
    overlap_plain = faiss.eval_intersection(Io, Il2)
    assert overlap_whitened > 2 * overlap_plain
def eval_index_accuracy(self, factory_key):
    """Build an index from ``factory_key`` and sanity-check its accuracy.

    Only a single configuration is tested here; the original note says
    that most search functions are already stress tested in
    test_residual_quantizer.py.

    Checks performed:

    * the intersection with the ground truth never decreases as
      ``nprobe`` goes through 1, 2, 5, 10, 20, 50;
    * a serialize/deserialize round trip returns the same ids and
      distances as the last search.

    Args:
        factory_key: string passed to ``faiss.index_factory``.
    """
    ds = datasets.SyntheticDataset(32, 3000, 1000, 100)
    k = 10

    index = faiss.index_factory(ds.d, factory_key)
    index.train(ds.get_train())
    index.add(ds.get_database())

    scores = []
    for nprobe in (1, 2, 5, 10, 20, 50):
        index.nprobe = nprobe
        D, I = index.search(ds.get_queries(), k)
        scores.append(faiss.eval_intersection(I, ds.get_groundtruth(k)))

    # element-wise check that the score is non-decreasing with nprobe
    scores = np.array(scores)
    self.assertTrue(np.all(scores[1:] >= scores[:-1]))

    # do a little I/O test against the results of the last search
    restored = faiss.deserialize_index(faiss.serialize_index(index))
    D2, I2 = restored.search(ds.get_queries(), k)
    np.testing.assert_array_equal(I2, I)
    np.testing.assert_array_equal(D2, D)
def test_IVFPQ_non8bit(self):
    """Check IndexIVFPQ with two code layouts against recorded counts.

    Two IVFPQ indexes sharing one coarse quantizer are trained and
    searched; the top-10 intersection of each with an exact flat search
    must be within 4 of the reference numbers recorded on 2019-11-02.
    """
    d = 16
    nlist = 64
    k = 10
    xt, xb, xq = get_dataset_2(d, 10000, 2000, 200)

    exact = faiss.IndexFlat(d)
    exact.add(xb)
    _, gt_I = exact.search(xq, k)

    quantizer = faiss.IndexFlat(d)

    # Last two IndexIVFPQ constructor arguments used for each label
    # (presumably number of sub-quantizers and bits per sub-quantizer).
    # NOTE(review): the labels look swapped relative to the arguments
    # ('8x2' builds the index with arguments 2, 8). The reference counts
    # below were presumably recorded with this mapping -- confirm before
    # changing it.
    pq_args = {'2x8': (8, 2), '8x2': (2, 8)}

    ninter = {}
    for label, (arg_m, arg_nbits) in pq_args.items():
        index = faiss.IndexIVFPQ(quantizer, d, nlist, arg_m, arg_nbits)
        index.train(xt)
        index.add(xb)
        # NOTE(review): `npobe` looks like a typo for `nprobe`; as written
        # it most likely sets an unrelated attribute and leaves nprobe at
        # its default. Kept unchanged because the reference counts were
        # presumably recorded this way -- confirm before correcting.
        index.npobe = 16

        _, I = index.search(xq, k)
        ninter[label] = faiss.eval_intersection(I, gt_I)

    print('ninter=', ninter)

    # One would expect ninter['2x8'] > ninter['8x2'], but that is not
    # observed -- probably too few test points, so it is not asserted.

    # ref numbers on 2019-11-02
    assert abs(ninter['2x8'] - 458) < 4
    assert abs(ninter['8x2'] - 465) < 4
def test_IndexLocalSearchQuantizer(self):
    """Exercise IndexLocalSearchQuantizer search, search types and I/O.

    * A default index must reach an intersection above 460 with the
      ground truth (467 was observed when the test was written).
    * A second index using ``ST_norm_float`` and the first index's
      codebooks must return almost equal distances and differ in fewer
      than 1% of the ids.
    * A serialized and deserialized copy of the first index must return
      identical results.
    """
    ds = datasets.SyntheticDataset(32, 1000, 200, 100)
    k = 10
    gt = ds.get_groundtruth(k)

    # reference index
    ref_index = faiss.IndexLocalSearchQuantizer(ds.d, 4, 5)
    ref_index.train(ds.get_train())
    ref_index.add(ds.get_database())
    Dref, Iref = ref_index.search(ds.get_queries(), k)
    self.assertGreater(faiss.eval_intersection(Iref, gt), 460)

    # same codebooks, different search type
    norm_index = faiss.IndexLocalSearchQuantizer(
        ds.d, 4, 5, faiss.METRIC_L2, faiss.AdditiveQuantizer.ST_norm_float
    )
    norm_index.train(ds.get_train())  # just to set flags properly
    norm_index.lsq.codebooks = ref_index.lsq.codebooks
    norm_index.add(ds.get_database())
    D2, I2 = norm_index.search(ds.get_queries(), k)
    np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
    n_mismatch = (Iref != I2).sum()
    self.assertLess(n_mismatch, Iref.size * 0.01)

    # test I/O
    restored = faiss.deserialize_index(faiss.serialize_index(ref_index))
    D3, I3 = restored.search(ds.get_queries(), k)
    np.testing.assert_array_equal(Iref, I3)
    np.testing.assert_array_equal(Dref, D3)
def subtest(self, mt):
    """Test IndexIVFScalarQuantizer k-NN and range search for one metric.

    For every scalar quantizer type in the list below:

    * the top-10 intersection with an exact flat search must be within 9
      of ``self.ref_results[(mt, qname)]``;
    * ``self.subtest_add2col`` must return the same ids;
    * a range search with a radius taken from the k-NN distances must
      agree with the k-NN results filtered by that radius, up to 1% of
      differing ids;
    * range search with ``parallel_mode`` 1 and 2 must return the same
      id sets per query as the first range search.

    Args:
        mt: metric type passed to the flat and scalar-quantizer indexes.
    """
    d = 32
    nlist = 64
    k = 10
    xt, xb, xq = get_dataset_2(d, 1000, 2000, 200)
    is_ip = mt == faiss.METRIC_INNER_PRODUCT

    exact = faiss.IndexFlat(d, mt)
    exact.add(xb)
    _, gt_I = exact.search(xq, k)

    quantizer = faiss.IndexFlat(d, mt)
    qnames = ['8bit', '4bit', '8bit_uniform', '4bit_uniform', 'fp16']

    for qname in qnames:
        qtype = getattr(faiss.ScalarQuantizer, 'QT_' + qname)
        index = faiss.IndexIVFScalarQuantizer(quantizer, d, nlist, qtype, mt)
        index.train(xt)
        index.add(xb)
        index.nprobe = 4  # hopefully more robust than 1

        D, I = index.search(xq, k)
        ninter = faiss.eval_intersection(I, gt_I)
        print('(%d, %s): %d, ' % (mt, repr(qname), ninter))
        assert abs(ninter - self.ref_results[(mt, qname)]) <= 9

        D2, I2 = self.subtest_add2col(xb, xq, index, qname)
        assert np.all(I2 == I)

        # also test range search: the radius comes from the last column
        # of the k-NN distances
        last_col = D[:, -1]
        radius = float(last_col.max()) if is_ip else float(last_col.min())
        print('radius', radius)

        lims, D3, I3 = index.range_search(xq, radius)
        ntot = 0
        ndiff = 0
        for qno in range(len(xq)):
            range_ids = set(I3[lims[qno]:lims[qno + 1]])
            # k-NN results that fall strictly inside the radius
            if is_ip:
                inside = D2[qno] > radius
            else:
                inside = D2[qno] < radius
            knn_ids = set(I2[qno, inside])
            ndiff += len(range_ids ^ knn_ids)
            ntot += len(knn_ids)
        print('ndiff %d / %d' % (ndiff, ntot))
        assert ndiff < ntot * 0.01

        # the other parallel modes must return the same result sets
        for pm in (1, 2):
            print('parallel_mode=%d' % pm)
            index.parallel_mode = pm
            lims4, D4, I4 = index.range_search(xq, radius)
            print('sizes', lims4[1:] - lims4[:-1])
            for qno in range(len(lims) - 1):
                Iref = I3[lims[qno]:lims[qno + 1]]
                Inew = I4[lims4[qno]:lims4[qno + 1]]
                assert set(Iref) == set(Inew), \
                    "q %d ref %s new %s" % (qno, Iref, Inew)
def do_test_accuracy(self, by_residual, st):
    """Check IndexIVFResidualQuantizer accuracy, I/O and flat equivalence.

    * The intersection with the ground truth must be non-decreasing with
      ``nprobe`` (over all tested values when ``by_residual`` is set,
      over the first three otherwise).
    * A serialized and deserialized copy must reproduce the last search.
    * With ``nprobe = 100`` the results must match a flat
      ``IndexResidualQuantizer`` sharing the same codebooks: almost equal
      distances and fewer than 20% differing ids.

    Args:
        by_residual: value assigned to ``index.by_residual``.
        st: search type passed to both index constructors.
    """
    ds = datasets.SyntheticDataset(32, 3000, 1000, 100)
    k = 10

    coarse = faiss.IndexFlatL2(ds.d)
    index = faiss.IndexIVFResidualQuantizer(
        coarse, ds.d, 100, 3, 4, faiss.METRIC_L2, st
    )
    index.by_residual = by_residual

    # Bare attribute read kept from the original; presumably it makes a
    # missing field fail loudly before the assignment -- TODO confirm.
    index.rq.train_type
    index.rq.train_type = faiss.ResidualQuantizer.Train_default
    index.rq.max_beam_size = 30

    index.train(ds.get_train())
    index.add(ds.get_database())

    scores = []
    for nprobe in (1, 2, 5, 10, 20, 50):
        index.nprobe = nprobe
        D, I = index.search(ds.get_queries(), k)
        scores.append(faiss.eval_intersection(I, ds.get_groundtruth(k)))

    # do a little I/O test against the results of the last search
    restored = faiss.deserialize_index(faiss.serialize_index(index))
    D2, I2 = restored.search(ds.get_queries(), k)
    np.testing.assert_array_equal(I2, I)
    np.testing.assert_array_equal(D2, D)

    scores = np.array(scores)
    if by_residual:
        # intersection measures must increase with nprobe
        monotonic = np.all(scores[1:] >= scores[:-1])
    else:
        # only the first three values are compared in this mode
        monotonic = np.all(scores[1:3] >= scores[:2])
    self.assertTrue(monotonic)

    # check that we have the same result as the flat residual quantizer
    flat = faiss.IndexResidualQuantizer(ds.d, 3, 4, faiss.METRIC_L2, st)
    flat.rq.train_type
    flat.rq.train_type = faiss.ResidualQuantizer.Train_default
    flat.rq.max_beam_size = 30
    flat.train(ds.get_train())
    flat.rq.codebooks = index.rq.codebooks
    flat.add(ds.get_database())
    Dref, Iref = flat.search(ds.get_queries(), k)

    index.nprobe = 100
    D2, I2 = index.search(ds.get_queries(), k)
    np.testing.assert_array_almost_equal(Dref, D2, decimal=5)

    # there are many ties because the codes are so short
    n_mismatch = (Iref != I2).sum()
    self.assertLess(n_mismatch, Iref.size * 0.2)
def test_float(self):
    """Test pre-assigned add / search / range search on a float IVF index.

    An "IVF50,PQ16np" index is filled and queried through ``ivf_tools``
    using list assignments from an alternative quantizer: a k-means
    trained on the first 20 dimensions of the vectors.

    Checks performed:

    * the intersection of the top-4 results with the ground truth does
      not decrease as ``nprobe`` goes through 1, 10, 20;
    * with a radius slightly above the largest k-NN distance, the k-NN
      ids of each query are a subset of its range search ids.
    """
    ds = datasets.SyntheticDataset(128, 2000, 2000, 200)
    d = ds.d
    xt = ds.get_train()
    xq = ds.get_queries()
    xb = ds.get_database()

    # define alternative quantizer on the 20 first dims of vectors
    km = faiss.Kmeans(20, 50)
    km.train(xt[:, :20].copy())
    alt_quantizer = km.index

    index = faiss.index_factory(d, "IVF50,PQ16np")
    index.by_residual = False

    # (optional) fake coarse quantizer
    fake_centroids = np.zeros((index.nlist, index.d), dtype="float32")
    index.quantizer.add(fake_centroids)

    # train the PQ part
    index.train(xt)

    # add elements xb, each assigned to its nearest alternative centroid
    a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
    ivf_tools.add_preassigned(index, xb, a)

    # loop-invariant inputs of the searches below
    xq_head = xq[:, :20].copy()
    gt4 = ds.get_groundtruth()[:, :4]

    # search elements xq, increase nprobe, check 4 first results
    # w/ groundtruth
    prev_inter_perf = 0
    for nprobe in 1, 10, 20:
        index.nprobe = nprobe
        a = alt_quantizer.search(xq_head, index.nprobe)[1]
        D, I = ivf_tools.search_preassigned(index, xq, 4, a)
        inter_perf = faiss.eval_intersection(I, gt4)
        # assertGreaterEqual reports both values when the check fails
        self.assertGreaterEqual(inter_perf, prev_inter_perf)
        prev_inter_perf = inter_perf

    # test range search
    index.nprobe = 20
    a = alt_quantizer.search(xq_head, index.nprobe)[1]

    # just to find a reasonable radius
    D, I = ivf_tools.search_preassigned(index, xq, 4, a)
    radius = D.max() * 1.01

    lims, DR, IR = ivf_tools.range_search_preassigned(index, xq, radius, a)

    # with that radius the k-NN results are a subset of the range search
    # results
    for q in range(len(xq)):
        l0, l1 = lims[q], lims[q + 1]
        self.assertTrue(set(I[q]) <= set(IR[l0:l1]))
def test_sh(self):
    """Compare IndexIVFSpectralHash accuracy with recorded reference values.

    For code sizes of 32, 64 and 128 bits an ``IndexLSH`` baseline is
    printed. Then, for every combination of period and threshold type, an
    ``IndexIVFSpectralHash`` with one inverted list is trained and
    searched; its top-10 intersection with an exact L2 search must be
    within 12 of ``self.ref_results[(nbit, threshold_name, period)]``.
    """
    d = 32
    k = 10
    nlist = 1
    nprobe = 1
    xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)

    exact = faiss.IndexFlatL2(d)
    exact.add(xb)
    _, gt_I = exact.search(xq, k)

    threshold_names = ['global', 'centroid', 'centroid_half', 'median']

    for nbit in (32, 64, 128):
        # a fresh coarse quantizer for each code size
        quantizer = faiss.IndexFlatL2(d)

        # LSH baseline, printed for information only
        lsh = faiss.IndexLSH(d, nbit, True)
        lsh.add(xb)
        _, lsh_I = lsh.search(xq, k)
        print('LSH baseline: %d' % faiss.eval_intersection(lsh_I, gt_I))

        for period in (10.0, 1.0):
            for tt in threshold_names:
                index = faiss.IndexIVFSpectralHash(
                    quantizer, d, nlist, nbit, period
                )
                index.nprobe = nprobe
                index.threshold_type = getattr(
                    faiss.IndexIVFSpectralHash, 'Thresh_' + tt
                )
                index.train(xt)
                index.add(xb)

                _, I = index.search(xq, k)
                ninter = faiss.eval_intersection(I, gt_I)
                print('(%d, %s, %g): %d, ' % (nbit, repr(tt), period, ninter))

                expected = self.ref_results[(nbit, tt, period)]
                assert abs(ninter - expected) <= 12
def test_search_L2(self):
    """Compare L2 search types of IndexResidualQuantizer with the default.

    A reference index is trained and searched first. For each norm-based
    search type a second index reuses the reference codebooks:

    * ``ST_norm_float`` must give almost equal distances and fewer than
      5% differing ids;
    * the other norm encodings must score strictly below the reference
      in intersection with the ground truth.
    """
    ds = datasets.SyntheticDataset(32, 1000, 200, 100)
    k = 10
    xt = ds.get_train()
    xb = ds.get_database()
    xq = ds.get_queries()
    gt = ds.get_groundtruth(k)

    # reference run w/ decoding
    ref_index = faiss.IndexResidualQuantizer(ds.d, 3, 4)
    ref_index.rq.train_type = faiss.ResidualQuantizer.Train_default
    ref_index.rq.max_beam_size = 30
    ref_index.train(xt)
    ref_index.add(xb)
    Dref, Iref = ref_index.search(xq, k)
    inter_ref = faiss.eval_intersection(Iref, gt)  # 388

    AQ = faiss.AdditiveQuantizer
    search_types = (
        AQ.ST_norm_float,
        AQ.ST_norm_qint8,
        AQ.ST_norm_qint4,
        AQ.ST_norm_cqint8,
        AQ.ST_norm_cqint4,
    )

    for st in search_types:
        other = faiss.IndexResidualQuantizer(ds.d, 3, 4, faiss.METRIC_L2, st)
        other.rq.max_beam_size = 30
        other.train(xt)  # to get the norm bounds
        other.rq.codebooks = ref_index.rq.codebooks  # fake training
        other.add(xb)
        D2, I2 = other.search(xq, k)

        if st != AQ.ST_norm_float:
            self.assertGreater(inter_ref, faiss.eval_intersection(I2, gt))
            continue

        np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
        self.assertLess((Iref != I2).sum(), Iref.size * 0.05)
def test_rand_vector(self):
    """Check that ``rand_smooth_vectors`` output compresses with a small PQ.

    1300 vectors of dimension 32 are split into training, database and
    query sets. The top-10 intersection between an ``IndexPQ(32, 4, 4)``
    search and the exact ``faiss.knn`` result must lie strictly between
    420 and 460.
    """
    k = 10
    vectors = faiss.rand_smooth_vectors(1300, 32)
    xt, xb, xq = vectors[:1000], vectors[1000:1200], vectors[1200:]

    # exact neighbours of the queries in the database
    _, gt = faiss.knn(xq, xb, k)

    pq_index = faiss.IndexPQ(32, 4, 4)
    pq_index.train(xt)
    pq_index.add(xb)
    _, I = pq_index.search(xq, k)

    ninter = faiss.eval_intersection(I, gt)

    # 445 for SyntheticDataset
    self.assertGreater(ninter, 420)
    self.assertLess(ninter, 460)
def do_test_accuracy_IP(self, by_residual):
    """Check IndexIVFResidualQuantizer with the inner-product metric.

    For each tested ``nprobe`` the index is searched with the
    ``ST_decompress`` and ``ST_LUT_nonorm`` search types: distances must
    be almost equal and fewer than 10% of the ids may differ. The
    intersection with the ground truth must then be non-decreasing over
    the first four ``nprobe`` values.

    Args:
        by_residual: value assigned to ``index.by_residual``.
    """
    ds = datasets.SyntheticDataset(32, 3000, 1000, 100, "IP")

    quantizer = faiss.IndexFlatIP(ds.d)

    index = faiss.IndexIVFResidualQuantizer(
        quantizer, ds.d, 100, 3, 4,
        faiss.METRIC_INNER_PRODUCT, faiss.AdditiveQuantizer.ST_decompress
    )
    index.cp.spherical = True
    index.by_residual = by_residual

    # Bare attribute read kept from the original; presumably it makes a
    # missing field fail loudly before the assignment -- TODO confirm.
    index.rq.train_type
    index.rq.train_type = faiss.ResidualQuantizer.Train_default
    index.train(ds.get_train())

    index.add(ds.get_database())

    inters = []
    for nprobe in 1, 2, 5, 10, 20, 50:
        index.nprobe = nprobe
        index.rq.search_type = faiss.AdditiveQuantizer.ST_decompress
        D, I = index.search(ds.get_queries(), 10)
        index.rq.search_type = faiss.AdditiveQuantizer.ST_LUT_nonorm
        D2, I2 = index.search(ds.get_queries(), 10)
        np.testing.assert_array_almost_equal(D, D2, decimal=5)
        # there are many ties because the codes are so short
        self.assertLess((I != I2).sum(), I.size * 0.1)

        inter = faiss.eval_intersection(I, ds.get_groundtruth(10))
        inters.append(inter)

    # Convert to an array so that `>=` compares element by element; on
    # plain lists it is a single lexicographic comparison, which does not
    # verify that every step is non-decreasing.
    inters = np.array(inters)
    self.assertTrue(np.all(inters[1:4] >= inters[:3]))
def test_coarse_quantizer(self):
    """Use a LocalSearchCoarseQuantizer as coarse quantizer of IndexIVFFlat.

    With 256 lists and ``nprobe = 4`` the top-10 intersection with the
    ground truth must exceed 235 (249 was observed when the test was
    written).
    """
    ds = datasets.SyntheticDataset(32, 5000, 1000, 100)
    k = 10
    gt = ds.get_groundtruth(k)

    coarse = faiss.LocalSearchCoarseQuantizer(ds.d, 2, 4)
    # Bare attribute read kept from the original; presumably it makes a
    # missing field fail loudly before the assignment -- TODO confirm.
    coarse.lsq.nperts
    coarse.lsq.nperts = 2

    ivf = faiss.IndexIVFFlat(coarse, ds.d, 256)
    ivf.quantizer_trains_alone = True
    ivf.train(ds.get_train())
    ivf.add(ds.get_database())
    ivf.nprobe = 4

    _, Iref = ivf.search(ds.get_queries(), k)
    inter_ref = faiss.eval_intersection(Iref, gt)

    # 249
    self.assertGreater(inter_ref, 235)
def subtest(self, mt):
    """Test IndexIVFScalarQuantizer k-NN search for one metric type.

    For every scalar quantizer type in the list below, the top-10
    intersection with an exact flat search must be at least
    ``self.ref_results[(mt, qname)]`` minus 4, and
    ``self.subtest_add2col`` must return the same ids as the plain search.

    Args:
        mt: metric type passed to the flat and scalar-quantizer indexes.
    """
    d = 32
    nlist = 64
    k = 10
    xt, xb, xq = get_dataset_2(d, 1000, 2000, 200)

    exact = faiss.IndexFlat(d, mt)
    exact.add(xb)
    _, gt_I = exact.search(xq, k)

    quantizer = faiss.IndexFlat(d, mt)
    qnames = ['8bit', '4bit', '8bit_uniform', '4bit_uniform', 'fp16']

    for qname in qnames:
        qtype = getattr(faiss.ScalarQuantizer, 'QT_' + qname)
        index = faiss.IndexIVFScalarQuantizer(quantizer, d, nlist, qtype, mt)
        index.train(xt)
        index.add(xb)
        index.nprobe = 4  # hopefully more robust than 1

        _, I = index.search(xq, k)
        ninter = faiss.eval_intersection(I, gt_I)
        print('(%d, %s): %d, ' % (mt, repr(qname), ninter))

        lower_bound = self.ref_results[(mt, qname)] - 4
        assert ninter >= lower_bound

        _, I2 = self.subtest_add2col(xb, xq, index, qname)
        assert np.all(I2 == I)
def test_binary(self):
    """Test pre-assigned add / search / range search on a binary IVF index.

    Float vectors are encoded to binary codes with an "ITQ,LSHt" index.
    A "BIVF200" binary index is filled and queried through ``ivf_tools``
    using list assignments from an alternative float quantizer: a k-means
    trained on the first 20 dimensions of the float vectors.

    Checks performed:

    * the intersection with a binary flat search does not decrease as
      ``nprobe`` goes through 1, 10, 20;
    * with a radius just above the largest k-NN distance, the k-NN ids of
      each query are a subset of its range search ids.
    """
    ds = datasets.SyntheticDataset(128, 2000, 2000, 200)
    d = ds.d
    xt = ds.get_train()
    xq = ds.get_queries()
    xb = ds.get_database()

    # alternative quantizer on the 20 first dims of vectors (in float)
    km = faiss.Kmeans(20, 50)
    km.train(xt[:, :20].copy())
    alt_quantizer = km.index

    # float vectors -> binary codes
    binarizer = faiss.index_factory(d, "ITQ,LSHt")
    binarizer.train(xt)
    xb_bin = binarizer.sa_encode(xb)
    xq_bin = binarizer.sa_encode(xq)

    # binary IVF index with an all-zero coarse quantizer, marked trained
    index = faiss.index_binary_factory(d, "BIVF200")
    zero_centroids = np.zeros((index.nlist, index.d // 8), dtype="uint8")
    index.quantizer.add(zero_centroids)
    index.is_trained = True

    # add elements xb
    assign_b = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
    ivf_tools.add_preassigned(index, xb_bin, assign_b)

    # recompute GT in binary
    k = 15
    flat_bin = faiss.IndexBinaryFlat(128)
    flat_bin.add(xb_bin)
    Dgt, Igt = flat_bin.search(xq_bin, k)

    # search elements xq, increase nprobe, compare results w/ groundtruth
    best_so_far = 0
    for nprobe in (1, 10, 20):
        index.nprobe = nprobe
        assign_q = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]
        D, I = ivf_tools.search_preassigned(index, xq_bin, k, assign_q)
        inter_perf = faiss.eval_intersection(I, Igt)
        self.assertGreaterEqual(inter_perf, best_so_far)
        best_so_far = inter_perf

    # test range search
    index.nprobe = 20
    assign_q = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]

    # just to find a reasonable radius
    D, I = ivf_tools.search_preassigned(index, xq_bin, 4, assign_q)
    radius = int(D.max() + 1)

    lims, DR, IR = ivf_tools.range_search_preassigned(
        index, xq_bin, radius, assign_q
    )

    # with that radius the k-NN results are a subset of the range search
    # results
    for q in range(len(xq)):
        range_ids = set(IR[lims[q]:lims[q + 1]])
        self.assertTrue(set(I[q]) <= range_ids)
def subtest(self, mt):
    """Test IndexIVFPQ k-NN and range search for one metric type.

    An IVFPQ index is built twice, with ``by_residual`` on and off. For each:

    * the top-10 intersection with an exact flat search must be within 3
      of ``self.ref_results[mt, by_residual]``;
    * searching with ``use_precomputed_table = 0`` must return the same ids;
    * when ``by_residual`` is on, a search with ``polysemous_ht = 20``
      must score at least the stored reference minus 4;
    * a range search with a radius taken from the most recent k-NN
      distances must agree with the filtered k-NN results of the second
      search, up to 2% of differing ids.

    Args:
        mt: metric type handed to ``faiss.IndexFlat`` and set on the index.
    """
    d = 32
    nlist = 64
    k = 10
    xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)
    is_ip = mt == faiss.METRIC_INNER_PRODUCT

    exact = faiss.IndexFlat(d, mt)
    exact.add(xb)
    _, gt_I = exact.search(xq, k)

    quantizer = faiss.IndexFlat(d, mt)

    for by_residual in (True, False):
        index = faiss.IndexIVFPQ(quantizer, d, nlist, 4, 8)
        index.metric_type = mt
        index.by_residual = by_residual

        if by_residual:
            # perform cheap polysemous training
            poly = faiss.PolysemousTraining()
            poly.n_iter = 50000
            poly.n_redo = 1
            index.do_polysemous_training = True
            index.polysemous_training = poly

        index.train(xt)
        index.add(xb)
        index.nprobe = 4

        D, I = index.search(xq, k)
        ninter = faiss.eval_intersection(I, gt_I)
        print('(%d, %s): %d, ' % (mt, by_residual, ninter))
        assert abs(ninter - self.ref_results[mt, by_residual]) <= 3

        # Disabling the precomputed table must not change the returned ids.
        index.use_precomputed_table = 0
        D2, I2 = index.search(xq, k)
        assert np.all(I == I2)

        if by_residual:
            index.use_precomputed_table = 1
            index.polysemous_ht = 20
            # D and I are overwritten here, so the radius below is taken
            # from this polysemous search when by_residual is set.
            D, I = index.search(xq, k)
            ninter = faiss.eval_intersection(I, gt_I)
            print('(%d, %s, %d): %d, ' % (
                mt, by_residual, index.polysemous_ht, ninter))

            # polysemous behaves bizarrely on ARM
            expected = self.ref_results[mt, by_residual, index.polysemous_ht]
            assert ninter >= expected - 4

        # also test range search
        last_col = D[:, -1]
        radius = float(last_col.max()) if is_ip else float(last_col.min())
        print('radius', radius)

        lims, D3, I3 = index.range_search(xq, radius)
        ntot = 0
        ndiff = 0
        for qno in range(len(xq)):
            range_ids = set(I3[lims[qno]:lims[qno + 1]])
            # k-NN results of the second search strictly inside the radius
            if is_ip:
                inside = D2[qno] > radius
            else:
                inside = D2[qno] < radius
            knn_ids = set(I2[qno, inside])
            ndiff += len(range_ids ^ knn_ids)
            ntot += len(knn_ids)
        print('ndiff %d / %d' % (ndiff, ntot))
        assert ndiff < ntot * 0.02