def test_ivf_nprobe(self):
    """Check that a binary IVF index probed with nprobe > nlist matches a flat index."""
    d = self.xq.shape[1] * 8
    xt, xb, xq = self.xt, self.xb, self.xq

    # "BIVF10" builds an inverted-file index with nlist = 10.
    ivf_index = faiss.index_binary_factory(d, "BIVF10")
    ivf_index.train(xt)
    ivf_index.add(xb)
    # Probing at least nlist lists makes the search equivalent to an IndexFlat.
    ivf_index.nprobe = 2048

    k = 5

    # --- k-NN search, compared against the exhaustive reference ---
    D, I = ivf_index.search(xq, k)

    flat_index = faiss.index_binary_factory(d, "BFlat")
    flat_index.add(xb)
    ref_D, ref_I = flat_index.search(xq, k)

    print(D[0], ref_D[0])
    print(I[0], ref_I[0])

    assert np.all(D == ref_D)
    # The returned ids are deliberately not compared: they may be different
    # even though the distances agree.

    # --- range search, compared against the exhaustive reference ---
    thresh = 5  # radius handed to range_search
    lims, D, I = ivf_index.range_search(xq, thresh)
    ref_lims, ref_D, ref_I = flat_index.range_search(xq, thresh)

    assert np.all(lims == ref_lims)
    assert np.all(D == ref_D)
def create(
    hashes: t.Iterable[PDQ_HASH_TYPE],
    custom_ids: t.Optional[t.Iterable[int]] = None,
) -> "PDQFlatHashIndex":
    """
    Creates a PDQFlatHashIndex for use searching against the provided hashes.

    Parameters
    ----------
    hashes: sequence of PDQ Hashes
        The PDQ hashes to create the index with
    custom_ids: sequence of custom ids for the PDQ Hashes (optional)
        Optional sequence of custom id values to use for the PDQ hashes for any
        method relating to indexes (e.g., hash_at). If provided, the nth item in
        custom_ids will be used as the id for the nth hash in hashes. If not provided
        then the ids for the hashes will be assumed to be their respective index
        in hashes (i.e., the nth hash would have id n, starting from 0).

    Returns
    -------
    a PDQFlatHashIndex of these hashes
    """
    # Each hex-encoded hash becomes one row of uint8 bytes.
    vectors = [
        numpy.frombuffer(binascii.unhexlify(pdq_hash), dtype=numpy.uint8)
        for pdq_hash in hashes
    ]
    index = faiss.index_binary_factory(BITS_IN_PDQ, "BFlat")
    # Identity check on purpose: `!= None` is evaluated elementwise when
    # custom_ids is a numpy array, which makes the condition ambiguous.
    if custom_ids is not None:
        index = faiss.IndexBinaryIDMap2(index)
        i64_ids = [uint64_to_int64(custom_id) for custom_id in custom_ids]
        # Pin the dtype so the id array is int64 regardless of platform
        # defaults or an empty id list.
        index.add_with_ids(
            numpy.array(vectors), numpy.array(i64_ids, dtype=numpy.int64))
    else:
        index.add(numpy.array(vectors))
    return PDQFlatHashIndex(index)
def test_binary(self):
    """Exercise pre-assigned add / k-NN search / range search on a binary IVF index.

    The coarse assignment comes from an alternative float quantizer trained
    on the first 20 dimensions, not from the index's own quantizer.
    """
    ds = datasets.SyntheticDataset(128, 2000, 2000, 200)
    d = ds.d

    xt = ds.get_train()
    xq = ds.get_queries()
    xb = ds.get_database()

    # define alternative quantizer on the 20 first dims of vectors
    # (will be in float)
    km = faiss.Kmeans(20, 50)
    km.train(xt[:, :20].copy())
    alt_quantizer = km.index

    binarizer = faiss.index_factory(d, "ITQ,LSHt")
    binarizer.train(xt)

    xb_bin = binarizer.sa_encode(xb)
    xq_bin = binarizer.sa_encode(xq)

    index = faiss.index_binary_factory(d, "BIVF200")

    # The index's own quantizer is only filled with placeholder centroids so
    # the index can be flagged as trained; assignments are supplied externally.
    fake_centroids = np.zeros((index.nlist, index.d // 8), dtype="uint8")
    index.quantizer.add(fake_centroids)
    index.is_trained = True

    # add elements xb
    xb_assign = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
    ivf_tools.add_preassigned(index, xb_bin, xb_assign)

    # search elements xq, increase nprobe, check 4 first results w/ groundtruth
    # The ground truth does not depend on nprobe, so fetch it once instead of
    # on every loop iteration.
    gt_top4 = ds.get_groundtruth()[:, :4]
    prev_inter_perf = 0
    for nprobe in 1, 10, 20:
        index.nprobe = nprobe
        xq_assign = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]
        D, I = ivf_tools.search_preassigned(index, xq_bin, 4, xq_assign)
        inter_perf = (I == gt_top4).sum() / I.size
        # Agreement with the ground truth must not decrease as nprobe grows.
        self.assertGreaterEqual(inter_perf, prev_inter_perf)
        prev_inter_perf = inter_perf

    # test range search
    index.nprobe = 20

    xq_assign = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]

    # just to find a reasonable radius
    D, I = ivf_tools.search_preassigned(index, xq_bin, 4, xq_assign)
    radius = int(D.max() + 1)

    lims, DR, IR = ivf_tools.range_search_preassigned(
        index, xq_bin, radius, xq_assign)

    # with that radius the k-NN results are a subset of the range search
    # results; lims[q]:lims[q + 1] delimits the results of query q
    for knn_ids, l0, l1 in zip(I, lims[:-1], lims[1:]):
        self.assertTrue(set(knn_ids) <= set(IR[l0:l1]))
def train_and_get_centroids(override_kmeans_index):
    """Train a "BIVF10" binary index and return its quantizer's stored codes.

    Args:
        override_kmeans_index: when not None, assigned to the index's
            ``clustering_index`` before training.

    Returns:
        The quantizer's ``xb`` contents as an array reshaped to
        ``(-1, d // 8)``.
    """
    ivf = faiss.index_binary_factory(d, b"BIVF10")
    ivf.verbose = True

    if override_kmeans_index is not None:
        ivf.clustering_index = override_kmeans_index

    ivf.train(xt)

    # The quantizer has to be downcast before its `xb` storage is reachable.
    quantizer = faiss.downcast_IndexBinary(ivf.quantizer)
    packed_codes = faiss.vector_to_array(quantizer.xb)
    return packed_codes.reshape(-1, d // 8)
def test_factory_IVF_HNSW(self):
    """The "BIVF1024_BHNSW32" factory string yields 32-byte codes and 1024 lists."""
    ivf_hnsw = faiss.index_binary_factory(256, "BIVF1024_BHNSW32")

    assert ivf_hnsw.code_size == 32
    assert ivf_hnsw.nlist == 1024
def test_factory_HNSW(self):
    """The "BHNSW32" factory string on 256 bits yields 32-byte codes."""
    hnsw = faiss.index_binary_factory(256, "BHNSW32")

    assert hnsw.code_size == 32
def test_factory_Flat(self):
    """The "BFlat" factory string on 16 bits yields 2-byte codes."""
    flat = faiss.index_binary_factory(16, "BFlat")

    assert flat.code_size == 2
def test_factory_IVF(self):
    """The "BIVF10" factory string yields inverted lists, nlist 10 and 2-byte codes."""
    ivf = faiss.index_binary_factory(16, "BIVF10")

    assert ivf.invlists is not None
    assert ivf.nlist == 10
    assert ivf.code_size == 2
def __init__(self):
    """Initialise the parent with an empty flat binary index wrapped in an id map."""
    flat_index = faiss.index_binary_factory(BITS_IN_PDQ, "BFlat")
    id_mapped_index = faiss.IndexBinaryIDMap2(flat_index)
    super().__init__(id_mapped_index)
def test_factory_1(self):
    """A "BIVF10" index built by the factory exposes its inverted lists."""
    ivf = faiss.index_binary_factory(16, "BIVF10")

    assert ivf.invlists is not None
def build(self, config):
    """Create, extend or shrink the faiss index described by ``config``.

    ``config["index_operation"]`` (default ``"new"``) selects the mode:

    * ``"new"``: build a fresh index from the gallery images.
    * ``"append"``: load the existing index and id map and add the gallery
      images to them.
    * ``"remove"``: load the existing index and id map and drop every entry
      whose document appears in the gallery data file.

    Two files are written to ``config["index_dir"]``:

    * ``vector.index``: the faiss index file.
    * ``id_map.pkl``: pickled dict mapping each id to its image doc.

    Args:
        config: mapping read for ``data_file``, ``image_root``,
            ``delimiter``, ``index_dir``, ``dist_type``, ``embedding_size``
            and, optionally, ``index_operation`` and ``index_method``
            (default ``"HNSW32"``).

    Raises:
        AssertionError: for an unsupported operation, for missing index
            files on append/remove, or when the loaded index and id map
            disagree on the number of entries.
        RuntimeError: when removal is requested with the HNSW32 method.
    """
    operation_method = config.get("index_operation", "new").lower()

    gallery_images, gallery_docs = split_datafile(
        config['data_file'], config['image_root'], config['delimiter'])

    # Reject unsupported operations before doing any feature extraction.
    assert operation_method in [
        "new", "remove", "append"
    ], "Only append, remove and new operation are supported"

    # when removing data from the index, features are not needed
    if operation_method != "remove":
        gallery_features = self._extract_features(gallery_images, config)

    # Use the same default everywhere the method name is consulted; indexing
    # config["index_method"] directly raised KeyError when the key was omitted
    # even though index creation falls back to "HNSW32".
    configured_method = config.get("index_method", "HNSW32")
    is_hamming = config["dist_type"] == "hamming"

    index_dir = config["index_dir"]
    # vector.index: faiss index file
    index_path = os.path.join(index_dir, "vector.index")
    # id_map.pkl: use this file to map id to image_doc
    id_map_path = os.path.join(index_dir, "id_map.pkl")

    if operation_method in ["remove", "append"]:
        # if remove or append, vector.index and id_map.pkl must exist.
        # The path string itself is always truthy, so existence has to be
        # tested explicitly.
        assert os.path.exists(
            index_path
        ), "The vector.index dose not exist in {} when 'index_operation' is not None".format(
            index_dir)
        assert os.path.exists(
            id_map_path
        ), "The id_map.pkl dose not exist in {} when 'index_operation' is not None".format(
            index_dir)
        # NOTE(review): hamming indexes are saved with write_index_binary
        # below but loaded here with read_index - confirm this round-trips.
        index = faiss.read_index(index_path)
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # an id_map.pkl that comes from a trusted source.
        with open(id_map_path, 'rb') as fd:
            ids = pickle.load(fd)
        assert index.ntotal == len(
            ids), "data number in index is not equal in in id_map"
    else:
        os.makedirs(index_dir, exist_ok=True)
        index_method = configured_method

        # if IVF method, calculate the number of lists automatically;
        # clamp to at least 1 so a tiny gallery does not request "IVF0"
        if index_method == "IVF":
            nlist = max(1, min(len(gallery_images) // 8, 65536))
            index_method = index_method + str(nlist) + ",Flat"

        # build index
        if is_hamming:
            # for binary index, add B at head of index_method
            index_method = "B" + index_method
            index = faiss.index_binary_factory(config["embedding_size"],
                                               index_method)
        else:
            dist_type = (faiss.METRIC_INNER_PRODUCT
                         if config["dist_type"] == "IP" else faiss.METRIC_L2)
            index = faiss.index_factory(config["embedding_size"],
                                        index_method, dist_type)
            index = faiss.IndexIDMap2(index)
        ids = {}

    if configured_method == "HNSW32":
        logger.warning(
            "The HNSW32 method dose not support 'remove' operation")

    if operation_method != "remove":
        # calculate id for new data: continue after the largest known id
        start_id = max(ids) + 1 if ids else 0
        ids_now = (
            np.arange(0, len(gallery_images)) + start_id).astype(np.int64)

        if is_hamming:
            # NOTE(review): vectors are only added for "new"; a hamming
            # "append" extends the id map without adding to the index -
            # confirm whether that is intended.
            if operation_method == "new":
                index.add(gallery_features)
        else:
            # only train when new index file
            if operation_method == "new":
                index.train(gallery_features)
            index.add_with_ids(gallery_features, ids_now)

        for new_id, doc in zip(ids_now, gallery_docs):
            ids[new_id] = doc
    else:
        if configured_method == "HNSW32":
            raise RuntimeError(
                "The index_method: HNSW32 dose not support 'remove' operation"
            )
        # remove ids in id_map, remove index data in faiss index.
        # The explicit dtype keeps the array integral even when nothing
        # matches (an empty list would otherwise become float64).
        remove_ids = np.asarray(
            [k for k in ids if ids[k] in gallery_docs], dtype=np.int64)
        index.remove_ids(remove_ids)
        for k in remove_ids:
            del ids[k]

    # store faiss index file and id_map file
    if is_hamming:
        faiss.write_index_binary(index, index_path)
    else:
        faiss.write_index(index, index_path)

    with open(id_map_path, 'wb') as fd:
        pickle.dump(ids, fd)
def test_factory_MultiHash(self):
    """The "BHash5x6" factory string yields an index with b == 6 and nhash == 5."""
    multi_hash = faiss.index_binary_factory(256, "BHash5x6")

    assert multi_hash.b == 6
    assert multi_hash.nhash == 5
def test_factory_Hash(self):
    """The "BHash12" factory string yields an index with b == 12."""
    hash_index = faiss.index_binary_factory(256, "BHash12")

    assert hash_index.b == 12