def test_ivf_flat(self):
    """Check that an IndexBinaryIVF gives identical results after an I/O round trip.

    The index is trained on ``self.xt``, filled with ``self.xb`` and queried
    with ``self.xq``; it is then written to a temporary file, read back, and
    queried again.  Both the returned ids and distances must match exactly.
    """
    # The index dimension is 8x the number of columns of xq
    # (presumably one column per packed byte -- confirm against the fixture).
    d = self.xq.shape[1] * 8

    quantizer = faiss.IndexBinaryFlat(d)
    index = faiss.IndexBinaryIVF(quantizer, d, 8)
    index.cp.min_points_per_centroid = 5    # quiet warning
    index.nprobe = 4
    index.train(self.xt)
    index.add(self.xb)
    D, I = index.search(self.xq, 3)

    # mkstemp() returns an already-open OS-level descriptor together with the
    # path; close it right away so the test does not leak a file descriptor.
    fd, tmpnam = tempfile.mkstemp()
    os.close(fd)
    try:
        faiss.write_index_binary(index, tmpnam)

        index2 = faiss.read_index_binary(tmpnam)

        D2, I2 = index2.search(self.xq, 3)

        assert (I2 == I).all()
        assert (D2 == D).all()

    finally:
        os.remove(tmpnam)
def test_remove_id_map_binary(self):
    """Check id removal on an IndexBinaryIDMap2, before and after an I/O round trip.

    Ten 5-byte vectors are added with ids 1000..1009; the first byte of each
    vector is 100 + its position so a reconstructed vector can be recognised.
    After removing id 1003, id 1004 must still reconstruct and id 1003 must
    raise.  The same two checks are repeated on the index reloaded from disk.
    """
    sub_index = faiss.IndexBinaryFlat(40)
    xb = np.zeros((10, 5), dtype='uint8')
    xb[:, 0] = np.arange(10) + 100
    index = faiss.IndexBinaryIDMap2(sub_index)
    index.add_with_ids(xb, np.arange(10) + 1000)
    assert index.reconstruct(1004)[0] == 104
    index.remove_ids(np.array([1003]))
    assert index.reconstruct(1004)[0] == 104
    try:
        index.reconstruct(1003)
    except Exception:
        # Expected: the id was removed.  `Exception` (rather than a bare
        # `except:`) keeps KeyboardInterrupt / SystemExit propagating.
        pass
    else:
        assert False, 'should have raised an exception'

    # while we are there, let's test I/O as well...
    # mkstemp() hands back an open descriptor; close it so it is not leaked.
    fd, tmpnam = tempfile.mkstemp()
    os.close(fd)
    try:
        faiss.write_index_binary(index, tmpnam)
        index = faiss.read_index_binary(tmpnam)
    finally:
        os.remove(tmpnam)

    assert index.reconstruct(1004)[0] == 104
    try:
        index.reconstruct(1003)
    except Exception:
        pass
    else:
        assert False, 'should have raised an exception'
def save(self, path1, path2):
    '''Persist the searcher to ``path1`` and the corpus to ``path2``.

    The faiss writer is picked from ``self.binary``: the binary-index writer
    when it is truthy, the regular index writer otherwise.  The corpus is
    dumped with joblib into a file opened in binary write mode.
    '''
    write_searcher = faiss.write_index_binary if self.binary else faiss.write_index
    write_searcher(self.searcher, path1)

    # save the text
    with open(path2, 'wb') as corpus_file:
        joblib.dump(self.corpus, corpus_file)
def test_write_580M(self):
    """Write and re-read an IndexBinaryMultiHash holding 580 million 1-byte codes.

    The test passes when both the write and the read complete without
    raising; nothing is asserted about the contents of the reloaded index.
    """
    # Imported locally so the test does not rely on module-level imports.
    import os
    import tempfile

    dim = 8
    nhash = 1
    num_million = 580  # changing to 570 works
    index1 = faiss.IndexBinaryMultiHash(dim, nhash, dim // nhash)

    # Draw the codes directly as uint8: going through the default integer
    # dtype first would allocate a several-times-larger temporary array.
    random_hash_codes = np.random.randint(
        0, 256, (num_million * int(1e6), dim // 8), dtype=np.uint8)
    index1.add(random_hash_codes)

    # Use a private temporary file instead of a fixed /tmp path, and always
    # delete it: the serialized index is large and must not be left behind.
    fd, tmpnam = tempfile.mkstemp(suffix=".faiss")
    os.close(fd)
    try:
        faiss.write_index_binary(index1, tmpnam)
        index2 = faiss.read_index_binary(tmpnam)
    finally:
        os.remove(tmpnam)
def train():
    """Train a binary IVF index on all available data and write it to disk.

    Exits the process after printing a message when there is no data.
    The number of IVF lists is the rounded square root of the number of
    rows; the trained (still empty) index is written to
    ``./trained_import.index``.
    """
    all_data = np.array(get_all_data())
    if len(all_data) == 0:
        print("No images. exit()")
        exit()

    code_bits = 32 * 8
    n_rows = all_data.shape[0]
    centroids = round(sqrt(n_rows))
    print(f'centroids: {centroids}')

    quantizer = faiss.IndexBinaryFlat(code_bits)
    ivf_index = faiss.IndexBinaryIVF(quantizer, code_bits, centroids)
    ivf_index.nprobe = 8
    ivf_index.train(all_data)

    output_path = "./" + "trained_import.index"
    faiss.write_index_binary(ivf_index, output_path)
def build_index(cls, feature_file, index_file):
    '''
    Rebuild a flat binary faiss index from a saved feature array and store it.

    :params feature_file: a npy file generated by using utils.build_mol_features
    :params index_file: path the serialized index is written to
    :return: the populated faiss.IndexBinaryFlat
    '''
    logging.info("rebuild index from {}".format(feature_file))
    fp_arr = np.load(feature_file)

    # Convert every row with cls.vec2bytes, showing a progress bar.
    packed_rows = [cls.vec2bytes(row) for row in tqdm.tqdm(fp_arr)]

    # Round the column count up to the next multiple of 8.
    groups_of_eight = np.ceil(fp_arr.shape[1] / 8)
    dim = int(groups_of_eight * 8)

    index = faiss.IndexBinaryFlat(dim)
    codes = np.array(packed_rows).astype("uint8")
    index.add(codes)
    faiss.write_index_binary(index, index_file)
    return index
def test_read_index_ownership(self):
    """Check that an index returned by read_index_binary has ``thisown`` set.

    A flat binary index filled with ``self.xb`` is written to a temporary
    file and read back; the reloaded wrapper's ``thisown`` flag must be truthy.
    """
    d = self.xq.shape[1] * 8

    index = faiss.IndexBinaryFlat(d)
    index.add(self.xb)

    # mkstemp() returns an open descriptor along with the path; close it
    # immediately so the test does not leak a file descriptor.
    fd, tmpnam = tempfile.mkstemp()
    os.close(fd)
    try:
        faiss.write_index_binary(index, tmpnam)

        index2 = faiss.read_index_binary(tmpnam)

        assert index2.thisown
    finally:
        os.remove(tmpnam)
def train():
    """Train a binary IVF index on the imported descriptors and write it to disk.

    Element ``[1]`` of every record returned by ``import_get_all_data()`` is
    collected and the pieces are concatenated along axis 0.  Exits the
    process after printing a message when there are no records.  The number
    of IVF lists is the rounded square root of the number of descriptor
    rows; the trained index is written to ``./trained_import.index``.
    """
    all_data = import_get_all_data()
    if len(all_data) == 0:
        print("No images. exit()")
        exit()

    # presumably record[1] is the descriptor array of one image -- confirm
    per_record = [record[1] for record in all_data]
    all_descriptors = np.concatenate(per_record, axis=0)

    code_bits = 61 * 8
    centroids = round(sqrt(all_descriptors.shape[0]))
    print(f'centroids: {centroids}')

    quantizer = faiss.IndexBinaryFlat(code_bits)
    ivf_index = faiss.IndexBinaryIVF(quantizer, code_bits, centroids)
    ivf_index.nprobe = 8
    ivf_index.train(all_descriptors)

    output_path = "./" + "trained_import.index"
    faiss.write_index_binary(ivf_index, output_path)
def train():
    """Train a binary IVF index on the stored AKAZE features and write it to disk.

    For every id from ``get_all_ids()`` the stored features are fetched with
    ``get_akaze_features_by_id`` and passed through ``convert_array``; the
    results are concatenated along axis 0.  Exits the process after printing
    a message when there are no ids.  The number of IVF lists is the rounded
    square root of the number of descriptor rows; the trained index is
    written to ``./trained.index``.
    """
    all_ids = get_all_ids()
    if len(all_ids) == 0:
        print("No images. exit()")
        exit()

    per_image = [
        convert_array(get_akaze_features_by_id(image_id))
        for image_id in all_ids
    ]
    all_descriptors = np.concatenate(per_image, axis=0)

    code_bits = 61 * 8
    centroids = round(sqrt(all_descriptors.shape[0]))
    print(f'centroids: {centroids}')

    quantizer = faiss.IndexBinaryFlat(code_bits)
    ivf_index = faiss.IndexBinaryIVF(quantizer, code_bits, centroids)
    ivf_index.nprobe = 8
    ivf_index.train(all_descriptors)

    output_path = "./" + "trained.index"
    faiss.write_index_binary(ivf_index, output_path)
def test_flat(self):
    """Check that an IndexBinaryFlat gives identical results after an I/O round trip.

    The index is filled with ``self.xb`` and queried with ``self.xq``; it is
    then written to a temporary file, read back, and queried again.  Both
    the returned ids and distances must match exactly.
    """
    d = self.xq.shape[1] * 8

    index = faiss.IndexBinaryFlat(d)
    index.add(self.xb)
    D, I = index.search(self.xq, 3)

    # mkstemp() returns an open descriptor along with the path; close it
    # immediately so the test does not leak a file descriptor.
    fd, tmpnam = tempfile.mkstemp()
    os.close(fd)
    try:
        faiss.write_index_binary(index, tmpnam)

        index2 = faiss.read_index_binary(tmpnam)

        D2, I2 = index2.search(self.xq, 3)

        assert (I2 == I).all()
        assert (D2 == D).all()

    finally:
        os.remove(tmpnam)
def test_binary_from_float(self):
    """Round-trip an IndexBinaryFromFloat (wrapping an HNSW flat index) through disk.

    Search results (ids and distances) for ``self.xq`` must be identical
    before writing and after reading the index back.
    """
    n_bits = self.xq.shape[1] * 8

    float_index = faiss.IndexHNSWFlat(n_bits, 16)
    index = faiss.IndexBinaryFromFloat(float_index)
    index.add(self.xb)
    expected_dist, expected_ids = index.search(self.xq, 3)

    # Only the path is needed; release the descriptor mkstemp opened.
    handle, tmpnam = tempfile.mkstemp()
    os.close(handle)
    try:
        faiss.write_index_binary(index, tmpnam)
        reloaded = faiss.read_index_binary(tmpnam)

        actual_dist, actual_ids = reloaded.search(self.xq, 3)

        ids_match = (actual_ids == expected_ids).all()
        assert ids_match
        dist_match = (actual_dist == expected_dist).all()
        assert dist_match
    finally:
        os.remove(tmpnam)
def registry_index(way_index):
    """Build a faiss index from the training images and persist it with its id map.

    Every file returned by ``iterate_files(train_image_dir)`` is run through
    ``way_feature``; the resulting feature rows are added to the index under
    a numeric id.  The id is incremented each time the class name returned
    by ``parser_name`` differs from the one of the previous accepted file,
    so consecutive files of the same class share one id.  The id -> file
    names mapping is pickled to ``ids_path`` (only when that file does not
    exist yet) and the index is written to ``index_path``.

    :param way_index: position in ``DIMENSIONS`` selecting the feature
        extractor.  The value 3 selects a float index built with
        ``faiss.index_factory``; any other value selects an
        ``IndexBinaryHash``.
    """
    # prepare index
    dimensions = DIMENSIONS[way_index]
    if isAddPhash:
        dimensions += PHASH_X * PHASH_Y
    # https://github.com/facebookresearch/faiss/wiki/Binary-indexes
    # https://github.com/facebookresearch/faiss/blob/22b7876ef5540b85feee173aa3182a2f37dc98f6/tests/test_index_binary.py#L213
    if way_index != 3:
        # nbits/8 https://github.com/facebookresearch/faiss/wiki/Faiss-indexes#relationship-with-lsh
        index = faiss.IndexBinaryHash(dimensions * 8, 1)
    else:
        index = faiss.index_factory(dimensions, INDEX_KEY)
    # NOTE(review): the index is not converted back to CPU before it is
    # written below -- confirm that serializing works when USE_GPU is set.
    if USE_GPU:
        print("Use GPU...")
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)

    # start training
    images_list = iterate_files(train_image_dir)  # may change

    # prepare ids
    ids_count = 0
    index_defaultdict = defaultdict(list)
    features = []
    ids = []
    # Guard the lookup so an empty image list does not raise IndexError.
    cla_name_temp = parser_name(images_list[0]) if len(images_list) > 0 else None
    way = get_way(w_index=way_index)  # ORB , surf, and so on

    for file_name in images_list:
        cla_name = parser_name(file_name)
        ret, feature = way_feature(way, file_name)
        # Skip failed extractions before touching `feature`, which may not be
        # a usable array when `ret` signals an error.
        if ret != 0 or feature is None:
            continue

        numf = feature.shape[0]
        if way_index == 3 and FEATURE_CLIP:
            numf = min(FEATURE_CLIP, feature.shape[0])
        # Draw `numf` distinct rows in random order (a pure shuffle when
        # nothing is clipped).
        choosed_fea = sample(range(feature.shape[0]), numf)
        feature = feature[choosed_fea, :]
        if not feature.any():
            # all selected rows are zero: nothing worth indexing
            continue

        if cla_name != cla_name_temp:
            ids_count += 1  # change when same img not only one
            cla_name_temp = cla_name
        # record id and path;
        # here in registry, one id may have more than one img(obj)
        index_defaultdict[ids_count].append(file_name)

        # one id per feature row, all equal to the current id
        ids_list = np.full(numf, ids_count, dtype="int64")
        print(feature.shape, ids_count, len(ids_list), ids_list.shape)
        features.append(feature)
        ids.append(ids_list)

    if features:
        features = np.vstack(features)
        ids = np.hstack(ids)
        print(features.shape, ids.shape)
        # Every stacked block already passed the `.any()` check above, so the
        # stacked matrix is known to contain non-zero data.
        if not index.is_trained:
            index.train(features)
        index.add_with_ids(features, ids)  # change
    else:
        # np.vstack([]) would raise; report the situation instead.
        logging.warning('No usable features found, the index is left empty')

    # save index -- choose the writer from the same `way_index` that chose
    # the index type above, so writer and index kind can never disagree.
    if way_index == 3:
        faiss.write_index(index, index_path)
    else:
        faiss.write_index_binary(index, index_path)

    # save ids
    # NOTE(review): an existing ids file is never refreshed although the
    # index file above is always overwritten -- confirm this is intended.
    if not os.path.exists(ids_path):
        with open(ids_path, 'wb+') as f:
            try:
                # third positional argument is the pickle protocol; True == 1
                pickle.dump(index_defaultdict, f, True)
            except EnvironmentError as e:
                logging.error('Failed to save index file error:[{}]'.format(e))
            except RuntimeError as v:
                logging.error('Failed to save index file error:[{}]'.format(v))
    print('Registry completed')
import argparse

import faiss
import joblib
import numpy as np
from tqdm import trange

# Build a flat binary faiss index (wrapped in an id map) from a joblib file
# holding "ids" and "embeddings", and write it to --output_file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--embedding_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    parser.add_argument("--batch_size", type=int, default=256)
    # NOTE: --hash_num_bits is accepted but not read anywhere in this script.
    parser.add_argument("--hash_num_bits", type=int, default=768)
    args = parser.parse_args()

    embedding_data = joblib.load(args.embedding_file, mmap_mode="r")
    # `np.int` was removed in NumPy 1.24; faiss ids are 64-bit integers, so
    # request int64 explicitly.
    ids = np.array(embedding_data["ids"], dtype=np.int64)
    embeddings = embedding_data["embeddings"]
    # index dimension is 8x the number of embedding columns
    # (presumably one column per packed byte -- confirm against the producer)
    dim_size = embeddings.shape[1] * 8

    index = faiss.IndexBinaryIDMap(faiss.IndexBinaryFlat(dim_size))

    # Add in batches so only one slice of the memory-mapped data is
    # materialized at a time.
    for start in trange(0, ids.size, args.batch_size):
        index.add_with_ids(embeddings[start:start + args.batch_size],
                           ids[start:start + args.batch_size])

    faiss.write_index_binary(index, args.output_file)
base_index.hnsw.efSearch = args.hnsw_ef_search base_index.hnsw.efConstruction = args.hnsw_ef_construction index = FaissHNSWIndex.build(ids, embeddings, base_index) else: base_index = faiss.IndexFlatIP(dim_size) index = FaissIndex.build(ids, embeddings, base_index) if args.index_device == "cuda": index = index.to_gpu() del ids del embeddings with tempfile.NamedTemporaryFile() as f: if isinstance(index, FaissBinaryIndex): faiss.write_index_binary(index.index, f.name) else: faiss.write_index(index.index, f.name) logger.info("Index size: %d bytes", os.path.getsize(f.name)) logger.info("Loading BiEncoder...") biencoder = BiEncoder.load_from_checkpoint(args.biencoder_file, map_location="cpu") biencoder = biencoder.to(args.biencoder_device) biencoder.eval() biencoder.freeze() if args.parallel: biencoder.query_encoder = DataParallel(biencoder.query_encoder)
def build(self, config):
    '''
    Build, extend or shrink the on-disk index described by ``config``.

    ``config["index_operation"]`` (default "new") selects the operation:

    * "new":    create a fresh index from the gallery data
    * "append": load the existing index and add the gallery data to it
    * "remove": load the existing index and drop the entries whose document
                appears in the gallery data

    Two files are written to ``config["index_dir"]``:
    ``vector.index`` (the faiss index) and ``id_map.pkl`` (a pickled dict
    mapping numeric id to image document).

    Raises AssertionError for an unknown operation, for missing index files
    in append/remove mode, or when index and id map disagree in size; raises
    RuntimeError when removal is requested for the HNSW32 method.
    '''
    operation_method = config.get("index_operation", "new").lower()
    # Validate first so an unsupported operation fails before the expensive
    # feature extraction instead of after it.
    assert operation_method in [
        "new", "remove", "append"
    ], "Only append, remove and new operation are supported"

    gallery_images, gallery_docs = split_datafile(
        config['data_file'], config['image_root'], config['delimiter'])

    # when remove data in index, do not need extract fatures
    if operation_method != "remove":
        gallery_features = self._extract_features(gallery_images, config)

    index_dir = config["index_dir"]
    index_path = os.path.join(index_dir, "vector.index")
    id_map_path = os.path.join(index_dir, "id_map.pkl")
    is_binary = config["dist_type"] == "hamming"
    # Same default as the one used when a new index is created, so a config
    # without "index_method" no longer raises KeyError further down.
    configured_method = config.get("index_method", "HNSW32")

    # vector.index: faiss index file
    # id_map.pkl: use this file to map id to image_doc
    if operation_method in ["remove", "append"]:
        # if remove or append, vector.index and id_map.pkl must exist.
        # The files themselves are tested: asserting on the joined path
        # string alone is always true.
        assert os.path.exists(
            index_path
        ), "The vector.index dose not exist in {} when 'index_operation' is not None".format(
            index_dir)
        assert os.path.exists(
            id_map_path
        ), "The id_map.pkl dose not exist in {} when 'index_operation' is not None".format(
            index_dir)
        # NOTE(review): a "hamming" index is stored with write_index_binary
        # below but is read here with read_index -- confirm append/remove
        # are meant to be supported for binary indexes.
        index = faiss.read_index(index_path)
        # NOTE: pickle executes code on load; id_map.pkl must be a trusted
        # file (normally one written by this method).
        with open(id_map_path, 'rb') as fd:
            ids = pickle.load(fd)
        assert index.ntotal == len(
            ids), "data number in index is not equal in in id_map"
    else:
        if not os.path.exists(index_dir):
            os.makedirs(index_dir, exist_ok=True)
        index_method = configured_method

        # if IVF method, cal ivf number automaticlly
        if index_method == "IVF":
            index_method = index_method + str(
                min(int(len(gallery_images) // 8), 65536)) + ",Flat"

        # for binary index, add B at head of index_method
        if is_binary:
            index_method = "B" + index_method

        # dist_type
        dist_type = (faiss.METRIC_INNER_PRODUCT
                     if config["dist_type"] == "IP" else faiss.METRIC_L2)

        # build index
        if is_binary:
            index = faiss.index_binary_factory(config["embedding_size"],
                                               index_method)
        else:
            index = faiss.index_factory(config["embedding_size"],
                                        index_method, dist_type)
            index = faiss.IndexIDMap2(index)
        ids = {}

    if configured_method == "HNSW32":
        logger.warning(
            "The HNSW32 method dose not support 'remove' operation")

    if operation_method != "remove":
        # calculate id for new data
        start_id = max(ids.keys()) + 1 if ids else 0
        ids_now = (
            np.arange(0, len(gallery_images)) + start_id).astype(np.int64)

        # only train when new index file
        if operation_method == "new":
            if is_binary:
                index.add(gallery_features)
            else:
                index.train(gallery_features)

        if not is_binary:
            index.add_with_ids(gallery_features, ids_now)

        for i, d in zip(list(ids_now), gallery_docs):
            ids[i] = d
    else:
        if configured_method == "HNSW32":
            raise RuntimeError(
                "The index_method: HNSW32 dose not support 'remove' operation"
            )
        # remove ids in id_map, remove index data in faiss index.
        # The explicit dtype keeps the array int64 even when nothing
        # matches (an empty list would otherwise become float64).
        remove_ids = np.asarray(
            [k for k, doc in ids.items() if doc in gallery_docs],
            dtype=np.int64)
        index.remove_ids(remove_ids)
        for k in remove_ids:
            del ids[k]

    # store faiss index file and id_map file
    if is_binary:
        faiss.write_index_binary(index, index_path)
    else:
        faiss.write_index(index, index_path)

    with open(id_map_path, 'wb') as fd:
        pickle.dump(ids, fd)