def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name)
                      for name in dump_names if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(
                dump_paths, max_norm=args.max_norm, para=args.para,
                doc_sample_ratio=args.doc_sample_ratio,
                vec_sample_ratio=args.vec_sample_ratio,
                max_norm_cf=args.max_norm_cf,
                num_dummy_zeros=args.num_dummy_zeros, norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path,
                                   args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(
                    dump_paths, max_norm=max_norm, para=args.para,
                    doc_sample_ratio=args.doc_sample_ratio,
                    vec_sample_ratio=args.vec_sample_ratio,
                    num_dummy_zeros=args.num_dummy_zeros, norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path,
                        fine_quant=args.fine_quant, cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
            if not os.path.exists(args.subindex_dir):
                os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path,
                         args.idx2id_path, max_norm=max_norm, para=args.para,
                         num_dummy_zeros=args.num_dummy_zeros, cuda=args.cuda,
                         num_docs_per_add=args.num_docs_per_add,
                         offset=args.offset, norm_th=args.norm_th,
                         fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path,
                          args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(
            index.nlist, index.code_size, args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
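
# A hedged sketch of driving run_index above stage by stage. Every path and
# hyperparameter below is a hypothetical placeholder; the attribute set simply
# mirrors what run_index reads from `args`, and would normally come from argparse.
from types import SimpleNamespace

args = SimpleNamespace(
    dump_dir='dump', index_dir='index', stage='all', replace=False,
    quantizer_path='index/quantizer.faiss',
    max_norm_path='index/max_norm.json',
    trained_index_path='index/trained.faiss',
    index_path='index/index.faiss', idx2id_path='index/idx2id.hdf5',
    subindex_dir='index/subindex', inv_path='index/merged.ivfdata',
    dump_paths=None, max_norm=None, max_norm_cf=1.3, para=False,
    doc_sample_ratio=0.2, vec_sample_ratio=0.2, num_dummy_zeros=0,
    norm_th=999, num_clusters=4096, fine_quant='SQ8', cuda=False,
    num_docs_per_add=1000, offset=0)

run_index(args)       # runs coarse -> fine -> add in one pass
args.stage = 'merge'  # then merges the per-shard subindexes into one index
run_index(args)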
def merge_indexes(subindex_dir, trained_index_path, target_index_path,
                  target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name)
                    for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name)
                   for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                offset = str(in_.attrs['offset'])
                group = out.create_group(offset)
                group.create_dataset('doc', data=in_['doc'])
                group.create_dataset('para', data=in_['para'])
                group.create_dataset('word', data=in_['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # IO_FLAG_MMAP avoids actually loading the data, so the total size
        # of the inverted lists can exceed the available RAM
        index = faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)
        # keep the invlists from being deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists; they are written to target_inv_path
    invlists = faiss.OnDiskInvertedLists(
        index.nlist, index.code_size, target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists" % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
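
# A minimal hedged sketch of reading the merged idx2id file back with h5py.
# The group layout (one group per shard offset holding doc/para/word arrays)
# follows merge_indexes above; 'idx2id.hdf5' is a placeholder path.
import h5py

with h5py.File('idx2id.hdf5', 'r') as f:
    for offset in sorted(f, key=int):      # group names are integer offsets
        group = f[offset]
        doc = group['doc'][:]
        para = group['para'][:]
        word = group['word'][:]
        print(offset, len(doc), len(para), len(word))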
def test_rename(self):
    d = 10
    nb = 500
    nq = 100
    nt = 100
    xt, xb, xq = get_dataset_2(d, nt, nb, nq)

    quantizer = faiss.IndexFlatL2(d)
    index1 = faiss.IndexIVFFlat(quantizer, d, 20)
    index1.train(xt)

    dirname = tempfile.mkdtemp()

    try:
        # make an index with ondisk invlists
        invlists = faiss.OnDiskInvertedLists(
            index1.nlist, index1.code_size, dirname + '/aa.ondisk')
        index1.replace_invlists(invlists)
        index1.add(xb)
        D1, I1 = index1.search(xq, 10)
        faiss.write_index(index1, dirname + '/aa.ivf')

        # move the index elsewhere
        os.mkdir(dirname + '/1')
        for fname in 'aa.ondisk', 'aa.ivf':
            os.rename(dirname + '/' + fname,
                      dirname + '/1/' + fname)

        # try to read it: fails!
        try:
            index2 = faiss.read_index(dirname + '/1/aa.ivf')
        except RuntimeError:
            pass  # normal
        else:
            assert False

        # read it with magic flag
        index2 = faiss.read_index(dirname + '/1/aa.ivf',
                                  faiss.IO_FLAG_ONDISK_SAME_DIR)
        D2, I2 = index2.search(xq, 10)
        assert np.all(I1 == I2)
    finally:
        shutil.rmtree(dirname)
def merge_on_disk(trained_index: faiss.Index,
                  shard_fnames: List[str],
                  ivfdata_fname: str) -> None:
    """
    Adds the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname.

    Args:
        trained_index: The trained index to add the data to.
        shard_fnames: A list of the partial index filenames.
        ivfdata_fname: The filename for the on-disk extracted data.
    """
    # Load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # The IO_FLAG_MMAP is to avoid actually loading the data, and thus
        # the total size of the inverted lists can exceed the available RAM
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)

        # Avoid deallocating the invlists with the index
        index_ivf.own_invlists = False

    # Construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, 'The trained index should be empty'

    # Prepare the output inverted lists, which are written to ivfdata_fname.
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist,
                                         index_ivf.code_size,
                                         ivfdata_fname)

    # Merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    n_total = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # Replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = n_total
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
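
# An end-to-end sketch of how merge_on_disk above might be exercised on
# synthetic data; all filenames are placeholders. Each shard is built from
# the same trained index and given a disjoint id range via add_with_ids.
import faiss
import numpy as np

d = 16
rs = np.random.RandomState(0)
trained = faiss.index_factory(d, 'IVF32,Flat')
trained.train(rs.rand(2000, d).astype('float32'))
faiss.write_index(trained, 'trained.index')

for i in range(2):
    shard = faiss.read_index('trained.index')
    xb = rs.rand(500, d).astype('float32')
    ids = np.arange(i * 500, (i + 1) * 500).astype('int64')
    shard.add_with_ids(xb, ids)             # disjoint ids across shards
    faiss.write_index(shard, 'shard_%d.index' % i)

index = faiss.read_index('trained.index')   # still empty, as asserted above
merge_on_disk(index, ['shard_0.index', 'shard_1.index'], 'merged.ivfdata')
faiss.write_index(index, 'populated.index')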
def merge_ondisk(trained_index: faiss.Index,
                 shard_fnames: List[str],
                 ivfdata_fname: str) -> None:
    """Add the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname."""
    assert not isinstance(
        trained_index, faiss.IndexIVFPQR
    ), "IndexIVFPQR is not supported as an on-disk index."
    # merge the shards into an on-disk index
    # first load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # the IO_FLAG_MMAP flag avoids actually loading the data, so the
        # total size of the inverted lists can exceed the available RAM
        LOG.info("read " + fname)
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)
        # keep the invlists from being deallocated with the index
        index_ivf.own_invlists = False

    # construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, "works only on an empty index"

    # prepare the output inverted lists; they will be written
    # to ivfdata_fname
    invlists = faiss.OnDiskInvertedLists(
        index_ivf.nlist, index_ivf.code_size, ivfdata_fname
    )

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    LOG.info("merge %d inverted lists" % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = ntotal
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
def mergeIndexes(indexFolder, machineNumber, finalIndexFile):
    indexesToMerge = []
    indexFiles = os.listdir(os.path.dirname(indexFolder))
    for indexFile in indexFiles:
        parts = indexFile.split('_')
        if len(parts) > 2:
            mnum = int(parts[1][-3:])
            if mnum == machineNumber:
                fullFile = os.path.join(os.path.dirname(indexFolder),
                                        indexFile)
                print('mmapping ', fullFile)
                print('memory usage: ', psutil.virtual_memory().percent)
                index = faiss.read_index(fullFile, faiss.IO_FLAG_MMAP)
                indexesToMerge.append(index.invlists)
                # keep the invlists alive when `index` is rebound on the
                # next iteration (the original omitted this, which would
                # deallocate the invlists out from under the merge)
                index.own_invlists = False

    print('adding final index')
    with open(finalIndexFile, 'rb') as mainIndexFile:
        mainIndexResource = Resource('indexparameters', mainIndexFile.read(),
                                     'application/octet-stream')
    mainIndex, emptyIndex, preproc, map, all_tmp_paths = deserializeIndex(
        mainIndexResource)
    indexesToMerge.append(mainIndex.invlists)

    print("Merging " + str(len(indexesToMerge)) +
          " index shards for final index")
    finalIndex = emptyIndex
    # NOTE: the original referenced self.index_cachefile inside a plain
    # function (a NameError) and took code_size from the last shard read;
    # deriving both from finalIndexFile / finalIndex instead.
    invlists = faiss.OnDiskInvertedLists(
        finalIndex.nlist, finalIndex.code_size,
        os.path.join(os.path.dirname(finalIndexFile), 'merged_index.ivfdata'))
    ivf_vector = faiss.InvertedListsPtrVector()
    bar = progressbar.ProgressBar()
    for ivf in bar(indexesToMerge):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists" % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    finalIndex.ntotal = ntotal
    finalIndex.replace_invlists(invlists)
    print('ntotal: ', finalIndex.ntotal)

    outName = "finalIndex_%03d" % machineNumber
    outPath = os.path.join(os.path.dirname(finalIndexFile), outName)
    binaryIndex = serializeIndex(finalIndexFile, map, machineNumber,
                                 all_tmp_paths)
    with open(outPath, 'wb') as of:
        of.write(binaryIndex)
def do_merge_then_remove(self, ondisk):
    d = 10
    nb = 1000
    nq = 200
    nt = 200
    xt, xb, xq = get_dataset_2(d, nt, nb, nq)

    quantizer = faiss.IndexFlatL2(d)
    index1 = faiss.IndexIVFFlat(quantizer, d, 20)
    index1.train(xt)

    filename = None
    if ondisk:
        filename = tempfile.mkstemp()[1]
        invlists = faiss.OnDiskInvertedLists(
            index1.nlist, index1.code_size, filename)
        index1.replace_invlists(invlists)

    index1.add(xb[:int(nb / 2)])

    index2 = faiss.IndexIVFFlat(quantizer, d, 20)
    assert index2.is_trained
    index2.add(xb[int(nb / 2):])

    Dref, Iref = index1.search(xq, 10)
    index1.merge_from(index2, int(nb / 2))

    assert index1.ntotal == nb

    index1.remove_ids(faiss.IDSelectorRange(int(nb / 2), nb))

    assert index1.ntotal == int(nb / 2)
    Dnew, Inew = index1.search(xq, 10)
    assert np.all(Dnew == Dref)
    assert np.all(Inew == Iref)

    if filename is not None:
        os.unlink(filename)
def mergeIndexList(indexList, emptyIndexPath, outPath, machineNum=""):
    # merge the shard indexes into an on-disk index
    # first load the inverted lists
    ivfs = []
    outDir = os.path.dirname(outPath)
    bar = progressbar.ProgressBar()
    for indexFile in bar(indexList):
        # the IO_FLAG_MMAP flag avoids actually loading the data, so the
        # total size of the inverted lists can exceed the available RAM
        # print("read " + indexFile)
        index = faiss.read_index(indexFile, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)
        # keep the invlists from being deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(emptyIndexPath)

    # prepare the output inverted lists; the original concatenated outDir
    # and the filename without a separator, which dropped the data outside
    # outDir, so os.path.join is used here instead
    ivfDataStr = os.path.join(outDir,
                              "merged_index_" + machineNum + "_.ivfdata")
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         ivfDataStr)

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)
    print("merge %d inverted lists" % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print("write " + outPath)
    faiss.write_index(index, outPath)
    return ivfDataStr
def merge_IVFs(self, index_path: Union[str, Path],
               ivfdata_path: Union[str, Path],
               ivfindex_paths: List[Union[str, Path]] = None) -> int:
    """
    An on-disk index must be built from existing subindexes. The inverted
    file list (IVF) from each subindex is merged into one disk-searchable
    .ivfdata file referenced by the .index file.

    Note: Use self.mv_index_and_ivfdata() to move these files.

    :param index_path: Path to output .index file
    :param ivfdata_path: Path to output .ivfdata file (on-disk searchable data)
    :param ivfindex_paths: Paths to indexes to be merged
    :return: Number of vectors indexed
    """
    # Collect IVF data from subindexes
    ivfs = list()
    if not ivfindex_paths:
        ivfindex_paths = list(self.subindex_path_totals.keys())
    for subindex_path in ivfindex_paths:
        index = faiss.read_index(p.abspath(subindex_path),
                                 faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)
        index.own_invlists = False  # Prevents de-allocation
        del index

    # Prepare .ivfdata file (faiss expects str paths, not pathlib.Path)
    index = self.load_base_idx()
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         str(ivfdata_path))
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    # Merge IVF data
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    index.ntotal = ntotal
    index.replace_invlists(invlists)
    faiss.write_index(index, str(index_path))
    return int(ntotal)
def dumpIndex(indexInMemory, indexOnDiskPath, shardCount):
    if shardCount == 0:
        # first shard: nothing on disk to merge with yet
        faiss.write_index(indexInMemory, indexOnDiskPath)
    else:
        # merge the in-memory invlists with the mmapped on-disk invlists
        # (the original also built an unused `ivfs` list, dropped here)
        indexInMemory.own_invlists = False
        diskIndex = faiss.read_index(indexOnDiskPath, faiss.IO_FLAG_MMAP)
        diskIndex.own_invlists = False

        # merged lists are written to a temporary .ivfdata file in the
        # current working directory
        invlists = faiss.OnDiskInvertedLists(diskIndex.nlist,
                                             diskIndex.code_size,
                                             'mergedIndex_tmp.ivfdata')
        ivf_vector = faiss.InvertedListsPtrVector()
        ivf_vector.push_back(indexInMemory.invlists)
        ivf_vector.push_back(diskIndex.invlists)

        ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
        indexInMemory.ntotal = ntotal
        indexInMemory.replace_invlists(invlists)
        print('Index on disk now has', indexInMemory.ntotal, 'entries')
        faiss.write_index(indexInMemory, indexOnDiskPath)
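
# A hedged sketch of the incremental pattern dumpIndex supports: each batch
# is added to a fresh clone of a trained IVF index, then folded into the
# on-disk index. `trained_index`, `batches`, and 'ondisk.index' are assumed
# placeholders not defined in the original.
import numpy as np

next_id = 0
for shardCount, batch in enumerate(batches):
    shard = faiss.clone_index(trained_index)   # empty, pre-trained IVF index
    ids = np.arange(next_id, next_id + len(batch)).astype('int64')
    shard.add_with_ids(batch.astype('float32'), ids)  # keep ids unique
    next_id += len(batch)
    dumpIndex(shard, 'ondisk.index', shardCount)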
def make_mmap_index(self, base_index: BASE_INDEX, ids: np.ndarray,
                    embs: np.ndarray):
    # Get invlists
    index = faiss.clone_index(base_index)
    index.add_with_ids(embs, ids)
    ivf_vector = faiss.InvertedListsPtrVector()
    ivf_vector.push_back(index.invlists)
    index.own_invlists = False  # keep the invlists alive past `del index`
    del index
    gc.collect()

    # Make MMAP ivfdata
    index_name = p.abspath(self.sub_dir / f'{self.seed_name}')
    invlists = faiss.OnDiskInvertedLists(base_index.nlist,
                                         base_index.code_size,
                                         f'{index_name}.ivfdata')
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # Link index to ivfdata and save
    index = faiss.clone_index(base_index)
    index.ntotal = ntotal
    index.replace_invlists(invlists)
    faiss.write_index(index, f'{index_name}.index')
# a fragment from the faiss on-disk demo; the `ivfs` initialization and the
# loop header over block indexes are a hedged reconstruction of the missing
# context, assuming `tmpdir` and `nblocks` are defined earlier
ivfs = []
for bno in range(nblocks):
    # the IO_FLAG_MMAP is to avoid actually loading the data, thus
    # the total size of the inverted lists can exceed the available RAM
    print("read " + tmpdir + "block_%d.index" % bno)
    index = faiss.read_index(tmpdir + "block_%d.index" % bno,
                             faiss.IO_FLAG_MMAP)
    ivfs.append(index.invlists)

    # keep the invlists from being deallocated with the index
    index.own_invlists = False

# construct the output index
index = faiss.read_index(tmpdir + "trained.index")

# prepare the output inverted lists; they will be written
# to merged_index.ivfdata
invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                     tmpdir + "merged_index.ivfdata")

# merge all the inverted lists
ivf_vector = faiss.InvertedListsPtrVector()
for ivf in ivfs:
    ivf_vector.push_back(ivf)

print("merge %d inverted lists" % ivf_vector.size())
ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

# now replace the inverted lists in the output index
index.ntotal = ntotal
index.replace_invlists(invlists)

print("write " + tmpdir + "populated.index")
faiss.write_index(index, tmpdir + "populated.index")
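
# A hedged follow-up to the snippet above: the populated index can be read
# back and searched as usual (the .ivfdata path stored in it must resolve,
# cf. test_rename earlier); `xq` is assumed query data of matching dimension.
index = faiss.read_index(tmpdir + "populated.index")
index.nprobe = 16              # trade accuracy for speed as with any IVF index
D, I = index.search(xq, 5)
print(I[:3])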
# a fragment of a merge script that additionally crops each inverted list
# file to the cluster range [args.l0, args.l1); the loop header and the
# per-file reads below are a hedged reconstruction of the missing context
ils = faiss.InvertedListsPtrVector()
ils_dont_dealloc = []
index0 = None
for fname in args.inputs:
    index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
    index_ivf = faiss.extract_index_ivf(index)
    index_ivf.own_invlists = False
    il = faiss.downcast_InvertedLists(index_ivf.invlists)
    il.crop_invlists(args.l0, args.l1)
    ils_dont_dealloc.append(il)
    ils.push_back(il)
    if index0 is None:
        index0 = index

print("loaded %d invlists" % ils.size())

if not args.outputIL:
    args.outputIL = args.output + '_invlists'

il0 = ils.at(0)
il = faiss.OnDiskInvertedLists(il0.nlist, il0.code_size, args.outputIL)

print("perform merge")
ntotal = il.merge_from(ils.data(), ils.size(), True)

print("swap into index0")
index0_ivf = faiss.extract_index_ivf(index0)
index0_ivf.nlist = il0.nlist
index0_ivf.ntotal = index0.ntotal = ntotal
index0_ivf.invlists = il
index0_ivf.own_invlists = False

print("write", args.output)
faiss.write_index(index0, args.output)
def run_index(args):
    dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
    dump_paths = sorted([
        os.path.join(args.dump_dir, 'phrase', name)
        for name in dump_names if name.endswith('.hdf5')
    ])

    start_data = None  # was `data = None`, but only start_data is ever read

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            start_data = sample_data(dump_paths,
                                     doc_sample_ratio=args.doc_sample_ratio,
                                     vec_sample_ratio=args.vec_sample_ratio,
                                     norm_th=args.norm_th)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            if start_data is None:
                start_data = sample_data(
                    dump_paths,
                    doc_sample_ratio=args.doc_sample_ratio,
                    vec_sample_ratio=args.vec_sample_ratio,
                    norm_th=args.norm_th, hnsw=args.hnsw)
            train_index(start_data, args.quantizer_path,
                        args.trained_index_path, args.num_clusters,
                        fine_quant=args.fine_quant, cuda=args.cuda,
                        hnsw=args.hnsw)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
            if not os.path.exists(args.subindex_dir):
                os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path,
                         args.idx2id_path, cuda=args.cuda,
                         num_docs_per_add=args.num_docs_per_add,
                         offset=args.offset, norm_th=args.norm_th,
                         fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path,
                          args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                             args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
def run_index(args):
    dump_names = os.listdir(os.path.join(args.dump_dir, args.phrase_dir))
    dump_paths = sorted([
        os.path.join(args.dump_dir, args.phrase_dir, name)
        for name in dump_names if name.endswith('.hdf5')
    ])

    start_data = None  # was `data = None`, but only start_data is ever read

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            start_data, avg_vec, std_vec = sample_data(
                dump_paths,
                doc_sample_ratio=args.doc_sample_ratio,
                vec_sample_ratio=args.vec_sample_ratio,
                norm_th=args.norm_th)
            with open(os.path.join(args.index_dir, 'avg_vec.pkl'), 'wb') as fp:
                pickle.dump(avg_vec, fp)
            with open(os.path.join(args.index_dir, 'std_vec.pkl'), 'wb') as fp:
                pickle.dump(std_vec, fp)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            if start_data is None:
                start_data, avg_vec, std_vec = sample_data(
                    dump_paths,
                    doc_sample_ratio=args.doc_sample_ratio,
                    vec_sample_ratio=args.vec_sample_ratio,
                    norm_th=args.norm_th, hnsw=args.hnsw)
            train_index(start_data, args.quantizer_path,
                        args.trained_index_path, args.num_clusters,
                        fine_quant=args.fine_quant, cuda=args.cuda,
                        hnsw=args.hnsw)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            avg_vec = None
            std_vec = None
            # with open(os.path.join(args.index_dir, 'avg_vec.pkl'), 'rb') as fp:
            #     avg_vec = pickle.load(fp)
            # with open(os.path.join(args.index_dir, 'std_vec.pkl'), 'rb') as fp:
            #     std_vec = pickle.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
            if not os.path.exists(args.subindex_dir):
                os.makedirs(args.subindex_dir)
            add_to_index(
                dump_paths, args.trained_index_path, args.index_path,
                args.idx2id_path, cuda=args.cuda,
                num_docs_per_add=args.num_docs_per_add, offset=args.offset,
                norm_th=args.norm_th, fine_quant=args.fine_quant,
                avg_vec=avg_vec, std_vec=std_vec,
                first_passage=args.first_passage,
                index_filter=args.index_filter,
            )

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path,
                          args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                             args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)