def compute_populated_index(preproc):
    """Add elements to a sharded index. Return the index and if available
    a sharded gpu_index that contains the same data."""

    indexall = prepare_trained_index(preproc)

    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = use_float16
    co.useFloat16CoarseQuantizer = False
    co.usePrecomputed = use_precomputed_tables
    co.indicesOptions = faiss.INDICES_CPU
    co.verbose = 10
    co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
    co.shard = True

    vres, vdev = make_vres_vdev()
    gpu_index = faiss.index_cpu_to_gpu_multiple(
        vres, vdev, indexall, co)

    print("add...")
    t0 = time.time()
    nb = xb.shape[0]
    for i0, xs in dataset_iterator(xb, preproc, add_batch_size):
        i1 = i0 + xs.shape[0]
        gpu_index.add_with_ids(xs, np.arange(i0, i1))
        if max_add > 0 and gpu_index.ntotal > max_add:
            print("Flush indexes to CPU")
            for i in range(ngpu):
                index_src_gpu = faiss.downcast_index(gpu_index.at(i))
                index_src = faiss.index_gpu_to_cpu(index_src_gpu)
                print(" index %d size %d" % (i, index_src.ntotal))
                index_src.copy_subset_to(indexall, 0, 0, nb)
                index_src_gpu.reset()
                index_src_gpu.reserveMemory(max_add)
            gpu_index.sync_with_shard_indexes()

        print('\r%d/%d (%.3f s) ' % (
            i0, nb, time.time() - t0), end=' ')
        sys.stdout.flush()

    print("Add time: %.3f s" % (time.time() - t0))

    print("Aggregate indexes to CPU")
    t0 = time.time()
    for i in range(ngpu):
        index_src = faiss.index_gpu_to_cpu(gpu_index.at(i))
        print(" index %d size %d" % (i, index_src.ntotal))
        index_src.copy_subset_to(indexall, 0, 0, nb)
    print(" done in %.3f s" % (time.time() - t0))

    if max_add > 0:
        # it does not contain all the vectors
        gpu_index = None

    return gpu_index, indexall
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        trained_index = faiss.IndexIVFScalarQuantizer(
            quantizer, quantizer.d, quantizer.ntotal, faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
            # Still train on the CPU so the written index is not left untrained.
            trained_index.train(data)
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)

    faiss.write_index(trained_index, trained_index_path)
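For context, a hypothetical end-to-end invocation of train_index might look like the sketch below; the quantizer construction, paths, dimensions, and data are all illustrative assumptions, not part of the snippet above.

# Illustrative sketch only: build and save a coarse quantizer, then call
# train_index() from the snippet above. Paths, dimensions, and data are made up.
import faiss
import numpy as np

d, n_clusters = 128, 256
data = np.random.rand(10000, d).astype(np.float32)

kmeans = faiss.Kmeans(d, n_clusters)
kmeans.train(data)
quantizer = faiss.IndexFlatL2(d)  # matches the METRIC_L2 used above
quantizer.add(kmeans.centroids)
faiss.write_index(quantizer, "quantizer.faiss")  # hypothetical path

train_index(data, "quantizer.faiss", "trained.faiss", fine_quant="SQ8", cuda=False)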
def save(self, path: str) -> None:
    r"""Save the index and meta data in ``path`` directory. The index
    will be saved as ``index.faiss`` and ``index.meta_data`` respectively
    inside ``path`` directory.

    Args:
        path (str): A path to the directory where the index will be saved
    """
    if os.path.exists(path):
        logging.warning("%s directory already exists. Index will be "
                        "saved into an existing directory", path)
    else:
        os.makedirs(path)

    cpu_index = faiss.index_gpu_to_cpu(self._index) \
        if self._index.__class__.__name__.startswith("Gpu") else self._index
    faiss.write_index(cpu_index, f"{path}/index.faiss")
    with open(f"{path}/index.meta_data", "wb") as f:
        pickle.dump(self._meta_data, f)
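Only save is shown above; a minimal load counterpart, sketched under the assumption that the same _index and _meta_data attributes exist and that an optional GPU move is wanted, could be:

def load(self, path: str, use_gpu: bool = False) -> None:
    # Sketch only: mirrors save() above. The use_gpu flag and the single-GPU
    # move are assumptions, not part of the original class.
    self._index = faiss.read_index(f"{path}/index.faiss")
    if use_gpu:
        res = faiss.StandardGpuResources()
        self._index = faiss.index_cpu_to_gpu(res, 0, self._index)
    with open(f"{path}/index.meta_data", "rb") as f:
        self._meta_data = pickle.load(f)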
def upgrade_indices(self, new_index_type="IDMap,IVF100,PQ8"):
    for column_name, index in self.indices.items():
        if faiss.get_num_gpus() > 0:
            index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), self.rank, index)
        vectors = index.reconstruct_n(0, index.ntotal)
        ids = np.array([index.id_map.at(i) for i in range(index.id_map.size())])
        assert len(vectors) == len(ids)
        new_index = faiss.index_factory(vectors.shape[1], new_index_type)
        if faiss.get_num_gpus() > 0:
            new_index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), self.rank, new_index)
        if not new_index.is_trained:
            new_index.train(vectors)
        new_index.add_with_ids(vectors, ids)
        if faiss.get_num_gpus() > 0:
            new_index = faiss.index_gpu_to_cpu(new_index)
        faiss.write_index(new_index, f"{self.directory}_new/{column_name}.index")
def _save_faiss_model(self):
    """
    Save the index and parameters to the configured DataElements.
    """
    with self._model_lock:
        # Only write to cache elements if they are both writable.
        writable = (self._index_element and self._index_element.writable() and
                    self._index_param_element and self._index_param_element.writable())
        if writable:
            self._log.debug("Storing index: %s", self._index_element)
            # FAISS wants to write to a file, so make a temp file, then
            # read it in, putting bytes into element.
            fd, fp = tempfile.mkstemp()
            try:
                # Write function needs a CPU index instance, so bring it
                # down from the GPU if necessary.
                if self._use_gpu and isinstance(self._faiss_index, faiss.GpuIndex):
                    to_write = faiss.index_gpu_to_cpu(self._faiss_index)
                else:
                    to_write = self._faiss_index
                faiss.write_index(to_write, fp)
                self._index_element.set_bytes(
                    os.read(fd, os.path.getsize(fp)))
            finally:
                os.close(fd)
                os.remove(fp)

            # Store index parameters used.
            params = {
                "factory_string": self.factory_string,
                "read_only": self.read_only,
                "random_seed": self.random_seed,
                "use_multiprocessing": self.use_multiprocessing,
                "next_index": self._next_index,
            }
            self._index_param_element.set_bytes(json.dumps(params))
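The matching restore path is not shown; a rough sketch that mirrors the temp-file handling above (the get_bytes calls and attribute names are assumptions drawn from the snippet, not a verified API) might be:

def _load_faiss_model(self):
    # Sketch only: inverse of _save_faiss_model() above. Assumes the data
    # elements expose get_bytes() and that the same attributes are used.
    with self._model_lock:
        fd, fp = tempfile.mkstemp()
        try:
            os.write(fd, self._index_element.get_bytes())
            self._faiss_index = faiss.read_index(fp)
            if self._use_gpu:
                res = faiss.StandardGpuResources()
                self._faiss_index = faiss.index_cpu_to_gpu(res, 0, self._faiss_index)
        finally:
            os.close(fd)
            os.remove(fp)
        params = json.loads(self._index_param_element.get_bytes())
        self._next_index = params["next_index"]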
assert not index.is_trained
index.train(train_subset)
assert index.is_trained
print("Training index... finished")

subset_i = 0
print("Adding {} train features to index".format(len(label_features)))
for label, features in tqdm(label_features.items()):
    n_features = features.shape[0]
    f = features[:n_features]
    index.add(mat.apply_py(f) if pca else f)
    for n_feature in range(n_features):
        index_dict[subset_i + n_feature] = int(label)
    subset_i += n_features

faiss.write_index(faiss.index_gpu_to_cpu(index) if gpu else index, INDEX_FILENAME)
with open(INDEX_FILENAME_PK, 'wb') as fp:
    pickle.dump(index_dict, fp)
print("Indexed vectors {}".format(index.ntotal))

index.nprobe = 100
files = sorted(glob.glob(FEATURES_NPY))
suffix = "_tk{}".format(args.top_k)
if not args.extract_train_nn:
    test = np.empty((len(files), FEATURES_NUMBER), dtype=np.float32)
    subset_i = 0
    test_ids = []
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 num_docs_per_add=1000,
                 cuda=False,
                 fine_quant='SQ4',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None,
                 avg_vec=None,
                 std_vec=None,
                 first_passage=False,
                 index_filter=-1e8):
    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]

    # filter dumps
    if index_filter != -1e8:
        f_dumps = [
            h5py.File(dump_path.replace('/phrase/', '/filter/'), 'r')
            for dump_path in dump_paths
        ]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    if 'none' not in fine_quant:
        index_ivf = faiss.extract_index_ivf(start_index)
        index_ivf.make_direct_map()
        index_ivf.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)

    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0:
                continue
            cnt += 1

            # First passage only
            if first_passage:
                f2o_start = doc_group['f2o_start'][:]
                cut = sum(f2o_start < doc_group['len_per_para'][0])
                start = int8_to_float(doc_group['start'][:cut],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            # Apply index filter
            elif index_filter != -1e8:
                o2f_start = {
                    orig: ft
                    for ft, orig in enumerate(doc_group['f2o_start'][:])
                }
                filter_start = f_dumps[di][doc_idx]['filter_start'][:]
                filter_end = f_dumps[di][doc_idx]['filter_end'][:]
                start_idxs, = np.where(filter_start > index_filter)
                end_idxs, = np.where(filter_end > index_filter)
                save_idx = set(np.concatenate([start_idxs, end_idxs]))
                save_idx = sorted(
                    [o2f_start[si] for si in save_idx if si in o2f_start])
                start = int8_to_float(doc_group['start'][save_idx],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            else:
                start = int8_to_float(doc_group['start'][:],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            if index_filter == -1e8:
                sidx2word_id.extend(range(num_start))
            else:
                sidx2word_id.extend(save_idx)
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(
                    start_index,
                    concat_vectors(starts),
                    concat_vectors(start_valids),
                    start_total_prev,
                    offset,
                    fine_quant,
                )
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(
                start_index,
                concat_vectors(starts),
                concat_vectors(start_valids),
                start_total_prev,
                offset,
                fine_quant,
            )
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)
    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
def saveindex(self, path):
    print(f"saving {path}")
    t = faiss.index_gpu_to_cpu(self.index)
    faiss.write_index(t, path)
def from_database(cls,
                  con,
                  table,
                  paramstyle,
                  hash_length,
                  ids_train=None,
                  train_size=None,
                  chunksize=100000,
                  metadata_columns=None,
                  index=None,
                  gpu=False,
                  dtype='uint8',
                  distance_metric='euclidean'):
    """Train and build a FAISS index from a database connection.

    Args:
        con: A database connection from which to obtain metadata for
            matched hashes.
        table: The table in the database that we should query for metadata.
        paramstyle: The parameter style for the given database
        hash_length: The length of the hash that is being matched against.
        ids_train: The IDs for the vectors to train on.
        train_size: The number of vectors to use for training. Will be
            randomly selected from 1 to the number of vectors in the
            database. Ignored if ids_train is not None.
        chunksize: The chunks of data to draw from the database at a time
            when adding vectors to the index.
        metadata_columns: The metadata that should be returned for queries.
        index: If a pretrained index is provided, training will be skipped,
            any existing vectors will be discarded, and the index will be
            repopulated with the current contents of the database.
        gpu: If true, will attempt to carry out training on a GPU.
        dtype: The data type for the vectors
        distance_metric: The distance metric for the vectors
    """
    assert dtype == 'uint8', 'Only unsigned 8-bit integer hashes are supported at this time.'
    assert distance_metric == 'euclidean', 'Only euclidean distance is supported at this time.'
    if index is None:
        # Train the index using the practices from
        # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#if-below-1m-vectors-ivfx
        ntotal = pd.read_sql(
            sql="select count(*) as count from hashes",
            con=con).iloc[0]['count']
        assert train_size <= ntotal, 'Cannot train on more hashes than are available.'
        nlist = int(min(4 * np.sqrt(ntotal), ntotal / 39))
        min_train_size = 39 * nlist
        if ids_train is not None:
            train_size = len(ids_train)
        if train_size is None:
            train_size = min_train_size
        assert train_size >= min_train_size, f'Training an index used for {ntotal} hashes requires at least {min_train_size} training hashes.'
        if ids_train is None:
            ids_train = np.random.choice(
                np.arange(ntotal), size=train_size, replace=False)
        df_train = query_by_id(
            con=con,
            table=table,
            ids=ids_train,
            paramstyle=paramstyle,
            extra_columns=['hash'])
        x_train = np.array([
            np.frombuffer(h, dtype=dtype) for h in df_train['hash']
        ]).astype('float32')
        assert x_train.shape[1] == hash_length, 'Hashes are of incorrect length.'
        index = faiss.IndexIVFFlat(
            faiss.IndexFlatL2(hash_length), hash_length, nlist)
        if gpu:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
            gpu_index.train(x_train)
            index = faiss.index_gpu_to_cpu(gpu_index)
        else:
            index.train(x_train)
    else:
        index.reset()

    # Add hashes to the index in chunks.
    for df_add in pd.read_sql(
            sql=f"SELECT id, hash FROM {table}", con=con, chunksize=chunksize):
        x_add = np.array([
            np.frombuffer(h, dtype=dtype) for h in df_add['hash']
        ]).astype('float32')
        index.add_with_ids(x_add, df_add['id'].values)
    return cls(
        con=con,
        index=index,
        hash_length=hash_length,
        distance_metric=distance_metric,
        dtype=dtype,
        table=table,
        paramstyle=paramstyle,
        metadata_columns=metadata_columns)
def _build_ann_index(index_filename: str, embeddings: np.ndarray,
                     precursor_mzs: pd.Series, mz_splits: np.ndarray) -> None:
    """
    Create ANN indexes for the given embedding vectors.

    Vectors will be split over multiple ANN indexes based on the given m/z
    interval.

    Parameters
    ----------
    index_filename: str
        Base file name of the ANN index. Separate indexes for the given m/z
        splits will be created.
    embeddings: np.ndarray
        The embedding vectors to build the ANN index.
    precursor_mzs: pd.Series
        Precursor m/z's corresponding to the embedding vectors used to split
        the embeddings over multiple ANN indexes.
    mz_splits: np.ndarray
        M/z splits used to create separate ANN indexes.
    """
    logger.debug('Use %d GPUs for ANN index construction',
                 faiss.get_num_gpus())
    # Create separate indexes for all embeddings with precursor m/z in the
    # specified intervals.
    for mz in tqdm.tqdm(mz_splits, desc='Indexes built', unit='index'):
        if os.path.isfile(index_filename.format(mz)):
            continue
        # Create an ANN index using Euclidean distance for fast NN queries.
        index_embeddings_ids = _get_precursor_mz_interval_ids(
            precursor_mzs, mz, config.mz_interval, config.precursor_tol_mode,
            config.precursor_tol_mass)
        num_index_embeddings = len(index_embeddings_ids)
        # Figure out a decent value for the num_list hyperparameter based on
        # the number of embeddings. Rules of thumb from the Faiss wiki:
        # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#how-big-is-the-dataset
        if num_index_embeddings == 0:
            continue
        elif num_index_embeddings < 10e5:
            # FIXME: A brute-force index might be better if there are too few
            #        embeddings.
            # Ceil to avoid zero.
            num_list = math.ceil(2**math.floor(
                math.log2(num_index_embeddings / 39)))
        elif num_index_embeddings < 10e6:
            num_list = 2**16
        elif num_index_embeddings < 10e7:
            num_list = 2**18
        else:
            num_list = 2**20
            if num_index_embeddings > 10e8:
                logger.warning('More than 1B embeddings to be indexed, '
                               'consider decreasing the ANN size')
        logger.debug('Build the ANN index for precursor m/z %d–%d '
                     '(%d embeddings, %d lists)', mz, mz + config.mz_interval,
                     num_index_embeddings, num_list)
        # Large datasets won't fit in the GPU memory, so we first train the
        # index on the CPU.
        index_embeddings = embeddings[index_embeddings_ids]
        index_cpu = faiss.IndexIVFFlat(
            faiss.IndexFlatL2(config.embedding_size), config.embedding_size,
            num_list, faiss.METRIC_L2)
        index_cpu.train(index_embeddings)
        # Add the embeddings to the index using the GPU for increased
        # performance. Shard the GPU index over all available GPUs.
        logger.debug('Add %d embeddings to the ANN index',
                     num_index_embeddings)
        # https://github.com/facebookresearch/faiss/blob/2cce2e5f59a5047aa9a1729141e773da9bec6b78/benchs/bench_gpu_1bn.py#L506
        co = faiss.GpuMultipleClonerOptions()
        co.shard = True
        co.useFloat16 = True
        co.useFloat16CoarseQuantizer = False
        co.indicesOptions = faiss.INDICES_CPU
        co.reserveVecs = num_index_embeddings
        index_gpu = faiss.index_cpu_to_all_gpus(index_cpu, co)
        # Add the embeddings in batches to avoid exhausting the GPU memory.
        batch_size = config.batch_size_add
        for batch_start in tqdm.tqdm(range(0, num_index_embeddings,
                                           batch_size),
                                     desc='Batches processed', leave=False,
                                     unit='batch'):
            batch_stop = min(batch_start + batch_size, num_index_embeddings)
            index_gpu.add_with_ids(
                index_embeddings[batch_start:batch_stop],
                index_embeddings_ids[batch_start:batch_stop])
        # Combine the sharded index into a single index and save.
        logger.debug('Save the ANN index to file %s',
                     index_filename.format(mz))
        # https://github.com/facebookresearch/faiss/blob/2cce2e5f59a5047aa9a1729141e773da9bec6b78/benchs/bench_gpu_1bn.py#L544
        if hasattr(index_gpu, 'at'):
            # Sharded index.
            for i in range(index_gpu.count()):
                index_src = faiss.index_gpu_to_cpu(index_gpu.at(i))
                index_src.copy_subset_to(index_cpu, 0, 0,
                                         int(precursor_mzs.index.max()))
                index_gpu.at(i).reset()
        else:
            # Standard index.
            index_src = faiss.index_gpu_to_cpu(index_gpu)
            index_src.copy_subset_to(index_cpu, 0, 0,
                                     int(precursor_mzs.index.max()))
            index_gpu.reset()
        faiss.write_index(index_cpu, index_filename.format(mz))
        index_cpu.reset()
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 num_docs_per_add=1000,
                 cuda=False,
                 fine_quant='SQ4',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None):
    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    start_index.make_direct_map()
    start_index.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)

    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0:
                continue
            cnt += 1

            start = int8_to_float(doc_group['start'][:],
                                  doc_group.attrs['offset'],
                                  doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            sidx2word_id.extend(range(num_start))
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(start_index, concat_vectors(starts),
                                concat_vectors(start_valids),
                                start_total_prev, offset)
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(start_index, concat_vectors(starts),
                            concat_vectors(start_valids), start_total_prev,
                            offset)
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)
    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
def build_index(cfg: DictConfig, model: object):
    """
    Builds faiss index from index dataset specified in the config.

    Args:
        cfg (DictConfig): Config file specifying index parameters
        model (object): Encoder model
    """

    # Get index dataset embeddings
    # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them
    if cfg.apply_pca and os.path.isfile(cfg.pca.pca_save_name) and os.path.isfile(cfg.pca_embeddings_save_name):
        logging.info("Loading reduced dimensionality embeddings")
        embeddings = h5py.File(cfg.pca_embeddings_save_name, "r")
        embeddings = embeddings[cfg.index_ds.name][:]

    elif os.path.isfile(cfg.embedding_save_name):
        logging.info("Loading previously extracted index dataset embeddings")
        embeddings = h5py.File(cfg.embedding_save_name, "r")
        embeddings = embeddings[cfg.index_ds.name][:]

    else:
        logging.info("Encoding index dataset, this may take a while")
        index_dataloader = model.setup_dataloader(cfg.index_ds, is_index_data=True)
        embeddings, concept_ids = get_index_embeddings(cfg, index_dataloader, model)

    # Create pca model to reduce dimensionality of index dataset and decrease memory footprint
    if cfg.apply_pca:
        # Need to train PCA model and apply PCA transformation with newly trained model
        if not os.path.isfile(cfg.pca.pca_save_name):
            logging.info("Fitting PCA model for embedding dimensionality reduction")
            pca_train_set = random.sample(list(embeddings), k=int(len(embeddings) * cfg.pca.sample_fraction))
            pca = PCA(n_components=cfg.pca.output_dim)
            pca.fit(pca_train_set)
            pkl.dump(pca, open(cfg.pca.pca_save_name, "wb"))
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)

        # PCA model already trained, just need to reduce dimensionality of all embeddings
        elif not os.path.isfile(cfg.pca_embeddings_save_name):
            pca = pkl.load(open(cfg.pca.pca_save_name, "rb"))
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)

    # Build faiss index from embeddings
    logging.info(f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus")
    quantizer = faiss.IndexFlatL2(cfg.dims)
    index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist)
    index = faiss.index_cpu_to_all_gpus(index)
    index.train(embeddings)

    logging.info("Adding dataset embeddings to index")
    for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)):
        index.add(embeddings[i:i + cfg.index_batch_size])

    logging.info("Saving index")
    faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name)
    logging.info("Index built and saved")
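A query-side counterpart is not part of the snippet above; a minimal sketch for loading the saved index and searching it (the nprobe value, query batch, and top-k are illustrative assumptions) could be:

# Illustrative search sketch: reload the CPU index written above, optionally
# spread it over the available GPUs, and run a query. Values are made up.
index = faiss.read_index(cfg.index_save_name)
index.nprobe = 16  # illustrative; tune for the recall/latency trade-off
if faiss.get_num_gpus() > 0:
    index = faiss.index_cpu_to_all_gpus(index)
queries = np.random.rand(5, cfg.dims).astype(np.float32)
distances, ids = index.search(queries, 10)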
macro_processor = Processor(macro_prefixes)
logger.info("building index...")
res = faiss.StandardGpuResources()
co = faiss.GpuClonerOptions()
index = faiss.index_factory(args.dim, args.index_type, faiss.METRIC_INNER_PRODUCT)
if args.index_type != "Flat":
    macro_mat_sample = macro_processor.sample_across_mmap_shards(suffix='.emb.npy', sample=1000000)
    logger.info(f"training...")
    index.train(macro_mat_sample)  # train on a large subset of macro data
logger.info("index built!")
logger.info("adding all vectors to index...")
macro_ids = []
for mat_batch, id_batch in macro_processor.iterate_across_mmap_shards(batch_size=args.batch_size):
    faiss.normalize_L2(mat_batch)
    index.add(mat_batch)  # add vectors to the index
    macro_ids.append(id_batch)
macro_ids = np.concatenate(macro_ids, axis=0)
macro_instances = read_dataset(args.text)
if args.device >= 0:
    index = faiss.index_cpu_to_gpu(res, args.device, index, co)
logger.info(f"saving index to {args.serialization_dir}...")
if not os.path.isdir(args.serialization_dir):
    os.mkdir(args.serialization_dir)
faiss.write_index(faiss.index_gpu_to_cpu(index), os.path.join(args.serialization_dir, "faiss.index"))
write_dataset(macro_instances, os.path.join(args.serialization_dir, "text.jsonl"))
np.save(os.path.join(args.serialization_dir, "ids.npy"), macro_ids)
logger.info("done!")
def process(self, scenes, tmp_dir):
    import faiss
    import torch

    z_dim = 512
    index = faiss.IndexFlatL2(z_dim)
    if torch.cuda.is_available():
        print('USING GPU')
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    else:
        print('NOT USING GPU')

    scenes_and_windows = []
    for scene in scenes:
        print('Embedding scene {}'.format(scene.id))
        embeddings = []
        for window, embedding in scene.prediction_label_store.get_labels().get_embeddings():
            scenes_and_windows.append((scene, window))
            embeddings.append(embedding)
        print('.' * len(embeddings), end='', flush=True)
        print(np.array(embeddings).shape)
        index.add(np.array(embeddings))
        print('.')

    # Write index
    print('Writing index...')
    index_path = os.path.join(tmp_dir, 'embeddings.idx')
    if torch.cuda.is_available():
        faiss.write_index(faiss.index_gpu_to_cpu(index), index_path)
    else:
        faiss.write_index(index, index_path)
    if not os.path.exists(index_path):
        raise Exception('Did not write index to {}'.format(index_path))
    else:
        print('Wrote index to {}'.format(index_path))
    print('Uploading to {}'.format(self.config.index_output_uri))
    upload_or_copy(index_path, self.config.index_output_uri)

    # Write scene list.
    def make_scene_list_row(src_scene_id, src_window, i):
        return {
            'idx': i,
            'src_scene_id': src_scene_id,
            'src_window_xmin': src_window.xmin,
            'src_window_ymin': src_window.ymin,
            'src_window_xmax': src_window.xmax,
            'src_window_ymax': src_window.ymax
        }

    print('Writing scene list...')
    rows = []
    for (i, (scene, window)) in enumerate(scenes_and_windows):
        rows.append(
            make_scene_list_row(scene.raster_source.uris[0], window, i))
    df = pd.DataFrame(rows)
    path = os.path.join(tmp_dir, 'scene-windows.csv')
    df.to_csv(path)
    upload_or_copy(path, self.config.window_list_uri)

    if self.config.compute_nearest:
        # Calculate nearest.
        k = 4
        results = []
        for scene in scenes:
            print('Finding {} nearest for scene {}'.format(k, scene.id))
            windows = []
            embeddings = []
            for window, embedding in scene.prediction_label_store.get_labels().get_embeddings():
                windows.append(window)
                embeddings.append(embedding)
            print('.' * len(embeddings), end='', flush=True)
            D, I = index.search(np.array(embeddings), k)
            for i, nearest_idx in enumerate(I):
                nearest = []
                for idx in nearest_idx:
                    near_scene, near_window = scenes_and_windows[idx]
                    nearest.append((near_scene.id, near_window))
                results.append((scene.id, windows[i], nearest))
            print('.')

        # Write results
        def make_nearest_row(src_scene_id, src_window, near_scene_id,
                             near_window, near_rank):
            return {
                'src_scene_id': src_scene_id,
                'src_window_xmin': src_window.xmin,
                'src_window_ymin': src_window.ymin,
                'src_window_xmax': src_window.xmax,
                'src_window_ymax': src_window.ymax,
                'near_scene_id': near_scene_id,
                'near_window_xmin': near_window.xmin,
                'near_window_ymin': near_window.ymin,
                'near_window_xmax': near_window.xmax,
                'near_window_ymax': near_window.ymax,
                'near_rank': near_rank
            }

        print('Writing results...')
        rows = []
        for (src_scene_id, src_window, neighbors) in results:
            for near_rank, (near_scene_id, near_window) in enumerate(neighbors):
                rows.append(
                    make_nearest_row(src_scene_id, src_window, near_scene_id,
                                     near_window, near_rank))
        df = pd.DataFrame(rows)
        path = os.path.join(tmp_dir, 'nearest.csv')
        df.to_csv(path)
        upload_or_copy(path, self.config.nearest_output_uri)
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 max_norm,
                 para=False,
                 num_docs_per_add=1000,
                 num_dummy_zeros=0,
                 cuda=False,
                 fine_quant='SQ8',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None):
    idx2doc_id = []
    idx2para_id = []
    idx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
        else:
            res = faiss.StandardGpuResources()
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)

    if para:
        for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
            starts = []
            for i, (doc_idx, doc_group) in enumerate(
                    tqdm(phrase_dump.items(), desc='faiss indexing')):
                for para_idx, group in doc_group.items():
                    num_vecs = group['start'].shape[0]
                    start = int8_to_float(group['start'][:],
                                          group.attrs['offset'],
                                          group.attrs['scale'])
                    norms = np.linalg.norm(start, axis=1, keepdims=True)
                    consts = np.sqrt(np.maximum(0.0, max_norm**2 - norms**2))
                    start = np.concatenate([consts, start], axis=1)
                    if num_dummy_zeros > 0:
                        start = np.concatenate([
                            start,
                            np.zeros([start.shape[0], num_dummy_zeros],
                                     dtype=start.dtype)
                        ], axis=1)
                    starts.append(start)
                    idx2doc_id.extend([int(doc_idx)] * num_vecs)
                    idx2para_id.extend([int(para_idx)] * num_vecs)
                    idx2word_id.extend(list(range(num_vecs)))
                if len(starts) > 0 and i % num_docs_per_add == 0:
                    print('concatenating')
                    concat = np.concatenate(starts, axis=0)
                    print('adding')
                    add_with_offset(start_index, concat, offset)
                    # start_index.add(concat)
                    print('done')
                    starts = []
                if i % 100 == 0:
                    print('%d/%d' % (i + 1, len(phrase_dump.keys())))
            print('adding leftover')
            add_with_offset(start_index, np.concatenate(starts, axis=0), offset)
            # start_index.add(np.concatenate(starts, axis=0))  # leftover
            print('done')
    else:
        for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
            starts = []
            valids = []
            for i, (doc_idx, doc_group) in enumerate(
                    tqdm(phrase_dump.items(), desc='adding %d' % di)):
                if ignore_ids is not None and doc_idx in ignore_ids:
                    continue
                num_vecs = doc_group['start'].shape[0]
                start = int8_to_float(doc_group['start'][:],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                valid = np.linalg.norm(start, axis=1) <= norm_th
                norms = np.linalg.norm(start, axis=1, keepdims=True)
                consts = np.sqrt(np.maximum(0.0, max_norm**2 - norms**2))
                start = np.concatenate([consts, start], axis=1)
                if num_dummy_zeros > 0:
                    start = np.concatenate([
                        start,
                        np.zeros([start.shape[0], num_dummy_zeros],
                                 dtype=start.dtype)
                    ], axis=1)
                starts.append(start)
                valids.append(valid)
                idx2doc_id.extend([int(doc_idx)] * num_vecs)
                idx2word_id.extend(range(num_vecs))
                if len(starts) > 0 and i % num_docs_per_add == 0:
                    add_with_offset(start_index, np.concatenate(starts, axis=0),
                                    offset, np.concatenate(valids))
                    # start_index.add(np.concatenate(starts, axis=0))
                    starts = []
                    valids = []
                if i % 100 == 0:
                    print('%d/%d' % (i + 1, len(phrase_dump.keys())))
            add_with_offset(start_index, np.concatenate(starts, axis=0),
                            offset, np.concatenate(valids))
            # start_index.add(np.concatenate(starts, axis=0))  # leftover

    for dump in dumps:
        dump.close()

    if cuda and not fine_quant.startswith('PQ'):
        print('moving back to cpu')
        start_index = faiss.index_gpu_to_cpu(start_index)

    print('index ntotal: %d' % start_index.ntotal)
    idx2doc_id = np.array(idx2doc_id, dtype=np.int32)
    idx2para_id = np.array(idx2para_id, dtype=np.int32)
    idx2word_id = np.array(idx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=idx2doc_id)
        g.create_dataset('para', data=idx2para_id)
        g.create_dataset('word', data=idx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
def cpu(self):
    super().cpu()
    self.index = faiss.index_gpu_to_cpu(self.index)
    return self
def save(self, path):
    faiss.write_index(faiss.index_gpu_to_cpu(self.index), path)
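For symmetry, a hypothetical load counterpart (the device argument and the GPU check are assumptions, not part of the original class) could be:

def load(self, path, device=0):
    # Sketch only: read the CPU index back and, if a GPU is present,
    # move it there so it matches the state expected by save() above.
    self.index = faiss.read_index(path)
    if faiss.get_num_gpus() > 0:
        res = faiss.StandardGpuResources()
        self.index = faiss.index_cpu_to_gpu(res, device, self.index)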
def get_distractor_graph(sx_1, k):
    ts = [time.time()]

    def pt():
        ts.append(time.time())
        return " [%.3f s, %.2f GiB]" % (
            ts[-1] - ts[-2], faiss.get_mem_usage_kb() / float(1 << 20))

    ndis, d = sx_1.shape
    print(pt(), "make distractor graph for ndis=%d k=%d" % (ndis, k))

    fname_base = '%s/knngraph/ndis%d' % (os.getenv('DDIR'), ndis)
    print(pt(), "fname_base=", fname_base)

    fname_index = fname_base + ".index"
    if not os.path.exists(fname_index):
        index = make_index(sx_1, preproc=norm_L2)
        print(pt(), "move to CPU")
        index_cpu = faiss.index_gpu_to_cpu(index)
        print(pt(), "store", fname_index)
        faiss.write_index(index_cpu, fname_index)
        del index_cpu
        is_new_index = True
    else:
        print(pt(), "load", fname_index)
        index_cpu = faiss.read_index(fname_index)
        if faiss.get_num_gpus() > 0:
            index = move_index_to_gpu(index_cpu, True)
        else:
            # for run on cluster
            index = index_cpu
        del index_cpu
        is_new_index = False

    D_11, I_11 = None, None

    if not is_new_index:
        # otherwise presumably we should recompute
        for log_ki in range(11):
            ki = 1 << log_ki
            if ki < k:
                continue
            fname_I = fname_base + '_k%d_I_11' % ki
            fname_D = fname_base + '_k%d_D_11' % ki
            if os.path.exists(fname_I + '.npy'):
                fname_D += '.npy'
                fname_I += '.npy'
                print(pt(), 'mmap', fname_D, fname_I)
                D_11 = np.load(fname_D, mmap_mode='r')
                I_11 = np.load(fname_I, mmap_mode='r')
                break
            if os.path.exists(fname_I + '.int%d' % ki):
                fname_I += '.int%d' % ki
                fname_D += '.float%d' % ki
                print(pt(), 'mmap', fname_D, fname_I)
                D_11 = np.memmap(fname_D, mode='r', dtype='float32').reshape(-1, ki)
                I_11 = np.memmap(fname_I, mode='r', dtype='int32').reshape(-1, ki)
                break

    if I_11 is None:
        # it was not computed for this value of ki
        for log_ki in range(11):
            ki = 1 << log_ki
            if ki >= k:
                break
        fname_D = fname_base + '_k%d_D_11.float%d' % (ki, ki)
        fname_I = fname_base + '_k%d_I_11.int%d' % (ki, ki)
        print(pt(), 'writing on-the-fly to ', fname_D, fname_I)
        file_D = open(fname_D, 'w')
        file_I = open(fname_I, 'w')
        search_by_blocks(index, sx_1, ki, preproc=norm_L2,
                         output_files=(file_D, file_I))
        del file_D
        del file_I
        print(pt(), 'mmap', fname_D, fname_I)
        D_11 = np.memmap(fname_D, mode='r', dtype='float32').reshape(-1, ki)
        I_11 = np.memmap(fname_I, mode='r', dtype='int32').reshape(-1, ki)

    assert D_11.shape == I_11.shape
    assert D_11.shape[0] == ndis
    assert D_11.shape[1] >= k

    print(pt(), 'distractor graph ready')
    return index, D_11, I_11
def build_index(embeds_path, output_path, num_clusters=65536, use_gpu=False,
                train_ratio=1.0, embeds_format='labeled_numpy', sort=True, **kwargs):
    # embeds_file_paths = pathex.get_sorted_files_from_all_sub_dirs__(embeds_path, full_path=True)
    # gx.write_all_lines(path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}_files.txt'), embeds_file_paths)
    # text_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.txt')
    # index_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.idx')
    embeds_list, _ = load_embeds(embeds_path=embeds_path, format=embeds_format, sort=sort, **kwargs)

    tic('Initializing index ...')
    if not num_clusters:
        num_clusters = len(embeds_list) // 100
    index = faiss.index_factory(embeds_list[0].shape[-1], f"IVF{num_clusters},Flat",
                                faiss.METRIC_INNER_PRODUCT)
    if use_gpu:
        index = faiss.index_cpu_to_all_gpus(index)

    tic('Concatenating embeddings ...')
    if 0 < train_ratio < 1:
        gx.hprint_message(f"will sample subset for training with ratio {train_ratio}...")
    all_embeds = np.concatenate(embeds_list if train_ratio == 1 else list(
        gx.sampled_iter(embeds_list, train_ratio)))
    toc(msg=f'Initialization done!')

    tic(f'Training embeddings of shape {all_embeds.shape} ...')
    index.train(all_embeds)
    if use_gpu:
        index = faiss.index_gpu_to_cpu(index)
    toc(msg='Index training done!')

    tic('Add embeddings to index ...')
    del all_embeds
    embed_index_start = 0
    for embeds in tqdm(embeds_list):
        embed_count = embeds.shape[0]
        index.add_with_ids(
            embeds, np.arange(embed_index_start, embed_index_start + embed_count))
        embed_index_start += embed_count

    # with open(text_file_path, 'w+') as wf:
    #     for embeds, batch in embeds_iter(embeds_file_paths=embeds_file_paths, embeds_key=embeds_key,
    #                                      sample_file=sample_file, sample_ratio=train_ratio,
    #                                      embeds_idx=embeds_idx, use_tqdm=True, yield_batch=True):
    #         write_all_lines_to_stream(wf=wf, iterable=batch[embeds_txt_key], use_tqdm=False)
    #         embed_count = embeds.shape[0]
    #         index.add_with_ids(embeds, np.arange(embed_index_start, embed_index_start + embed_count))
    #         embed_index_start += embed_count

    if path.exists(output_path):
        os.remove(output_path)
    gx.hprint_message('saving indexed embeddings to', output_path)
    faiss.write_index(index, output_path)
    toc(msg='Indexing done!')
    return index
ids_array = np.fromfile(train_ids_path, sep=' ', dtype='int64')
print("start read train features")
features_array = np.fromfile(train_features_path, sep=' ', dtype='>f4')
print("start reshape")
features_count = len(ids_array)
features = features_array.reshape((features_count, dimensions))

print("start train index1")
index.train(features)
index.add_with_ids(features, ids_array)
print("train index1 done. features count: " + str(features_count))

print("start read add ids")
ids_array = np.fromfile(add_ids_path, sep=' ', dtype='int64')
print("start read add features")
features_array = np.fromfile(add_features_path, sep=' ', dtype='>f4')
print("start reshape")
features_count = len(ids_array)
features = features_array.reshape((features_count, dimensions))
print("start add ids & features in index1")
index.add_with_ids(features, ids_array)
print("add done. features count: " + str(features_count))

# save index1
if USE_GPU:
    index = faiss.index_gpu_to_cpu(index)
faiss.write_index(index, index_output_path)
print("save index1 to file done: " + index_output_path)

process_time = time.time() - start_time
print(process_time)
def write(self, path):
    # faiss.write_index(self.index, path)
    faiss.write_index(faiss.index_gpu_to_cpu(self.index), path)
                       default=1500000)
arguments = argparser.parse_args()

# We need between 983040 and 8388608 training vectors
training_size = int(arguments.training_size)
training_embeddings = numpy.fromfile(arguments.embedding_file, numpy.float32,
                                     training_size * DIMENSIONS)
training_embeddings.resize(training_embeddings.shape[0] // DIMENSIONS, DIMENSIONS)
print("%i training embeddings..." % training_embeddings.shape[0])
faiss.normalize_L2(training_embeddings)
print("Training embeddings normalized...")

index = faiss.index_factory(DIMENSIONS, "OPQ32_128,IVF32768,PQ32")
gpu_index = faiss.index_cpu_to_all_gpus(index)
print("Index created...")

batch_size = int(arguments.batch_size)
gpu_index.train(training_embeddings)
print("Index trained", gpu_index.is_trained)
add_embeddings_to_index(arguments.embedding_file, gpu_index, batch_size)
print("%i embeddings in index" % gpu_index.ntotal)

# Store index to a file
index_to_store = faiss.index_gpu_to_cpu(gpu_index)
print(index_to_store.ntotal)
faiss.write_index(index_to_store, arguments.index_name)
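The add_embeddings_to_index helper referenced above is not shown; one plausible sketch, assuming it streams the same flat float32 file in batches and L2-normalizes each block before adding it, is:

# Plausible sketch of the helper used above; the streaming and normalization
# details are assumptions, not the original implementation.
def add_embeddings_to_index(embedding_file, index, batch_size):
    with open(embedding_file, "rb") as f:
        while True:
            block = numpy.fromfile(f, numpy.float32, batch_size * DIMENSIONS)
            if block.size == 0:
                break
            block = block.reshape(-1, DIMENSIONS)
            faiss.normalize_L2(block)
            index.add(block)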
def to_cpu(self):
    self.index = faiss.index_gpu_to_cpu(self.index)
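The inverse operation is not shown; a minimal to_gpu sketch (the device argument and keeping the resources object on the instance are assumptions) would be:

def to_gpu(self, device=0):
    # Sketch only: counterpart of to_cpu() above. The resources object is kept
    # on the instance so the GPU index stays valid.
    self._gpu_res = faiss.StandardGpuResources()
    self.index = faiss.index_cpu_to_gpu(self._gpu_res, device, self.index)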