Example No. 1
def compute_populated_index(preproc):
    """Add elements to a sharded index. Return the index and if available
    a sharded gpu_index that contains the same data. """

    indexall = prepare_trained_index(preproc)

    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = use_float16
    co.useFloat16CoarseQuantizer = False
    co.usePrecomputed = use_precomputed_tables
    co.indicesOptions = faiss.INDICES_CPU
    co.verbose = 10
    co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
    co.shard = True

    vres, vdev = make_vres_vdev()
    gpu_index = faiss.index_cpu_to_gpu_multiple(
        vres, vdev, indexall, co)

    print("add...")
    t0 = time.time()
    nb = xb.shape[0]
    for i0, xs in dataset_iterator(xb, preproc, add_batch_size):
        i1 = i0 + xs.shape[0]
        gpu_index.add_with_ids(xs, np.arange(i0, i1))
        if max_add > 0 and gpu_index.ntotal > max_add:
            print("Flush indexes to CPU")
            for i in range(ngpu):
                index_src_gpu = faiss.downcast_index(gpu_index.at(i))
                index_src = faiss.index_gpu_to_cpu(index_src_gpu)
                print("  index %d size %d" % (i, index_src.ntotal))
                index_src.copy_subset_to(indexall, 0, 0, nb)
                index_src_gpu.reset()
                index_src_gpu.reserveMemory(max_add)
            gpu_index.sync_with_shard_indexes()

        print('\r%d/%d (%.3f s)  ' % (i0, nb, time.time() - t0), end='')
        sys.stdout.flush()
    print("Add time: %.3f s" % (time.time() - t0))

    print("Aggregate indexes to CPU")
    t0 = time.time()

    for i in range(ngpu):
        index_src = faiss.index_gpu_to_cpu(gpu_index.at(i))
        print("  index %d size %d" % (i, index_src.ntotal))
        index_src.copy_subset_to(indexall, 0, 0, nb)

    print("  done in %.3f s" % (time.time() - t0))

    if max_add > 0:
        # it does not contain all the vectors
        gpu_index = None

    return gpu_index, indexall
Example No. 2
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        # 8-bit scalar quantizer for the fine codes (matches the 'SQ8' option).
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal,
                                                      faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)
    faiss.write_index(trained_index, trained_index_path)
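For reference, a call to this helper might look like the snippet below (a hedged usage sketch: the file names are hypothetical and `data` is assumed to be a float32 NumPy array whose dimensionality matches the saved coarse quantizer):

import numpy as np

# Placeholder training vectors for illustration only.
data = np.random.rand(100000, 768).astype('float32')
train_index(data, 'quantizer.faiss', 'trained_index_sq8.faiss', fine_quant='SQ8', cuda=True)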
Example No. 3
    def save(self, path: str) -> None:
        r"""Save the index and meta data in ``path`` directory. The index
        will be saved as ``index.faiss`` and ``index.meta_data`` respectively
        inside ``path`` directory.

        Args:
            path (str): A path to the directory where the index will be saved

        """

        if os.path.exists(path):
            logging.warning("%s directory already exists. Index will be "
                            "saved into an existing directory", path)
        else:
            os.makedirs(path)

        cpu_index = faiss.index_gpu_to_cpu(self._index) \
            if self._index.__class__.__name__.startswith("Gpu") else self._index
        faiss.write_index(cpu_index, f"{path}/index.faiss")
        with open(f"{path}/index.meta_data", "wb") as f:
            pickle.dump(self._meta_data, f)
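A matching load routine is not part of this example; a minimal counterpart sketch (hypothetical helper, assuming the same index.faiss / index.meta_data layout written by ``save``) could look like this:

import os
import pickle

import faiss


def load(path: str, use_gpu: bool = False):
    # Read the index and meta data written by ``save`` and optionally
    # move the index to GPU 0.
    index = faiss.read_index(os.path.join(path, "index.faiss"))
    if use_gpu:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    with open(os.path.join(path, "index.meta_data"), "rb") as f:
        meta_data = pickle.load(f)
    return index, meta_data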
Example No. 4
File: db.py Project: JCBrouwer/mmss
    def upgrade_indices(self, new_index_type="IDMap,IVF100,PQ8"):
        for column_name, index in self.indices.items():
            if faiss.get_num_gpus() > 0:
                index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), self.rank, index)

            vectors = index.reconstruct_n(0, index.ntotal)
            ids = np.array([index.id_map.at(i) for i in range(index.id_map.size())])
            assert len(vectors) == len(ids)

            new_index = faiss.index_factory(vectors.shape[1], new_index_type)
            if faiss.get_num_gpus() > 0:
                new_index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), self.rank, new_index)

            if not new_index.is_trained:
                new_index.train(vectors)

            new_index.add_with_ids(vectors, ids)

            if faiss.get_num_gpus() > 0:
                new_index = faiss.index_gpu_to_cpu(new_index)

            faiss.write_index(new_index, f"{self.directory}_new/{column_name}.index")
Example No. 5
    def _save_faiss_model(self):
        """
        Save the index and parameters to the configured DataElements.
        """
        with self._model_lock:
            # Only write to cache elements if they are both writable.
            writable = (self._index_element and self._index_element.writable()
                        and self._index_param_element
                        and self._index_param_element.writable())
            if writable:
                self._log.debug("Storing index: %s", self._index_element)
                # FAISS wants to write to a file, so make a temp file, then
                # read it in, putting bytes into element.
                fd, fp = tempfile.mkstemp()
                try:
                    # Write function needs a CPU index instance, so bring it
                    # down from the GPU if necessary.
                    if self._use_gpu and isinstance(self._faiss_index,
                                                    faiss.GpuIndex):
                        to_write = faiss.index_gpu_to_cpu(self._faiss_index)
                    else:
                        to_write = self._faiss_index
                    faiss.write_index(to_write, fp)
                    self._index_element.set_bytes(
                        os.read(fd, os.path.getsize(fp)))
                finally:
                    os.close(fd)
                    os.remove(fp)
                # Store index parameters used.
                params = {
                    "factory_string": self.factory_string,
                    "read_only": self.read_only,
                    "random_seed": self.random_seed,
                    "use_multiprocessing": self.use_multiprocessing,
                    "next_index": self._next_index,
                }
                self._index_param_element.set_bytes(json.dumps(params))
Example No. 6
        assert not index.is_trained
        index.train(train_subset)
        assert index.is_trained
        print("Training index... finished")

    subset_i = 0
    print("Adding {} train features to index".format(len(label_features)))
    for label, features in tqdm(label_features.items()):
        n_features = features.shape[0]
        f = features[:n_features]
        index.add(mat.apply_py(f) if pca else f)
        for n_feature in range(n_features):
            index_dict[subset_i+n_feature] = int(label)
        subset_i += n_features

    faiss.write_index(faiss.index_gpu_to_cpu(index) if gpu else index, INDEX_FILENAME)

    with open(INDEX_FILENAME_PK, 'wb') as fp:
        pickle.dump(index_dict, fp)

print("Indexed vectors {}".format(index.ntotal))
index.nprobe = 100

files = sorted(glob.glob(FEATURES_NPY))

suffix = "_tk{}".format(args.top_k)

if not args.extract_train_nn:
    test = np.empty((len(files), FEATURES_NUMBER), dtype=np.float32)
    subset_i = 0
    test_ids = []
Example No. 7
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 num_docs_per_add=1000,
                 cuda=False,
                 fine_quant='SQ4',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None,
                 avg_vec=None,
                 std_vec=None,
                 first_passage=False,
                 index_filter=-1e8):

    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]
    # filter dumps
    if index_filter != -1e8:
        f_dumps = [
            h5py.File(dump_path.replace('/phrase/', '/filter/'), 'r')
            for dump_path in dump_paths
        ]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    if 'none' not in fine_quant:
        index_ivf = faiss.extract_index_ivf(start_index)
        index_ivf.make_direct_map()
        index_ivf.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)
    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0: continue
            cnt += 1

            # First passage only
            if first_passage:
                f2o_start = doc_group['f2o_start'][:]
                cut = sum(f2o_start < doc_group['len_per_para'][0])
                start = int8_to_float(doc_group['start'][:cut],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            # Apply index filter
            elif index_filter != -1e8:
                o2f_start = {
                    orig: ft
                    for ft, orig in enumerate(doc_group['f2o_start'][:])
                }
                filter_start = f_dumps[di][doc_idx]['filter_start'][:]
                filter_end = f_dumps[di][doc_idx]['filter_end'][:]
                start_idxs, = np.where(filter_start > index_filter)
                end_idxs, = np.where(filter_end > index_filter)
                save_idx = set(np.concatenate([start_idxs, end_idxs]))
                save_idx = sorted(
                    [o2f_start[si] for si in save_idx if si in o2f_start])
                start = int8_to_float(doc_group['start'][save_idx],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            else:
                start = int8_to_float(doc_group['start'][:],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            if index_filter == -1e8:
                sidx2word_id.extend(range(num_start))
            else:
                sidx2word_id.extend(save_idx)
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(
                    start_index,
                    concat_vectors(starts),
                    concat_vectors(start_valids),
                    start_total_prev,
                    offset,
                    fine_quant,
                )
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(
                start_index,
                concat_vectors(starts),
                concat_vectors(start_valids),
                start_total_prev,
                offset,
                fine_quant,
            )
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)
    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
Example No. 8
    def saveindex(self, path):
        print(f"saving {path}")
        t = faiss.index_gpu_to_cpu(self.index)
        faiss.write_index(t, path)
Example No. 9
    def from_database(cls,
                      con,
                      table,
                      paramstyle,
                      hash_length,
                      ids_train=None,
                      train_size=None,
                      chunksize=100000,
                      metadata_columns=None,
                      index=None,
                      gpu=False,
                      dtype='uint8',
                      distance_metric='euclidean'):
        """Train and build a FAISS index from a database connection.

        Args:
            con: A database connection from which to obtain metadata for
                matched hashes.
            table: The table in the database that we should query for metadata.
            paramstyle: The parameter style for the given database
            hash_length: The length of the hash that is being matched against.
            ids_train: The IDs for the vectors to train on.
            train_size: The number of vectors to use for training. Will be
                randomly selected from 1 to the number of vectors in the database.
                Ignored if ids_train is not None.
            chunksize: The chunks of data to draw from the database at a time
                when adding vectors to the index.
            metadata_columns: The metadata that should be returned for queries.
            index: If a pretrained index is provided, training will be skipped,
                any existing vectors will be discarded, and the index will be
                repopulated with the current contents of the database.
            gpu: If true, will attempt to carry out training on a GPU.
            dtype: The data type for the vectors
            distance_metric: The distance metric for the vectors
        """
        assert dtype == 'uint8', 'Only unsigned 8-bit integer hashes are supported at this time.'
        assert distance_metric == 'euclidean', 'Only euclidean distance is supported at this time.'
        if index is None:
            # Train the index using the practices from
            # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#if-below-1m-vectors-ivfx
            ntotal = pd.read_sql(
                sql="select count(*) as count from hashes",
                con=con).iloc[0]['count']
            assert train_size <= ntotal, 'Cannot train on more hashes than are available.'
            nlist = int(min(4 * np.sqrt(ntotal), ntotal / 39))
            min_train_size = 39 * nlist
            if ids_train is not None:
                train_size = len(ids_train)
            if train_size is None:
                train_size = min_train_size
            assert train_size >= min_train_size, f'Training an index used for {ntotal} hashes requires at least {min_train_size} training hashes.'
            if ids_train is None:
                ids_train = np.random.choice(
                    np.arange(ntotal), size=train_size, replace=False)
            df_train = query_by_id(
                con=con,
                table=table,
                ids=ids_train,
                paramstyle=paramstyle,
                extra_columns=['hash'])
            x_train = np.array([
                np.frombuffer(h, dtype=dtype) for h in df_train['hash']
            ]).astype('float32')
            assert x_train.shape[
                1] == hash_length, 'Hashes are of incorrect length.'

            index = faiss.IndexIVFFlat(
                faiss.IndexFlatL2(hash_length), hash_length, nlist)
            if gpu:
                res = faiss.StandardGpuResources()
                gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
                gpu_index.train(x_train)
                index = faiss.index_gpu_to_cpu(gpu_index)
            else:
                index.train(x_train)
        else:
            index.reset()

        # Add hashes to the index in chunks.
        for df_add in pd.read_sql(
                sql=f"SELECT id, hash FROM {table}", con=con,
                chunksize=chunksize):
            x_add = np.array([
                np.frombuffer(h, dtype=dtype) for h in df_add['hash']
            ]).astype('float32')
            index.add_with_ids(x_add, df_add['id'].values)
        return cls(
            con=con,
            index=index,
            hash_length=hash_length,
            distance_metric=distance_metric,
            dtype=dtype,
            table=table,
            paramstyle=paramstyle,
            metadata_columns=metadata_columns)
Example No. 10
def _build_ann_index(index_filename: str, embeddings: np.ndarray,
                     precursor_mzs: pd.Series, mz_splits: np.ndarray) -> None:
    """
    Create ANN indexes for the given embedding vectors.

    Vectors will be split over multiple ANN indexes based on the given m/z
    interval.

    Parameters
    ----------
    index_filename: str
        Base file name of the ANN index. Separate indexes for the given m/z
        splits will be created.
    embeddings: np.ndarray
        The embedding vectors to build the ANN index.
    precursor_mzs: pd.Series
        Precursor m/z's corresponding to the embedding vectors used to split
        the embeddings over multiple ANN indexes.
    mz_splits: np.ndarray
        M/z splits used to create separate ANN indexes.
    """
    logger.debug('Use %d GPUs for ANN index construction',
                 faiss.get_num_gpus())
    # Create separate indexes for all embeddings with precursor m/z in the
    # specified intervals.
    for mz in tqdm.tqdm(mz_splits, desc='Indexes built', unit='index'):
        if os.path.isfile(index_filename.format(mz)):
            continue
        # Create an ANN index using Euclidean distance for fast NN queries.
        index_embeddings_ids = _get_precursor_mz_interval_ids(
            precursor_mzs, mz, config.mz_interval, config.precursor_tol_mode,
            config.precursor_tol_mass)
        num_index_embeddings = len(index_embeddings_ids)
        # Figure out a decent value for the num_list hyperparameter based on
        # the number of embeddings. Rules of thumb from the Faiss wiki:
        # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#how-big-is-the-dataset
        if num_index_embeddings == 0:
            continue
        elif num_index_embeddings < 10e5:
            # FIXME: A brute-force index might be better if there are too few
            #  embeddings.
            # Ceil to avoid zero.
            num_list = math.ceil(2**math.floor(
                math.log2(num_index_embeddings / 39)))
        elif num_index_embeddings < 10e6:
            num_list = 2**16
        elif num_index_embeddings < 10e7:
            num_list = 2**18
        else:
            num_list = 2**20
            if num_index_embeddings > 10e8:
                logger.warning('More than 1B embeddings to be indexed, '
                               'consider decreasing the ANN size')
        logger.debug(
            'Build the ANN index for precursor m/z %d–%d '
            '(%d embeddings, %d lists)', mz, mz + config.mz_interval,
            num_index_embeddings, num_list)
        # Large datasets won't fit in the GPU memory, so we first train the
        # index on the CPU.
        index_embeddings = embeddings[index_embeddings_ids]
        index_cpu = faiss.IndexIVFFlat(
            faiss.IndexFlatL2(config.embedding_size), config.embedding_size,
            num_list, faiss.METRIC_L2)
        index_cpu.train(index_embeddings)
        # Add the embeddings to the index using the GPU for increased
        # performance. Shard the GPU index over all available GPUs.
        logger.debug('Add %d embeddings to the ANN index',
                     num_index_embeddings)
        # https://github.com/facebookresearch/faiss/blob/2cce2e5f59a5047aa9a1729141e773da9bec6b78/benchs/bench_gpu_1bn.py#L506
        co = faiss.GpuMultipleClonerOptions()
        co.shard = True
        co.useFloat16 = True
        co.useFloat16CoarseQuantizer = False
        co.indicesOptions = faiss.INDICES_CPU
        co.reserveVecs = num_index_embeddings
        index_gpu = faiss.index_cpu_to_all_gpus(index_cpu, co)
        # Add the embeddings in batches to avoid exhausting the GPU memory.
        batch_size = config.batch_size_add
        for batch_start in tqdm.tqdm(range(0, num_index_embeddings,
                                           batch_size),
                                     desc='Batches processed',
                                     leave=False,
                                     unit='batch'):
            batch_stop = min(batch_start + batch_size, num_index_embeddings)
            index_gpu.add_with_ids(
                index_embeddings[batch_start:batch_stop],
                index_embeddings_ids[batch_start:batch_stop])
        # Combine the sharded index into a single index and save.
        logger.debug('Save the ANN index to file %s',
                     index_filename.format(mz))
        # https://github.com/facebookresearch/faiss/blob/2cce2e5f59a5047aa9a1729141e773da9bec6b78/benchs/bench_gpu_1bn.py#L544
        if hasattr(index_gpu, 'at'):  # Sharded index.
            for i in range(index_gpu.count()):
                index_src = faiss.index_gpu_to_cpu(index_gpu.at(i))
                index_src.copy_subset_to(index_cpu, 0, 0,
                                         int(precursor_mzs.index.max()))
                index_gpu.at(i).reset()
        else:  # Standard index.
            index_src = faiss.index_gpu_to_cpu(index_gpu)
            index_src.copy_subset_to(index_cpu, 0, 0,
                                     int(precursor_mzs.index.max()))
            index_gpu.reset()
        faiss.write_index(index_cpu, index_filename.format(mz))
        index_cpu.reset()
Example No. 11
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 num_docs_per_add=1000,
                 cuda=False,
                 fine_quant='SQ4',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None):

    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]
    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    start_index.make_direct_map()
    start_index.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)
    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0: continue
            cnt += 1

            start = int8_to_float(doc_group['start'][:],
                                  doc_group.attrs['offset'],
                                  doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            sidx2word_id.extend(range(num_start))
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(start_index, concat_vectors(starts),
                                concat_vectors(start_valids), start_total_prev,
                                offset)
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(start_index, concat_vectors(starts),
                            concat_vectors(start_valids), start_total_prev,
                            offset)
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)
    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
Example No. 12
def build_index(cfg: DictConfig, model: object):
    """
    Builds faiss index from index dataset specified in the config.
        
    Args:
        cfg (DictConfig): Config file specifying index parameters
        model (object): Encoder model
    """

    # Get index dataset embeddings
    # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them
    if cfg.apply_pca and os.path.isfile(
            cfg.pca.pca_save_name) and os.path.isfile(
                cfg.pca_embeddings_save_name):
        logging.info("Loading reduced dimensionality embeddings")
        embeddings = h5py.File(cfg.pca_embeddings_save_name, "r")
        embeddings = embeddings[cfg.index_ds.name][:]

    elif os.path.isfile(cfg.embedding_save_name):
        logging.info("Loading previously extracted index dataset embeddings")
        embeddings = h5py.File(cfg.embedding_save_name, "r")
        embeddings = embeddings[cfg.index_ds.name][:]

    else:
        logging.info("Encoding index dataset, this may take a while")
        index_dataloader = model.setup_dataloader(cfg.index_ds,
                                                  is_index_data=True)
        embeddings, concept_ids = get_index_embeddings(cfg, index_dataloader,
                                                       model)

    # Create pca model to reduce dimensionality of index dataset and decrease memory footprint
    if cfg.apply_pca:

        # Need to train PCA model and apply PCA transformation with newly trained model
        if not os.path.isfile(cfg.pca.pca_save_name):
            logging.info(
                "Fitting PCA model for embedding dimensionality reduction")
            pca_train_set = random.sample(
                list(embeddings),
                k=int(len(embeddings) * cfg.pca.sample_fraction))
            pca = PCA(n_components=cfg.pca.output_dim)
            pca.fit(pca_train_set)
            pkl.dump(pca, open(cfg.pca.pca_save_name, "wb"))
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)

        # PCA model already trained, just need to reduce dimensionality of all embeddings
        elif not os.path.isfile(cfg.pca_embeddings_save_name):
            pca = pkl.load(open(cfg.pca.pca_save_name, "rb"))
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)

    # Build faiss index from embeddings
    logging.info(
        f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus"
    )
    quantizer = faiss.IndexFlatL2(cfg.dims)
    index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist)
    index = faiss.index_cpu_to_all_gpus(index)
    index.train(embeddings)

    logging.info("Adding dataset embeddings to index")
    for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)):
        index.add(embeddings[i:i + cfg.index_batch_size])

    logging.info("Saving index")
    faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name)
    logging.info("Index built and saved")
Example No. 13
    macro_processor = Processor(macro_prefixes)

    logger.info("building index...")
    res = faiss.StandardGpuResources()
    co = faiss.GpuClonerOptions()
    index = faiss.index_factory(args.dim, args.index_type, faiss.METRIC_INNER_PRODUCT)
    if args.index_type != "Flat":
        macro_mat_sample = macro_processor.sample_across_mmap_shards(suffix='.emb.npy', sample=1000000)
        logger.info(f"training...")
        index.train(macro_mat_sample) # train on a large subset of macro data
    logger.info("index built!")
    logger.info("adding all vectors to index...")
    macro_ids = []
    for mat_batch, id_batch in macro_processor.iterate_across_mmap_shards(batch_size=args.batch_size):
        faiss.normalize_L2(mat_batch)
        index.add(mat_batch)   # add vectors to the index
        macro_ids.append(id_batch)
    macro_ids = np.concatenate(macro_ids, axis=0)

    macro_instances = read_dataset(args.text)

    if args.device >= 0:
        index = faiss.index_cpu_to_gpu(res, args.device, index, co)

    logger.info(f"saving index to {args.serialization_dir}...")
    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)
    faiss.write_index(faiss.index_gpu_to_cpu(index), os.path.join(args.serialization_dir, "faiss.index"))
    write_dataset(macro_instances,  os.path.join(args.serialization_dir, "text.jsonl"))
    np.save(os.path.join(args.serialization_dir, "ids.npy"), macro_ids)
    logger.info("done!")
Example No. 14
    def process(self, scenes, tmp_dir):
        import faiss
        import torch
        z_dim = 512
        index = faiss.IndexFlatL2(z_dim)
        if torch.cuda.is_available():
            print('USING GPU')
            res = faiss.StandardGpuResources()
            index = faiss.index_cpu_to_gpu(res, 0, index)
        else:
            print('NOT USING GPU')
        scenes_and_windows = []
        for scene in scenes:
            print('Embedding scene {}'.format(scene.id))
            embeddings = []
            for window, embedding in scene.prediction_label_store.get_labels(
            ).get_embeddings():
                scenes_and_windows.append((scene, window))
                embeddings.append(embedding)
            print('.' * len(embeddings), end='', flush=True)
            print(np.array(embeddings).shape)
            index.add(np.array(embeddings))
        print('.')

        # Write index
        print('Writing index...')
        index_path = os.path.join(tmp_dir, 'embeddings.idx')
        if torch.cuda.is_available():
            faiss.write_index(faiss.index_gpu_to_cpu(index), index_path)
        else:
            faiss.write_index(index, index_path)
        if not os.path.exists(index_path):
            raise Exception('Did not write index to {}'.format(index_path))
        else:
            print('Wrote index to {}'.format(index_path))
            print('Uploading to {}'.format(self.config.index_output_uri))
        upload_or_copy(index_path, self.config.index_output_uri)

        # Write scene list.
        def make_scene_list_row(src_scene_id, src_window, i):
            return {
                'idx': i,
                'src_scene_id': src_scene_id,
                'src_window_xmin': src_window.xmin,
                'src_window_ymin': src_window.ymin,
                'src_window_xmax': src_window.xmax,
                'src_window_ymax': src_window.ymax
            }

        print('Writing scene list...')
        rows = []
        for (i, (scene, window)) in enumerate(scenes_and_windows):
            rows.append(
                make_scene_list_row(scene.raster_source.uris[0], window, i))
        df = pd.DataFrame(rows)

        path = os.path.join(tmp_dir, 'scene-windows.csv')
        df.to_csv(path)

        upload_or_copy(path, self.config.window_list_uri)

        if self.config.compute_nearest:
            # Calculate nearest.
            k = 4
            results = []
            for scene in scenes:
                print('Finding {} nearest for scene {}'.format(k, scene.id))
                windows = []
                embeddings = []
                for window, embedding in scene.prediction_label_store.get_labels(
                ).get_embeddings():
                    windows.append(window)
                    embeddings.append(embedding)
                print('.' * len(embeddings), end='', flush=True)
                D, I = index.search(np.array(embeddings), k)
                for i, nearest_idx in enumerate(I):
                    nearest = []
                    for idx in nearest_idx:
                        near_scene, near_window = scenes_and_windows[idx]
                        nearest.append((near_scene.id, near_window))
                    results.append((scene.id, windows[i], nearest))
            print('.')

            # Write results
            def make_nearest_row(src_scene_id, src_window, near_scene_id,
                                 near_window, near_rank):
                return {
                    'src_scene_id': src_scene_id,
                    'src_window_xmin': src_window.xmin,
                    'src_window_ymin': src_window.ymin,
                    'src_window_xmax': src_window.xmax,
                    'src_window_ymax': src_window.ymax,
                    'near_scene_id': near_scene_id,
                    'near_window_xmin': near_window.xmin,
                    'near_window_ymin': near_window.ymin,
                    'near_window_xmax': near_window.xmax,
                    'near_window_ymax': near_window.ymax,
                    'near_rank': near_rank
                }

            print('Writing results...')
            rows = []
            for (src_scene_id, src_window, neighbors) in results:
                for near_rank, (near_scene_id,
                                near_window) in enumerate(neighbors):
                    rows.append(
                        make_nearest_row(src_scene_id, src_window,
                                         near_scene_id, near_window,
                                         near_rank))
            df = pd.DataFrame(rows)

            path = os.path.join(tmp_dir, 'nearest.csv')
            df.to_csv(path)

            upload_or_copy(path, self.config.nearest_output_uri)
Example No. 15
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 max_norm,
                 para=False,
                 num_docs_per_add=1000,
                 num_dummy_zeros=0,
                 cuda=False,
                 fine_quant='SQ8',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None):
    idx2doc_id = []
    idx2para_id = []
    idx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]
    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
        else:
            res = faiss.StandardGpuResources()
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)
    if para:
        for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
            starts = []
            for i, (doc_idx, doc_group) in enumerate(
                    tqdm(phrase_dump.items(), desc='faiss indexing')):
                for para_idx, group in doc_group.items():
                    num_vecs = group['start'].shape[0]
                    start = int8_to_float(group['start'][:],
                                          group.attrs['offset'],
                                          group.attrs['scale'])
                    norms = np.linalg.norm(start, axis=1, keepdims=True)
                    consts = np.sqrt(np.maximum(0.0, max_norm**2 - norms**2))
                    start = np.concatenate([consts, start], axis=1)
                    if num_dummy_zeros > 0:
                        start = np.concatenate([
                            start,
                            np.zeros([start.shape[0], num_dummy_zeros],
                                     dtype=start.dtype)
                        ],
                                               axis=1)
                    starts.append(start)
                    idx2doc_id.extend([int(doc_idx)] * num_vecs)
                    idx2para_id.extend([int(para_idx)] * num_vecs)
                    idx2word_id.extend(list(range(num_vecs)))
                if len(starts) > 0 and i % num_docs_per_add == 0:
                    print('concatenating')
                    concat = np.concatenate(starts, axis=0)
                    print('adding')
                    add_with_offset(start_index, concat, offset)
                    # start_index.add(concat)
                    print('done')
                    starts = []
                if i % 100 == 0:
                    print('%d/%d' % (i + 1, len(phrase_dump.keys())))
            print('adding leftover')
            add_with_offset(start_index, np.concatenate(starts, axis=0),
                            offset)
            # start_index.add(np.concatenate(starts, axis=0))  # leftover
            print('done')
    else:
        for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
            starts = []
            valids = []
            for i, (doc_idx, doc_group) in enumerate(
                    tqdm(phrase_dump.items(), desc='adding %d' % di)):
                if ignore_ids is not None and doc_idx in ignore_ids:
                    continue
                num_vecs = doc_group['start'].shape[0]
                start = int8_to_float(doc_group['start'][:],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                valid = np.linalg.norm(start, axis=1) <= norm_th
                norms = np.linalg.norm(start, axis=1, keepdims=True)
                consts = np.sqrt(np.maximum(0.0, max_norm**2 - norms**2))
                start = np.concatenate([consts, start], axis=1)
                if num_dummy_zeros > 0:
                    start = np.concatenate([
                        start,
                        np.zeros([start.shape[0], num_dummy_zeros],
                                 dtype=start.dtype)
                    ],
                                           axis=1)
                starts.append(start)
                valids.append(valid)
                idx2doc_id.extend([int(doc_idx)] * num_vecs)
                idx2word_id.extend(range(num_vecs))
                if len(starts) > 0 and i % num_docs_per_add == 0:
                    add_with_offset(start_index,
                                    np.concatenate(starts, axis=0), offset,
                                    np.concatenate(valids))
                    # start_index.add(np.concatenate(starts, axis=0))
                    starts = []
                    valids = []
                if i % 100 == 0:
                    print('%d/%d' % (i + 1, len(phrase_dump.keys())))
            add_with_offset(start_index, np.concatenate(starts, axis=0),
                            offset, np.concatenate(valids))
            # start_index.add(np.concatenate(starts, axis=0))  # leftover

    for dump in dumps:
        dump.close()

    if cuda and not fine_quant.startswith('PQ'):
        print('moving back to cpu')
        start_index = faiss.index_gpu_to_cpu(start_index)

    print('index ntotal: %d' % start_index.ntotal)
    idx2doc_id = np.array(idx2doc_id, dtype=np.int32)
    idx2para_id = np.array(idx2para_id, dtype=np.int32)
    idx2word_id = np.array(idx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=idx2doc_id)
        g.create_dataset('para', data=idx2para_id)
        g.create_dataset('word', data=idx2word_id)
        g.attrs['offset'] = offset
    faiss.write_index(start_index, target_index_path)
    print('done')
Example No. 16
    def cpu(self):
        super().cpu()
        self.index = faiss.index_gpu_to_cpu(self.index)

        return self
Example No. 17
    def save(self, path):
        faiss.write_index(faiss.index_gpu_to_cpu(self.index), path)
Example No. 18
def get_distractor_graph(sx_1, k):

    ts = [time.time()]

    def pt():
        ts.append(time.time())
        return "  [%.3f s, %.2f GiB]" % (
            ts[-1] - ts[-2], faiss.get_mem_usage_kb() / float(1 << 20))

    ndis, d = sx_1.shape

    print(pt(), "make distractor graph for ndis=%d k=%d" % (ndis, k))

    fname_base = '%s/knngraph/ndis%d' % (os.getenv('DDIR'), ndis)

    print(pt(), "fname_base=", fname_base)

    fname_index = fname_base + ".index"

    if not os.path.exists(fname_index):

        index = make_index(sx_1, preproc=norm_L2)

        print(pt(), "move to CPU")
        index_cpu = faiss.index_gpu_to_cpu(index)

        print(pt(), "store", fname_index)
        faiss.write_index(index_cpu, fname_index)
        del index_cpu
        is_new_index = True
    else:

        print(pt(), "load", fname_index)
        index_cpu = faiss.read_index(fname_index)

        if faiss.get_num_gpus() > 0:
            index = move_index_to_gpu(index_cpu, True)
        else:
            # for run on cluster
            index = index_cpu
        del index_cpu
        is_new_index = False

    D_11, I_11 = None, None

    if not is_new_index:
        # otherwise presumably we should recompute
        for log_ki in range(11):
            ki = 1 << log_ki
            if ki < k: continue

            fname_I = fname_base + '_k%d_I_11' % ki
            fname_D = fname_base + '_k%d_D_11' % ki

            if os.path.exists(fname_I + '.npy'):
                fname_D += '.npy'
                fname_I += '.npy'
                print(pt(), 'mmap', fname_D, fname_I)
                D_11 = np.load(fname_D, mmap_mode='r')
                I_11 = np.load(fname_I, mmap_mode='r')
                break

            if os.path.exists(fname_I + '.int%d' % ki):
                fname_I += '.int%d' % ki
                fname_D += '.float%d' % ki
                print(pt(), 'mmap', fname_D, fname_I)
                D_11 = np.memmap(fname_D, mode='r',
                                 dtype='float32').reshape(-1, ki)
                I_11 = np.memmap(fname_I, mode='r',
                                 dtype='int32').reshape(-1, ki)
                break

    if I_11 is None:
        # it was not computed for this value of ki
        for log_ki in range(11):
            ki = 1 << log_ki
            if ki >= k: break

        fname_D = fname_base + '_k%d_D_11.float%d' % (ki, ki)
        fname_I = fname_base + '_k%d_I_11.int%d' % (ki, ki)

        print(pt(), 'writing on-the-fly to ', fname_D, fname_I)
        file_D = open(fname_D, 'w')
        file_I = open(fname_I, 'w')

        search_by_blocks(index,
                         sx_1,
                         ki,
                         preproc=norm_L2,
                         output_files=(file_D, file_I))

        del file_D
        del file_I

        print(pt(), 'mmap', fname_D, fname_I)
        D_11 = np.memmap(fname_D, mode='r', dtype='float32').reshape(-1, ki)
        I_11 = np.memmap(fname_I, mode='r', dtype='int32').reshape(-1, ki)

    assert D_11.shape == I_11.shape
    assert D_11.shape[0] == ndis
    assert D_11.shape[1] >= k

    print(pt(), 'distractor graph ready')
    return index, D_11, I_11
Example No. 19
def build_index(embeds_path,
                output_path,
                num_clusters=65536,
                use_gpu=False,
                train_ratio=1.0,
                embeds_format='labeled_numpy',
                sort=True,
                **kwargs):
    # embeds_file_paths = pathex.get_sorted_files_from_all_sub_dirs__(embeds_path, full_path=True)

    # gx.write_all_lines(path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}_files.txt'), embeds_file_paths)
    # text_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.txt')
    # index_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.idx')

    embeds_list, _ = load_embeds(embeds_path=embeds_path,
                                 format=embeds_format,
                                 sort=sort,
                                 **kwargs)

    tic('Initializing index ...')
    if not num_clusters:
        num_clusters = len(embeds_list) // 100
    index = faiss.index_factory(embeds_list[0].shape[-1],
                                f"IVF{num_clusters},Flat",
                                faiss.METRIC_INNER_PRODUCT)
    if use_gpu:
        index = faiss.index_cpu_to_all_gpus(index)

    tic('Concatenating embeddings ...')
    if 0 < train_ratio < 1:
        gx.hprint_message(
            f"will sample subset for training with ratio {train_ratio}...")

    all_embeds = np.concatenate(embeds_list if train_ratio == 1 else list(
        gx.sampled_iter(embeds_list, train_ratio)))
    toc(msg=f'Initialization done!')

    tic(f'Training embeddings of shape {all_embeds.shape} ...')
    index.train(all_embeds)
    if use_gpu:
        index = faiss.index_gpu_to_cpu(index)
    toc(msg='Index training done!')

    tic('Add embeddings to index ...')
    del all_embeds
    embed_index_start = 0

    for embeds in tqdm(embeds_list):
        embed_count = embeds.shape[0]
        index.add_with_ids(
            embeds,
            np.arange(embed_index_start, embed_index_start + embed_count))
        embed_index_start += embed_count

    # with open(text_file_path, 'w+') as wf:
    #     for embeds, batch in embeds_iter(embeds_file_paths=embeds_file_paths, embeds_key=embeds_key, sample_file=sample_file, sample_ratio=train_ratio, embeds_idx=embeds_idx, use_tqdm=True, yield_batch=True):
    #         write_all_lines_to_stream(wf=wf, iterable=batch[embeds_txt_key], use_tqdm=False)
    #         embed_count = embeds.shape[0]
    #         index.add_with_ids(embeds, np.arange(embed_index_start, embed_index_start + embed_count))
    #         embed_index_start += embed_count

    if path.exists(output_path):
        os.remove(output_path)
    gx.hprint_message('saving indexed embeddings to', output_path)
    faiss.write_index(index, output_path)
    toc(msg='Indexing done!')
    return index
Example No. 20
ids_array = np.fromfile(train_ids_path, sep=' ', dtype='int64')
print("start read train features")
features_array = np.fromfile(train_features_path, sep=' ', dtype='>f4')
print("start reshape")
features_count = len(ids_array)
features = features_array.reshape((features_count, dimensions))
print("start train index1")
index.train(features)
index.add_with_ids(features, ids_array)
print("train index1 done. features count: " + str(features_count))

print("start read add ids")
ids_array = np.fromfile(add_ids_path, sep=' ', dtype='int64')
print("start read add features")
features_array = np.fromfile(add_features_path, sep=' ', dtype='>f4')
print("start reshape")
features_count = len(ids_array)
features = features_array.reshape((features_count, dimensions))
print("start add ids & features in index1")
index.add_with_ids(features, ids_array)
print("add done. features count: " + str(features_count))

# save index1
if USE_GPU:
    index = faiss.index_gpu_to_cpu(index)
faiss.write_index(index, index_output_path)
print("save index1 to file done: " + index_output_path)

process_time = time.time() - start_time
print(process_time)
Example No. 21
    def write(self, path):
        # faiss.write_index(self.index, path)
        faiss.write_index(faiss.index_gpu_to_cpu(self.index), path)
Example No. 22
                           default=1500000)
    arguments = argparser.parse_args()

    # We need between 983040 and 8388608 training vectors
    training_size = int(arguments.training_size)
    training_embeddings = numpy.fromfile(arguments.embedding_file,
                                         numpy.float32,
                                         training_size * DIMENSIONS)
    training_embeddings.resize(training_embeddings.shape[0] // DIMENSIONS,
                               DIMENSIONS)

    print("%i training embeddings..." % training_embeddings.shape[0])

    faiss.normalize_L2(training_embeddings)
    print("Training embeddings normalized...")

    index = faiss.index_factory(DIMENSIONS, "OPQ32_128,IVF32768,PQ32")
    gpu_index = faiss.index_cpu_to_all_gpus(index)
    print("Index created...")

    batch_size = int(arguments.batch_size)
    gpu_index.train(training_embeddings)
    print("Index trained", gpu_index.is_trained)
    add_embeddings_to_index(arguments.embedding_file, gpu_index, batch_size)
    print("%i embeddings in index" % gpu_index.ntotal)

    # Store index to a file
    index_to_store = faiss.index_gpu_to_cpu(gpu_index)
    print(index_to_store.ntotal)
    faiss.write_index(index_to_store, arguments.index_name)
Example No. 23
    def to_cpu(self):
        self.index = faiss.index_gpu_to_cpu(self.index)
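Taken together, the examples above follow the same round trip: build or train an index, optionally move it to one or more GPUs for the heavy work, then call faiss.index_gpu_to_cpu before faiss.write_index, which only accepts CPU indexes. A minimal self-contained sketch of that pattern (assuming faiss-gpu is installed and at least one GPU is visible; file name is hypothetical) is:

import numpy as np
import faiss

d = 128
xb = np.random.rand(10000, d).astype('float32')  # placeholder vectors

cpu_index = faiss.IndexFlatL2(d)
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)  # clone to GPU 0
gpu_index.add(xb)                                      # add on the GPU

# Serialization requires a CPU index, so convert back before saving.
faiss.write_index(faiss.index_gpu_to_cpu(gpu_index), 'flat_l2.index')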