Example #1
def index_embs_and_ids(index: faiss.Index, embeddings: np.ndarray,
                       faiss_ids: np.ndarray) -> faiss.Index:
    assert embeddings.shape[0] == faiss_ids.shape[0], \
        f'Found {embeddings.shape[0]} embeddings ' \
        f'and {faiss_ids.shape[0]} faiss_ids'
    faiss_ids = np.reshape(faiss_ids, (faiss_ids.shape[0], ))
    index.add_with_ids(embeddings, faiss_ids)
    return index
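A minimal usage sketch (assumed setup, not part of the original snippet): random embeddings, integer ids, and an IndexIDMap wrapper, since a plain flat index does not accept add_with_ids.

import faiss
import numpy as np

d = 64
embeddings = np.random.rand(1000, d).astype("float32")
faiss_ids = np.arange(1000, dtype="int64")

# Wrap the flat index in an IndexIDMap so add_with_ids is supported.
index = faiss.IndexIDMap(faiss.IndexFlatL2(d))
index = index_embs_and_ids(index, embeddings, faiss_ids)
print(index.ntotal)  # 1000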
Example #2
def quantize_vec_without_modifying_index(index: faiss.Index,
                                         vecs: np.ndarray) -> np.ndarray:
    """ Quantizes a batch of vectors if the index given uses quantization """

    try:
        return index.sa_decode(index.sa_encode(vecs))
    except (TypeError,
            RuntimeError):  # error if the index doesn't use quantization
        return vecs
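A hedged usage sketch; the "SQ8" factory string, training data, and sizes are illustrative assumptions. It contrasts a scalar-quantizing index, where the vectors are round-tripped through 8-bit codes, with a flat index, where they come back unchanged.

import faiss
import numpy as np

d = 32
train = np.random.rand(5000, d).astype("float32")
vecs = np.random.rand(10, d).astype("float32")

# Scalar-quantizing index: sa_encode/sa_decode introduce quantization error.
sq_index = faiss.index_factory(d, "SQ8")
sq_index.train(train)
approx_vecs = quantize_vec_without_modifying_index(sq_index, vecs)

# Flat index: no quantization, so the vectors are returned unchanged.
flat_index = faiss.IndexFlatL2(d)
same_vecs = quantize_vec_without_modifying_index(flat_index, vecs)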
Example #3
def r_recall_at_r_single(
    query: np.ndarray,
    ground_truth: np.ndarray,
    other_index: faiss.Index,
    r_max: int = 40,
    eval_item_ids: Optional[np.ndarray] = None,
) -> List[int]:
    """ Compute an R-recall@R array for each R in range [1, R_max] """
    # O(r_max)

    _, inds = other_index.search(np.expand_dims(query, 0), r_max)

    res = inds[0]

    recall_count = []
    s_true = set()
    s_pred = set()
    tot = 0
    for p_true, p_pred in zip(ground_truth[:r_max], res):
        if eval_item_ids is not None and p_pred != -1:
            p_pred = eval_item_ids[p_pred]
        if p_true == p_pred and p_true != -1:
            tot += 1
        else:
            if p_true in s_pred and p_true != -1:
                tot += 1
            if p_pred in s_true and p_pred != -1:
                tot += 1

        s_true.add(p_true)
        s_pred.add(p_pred)
        recall_count.append(tot)

    return recall_count
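A usage sketch under assumed toy data: an exact flat index supplies the ground-truth neighbour ids, and an IVF index (an arbitrary choice here) is the index being evaluated.

import faiss
import numpy as np

d, n, r_max = 16, 1000, 10
xb = np.random.rand(n, d).astype("float32")
query = np.random.rand(d).astype("float32")

exact = faiss.IndexFlatL2(d)
exact.add(xb)
_, gt = exact.search(np.expand_dims(query, 0), r_max)

approx = faiss.index_factory(d, "IVF32,Flat")
approx.train(xb)
approx.add(xb)

recall_curve = r_recall_at_r_single(query, gt[0], approx, r_max=r_max)
print(recall_curve)  # ground-truth neighbours recovered for each R in 1..r_max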
Example #4
def speed_test_ms_per_query(index: faiss.Index,
                            query: Optional[np.ndarray] = None,
                            ksearch: int = 40,
                            timeout_s: Union[float, int] = 5.0) -> float:
    """ Evaluate the average speed in milliseconds of the index without using batch """

    nb_samples = 2_000

    if query is None:
        query = np.random.rand(nb_samples, index.d).astype("float32")

    count = 0
    nb_repeat = 1 + (nb_samples - 1) // query.shape[0]

    start_time = time.perf_counter()

    for one_query in chain.from_iterable(repeat(query, nb_repeat)):

        _, _ = index.search(np.expand_dims(one_query, 0), ksearch)

        count += 1

        if time.perf_counter() - start_time > timeout_s:
            break

    return (time.perf_counter() - start_time) / count * 1000.0
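A usage sketch with an assumed small flat index; the imports cover what the function body above relies on (time, itertools) but does not show.

import time
from itertools import chain, repeat

import faiss
import numpy as np

d = 64
index = faiss.IndexFlatL2(d)
index.add(np.random.rand(10_000, d).astype("float32"))

avg_ms = speed_test_ms_per_query(index, ksearch=10, timeout_s=1.0)
print(f"{avg_ms:.3f} ms per single-vector query")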
Example #5
def one_recall_at_r_single(
    query: np.ndarray,
    ground_truth: np.ndarray,
    other_index: faiss.Index,
    r_max: int = 40,
    eval_item_ids: Optional[np.ndarray] = None,
) -> List[int]:
    """
    Compute a 1-recall@R array for each R in range [1, r_max] for
    a single query.
    """
    # O(r_max)

    _, inds = other_index.search(np.expand_dims(query, 0), 1)

    first = inds[0][0]
    if eval_item_ids is not None and first != -1:
        first = eval_item_ids[first]

    # return an all-zeros list if other_index found no product
    if first == -1:
        return [0 for _ in ground_truth[:r_max]]

    recall_count = []

    seen = False
    for p_true in ground_truth[:r_max]:
        if p_true == first:
            seen = True
        recall_count.append(1 if seen else 0)

    return recall_count
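A usage sketch mirroring the r_recall@R example above, with assumed toy data; the same index serves as both ground truth and the index under test, so the curve is all ones from R = 1.

import faiss
import numpy as np

d, r_max = 16, 10
xb = np.random.rand(1000, d).astype("float32")
query = np.random.rand(d).astype("float32")

index = faiss.IndexFlatL2(d)
index.add(xb)
_, gt = index.search(np.expand_dims(query, 0), r_max)

hit_curve = one_recall_at_r_single(query, gt[0], index, r_max=r_max)
print(hit_curve)  # becomes 1 at the first R where the top-1 result appears in the ground truth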
Example #6
def one_recall_at_r(
    query: np.ndarray,
    ground_truth: np.ndarray,
    other_index: faiss.Index,
    r_max: int = 40,
    eval_item_ids: Optional[np.ndarray] = None,
) -> np.ndarray:
    """ Compute a 1-recall@R array for each R in range [1, r_max] """
    # O(r_max)

    if r_max <= 0:
        return np.zeros((0, ))

    _, first = other_index.search(query, 1)

    if eval_item_ids is not None:
        first = np.vectorize(lambda e: eval_item_ids[e] if e != -1 else -1)(
            first)  # type: ignore

    recall_array = np.cumsum(
        (ground_truth[:, :r_max] == first) & (first != -1), axis=-1)

    avg_recall = np.mean(recall_array, axis=0)

    return avg_recall
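A usage sketch for this batched variant, with assumed shapes: one ground-truth row per query, averaged into a single (r_max,) recall curve.

import faiss
import numpy as np

d, n_queries, r_max = 16, 50, 10
xb = np.random.rand(1000, d).astype("float32")
queries = np.random.rand(n_queries, d).astype("float32")

index = faiss.IndexFlatL2(d)
index.add(xb)

# One row of reference neighbour ids per query.
_, ground_truth = index.search(queries, r_max)

avg_recall = one_recall_at_r(queries, ground_truth, index, r_max=r_max)
print(avg_recall.shape)  # (r_max,): 1-recall@R averaged over the query batch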
Example #7
def _dist_mz_interval(
        index: faiss.Index, vectors: np.ndarray, precursor_mzs: np.ndarray,
        batch_size: int, n_neighbors: int, n_neighbors_ann: int,
        precursor_tol_mass: float, precursor_tol_mode: str,
        distances: np.ndarray, indices: np.ndarray, indptr: np.ndarray,
        indptr_i: int) -> None:
    """
    Compute distances to the nearest neighbors for the given precursor m/z
    interval.

    Parameters
    ----------
    index : faiss.Index
        The NN index used to efficiently find distances to similar spectra.
    vectors : np.ndarray
        The spectrum vectors to be queried against the NN index.
    precursor_mzs : np.ndarray
        Precursor m/z's of the spectra corresponding to the given vectors.
    batch_size : int
        The number of vectors to be simultaneously queried.
    n_neighbors : int
        The final (maximum) number of neighbors to retrieve for each vector.
    n_neighbors_ann : int
        The number of neighbors to retrieve using the ANN index. This can
        exceed the final number of neighbors (`n_neighbors`) to maximize the
        number of neighbors within the precursor m/z tolerance.
    precursor_tol_mass : float
        The precursor tolerance mass for vectors to be considered as neighbors.
    precursor_tol_mode : str
        The unit of the precursor m/z tolerance ('Da' or 'ppm').
    distances : np.ndarray
        The nearest neighbor distances. See `scipy.sparse.csr_matrix` (`data`).
    indices : np.ndarray
        The column indices for the nearest neighbor distances. See
        `scipy.sparse.csr_matrix`.
    indptr : np.ndarray
        The index pointers for the nearest neighbor distances. See
        `scipy.sparse.csr_matrix`.
    indptr_i : int
        The current start index in `indptr`.
    """
    for batch_start in range(0, vectors.shape[0], batch_size):
        batch_stop = min(batch_start + batch_size, index.ntotal)
        # Find nearest neighbors using ANN index searching.
        # noinspection PyArgumentList
        nn_dists, nn_idx_ann = index.search(vectors[batch_start:batch_stop],
                                            n_neighbors_ann)
        # Filter the neighbors based on the precursor m/z tolerance and assign
        # distances.
        _filter_neighbors_mz(
            precursor_mzs, batch_start, batch_stop, precursor_tol_mass,
            precursor_tol_mode, nn_dists, nn_idx_ann, n_neighbors, distances,
            indices, indptr, indptr_i + batch_start)
Example #8
def search_speed_test(index: faiss.Index,
                      query: Optional[np.ndarray] = None,
                      ksearch: int = 40,
                      timeout_s: Union[float, int] = 10.0) -> Dict[str, float]:
    """ Return the average and 99th-percentile search speed in milliseconds """

    nb_samples = 2_000

    if query is None:
        query = np.random.rand(nb_samples, index.d).astype("float32")

    test_start_time_s = time.perf_counter()
    speed_list_ms = []  # in milliseconds

    nb_repeat = 1 + (nb_samples - 1) // query.shape[0]

    for one_query in chain.from_iterable(repeat(query, nb_repeat)):

        start_time_s = time.perf_counter()  # high precision
        _, _ = index.search(np.expand_dims(one_query, 0), ksearch)
        end_time_s = time.perf_counter()

        search_time_ms = 1000.0 * (end_time_s - start_time_s)
        speed_list_ms.append(search_time_ms)

        if time.perf_counter() - test_start_time_s > timeout_s:
            break

    speed_list_ms = np.array(speed_list_ms)
    print(len(speed_list_ms))

    speed_infos = {
        "avg_search_speed_ms": np.average(speed_list_ms),
        "99p_search_speed_ms": np.quantile(speed_list_ms, 0.99),
    }

    return speed_infos
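A usage sketch with an assumed flat index; as above, the time and itertools imports are what the function body relies on but does not show.

import time
from itertools import chain, repeat

import faiss
import numpy as np

d = 64
index = faiss.IndexFlatL2(d)
index.add(np.random.rand(10_000, d).astype("float32"))

stats = search_speed_test(index, ksearch=10, timeout_s=2.0)
print(stats["avg_search_speed_ms"], stats["99p_search_speed_ms"])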
Example #9
def knn(
        index: faiss.Index,
        embedding: np.ndarray,
        labels2captions: Union[np.ndarray, Dict],
        top_k: int = 3,
        k: int = 1,
):
    """
        Performs kNN on factory index + index with `name`.

        Args:
            index (faiss.Index): Index object
            embedding (np.ndarray): Embeddings query.
            top_k (int): Top K results to return.
            k (int): K parameter in kNN.
            labels2captions (Dict[int, str]):

        Returns List[Dict]: Closest neighbors.

        """

    results = []

    # Search for closest embeddings in terms of inner product distance
    nn_distances, nn_labels = index.search(
        embedding[np.newaxis, ...], k=index.ntotal)
    nn_distances = np.clip(nn_distances, 0.0, 1.0)
    nn_distances = np.arccos(nn_distances)

    nn_distances = np.squeeze(nn_distances)
    nn_labels = np.squeeze(nn_labels)

    true_label = None

    top_k = min(top_k, len(np.unique(nn_labels)))
    for _ in range(top_k):

        if true_label is not None:
            not_equal_indcs = np.where(nn_labels != true_label)[0]
            nn_labels = nn_labels[not_equal_indcs]
            nn_distances = nn_distances[not_equal_indcs]

        # Take the first k neighbor classes
        if nn_labels.ndim == 0:
            true_label = int(nn_labels)
            true_caption = labels2captions[true_label]
            closest_distance = float(nn_distances)
        else:
            knn_labels = nn_labels[:k]

            # Find the most frequent label among them (np.atleast_1d keeps this
            # compatible with both older and newer scipy.stats.mode return types)
            true_label = int(np.atleast_1d(mode(knn_labels, axis=0).mode)[0])
            true_caption = labels2captions[true_label]

            closest_index = nn_labels.tolist().index(true_label)
            closest_distance = nn_distances[closest_index]

        result = {
            "label": true_label,
            "caption": true_caption,
            "distance": closest_distance,
        }

        results.append(result)

    return results
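A usage sketch under several assumptions: embeddings are L2-normalised and indexed with inner product (matching the arccos conversion above), class labels are stored as faiss ids via an IndexIDMap, and labels2captions is an invented mapping. scipy.stats.mode must be importable, as the function body requires.

from scipy.stats import mode  # required by knn() above
import faiss
import numpy as np

d = 32
gallery = np.random.rand(200, d).astype("float32")
gallery /= np.linalg.norm(gallery, axis=1, keepdims=True)  # normalise for cosine / arccos distances
class_labels = np.random.randint(0, 5, size=200).astype("int64")  # 5 hypothetical classes
labels2captions = {i: f"class_{i}" for i in range(5)}

index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
index.add_with_ids(gallery, class_labels)

query = gallery[0]
for neighbour in knn(index, query, labels2captions, top_k=3, k=5):
    print(neighbour["label"], neighbour["caption"], round(neighbour["distance"], 3))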
Example #10
def quantize_vec_without_modifying_index(index: faiss.Index,
                                         vecs: np.ndarray) -> np.ndarray:
    """ qantize a batch of vectors """
    quantized_vecs = index.sa_decode(index.sa_encode(vecs))
    return quantized_vecs
    def gen_stats(self, dataset_idx: OrderedDict, dataset_doc_ids: list,
                  dataset_embeddings: np.ndarray, faiss_index: faiss.Index,
                  annos: dict,
                  distance_threshold: float) -> Tuple[dict, dict, dict]:
        """
        Generate count stats from the dataset, grouped by different annotation types

        :param dataset_idx:  The dictionary to map document id to the index in dataset_doc_ids
        :param dataset_doc_ids: Array of document ids
        :param dataset_embeddings: Numpy array of document embeddings
        :param faiss_index:    Faiss index
        :param annos:   The dictionary to map each annotation type to a set of document ids.
        :param distance_threshold: A threhold to exclude dislike query results
        :return: A dictionary group document ids by annotation type, a dictionary group not sampled document ids
        by annotation type, a stats counts for each annotation type.
        """
        distances = {
            type_name: dict()
            for type_name in self.grouped_ids.keys()
        }
        print(type(dataset_embeddings))
        max_query_res = int(len(dataset_embeddings) * 0.8)
        if max_query_res > self.MAX_QUERY_RES:
            max_query_res = self.MAX_QUERY_RES
        print('Querying similar document embeddings...')
        for type_name, doc_ids in annos.items():
            subset_embeddings = np.array([
                dataset_embeddings[dataset_idx[doc_id]] for doc_id in doc_ids
            ])
            for i in range(0, len(subset_embeddings)):
                res_distances, res_doc_idx_ids = faiss_index.search(
                    subset_embeddings[i:i + 1], max_query_res)
                for j in range(0, len(res_distances[0])):
                    res_d = res_distances[0][j]
                    if res_d > distance_threshold:
                        break
                    doc_id = dataset_doc_ids[res_doc_idx_ids[0][j]]
                    self.grouped_ids[type_name].add(doc_id)
                    # update the distances of a candidate doc to the closest doc in the reviewed documents
                    if doc_id not in distances[
                            type_name] or res_d < distances[type_name][doc_id]:
                        distances[type_name][doc_id] = res_d

        # resolve overlapping candidates
        print('Resolving overlapping candidates...')
        for doc_id in dataset_doc_ids:
            shortest_distance = 10000
            to_remove_from_types = []
            previous_type = ''
            for type_name in distances.keys():
                if doc_id in distances[type_name] and distances[type_name][
                        doc_id] < shortest_distance:
                    shortest_distance = distances[type_name][doc_id]
                    if previous_type != '':
                        to_remove_from_types.append(type_name)
                    previous_type = type_name

            for type_name in to_remove_from_types:
                self.grouped_ids[type_name].remove(doc_id)

        available_outscope_ids = set(dataset_doc_ids)
        # identify the documents that haven't been reviewed
        print("Identifying the documents that haven't been reviewed...")
        for type_name, doc_ids in self.grouped_ids.items():
            available_outscope_ids = available_outscope_ids - doc_ids
            self.new_ids[type_name] = doc_ids - self.previous_sampled_ids

        self.current_stats = {
            'all_counts': {
                type_name: len(value)
                for type_name, value in self.grouped_ids.items()
            },
            'new_counts': {
                type_name: len(value)
                for type_name, value in self.new_ids.items()
            }
        }

        self.available_not_contain = len(available_outscope_ids)
        self.current_stats['all_counts'][
            'not_contain'] = self.available_not_contain
        self.new_available_not_contain = len(available_outscope_ids -
                                             self.previous_sampled_ids)
        self.current_stats['new_counts'][
            'not_contain'] = self.new_available_not_contain
        return self.grouped_ids, self.new_ids, self.current_stats