Example No. 1
def get_optimal_index_keys(nb_vectors: int, dim_vector: int, max_index_memory_usage: str) -> List[str]:
    """
    Gives a list of interesting index keys to try; *the one at the top is the most promising*.

    See: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index for
    detailed explanations.
    """

    total_bytes = 4 * nb_vectors * dim_vector  # x4 because float32
    max_mem_bytes = cast_memory_to_bytes(max_index_memory_usage)

    # index options
    relevant_list: List[str] = []

    # Cases with a lot of memory -> HNSW
    if 1.7 * total_bytes < max_mem_bytes:
        relevant_list.append("HNSW32")
    elif 1.3 * total_bytes < max_mem_bytes:
        relevant_list.append("HNSW15")
    else:  # product quantization
        relevant_list.extend(
            get_optimal_quantization(nb_vectors, dim_vector, force_max_index_memory_usage=max_index_memory_usage)
        )

    return relevant_list
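A hedged usage sketch of the heuristic above (it assumes that List, cast_memory_to_bytes and get_optimal_quantization come from the surrounding module; whether "10GB" is parsed as decimal or binary gigabytes does not change the branch taken here):

# Hypothetical call: 1 million vectors of dimension 768 with a 10GB index budget.
# Raw float32 storage is 4 * 1_000_000 * 768 ≈ 3.1GB, and 1.7 * 3.1GB ≈ 5.2GB < 10GB,
# so the HNSW32 branch is taken and ["HNSW32"] is returned.
print(get_optimal_index_keys(nb_vectors=1_000_000, dim_vector=768, max_index_memory_usage="10GB"))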
Example No. 2
def get_optimal_train_size(
    nb_vectors: int, index_key: str, current_memory_available: Optional[str], vec_dim: Optional[int]
) -> int:
    """
    Function that determines the number of training points necessary to
    train the index, based on faiss heuristics for k-means clustering.
    """

    train_size = nb_vectors

    matching = re.findall(r"IVF\d+|IMI\d+x\d+", index_key)

    if matching:

        nb_clusters = nb_vectors
        # case IVF index
        if re.findall(r"IVF\d+", matching[0]):
            nb_clusters = int(matching[0][3:])
        # case IMI index
        elif re.findall(r"IMI\d+x\d+", matching[0]):
            nb_clusters = 2 ** reduce(mul, [int(num) for num in re.findall(r"\d+", matching[0])])

        points_per_cluster: float = 100

        # compute best possible number of vectors to give to train the index
        # given memory constraints
        if current_memory_available and vec_dim:
            size = cast_memory_to_bytes(current_memory_available)
            points_per_cluster = max(min(size / (4.0 * nb_clusters * vec_dim), points_per_cluster), 31.0)

        # You will need between 30 * nb_clusters and 256 * nb_clusters vectors to train the index
        train_size = min(round(points_per_cluster * nb_clusters), nb_vectors)

    return train_size
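A hedged worked example for an IVF index key (re, reduce and mul are assumed to be imported in the surrounding module; the numbers assume cast_memory_to_bytes returns about 10^9 bytes for "1GB", and the cap at 100 points per cluster makes the result insensitive to that choice):

# Hypothetical call: an IVF4096 key over 10 million vectors of dimension 128 with 1GB of RAM.
# The regex extracts 4096 clusters; 1GB / (4 * 4096 * 128) ≈ 477 points per cluster,
# which is capped at 100, so about 100 * 4096 = 409,600 training vectors are requested.
print(get_optimal_train_size(10_000_000, "OPQ64_128,IVF4096,PQ64x8", "1GB", 128))  # 409600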
Example No. 3
def get_ground_truth(
    faiss_metric_type: int,
    embeddings_path: Union[np.ndarray, str],
    query_embeddings: np.ndarray,
    memory_available: Union[str, float],
):
    """ compute the ground truth (result with a perfect index) of the query on the embeddings """

    dim = query_embeddings.shape[-1]

    if isinstance(embeddings_path, str):
        perfect_index = MemEfficientFlatIndex(dim, faiss_metric_type)
        perfect_index.add_files(embeddings_path)
        block_bytes = next(read_embeddings_local(embeddings_path, verbose=False)).nbytes
    else:
        perfect_index = faiss.IndexFlat(dim, faiss_metric_type)
        perfect_index.add(embeddings_path.astype("float32"))  # pylint: disable= no-value-for-parameter
        block_bytes = embeddings_path.nbytes

    if isinstance(memory_available, str):
        memory_available = cast_memory_to_bytes(memory_available)

    stack_input = max(int(0.25 * memory_available / block_bytes), 1)

    if isinstance(embeddings_path, str):
        _, ground_truth = perfect_index.search_files(query_embeddings,
                                                     k=40,
                                                     stack_input=stack_input)
    else:
        _, ground_truth = perfect_index.search(query_embeddings, k=40)

    return ground_truth
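A minimal in-memory usage sketch (it assumes faiss and numpy are installed; the string-path branch additionally needs the MemEfficientFlatIndex and read_embeddings_local helpers from the surrounding module):

import faiss
import numpy as np

embeddings = np.random.rand(10_000, 256).astype("float32")  # toy corpus
queries = embeddings[:100]                                   # reuse a few corpus vectors as queries
ground_truth = get_ground_truth(faiss.METRIC_L2, embeddings, queries, "1GB")
print(ground_truth.shape)  # (100, 40): the 40 exact nearest neighbours of each query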
Example No. 4
def get_optimal_batch_size(nb_vectors: int, vec_dim: int, current_memory_available: str) -> int:
    """ compute optimal batch size to use the RAM at its full potential """

    memory = cast_memory_to_bytes(current_memory_available)

    # each vector takes vec_dim * 4 bytes (float32): fill about half of the
    # available RAM per batch, without exceeding the total number of vectors
    batch_size = min(nb_vectors, max(int(0.5 * memory / (vec_dim * 4)), 1))

    return batch_size
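A hedged usage sketch (it assumes cast_memory_to_bytes parses "2GB" as 2 * 10^9 bytes):

# Hypothetical call: 512-dimensional float32 vectors with 2GB of RAM available.
# Half of 2GB is 10^9 bytes, and 10^9 / (512 * 4) ≈ 488,281 vectors per batch.
print(get_optimal_batch_size(nb_vectors=10_000_000, vec_dim=512, current_memory_available="2GB"))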
Example No. 5
def get_optimal_quantization(
    nb_vectors: int,
    dim_vector: int,
    force_quantization_value: Optional[int] = None,
    force_max_index_memory_usage: Optional[str] = None,
) -> List[str]:
    """
    Function that returns a list of relevant index_keys to create quantized indices.

    Parameters
    ----------
    nb_vectors: int
        Number of vectors in the dataset.
    dim_vector: int
        Dimension of the vectors in the dataset.
    force_quantization_value: Optional[int]
        Force this value to be used as the size (in bytes) of the quantized vectors (PQx).
        It can be used with the force_max_index_memory_usage parameter,
        but the result might be empty.
    force_max_index_memory_usage: Optional[str]
        Add a memory constraint on the index.
        It can be used with the force_quantization_value parameter,
        but the result might be empty.

    Returns
    -------
    index_keys: List[str]
        List of index_keys that would be good choices for quantization.
        The list can be empty if the given constraints are too strong.
    """

    # Default values
    pq_values = [64, 48, 32, 24, 16, 8, 4]
    targeted_compression_ratio = 0.0  # 0 = no constraint

    # Force compression ratio if required
    if force_max_index_memory_usage is not None:
        total_bytes = 4.0 * nb_vectors * dim_vector  # x4 because float32
        max_mem_bytes = float(cast_memory_to_bytes(force_max_index_memory_usage))
        targeted_compression_ratio = total_bytes / max_mem_bytes

    # Force quantization value if required
    if force_quantization_value is not None:
        pq_values = [force_quantization_value]

    # Compute optimal number of clusters
    relevant_list: List[str] = []
    nb_clusters_list = get_optimal_nb_clusters(nb_vectors)

    # Look for matching index keys
    for pq in pq_values:
        if pq < dim_vector:

            for nb_clusters in nb_clusters_list:

                # Compute quantized vector size
                cluster_size_byte = 1 + (log2(nb_clusters) - 1) // 8
                vector_size_byte = pq + cluster_size_byte

                # Compute compression ratio with quantization PQx
                compression_ratio = (4 * dim_vector) / vector_size_byte

                # Add index_key if compression ratio is high enough
                if compression_ratio >= targeted_compression_ratio:

                    # y must be a multiple of pq (required)
                    # y <= d, with d the dimension of the input vectors (preferable)
                    # y <= 6*pq (preferable)
                    # y is allowed to be slightly bigger than d: for a dimension such as d=101,
                    # y=128 is a better choice than y=64 because it avoids losing information
                    # in the linear transform
                    y = (min(dim_vector // pq, 6) + 1) * pq
                    cluster_opt = f"IVF{nb_clusters}" if nb_clusters < 1000 else f"IVF{nb_clusters}_HNSW32"
                    relevant_list.append(f"OPQ{pq}_{y},{cluster_opt},PQ{pq}x8")

    return relevant_list
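A hedged walk-through of the key construction for a single (pq, nb_clusters) pair; the actual list returned also depends on get_optimal_nb_clusters, which is defined elsewhere in the module:

# Hypothetical pair: pq = 64, nb_clusters = 16384, dim_vector = 768.
# cluster_size_byte = 1 + (log2(16384) - 1) // 8 = 1 + 13 // 8 = 2 bytes
# vector_size_byte  = 64 + 2 = 66 bytes  ->  compression_ratio = 4 * 768 / 66 ≈ 46.5
# y = (min(768 // 64, 6) + 1) * 64 = 448, and 16384 >= 1000, so if 16384 is among the
# cluster counts chosen by get_optimal_nb_clusters, the list contains "OPQ64_448,IVF16384_HNSW32,PQ64x8".
print(get_optimal_quantization(nb_vectors=10_000_000, dim_vector=768, force_quantization_value=64))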