Example #1
 def __init__(self, metric: str, dimension: int):
     self.name = f"eknn-exact-metric={metric}_dimension={dimension}"
     self.metric = metric
     self.dimension = dimension
     self.model = ElastiknnModel("exact", dealias_metric(metric))
     self.batch_res = None
     es_wait()
Example #2
 def __init__(self, L: int, k: int, w: int):
     self.name_prefix = f"eknn-l2lsh-L={L}-k={k}-w={w}"
     self.name = None  # set based on query args.
     self.model = ElastiknnModel("lsh",
                                 "l2",
                                 mapping_params=dict(L=L, k=k, w=w))
     self.X_max = 1.0
     self.query_params = dict()
     self.batch_res = None
     self.sum_query_dur = 0
     self.num_queries = 0
     es_wait()
Example #3
class L2Lsh(BaseANN):
    def __init__(self, L: int, k: int, w: int):
        self.name_prefix = f"eknn-l2lsh-L={L}-k={k}-w={w}"
        self.name = None  # set based on query args.
        self.model = ElastiknnModel("lsh",
                                    "l2",
                                    mapping_params=dict(L=L, k=k, w=w))
        self.X_max = 1.0
        self.query_params = dict()
        self.batch_res = None
        self.sum_query_dur = 0
        self.num_queries = 0
        es_wait()

    def fit(self, X):
        print(f"{self.name_prefix}: indexing {len(X)} vectors")

        # I found it's best to scale the vectors into [0, 1], i.e. divide by the max.
        self.X_max = X.max()
        return self.model.fit(X / self.X_max, shards=1)

    def set_query_arguments(self, candidates: int, probes: int):
        # This gets called when starting a new batch of queries.
        # Update the name and model's query parameters based on the given params.
        self.name = f"{self.name_prefix}_candidates={candidates}_probes={probes}"
        self.model.set_query_params(dict(candidates=candidates, probes=probes))
        # Reset the counters.
        self.num_queries = 0
        self.sum_query_dur = 0

    def query(self, q, n):
        # If QPS after 100 queries is < 10, this setting is bad and won't complete within the default timeout.
        if self.num_queries > 100 and self.num_queries / self.sum_query_dur < 10:
            print(
                "Throughput after 100 queries is less than 10 q/s. Terminating to avoid wasteful computation.",
                flush=True)
            exit(0)
        else:
            t0 = perf_counter()
            res = self.model.kneighbors(np.expand_dims(q, 0) / self.X_max,
                                        n)[0]
            dur = (perf_counter() - t0)
            self.sum_query_dur += dur
            self.num_queries += 1
            return res

    def batch_query(self, X, n):
        self.batch_res = self.model.kneighbors(X, n)

    def get_batch_results(self):
        return self.batch_res
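
For context, here is a minimal sketch of how a benchmark harness might drive this class. The dataset variables and parameter values below are hypothetical and are not part of the example above.

# Hypothetical driver; assumes dense float vectors X_train / X_test are already loaded as numpy arrays.
algo = L2Lsh(L=100, k=4, w=1024)
algo.fit(X_train)                                   # index the (scaled) training vectors
algo.set_query_arguments(candidates=500, probes=2)  # pick query-time parameters
neighbors = [algo.query(q, 10) for q in X_test]     # ids of the 10 nearest neighbors per query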
Example #4
def evaluate(dataset: Dataset, eknn: ElastiknnModel):
    n_neighbors = len(dataset.queries[0].indices)
    eknn.fit(dataset.corpus, shards=os.cpu_count() - 1)
    t0 = time()
    neighbors_pred = eknn.kneighbors([q.vector for q in dataset.queries],
                                     allow_missing=True,
                                     n_neighbors=n_neighbors)
    queries_per_sec = len(dataset.queries) / (time() - t0)
    recalls = [
        len(set(q.indices).intersection(p)) / len(q.indices)
        for (q, p) in zip(dataset.queries, neighbors_pred)
    ]
    recall = sum(recalls) / len(recalls)
    return recall, queries_per_sec
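
To make the recall arithmetic above concrete, here is a toy illustration with made-up indices; it mirrors the set-intersection formula in evaluate but is not part of the original example.

# One query: 4 ground-truth neighbors, 4 predicted neighbors, 2 of which are correct.
truth = [1, 2, 3, 4]
predicted = [2, 4, 9, 7]
recall = len(set(truth).intersection(predicted)) / len(truth)  # 2 / 4 = 0.5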
Example #5
class Exact(BaseANN):
    def __init__(self, metric: str, dimension: int):
        self.name = f"eknn-exact-metric={metric}_dimension={dimension}"
        self.metric = metric
        self.dimension = dimension
        self.model = ElastiknnModel("exact", dealias_metric(metric))
        self.batch_res = None
        es_wait()

    def _handle_sparse(self, X):
        # convert list of lists of indices to sparse vectors.
        return [Vec.SparseBool(x, self.dimension) for x in X]

    def fit(self, X):
        if self.metric in {'jaccard', 'hamming'}:
            return self.model.fit(self._handle_sparse(X), shards=1)[0]
        else:
            return self.model.fit(X, shards=1)

    def query(self, q, n):
        if self.metric in {'jaccard', 'hamming'}:
            return self.model.kneighbors(self._handle_sparse([q]), n)[0]
        else:
            return self.model.kneighbors(np.expand_dims(q, 0), n)[0]

    def batch_query(self, X, n):
        if self.metric in {'jaccard', 'hamming'}:
            self.batch_res = self.model.kneighbors(self._handle_sparse(X), n)
        else:
            self.batch_res = self.model.kneighbors(X, n)

    def get_batch_results(self):
        return self.batch_res
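
A minimal usage sketch for the sparse branch, assuming each item in a Jaccard dataset is a list of active indices; the variable names and dimension below are hypothetical.

# Hypothetical driver for the jaccard/hamming path; data and dimension are made up.
algo = Exact(metric="jaccard", dimension=1000)
algo.fit(train_index_lists)               # each element is a list of true indices
top10 = algo.query(query_index_list, 10)  # ids of the 10 most similar items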
Example #6
    def test_exact_jaccard_mnist(self):
        # First run the query and make sure the results have the right form.
        n_neighbors = 20
        model = ElastiknnModel('exact', 'jaccard')
        model.fit(digits_train)

        inds1 = model.kneighbors(digits_validate, n_neighbors)
        inds2, dists2 = model.kneighbors(digits_validate,
                                         n_neighbors,
                                         return_similarity=True)

        assert np.all(inds1 == inds2)
        assert inds1.shape == (digits_validate.shape[0], n_neighbors)
        assert dists2.shape == inds2.shape

        # Then compare against scikit-learn. Intentionally using fewer neighbors to make sure recall will be 1
        # despite out-of-order indices due to equal distances.
        ref = NearestNeighbors(n_neighbors=int(n_neighbors / 2),
                               algorithm='brute',
                               metric='jaccard',
                               n_jobs=1)
        ref.fit(digits_train)
        inds3 = ref.kneighbors(digits_validate, return_distance=False)

        # Compute and check the recall.
        rec = self.recall(inds3, inds2)
        assert np.all(rec == 1)
Example #7
def lsh(dataset: Dataset,
        bands: int = 165,
        rows: int = 1,
        candidates: float = 1.5):
    n_neighbors = len(dataset.queries[0].indices)
    eknn = ElastiknnModel(
        algorithm='lsh',
        metric='jaccard',
        n_jobs=1,
        index=f"{INDEX}-{int(time())}",
        mapping_params={
            "bands": bands,
            "rows": rows
        },
        query_params={"candidates": int(candidates * n_neighbors)})
    return evaluate(dataset, eknn)
Example #8
def indexed(dataset: Dataset):
    eknn = ElastiknnModel(algorithm='sparse_indexed',
                          metric='jaccard',
                          n_jobs=1,
                          index=f"{INDEX}-{int(time())}")
    return evaluate(dataset, eknn)
Example #9
def exact(dataset: Dataset):
    eknn = ElastiknnModel(algorithm='exact',
                          metric='jaccard',
                          n_jobs=1,
                          index=f"{INDEX}-exact")
    return evaluate(dataset, eknn)
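
Putting the last three examples together, a hypothetical comparison run; load_dataset and the dataset name are assumptions, not part of the examples above.

# Hypothetical comparison of the three variants on one dataset.
dataset = load_dataset("kosarak-jaccard")   # assumed helper returning a Dataset
for label, run in [("exact", exact), ("sparse_indexed", indexed), ("lsh", lsh)]:
    recall, qps = run(dataset)
    print(f"{label}: recall={recall:.3f}, queries/sec={qps:.1f}")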