Example #1
    def get_Xss_confidence(self):
        X = self.X_data
        X = X.A if sp.issparse(X) else X
        Xss = self.Xss.get_X()

        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric='euclidean',
                             n_neighbors=min(self.k, X.shape[0] - 1),
                             n_jobs=-1,
                             random_state=19491001)
            _, dist = nbrs.query(Xss, k=min(self.k, X.shape[0] - 1))
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=min(self.k, X.shape[0] - 1),
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, _ = nbrs.kneighbors(Xss)

        dist_m = dist.mean(1)
        confidence = 1 - dist_m / dist_m.max()

        return confidence
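This size-gated fallback between pynndescent and scikit-learn recurs throughout the examples below. A minimal self-contained sketch of the pattern (the helper name knn_distances is ours, not from these codebases):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_distances(X, query, k, seed=19491001):
    # Approximate NN-descent search for large, higher-dimensional data;
    # exact tree-based search otherwise.
    k = min(k, X.shape[0] - 1)
    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X, metric="euclidean", n_neighbors=k,
                         n_jobs=-1, random_state=seed)
        _, dist = nbrs.query(query, k=k)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg,
                                n_jobs=-1).fit(X)
        dist, _ = nbrs.kneighbors(query)
    return dist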
Example #2
    def fit(self, data: np.ndarray, sight_k: int = None) -> Any:
        """Fits the model
        data: np.ndarray (samples, features)
            np
        sight_k: int
            the farthest point that a node is allowed to connect to when its closest neighbours are not allowed
        """
        self.data = data
        if sight_k is not None:
            self.sight_k = sight_k
        logging.debug(
            f"First search the {self.sight_k} nearest neighbours for {self.n_samples} samples"
        )
        np.random.seed(13)
        if self.metric == "correlation":
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         algorithm="brute")
            self._nn.fit(self.data)
        elif self.metric == "js":
            self._nn = NNDescent(data=self.data,
                                 metric=jensen_shannon_distance)
        else:
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         leaf_size=30)
            self._nn.fit(self.data)

        # call this to calculate bknn
        self.kneighbors_graph(mode='distance')
        return self
Example #3
def test_transformer_equivalence():
    N_NEIGHBORS = 15
    EPSILON = 0.15
    train = nn_data[:400]
    test = nn_data[:200]

    # Note we shift N_NEIGHBORS to conform to sklearn's KNeighborsTransformer definition
    nnd = NNDescent(data=train,
                    n_neighbors=N_NEIGHBORS + 1,
                    random_state=42,
                    compressed=False)
    indices, dists = nnd.query(test, k=N_NEIGHBORS, epsilon=EPSILON)
    sort_idx = np.argsort(indices, axis=1)
    indices_sorted = np.vstack(
        [indices[i, sort_idx[i]] for i in range(sort_idx.shape[0])])
    dists_sorted = np.vstack(
        [dists[i, sort_idx[i]] for i in range(sort_idx.shape[0])])

    # Note we shift N_NEIGHBORS to conform to sklearn's KNeighborsTransformer definition
    transformer = PyNNDescentTransformer(n_neighbors=N_NEIGHBORS,
                                         search_epsilon=EPSILON,
                                         random_state=42).fit(
                                             train, compress_index=False)
    Xt = transformer.transform(test).sorted_indices()

    assert np.all(Xt.indices == indices_sorted.flatten())
    assert np.allclose(Xt.data, dists_sorted.flat)
Example #4
    def fit(self, X, V, k, s=None, tol=1e-4):
        self.__reset__()
        # knn clustering
        if self.nbrs_idx is None:
            if X.shape[0] > 200000 and X.shape[1] > 2:
                from pynndescent import NNDescent

                nbrs = NNDescent(X, metric='euclidean', n_neighbors=k + 1, n_jobs=-1,
                                 random_state=19491001)
                Idx, _ = nbrs.query(X, k=k + 1)
            else:
                alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
                nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
                _, Idx = nbrs.kneighbors(X)

            # drop the self column so nbrs_idx holds the k true neighbours
            self.nbrs_idx = Idx[:, 1:]
        Idx = self.nbrs_idx
        # compute transition prob.
        n = X.shape[0]
        self.P = np.zeros((n, n))
        for i in range(n):
            y = X[i]
            v = V[i]
            Y = X[Idx[i]]
            p = compute_markov_trans_prob(y, v, Y, s, cont_time=True)
            p[p <= tol] = 0  # tolerance check
            self.P[Idx[i], i] = p
            self.P[i, i] = -np.sum(p)
Example #5
    def get_Xss_confidence(self, k=50):
        X = self.X_data
        X = X.A if sp.issparse(X) else X
        Xss = self.Xss.get_X()
        Xref = np.median(X, 0)
        Xss = np.vstack((Xss, Xref))

        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric="euclidean",
                             n_neighbors=min(k, X.shape[0] - 1),
                             n_jobs=-1,
                             random_state=19491001)
            _, dist = nbrs.query(Xss, k=min(k, X.shape[0] - 1))
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=min(k, X.shape[0] - 1),
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, _ = nbrs.kneighbors(Xss)

        dist_m = dist.mean(1)
        # confidence = 1 - dist_m / dist_m.max()
        sigma = 0.1 * 0.5 * (np.max(X[:, 0]) - np.min(X[:, 0]) +
                             np.max(X[:, 1]) - np.min(X[:, 1]))
        confidence = gaussian_1d(dist_m, sigma=sigma)
        confidence /= np.max(confidence)
        return confidence[:-1]
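gaussian_1d is not shown in this snippet; a plausible definition, assuming the usual unnormalised Gaussian (any constant factor cancels when confidence is rescaled by its maximum above):

import numpy as np

def gaussian_1d(x, mu=0.0, sigma=1.0):
    # Unnormalised 1-D Gaussian kernel.
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2)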
Example #6
def graphize_vecfld(func, X, nbrs_idx=None, dist=None, k=30, distance_free=True, n_int_steps=20, cores=1):
    n, d = X.shape

    nbrs = None
    if nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2: 
            from pynndescent import NNDescent

            nbrs = NNDescent(X, metric='euclidean', n_neighbors=k+1, n_jobs=-1, random_state=19491001)
            nbrs_idx, dist = nbrs.query(X, k=k+1)
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=k+1, algorithm=alg, n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

    if dist is None and not distance_free:
        D = pdist(X)
    else:
        D = None

    V = sp.csr_matrix((n, n))
    if cores == 1:
        for i, idx in tqdm(enumerate(nbrs_idx), desc='Constructing diffusion graph from reconstructed vector field'):
            V += construct_v(X, i, idx, n_int_steps, func, distance_free, dist, D, n)

    else:
        pool = ThreadPool(cores)
        res = pool.starmap(construct_v, zip(itertools.repeat(X), np.arange(len(nbrs_idx)), nbrs_idx, itertools.repeat(n_int_steps),
                                            itertools.repeat(func), itertools.repeat(distance_free),
                                            itertools.repeat(dist), itertools.repeat(D), itertools.repeat(n)))
        pool.close()
        pool.join()
        V = functools.reduce((lambda a, b: a + b), res)

    return V, nbrs
Example #7
def bandwidth_selector(X):
    """
    This function computes an empirical bandwidth for a Gaussian kernel.
    """
    n, m = X.shape
    if n > 200000 and m > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=max(2, int(0.2 * n)),
            n_jobs=-1,
            random_state=19491001,
        )
        _, distances = nbrs.query(X, k=max(2, int(0.2 * n)))
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=max(2, int(0.2 * n)),
                                algorithm=alg,
                                n_jobs=-1).fit(X)
        distances, _ = nbrs.kneighbors(X)

    d = np.mean(distances[:, 1:]) / 1.5
    return np.sqrt(2) * d
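A short usage sketch (toy data of ours), showing that the selector returns a scalar bandwidth for a Gaussian kernel:

import numpy as np
from sklearn.neighbors import NearestNeighbors  # required by bandwidth_selector

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 2))
sigma = bandwidth_selector(X)  # sqrt(2) * (mean distance to the nearest ~20% of points / 1.5)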
Example #8
def test_nn_decent_with_parallel_backend():

    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)

    nn_indices, nn_distances = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=2,
        tree_init=False,
        seed_per_row=True,
    )._neighbor_graph

    with joblib.parallel_backend("threading"):
        nn_indices_threaded, nn_distances_threaded = NNDescent(
            data,
            n_neighbors=n_neighbors,
            max_candidates=max_candidates,
            n_iters=2,
            tree_init=False,
            seed_per_row=True,
        )._neighbor_graph

    assert_allclose(nn_indices_threaded, nn_indices)
    assert_allclose(nn_distances_threaded, nn_distances)
Example #9
def test_tree_no_split(small_data, sparse_small_data, metric):
    k = 10
    for data, data_type in zip([small_data, sparse_small_data],
                               ["dense", "sparse"]):
        n_instances = data.shape[0]
        leaf_size = n_instances + 1  # just to be safe
        data_train = data[n_instances // 2:]
        data_test = data[:n_instances // 2]

        nnd = NNDescent(
            data_train,
            metric=metric,
            n_neighbors=data_train.shape[0] - 1,
            random_state=None,
            tree_init=True,
            leaf_size=leaf_size,
        )
        nnd.prepare()
        knn_indices, _ = nnd.query(data_test, k=k, epsilon=0.2)

        true_nnd = NearestNeighbors(metric=metric).fit(data_train)
        true_indices = true_nnd.kneighbors(data_test, k, return_distance=False)

        num_correct = 0.0
        for i in range(true_indices.shape[0]):
            num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

        percent_correct = num_correct / (true_indices.shape[0] * k)
        assert (
            percent_correct >= 0.95
        ), "NN-descent query did not get 95% for accuracy on nearest neighbors on {} data".format(
            data_type)
Example #10
    def _init_pynndescent(self, distances):
        from pynndescent import NNDescent

        self._use_pynndescent = True

        first_col = np.arange(distances.shape[0])[:, None]
        init_indices = np.hstack((first_col, np.stack(distances.tolil().rows)))

        self._nnd_idx = NNDescent(
            data=self._rep,
            metric=self._metric,
            metric_kwds=self._metric_kwds,
            n_neighbors=self._n_neighbors,
            init_graph=init_indices,
            random_state=self._neigh_random_state,
        )

        # temporary hack for the broken forest storage
        from pynndescent.rp_trees import make_forest

        current_random_state = check_random_state(self._nnd_idx.random_state)
        self._nnd_idx._rp_forest = make_forest(
            self._nnd_idx._raw_data,
            self._nnd_idx.n_neighbors,
            self._nnd_idx.n_search_trees,
            self._nnd_idx.leaf_size,
            self._nnd_idx.rng_state,
            current_random_state,
            self._nnd_idx.n_jobs,
            self._nnd_idx._angular_trees,
        )
Example #11
def compute_tau(X, V, k=100, nbr_idx=None):
    if nbr_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k,
                n_jobs=-1,
                random_state=19491001,
            )
            _, dists = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
            dists, _ = nbrs.kneighbors(X)

    else:
        dists = np.zeros(nbr_idx.shape)
        for i in range(nbr_idx.shape[0]):
            for j in range(nbr_idx.shape[1]):
                x = X[i]
                y = X[nbr_idx[i, j]]
                dists[i, j] = np.sqrt((x - y).dot(x - y))
    d = np.mean(dists[:, 1:], 1)
    v = np.linalg.norm(V, axis=1)
    tau = d / v
    return tau, v
Example #12
    def get_knn_graph(self, data):
        nn = NNDescent(data,
                       metric="euclidean",
                       n_jobs=self.n_jobs,
                       random_state=self.random_state)
        # query k+1 and drop the first column: each point is its own NN
        indices, distances = nn.query(data, k=self.n_neighbors + 1)
        knn = indices[:, 1:]
        return knn
Example #13
    def fit(self, X, W, y, verbose=0):

        """
        Fit a counterfactual estimation model given explanatory variables X, treatment variable W and target y
        This method fits a forest-based model, extracts a supervised embedding from its leaves,
        and builds an nearest neighbor index on the embedding

        Parameters
        ----------

        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Data with explanatory variables, with possible confounders of treatment assignment and effect.

        W : array-like, shape = [n_samples]

        Treatment variable. The model will try to estimate a counterfactual outcome for each unique value in
        this variable.
        Should not exceed 10 values.

        y: array-like, shape = [n_samples]

        Target variable.

        verbose : int, optional (default=0)

        Verbosity level.

        Returns
        -------

        self: object

        """

        # checking if W has too many unique values
        if len(np.unique(W)) > 10:
            raise ValueError('More than 10 unique values for W. \
                Too many unique values will make the process very expensive.')

        # fitting the model
        self.model.fit(X, y)

        # getting forest embedding from model
        self.train_embed_ = self._get_forest_embed(X)

        # create neighbor index
        self.nn_index = NNDescent(self.train_embed_, metric='hamming')

        # creating a df with treatment assignments and outcomes
        self.train_outcome_df = pd.DataFrame({'neighbor': range(X.shape[0]), 'y': y, 'W': W})

        # saving explanatory variables
        if self.save_explanatory:
            self.X_train = X.assign(W=W, y=y)

        # return self
        return self
Example #14
    def fit(self,
            X,
            V,
            k,
            s=None,
            method="qp",
            eps=None,
            tol=1e-4):  # pass index
        # the parameter k will be replaced by a connectivity matrix in the future.
        self.__reset__()
        # knn clustering
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric="euclidean",
                             n_neighbors=k,
                             n_jobs=-1,
                             random_state=19491001)
            Idx, _ = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg,
                                    n_jobs=-1).fit(X)
            _, Idx = nbrs.kneighbors(X)
        # compute transition prob.
        n = X.shape[0]
        self.P = np.zeros((n, n))
        if method == "kernel":
            inv_s = np.linalg.inv(s)
            # compute density kernel
            if eps is not None:
                self.Kd = np.zeros((n, n))
                inv_eps = 1 / eps
                for i in range(n):
                    self.Kd[i, Idx[i]] = compute_density_kernel(
                        X[i], X[Idx[i]], inv_eps)
                D = np.sum(self.Kd, 0)
        for i in range(n):
            y = X[i]
            v = V[i]
            if method == "qp":
                Y = X[Idx[i, 1:]]
                p = compute_markov_trans_prob(y, v, Y, s)
                p[p <= tol] = 0  # tolerance check
                self.P[Idx[i, 1:], i] = p
                self.P[i, i] = 1 - np.sum(p)
            else:
                Y = X[Idx[i]]
                # p = compute_kernel_trans_prob(y, v, Y, inv_s)
                # drift kernel weights over the neighbours of point i
                kern = compute_drift_kernel(y, v, Y, inv_s)
                if eps is not None:
                    kern /= D[Idx[i]]
                p = kern / np.sum(kern)
                p[p <= tol] = 0  # tolerance check
                p = p / np.sum(p)
                self.P[Idx[i], i] = p
Example #15
def test_update_w_prepare_query_accuracy(nn_data, metric):
    nnd = NNDescent(
        nn_data[200:800],
        metric=metric,
        n_neighbors=10,
        random_state=None,
        compressed=False,
    )
    nnd.prepare()

    nnd.update(xs_fresh=nn_data[800:])
    nnd.prepare()

    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
    true_indices = true_nnd.kneighbors(nn_data[:200],
                                       10,
                                       return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, ("NN-descent query did not get 95% "
                                     "accuracy on nearest neighbors")
Example #16
    def calculate_neighbours(genes, n_neighbours: int, inverse: bool, scale: str, log: bool,
                             description: str = '', return_neigh_sim: bool = False,
                             genes_query_data: pd.DataFrame = None, remove_self: bool = False):
        """
        Calculate neighbours of genes based on cosine distance.
        :param genes: Data frame as in class init, gene names (rows) should match the one in init.
        :param n_neighbours: Number of neighbours to obtain for each gene. This will include self for non-inverse.
        :param inverse: Calculate most similar neighbours (False) or neighbours with inverse profile (True).
        :param scale: Scale expression by gene with 'minmax' (min=0, max=1) or 'mean0std1' (mean=0, std=1) or 'none'.
        :param log: Should expression data be log2(data+pseudocount) transformed before scaling.
        :param description: If an error occurs while making KNN index report this description with the error.
        :param return_neigh_sim: Return tuple with nearest neighbour matrix and similarity matrix data frames,
            as returned by pynndescent, but with distance matrix converted to similarities and with added gene
            names for the index.
        :param genes_query_data: Use this as query. If None use genes.
        :param remove_self: Used only if return_neigh_sim is True. Whether to remove each sample from its closest
            neighbours or not. If return_neigh_sim is False this is done automatically. This also removes the last
            column of neighbours if self is not present - thus it should not be used with inverse,
            as self will not be present.
        :return: Dict with keys being gene-pair name tuples (the alphabetically smaller name is the first tuple
            value) and values representing cosine similarity. Or see return_neigh_sim.
        """
        genes_index, genes_query = NeighbourCalculator.get_index_query(genes=genes, inverse=inverse, scale=scale,
                                                                       log=log,
                                                                       genes_query_data=genes_query_data)
        # Random state was not set during the analysis in the paper so the obtained results might differ slightly
        try:
            index = NNDescent(genes_index, n_jobs=THREADS, metric='cosine', random_state=0)
        except ValueError:
            try:
                index = NNDescent(genes_index, tree_init=False, n_jobs=THREADS, random_state=0)
                warnings.warn(
                    'Dataset ' + description + ' index computed without tree initialisation',
                    Warning)
            except ValueError:
                raise ValueError('Dataset ' + description + ' cannot be processed by pynndescent')
        neighbours, distances = index.query(genes_query.tolist(), k=n_neighbours)

        if genes_query_data is None:
            genes_query_data = genes
        if return_neigh_sim:
            neighbours = NeighbourCalculator.parse_neighbours_matrix(neighbours=neighbours,
                                                                     genes_query=genes_query_data,
                                                                     genes_idx=genes)
            similarities = pd.DataFrame(NeighbourCalculator.parse_distances_matrix(distances),
                                        index=genes_query_data.index)
            if remove_self:
                neighbours, similarities = NeighbourCalculator.remove_self_pynn_matrix(neighbours=neighbours,
                                                                                       similarities=similarities)
            return neighbours, similarities
        else:
            return NeighbourCalculator.parse_neighbours(neighbours=neighbours, distances=distances,
                                                        genes_query=genes_query_data, genes_idx=genes)
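A hypothetical call, assuming calculate_neighbours is a staticmethod of NeighbourCalculator and genes is an expression DataFrame with genes as rows:

# neighbours/similarities as DataFrames, self-matches removed
neighbours, similarities = NeighbourCalculator.calculate_neighbours(
    genes, n_neighbours=11, inverse=False, scale='minmax', log=True,
    return_neigh_sim=True, remove_self=True)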
Example #17
def test_nn_descent():

    np.random.seed(42)
    N = 100
    # D = 128
    D = 4
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)

    nn_indices, nn_distances = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=1,
        random_state=42,
        delta=0,
        tree_init=False,
        seed_per_row=True,
    )._neighbor_graph

    for i in range(data.shape[0]):
        assert_equal(
            len(nn_indices[i]),
            len(np.unique(nn_indices[i])),
            "Duplicate graph_indices in unthreaded knn graph",
        )

    nn_indices_threaded, nn_distances_threaded = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=1,
        random_state=42,
        delta=0,
        tree_init=False,
        seed_per_row=True,
        n_jobs=2,
    )._neighbor_graph

    for i in range(data.shape[0]):
        assert_equal(
            len(nn_indices_threaded[i]),
            len(np.unique(nn_indices_threaded[i])),
            "Duplicate graph_indices in threaded knn graph",
        )

    nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                            algorithm="brute").fit(data)
    _, nn_gold_indices = nbrs.kneighbors(data)

    assert_allclose(nn_indices_threaded, nn_indices)
    assert_allclose(nn_distances_threaded, nn_distances)
Example #18
def test_nn_descent_query_accuracy(nn_data):
    nnd = NNDescent(nn_data[200:], "euclidean", n_neighbors=10, random_state=None)
    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)

    tree = KDTree(nn_data[200:])
    true_indices = tree.query(nn_data[:200], 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
    )
Example #19
class NNDescent(KNNIndex):
    # TODO: Make mapping from sklearn metrics to lib metrics

    def build(self, data):
        self.index = LibNNDescent(data, metric=self.metric, n_neighbors=5)

    def query_train(self, data, k):
        search_neighbors = min(data.shape[0] - 1, k + 1)
        neighbors, distances = self.index.query(data,
                                                k=search_neighbors,
                                                queue_size=1)
        return neighbors[:, 1:], distances[:, 1:]

    def query(self, query, k):
        return self.index.query(query, k=k, queue_size=1)
Example #20
def test_nn_descent_query_accuracy_angular(nn_data):
    nnd = NNDescent(nn_data[200:], "cosine", n_neighbors=30, random_state=None)
    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.32)

    nn = NearestNeighbors(metric="cosine").fit(nn_data[200:])
    true_indices = nn.kneighbors(nn_data[:200], n_neighbors=10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
    )
Example #21
def py_nearest_neighbors(dataset, K, metric, repetition):
    # (Try to) force pynndescent to use only a single thread.

    # A point is its own nearest neighbour, which is why we query K+1
    # neighbours and remove the self column afterwards.

    runtime = np.zeros(repetition+1)
    nn_list = []
    for i in range(repetition+1):
        start = time.perf_counter()
        index = NNDescent(dataset.X,
                          n_neighbors=(K+1),
#                          verbose=True,
                          tree_init=False,  # skip random projection tree initialisation
                          n_jobs=1)

        elapsed = time.perf_counter()-start
        runtime[i] = elapsed

        nn_arr = index._neighbor_graph[0]
        assert((nn_arr[:,0] == np.array(range(dataset.N))).all())
        nn_list.append(NearestNeighbors(nn_arr[:,1:], metric))

    # skip the first repetition, since numba JIT compilation does a lot of work there
    return nn_list[1:], Timingdata(None, runtime[1:], "pynndescent")
Example #22
def test_deduplicated_data_behaves_normally(seed, cosine_hang_data):

    data = np.unique(cosine_hang_data, axis=0)
    data = data[~np.all(data == 0, axis=1)]
    data = data[:1000]

    n_neighbors = 10
    knn_indices, _ = NNDescent(
        data,
        "cosine",
        {},
        n_neighbors,
        random_state=np.random.RandomState(seed),
        n_trees=20,
    )._neighbor_graph

    for i in range(data.shape[0]):
        assert len(knn_indices[i]) == len(np.unique(
            knn_indices[i])), "Duplicate graph_indices in knn graph"

    angular_data = normalize(data, norm="l2")
    tree = KDTree(angular_data)
    true_indices = tree.query(angular_data, n_neighbors, return_distance=False)

    num_correct = 0
    for i in range(data.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    proportion_correct = num_correct / (data.shape[0] * n_neighbors)
    assert (proportion_correct >=
            0.95), "NN-descent did not get 95% accuracy on nearest neighbors"
Example #23
def test_deduplicated_data_behaves_normally():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(this_dir, "test_data/cosine_hang.npy")
    data = np.unique(np.load(data_path), axis=0)
    data = data[~np.all(data == 0, axis=1)]
    data = data[:1000]

    n_neighbors = 10
    knn_indices, _ = NNDescent(
        data, "cosine", {}, n_neighbors, random_state=np.random, n_trees=20
    )._neighbor_graph

    for i in range(data.shape[0]):
        assert_equal(
            len(knn_indices[i]),
            len(np.unique(knn_indices[i])),
            "Duplicate graph_indices in knn graph",
        )

    angular_data = normalize(data, norm="l2")
    tree = KDTree(angular_data)
    true_indices = tree.query(angular_data, n_neighbors, return_distance=False)

    num_correct = 0
    for i in range(data.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    proportion_correct = num_correct / (data.shape[0] * n_neighbors)
    assert_greater_equal(
        proportion_correct,
        0.95,
        "NN-descent did not get 95%" " accuracy on nearest neighbors",
    )
Example #24
def trn(X, n, return_index=True, seed=19491001, **kwargs):
    trnet = TRNET(n, X, seed)
    trnet.run(**kwargs)
    if not return_index:
        return trnet.W
    else:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X, metric="euclidean", n_neighbors=1, n_jobs=-1, random_state=seed)
            idx, _ = nbrs.query(trnet.W, k=1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=1, algorithm=alg, n_jobs=-1).fit(X)
            _, idx = nbrs.kneighbors(trnet.W)

        return idx[:, 0]
Example #25
def test_tree_numbers_after_multiple_updates(n_trees):
    trees_after_update = max(1, int(np.round(n_trees / 3)))

    nnd = NNDescent(np.array([[1.0]]), n_neighbors=1, n_trees=n_trees)

    assert nnd.n_trees == n_trees, "NN-descent update changed the number of trees"
    assert (
        nnd.n_trees_after_update == trees_after_update
    ), "The value of the n_trees_after_update in NN-descent after update(s) is wrong"
    for i in range(5):
        nnd.update(xs_fresh=np.array([[i]], dtype=np.float64))
        assert (
            nnd.n_trees == trees_after_update
        ), "The value of the n_trees in NN-descent after update(s) is wrong"
        assert (
            nnd.n_trees_after_update == trees_after_update
        ), "The value of the n_trees_after_update in NN-descent after update(s) is wrong"
Example #26
def test_joblib_dump():
    seed = np.random.RandomState(42)

    x1 = seed.normal(0, 100, (1000, 50))
    x2 = seed.normal(0, 100, (1000, 50))

    index1 = NNDescent(x1, "euclidean", {}, 10, random_state=None)
    neighbors1, distances1 = index1.query(x2)

    mem_temp = io.BytesIO()
    joblib.dump(index1, mem_temp)
    mem_temp.seek(0)
    index2 = joblib.load(mem_temp)

    neighbors2, distances2 = index2.query(x2)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
Example #27
    def __init__(self, data, n_components=30, normalize=False):
        self.data = data
        if self.data.shape[1] > n_components:
            from sklearn.decomposition import PCA
            from sklearn.preprocessing import StandardScaler

            data_std = StandardScaler().fit_transform(self.data)
            self.pca = PCA(n_components).fit_transform(data_std)
        else:
            self.pca = np.array(data)

        if normalize:
            # from sklearn.preprocessing import MaxAbsScaler
            # self.pca = MaxAbsScaler().fit_transform(self.pca)
            raise NotImplementedError

        from pynndescent import NNDescent

        self.ann_index = NNDescent(self.pca)
Example #28
def test_sparse_nn_descent_query_accuracy():
    nnd = NNDescent(
        sparse_nn_data[200:], "euclidean", n_neighbors=10, random_state=None
    )
    knn_indices, _ = nnd.query(sparse_nn_data[:200], k=10)

    tree = KDTree(sparse_nn_data[200:].toarray())
    true_indices = tree.query(sparse_nn_data[:200].toarray(), 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.95,
        "Sparse NN-descent query did not get 95% " "accuracy on nearest neighbors",
    )
Example #29
def p_ij_sym(x, perp, verbose=False):
    num_pts = x.shape[0]
    k = min(num_pts - 1, int(3 * perp))
    if verbose:
        print('Indexing')
    index = NNDescent(x)
    neighbors = np.empty((num_pts, k-1), dtype=int)
    p_ij = np.empty((num_pts, k-1))
    for i, xi in enumerate(x):
        if verbose:
            print('Calculating probabilities: {cur}/{tot}'.format(
                cur=i+1, tot=num_pts), end='\r')
        nn, dists = index.query([xi], k)
        beta = find_beta(dists[0, 1:], perp)
        neighbors[i] = nn[0, 1:]
        p_ij[i] = p_i(dists[0, 1:], beta)
    row_indices = np.repeat(np.arange(num_pts), k-1)
    p = csr_matrix((p_ij.ravel(), (row_indices, neighbors.ravel())))
    return 0.5*(p + p.transpose())
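A brief usage sketch (toy data of ours; find_beta and p_i must come from the same module as p_ij_sym):

import numpy as np

x = np.random.rand(200, 5).astype(np.float32)
P = p_ij_sym(x, perp=30)  # sparse symmetric affinities, 0.5 * (p + p.T)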
Example #30
def test_nn_decent_with_n_jobs_minus_one():
    nn_indices, nn_distances = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=2,
        delta=0,
        tree_init=False,
        seed_per_row=True,
    )._neighbor_graph

    for i in range(data.shape[0]):
        assert_equal(
            len(nn_indices[i]),
            len(np.unique(nn_indices[i])),
            "Duplicate indices in unthreaded knn graph",
        )

    nn_indices_threaded, nn_distances_threaded = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=2,
        delta=0,
        tree_init=False,
        seed_per_row=True,
        n_jobs=-1,
    )._neighbor_graph

    for i in range(data.shape[0]):
        assert_equal(
            len(nn_indices_threaded[i]),
            len(np.unique(nn_indices_threaded[i])),
            "Duplicate indices in threaded knn graph",
        )

    nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                            algorithm="brute").fit(data)
    _, nn_gold_indices = nbrs.kneighbors(data)

    assert_allclose(nn_indices_threaded, nn_indices)
    assert_allclose(nn_distances_threaded, nn_distances)