Example #1
def test_deterministic():
    seed = np.random.RandomState(42)

    x1 = seed.normal(0, 100, (1000, 50))
    x2 = seed.normal(0, 100, (1000, 50))

    index1 = NNDescent(x1, random_state=np.random.RandomState(42))
    neighbors1, distances1 = index1.query(x2)

    index2 = NNDescent(x1, random_state=np.random.RandomState(42))
    neighbors2, distances2 = index2.query(x2)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
Example #2
class NNDescent(KNNIndex):
    # TODO: Make mapping from sklearn metrics to lib metrics

    def build(self, data):
        self.index = LibNNDescent(data, metric=self.metric, n_neighbors=5)

    def query_train(self, data, k):
        search_neighbors = min(data.shape[0] - 1, k + 1)
        neighbors, distances = self.index.query(data,
                                                k=search_neighbors,
                                                queue_size=1)
        return neighbors[:, 1:], distances[:, 1:]

    def query(self, query, k):
        return self.index.query(query, k=k, queue_size=1)
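
In `query_train` above, the index is queried with k + 1 neighbors and the first column is dropped because, when training points are queried against an index built from the same data, each point's nearest neighbor is (almost always) itself. A minimal, self-contained sketch of the same pattern with the plain pynndescent API (data and parameters here are illustrative, not from the wrapper above):

import numpy as np
from pynndescent import NNDescent

rng = np.random.RandomState(0)
data = rng.normal(size=(200, 8))
index = NNDescent(data, metric="euclidean", n_neighbors=15, random_state=0)

# Ask for k + 1 neighbors of the training points themselves, then drop the
# first column, which normally holds each point's own index at distance 0.
k = 5
neighbors, distances = index.query(data, k=k + 1)
neighbors, distances = neighbors[:, 1:], distances[:, 1:]
assert neighbors.shape == (200, k)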
Example #3
def compute_tau(X, V, k=100, nbr_idx=None):
    if nbr_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k,
                n_jobs=-1,
                random_state=19491001,
            )
            _, dists = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
            dists, _ = nbrs.kneighbors(X)

    else:
        dists = np.zeros(nbr_idx.shape)
        for i in range(nbr_idx.shape[0]):
            for j in range(nbr_idx.shape[1]):
                x = X[i]
                y = X[nbr_idx[i, j]]
                dists[i, j] = np.sqrt((x - y).dot(x - y))
    d = np.mean(dists[:, 1:], 1)
    v = np.linalg.norm(V, axis=1)
    tau = d / v
    return tau, v
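
`compute_tau` above returns, for each point, the mean distance to its k nearest neighbors (excluding the self column) divided by the norm of its velocity vector. A toy invocation, assuming the function and its numpy/sklearn imports are in scope, might look like:

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 3))   # small data, so the sklearn kd_tree branch is used
V = rng.normal(size=(500, 3))
tau, v = compute_tau(X, V, k=30)   # tau: mean kNN distance / velocity magnitude per point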
Example #4
def graphize_vecfld(func, X, nbrs_idx=None, dist=None, k=30, distance_free=True, n_int_steps=20, cores=1):
    n, d = X.shape

    nbrs = None
    if nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2: 
            from pynndescent import NNDescent

            nbrs = NNDescent(X, metric='euclidean', n_neighbors=k+1, n_jobs=-1, random_state=19491001)
            nbrs_idx, dist = nbrs.query(X, k=k+1)
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=k+1, algorithm=alg, n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

    if dist is None and not distance_free:
        D = pdist(X)
    else:
        D = None

    V = sp.csr_matrix((n, n))
    if cores == 1:
        for i, idx in tqdm(enumerate(nbrs_idx), desc='Constructing diffusion graph from reconstructed vector field'):
            V += construct_v(X, i, idx, n_int_steps, func, distance_free, dist, D, n)

    else:
        pool = ThreadPool(cores)
        res = pool.starmap(construct_v, zip(itertools.repeat(X), np.arange(len(nbrs_idx)), nbrs_idx, itertools.repeat(n_int_steps),
                                            itertools.repeat(func), itertools.repeat(distance_free),
                                            itertools.repeat(dist), itertools.repeat(D), itertools.repeat(n)))
        pool.close()
        pool.join()
        V = functools.reduce((lambda a, b: a + b), res)

    return V, nbrs
Example #5
    def get_Xss_confidence(self):
        X = self.X_data
        X = X.A if sp.issparse(X) else X
        Xss = self.Xss.get_X()
        alg = 'ball_tree' if Xss.shape[1] > 10 else 'kd_tree'

        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric='euclidean',
                             n_neighbors=min(self.k, X.shape[0] - 1),
                             n_jobs=-1,
                             random_state=19491001)
            _, dist = nbrs.query(Xss, k=min(self.k, X.shape[0] - 1))
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=min(self.k, X.shape[0] - 1),
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, _ = nbrs.kneighbors(Xss)

        dist_m = dist.mean(1)
        confidence = 1 - dist_m / dist_m.max()

        return confidence
Example #6
def test_tree_no_split(small_data, sparse_small_data, metric):
    k = 10
    for data, data_type in zip([small_data, sparse_small_data],
                               ["dense", "sparse"]):
        n_instances = data.shape[0]
        leaf_size = n_instances + 1  # just to be safe
        data_train = data[n_instances // 2:]
        data_test = data[:n_instances // 2]

        nnd = NNDescent(
            data_train,
            metric=metric,
            n_neighbors=data_train.shape[0] - 1,
            random_state=None,
            tree_init=True,
            leaf_size=leaf_size,
        )
        nnd.prepare()
        knn_indices, _ = nnd.query(data_test, k=k, epsilon=0.2)

        true_nnd = NearestNeighbors(metric=metric).fit(data_train)
        true_indices = true_nnd.kneighbors(data_test, k, return_distance=False)

        num_correct = 0.0
        for i in range(true_indices.shape[0]):
            num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

        percent_correct = num_correct / (true_indices.shape[0] * k)
        assert (
            percent_correct >= 0.95
        ), "NN-descent query did not get 95% for accuracy on nearest neighbors on {} data".format(
            data_type)
Example #7
    def get_Xss_confidence(self, k=50):
        X = self.X_data
        X = X.A if sp.issparse(X) else X
        Xss = self.Xss.get_X()
        Xref = np.median(X, 0)
        Xss = np.vstack((Xss, Xref))

        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric="euclidean",
                             n_neighbors=min(k, X.shape[0] - 1),
                             n_jobs=-1,
                             random_state=19491001)
            _, dist = nbrs.query(Xss, k=min(k, X.shape[0] - 1))
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=min(k, X.shape[0] - 1),
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, _ = nbrs.kneighbors(Xss)

        dist_m = dist.mean(1)
        # confidence = 1 - dist_m / dist_m.max()
        sigma = 0.1 * 0.5 * (np.max(X[:, 0]) - np.min(X[:, 0]) +
                             np.max(X[:, 1]) - np.min(X[:, 1]))
        confidence = gaussian_1d(dist_m, sigma=sigma)
        confidence /= np.max(confidence)
        return confidence[:-1]
Example #8
def test_update_w_prepare_query_accuracy(nn_data, metric):
    nnd = NNDescent(
        nn_data[200:800],
        metric=metric,
        n_neighbors=10,
        random_state=None,
        compressed=False,
    )
    nnd.prepare()

    nnd.update(xs_fresh=nn_data[800:])
    nnd.prepare()

    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
    true_indices = true_nnd.kneighbors(nn_data[:200],
                                       10,
                                       return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, ("NN-descent query did not get 95% "
                                     "accuracy on nearest neighbors")
Example #9
def bandwidth_selector(X):
    """
    This function computes an empirical bandwidth for a Gaussian kernel.
    """
    n, m = X.shape
    if n > 200000 and m > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=max(2, int(0.2 * n)),
            n_jobs=-1,
            random_state=19491001,
        )
        _, distances = nbrs.query(X, k=max(2, int(0.2 * n)))
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=max(2, int(0.2 * n)),
                                algorithm=alg,
                                n_jobs=-1).fit(X)
        distances, _ = nbrs.kneighbors(X)

    d = np.mean(distances[:, 1:]) / 1.5
    return np.sqrt(2) * d
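
The empirical rule above averages the distances from every point to roughly its nearest 20% of the data (dropping the zero self-distance column), divides by 1.5, and scales by sqrt(2) to obtain a single Gaussian-kernel bandwidth. A toy call, assuming the function and its numpy/sklearn imports are in scope:

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 2))   # small data, so the sklearn kd_tree branch is used
sigma = bandwidth_selector(X)    # one scalar bandwidth for a Gaussian kernel
print(sigma)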
Example #10
    def fit(self, X, V, k, s=None, tol=1e-4):
        self.__reset__()
        # knn clustering
        if self.nbrs_idx is None:
            if X.shape[0] > 200000 and X.shape[1] > 2: 
                from pynndescent import NNDescent

                nbrs = NNDescent(X, metric='euclidean', n_neighbors=k + 1, n_jobs=-1,
                                  random_state=19491001)
                Idx, _ = nbrs.query(X, k=k+1)
            else:
                alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
                nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
                _, Idx = nbrs.kneighbors(X)

            self.nbrs_idx = Idx[:, 1:]
        else:
            Idx = self.nbrs_idx
        # compute transition prob.
        n = X.shape[0]
        self.P = np.zeros((n, n))
        for i in range(n):
            y = X[i]
            v = V[i]
            Y = X[Idx[i, 1:]]
            p = compute_markov_trans_prob(y, v, Y, s, cont_time=True)
            p[p <= tol] = 0  # tolerance check
            self.P[Idx[i, 1:], i] = p
            self.P[i, i] = -np.sum(p)
Example #11
def test_transformer_equivalence():
    N_NEIGHBORS = 15
    EPSILON = 0.15
    train = nn_data[:400]
    test = nn_data[:200]

    # Note we shift N_NEIGHBORS to conform to sklearn's KNeighborTransformer defn
    nnd = NNDescent(data=train,
                    n_neighbors=N_NEIGHBORS + 1,
                    random_state=42,
                    compressed=False)
    indices, dists = nnd.query(test, k=N_NEIGHBORS, epsilon=EPSILON)
    sort_idx = np.argsort(indices, axis=1)
    indices_sorted = np.vstack(
        [indices[i, sort_idx[i]] for i in range(sort_idx.shape[0])])
    dists_sorted = np.vstack(
        [dists[i, sort_idx[i]] for i in range(sort_idx.shape[0])])

    # Note we shift N_NEIGHBORS to conform to sklearn' KNeighborTransformer defn
    transformer = PyNNDescentTransformer(n_neighbors=N_NEIGHBORS,
                                         search_epsilon=EPSILON,
                                         random_state=42).fit(
                                             train, compress_index=False)
    Xt = transformer.transform(test).sorted_indices()

    assert np.all(Xt.indices == indices_sorted.flatten())
    assert np.allclose(Xt.data, dists_sorted.flat)
Example #12
 def get_knn_graph(self, data):
     nn = NNDescent(data,
                    metric="euclidean",
                    n_jobs=self.n_jobs,
                    random_state=self.random_state)
     indices, distances = nn.query(data, k=self.n_neighbors + 1)
     knn = indices[:, 1:]
     return knn
Example #13
    def fit(self,
            X,
            V,
            k,
            s=None,
            method="qp",
            eps=None,
            tol=1e-4):  # pass index
        # the parameter k will be replaced by a connectivity matrix in the future.
        self.__reset__()
        # knn clustering
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric="euclidean",
                             n_neighbors=k,
                             n_jobs=-1,
                             random_state=19491001)
            Idx, _ = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg,
                                    n_jobs=-1).fit(X)
            _, Idx = nbrs.kneighbors(X)
        # compute transition prob.
        n = X.shape[0]
        self.P = np.zeros((n, n))
        if method == "kernel":
            inv_s = np.linalg.inv(s)
            # compute density kernel
            if eps is not None:
                self.Kd = np.zeros((n, n))
                inv_eps = 1 / eps
                for i in range(n):
                    self.Kd[i, Idx[i]] = compute_density_kernel(
                        X[i], X[Idx[i]], inv_eps)
                D = np.sum(self.Kd, 0)
        for i in range(n):
            y = X[i]
            v = V[i]
            if method == "qp":
                Y = X[Idx[i, 1:]]
                p = compute_markov_trans_prob(y, v, Y, s)
                p[p <= tol] = 0  # tolerance check
                self.P[Idx[i, 1:], i] = p
                self.P[i, i] = 1 - np.sum(p)
            else:
                Y = X[Idx[i]]
                # p = compute_kernel_trans_prob(y, v, Y, inv_s)
                k = compute_drift_kernel(y, v, Y, inv_s)
                if eps is not None:
                    k /= D[Idx[i]]
                p = k / np.sum(k)
                p[p <= tol] = 0  # tolerance check
                p = p / np.sum(p)
                self.P[Idx[i], i] = p
Example #14
    def calculate_neighbours(genes, n_neighbours: int, inverse: bool, scale: str, log: bool,
                             description: str = '', return_neigh_sim: bool = False,
                             genes_query_data: pd.DataFrame = None, remove_self: bool = False):
        """
        Calculate neighbours of genes based on cosine distance.
        :param genes: Data frame as in class init, gene names (rows) should match the one in init.
        :param n_neighbours: Number of neighbours to obtain for each gene. This will include self for non-inverse.
        :param inverse: Calculate most similar neighbours (False) or neighbours with inverse profile (True).
        :param scale: Scale expression by gene with 'minmax' (min=0, max=1) or 'mean0std1' (mean=0, std=1) or 'none'.
        :param log: Should expression data be log2(data+pseudocount) transformed before scaling.
        :param description: If an error occurs while making KNN index report this description with the error.
        :param return_neigh_sim: Return tuple with nearest neighbour matrix and similarity matrix data frames,
            as returned by pynndescent, but with distance matrix converted to similarities and with added gene
            names for the index.
        :param genes_query_data: Use this as query. If None use genes.
        :param remove_self: Used only if return_neigh_sim is True. Whether to remove each gene from its own closest
            neighbours or not. If return_neigh_sim is False this is done automatically. This also removes the last
            column of neighbours if self is not present - thus it should not be used with inverse,
            as self will not be present.
        :return: Dict with keys being gene-pair name tuples (the alphabetically smaller name is the first tuple value)
            and values representing cosine similarity. Or see return_neigh_sim.
        """
        genes_index, genes_query = NeighbourCalculator.get_index_query(genes=genes, inverse=inverse, scale=scale,
                                                                       log=log,
                                                                       genes_query_data=genes_query_data)
        # Random state was not set during the analysis in the paper so the obtained results might differ slightly
        try:
            index = NNDescent(genes_index, n_jobs=THREADS, metric='cosine', random_state=0)
        except ValueError:
            try:
                index = NNDescent(genes_index, tree_init=False, n_jobs=THREADS, random_state=0)
                warnings.warn(
                    'Dataset ' + description + ' index computed without tree initialisation',
                    Warning)
            except ValueError:
                raise ValueError('Dataset ' + description + ' cannot be processed by pynndescent')
        neighbours, distances = index.query(genes_query.tolist(), k=n_neighbours)

        if genes_query_data is None:
            genes_query_data = genes
        if return_neigh_sim:
            neighbours = NeighbourCalculator.parse_neighbours_matrix(neighbours=neighbours,
                                                                     genes_query=genes_query_data,
                                                                     genes_idx=genes)
            similarities = pd.DataFrame(NeighbourCalculator.parse_distances_matrix(distances),
                                        index=genes_query_data.index)
            if remove_self:
                neighbours, similarities = NeighbourCalculator.remove_self_pynn_matrix(neighbours=neighbours,
                                                                                       similarities=similarities)
            return neighbours, similarities
        else:
            return NeighbourCalculator.parse_neighbours(neighbours=neighbours, distances=distances,
                                                        genes_query=genes_query_data, genes_idx=genes)
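
The docstring above notes that the distance matrix returned by pynndescent is converted to similarities, and the code falls back to `tree_init=False` when the default index construction fails. A minimal, stand-alone sketch of that pattern, assuming a cosine metric for which similarity can be recovered as 1 - distance (names here are illustrative, not from the class above):

import warnings
import numpy as np
from pynndescent import NNDescent

rng = np.random.RandomState(0)
data = rng.normal(size=(300, 20))
try:
    index = NNDescent(data, metric="cosine", random_state=0)
except ValueError:
    index = NNDescent(data, metric="cosine", tree_init=False, random_state=0)
    warnings.warn("index computed without tree initialisation", Warning)

neighbours, distances = index.query(data, k=10)
similarities = 1 - distances   # cosine distance = 1 - cosine similarity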
Example #15
def prepare_velocity_grid_data(
    X_emb,
    xy_grid_nums,
    density=None,
    smooth=None,
    n_neighbors=None,
):

    n_obs, n_dim = X_emb.shape
    density = 1 if density is None else density
    smooth = 0.5 if smooth is None else smooth

    grs, scale = [], 0
    for dim_i in range(n_dim):
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        m = m - 0.01 * np.abs(M - m)
        M = M + 0.01 * np.abs(M - m)
        gr = np.linspace(m, M, xy_grid_nums[dim_i] * density)
        scale += gr[1] - gr[0]
        grs.append(gr)

    scale = scale / n_dim * smooth

    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None:
        n_neighbors = np.max([10, int(n_obs / 50)])

    if X_emb.shape[0] > 200000 and X_emb.shape[1] > 2:
        from pynndescent import NNDescent

        nn = NNDescent(X_emb,
                       metric='euclidean',
                       n_neighbors=n_neighbors,
                       n_jobs=-1,
                       random_state=19491001)
        neighs, dists = nn.query(X_grid, k=n_neighbors)
    else:
        alg = "ball_tree" if X_emb.shape[1] > 10 else 'kd_tree'
        nn = NearestNeighbors(n_neighbors=n_neighbors,
                              n_jobs=-1,
                              algorithm=alg)
        nn.fit(X_emb)
        dists, neighs = nn.kneighbors(X_grid)

    weight = norm.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    return X_grid, p_mass, neighs, weight
Example #16
def test_nn_descent_query_accuracy(nn_data):
    nnd = NNDescent(nn_data[200:], "euclidean", n_neighbors=10, random_state=None)
    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)

    tree = KDTree(nn_data[200:])
    true_indices = tree.query(nn_data[:200], 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
    )
Example #17
def test_nn_descent_query_accuracy_angular(nn_data):
    nnd = NNDescent(nn_data[200:], "cosine", n_neighbors=30, random_state=None)
    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.32)

    nn = NearestNeighbors(metric="cosine").fit(nn_data[200:])
    true_indices = nn.kneighbors(nn_data[:200], n_neighbors=10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
    )
Example #18
def trn(X, n, return_index=True, seed=19491001, **kwargs):
    trnet = TRNET(n, X, seed)
    trnet.run(**kwargs)
    if not return_index:
        return trnet.W
    else:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X, metric="euclidean", n_neighbors=1, n_jobs=-1, random_state=seed)
            idx, _ = nbrs.query(trnet.W, k=1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=1, algorithm=alg, n_jobs=-1).fit(X)
            _, idx = nbrs.kneighbors(trnet.W)

        return idx[:, 0]
Example #19
def test_joblib_dump():
    seed = np.random.RandomState(42)

    x1 = seed.normal(0, 100, (1000, 50))
    x2 = seed.normal(0, 100, (1000, 50))

    index1 = NNDescent(x1, "euclidean", {}, 10, random_state=None)
    neighbors1, distances1 = index1.query(x2)

    mem_temp = io.BytesIO()
    joblib.dump(index1, mem_temp)
    mem_temp.seek(0)
    index2 = joblib.load(mem_temp)

    neighbors2, distances2 = index2.query(x2)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
Example #20
def test_sparse_nn_descent_query_accuracy():
    nnd = NNDescent(
        sparse_nn_data[200:], "euclidean", n_neighbors=10, random_state=None
    )
    knn_indices, _ = nnd.query(sparse_nn_data[:200], k=10)

    tree = KDTree(sparse_nn_data[200:].toarray())
    true_indices = tree.query(sparse_nn_data[:200].toarray(), 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.95,
        "Sparse NN-descent query did not get 95% " "accuracy on nearest neighbors",
    )
Example #21
def p_ij_sym(x, perp, verbose=False):
    num_pts = x.shape[0]
    k = min(num_pts - 1, int(3 * perp))
    if verbose:
        print('Indexing')
    index = NNDescent(x)
    neighbors = np.empty((num_pts, k-1), dtype=int)
    p_ij = np.empty((num_pts, k-1))
    for i, xi in enumerate(x):
        if verbose:
            print('Calculating probabilities: {cur}/{tot}'.format(
                cur=i+1, tot=num_pts), end='\r')
        nn, dists = index.query([xi], k)
        beta = find_beta(dists[0, 1:], perp)
        neighbors[i] = nn[0, 1:]
        p_ij[i] = p_i(dists[0, 1:], beta)
    row_indices = np.repeat(np.arange(num_pts), k-1)
    p = csr_matrix((p_ij.ravel(), (row_indices, neighbors.ravel())))
    return 0.5*(p + p.transpose())
Example #22
def test_transformer_equivalence():
    N_NEIGHBORS = 15
    QUEUE_SIZE = 5.0
    train = nn_data[:400]
    test = nn_data[:200]

    nnd = NNDescent(data=train, n_neighbors=N_NEIGHBORS, random_state=42)
    indices, dists = nnd.query(test, k=N_NEIGHBORS, queue_size=QUEUE_SIZE)
    sort_idx = np.argsort(indices, axis=1)
    indices_sorted = np.vstack(
        [indices[i, sort_idx[i]] for i in range(sort_idx.shape[0])]
    )
    dists_sorted = np.vstack([dists[i, sort_idx[i]] for i in range(sort_idx.shape[0])])

    transformer = PyNNDescentTransformer(
        n_neighbors=N_NEIGHBORS, search_queue_size=QUEUE_SIZE, random_state=42
    ).fit(train)
    Xt = transformer.transform(test).sorted_indices()

    assert np.all(Xt.indices == indices_sorted.flat)
    assert np.allclose(Xt.data, dists_sorted.flat)
Example #23
 def test_generate_triplets(self):
     key = random.PRNGKey(42)
     n_points = 1000
     n_inliers = 10
     n_outliers = 5
     n_random = 3
     n_extra = min(n_inliers + 50, n_points)
     # Currently testing it only for 'euclidean' distance. The test for other
     # cases breaks due to issues with the knn search NNDescent package, but
     # it works fine when tested in a colab.
     for distance in ['euclidean']:
         inputs = np.random.normal(size=(n_points, 100))
         index = NNDescent(inputs, metric=distance)
         index.prepare()
         neighbors = index.query(inputs, n_extra)[0]
         neighbors = np.concatenate(
             (np.arange(n_points).reshape([-1, 1]), neighbors), 1)
         distance_fn = trimap.get_distance_fn(distance)
         _, _, sig = trimap.find_scaled_neighbors(inputs, neighbors,
                                                  distance_fn)
         triplets, _ = trimap.generate_triplets(key,
                                                inputs,
                                                n_inliers=n_inliers,
                                                n_outliers=n_outliers,
                                                n_random=n_random,
                                                distance=distance)
         similar_pairs_distances = distance_fn(inputs[triplets[:, 0]],
                                               inputs[triplets[:, 1]])**2
         similar_pairs_distances /= (sig[triplets[:, 0]] *
                                     sig[triplets[:, 1]])
         outlier_pairs_distances = distance_fn(inputs[triplets[:, 0]],
                                               inputs[triplets[:, 2]])**2
         outlier_pairs_distances /= (sig[triplets[:, 0]] *
                                     sig[triplets[:, 2]])
         npt.assert_array_less(similar_pairs_distances,
                               outlier_pairs_distances)
     n_knn_triplets = inputs.shape[0] * n_inliers * n_outliers
     n_random_triplets = inputs.shape[0] * n_random
     npt.assert_equal(triplets.shape,
                      [n_knn_triplets + n_random_triplets, 3])
Example #24
def test_sparse_nn_descent_query_accuracy_angular():
    nnd = NNDescent(sparse_nn_data[200:],
                    "cosine",
                    n_neighbors=50,
                    random_state=None)
    knn_indices, _ = nnd.query(sparse_nn_data[:200], k=10, epsilon=0.36)

    nn = NearestNeighbors(metric="cosine").fit(sparse_nn_data[200:].toarray())
    true_indices = nn.kneighbors(sparse_nn_data[:200].toarray(),
                                 n_neighbors=10,
                                 return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.95,
        "Sparse NN-descent query did not get 95% "
        "accuracy on nearest neighbors",
    )
Example #25
def module_computing():
    json_data = json.loads(request.form.get('data'))
    selected_nodes = json_data['nodes']
    data, cols = get_selected_data(selected_nodes)
    module_info = json_data['module_info']
    # data_new = call_module_function(data, cols, module_info)
    # data_new['kmeans_cluster'] = KMeans(n_clusters=4, random_state=0).fit(data_new).labels_
    # data_new = data_new.to_json(orient='records')
    # return jsonify(module_result=data_new)
    # return data_new
    # kNN graph
    from pynndescent import NNDescent
    df = pd.read_csv(APP_STATIC+"/uploads/processed_data.csv")
    activations_shape = df.shape[1]-1
    activations = df.iloc[:, 0:activations_shape]
    k=5
    index = NNDescent(activations, n_neighbors=5, metric='euclidean')
    out = index.query(activations, k=k)
    dist = out[1]
    s_dist=np.sort(dist, axis=0)
    s_dist = list(s_dist[:,k-1].astype("str"))
    print(s_dist)
    return jsonify(s_dist=s_dist)
Example #26
def test_pickle_unpickle():
    seed = np.random.RandomState(42)

    x1 = seed.normal(0, 100, (1000, 50))
    x2 = seed.normal(0, 100, (1000, 50))

    index1 = NNDescent(
        x1,
        "euclidean",
        {},
        10,
        random_state=None,
    )
    neighbors1, distances1 = index1.query(x2)

    pickle.dump(index1, open("test_tmp.pkl", "wb"))
    index2 = pickle.load(open("test_tmp.pkl", "rb"))
    os.remove("test_tmp.pkl")

    neighbors2, distances2 = index2.query(x2)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
Example #27
def test_joblib_dump():
    seed = np.random.RandomState(42)

    x1 = seed.normal(0, 100, (1000, 50))
    x2 = seed.normal(0, 100, (1000, 50))

    index1 = NNDescent(
        x1,
        "euclidean",
        {},
        10,
        random_state=None,
    )
    neighbors1, distances1 = index1.query(x2)

    joblib.dump(index1, "test_tmp.dump")
    index2 = joblib.load("test_tmp.dump")
    os.remove("test_tmp.dump")

    neighbors2, distances2 = index2.query(x2)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
Example #28
def test_one_dimensional_data(nn_data, metric):
    nnd = NNDescent(
        nn_data[200:, :1],
        metric=metric,
        n_neighbors=20,
        random_state=None,
        tree_init=False,
    )
    nnd.prepare()

    knn_indices, _ = nnd.query(nn_data[:200, :1], k=10, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:, :1])
    true_indices = true_nnd.kneighbors(nn_data[:200, :1],
                                       10,
                                       return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, ("NN-descent query did not get 95% "
                                     "accuracy on nearest neighbors")
Example #29
def fate_bias(
    adata,
    group,
    basis="umap",
    inds=None,
    speed_percentile=5,
    dist_threshold=None,
    source_groups=None,
    metric="euclidean",
    metric_kwds=None,
    cores=1,
    seed=19491001,
    **kwargs,
):
    """Calculate the lineage (fate) bias of states whose trajectory are predicted.

    Fate bias is currently calculated as the percentage of points along the predicted cell fate trajectory whose distances
    to their 0-th nearest neighbors on the data are close enough (determined by the median 1-st nearest neighbor distance
    of all observed cells and the dist_threshold) to any cell from each group specified by the `group` key. The details are
    described as follows:

    Cell fates predicted by our vector field method sometimes end up in regions that are not sampled with cells. We thus
    developed a heuristic method that iteratively walks backward along the integration path to assign cell fate. We first
    identify the regions with small velocity in the tail of the integration path (determined by `speed_percentile`), then
    we check whether the 0-th nearest points on the observed data to all those points are far away from the observed data
    (determined by `dist_threshold`). If they are not all close to the data, we walk backward along the trajectory by one
    time step until the distances from the 0-th nearest points of the currently visited integration points to the observed
    cells are close enough. To calculate the cell fate probability, we then diffuse one step further from the identified
    nearest neighbors to collect more nearby observed cells, especially those from terminal cell types, in case the cells
    identified first are all close to some random progenitor cells. Finally, we use the group information of those
    observed cells to define the fate probability.

    `fate_bias` calculates a confidence score for the estimated fate probability with a simple metric, defined as
        :math:`1 - (sum(distances > dist_threshold * median_dist) + walk_back_steps) / (len(indices) + walk_back_steps)`

    Here `distances` are the distances from the currently visited integration points' 0-th nearest points to the observed
    cells. `median_dist` is the median 1-st nearest cell distance of all observed cells. `walk_back_steps` is the number
    of steps walked backward along the integration path until the 0-th nearest points of all currently visited integration
    points satisfy the distance threshold. `indices` are the time indices of integration points regarded as the regions
    with `small velocity` (note that when walking backward, the corresponding points do not necessarily have small
    velocity anymore).

    Arguments
    ---------
        adata: :class:`~anndata.AnnData`
            AnnData object that contains the predicted fate trajectories in the `uns` attribute.
        group: `str`
            The column key that corresponds to the cell type or other group information for quantifying the bias of cell
            state.
        basis: `str` or None (default: `None`)
            The embedding data space where cell fates were predicted and cell fates bias will be quantified.
        inds: `list` or `float` or None (default: `None`)
            The indices of the time steps that will be used for calculating fate bias. If inds is None, the last few
            steps of the fate prediction based on the `speed_percentile` will be used. If inds is a float (between
            0 and 1), it will be regarded as a percentage, and the last percentage of steps will be used for fate bias
            calculation. Otherwise inds needs to be a list of integers of the time steps.
        speed_percentile: `float` (default: `5`)
            The percentile of speed that will be used to determine the terminal cells (or sink region on the prediction
            path where speed is smaller than this speed percentile).
        dist_threshold: `float` or `None` (default: `None`)
            A multiplier of the median nearest cell distance on the embedding to determine cells that are outside the
            sampled domain of cells. If the mean distance of the identified "terminal cells" is above this number, we
            will look backward along the trajectory (by decreasing all indices by 1) until cells satisfying this
            threshold are found. By default it is set to 1 to ensure that only points very close to observed data points
            are considered.
        source_groups: `list` or `None` (default: `None`)
            The groups that correspond to progenitor groups. They have to have at least one intersection with the groups
            from the `group` column. If group is not `None`, any identified "source_groups" cells that happen to be in
            those groups will be ignored and their cell fate probability will be reassigned to the group that has the
            highest fate probability among the other non-source_groups cells.
        metric: `str` or callable, default='euclidean'
            The distance metric to use for the tree. The default metric is 'euclidean'. See the documentation of
            :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a
            distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero"
            elements may be considered neighbors.
        metric_kwds : dict, default=None
            Additional keyword arguments for the metric function.
        cores: `int` (default: 1)
            The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
            ``-1`` means using all processors.
        seed: `int` (default `19491001`)
            Random seed to ensure the reproducibility of each run.
        kwargs:
            Additional arguments that will be passed to each nearest neighbor search algorithm.

    Returns
    -------
        fate_bias: `pandas.DataFrame`
            A DataFrame that stores the fate bias for each cell state (row) to each cell group (column).
    """

    if dist_threshold is None:
        dist_threshold = 1

    if group not in adata.obs.keys():
        raise ValueError(
            f"The group {group} you provided is not a key of .obs attribute.")
    else:
        clusters = adata.obs[group]

    basis_key = "X_" + basis if basis is not None else "X"
    fate_key = "fate_" + basis if basis is not None else "fate"

    if basis_key not in adata.obsm.keys():
        raise ValueError(
            f"The basis {basis_key} you provided is not a key of .obsm attribute."
        )
    if fate_key not in adata.uns.keys():
        raise ValueError(
            f"The {fate_key} key is not existed in the .uns attribute of the adata object. You need to run"
            f"dyn.pd.fate(adata, basis='{basis}') before calculate fate bias.")

    if source_groups is not None:
        if type(source_groups) is str:
            source_groups = [source_groups]
        source_groups = list(set(source_groups).intersection(clusters))
        if len(source_groups) == 0:
            raise ValueError(
                f"the {source_groups} you provided doesn't intersect with any groups in the {group} column."
            )

    X = adata.obsm[basis_key] if basis_key != "X" else adata.X

    if X.shape[0] > 5000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X,
                         metric=metric,
                         metric_kwds=metric_kwds,
                         n_neighbors=30,
                         n_jobs=cores,
                         random_state=seed,
                         **kwargs)
        knn, distances = nbrs.query(X, k=30)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=30, algorithm=alg,
                                n_jobs=cores).fit(X)
        distances, knn = nbrs.kneighbors(X)

    median_dist = np.median(distances[:, 1])

    pred_dict = {}
    cell_predictions, cell_indx = adata.uns[fate_key]["prediction"], adata.uns[
        fate_key]["init_cells"]
    t = adata.uns[fate_key]["t"]
    confidence = np.zeros(len(t))

    for i, prediction in tqdm(enumerate(cell_predictions),
                              desc="calculating fate distributions"):
        cur_t, n_steps = t[i], len(t[i])

        # Ensure we identify the sink, where the speed is very slow, if inds is not provided.
        # If inds is a percentage, use the last percentage of steps to check for cell fate bias.
        # Otherwise inds needs to be a list.
        if inds is None:
            avg_speed = np.array(
                [np.linalg.norm(i)
                 for i in np.diff(prediction, 1).T]) / np.diff(cur_t)
            sink_checker = np.where(
                avg_speed[::-1] > np.percentile(avg_speed, speed_percentile)
            )[0]
            indices = np.arange(n_steps - max(min(sink_checker), 10), n_steps)
        elif isinstance(inds, float):
            indices = np.arange(int(n_steps - inds * n_steps), n_steps)
        else:
            indices = inds

        if hasattr(nbrs, "query"):
            knn, distances = nbrs.query(prediction[:, indices].T, k=30)
        else:
            distances, knn = nbrs.kneighbors(prediction[:, indices].T)

        # If the final steps are too far away from observed cells, ignore them.
        walk_back_steps = 0
        while True:
            is_dist_larger_than_threshold = distances.flatten(
            ) < dist_threshold * median_dist
            if any(is_dist_larger_than_threshold):

                # let us diffuse one step further to identify cells from terminal cell types in case
                # cells with indices are all close to some random progenitor cells.
                if hasattr(nbrs, "query"):
                    knn, _ = nbrs.query(X[knn.flatten(), :], k=30)
                else:
                    _, knn = nbrs.kneighbors(X[knn.flatten(), :])

                fate_prob = clusters[knn.flatten()].value_counts() / len(
                    knn.flatten())
                if source_groups is not None:
                    source_p = fate_prob[source_groups].sum()
                    if 1 > source_p > 0:
                        fate_prob[source_groups] = 0
                        fate_prob[fate_prob.idxmax()] += source_p

                pred_dict[i] = fate_prob

                confidence[i] = 1 - (
                    sum(~is_dist_larger_than_threshold) + walk_back_steps) / (
                        len(is_dist_larger_than_threshold) + walk_back_steps)

                break
            else:
                walk_back_steps += 1

                if any(indices - 1 < 0):
                    pred_dict[i] = clusters[
                        knn.flatten()].value_counts() * np.nan
                    break

                if hasattr(nbrs, "query"):
                    knn, distances = nbrs.query(prediction[:, indices - 1].T,
                                                k=30)
                else:
                    distances, knn = nbrs.kneighbors(prediction[:,
                                                                indices - 1].T)

                knn, distances = knn[:, 0], distances[:, 0]
                indices = indices - 1

    bias = pd.DataFrame(pred_dict).T
    conf = pd.DataFrame({"confidence": confidence}, index=bias.index)
    bias = pd.merge(conf, bias, left_index=True, right_index=True)

    if cell_indx is not None:
        bias.index = cell_indx

    return bias
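
The confidence score defined in the docstring above can be checked with a small worked example. With illustrative numbers (not taken from any dataset): if 10 integration points are inspected, 2 of them lie farther than dist_threshold * median_dist from their nearest observed cells, and the loop had to walk back 3 steps, then:

n_indices = 10        # len(indices): integration points inspected
n_far = 2             # sum(distances > dist_threshold * median_dist)
walk_back_steps = 3   # backward steps taken along the trajectory

confidence = 1 - (n_far + walk_back_steps) / (n_indices + walk_back_steps)
print(confidence)     # 1 - 5/13 ≈ 0.615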
Example #30
def cell_wise_confidence(
    adata,
    X_data=None,
    V_data=None,
    ekey="M_s",
    vkey="velocity_S",
    neighbors_from_basis=False,
    method="jaccard",
):
    """Calculate the cell-wise velocity confidence metric.

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            an AnnData object.
        X_data: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The expression states of single cells (or expression states in reduced dimension, like pca, of single cells)
        V_data: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The RNA velocity of single cells (or velocity estimates projected to reduced dimension, like pca, of single
            cells). Note that X_data and V_data need to have exactly the same dimensionality.
        ekey: `str` (optional, default `M_s`)
            The dictionary key that corresponds to the gene expression in the layer attribute. By default, it is the
            smoothed expression `M_s`.
        vkey: 'str' (optional, default `velocity_S`)
            The dictionary key that corresponds to the estimated velocity values in layers attribute.
        neighbors_from_basis: `bool` (optional, default `False`)
            Whether to construct nearest neighbors from low dimensional space as defined by the `basis`, instead of using
            that calculated during UMAP process.
        method: `str` (optional, default `jaccard`)
            Which method will be used for calculating the cell-wise velocity confidence metric. By default it uses the
            `jaccard` index, which measures how well each velocity vector meets the geometric constraints defined by the
            local neighborhood structure. The Jaccard index is calculated as the fraction of the number of the intersected
            set of nearest neighbors from each cell at its current expression state (X) and that from its future expression
            state (X + V) over the number of the union of these two sets. The `cosine` or `correlation` method is similar
            to that used by scVelo (https://github.com/theislab/scvelo).

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            Returns an updated `~anndata.AnnData` with `.obs.confidence` as the cell-wise velocity confidence.
    """

    if method in ["cosine", "consensus", "correlation"]:
        if "indices" not in adata.uns["neighbors"].keys():
            adata.uns["neighbors"]["indices"], _ = adj_to_knn(
                adata.obsp["connectivities"],
                n_neighbors=adata.uns["neighbors"]["params"]["n_neighbors"])

    if ekey == "X":
        X, V = (
            adata.X if X_data is None else X_data,
            adata.layers[vkey] if V_data is None else V_data,
        )
        norm_method = adata.uns["pp"]["norm_method"].copy()
        adata.uns["pp"]["norm_method"] = "log1p"
        X = inverse_norm(adata, X) if X_data is None else X_data
        adata.uns["pp"]["norm_method"] = norm_method
    else:
        X, V = (
            adata.layers[ekey] if X_data is None else X_data,
            adata.layers[vkey] if V_data is None else V_data,
        )
        X = inverse_norm(adata, X) if X_data is None else X_data

    if not neighbors_from_basis:
        check_and_recompute_neighbors(adata, result_prefix="")
        n_neigh, X_neighbors = (
            adata.uns["neighbors"]["params"]["n_neighbors"],
            adata.obsp["connectivities"],
        )
    else:
        n_neigh = 30

        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=n_neigh + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=n_neigh + 1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=n_neigh + 1,
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

        row = np.repeat(nbrs_idx[:, 0], n_neigh)
        col = nbrs_idx[:, 1:].flatten()
        X_neighbors = csr_matrix(
            (np.repeat(1, len(col)), (row, col)),
            shape=(adata.n_obs, adata.n_obs),
        )

    n_neigh = n_neigh[0] if type(n_neigh) == np.ndarray else n_neigh
    n_pca_components = adata.obsm["X"].shape[1]

    finite_inds = get_finite_inds(V, 0)
    X, V = X[:, finite_inds], V[:, finite_inds]
    if method == "jaccard":
        jac, _, _ = jaccard(X, V, n_pca_components, n_neigh, X_neighbors)
        confidence = jac

    elif method == "hybrid":
        # this is inspired from the locality preservation paper
        jac, intersect_, _ = jaccard(X, V, n_pca_components, n_neigh,
                                     X_neighbors)

        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc=
                "calculating hybrid method (jaccard + consensus) based cell wise confidence",
        ):
            neigh_ids = np.where(
                intersect_[i].A)[0] if issparse(intersect_) else np.where(
                    intersect_[i])[0]
            confidence[i] = (jac[i] * np.mean([
                consensus(V[i].A.flatten(), V[j].A.flatten())
                for j in neigh_ids
            ]) if issparse(V) else jac[i] * np.mean(
                [consensus(V[i].flatten(), V[j].flatten())
                 for j in neigh_ids]))

    elif method == "cosine":
        check_and_recompute_neighbors(adata, result_prefix="")
        indices = adata.uns["neighbors"]["indices"]
        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc="calculating cosine based cell wise confidence",
        ):
            neigh_ids = indices[i]
            confidence[i] = (np.mean([
                einsum_correlation(V[i].A, V[j].A.flatten(), type="cosine")[0,
                                                                            0]
                for j in neigh_ids
            ]) if issparse(V) else np.mean([
                einsum_correlation(
                    V[i][None, :], V[j].flatten(), type="cosine")[0, 0]
                for j in neigh_ids
            ]))

    elif method == "consensus":
        check_and_recompute_neighbors(adata, result_prefix="")
        indices = adata.uns["neighbors"]["indices"]
        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc="calculating consensus based cell wise confidence",
        ):
            neigh_ids = indices[i]
            confidence[i] = (np.mean([
                consensus(V[i].A.flatten(), V[j].A.flatten())
                for j in neigh_ids
            ]) if issparse(V) else np.mean(
                [consensus(V[i], V[j].flatten()) for j in neigh_ids]))

    elif method == "correlation":
        # this is equivalent to scVelo
        check_and_recompute_neighbors(adata, result_prefix="")
        indices = adata.uns["neighbors"]["indices"]
        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc="calculating correlation based cell wise confidence",
        ):
            neigh_ids = indices[i]
            confidence[i] = (np.mean([
                einsum_correlation(V[i].A, V[j].A.flatten(), type="pearson")[0,
                                                                             0]
                for j in neigh_ids
            ]) if issparse(V) else np.mean([
                einsum_correlation(
                    V[i][None, :], V[j].flatten(), type="pearson")[0, 0]
                for j in neigh_ids
            ]))

    elif method == "divergence":
        pass

    else:
        raise Exception(
            "The input {} method for cell-wise velocity confidence calculation is not implemented"
            " yet".format(method))

    adata.obs[method + "_velocity_confidence"] = confidence

    return adata
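
As the docstring above describes, the `jaccard` option scores each cell by how much its set of nearest neighbors at the current state X overlaps with the set at the extrapolated state X + V. A minimal, self-contained illustration of that idea (a sketch of the concept only, not the library's own `jaccard` helper):

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.normal(size=(300, 10))          # expression states
V = 0.1 * rng.normal(size=(300, 10))    # velocity estimates

k = 30
nn = NearestNeighbors(n_neighbors=k).fit(X)
neigh_now = nn.kneighbors(X, return_distance=False)         # neighbors at X
neigh_future = nn.kneighbors(X + V, return_distance=False)  # neighbors at X + V

jaccard = np.array([
    len(set(a) & set(b)) / len(set(a) | set(b))
    for a, b in zip(neigh_now, neigh_future)
])
# Values near 1 mean the velocity keeps the cell inside its local neighborhood;
# values near 0 mean it points far away from it.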