def calculate_potential(self, diff_op, t):
    """Calculates the diffusion potential

    Parameters
    ----------
    diff_op : array-like, shape=[n_samples, n_samples] or [n_landmarks, n_landmarks]
        The diffusion operator fit on the input data
    t : int
        power to which the diffusion operator is powered
        sets the level of diffusion

    Returns
    -------
    diff_potential : array-like, shape=[n_samples, n_samples]
        The diffusion potential fit on the input data
    """
    tasklogger.log_start("diffusion potential")
    # diffused diffusion operator
    diff_op_t = np.linalg.matrix_power(diff_op, t)
    if self.gamma == 1:
        # handling small values
        diff_op_t = diff_op_t + 1e-7
        diff_potential = -1 * np.log(diff_op_t)
    elif self.gamma == -1:
        diff_potential = diff_op_t
    else:
        c = (1 - self.gamma) / 2
        diff_potential = ((diff_op_t) ** c) / c
    tasklogger.log_complete("diffusion potential")
    return diff_potential
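# Illustrative sketch (not part of the library): the gamma-parameterized
# potential transform above, applied to a toy row-stochastic diffusion
# operator. gamma=1 gives the log-potential, gamma=-1 leaves the powered
# operator untouched, and intermediate values use a Box-Cox-style transform.
import numpy as np

P = np.array([[0.8, 0.2], [0.3, 0.7]])   # toy diffusion operator
P_t = np.linalg.matrix_power(P, 5)       # diffused operator, t = 5

log_potential = -np.log(P_t + 1e-7)      # gamma = 1
identity_potential = P_t                 # gamma = -1
gamma = 0.5
c = (1 - gamma) / 2
boxcox_potential = (P_t ** c) / c        # intermediate gamma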
def fit(self, X):
    if not len(X.shape) == 3:
        raise ValueError("Expected X to be a tensor with three dimensions."
                         " Got shape {}".format(X.shape))
    if self.normalize:
        X = utils.normalize(X)
    tasklogger.log_start("multislice kernel")
    K = kernel.multislice_kernel(X,
                                 intraslice_knn=self.intraslice_knn,
                                 interslice_knn=self.interslice_knn,
                                 decay=self.decay,
                                 n_pca=self.n_pca,
                                 distance=self.knn_dist,
                                 n_jobs=self.n_jobs)
    tasklogger.log_complete("multislice kernel")
    tasklogger.log_start("graph and diffusion operator")
    n_landmark = self.n_landmark if self.n_landmark < K.shape[0] else None
    self.graph = graphtools.Graph(K,
                                  precomputed="affinity",
                                  n_landmark=n_landmark,
                                  n_svd=self.n_svd,
                                  n_jobs=self.n_jobs,
                                  verbose=self.verbose,
                                  random_state=self.random_state,
                                  **(self.kwargs))
    # access the property to force the lazy diffusion operator to build
    self.diff_op
    tasklogger.log_complete("graph and diffusion operator")
    result = super().fit(self.graph)
    return result
def fit_transform(self, X, graph=None, **kwargs):
    """Computes the diffusion operator and the position of the cells in the
    embedding space

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_features` dimensions.
        Accepted data types: `numpy.ndarray`, `scipy.sparse.spmatrix`,
        `pd.DataFrame`, `anndata.AnnData`.
    graph : `graphtools.Graph`, optional (default: None)
        If given, provides a precomputed kernel matrix with which to
        perform diffusion.
    kwargs : further arguments for `MAGIC.transform()`
        Keyword arguments as specified in :func:`~magic.MAGIC.transform`

    Returns
    -------
    X_magic : array, shape=[n_samples, n_genes]
        The gene expression values after diffusion
    """
    tasklogger.log_start('MAGIC')
    self.fit(X, graph=graph)
    X_magic = self.transform(**kwargs)
    tasklogger.log_complete('MAGIC')
    return X_magic
def fit_transform(self, X, **kwargs):
    """Computes the diffusion operator and the position of the cells in the
    embedding space

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_dimensions`
        dimensions. Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.
        If `knn_dist` is 'precomputed', `data` should be a n_samples x
        n_samples distance or affinity matrix
    kwargs : further arguments for `PHATE.transform()`
        Keyword arguments as specified in :func:`~phate.PHATE.transform`

    Returns
    -------
    embedding : array, shape=[n_samples, n_dimensions]
        The cells embedded in a lower dimensional space using PHATE
    """
    tasklogger.log_start('PHATE')
    self.fit(X)
    embedding = self.transform(**kwargs)
    tasklogger.log_complete('PHATE')
    return embedding
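# Hedged usage sketch: a typical fit_transform call on a dense matrix.
# The keyword values and the toy data below are illustrative only.
import numpy as np
import phate

X = np.random.normal(size=(200, 50))              # 200 cells x 50 features
phate_op = phate.PHATE(n_components=2, random_state=42, verbose=0)
embedding = phate_op.fit_transform(X)             # shape (200, 2)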
def _reduce_data(self):
    """Private method to reduce data dimension.

    If data is dense, uses randomized PCA. If data is sparse, uses
    randomized SVD.
    TODO: should we subtract and store the mean?

    Returns
    -------
    Reduced data matrix
    """
    if self.n_pca is not None and self.n_pca < self.data.shape[1]:
        tasklogger.log_start("PCA")
        if sparse.issparse(self.data):
            if isinstance(self.data, sparse.coo_matrix) or \
                    isinstance(self.data, sparse.lil_matrix) or \
                    isinstance(self.data, sparse.dok_matrix):
                self.data = self.data.tocsr()
            self.data_pca = TruncatedSVD(self.n_pca,
                                         random_state=self.random_state)
        else:
            self.data_pca = PCA(self.n_pca,
                                svd_solver='randomized',
                                random_state=self.random_state)
        self.data_pca.fit(self.data)
        data_nu = self.data_pca.transform(self.data)
        tasklogger.log_complete("PCA")
        return data_nu
    else:
        data_nu = self.data
        if sparse.issparse(data_nu) and not isinstance(
                data_nu, (sparse.csr_matrix, sparse.csc_matrix,
                          sparse.bsr_matrix)):
            data_nu = data_nu.tocsr()
        return data_nu
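# Minimal standalone sketch of the sparse-vs-dense branch above: randomized
# PCA for dense input, truncated SVD for sparse input. The function and
# variable names here are illustrative, not library API.
import numpy as np
from scipy import sparse
from sklearn.decomposition import PCA, TruncatedSVD

def reduce_dimension(data, n_pca=100, random_state=None):
    if n_pca is None or n_pca >= data.shape[1]:
        return data
    if sparse.issparse(data):
        data = data.tocsr()               # efficient row slicing and products
        reducer = TruncatedSVD(n_pca, random_state=random_state)
    else:
        reducer = PCA(n_pca, svd_solver='randomized',
                      random_state=random_state)
    return reducer.fit_transform(data)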
def fit(self, X): """Computes the diffusion operator Parameters ---------- X : array, shape=[n_samples, n_features] input data with `n_samples` samples and `n_dimensions` dimensions. Accepted data types: `numpy.ndarray`, `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If `knn_dist` is 'precomputed', `data` should be a n_samples x n_samples distance or affinity matrix Returns ------- phate_operator : PHATE The estimator object """ X, n_pca, precomputed, update_graph = self._parse_input(X) if precomputed is None: tasklogger.log_info( "Running PHATE on {} cells and {} genes.".format( X.shape[0], X.shape[1])) else: tasklogger.log_info( "Running PHATE on precomputed {} matrix with {} cells.".format( precomputed, X.shape[0])) if self.n_landmark is None or X.shape[0] <= self.n_landmark: n_landmark = None else: n_landmark = self.n_landmark if self.graph is not None and update_graph: self._update_graph(X, precomputed, n_pca, n_landmark) self.X = X if self.graph is None: tasklogger.log_start("graph and diffusion operator") self.graph = graphtools.Graph( X, n_pca=n_pca, n_landmark=n_landmark, distance=self.knn_dist, precomputed=precomputed, knn=self.knn, decay=self.decay, thresh=1e-4, n_jobs=self.n_jobs, verbose=self.verbose, random_state=self.random_state, **(self.kwargs)) tasklogger.log_complete("graph and diffusion operator") # landmark op doesn't build unless forced self.diff_op return self
def PCA(X, *args, is_graph=False, seed=None, n_components=2, **kwargs):
    X = scprep.utils.toarray(X)
    tasklogger.log_start("PCA")
    Y = sklearn.decomposition.PCA(*args, n_components=n_components,
                                  random_state=seed,
                                  **kwargs).fit_transform(X)
    tasklogger.log_complete("PCA")
    return Y
def ISOMAP(X, *args, is_graph=False, seed=None, **kwargs):
    np.random.seed(seed)
    if is_graph:
        X = utils.geodesic_distance(X)
    tasklogger.log_start("ISOMAP")
    Y = Isomap_(*args, precomputed=is_graph,
                random_state=seed, **kwargs).fit_transform(X)
    tasklogger.log_complete("ISOMAP")
    return Y
def _calculate_potential(self, t=None, t_max=100,
                         plot_optimal_t=False, ax=None):
    """Calculates the diffusion potential

    Parameters
    ----------
    t : int
        power to which the diffusion operator is powered
        sets the level of diffusion
    t_max : int, default: 100
        Maximum value of `t` to test
    plot_optimal_t : boolean, default: False
        If true, plots the Von Neumann Entropy and knee point
    ax : matplotlib.Axes, default: None
        If plot=True and ax is not None, plots the VNE on the given axis
        Otherwise, creates a new axis and displays the plot

    Returns
    -------
    diff_potential : array-like, shape=[n_samples, n_samples]
        The diffusion potential fit on the input data
    """
    if t is None:
        t = self.t
    if self._diff_potential is None:
        if t == 'auto':
            t = self._find_optimal_t(t_max=t_max, plot=plot_optimal_t, ax=ax)
        else:
            t = self.t
        tasklogger.log_start("diffusion potential")
        # diffused diffusion operator
        diff_op_t = np.linalg.matrix_power(self.diff_op, t)
        if self.gamma == 1:
            # handling small values
            diff_op_t = diff_op_t + 1e-7
            self._diff_potential = -1 * np.log(diff_op_t)
        elif self.gamma == -1:
            self._diff_potential = diff_op_t
        else:
            c = (1 - self.gamma) / 2
            self._diff_potential = ((diff_op_t) ** c) / c
        tasklogger.log_complete("diffusion potential")
    elif plot_optimal_t:
        self._find_optimal_t(t_max=t_max, plot=plot_optimal_t, ax=ax)
    return self._diff_potential
def build_kernel(self):
    """Build the MNN kernel.

    Build a mutual nearest neighbors kernel.

    Returns
    -------
    K : kernel matrix, shape=[n_samples, n_samples]
        symmetric matrix with ones down the diagonal
        and no negative entries.
    """
    tasklogger.log_start("subgraphs")
    self.subgraphs = []
    from .api import Graph
    # iterate through sample ids
    for i, idx in enumerate(self.samples):
        tasklogger.log_debug("subgraph {}: sample {}, "
                             "n = {}, knn = {}".format(
                                 i, idx, np.sum(self.sample_idx == idx),
                                 self.weighted_knn[i]))
        # select data for sample
        data = self.data_nu[self.sample_idx == idx]
        # build a kNN graph for cells within sample
        graph = Graph(data, n_pca=None,
                      knn=self.weighted_knn[i],
                      decay=self.decay,
                      distance=self.distance,
                      thresh=self.thresh,
                      verbose=self.verbose,
                      random_state=self.random_state,
                      n_jobs=self.n_jobs,
                      initialize=False)
        self.subgraphs.append(graph)  # append to list of subgraphs
    tasklogger.log_complete("subgraphs")

    if self.thresh > 0 or self.decay is None:
        K = sparse.lil_matrix(
            (self.data_nu.shape[0], self.data_nu.shape[0]))
    else:
        K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]])
    for i, X in enumerate(self.subgraphs):
        for j, Y in enumerate(self.subgraphs):
            tasklogger.log_start("kernel from sample {} to {}".format(
                self.samples[i], self.samples[j]))
            Kij = Y.build_kernel_to_data(X.data_nu,
                                         knn=self.weighted_knn[i])
            if i == j:
                # downweight within-batch affinities by beta
                Kij = Kij * self.beta
            K = set_submatrix(K, self.sample_idx == self.samples[i],
                              self.sample_idx == self.samples[j], Kij)
            tasklogger.log_complete("kernel from sample {} to {}".format(
                self.samples[i], self.samples[j]))
    return K
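# Illustrative sketch of the block structure assembled above: within-batch
# blocks are scaled by beta, between-batch blocks carry the cross-batch
# affinities. The names (K_blocks, beta) are illustrative, not library API.
import numpy as np

def assemble_batch_kernel(K_blocks, beta):
    # K_blocks[i][j] holds affinities from batch i to batch j (dense arrays)
    n_batches = len(K_blocks)
    rows = []
    for i in range(n_batches):
        row = []
        for j in range(n_batches):
            Kij = K_blocks[i][j]
            if i == j:
                Kij = Kij * beta          # downweight within-batch affinities
            row.append(Kij)
        rows.append(np.hstack(row))
    return np.vstack(rows)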
def TSNE(X, *args, is_graph=False, metric='euclidean', seed=None, **kwargs):
    if is_graph:
        X = utils.geodesic_distance(X)
        metric = 'precomputed'
    tasklogger.log_start("TSNE")
    Y = sklearn.manifold.TSNE(*args, metric=metric,
                              random_state=seed, **kwargs).fit_transform(X)
    tasklogger.log_complete("TSNE")
    return Y
def Spring(X, *args, is_graph=False, seed=None, **kwargs):
    np.random.seed(seed)
    if not is_graph:
        G = graphtools.Graph(X, knn=3, decay=None, use_pygsp=True)
    else:
        G = pygsp.graphs.Graph(X)
    G = networkx.from_numpy_matrix(G.W.toarray())
    tasklogger.log_start("Spring")
    X = networkx.spring_layout(G, *args, **kwargs)
    tasklogger.log_complete("Spring")
    X = np.vstack(list(X.values()))
    return X
def measure_method(data_noised, method, labels, data_name,
                   subsample_idx=None):
    if subsample_idx is not None:
        data_noised = data_noised[subsample_idx]
    tasklogger.log_start(method.__name__, logger="demap")
    embedding = method(data_noised)
    tasklogger.log_complete(method.__name__, logger="demap")
    ari_score = demap.ari.ARI(labels, embedding, subsample_idx=subsample_idx)
    df = pd.DataFrame(
        {
            "dataset": data_name,
            "method": method.__name__,
            "ARI": ari_score
        },
        index=[""])
    return df
def build_kernel_to_data(self, Y, knn=None):
    """Build transition matrix from new data to the graph

    Creates a transition matrix such that `Y` can be approximated by
    a linear combination of landmarks. Any transformation of the
    landmarks can be trivially applied to `Y` by performing
    `transform_Y = transitions.dot(transform)`

    Parameters
    ----------
    Y : array-like, [n_samples_y, n_features]
        new data for which an affinity matrix is calculated
        to the existing data. `n_features` must match
        either the ambient or PCA dimensions

    Returns
    -------
    transitions : array-like, [n_samples_y, self.data.shape[0]]
        Transition matrix from `Y` to `self.data`

    Raises
    ------
    ValueError : if `precomputed` is not `None`, then the graph cannot
        be extended.
    """
    if knn is None:
        knn = self.knn
    if self.precomputed is not None:
        raise ValueError("Cannot extend kernel on precomputed graph")
    else:
        tasklogger.log_start("affinities")
        Y = self._check_extension_shape(Y)
        pdx = cdist(Y, self.data_nu, metric=self.distance)
        knn_dist = np.partition(pdx, knn, axis=1)[:, :knn]
        epsilon = np.max(knn_dist, axis=1)
        pdx = (pdx.T / epsilon).T
        K = np.exp(-1 * pdx**self.decay)
        # handle nan
        K = np.where(np.isnan(K), 1, K)
        K[K < self.thresh] = 0
        tasklogger.log_complete("affinities")
    return K
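# Standalone sketch of the adaptive-bandwidth alpha-decay kernel computed
# above: each query point's bandwidth is its distance to its knn-th
# neighbor in the reference data. Array and parameter names are illustrative.
import numpy as np
from scipy.spatial.distance import cdist

def alpha_decay_kernel(Y, data, knn=5, decay=40, thresh=1e-4):
    pdx = cdist(Y, data, metric='euclidean')
    knn_dist = np.partition(pdx, knn, axis=1)[:, :knn]
    epsilon = np.max(knn_dist, axis=1)        # adaptive bandwidth per row
    pdx = (pdx.T / epsilon).T                 # rescale distances by bandwidth
    K = np.exp(-1 * pdx ** decay)             # alpha-decay affinity
    K = np.where(np.isnan(K), 1, K)           # 0/0 from duplicate points -> 1
    K[K < thresh] = 0                         # sparsify tiny affinities
    return K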
def _find_optimal_t(self, t_max=100, plot=False, ax=None):
    """Find the optimal value of t

    Selects the optimal value of t based on the knee point of the
    Von Neumann Entropy of the diffusion operator.

    Parameters
    ----------
    t_max : int, default: 100
        Maximum value of t to test
    plot : boolean, default: False
        If true, plots the Von Neumann Entropy and knee point
    ax : matplotlib.Axes, default: None
        If plot=True and ax is not None, plots the VNE on the given axis
        Otherwise, creates a new axis and displays the plot

    Returns
    -------
    t_opt : int
        The optimal value of t
    """
    tasklogger.log_start("optimal t")
    t, h = self._von_neumann_entropy(t_max=t_max)
    t_opt = vne.find_knee_point(y=h, x=t)
    tasklogger.log_info("Automatically selected t = {}".format(t_opt))
    tasklogger.log_complete("optimal t")

    if plot:
        if ax is None:
            fig, ax = plt.subplots()
            show = True
        else:
            show = False
        ax.plot(t, h)
        ax.scatter(t_opt, h[t == t_opt], marker='*', c='k', s=50)
        ax.set_xlabel("t")
        ax.set_ylabel("Von Neumann Entropy")
        ax.set_title("Optimal t = {}".format(t_opt))
        if show:
            plt.show()

    self.optimal_t = t_opt
    return t_opt
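# Conceptual sketch (assumed, simplified) of the entropy profile that the
# knee-point selection above operates on: entropy of the normalized
# eigenvalue spectrum of the diffusion operator raised to each power t.
import numpy as np

def von_neumann_entropy_profile(eigenvalues, t_max=100):
    eigenvalues = np.abs(eigenvalues)
    entropies = []
    for t in range(1, t_max + 1):
        spectrum = eigenvalues ** t
        p = spectrum / spectrum.sum()        # normalize to a distribution
        p = p[p > 0]
        entropies.append(-np.sum(p * np.log(p)))
    return np.arange(1, t_max + 1), np.array(entropies)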
def graphDiffusionCoordinates(G, n_eigenvectors=None):
    # diffusion maps with normalized Laplacian
    tasklogger.log_start("eigendecomposition")
    if n_eigenvectors is None:
        G.compute_fourier_basis()
    else:
        # temporary workaround until pygsp updates to pypi
        from scipy import sparse
        G._e, G._U = sparse.linalg.eigsh(G.L, n_eigenvectors, which='SM')
    tasklogger.log_complete("eigendecomposition")
    phi, lmbda = G.U, G.e
    # smallest to largest
    lmbda_idx = np.argsort(lmbda)
    phi, lmbda = phi[:, lmbda_idx], lmbda[lmbda_idx]
    # trim trivial information
    phi, lmbda = phi[:, 1:], lmbda[1:]
    return phi, lmbda
def MDS(X, *args, is_graph=False, dissimilarity='euclidean', seed=None,
        n_jobs=15, **kwargs):
    if is_graph:
        X = utils.geodesic_distance(X)
        dissimilarity = 'precomputed'
    tasklogger.log_start("MDS")
    Y = sklearn.manifold.MDS(*args, dissimilarity=dissimilarity,
                             random_state=seed, n_jobs=n_jobs,
                             **kwargs).fit_transform(X)
    tasklogger.log_complete("MDS")
    return Y
def fit_transform(self, X, graph=None, **kwargs):
    """Computes the diffusion operator and the position of the cells in the
    embedding space

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_features` dimensions.
        Accepted data types: `numpy.ndarray`, `scipy.sparse.spmatrix`,
        `pd.DataFrame`, `anndata.AnnData`.
    graph : `graphtools.Graph`, optional (default: None)
        If given, provides a precomputed kernel matrix with which to
        perform diffusion.
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes, either as integer indices or column names
        if input data is a pandas DataFrame. If "all_genes", the entire
        smoothed matrix is returned. If "pca_only", PCA on the smoothed
        data is returned. If None, the entire matrix is also returned,
        but a warning may be raised if the resultant matrix is very large.
    t_max : int, optional, default: 20
        maximum t to test if `t` is set to 'auto'
    plot_optimal_t : boolean, optional, default: False
        If true and `t` is set to 'auto', plot the disparity used to
        select t
    ax : matplotlib.axes.Axes, optional
        If given and `plot_optimal_t` is true, plot will be drawn on the
        given axis.

    Returns
    -------
    X_magic : array, shape=[n_samples, n_genes]
        The gene expression values after diffusion
    """
    tasklogger.log_start('MAGIC')
    self.fit(X, graph=graph)
    X_magic = self.transform(**kwargs)
    tasklogger.log_complete('MAGIC')
    return X_magic
def measure_method(data, data_noised, method, data_name, subsample_idx=None):
    if subsample_idx is not None:
        data_noised = data_noised[subsample_idx]
    tasklogger.log_start(method.__name__, logger="demap")
    embedding = method(data_noised)
    tasklogger.log_complete(method.__name__, logger="demap")
    demap_score = demap.DEMaP(data, embedding, knn=5,
                              subsample_idx=subsample_idx)
    df = pd.DataFrame(
        {
            "dataset": data_name,
            "method": method.__name__,
            "demap": demap_score
        },
        index=[""],
    )
    return df
def call(self, bam_dir, out_dir):
    """call CNV for each chromosome

    Parameters
    ----------
    bam_dir : directory path which contains all BAM files
    out_dir : the output directory

    Returns
    -------
    self
    """
    Y_path = os.path.join(out_dir, 'temp.Y.csv')
    nor_Y_path = os.path.join(out_dir, 'temp.norY.csv')
    ref_path = os.path.join(out_dir, 'temp.ref.csv')
    gini_path = os.path.join(out_dir, 'temp.gini.csv')
    ploidy_path = os.path.join(out_dir, 'temp.ploidy.csv')
    scope_path = os.path.join(out_dir, 'run-scope.R')
    utils.write_scope(scope_path)
    command = 'Rscript {0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10} {11}'.format(
        scope_path, bam_dir, Y_path, ref_path, gini_path, ploidy_path,
        nor_Y_path, self.seq, self.reg, self.ref, self.mapq, self.bin_len)
    code = os.system(command)
    if code != 0:
        sys.exit(1)
    tasklogger.log_start('SCYN')
    Y = pd.read_csv(Y_path, index_col=0)
    nor_Y = pd.read_csv(nor_Y_path, index_col=0)
    ref = pd.read_csv(ref_path, index_col=0)
    gini = pd.read_csv(gini_path, index_col=0)
    ploidy = pd.read_csv(ploidy_path, index_col=0)
    self.meta_info = pd.DataFrame(index=['c_gini', 'c_ploidy'],
                                  columns=Y.columns)
    self.meta_info.loc['c_gini'] = gini.T.iloc[0].values
    self.meta_info.loc['c_ploidy'] = ploidy.T.iloc[0].values
    self.meta_info = self.meta_info.T
    self._cal_cnv(ref, Y, nor_Y)
    self.bin_info = ref
    # clean up temp files
    utils.clean_up([Y_path, nor_Y_path, ref_path, gini_path,
                    ploidy_path, scope_path])
    tasklogger.log_complete('SCYN')
    return self
def fit(self, X, Y, q=None):
    if hasattr(self, "phi_X"):
        tasklogger.log_info("Using precomputed diffusion coordinates.")
    else:
        tasklogger.log_start("diffusion coordinates")
        if q is None:
            with parallel.ParallelQueue(n_jobs=min(2, self.n_jobs)) as q:
                return self.fit(X, Y, q)
        else:
            q.queue(
                math.diffusionCoordinates,
                X,
                decay=self.decay_X,
                knn=self.knn_X,
                n_pca=self.n_pca_X
                if self.n_pca_X is not None and self.n_pca_X < min(X.shape)
                else None,
                n_eigenvectors=self.n_eigenvectors,
                n_jobs=max(self.n_jobs // 2, 1),
                verbose=self.verbose,
                random_state=self.random_state,
            )
            q.queue(
                math.diffusionCoordinates,
                Y,
                decay=self.decay_Y,
                knn=self.knn_Y,
                n_pca=self.n_pca_Y
                if self.n_pca_Y is not None and self.n_pca_Y < min(Y.shape)
                else None,
                n_eigenvectors=self.n_eigenvectors,
                n_jobs=max(self.n_jobs // 2, 1),
                verbose=self.verbose,
                random_state=self.random_state,
            )
            (phi_X, lambda_X), (phi_Y, lambda_Y) = q.run()
            self.phi_X = phi_X
            self.lambda_X = lambda_X
            self.phi_Y = phi_Y
            self.lambda_Y = lambda_Y
            tasklogger.log_complete("diffusion coordinates")
    return self
def PHATE(X, *args, is_graph=False, knn_dist='euclidean', solver='smacof',
          verbose=0, seed=None, n_jobs=15, **kwargs):
    if knn_dist is None:
        if is_graph:
            knn_dist = 'precomputed'
    tasklogger.log_start("PHATE")
    Y = phate.PHATE(*args, knn_dist=knn_dist, verbose=verbose,
                    random_state=seed, n_jobs=n_jobs,
                    mds_solver=solver, **kwargs).fit_transform(X)
    tasklogger.log_complete("PHATE")
    return Y
def impute(self, data):
    """Main function of I-Impute

    Parameters
    ----------
    data : matrix, shape (m x n)
        The raw reads count matrix

    Returns
    -------
    imputed_data : matrix, shape (m x n)
        The imputed matrix, pandas Dataframe object
    """
    tasklogger.log_start('I-Impute')
    imputed_data = None
    if self.iteration:
        exp_mse = 1
        mse = 100
        previous_imputed_data = data
        iteration = 1
        while mse > exp_mse:
            tasklogger.log_info(
                'iteratively impute for the {0}th time'.format(iteration))
            current_imputed_data = self._cimpute(previous_imputed_data)
            dist_matrix = (current_imputed_data - previous_imputed_data)**2
            n_values = data.shape[0] * data.shape[1]
            # root-mean-square change between successive imputations
            mse = np.sqrt(dist_matrix.values.sum() / n_values)
            previous_imputed_data = current_imputed_data
            iteration += 1
        imputed_data = previous_imputed_data
    else:
        imputed_data = self._cimpute(data)
    tasklogger.log_complete('I-Impute')
    return imputed_data
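# Minimal sketch of the convergence test driving the iterative branch above:
# stop once the root-mean-square change between successive imputations drops
# below a tolerance. Names and the tolerance value are illustrative.
import numpy as np

def converged(current, previous, tolerance=1.0):
    rms_change = np.sqrt(((current - previous) ** 2).sum() / current.size)
    return rms_change <= tolerance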
def build_landmark_op(self):
    """Build the landmark operator

    Calculates spectral clusters on the kernel, and calculates transition
    probabilities between cluster centers by using transition probabilities
    between samples assigned to each cluster.
    """
    tasklogger.log_start("landmark operator")
    is_sparse = sparse.issparse(self.kernel)
    # spectral clustering
    tasklogger.log_start("SVD")
    _, _, VT = randomized_svd(self.diff_aff,
                              n_components=self.n_svd,
                              random_state=self.random_state)
    tasklogger.log_complete("SVD")
    tasklogger.log_start("KMeans")
    kmeans = MiniBatchKMeans(self.n_landmark,
                             init_size=3 * self.n_landmark,
                             batch_size=10000,
                             random_state=self.random_state)
    self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
    # some clusters are not assigned
    tasklogger.log_complete("KMeans")

    # transition matrices
    pmn = self._landmarks_to_data()
    # row normalize
    pnm = pmn.transpose()
    pmn = normalize(pmn, norm='l1', axis=1)
    pnm = normalize(pnm, norm='l1', axis=1)
    landmark_op = pmn.dot(pnm)  # sparsity agnostic matrix multiplication
    if is_sparse:
        # no need to have a sparse landmark operator
        landmark_op = landmark_op.toarray()
    # store output
    self._landmark_op = landmark_op
    self._transitions = pnm
    tasklogger.log_complete("landmark operator")
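# Minimal sketch of the landmark transition algebra above, on dense arrays:
# aggregate kernel rows by cluster, row-normalize both directions, and take
# the product to get cluster-to-cluster transitions. Names are illustrative.
import numpy as np
from sklearn.preprocessing import normalize

def landmark_operator(kernel, clusters):
    n_landmark = clusters.max() + 1
    # cluster-to-sample affinities: sum kernel rows within each cluster
    pmn = np.vstack([kernel[clusters == c].sum(axis=0)
                     for c in range(n_landmark)])
    pnm = pmn.T
    pmn = normalize(pmn, norm='l1', axis=1)   # cluster -> sample probabilities
    pnm = normalize(pnm, norm='l1', axis=1)   # sample -> cluster probabilities
    return pmn.dot(pnm), pnm                  # landmark operator, transitions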
def align(self, X, Y, phi_X=None, phi_Y=None, lambda_X=None, lambda_Y=None):
    """Harmonic alignment

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
        Input dataset
    Y : array-like, shape=[m_samples, n_features]
        Input dataset
    phi_{X,Y} : array-like, shape=[{n,m}_samples, {n,m}_samples], optional (default: None)
        Precomputed Laplacian eigenvectors
    lambda_{X,Y} : list-like, shape=[{n,m}_samples], optional (default: None)
        Precomputed Laplacian eigenvalues

    Returns
    -------
    XY_aligned : array-like, shape=[n_samples + m_samples, n_samples + m_samples - 1]
    """
    tasklogger.log_start("Harmonic Alignment")
    np.random.seed(self.random_state)
    # normalized L with diffusion coordinates
    with parallel.ParallelQueue(n_jobs=min(2, self.n_jobs)) as q:
        if (phi_X is not None or phi_Y is not None or
                lambda_X is not None or lambda_Y is not None):
            if None in (phi_X, phi_Y, lambda_X, lambda_Y):
                raise RuntimeError(
                    "If a precomputed eigensystem is provided, all of"
                    " `phi_X, phi_Y, lambda_X, lambda_Y` must be provided."
                    " Got phi_X={}, phi_Y={}, lambda_X={}, lambda_Y={}".format(
                        phi_X, phi_Y, lambda_X, lambda_Y))
            else:
                self.phi_X, self.phi_Y = phi_X, phi_Y
                self.lambda_X, self.lambda_Y = lambda_X, lambda_Y
        self.fit(X, Y, q)
        # evaluate wavelets over data in the spectral domain
        tasklogger.log_start("wavelets")
        transform = build_wavelet_transform(
            X, self.phi_X, self.lambda_X,
            Y, self.phi_Y, self.lambda_Y,
            self.n_filters, self.overlap, q=q,
        )
        tasklogger.log_complete("wavelets")
    # compute transformed data
    tasklogger.log_start("transformed data")
    self.phi_combined, self.lambda_combined = combine_eigenvectors(
        transform, self.phi_X, self.phi_Y, self.lambda_X, self.lambda_Y)
    E = self.phi_combined @ np.diag(self.lambda_combined**self.t)
    # build the joint diffusion map
    tasklogger.log_start("graph Laplacian")
    self.graph = graphtools.Graph(
        E,
        knn=self.knn_XY,
        decay=self.decay_XY,
        n_pca=self.n_pca_XY
        if self.n_pca_XY is not None and self.n_pca_XY < min(E.shape)
        else None,
        use_pygsp=True,
        thresh=1e-4,
        anisotropy=1,
        lap_type="normalized",
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        random_state=self.random_state,
    )
    tasklogger.log_complete("graph Laplacian")
    tasklogger.log_complete("transformed data")
    tasklogger.log_complete("Harmonic Alignment")
    return self.graph
def test_tasks():
    logger = tasklogger.log_start("test")
    assert time.time() - logger.tasks['test'] < 0.01
    time.sleep(logger.min_runtime)
    tasklogger.log_complete("test")
    assert 'test' not in logger.tasks
def fit(self, X): """Computes the diffusion operator Parameters ---------- X : array, shape=[n_samples, n_features] input data with `n_samples` samples and `n_features` dimensions. Accepted data types: `numpy.ndarray`, `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. Returns ------- magic_operator : MAGIC The estimator object """ if self.knn_dist == 'precomputed': if isinstance(X, sparse.coo_matrix): X = X.tocsr() if X[0, 0] == 0: precomputed = "distance" else: precomputed = "affinity" tasklogger.log_info( "Using precomputed {} matrix...".format(precomputed)) n_pca = None else: precomputed = None if self.n_pca is None or X.shape[1] <= self.n_pca: n_pca = None else: n_pca = self.n_pca if self.graph is not None: if self.X is not None and not \ utils.matrix_is_equivalent(X, self.X): """ If the same data is used, we can reuse existing kernel and diffusion matrices. Otherwise we have to recompute. """ self.graph = None else: try: self.graph.set_params(decay=self.a, knn=self.k + 1, distance=self.knn_dist, precomputed=precomputed, n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca, thresh=1e-4, random_state=self.random_state) tasklogger.log_info( "Using precomputed graph and diffusion operator...") except ValueError as e: # something changed that should have invalidated the graph tasklogger.log_debug("Reset graph due to {}".format( str(e))) self.graph = None self.X = X if utils.has_empty_columns(X): warnings.warn("Input matrix contains unexpressed genes. " "Please remove them prior to running MAGIC.") if self.graph is None: # reset X_magic in case it was previously set self.X_magic = None tasklogger.log_start("graph and diffusion operator") self.graph = graphtools.Graph(X, n_pca=n_pca, knn=self.k + 1, decay=self.a, thresh=1e-4, n_jobs=self.n_jobs, verbose=self.verbose, random_state=self.random_state) tasklogger.log_complete("graph and diffusion operator") return self
def impute(self, data, t_max=20, plot=False, ax=None,
           max_genes_compute_t=500, threshold=0.001):
    """Perform MAGIC imputation

    Parameters
    ----------
    data : graphtools.Graph, graphtools.Data or array-like
        Input data
    t_max : int, optional (default: 20)
        Maximum value of t to consider for optimal t selection
    plot : bool, optional (default: False)
        Plot the optimal t selection graph
    ax : matplotlib.Axes, optional (default: None)
        Axis on which to plot. If None, a new axis is created
    max_genes_compute_t : int, optional (default: 500)
        Above this number, genes will be subsampled for optimal t selection
    threshold : float, optional (default: 0.001)
        Threshold after which Procrustes disparity is considered to have
        converged for optimal t selection

    Returns
    -------
    X_magic : array-like, shape=[n_samples, n_pca]
        Imputed data
    """
    if not isinstance(data, graphtools.base.Data):
        data = graphtools.base.Data(data, n_pca=self.n_pca)
    data_imputed = data.data_nu

    if data_imputed.shape[1] > max_genes_compute_t:
        subsample_genes = np.random.choice(data_imputed.shape[1],
                                           max_genes_compute_t,
                                           replace=False)
    else:
        subsample_genes = None
    if hasattr(data, "data_pca"):
        weights = None  # data.data_pca.explained_variance_ratio_
    else:
        weights = None
    if self.t == 'auto':
        _, data_prev = self.calculate_error(
            data_imputed, data_prev=None,
            weights=weights,
            subsample_genes=subsample_genes)
        error_vec = []
        t_opt = None
    else:
        t_opt = self.t

    tasklogger.log_start("imputation")

    # classic magic
    # the diffusion matrix is powered when t has been specified by
    # the user, and the dimensions of the diffusion matrix are lesser
    # than those of the data matrix. (M^t) * D
    if (t_opt is not None) and \
            (self.diff_op.shape[1] < data_imputed.shape[1]):
        diff_op_t = np.linalg.matrix_power(self.diff_op, t_opt)
        data_imputed = diff_op_t.dot(data_imputed)
    # fast magic
    # a while loop is used when the dimensions of the diffusion matrix
    # are greater than those of the data matrix, or when t is not specified
    # (so as to allow for the calculation of the optimal t value)
    else:
        i = 0
        while (t_opt is None and i < t_max) or \
                (t_opt is not None and i < t_opt):
            i += 1
            data_imputed = self.diff_op.dot(data_imputed)
            if self.t == 'auto':
                error, data_prev = self.calculate_error(
                    data_imputed, data_prev,
                    weights=weights,
                    subsample_genes=subsample_genes)
                error_vec.append(error)
                tasklogger.log_debug("{}: {}".format(i, error_vec))
                if error < threshold and t_opt is None:
                    t_opt = i + 1
                    tasklogger.log_info(
                        "Automatically selected t = {}".format(t_opt))

    tasklogger.log_complete("imputation")

    if plot:
        # continue to t_max
        tasklogger.log_start("optimal t plot")
        if t_opt is None:
            # never converged
            warnings.warn("optimal t > t_max ({})".format(t_max),
                          RuntimeWarning)
        else:
            data_overimputed = data_imputed
            while i < t_max:
                i += 1
                data_overimputed = self.diff_op.dot(data_overimputed)
                error, data_prev = self.calculate_error(
                    data_overimputed, data_prev,
                    weights=weights,
                    subsample_genes=subsample_genes)
                error_vec.append(error)

        # create axis
        if ax is None:
            fig, ax = plt.subplots()
            show = True
        else:
            show = False

        # plot
        x = np.arange(len(error_vec)) + 1
        ax.plot(x, error_vec)
        if t_opt is not None:
            ax.plot(t_opt, error_vec[t_opt - 1], 'ro', markersize=10)
        ax.plot(x, np.full(len(error_vec), threshold), 'k--')
        ax.set_xlabel('t')
        ax.set_ylabel('disparity(data_{t}, data_{t-1})')
        ax.set_xlim([1, len(error_vec)])
        plt.tight_layout()
        tasklogger.log_complete("optimal t plot")
        if show:
            plt.show(block=False)

    return data_imputed
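# Sketch of the two diffusion strategies chosen above (illustrative names and
# shapes): "classic" powers the operator once when the operator is smaller
# than the data; "fast" applies the operator repeatedly when it is larger or
# when t must be selected automatically.
import numpy as np

def diffuse(diff_op, data, t):
    if diff_op.shape[1] < data.shape[1]:
        return np.linalg.matrix_power(diff_op, t).dot(data)   # classic: (M^t) D
    smoothed = data
    for _ in range(t):
        smoothed = diff_op.dot(smoothed)                      # fast: repeated M D
    return smoothed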
layer_ids = np.tile(data['layer'], n)
epoch = np.repeat(np.arange(n), m)
digit_ids = np.repeat(np.arange(10), 10)
digit_activity = np.array([
    np.sum(np.abs(trace[:, :, digit_ids == digit]), axis=2)
    for digit in np.unique(digit_ids)
])
most_active_digit = np.argmax(digit_activity, axis=0).flatten()

tasklogger.log_start("Naive DR")
trace_flat = trace.reshape(-1, trace.shape[-1])
tasklogger.log_start("PHATE")
phate_naive_op = phate.PHATE(verbose=0)
phate_naive = phate_naive_op.fit_transform(trace_flat)
tasklogger.log_complete("PHATE")
tasklogger.log_start("DM")
dm_naive = m_phate.kernel.DM(phate_naive_op.graph)
tasklogger.log_complete("DM")
tasklogger.log_start("t-SNE")
tsne_naive = TSNE().fit_transform(trace_flat)
tasklogger.log_complete("t-SNE")
tasklogger.log_start("ISOMAP")
isomap_naive = Isomap().fit_transform(trace_flat)
tasklogger.log_complete("ISOMAP")
tasklogger.log_complete("Naive DR")

tasklogger.log_start("Multislice DR")
tasklogger.log_start("M-PHATE")
m_phate_op = m_phate.M_PHATE(verbose=0)
m_phate_data = m_phate_op.fit_transform(trace)
def transform(self, X=None, t_max=100, plot_optimal_t=False, ax=None):
    """Computes the position of the cells in the embedding space

    Parameters
    ----------
    X : array, optional, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_dimensions`
        dimensions. Not required, since PHATE does not currently embed
        cells not given in the input matrix to `PHATE.fit()`.
        Accepted data types: `numpy.ndarray`, `scipy.sparse.spmatrix`,
        `pd.DataFrame`, `anndata.AnnData`. If `knn_dist` is
        'precomputed', `data` should be a n_samples x n_samples distance
        or affinity matrix
    t_max : int, optional, default: 100
        maximum t to test if `t` is set to 'auto'
    plot_optimal_t : boolean, optional, default: False
        If true and `t` is set to 'auto', plot the Von Neumann
        entropy used to select t
    ax : matplotlib.axes.Axes, optional
        If given and `plot_optimal_t` is true, plot will be drawn
        on the given axis.

    Returns
    -------
    embedding : array, shape=[n_samples, n_dimensions]
        The cells embedded in a lower dimensional space using PHATE
    """
    if self.graph is None:
        raise NotFittedError("This PHATE instance is not fitted yet. Call "
                             "'fit' with appropriate arguments before "
                             "using this method.")
    elif X is not None and not utils.matrix_is_equivalent(X, self.X):
        # fit to external data
        warnings.warn(
            "Pre-fit PHATE should not be used to transform a "
            "new data matrix. Please fit PHATE to the new"
            " data by running 'fit' with the new data.", RuntimeWarning)
        if isinstance(self.graph, graphtools.graphs.TraditionalGraph) and \
                self.graph.precomputed is not None:
            raise ValueError("Cannot transform additional data using a "
                             "precomputed distance matrix.")
        else:
            if self.embedding is None:
                self.transform()
            transitions = self.graph.extend_to_data(X)
            return self.graph.interpolate(self.embedding, transitions)
    else:
        diff_potential = self._calculate_potential(
            t_max=t_max, plot_optimal_t=plot_optimal_t, ax=ax)
        if self.embedding is None:
            tasklogger.log_start("{} MDS".format(self.mds))
            self.embedding = mds.embed_MDS(
                diff_potential,
                ndim=self.n_components,
                how=self.mds,
                distance_metric=self.mds_dist,
                n_jobs=self.n_jobs,
                seed=self.random_state,
                verbose=max(self.verbose - 1, 0))
            tasklogger.log_complete("{} MDS".format(self.mds))
        if isinstance(self.graph, graphtools.graphs.LandmarkGraph):
            tasklogger.log_debug("Extending to original data...")
            return self.graph.interpolate(self.embedding)
        else:
            return self.embedding
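# Hedged usage sketch: fitting once and calling transform separately, letting
# PHATE pick t from the Von Neumann entropy knee and plotting the selection.
# The toy data and keyword values are illustrative only.
import numpy as np
import phate

X = np.random.normal(size=(200, 50))
phate_op = phate.PHATE(t='auto', verbose=0)
phate_op.fit(X)
embedding = phate_op.transform(t_max=100, plot_optimal_t=True)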