def symmetrize_kernel(self, K):
    """Symmetrize the kernel, supporting a per-sample-pair gamma matrix.

    When `kernel_symm` is 'gamma' and `gamma` is set to something other
    than a scalar number, every pair of sample blocks `(i, j)` is
    symmetrized with its own weight `gamma[i, j]`:
    `gamma[i, j] * min(Kij, Kji.T) + (1 - gamma[i, j]) * max(Kij, Kji.T)`.
    Any other configuration is delegated to the parent class.

    Parameters
    ----------
    K : kernel matrix to symmetrize

    Returns
    -------
    K : the symmetrized kernel matrix
    """
    uses_gamma_matrix = (
        self.kernel_symm == 'gamma'
        and self.gamma is not None
        and not isinstance(self.gamma, numbers.Number))
    if not uses_gamma_matrix:
        return super().symmetrize_kernel(K)

    # Gamma can be a matrix with specific values transitions for
    # each batch. This allows for technical replicates and
    # experimental samples to be corrected simultaneously
    tasklogger.log_debug("Using gamma symmetrization. "
                         "Gamma:\n{}".format(self.gamma))
    masks = [self.sample_idx == sample for sample in self.samples]
    n_blocks = len(masks)
    for i, rows in enumerate(masks):
        # only the upper triangle of sample pairs is visited; the mirrored
        # block is written from the transpose below
        for j in range(i, n_blocks):
            cols = masks[j]
            forward = K[np.ix_(rows, cols)]
            backward = K[np.ix_(cols, rows)]
            weight = self.gamma[i, j]
            block = weight * elementwise_minimum(forward, backward.T) + \
                (1 - weight) * elementwise_maximum(forward, backward.T)
            K = set_submatrix(K, rows, cols, block)
            if i != j:
                K = set_submatrix(K, cols, rows, block.T)
    return K
def build_kernel(self):
    """Build the MNN kernel.

    One graph is created per sample (without initializing its kernel).
    Then, for every ordered pair of samples, the kernel from sample `i`
    to sample `j` is computed with `build_kernel_to_data` and written
    into the matching block of the full kernel matrix. Blocks on the
    diagonal (within-sample affinities) are multiplied by `self.beta`.

    Returns
    -------
    K : kernel matrix, shape=[n_samples, n_samples]
        `scipy.sparse.lil_matrix` based if `thresh > 0` or `decay` is
        `None`, otherwise a dense array.
    """
    tasklogger.log_start("subgraphs")
    self.subgraphs = []
    # deferred import -- presumably avoids a circular import; confirm
    from .api import Graph
    for i, sample in enumerate(self.samples):
        in_sample = self.sample_idx == sample
        tasklogger.log_debug("subgraph {}: sample {}, "
                             "n = {}, knn = {}".format(
                                 i, sample, np.sum(in_sample),
                                 self.weighted_knn[i]))
        # kNN graph restricted to the cells of this sample
        subgraph = Graph(self.data_nu[in_sample],
                         n_pca=None,
                         knn=self.weighted_knn[i],
                         decay=self.decay,
                         distance=self.distance,
                         thresh=self.thresh,
                         verbose=self.verbose,
                         random_state=self.random_state,
                         n_jobs=self.n_jobs,
                         initialize=False)
        self.subgraphs.append(subgraph)
    tasklogger.log_complete("subgraphs")

    n_total = self.data_nu.shape[0]
    if self.thresh > 0 or self.decay is None:
        K = sparse.lil_matrix((n_total, n_total))
    else:
        K = np.zeros([n_total, n_total])

    for i, X in enumerate(self.subgraphs):
        for j, Y in enumerate(self.subgraphs):
            task = "kernel from sample {} to {}".format(
                self.samples[i], self.samples[j])
            tasklogger.log_start(task)
            Kij = Y.build_kernel_to_data(X.data_nu,
                                         knn=self.weighted_knn[i])
            if i == j:
                # downweight within-batch affinities by beta
                Kij = Kij * self.beta
            K = set_submatrix(K,
                              self.sample_idx == self.samples[i],
                              self.sample_idx == self.samples[j],
                              Kij)
            tasklogger.log_complete(task)
    return K
def __init__(self, kernel_symm='+', gamma=None, initialize=True, **kwargs):
    """Store the symmetrization settings and optionally build the kernel.

    Parameters
    ----------
    kernel_symm : symmetrization method, optional (default: '+')
        Checked together with `gamma` by `_check_symmetrization`.
    gamma : symmetrization constant, optional (default: None)
    initialize : bool, optional (default: True)
        If True, `self.K` is accessed immediately.
    **kwargs : forwarded to the parent class constructor
    """
    self.kernel_symm, self.gamma = kernel_symm, gamma
    self._check_symmetrization(kernel_symm, gamma)

    if not initialize:
        tasklogger.log_debug("Not initializing kernel.")
    else:
        tasklogger.log_debug("Initializing kernel...")
        # bare attribute access -- presumably `K` is a lazily computed
        # property, so touching it forces the kernel to be built; confirm
        self.K
    super().__init__(**kwargs)
def cmdscale_fast(D, ndim):
    """Fast CMDS using random SVD

    The input is squared elementwise, centered along both axes and then
    reduced with a randomized-solver PCA.

    Parameters
    ----------
    D : array-like, input data [n_samples, n_dimensions]

    ndim : int, number of dimensions in which to embed `D`

    Returns
    -------
    Y : array-like, embedded data [n_sample, ndim]
    """
    tasklogger.log_debug("Performing classic MDS on {} of shape {}...".format(
        type(D).__name__, D.shape))
    squared = D**2
    # remove column means first, then row means of the result
    column_centered = squared - squared.mean(axis=0)[None, :]
    centered = column_centered - column_centered.mean(axis=1)[:, None]
    reducer = PCA(n_components=ndim, svd_solver='randomized')
    return reducer.fit_transform(centered)
def _update_graph(self, X, precomputed, n_pca, n_landmark):
    """Reuse the stored graph when possible, otherwise reset it.

    Parameters
    ----------
    X : new input data, compared against the stored `self.X`
    precomputed : value forwarded to `graph.set_params`
    n_pca : value forwarded to `graph.set_params`
    n_landmark : value forwarded to `graph.set_params`
    """
    data_changed = self.X is not None and \
        not utils.matrix_is_equivalent(X, self.X)
    if data_changed:
        # If the same data is used, we can reuse existing kernel and
        # diffusion matrices. Otherwise we have to recompute.
        self._reset_graph()
        return

    try:
        self.graph.set_params(
            decay=self.decay,
            knn=self.knn,
            distance=self.knn_dist,
            precomputed=precomputed,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            n_pca=n_pca,
            n_landmark=n_landmark,
            random_state=self.random_state)
        tasklogger.log_info(
            "Using precomputed graph and diffusion operator...")
    except ValueError as err:
        # something changed that should have invalidated the graph
        tasklogger.log_debug("Reset graph due to {}".format(str(err)))
        self._reset_graph()
def symmetrize_kernel(self, K):
    """Symmetrize the kernel according to `self.kernel_symm`.

    Parameters
    ----------
    K : kernel matrix to symmetrize

    Returns
    -------
    K : kernel matrix after symmetrization
        - '+' : `(K + K.T) / 2`
        - '*' : elementwise product of `K` and `K.T`
        - 'gamma' : `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)`
        - None : `K` is returned unchanged

    Raises
    ------
    ValueError : if `self.kernel_symm` is none of the values above
    """
    if self.kernel_symm == "+":
        tasklogger.log_debug("Using addition symmetrization.")
        K = (K + K.T) / 2
    elif self.kernel_symm == "*":
        tasklogger.log_debug("Using multiplication symmetrization.")
        if hasattr(K, "multiply"):
            # sparse matrices expose an elementwise `multiply` method
            K = K.multiply(K.T)
        else:
            # dense arrays have no `multiply` method; use the numpy
            # elementwise product instead of failing with AttributeError
            import numpy
            K = numpy.multiply(K, K.T)
    elif self.kernel_symm == 'gamma':
        tasklogger.log_debug(
            "Using gamma symmetrization (gamma = {}).".format(self.gamma))
        K = self.gamma * elementwise_minimum(K, K.T) + \
            (1 - self.gamma) * elementwise_maximum(K, K.T)
    elif self.kernel_symm is None:
        tasklogger.log_debug("Using no symmetrization.")
    else:
        # this should never happen
        # report the offending kernel_symm value (not gamma) to the caller
        raise ValueError(
            "Expected kernel_symm in ['+', '*', 'gamma' or None]. "
            "Got {}".format(self.kernel_symm))
    return K
def run_magic_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # magic params
        t_magic='auto',
        genes=None,
        # output params
        output='magic.csv',
        validate=False):
    """Run MAGIC on a file

    Loads the data, optionally filters, normalizes and transforms it, runs
    MAGIC and writes the smoothed matrix to `output` as CSV.

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        gene names, list gives an array of gene names, `False` means
        no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing gene names, list gives an array of gene names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        cell names, list gives an array of cell names, `False` means
        no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing cell names, list gives an array of cell names, or `False`
        means no cell names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data is of
        shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`,
        library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`,
        genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: 42
        The generator used to initialize random PCA
        If an integer is given, it fixes the seed
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    t_magic : int or 'auto', optional, default: 'auto'
        power to which the diffusion operator is powered for MAGIC.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes to return from MAGIC,
        either as integer indices or column names
        if input data is a pandas DataFrame. If "all_genes", the entire
        smoothed matrix is returned. If "pca_only", PCA on the smoothed
        data is returned. If None, the entire matrix is also
        returned, but a warning may be raised if the resultant matrix
        is very large.
    output : str, optional (default: 'magic.csv')
        Output CSV file to save smoothed data matrix
    validate : bool, optional (default: False)
        If True, the result is compared against a reference CSV downloaded
        from the magic-docker repository; an `AssertionError` propagates
        if the result is neither equal nor numerically close
        (`atol=1e-14`) to the reference.
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(
        transform=transform, pseudocount=pseudocount, cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data,
                                                 cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info(
            "Filtering genes by min cells >= {}...".format(min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run MAGIC
    # https://magic.readthedocs.io/
    magic_op = magic.MAGIC(knn=knn,
                           decay=decay,
                           t=t_magic,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose)
    magic_data = magic_op.fit_transform(data, genes=genes)

    # save as csv
    magic_data = pd.DataFrame(magic_data)
    if cell_axis in ['col', 'column']:
        # write the output in the same orientation as the input
        magic_data = magic_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    magic_data.to_csv(output)
    tasklogger.log_info("Complete.")

    if validate:
        correct_magic_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/magic-docker/'
            'master/magic-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(magic_data),
                                    scprep.utils.toarray(correct_magic_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            # exact comparison failed: fall back to a tolerance check, which
            # raises AssertionError itself if the data really differ
            np.testing.assert_allclose(
                scprep.utils.toarray(magic_data),
                scprep.utils.toarray(correct_magic_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent to expected"
            )
def test_log():
    """Emit one message at each tasklogger level, from debug to critical."""
    for level in ("debug", "info", "warning", "error", "critical"):
        # e.g. tasklogger.log_debug("debug")
        emit = getattr(tasklogger, "log_" + level)
        emit(level)
def impute(self, data, t_max=20, plot=False, ax=None,
           max_genes_compute_t=500, threshold=0.001):
    """Perform MAGIC imputation

    Parameters
    ----------
    data : graphtools.Graph, graphtools.Data or array-like
        Input data
    t_max : int, optional (default: 20)
        Maximum value of t to consider for optimal t selection
    plot : bool, optional (default: False)
        Plot the optimal t selection graph. Only possible when
        `self.t == 'auto'`; otherwise a `RuntimeWarning` is issued and
        no plot is drawn.
    ax : matplotlib.Axes, optional (default: None)
        Axis on which to plot. If None, a new axis is created
    max_genes_compute_t : int, optional (default: 500)
        Above this number, genes will be subsampled for
        optimal t selection
    threshold : float, optional (default: 0.001)
        Threshold after which Procrustes disparity is considered
        to have converged for optimal t selection

    Returns
    -------
    X_magic : array-like, shape=[n_samples, n_pca]
        Imputed data
    """
    if not isinstance(data, graphtools.base.Data):
        data = graphtools.base.Data(data, n_pca=self.n_pca)
    data_imputed = data.data_nu

    if data_imputed.shape[1] > max_genes_compute_t:
        subsample_genes = np.random.choice(data_imputed.shape[1],
                                           max_genes_compute_t,
                                           replace=False)
    else:
        subsample_genes = None
    # weighting by data.data_pca.explained_variance_ratio_ is disabled
    weights = None

    auto_t = self.t == 'auto'
    if plot and not auto_t:
        # the disparity curve is only computed during automatic t
        # selection; without it there is nothing to plot
        warnings.warn(
            "Cannot plot optimal t selection unless t='auto' "
            "(got t={}). Skipping plot.".format(self.t),
            RuntimeWarning)
        plot = False

    if auto_t:
        _, data_prev = self.calculate_error(
            data_imputed, data_prev=None,
            weights=weights,
            subsample_genes=subsample_genes)
        error_vec = []
        t_opt = None
    else:
        t_opt = self.t

    tasklogger.log_start("imputation")

    # classic magic
    # the diffusion matrix is powered when t has been specified by
    # the user, and the dimensions of the diffusion matrix are lesser
    # than those of the data matrix. (M^t) * D
    if (t_opt is not None) and \
            (self.diff_op.shape[1] < data_imputed.shape[1]):
        diff_op_t = np.linalg.matrix_power(self.diff_op, t_opt)
        data_imputed = diff_op_t.dot(data_imputed)

    # fast magic
    # a while loop is used when the dimensions of the diffusion matrix
    # are greater than those of the data matrix, or when t is not specified
    # (so as to allow for the calculation of the optimal t value)
    else:
        i = 0
        while (t_opt is None and i < t_max) or \
                (t_opt is not None and i < t_opt):
            i += 1
            data_imputed = self.diff_op.dot(data_imputed)
            if auto_t:
                error, data_prev = self.calculate_error(
                    data_imputed, data_prev,
                    weights=weights,
                    subsample_genes=subsample_genes)
                error_vec.append(error)
                tasklogger.log_debug("{}: {}".format(i, error_vec))
                if error < threshold and t_opt is None:
                    # t is chosen one step past the first step whose
                    # disparity is below threshold, so the loop runs one
                    # more iteration before stopping
                    t_opt = i + 1
                    tasklogger.log_info(
                        "Automatically selected t = {}".format(t_opt))

    tasklogger.log_complete("imputation")

    if plot:
        # continue to t_max
        tasklogger.log_start("optimal t plot")
        if t_opt is None:
            # never converged
            warnings.warn("optimal t > t_max ({})".format(t_max),
                          RuntimeWarning)
        else:
            # keep diffusing a copy so the curve covers 1..t_max without
            # changing the returned data
            data_overimputed = data_imputed
            while i < t_max:
                i += 1
                data_overimputed = self.diff_op.dot(data_overimputed)
                error, data_prev = self.calculate_error(
                    data_overimputed, data_prev,
                    weights=weights,
                    subsample_genes=subsample_genes)
                error_vec.append(error)

        # create axis
        if ax is None:
            fig, ax = plt.subplots()
            show = True
        else:
            show = False

        # plot
        x = np.arange(len(error_vec)) + 1
        ax.plot(x, error_vec)
        if t_opt is not None:
            ax.plot(t_opt, error_vec[t_opt - 1], 'ro', markersize=10,)
        ax.plot(x, np.full(len(error_vec), threshold), 'k--')
        ax.set_xlabel('t')
        ax.set_ylabel('disparity(data_{t}, data_{t-1})')
        ax.set_xlim([1, len(error_vec)])
        plt.tight_layout()
        tasklogger.log_complete("optimal t plot")
        if show:
            plt.show(block=False)

    return data_imputed
def fit(self, X):
    """Computes the diffusion operator

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_features`
        dimensions. Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.
        If `knn_dist` is 'precomputed', `X` is treated as a distance
        matrix when `X[0, 0] == 0` and as an affinity matrix otherwise.

    Returns
    -------
    magic_operator : MAGIC
        The estimator object
    """
    if self.knn_dist == 'precomputed':
        if isinstance(X, sparse.coo_matrix):
            # coo matrices do not support the indexing used below
            X = X.tocsr()
        if X[0, 0] == 0:
            precomputed = "distance"
        else:
            precomputed = "affinity"
        tasklogger.log_info(
            "Using precomputed {} matrix...".format(precomputed))
        n_pca = None
    else:
        precomputed = None
        if self.n_pca is None or X.shape[1] <= self.n_pca:
            n_pca = None
        else:
            n_pca = self.n_pca

    if self.graph is not None:
        if self.X is not None and not \
                utils.matrix_is_equivalent(X, self.X):
            # If the same data is used, we can reuse existing kernel and
            # diffusion matrices. Otherwise we have to recompute.
            self.graph = None
        else:
            try:
                self.graph.set_params(
                    decay=self.a, knn=self.k + 1, distance=self.knn_dist,
                    precomputed=precomputed,
                    n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca,
                    thresh=1e-4, random_state=self.random_state)
                tasklogger.log_info(
                    "Using precomputed graph and diffusion operator...")
            except ValueError as e:
                # something changed that should have invalidated the graph
                tasklogger.log_debug("Reset graph due to {}".format(
                    str(e)))
                self.graph = None

    self.X = X

    if utils.has_empty_columns(X):
        warnings.warn("Input matrix contains unexpressed genes. "
                      "Please remove them prior to running MAGIC.")

    if self.graph is None:
        # reset X_magic in case it was previously set
        self.X_magic = None
        tasklogger.log_start("graph and diffusion operator")
        # distance and precomputed must be passed here as well, otherwise
        # the new graph silently falls back to the Graph defaults and
        # disagrees with the values checked by set_params above
        self.graph = graphtools.Graph(
            X,
            n_pca=n_pca,
            distance=self.knn_dist,
            precomputed=precomputed,
            knn=self.k + 1,
            decay=self.a,
            thresh=1e-4,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state)
        tasklogger.log_complete("graph and diffusion operator")

    return self
def embed_MDS(X, ndim=2, how='metric', distance_metric='euclidean',
              n_jobs=1, seed=None, verbose=0):
    """Performs classic, metric, and non-metric MDS

    Metric MDS is initialized using classic MDS,
    non-metric MDS is initialized using metric MDS.

    Parameters
    ----------
    X: ndarray [n_samples, n_samples]
        2 dimensional input data array with n_samples
        embed_MDS does not check for matrix squareness,
        but this is necessary for PHATE

    ndim : int, optional, default: 2
        number of dimensions in which the data will be embedded

    how : string, optional, default: 'metric'
        choose from ['classic', 'metric', 'nonmetric']
        which MDS algorithm is used for dimensionality reduction

    distance_metric : string, optional, default: 'euclidean'
        choose from ['cosine', 'euclidean']
        distance metric for MDS

    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used

    seed: integer or numpy.RandomState, optional
        The generator used to initialize SMACOF (metric, nonmetric) MDS
        If an integer is given, it fixes the seed
        Defaults to the global numpy random number generator

    verbose : int, optional, default: 0
        verbosity level forwarded to `smacof`

    Returns
    -------
    Y : ndarray [n_samples, ndim]
        low dimensional embedding of X using MDS

    Raises
    ------
    ValueError : if `how` is not 'classic', 'metric' or 'nonmetric'
    """
    if how not in ['classic', 'metric', 'nonmetric']:
        raise ValueError("Allowable 'how' values for MDS: 'classic', "
                         "'metric', or 'nonmetric'. "
                         "'{}' was passed.".format(how))

    # MDS embeddings, each gives a different output.
    X_dist = squareform(pdist(X, distance_metric))

    # initialize all by CMDS
    Y = cmdscale_fast(X_dist, ndim)
    if how in ['metric', 'nonmetric']:
        tasklogger.log_debug("Performing metric MDS on "
                             "{} of shape {}...".format(
                                 type(X_dist), X_dist.shape))
        # Metric MDS from sklearn
        Y, _ = smacof(X_dist, n_components=ndim, metric=True, max_iter=3000,
                      eps=1e-6, random_state=seed, n_jobs=n_jobs,
                      n_init=1, init=Y, verbose=verbose)
    if how == 'nonmetric':
        tasklogger.log_debug("Performing non-metric MDS on "
                             "{} of shape {}...".format(
                                 type(X_dist), X_dist.shape))
        # Nonmetric MDS from sklearn using metric MDS as an initialization;
        # metric=False is what selects the non-metric algorithm
        Y, _ = smacof(X_dist, n_components=ndim, metric=False, max_iter=3000,
                      eps=1e-6, random_state=seed, n_jobs=n_jobs,
                      n_init=1, init=Y, verbose=verbose)
    return Y
def fit(self, X, graph=None):
    """Computes the diffusion operator

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_features`
        dimensions. Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.
    graph : `graphtools.Graph`, optional (default: None)
        If given, provides a precomputed kernel matrix with which to
        perform diffusion. The operator's `knn`, `decay`, `n_pca` and
        `knn_dist` are overwritten with the graph's values.

    Returns
    -------
    magic_operator : MAGIC
        The estimator object
    """
    if self.n_pca is None or X.shape[1] <= self.n_pca:
        n_pca = None
    else:
        n_pca = self.n_pca

    tasklogger.log_info("Running MAGIC on {} cells and {} genes.".format(
        X.shape[0], X.shape[1]))

    if graph is None:
        # no graph supplied: try to reuse the one from a previous fit
        graph = self.graph
        if self.X is not None and not \
                utils.matrix_is_equivalent(X, self.X):
            # If the same data is used, we can reuse existing kernel and
            # diffusion matrices. Otherwise we have to recompute.
            tasklogger.log_debug(
                "Reset graph due to difference in input data")
            graph = None
        elif graph is not None:
            try:
                graph.set_params(
                    decay=self.decay, knn=self.knn, distance=self.knn_dist,
                    n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca,
                    thresh=1e-4, random_state=self.random_state)
            except ValueError as e:
                # something changed that should have invalidated the graph
                tasklogger.log_debug("Reset graph due to {}".format(
                    str(e)))
                graph = None
    else:
        self.knn = graph.knn
        # `decay` is the attribute read everywhere else in this method;
        # `alpha` is still set so code reading the old name keeps working
        self.decay = graph.decay
        self.alpha = graph.decay
        self.n_pca = graph.n_pca
        self.knn_dist = graph.distance

    self.X = X

    if utils.has_empty_columns(X):
        warnings.warn("Input matrix contains unexpressed genes. "
                      "Please remove them prior to running MAGIC.")

    if graph is not None:
        tasklogger.log_info(
            "Using precomputed graph and diffusion operator...")
        self.graph = graph
    else:
        # reset X_magic in case it was previously set
        self.X_magic = None
        tasklogger.log_start("graph and diffusion operator")
        # pass the distance metric so the graph matches what set_params
        # checks on a later refit
        self.graph = graphtools.Graph(
            X,
            n_pca=n_pca,
            knn=self.knn,
            decay=self.decay,
            distance=self.knn_dist,
            thresh=1e-4,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state)
        tasklogger.log_complete("graph and diffusion operator")

    return self
def fit(self, X):
    """Computes the diffusion operator

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_features`
        dimensions. Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
        `knn_dist` is 'precomputed', `X` should be a n_samples x
        n_samples distance or affinity matrix

    Returns
    -------
    phate_operator : PHATE
        The estimator object

    Raises
    ------
    ValueError : if `knn_dist` starts with 'precomputed' but is not one of
        'precomputed', 'precomputed_distance' or 'precomputed_affinity'
    """
    try:
        if isinstance(X, anndata.AnnData):
            X = X.X
    except NameError:
        # anndata not installed
        pass

    if self.knn_dist.startswith('precomputed'):
        if self.knn_dist == 'precomputed':
            # automatic detection
            if isinstance(X, sparse.coo_matrix):
                X = X.tocsr()
            if X[0, 0] == 0:
                precomputed = "distance"
            else:
                precomputed = "affinity"
        elif self.knn_dist in ['precomputed_affinity',
                               'precomputed_distance']:
            precomputed = self.knn_dist.split("_")[1]
        else:
            # fill in the placeholder so the user sees the rejected value
            raise ValueError(
                "knn_dist {} not recognized. Did you mean "
                "'precomputed_distance', "
                "'precomputed_affinity', or 'precomputed' "
                "(automatically detects distance or affinity)?".format(
                    self.knn_dist))
        tasklogger.log_info(
            "Using precomputed {} matrix...".format(precomputed))
        n_pca = None
    else:
        precomputed = None
        # n_pca=None means "no PCA"; guard it before comparing so the
        # comparison cannot raise TypeError
        if self.n_pca is None or X.shape[1] <= self.n_pca:
            n_pca = None
        else:
            n_pca = self.n_pca
    if self.n_landmark is None or X.shape[0] <= self.n_landmark:
        n_landmark = None
    else:
        n_landmark = self.n_landmark

    if self.graph is not None:
        if self.X is not None and not matrix_is_equivalent(X, self.X):
            # If the same data is used, we can reuse existing kernel and
            # diffusion matrices. Otherwise we have to recompute.
            self._reset_graph()
        else:
            try:
                self.graph.set_params(
                    decay=self.a, knn=self.k + 1, distance=self.knn_dist,
                    precomputed=precomputed,
                    n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca,
                    thresh=1e-4, n_landmark=n_landmark,
                    random_state=self.random_state)
                tasklogger.log_info(
                    "Using precomputed graph and diffusion operator...")
            except ValueError as e:
                # something changed that should have invalidated the graph
                tasklogger.log_debug("Reset graph due to {}".format(
                    str(e)))
                self._reset_graph()

    self.X = X

    if self.graph is None:
        tasklogger.log_start("graph and diffusion operator")
        self.graph = graphtools.Graph(
            X,
            n_pca=n_pca,
            n_landmark=n_landmark,
            distance=self.knn_dist,
            precomputed=precomputed,
            knn=self.k + 1,
            decay=self.a,
            thresh=1e-4,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state)
        tasklogger.log_complete("graph and diffusion operator")

    # landmark op doesn't build unless forced
    self.diff_op
    return self
def build_kernel_to_data(self, Y, knn=None):
    """Build a kernel from new input data `Y` to the `self.data`

    Parameters
    ----------

    Y: array-like, [n_samples_y, n_features]
        new data for which an affinity matrix is calculated
        to the existing data. `n_features` must match
        either the ambient or PCA dimensions

    knn : `int` or `None`, optional (default: `None`)
        If `None`, defaults to `self.knn`. Values larger than
        `self.data.shape[0]` are reduced to `self.data.shape[0]` with a
        warning.

    Returns
    -------

    K_yx: array-like, [n_samples_y, n_samples]
        kernel matrix where each row represents affinities of a single
        sample in `Y` to all samples in `self.data`.

    Raises
    ------

    ValueError: if the supplied data is the wrong shape
    """
    if knn is None:
        knn = self.knn
    if knn > self.data.shape[0]:
        warnings.warn("Cannot set knn ({k}) to be greater than "
                      "data.shape[0] ({n}). Setting knn={n}".format(
                          k=knn, n=self.data.shape[0]))
        # actually apply the cap announced by the warning
        knn = self.data.shape[0]

    Y = self._check_extension_shape(Y)
    tasklogger.log_start("KNN search")
    if self.decay is None or self.thresh == 1:
        # binary connectivity matrix
        K = self.knn_tree.kneighbors_graph(
            Y, n_neighbors=knn,
            mode='connectivity')
        tasklogger.log_complete("KNN search")
    else:
        # sparse fast alpha decay
        knn_tree = self.knn_tree
        search_knn = min(knn * 20, self.data_nu.shape[0])
        distances, indices = knn_tree.kneighbors(
            Y, n_neighbors=search_knn)
        if np.any(distances[:, 1] == 0):
            # a zero distance to the second neighbor is reported as a
            # duplicate sample
            has_duplicates = distances[:, 1] == 0
            idx = np.argwhere((distances == 0) & has_duplicates[:, None])
            # reshape keeps the array two-dimensional even when no pair
            # passes the filter, so the column indexing below cannot fail
            duplicate_ids = np.array(
                [[indices[i[0], i[1]], i[0]]
                 for i in idx if indices[i[0], i[1]] < i[0]]
            ).reshape(-1, 2)
            duplicate_ids = duplicate_ids[np.argsort(duplicate_ids[:, 0])]
            duplicate_names = ", ".join(
                ["{} and {}".format(i[0], i[1]) for i in duplicate_ids])
            warnings.warn(
                "Detected zero distance between samples {}. "
                "Consider removing duplicates to avoid errors in "
                "downstream processing.".format(duplicate_names),
                RuntimeWarning)
        tasklogger.log_complete("KNN search")
        tasklogger.log_start("affinities")
        # per-row bandwidth is the distance to the knn-th neighbor; beyond
        # `radius` the decayed affinity falls below `thresh`
        bandwidth = distances[:, knn - 1]
        radius = bandwidth * np.power(-1 * np.log(self.thresh),
                                      1 / self.decay)
        # rows whose farthest retrieved neighbor is still inside the
        # radius need a wider search
        update_idx = np.argwhere(
            np.max(distances, axis=1) < radius).reshape(-1)
        tasklogger.log_debug("search_knn = {}; {} remaining".format(
            search_knn, len(update_idx)))
        if len(update_idx) > 0:
            # rows will get different lengths, so switch to lists of rows
            distances = list(distances)
            indices = list(indices)
        while len(update_idx) > Y.shape[0] // 10 and \
                search_knn < self.data_nu.shape[0] / 2:
            # increase the knn search
            search_knn = min(search_knn * 20, self.data_nu.shape[0])
            dist_new, ind_new = knn_tree.kneighbors(
                Y[update_idx], n_neighbors=search_knn)
            for i, idx in enumerate(update_idx):
                distances[idx] = dist_new[i]
                indices[idx] = ind_new[i]
            update_idx = [i for i, d in enumerate(distances)
                          if np.max(d) < radius[i]]
            tasklogger.log_debug("search_knn = {}; {} remaining".format(
                search_knn, len(update_idx)))
        if search_knn > self.data_nu.shape[0] / 2:
            # n_neighbors is passed by keyword; newer scikit-learn releases
            # reject it as a positional argument
            knn_tree = NearestNeighbors(
                n_neighbors=knn, algorithm='brute',
                n_jobs=self.n_jobs).fit(self.data_nu)
        if len(update_idx) > 0:
            tasklogger.log_debug(
                "radius search on {}".format(len(update_idx)))
            # give up - radius search
            dist_new, ind_new = knn_tree.radius_neighbors(
                Y[update_idx, :],
                radius=np.max(radius[update_idx]))
            for i, idx in enumerate(update_idx):
                distances[idx] = dist_new[i]
                indices[idx] = ind_new[i]
        # assemble a CSR matrix directly from the per-row neighbor lists
        data = np.concatenate(
            [d / b for d, b in zip(distances, bandwidth)])
        indices = np.concatenate(indices)
        indptr = np.concatenate(
            [[0], np.cumsum([len(d) for d in distances])])
        K = sparse.csr_matrix((data, indices, indptr),
                              shape=(Y.shape[0], self.data_nu.shape[0]))
        K.data = np.exp(-1 * np.power(K.data, self.decay))
        # handle nan
        K.data = np.where(np.isnan(K.data), 1, K.data)
        # TODO: should we zero values that are below thresh?
        K.data[K.data < self.thresh] = 0
        K = K.tocoo()
        K.eliminate_zeros()
        K = K.tocsr()
        tasklogger.log_complete("affinities")
    return K
def Graph(data,
          n_pca=None,
          sample_idx=None,
          adaptive_k='sqrt',
          precomputed=None,
          knn=5,
          decay=10,
          distance='euclidean',
          thresh=1e-4,
          kernel_symm='+',
          gamma=None,
          n_landmark=None,
          n_svd=100,
          beta=1,
          n_jobs=-1,
          verbose=False,
          random_state=None,
          graphtype='auto',
          use_pygsp=False,
          initialize=True,
          **kwargs):
    """Create a graph built on data.

    Automatically selects the appropriate DataGraph subclass based on
    chosen parameters.

    Selection criteria:
    - if `graphtype` is given, this will be respected
    - otherwise:
    -- if `sample_idx` is given, an MNNGraph will be created
    -- if `precomputed` is not given, and either `decay` is `None` or
    `thresh` is greater than zero, a kNNGraph will be created
    - otherwise, a TraditionalGraph will be created.

    Incompatibilities:
    - MNNGraph and kNNGraph cannot be precomputed
    - kNNGraph and TraditionalGraph do not accept sample indices

    Parameters
    ----------
    data : array-like, shape=[n_samples,n_features]
        accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
        TODO: accept pandas dataframes

    n_pca : `int` or `None`, optional (default: `None`)
        number of PC dimensions to retain for graph building.
        If `None`, uses the original data.
        Note: if data is sparse, uses SVD instead of PCA
        TODO: should we subtract and store the mean?

    knn : `int`, optional (default: 5)
        Number of nearest neighbors (including self) to use to build the
        graph

    decay : `int` or `None`, optional (default: 10)
        Rate of alpha decay to use. If `None`, alpha decay is not used.

    distance : `str`, optional (default: `'euclidean'`)
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
        TODO: actually sklearn.neighbors has even more choices

    thresh : `float`, optional (default: `1e-4`)
        Threshold above which to calculate alpha decay kernel.
        All affinities below `thresh` will be set to zero in order to save
        on time and memory constraints.

    kernel_symm : string, optional (default: '+')
        Defines method of MNN symmetrization.
        '+'  : additive
        '*'  : multiplicative
        'gamma' : min-max
        'none' : no symmetrization

    gamma: float (default: None)
        Min-max symmetrization constant or matrix. Only used if
        kernel_symm='gamma'.
        K = `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)`

    precomputed : {'distance', 'affinity', 'adjacency', `None`}, optional
        (default: `None`)
        If the graph is precomputed, this variable denotes which graph
        matrix is provided as `data`.
        Only one of `precomputed` and `n_pca` can be set.

    beta: float, optional (default: 1)
        Multiply within-batch connections by beta

    sample_idx: array-like
        Batch index for MNN kernel

    adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: 'sqrt')
        Weights MNN kernel adaptively using the number of cells in
        each sample according to the selected method.

    n_landmark : `int` or `None`, optional (default: `None`)
        number of landmarks to use. If `None`, no landmark graph is built

    n_svd : `int`, optional (default: 100)
        number of SVD components to use for spectral clustering

    random_state : `int` or `None`, optional (default: `None`)
        Random state for random PCA

    verbose : `bool`, optional (default: `False`)
        Verbosity.
        TODO: should this be an integer instead to allow multiple
        levels of verbosity?

    n_jobs : `int`, optional (default : -1)
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used

    graphtype : {'exact', 'knn', 'mnn', 'auto'} (Default: 'auto')
        Manually selects graph type. Only recommended for expert users

    use_pygsp : `bool` (Default: `False`)
        If true, inherits from `pygsp.graphs.Graph`.

    initialize : `bool` (Default: `True`)
        If True, initialize the kernel matrix on instantiation

    **kwargs : extra arguments for `pygsp.graphs.Graph`

    Returns
    -------
    G : `DataGraph`

    Raises
    ------
    ValueError : if selected parameters are incompatible or `graphtype`
        is not recognized.
    RuntimeError : if no class matching the selected combination of
        parent classes exists in `graphs`.
    """
    tasklogger.set_level(verbose)
    if sample_idx is not None and len(np.unique(sample_idx)) == 1:
        warnings.warn("Only one unique sample. "
                      "Not using MNNGraph")
        sample_idx = None
        if graphtype == 'mnn':
            # an MNN graph is meaningless with one batch; pick automatically
            graphtype = 'auto'
    if graphtype == 'auto':
        # automatic graph selection
        if sample_idx is not None:
            # only mnn does batch correction
            graphtype = "mnn"
        elif precomputed is None and (decay is None or thresh > 0):
            # precomputed requires exact graph
            # no decay or threshold decay require knngraph
            graphtype = "knn"
        else:
            graphtype = "exact"

    # set base graph type
    if graphtype == "knn":
        basegraph = graphs.kNNGraph
        if precomputed is not None:
            raise ValueError("kNNGraph does not support precomputed "
                             "values. Use `graphtype='exact'` or "
                             "`precomputed=None`")
        if sample_idx is not None:
            raise ValueError("kNNGraph does not support batch "
                             "correction. Use `graphtype='mnn'` or "
                             "`sample_idx=None`")

    elif graphtype == "mnn":
        basegraph = graphs.MNNGraph
        if precomputed is not None:
            raise ValueError("MNNGraph does not support precomputed "
                             "values. Use `graphtype='exact'` and "
                             "`sample_idx=None` or `precomputed=None`")
    elif graphtype == "exact":
        basegraph = graphs.TraditionalGraph
        if sample_idx is not None:
            raise ValueError("TraditionalGraph does not support batch "
                             "correction. Use `graphtype='mnn'` or "
                             "`sample_idx=None`")
    else:
        raise ValueError("graphtype '{}' not recognized. Choose from "
                         "['knn', 'mnn', 'exact', 'auto']".format(graphtype))

    # set add landmarks if necessary
    parent_classes = [basegraph]
    msg = "Building {} graph".format(graphtype)
    if n_landmark is not None:
        parent_classes.append(graphs.LandmarkGraph)
        msg = msg + " with landmarks"
    if use_pygsp:
        parent_classes.append(base.PyGSPGraph)
        if len(parent_classes) > 2:
            # landmarks were already mentioned: "... with landmarks and ..."
            msg = msg + " and PyGSP inheritance"
        else:
            msg = msg + " with PyGSP inheritance"

    tasklogger.log_debug(msg)

    # The combined class is named by concatenating the parent class names
    # with their "Graph" suffix stripped, e.g. kNN + Landmark ->
    # "kNNLandmarkGraph". Look it up as an attribute instead of using eval.
    class_name = "".join(
        p.__name__.replace("Graph", "") for p in parent_classes) + "Graph"
    try:
        graph_class = getattr(graphs, class_name)
    except AttributeError:
        raise RuntimeError(
            "unknown graph classes {}".format(parent_classes))

    # Named arguments of this function, captured after the adjustments made
    # above (`sample_idx` and `graphtype` may have been changed).
    named_args = {
        "data": data,
        "n_pca": n_pca,
        "sample_idx": sample_idx,
        "adaptive_k": adaptive_k,
        "precomputed": precomputed,
        "knn": knn,
        "decay": decay,
        "distance": distance,
        "thresh": thresh,
        "kernel_symm": kernel_symm,
        "gamma": gamma,
        "n_landmark": n_landmark,
        "n_svd": n_svd,
        "beta": beta,
        "n_jobs": n_jobs,
        "verbose": verbose,
        "random_state": random_state,
        "graphtype": graphtype,
        "use_pygsp": use_pygsp,
        "initialize": initialize,
    }

    # Forward only the arguments each parent class declares; anything not
    # named above is left to whatever the caller supplied in **kwargs.
    params = kwargs
    for parent_class in parent_classes:
        for param in parent_class._get_param_names():
            if param in named_args:
                params[param] = named_args[param]

    # build graph and return
    tasklogger.log_debug("Initializing {} with arguments {}".format(
        parent_classes,
        ", ".join("{}='{}'".format(key, value)
                  for key, value in params.items()
                  if key != "data")))
    return graph_class(**params)
def transform(self, X=None, t_max=100, plot_optimal_t=False, ax=None):
    """Compute the position of the cells in the embedding space.

    Parameters
    ----------
    X : array, optional, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_features`
        dimensions. Not required: if omitted (or equivalent to the
        data stored in `self.X`), the stored embedding is computed and
        returned. If a different matrix is given, a `RuntimeWarning`
        is issued and the new points are placed by extending the
        fitted graph to `X` and interpolating the existing embedding.
        Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
        `knn_dist` is 'precomputed', `data` should be a n_samples x
        n_samples distance or affinity matrix

    t_max : int, optional, default: 100
        maximum t to test if `t` is set to 'auto'

    plot_optimal_t : boolean, optional, default: False
        If true and `t` is set to 'auto', plot the Von Neumann
        entropy used to select t

    ax : matplotlib.axes.Axes, optional
        If given and `plot_optimal_t` is true, plot will be drawn
        on the given axis.

    Returns
    -------
    embedding : array, shape=[n_samples, n_dimensions]
        The cells embedded in a lower dimensional space using PHATE

    Raises
    ------
    NotFittedError : if `self.graph` is `None`
    ValueError : if new data is given but the graph was built from a
        precomputed matrix
    """
    if self.graph is None:
        raise NotFittedError("This PHATE instance is not fitted yet. Call "
                             "'fit' with appropriate arguments before "
                             "using this method.")

    if X is not None and not utils.matrix_is_equivalent(X, self.X):
        # fit to external data
        warnings.warn("Pre-fit PHATE should not be used to transform a "
                      "new data matrix. Please fit PHATE to the new"
                      " data by running 'fit' with the new data.",
                      RuntimeWarning)
        if isinstance(self.graph, graphtools.graphs.TraditionalGraph) and \
                self.graph.precomputed is not None:
            raise ValueError("Cannot transform additional data using a "
                             "precomputed distance matrix.")
        if self.embedding is None:
            # Embed the fitted data first, honouring the caller's options
            # rather than silently falling back to the defaults.
            self.transform(t_max=t_max,
                           plot_optimal_t=plot_optimal_t,
                           ax=ax)
        transitions = self.graph.extend_to_data(X)
        return self.graph.interpolate(self.embedding, transitions)

    diff_potential = self._calculate_potential(
        t_max=t_max, plot_optimal_t=plot_optimal_t, ax=ax)
    if self.embedding is None:
        tasklogger.log_start("{} MDS".format(self.mds))
        self.embedding = mds.embed_MDS(
            diff_potential,
            ndim=self.n_components,
            how=self.mds,
            distance_metric=self.mds_dist,
            n_jobs=self.n_jobs,
            seed=self.random_state,
            verbose=max(self.verbose - 1, 0))
        tasklogger.log_complete("{} MDS".format(self.mds))
    if isinstance(self.graph, graphtools.graphs.LandmarkGraph):
        # the embedding was computed on landmarks only
        tasklogger.log_debug("Extending to original data...")
        return self.graph.interpolate(self.embedding)
    return self.embedding
def test_log():
    """Emit one message through each tasklogger level function.

    Calls `log_debug`, `log_info`, `log_warning`, `log_error` and
    `log_critical` in that order, each with its own level name as the
    message.
    """
    for level in ('debug', 'info', 'warning', 'error', 'critical'):
        emit = getattr(tasklogger, 'log_' + level)
        emit(level)
args.metadata_channels = None else: parser.error( "Cannot handle --metadata-channels with {} file".format( filetype)) # check for inappropriately set parameters if not args.transform == 'log': if '--pseudocount' in sys.argv: parser.error( "Cannot handle --pseudocount with --transform {}".format( args.transform)) else: args.pseudocount = None if not args.transform == 'arcsinh': if '--cofactor' in sys.argv: parser.error("Cannot handle --cofactor with --transform {}".format( args.transform)) else: args.cofactor = None return args if __name__ == "__main__": args = parse_args() tasklogger.set_level(args.verbose) tasklogger.log_debug("Running MAGIC with arguments {}".format( args.__dict__)) run_magic_from_file(**(args.__dict__))