def measure_splat_range(load_fn, var_name, var_range, n_jobs=1, seed=None,
                        **load_kwargs):
    truth_kwargs = {"dropout": 0, "bcv": 0}
    truth_kwargs.update(load_kwargs)
    tasklogger.log_info("Generating ground truth data...", logger="demap")
    data_truth = load_fn(seed=seed, **truth_kwargs)
    results = pd.concat([
        measure_all_methods(
            data_truth,
            load_fn,
            load_params=dict(**{var_name: var_value}, seed=seed, **load_kwargs),
            n_jobs=n_jobs,
            seed=seed,
        ) for var_value in var_range
    ])
    results.to_csv("../results/{}_{}_{}_{}_{}.csv".format(
        load_fn.__name__, var_name, var_range.min(), var_range.max(), seed))
    results_agg = (results.groupby("method")
                   .agg({"demap": [np.mean, np.std]})
                   .sort_values(("demap", "mean"), ascending=False))
    print(results_agg)
    return results_agg
def get_levels(grad):
    """Identify salient levels of resolution in the condensation tree.

    Parameters
    ----------
    grad : array-like
        Gradient of the condensation tree between successive iterations.

    Returns
    -------
    levels : list of int
        Indices of salient levels (local minima of the gradient),
        always including level 0.
    """
    tasklogger.log_info("Identifying salient levels of resolution...")
    minimum = np.max(grad)
    levels = [0]
    for i in range(1, len(grad) - 1):
        if grad[i] <= minimum and grad[i] < grad[i + 1]:
            levels.append(i)
            minimum = grad[i]
    return levels
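# A minimal usage sketch for get_levels (assumed, not from the source): the
# function scans the gradient for salient local minima and always keeps
# iteration 0.
#
# >>> import numpy as np
# >>> grad = np.array([5.0, 3.0, 4.0, 2.0, 6.0, 1.0, 7.0])
# >>> get_levels(grad)
# [0, 1, 3, 5]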
def compute_gradient(Xs, merges):
    """Compute the gradient of the condensation tree.

    Parameters
    ----------
    Xs : list of array-like
        Embeddings of the data at successive condensation iterations.
    merges : list
        Point merges performed at each condensation iteration.

    Returns
    -------
    gradient : numpy.ndarray
        Sum of absolute changes between successive embeddings.
    """
    tasklogger.log_info("Computing gradient...")
    gradient = []
    m = 0
    X = Xs[0]
    for l in range(0, len(Xs) - 1):
        if X.shape[0] != Xs[l + 1].shape[0]:
            # condense the current embedding until it matches the next level
            X_1 = condense_visualization(merges[m], X)
            m = m + 1
            while X_1.shape[0] != Xs[l + 1].shape[0]:
                X_1 = condense_visualization(merges[m], X_1)
                m = m + 1
        else:
            X_1 = X
        gradient.append(np.sum(np.abs(X_1 - Xs[l + 1])))
        X = Xs[l + 1]
    return np.array(gradient)
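# A hedged sketch of compute_gradient on toy inputs (assumed data): when
# successive embeddings keep the same number of points, `merges` is never
# consulted and the gradient reduces to the summed absolute displacement
# between iterations.
#
# >>> import numpy as np
# >>> Xs = [np.zeros((3, 2)), np.ones((3, 2)), np.ones((3, 2))]
# >>> compute_gradient(Xs, merges=[])
# array([6., 0.])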
def fit(self, X):
    """Computes the diffusion operator

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_dimensions`
        dimensions. Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
        `knn_dist` is 'precomputed', `data` should be a n_samples x
        n_samples distance or affinity matrix

    Returns
    -------
    phate_operator : PHATE
        The estimator object
    """
    X, n_pca, precomputed, update_graph = self._parse_input(X)

    if precomputed is None:
        tasklogger.log_info(
            "Running PHATE on {} cells and {} genes.".format(
                X.shape[0], X.shape[1]))
    else:
        tasklogger.log_info(
            "Running PHATE on precomputed {} matrix with {} cells.".format(
                precomputed, X.shape[0]))

    if self.n_landmark is None or X.shape[0] <= self.n_landmark:
        n_landmark = None
    else:
        n_landmark = self.n_landmark

    if self.graph is not None and update_graph:
        self._update_graph(X, precomputed, n_pca, n_landmark)

    self.X = X

    if self.graph is None:
        tasklogger.log_start("graph and diffusion operator")
        self.graph = graphtools.Graph(
            X,
            n_pca=n_pca,
            n_landmark=n_landmark,
            distance=self.knn_dist,
            precomputed=precomputed,
            knn=self.knn,
            decay=self.decay,
            thresh=1e-4,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            **(self.kwargs))
        tasklogger.log_complete("graph and diffusion operator")

    # landmark op doesn't build unless forced
    self.diff_op
    return self
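# For context, a typical call path into this fit method goes through the
# public PHATE API (a sketch; the data `X` and parameter values are
# illustrative, not from the source):
#
# >>> import phate
# >>> op = phate.PHATE(knn=5, decay=40, n_jobs=-2)
# >>> Y = op.fit_transform(X)  # calls fit(X), then transform()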
def measure_all_methods(load_fn, n_cells=None, n_jobs=1, load_params=None,
                        seed=None):
    if load_params is None:
        load_params = {}
    if "n_cells" in load_params:
        n_cells = load_params["n_cells"]
        del load_params["n_cells"]
    tasklogger.log_info("Generating noisy data with {}...".format(load_params),
                        logger="demap")
    data_noised, labels = load_fn(return_groups=True, **load_params)
    data_name = load_fn.__name__
    if n_cells is not None:
        subsample_idx = np.random.choice(data_noised.shape[0], n_cells,
                                         replace=False)
    else:
        subsample_idx = None
    measure = partial(
        measure_method,
        labels=labels,
        data_noised=data_noised,
        data_name=data_name,
        subsample_idx=subsample_idx,
    )
    if n_jobs == 1:
        results = [
            measure(method=method) for method in demap.embed.all_methods
        ]
    else:
        results = Parallel(n_jobs=n_jobs)(
            delayed(measure)(method=method)
            for method in demap.embed.parallel_methods)
        results = results + [
            measure(method=method)
            for method in demap.embed.non_parallel_methods
        ]
    df = pd.concat(results)
    df = df.sort_values("ARI", ascending=False)
    for key, value in load_params.items():
        df[key] = value
    if n_cells is not None:
        df["n_cells"] = n_cells
    print(df)
    return df
def _find_optimal_t(self, t_max=100, plot=False, ax=None):
    """Find the optimal value of t

    Selects the optimal value of t based on the knee point of the
    Von Neumann Entropy of the diffusion operator.

    Parameters
    ----------
    t_max : int, default: 100
        Maximum value of t to test

    plot : boolean, default: False
        If true, plots the Von Neumann Entropy and knee point

    ax : matplotlib.Axes, default: None
        If plot=True and ax is not None, plots the VNE on the given axis.
        Otherwise, creates a new axis and displays the plot

    Returns
    -------
    t_opt : int
        The optimal value of t
    """
    tasklogger.log_start("optimal t")
    t, h = self._von_neumann_entropy(t_max=t_max)
    t_opt = vne.find_knee_point(y=h, x=t)
    tasklogger.log_info("Automatically selected t = {}".format(t_opt))
    tasklogger.log_complete("optimal t")

    if plot:
        if ax is None:
            fig, ax = plt.subplots()
            show = True
        else:
            show = False
        ax.plot(t, h)
        ax.scatter(t_opt, h[t == t_opt], marker='*', c='k', s=50)
        ax.set_xlabel("t")
        ax.set_ylabel("Von Neumann Entropy")
        ax.set_title("Optimal t = {}".format(t_opt))
        if show:
            plt.show()

    self.optimal_t = t_opt
    return t_opt
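# A hedged usage sketch (internal API; `X` is illustrative): with t='auto',
# fit_transform calls this method internally, but it can also be invoked
# directly to inspect the entropy curve.
#
# >>> op = phate.PHATE(t='auto').fit(X)
# >>> t_opt = op._find_optimal_t(t_max=100, plot=True)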
def fit(self, X, Y, q=None):
    if hasattr(self, "phi_X"):
        tasklogger.log_info("Using precomputed diffusion coordinates.")
    else:
        tasklogger.log_start("diffusion coordinates")
        if q is None:
            with parallel.ParallelQueue(n_jobs=min(2, self.n_jobs)) as q:
                return self.fit(X, Y, q)
        else:
            q.queue(
                math.diffusionCoordinates,
                X,
                decay=self.decay_X,
                knn=self.knn_X,
                n_pca=self.n_pca_X
                if self.n_pca_X is not None and self.n_pca_X < min(X.shape)
                else None,
                n_eigenvectors=self.n_eigenvectors,
                n_jobs=max(self.n_jobs // 2, 1),
                verbose=self.verbose,
                random_state=self.random_state,
            )
            q.queue(
                math.diffusionCoordinates,
                Y,
                decay=self.decay_Y,
                knn=self.knn_Y,
                n_pca=self.n_pca_Y
                if self.n_pca_Y is not None and self.n_pca_Y < min(Y.shape)
                else None,
                n_eigenvectors=self.n_eigenvectors,
                n_jobs=max(self.n_jobs // 2, 1),
                verbose=self.verbose,
                random_state=self.random_state,
            )
            (phi_X, lambda_X), (phi_Y, lambda_Y) = q.run()
            self.phi_X = phi_X
            self.lambda_X = lambda_X
            self.phi_Y = phi_Y
            self.lambda_Y = lambda_Y
            tasklogger.log_complete("diffusion coordinates")
    return self
def compute_condensation_param(X, granularity):
    """Compute condensation parameters from the data.

    Parameters
    ----------
    X : array-like
        Input data.
    granularity : float
        Scaling factor for the condensation step size.

    Returns
    -------
    epsilon : float
        Diffusion time step for condensation.
    merge_threshold : float
        Distance below which points are merged.
    """
    epsilon = granularity * (0.1 * np.mean(np.std(X))) / (X.shape[0] ** (-1 / 5))
    D = scipy.spatial.distance.pdist(X, metric="euclidean")
    merge_threshold = np.percentile(D, 0.001) + 0.001
    tasklogger.log_info("Setting epsilon to " + str(round(epsilon, 4)))
    tasklogger.log_info("Setting merge threshold to " +
                        str(round(merge_threshold, 4)))
    return epsilon, merge_threshold
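# A small usage sketch (assumed): epsilon scales linearly with granularity
# and with the overall standard deviation of the data, while the merge
# threshold tracks the very smallest pairwise distances (0.001th percentile).
#
# >>> import numpy as np
# >>> X = np.random.normal(size=(100, 10))
# >>> epsilon, merge_threshold = compute_condensation_param(X, granularity=0.1)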
def _update_graph(self, X, precomputed, n_pca, n_landmark):
    if self.X is not None and not utils.matrix_is_equivalent(X, self.X):
        # If the same data is used, we can reuse existing kernel and
        # diffusion matrices. Otherwise we have to recompute.
        self._reset_graph()
    else:
        try:
            self.graph.set_params(
                decay=self.decay,
                knn=self.knn,
                distance=self.knn_dist,
                precomputed=precomputed,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                n_pca=n_pca,
                n_landmark=n_landmark,
                random_state=self.random_state)
            tasklogger.log_info(
                "Using precomputed graph and diffusion operator...")
        except ValueError as e:
            # something changed that should have invalidated the graph
            tasklogger.log_debug("Reset graph due to {}".format(str(e)))
            self._reset_graph()
def impute(self, data):
    """Main function of I-Impute

    Parameters
    ----------
    data : matrix, shape (m x n)
        The raw reads count matrix

    Returns
    -------
    imputed_data: matrix, shape (m x n)
        The imputed matrix, pandas DataFrame object
    """
    tasklogger.log_start('I-Impute')
    imputed_data = None
    if self.iteration:
        exp_mse = 1
        mse = 100
        previous_imputed_data = data
        iteration = 1
        while mse > exp_mse:
            tasklogger.log_info(
                'iteratively impute for the {0}th time'.format(iteration))
            current_imputed_data = self._cimpute(previous_imputed_data)
            dist_matrix = (current_imputed_data - previous_imputed_data)**2
            n_values = data.shape[0] * data.shape[1]
            mse = np.sqrt(dist_matrix.values.sum() / n_values)
            previous_imputed_data = current_imputed_data
            iteration += 1
        imputed_data = previous_imputed_data
    else:
        imputed_data = self._cimpute(data)
    tasklogger.log_complete('I-Impute')
    return imputed_data
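# A hedged usage sketch (the genes-by-cells orientation follows _cimpute's
# column-wise library size normalization; the constructor arguments are
# assumptions):
#
# >>> import pandas as pd
# >>> counts = pd.read_csv("counts.csv", index_col=0)  # genes x cells
# >>> op = IImpute(iteration=True)
# >>> imputed = op.impute(counts)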
def measure_all_methods(load_fn, dropout=None, bcv=None, n_cells=None,
                        n_genes=None, n_jobs=6, seed=None):
    dataset = load_fn(seed=seed, dropout=dropout, bcv=bcv, n_genes=n_genes)
    data_truth = dataset.X_true
    tasklogger.log_info("Calculating geodesic distances...")
    geodesic_dist = quantify.geodesic_distance(data_truth)
    data_noised = dataset.X
    if n_cells is not None and n_cells < Splatter.N_CELLS:
        subsample_idx = np.random.choice(data_noised.shape[0], n_cells,
                                         replace=False)
    else:
        subsample_idx = None
    # embed
    tasklogger.log_info("Embedding...")
    methods = [m for m in embed.__all__
               if m.__name__ not in ('MDS', 'PHATE')]
    embeddings = Parallel(n_jobs=n_jobs)(
        delayed(method)(data_noised, seed=seed) for method in methods)
    methods.append(embed.PHATE)
    embeddings.append(embed.PHATE(data_noised, seed=seed, n_jobs=10))
    methods.append(embed.MDS)
    embeddings.append(embed.MDS(data_noised, seed=seed, n_jobs=10))
    # plot
    tasklogger.log_info("Plotting...")
    fig, axes = plt.subplots(1, len(embeddings),
                             figsize=(4 * len(embeddings), 4))
    for embedding, ax, method in zip(embeddings, axes, methods):
        scprep.plot.scatter2d(embedding, ax=ax,
                              label_prefix=method.__name__,
                              ticks=False, c=dataset.c, legend=False)
    plt.tight_layout()
    fig.savefig(IMG_PATH.format(dataset.name, seed))
    # evaluate
    tasklogger.log_info("Evaluating...")
    results = [measure_method(embedding=embedding,
                              data=data_truth,
                              data_noised=data_noised,
                              name=method.__name__,
                              subsample_idx=subsample_idx,
                              geodesic_dist=geodesic_dist,
                              labels=dataset.c,
                              seed=seed)
               for embedding, method in zip(embeddings, methods)]
    df = pd.concat(results)
    df = df.sort_values('DEMaP', ascending=False)
    print(df)
    return df
def fit(self, X):
    """Computes the diffusion operator

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_dimensions`
        dimensions. Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
        `knn_dist` is 'precomputed', `data` should be a n_samples x
        n_samples distance or affinity matrix

    Returns
    -------
    phate_operator : PHATE
        The estimator object
    """
    try:
        if isinstance(X, anndata.AnnData):
            X = X.X
    except NameError:
        # anndata not installed
        pass

    if self.knn_dist.startswith('precomputed'):
        if self.knn_dist == 'precomputed':
            # automatic detection
            if isinstance(X, sparse.coo_matrix):
                X = X.tocsr()
            if X[0, 0] == 0:
                precomputed = "distance"
            else:
                precomputed = "affinity"
        elif self.knn_dist in ['precomputed_affinity',
                               'precomputed_distance']:
            precomputed = self.knn_dist.split("_")[1]
        else:
            raise ValueError(
                "knn_dist {} not recognized. Did you mean "
                "'precomputed_distance', "
                "'precomputed_affinity', or 'precomputed' "
                "(automatically detects distance or affinity)?".format(
                    self.knn_dist))
        tasklogger.log_info(
            "Using precomputed {} matrix...".format(precomputed))
        n_pca = None
    else:
        precomputed = None
        if X.shape[1] <= self.n_pca:
            n_pca = None
        else:
            n_pca = self.n_pca

    if self.n_landmark is None or X.shape[0] <= self.n_landmark:
        n_landmark = None
    else:
        n_landmark = self.n_landmark

    if self.graph is not None:
        if self.X is not None and not matrix_is_equivalent(X, self.X):
            # If the same data is used, we can reuse existing kernel and
            # diffusion matrices. Otherwise we have to recompute.
            self._reset_graph()
        else:
            try:
                self.graph.set_params(decay=self.a,
                                      knn=self.k + 1,
                                      distance=self.knn_dist,
                                      precomputed=precomputed,
                                      n_jobs=self.n_jobs,
                                      verbose=self.verbose,
                                      n_pca=n_pca,
                                      thresh=1e-4,
                                      n_landmark=n_landmark,
                                      random_state=self.random_state)
                tasklogger.log_info(
                    "Using precomputed graph and diffusion operator...")
            except ValueError as e:
                # something changed that should have invalidated the graph
                tasklogger.log_debug("Reset graph due to {}".format(str(e)))
                self._reset_graph()

    self.X = X

    if self.graph is None:
        tasklogger.log_start("graph and diffusion operator")
        self.graph = graphtools.Graph(X,
                                      n_pca=n_pca,
                                      n_landmark=n_landmark,
                                      distance=self.knn_dist,
                                      precomputed=precomputed,
                                      knn=self.k + 1,
                                      decay=self.a,
                                      thresh=1e-4,
                                      n_jobs=self.n_jobs,
                                      verbose=self.verbose,
                                      random_state=self.random_state)
        tasklogger.log_complete("graph and diffusion operator")

    # landmark op doesn't build unless forced
    self.diff_op
    return self
def run_magic_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # magic params
        t_magic='auto',
        genes=None,
        # output params
        output='magic.csv',
        validate=False):
    """Run MAGIC on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing gene names, list gives an array of gene names,
          `False` means no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing gene names, list gives an array of gene names,
          or `False` means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing cell names, list gives an array of cell names,
          `False` means no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing cell names, list gives an array of cell names,
          or `False` means no cell names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data
        is of shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length',
        'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`, library size
        filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`, genes are
        not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used as the
        distance metric for building the kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: None
        The generator used to initialize random PCA.
        If an integer is given, it fixes the seed.
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    t_magic : int, optional, default: 'auto'
        power to which the diffusion operator is powered for MAGIC.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes to return from MAGIC, either as integer indices or
        column names if input data is a pandas DataFrame. If "all_genes",
        the entire smoothed matrix is returned. If "pca_only", PCA on the
        smoothed data is returned. If None, the entire matrix is also
        returned, but a warning may be raised if the resultant matrix is
        very large.
    output : str, optional (default: 'magic.csv')
        Output CSV file to save smoothed data matrix
    validate : bool, optional (default: False)
        If True, compare the output to a precomputed reference matrix
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(
        transform=transform, pseudocount=pseudocount, cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data, cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info("Filtering genes by min cells >= {}...".format(
            min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run MAGIC
    # https://magic.readthedocs.io/
    magic_op = magic.MAGIC(knn=knn,
                           decay=decay,
                           t=t_magic,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose)
    magic_data = magic_op.fit_transform(data, genes=genes)

    # save as csv
    magic_data = pd.DataFrame(magic_data)
    if cell_axis in ['col', 'column']:
        magic_data = magic_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    magic_data.to_csv(output)
    tasklogger.log_info("Complete.")

    if validate:
        correct_magic_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/magic-docker/'
            'master/magic-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(magic_data),
                                    scprep.utils.toarray(correct_magic_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            np.testing.assert_allclose(
                scprep.utils.toarray(magic_data),
                scprep.utils.toarray(correct_magic_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent "
                "to expected")
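# An illustrative invocation (a sketch; the file path is hypothetical and the
# keyword values shown are the function's own defaults):
#
# >>> run_magic_from_file("data.csv", min_library_size=2000,
# ...                     transform='sqrt', knn=5, decay=15,
# ...                     output='magic.csv')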
def test_log():
    tasklogger.log_debug("debug")
    tasklogger.log_info("info")
    tasklogger.log_warning("warning")
    tasklogger.log_error("error")
    tasklogger.log_critical("critical")
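# For reference, which of these messages actually appear is controlled by the
# logger's verbosity (a sketch; level semantics assumed from tasklogger's
# convention of 0 = warnings only, 1 = info, 2 = debug):
#
# >>> tasklogger.set_level(2)  # show debug and above
# >>> tasklogger.set_level(1)  # show info and above
# >>> tasklogger.set_level(0)  # show warnings and errors only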
def impute(self, data, t_max=20, plot=False, ax=None,
           max_genes_compute_t=500, threshold=0.001):
    """Perform MAGIC imputation

    Parameters
    ----------
    data : graphtools.Graph, graphtools.Data or array-like
        Input data
    t_max : int, optional (default: 20)
        Maximum value of t to consider for optimal t selection
    plot : bool, optional (default: False)
        Plot the optimal t selection graph
    ax : matplotlib.Axes, optional (default: None)
        Axis on which to plot. If None, a new axis is created
    max_genes_compute_t : int, optional (default: 500)
        Above this number, genes will be subsampled for
        optimal t selection
    threshold : float, optional (default: 0.001)
        Threshold after which Procrustes disparity is considered
        to have converged for optimal t selection

    Returns
    -------
    X_magic : array-like, shape=[n_samples, n_pca]
        Imputed data
    """
    if not isinstance(data, graphtools.base.Data):
        data = graphtools.base.Data(data, n_pca=self.n_pca)

    data_imputed = data.data_nu

    if data_imputed.shape[1] > max_genes_compute_t:
        subsample_genes = np.random.choice(data_imputed.shape[1],
                                           max_genes_compute_t,
                                           replace=False)
    else:
        subsample_genes = None
    if hasattr(data, "data_pca"):
        weights = None  # data.data_pca.explained_variance_ratio_
    else:
        weights = None
    if self.t == 'auto':
        _, data_prev = self.calculate_error(
            data_imputed,
            data_prev=None,
            weights=weights,
            subsample_genes=subsample_genes)
        error_vec = []
        t_opt = None
    else:
        t_opt = self.t

    tasklogger.log_start("imputation")

    # classic magic
    # the diffusion matrix is powered when t has been specified by
    # the user, and the dimensions of the diffusion matrix are lesser
    # than those of the data matrix. (M^t) * D
    if (t_opt is not None) and \
            (self.diff_op.shape[1] < data_imputed.shape[1]):
        diff_op_t = np.linalg.matrix_power(self.diff_op, t_opt)
        data_imputed = diff_op_t.dot(data_imputed)
    # fast magic
    # a while loop is used when the dimensions of the diffusion matrix
    # are greater than those of the data matrix, or when t is not specified
    # (so as to allow for the calculation of the optimal t value)
    else:
        i = 0
        while (t_opt is None and i < t_max) or \
                (t_opt is not None and i < t_opt):
            i += 1
            data_imputed = self.diff_op.dot(data_imputed)
            if self.t == 'auto':
                error, data_prev = self.calculate_error(
                    data_imputed,
                    data_prev,
                    weights=weights,
                    subsample_genes=subsample_genes)
                error_vec.append(error)
                tasklogger.log_debug("{}: {}".format(i, error_vec))
                if error < threshold and t_opt is None:
                    t_opt = i + 1
                    tasklogger.log_info(
                        "Automatically selected t = {}".format(t_opt))

    tasklogger.log_complete("imputation")

    if plot:
        # continue to t_max
        tasklogger.log_start("optimal t plot")
        if t_opt is None:
            # never converged
            warnings.warn("optimal t > t_max ({})".format(t_max),
                          RuntimeWarning)
        else:
            data_overimputed = data_imputed
            while i < t_max:
                i += 1
                data_overimputed = self.diff_op.dot(data_overimputed)
                error, data_prev = self.calculate_error(
                    data_overimputed,
                    data_prev,
                    weights=weights,
                    subsample_genes=subsample_genes)
                error_vec.append(error)

        # create axis
        if ax is None:
            fig, ax = plt.subplots()
            show = True
        else:
            show = False

        # plot
        x = np.arange(len(error_vec)) + 1
        ax.plot(x, error_vec)
        if t_opt is not None:
            ax.plot(t_opt, error_vec[t_opt - 1], 'ro', markersize=10)
        ax.plot(x, np.full(len(error_vec), threshold), 'k--')
        ax.set_xlabel('t')
        ax.set_ylabel('disparity(data_{t}, data_{t-1})')
        ax.set_xlim([1, len(error_vec)])
        plt.tight_layout()
        tasklogger.log_complete("optimal t plot")
        if show:
            plt.show(block=False)

    return data_imputed
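# The "classic magic" branch computes (M^t) @ D in one step, while the "fast
# magic" branch applies M repeatedly; a sketch of the equivalence on toy
# matrices (assumed shapes, not from the source):
#
# >>> import numpy as np
# >>> M = np.full((3, 3), 1 / 3)  # a row-stochastic diffusion operator
# >>> D = np.random.normal(size=(3, 5))
# >>> out = D.copy()
# >>> for _ in range(4):
# ...     out = M @ out
# >>> np.allclose(out, np.linalg.matrix_power(M, 4) @ D)
# True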
def fit(self, X):
    """Computes the diffusion operator

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_features`
        dimensions. Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.

    Returns
    -------
    magic_operator : MAGIC
        The estimator object
    """
    if self.knn_dist == 'precomputed':
        if isinstance(X, sparse.coo_matrix):
            X = X.tocsr()
        if X[0, 0] == 0:
            precomputed = "distance"
        else:
            precomputed = "affinity"
        tasklogger.log_info(
            "Using precomputed {} matrix...".format(precomputed))
        n_pca = None
    else:
        precomputed = None
        if self.n_pca is None or X.shape[1] <= self.n_pca:
            n_pca = None
        else:
            n_pca = self.n_pca

    if self.graph is not None:
        if self.X is not None and not \
                utils.matrix_is_equivalent(X, self.X):
            # If the same data is used, we can reuse existing kernel and
            # diffusion matrices. Otherwise we have to recompute.
            self.graph = None
        else:
            try:
                self.graph.set_params(decay=self.a,
                                      knn=self.k + 1,
                                      distance=self.knn_dist,
                                      precomputed=precomputed,
                                      n_jobs=self.n_jobs,
                                      verbose=self.verbose,
                                      n_pca=n_pca,
                                      thresh=1e-4,
                                      random_state=self.random_state)
                tasklogger.log_info(
                    "Using precomputed graph and diffusion operator...")
            except ValueError as e:
                # something changed that should have invalidated the graph
                tasklogger.log_debug("Reset graph due to {}".format(str(e)))
                self.graph = None

    self.X = X

    if utils.has_empty_columns(X):
        warnings.warn("Input matrix contains unexpressed genes. "
                      "Please remove them prior to running MAGIC.")

    if self.graph is None:
        # reset X_magic in case it was previously set
        self.X_magic = None
        tasklogger.log_start("graph and diffusion operator")
        self.graph = graphtools.Graph(X,
                                      n_pca=n_pca,
                                      knn=self.k + 1,
                                      decay=self.a,
                                      thresh=1e-4,
                                      n_jobs=self.n_jobs,
                                      verbose=self.verbose,
                                      random_state=self.random_state)
        tasklogger.log_complete("graph and diffusion operator")

    return self
def fit(self, X, graph=None):
    """Computes the diffusion operator

    Parameters
    ----------
    X : array, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_features`
        dimensions. Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.
    graph : `graphtools.Graph`, optional (default: None)
        If given, provides a precomputed kernel matrix with which to
        perform diffusion.

    Returns
    -------
    magic_operator : MAGIC
        The estimator object
    """
    if self.n_pca is None or X.shape[1] <= self.n_pca:
        n_pca = None
    else:
        n_pca = self.n_pca

    tasklogger.log_info("Running MAGIC on {} cells and {} genes.".format(
        X.shape[0], X.shape[1]))

    if graph is None:
        graph = self.graph
        if self.X is not None and not \
                utils.matrix_is_equivalent(X, self.X):
            # If the same data is used, we can reuse existing kernel and
            # diffusion matrices. Otherwise we have to recompute.
            tasklogger.log_debug(
                "Reset graph due to difference in input data")
            graph = None
        elif graph is not None:
            try:
                graph.set_params(decay=self.decay,
                                 knn=self.knn,
                                 distance=self.knn_dist,
                                 n_jobs=self.n_jobs,
                                 verbose=self.verbose,
                                 n_pca=n_pca,
                                 thresh=1e-4,
                                 random_state=self.random_state)
            except ValueError as e:
                # something changed that should have invalidated the graph
                tasklogger.log_debug("Reset graph due to {}".format(str(e)))
                graph = None
    else:
        self.knn = graph.knn
        self.alpha = graph.decay
        self.n_pca = graph.n_pca
        self.knn_dist = graph.distance

    self.X = X

    if utils.has_empty_columns(X):
        warnings.warn("Input matrix contains unexpressed genes. "
                      "Please remove them prior to running MAGIC.")

    if graph is not None:
        tasklogger.log_info(
            "Using precomputed graph and diffusion operator...")
        self.graph = graph
    else:
        # reset X_magic in case it was previously set
        self.X_magic = None
        tasklogger.log_start("graph and diffusion operator")
        self.graph = graphtools.Graph(X,
                                      n_pca=n_pca,
                                      knn=self.knn,
                                      decay=self.decay,
                                      thresh=1e-4,
                                      n_jobs=self.n_jobs,
                                      verbose=self.verbose,
                                      random_state=self.random_state)
        tasklogger.log_complete("graph and diffusion operator")

    return self
def online_update_tree(
    data_1,
    data_2,
    pca_centroid,
    pca_op,
    partitions,
    diff_operator,
    diff_pca_op,
    Xs,
    NxTs,
    Ks,
    Merges,
    Ps,
    scale,
    n_jobs=10,
    random_state=None,
):
    """Update a Multiscale PHATE condensation tree with new data.

    Parameters
    ----------
    data_1 : array-like
        Previously seen data used to build the existing tree.
    data_2 : array-like
        New data to map into the tree.
    pca_centroid : array-like
        PCA coordinates of the existing partition centroids.
    pca_op : PCA operator
        Fitted PCA operator used to compress the data.
    partitions : array-like
        Partition assignments of the previously seen data.
    diff_operator : array-like
        Diffusion operator built on the existing partitions.
    diff_pca_op : PCA operator
        Fitted PCA operator on the diffusion potential.
    Xs : list of array-like
        Embeddings at each level of the condensation tree.
    NxTs : list of array-like
        Cluster assignments at each level of the condensation tree.
    Ks : list
        Kernels at each level of the condensation tree.
    Merges : list
        Merges performed at each level of the condensation tree.
    Ps : list
        Diffusion operators at each level of the condensation tree.
    scale : float
        Condensation scale parameter.
    n_jobs : int, optional, default: 10
        Number of jobs to use for the computation.
    random_state : integer or numpy.RandomState, optional, default: None
        The random number generator. If an integer is given, it fixes the
        seed. Defaults to the global `numpy` random number generator

    Returns
    -------
    Updated tree: cluster assignments, embeddings, kernels, merges,
    diffusion operators, and PCA coordinates.
    """
    with tasklogger.log_task("Multiscale PHATE tree mapping"):
        if data_1.shape[0] != len(np.unique(partitions)):
            tasklogger.log_info("PCA compressing new data...")
            data_pca_1 = pca_op.transform(np.array(data_1))
            data_pca_2 = pca_op.transform(np.array(data_2))

            # mapping new data to partitions
            partition_assignments = compress.map_update_data(pca_centroid,
                                                             data_pca_1,
                                                             data_pca_2,
                                                             partitions,
                                                             nn=5,
                                                             n_jobs=n_jobs)
            tasklogger.log_info("Points not mapped to partitions: " +
                                str(sum(partition_assignments == -1)))

            # creating new joint partitions mapping
            new_partition_clusters = list(partitions)
            new_partition_clusters.extend(partition_assignments)
            new_partition_clusters = np.asarray(new_partition_clusters)

            update_idx = np.where(new_partition_clusters == -1)[0]
            max_partition = max(new_partition_clusters)
            for i in range(len(update_idx)):
                new_partition_clusters[update_idx[i]] = max_partition + 1
                max_partition += 1

            if sum(partition_assignments == -1) > 0:
                diff_pot_1 = diffuse.online_update_diffusion_potential(
                    data_pca_2[partition_assignments == -1, :],
                    diff_operator,
                    diff_pca_op,
                )
                epsilon, merge_threshold = condense.compute_condensation_param(
                    diff_pot_1, granularity=0.1)  # change to granularity

                pca_total = np.concatenate(
                    [pca_centroid, data_pca_2[partition_assignments == -1, :]])

                NxTs_n, Xs_n, Ks_n, Merges_n, Ps_n = condense.condense(
                    diff_pot_1,
                    new_partition_clusters,
                    scale,
                    epsilon,
                    merge_threshold,
                    n_jobs=n_jobs,
                    random_state=random_state,
                )
                return NxTs_n, Xs_n, Ks_n, Merges_n, Ps_n, pca_total
            else:
                clusters = new_partition_clusters
                tasklogger.log_info("Rebuilding condensation tree...")
                clusters_idx = []
                for c in clusters:
                    clusters_idx.append(np.where(NxTs[0] == c)[0][0])
                NxTs_l = []
                for l in range(len(NxTs)):
                    NxTs_l.append(NxTs[l][clusters_idx])
                return NxTs_l, Xs, Ks, Merges, Ps, pca_centroid
        else:
            tasklogger.log_info("PCA compressing new data...")
            data_pca_2 = pca_op.transform(np.array(data_2))

            diff_pot_1 = diffuse.online_update_diffusion_potential(
                data_pca_2, diff_operator, diff_pca_op)
            clusters = np.arange(diff_pot_1.shape[0])

            epsilon, merge_threshold = condense.compute_condensation_param(
                diff_pot_1, granularity=0.1)  # change to granularity

            NxTs_n, Xs_n, Ks_n, Merges_n, Ps_n = condense.condense(
                diff_pot_1,
                clusters,
                scale,
                epsilon,
                merge_threshold,
                n_jobs=n_jobs,
                random_state=random_state,
            )
            return (
                NxTs_n,
                Xs_n,
                Ks_n,
                Merges_n,
                Ps_n,
                np.concatenate([pca_centroid, data_pca_2]),
            )
def transform(self, X=None, t_max=100, plot_optimal_t=False, ax=None):
    """Computes the position of the cells in the embedding space

    Parameters
    ----------
    X : array, optional, shape=[n_samples, n_features]
        input data with `n_samples` samples and `n_dimensions`
        dimensions. Not required, since PHATE does not currently embed
        cells not given in the input matrix to `PHATE.fit()`.
        Accepted data types: `numpy.ndarray`,
        `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
        `knn_dist` is 'precomputed', `data` should be a n_samples x
        n_samples distance or affinity matrix
    t_max : int, optional, default: 100
        maximum t to test if `t` is set to 'auto'
    plot_optimal_t : boolean, optional, default: False
        If true and `t` is set to 'auto', plot the Von Neumann
        entropy used to select t
    ax : matplotlib.axes.Axes, optional
        If given and `plot_optimal_t` is true, plot will be drawn
        on the given axis.

    Returns
    -------
    embedding : array, shape=[n_samples, n_dimensions]
        The cells embedded in a lower dimensional space using PHATE
    """
    if self.graph is None:
        raise NotFittedError("This PHATE instance is not fitted yet. Call "
                             "'fit' with appropriate arguments before "
                             "using this method.")
    elif X is not None and not matrix_is_equivalent(X, self.X):
        # fit to external data
        warnings.warn(
            "Pre-fit PHATE cannot be used to transform a "
            "new data matrix. Please fit PHATE to the new"
            " data by running 'fit' with the new data.", RuntimeWarning)
        if isinstance(self.graph, graphtools.graphs.TraditionalGraph) and \
                self.graph.precomputed is not None:
            raise ValueError("Cannot transform additional data using a "
                             "precomputed distance matrix.")
        else:
            transitions = self.graph.extend_to_data(X)
            return self.graph.interpolate(self.embedding, transitions)
    else:
        if self.diff_potential is None:
            if self.t == 'auto':
                t = self._find_optimal_t(t_max=t_max,
                                         plot=plot_optimal_t,
                                         ax=ax)
                tasklogger.log_info(
                    "Automatically selected t = {}".format(t))
            else:
                t = self.t
            self.diff_potential = self.calculate_potential(self.diff_op, t)
        elif plot_optimal_t:
            self._find_optimal_t(t_max=t_max, plot=plot_optimal_t, ax=ax)

        if self.embedding is None:
            tasklogger.log_start("{} MDS".format(self.mds))
            self.embedding = embed_MDS(self.diff_potential,
                                       ndim=self.n_components,
                                       how=self.mds,
                                       distance_metric=self.mds_dist,
                                       n_jobs=self.n_jobs,
                                       seed=self.random_state,
                                       verbose=self.verbose - 1)
            tasklogger.log_complete("{} MDS".format(self.mds))

        if isinstance(self.graph, graphtools.graphs.LandmarkGraph):
            tasklogger.log_debug("Extending to original data...")
            return self.graph.interpolate(self.embedding)
        else:
            return self.embedding
def parse_args():
    parser = argparse.ArgumentParser(
        description='Run MAGIC for imputation of high-dimensional data.',
        epilog='For help, visit magic.readthedocs.io or '
        'krishnaswamylab.org/get-help',
        add_help=True,
        allow_abbrev=True)

    io_group = parser.add_argument_group('Data IO')
    filename = io_group.add_mutually_exclusive_group(required=True)
    filename.add_argument('--filename', type=str, default=None,
                          help='Input data. Allowed types: csv, tsv, mtx, '
                          'hdf5/h5 (10X format), directory/zip (10X format)')
    filename.add_argument('--validate', action='store_true', default=False,
                          help='Run MAGIC on a test dataset to ensure '
                          'output is correct.')
    sparse = io_group.add_mutually_exclusive_group()
    sparse.add_argument('--sparse', action='store_true',
                        help='Use sparse data format',
                        dest='sparse', default=None)
    sparse.add_argument('--dense', action='store_false',
                        help='Use dense data format',
                        dest='sparse', default=None)
    gene_names = io_group.add_mutually_exclusive_group()
    gene_names.add_argument('--gene-names', action='store_true',
                            help='Use gene name headers in data file'
                            ' (csv, tsv, fcs)',
                            dest='gene_names', default=True)
    gene_names.add_argument('--no-gene-names', action='store_false',
                            help='Do not use gene names'
                            ' (csv, tsv, fcs, mtx)',
                            dest='gene_names', default=True)
    gene_names.add_argument('--gene-name-file', type=str,
                            help='Use gene name headers in FILE'
                            ' (csv, tsv, fcs, mtx)',
                            metavar='FILE', dest='gene_names', default=True)
    cell_names = io_group.add_mutually_exclusive_group()
    cell_names.add_argument('--cell-names', action='store_true',
                            help='Use cell name headers in data file'
                            ' (csv, tsv, fcs)',
                            dest='cell_names', default=True)
    cell_names.add_argument('--no-cell-names', action='store_false',
                            help='Do not use cell names'
                            ' (csv, tsv, fcs, mtx)',
                            dest='cell_names', default=True)
    cell_names.add_argument('--cell-name-file', type=str,
                            help='Use cell name headers in FILE'
                            ' (csv, tsv, fcs, mtx)',
                            metavar='FILE', dest='cell_names', default=True)
    io_group.add_argument('--cell-axis', type=str,
                          choices=['row', 'column'], default='row',
                          help='States whether cells are on rows or columns '
                          '(csv, tsv, mtx)')
    io_group.add_argument('--gene-labels', type=str, default='both',
                          choices=['symbol', 'id', 'both'],
                          help='Choice of gene labels for 10X data'
                          ' (dir, zip, hdf5)')
    io_group.add_argument('--genome', type=str, default=None,
                          help='Genome name for 10X HDF5 data (hdf5)')
    io_group.add_argument(
        '--metadata-channels', type=str, nargs='+',
        default=['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin',
                 'beadDist', 'bead1'],
        help='Names of channels to remove from fcs data (fcs)',
        metavar='CHANNEL')

    preprocess_group = parser.add_argument_group('Preprocessing')
    cell_filter = preprocess_group.add_mutually_exclusive_group()
    cell_filter.add_argument('--min-library-size', type=int, default=2000,
                             help='Filter cells with less than COUNTS counts',
                             dest='min_library_size', metavar='COUNTS')
    cell_filter.add_argument('--no-cell-filter', action='store_false',
                             default=2000, dest='min_library_size',
                             help='Do not filter cells')
    gene_filter = preprocess_group.add_mutually_exclusive_group()
    gene_filter.add_argument(
        '--min-cells-per-gene', type=int, default=10,
        help='Filter genes with less than CELLS non-zero cells',
        dest='min_cells_per_gene', metavar='CELLS')
    gene_filter.add_argument('--no-gene-filter', action='store_false',
                             default=10, dest='min_cells_per_gene',
                             help='Do not filter genes')
    libnorm = preprocess_group.add_mutually_exclusive_group()
    libnorm.add_argument('--normalize', action='store_true', default=True,
                         dest='library_size_normalize',
                         help='Normalize cells by total UMI count '
                         '(library size)')
    libnorm.add_argument('--no-normalize', action='store_false', default=True,
                         dest='library_size_normalize',
                         help='Do not normalize cells')
    transform = preprocess_group.add_mutually_exclusive_group()
    transform.add_argument('--transform', type=str, default='sqrt',
                           choices=['sqrt', 'log', 'arcsinh'],
                           help='Sublinear data transformation function')
    transform.add_argument('--no-transform', action='store_false',
                           default='sqrt', dest='transform',
                           help='Do not transform data')
    preprocess_group.add_argument('--pseudocount', type=float, default=1,
                                  help='Pseudocount to add to genes prior '
                                  'to log transform',
                                  metavar='PCOUNT')
    preprocess_group.add_argument('--cofactor', type=float, default=5,
                                  help='Factor by which to divide genes '
                                  'prior to arcsinh transform')

    kernel_group = parser.add_argument_group('Kernel Computation')
    kernel_group.add_argument('-k', '--knn', type=int, default=10,
                              dest='knn',
                              help='Number of nearest neighbors on which to '
                              'build kernel')
    decay = kernel_group.add_mutually_exclusive_group()
    decay.add_argument('-a', '--decay', type=int, default=15, dest='decay',
                       help='Sets decay rate of kernel tails')
    decay.add_argument('--no-decay', action='store_false', default=15,
                       dest='decay', help='Do not use alpha decay')
    pca = kernel_group.add_mutually_exclusive_group()
    pca.add_argument('--pca', type=int, default=100, dest='n_pca',
                     help='Number of principal components to use for '
                     'neighborhoods')
    pca.add_argument('--no-pca', action='store_false', default=100,
                     dest='n_pca', help='Do not use PCA')
    kernel_group.add_argument('--knn-dist', type=str, default='euclidean',
                              help='Distance metric to use for calculating '
                              'neighborhoods. Recommended values are '
                              '"euclidean" and "cosine"',
                              metavar='DISTANCE')
    kernel_group.add_argument('-t', '--threads', type=int, default=1,
                              help='Use THREADS threads. If -1 all CPUs '
                              'are used',
                              metavar='THREADS', dest='n_jobs')
    kernel_group.add_argument('--seed', type=int, default=None,
                              help='Integer random seed',
                              metavar='SEED', dest='random_state')
    verbose = kernel_group.add_mutually_exclusive_group()
    verbose.add_argument('-v', '--verbose', action='store_true',
                         default=True, help='Print verbose output')
    verbose.add_argument('-q', '--quiet', action='store_false', default=True,
                         help='Do not print verbose output', dest='verbose')
    verbose.add_argument('-vv', '--debug', action='store_true',
                         default=False, help='Print debugging output',
                         dest='debug')

    magic_group = parser.add_argument_group('MAGIC')
    magic_group.add_argument('--t-magic', type=str, default='auto',
                             help='Level of diffusion for MAGIC',
                             metavar='T')
    genes = magic_group.add_mutually_exclusive_group()
    genes.add_argument('--pca-only', action='store_true', default=False,
                       help='Return PCA on the smoothed matrix')
    genes.add_argument('--all-genes', action='store_true', default=False,
                       help='Return the entire smoothed matrix')
    genes.add_argument('--gene-list', type=str, nargs='+', default=None,
                       help='List of genes to return from MAGIC, '
                       'either as integer indices or column names.',
                       metavar='GENE', dest='genes')
    magic_group.add_argument('--output', type=str, default='magic.csv',
                             help='Output CSV file to save smoothed '
                             'data matrix',
                             metavar='FILE')

    args = parser.parse_args()

    if args.validate:
        tasklogger.set_level(2)
        tasklogger.log_info("Running MAGIC validation.")
        args.filename = ("https://github.com/KrishnaswamyLab/scprep/raw/"
                         "master/data/test_data/test_small.csv")
        args.sparse = False
        args.gene_names = True
        args.cell_names = True
        args.cell_axis = "row"
        args.gene_labels = "both"
        args.genome = None
        args.metadata_channels = None
        args.min_library_size = 1
        args.min_cells_per_gene = 1
        args.library_size_normalize = True
        args.transform = 'sqrt'
        args.pseudocount = None
        args.cofactor = None
        args.knn = 3
        args.decay = 20
        args.n_pca = None
        args.knn_dist = "euclidean"
        args.n_jobs = 1
        args.random_state = 42
        args.verbose = True
        args.debug = True
        args.t_magic = "auto"
        args.all_genes = True
        args.output = "magic-validate.csv"

    # fix magic "genes" argument
    if args.all_genes:
        args.genes = "all_genes"
    elif args.pca_only:
        args.genes = "pca_only"
    else:
        try:
            args.genes = [int(g) for g in args.genes]
        except (TypeError, ValueError):
            # args.genes is None or contains string gene names
            pass
    del args.all_genes
    del args.pca_only

    # fix t argument
    if args.t_magic != 'auto':
        try:
            args.t_magic = int(args.t_magic)
        except ValueError:
            parser.error("argument --t-magic: invalid int value: '{}'".format(
                args.t_magic))

    # fix debug argument
    if args.debug:
        args.verbose = 2
    del args.debug

    # store None values where appropriate
    if args.decay is False:
        args.decay = None
    if args.n_pca is False:
        args.n_pca = None
    if args.min_library_size is False:
        args.min_library_size = None
    if args.min_cells_per_gene is False:
        args.min_cells_per_gene = None

    # check for inappropriately set defaults
    try:
        filetype = check_filetype(args.filename)
    except RuntimeError as e:
        parser.error(str(e))
    if filetype not in ['csv', 'tsv', 'csv.gz', 'tsv.gz', 'fcs']:
        if '--gene-names' not in sys.argv:
            args.gene_names = None
        else:
            parser.error(
                "Cannot handle --gene-names with {} file".format(filetype))
        if '--cell-names' not in sys.argv:
            args.cell_names = None
        else:
            parser.error(
                "Cannot handle --cell-names with {} file".format(filetype))
    if filetype not in ['csv', 'tsv', 'csv.gz', 'tsv.gz', 'mtx']:
        if '--cell-axis' not in sys.argv:
            args.cell_axis = None
        else:
            parser.error(
                "Cannot handle --cell-axis with {} file".format(filetype))
    if filetype not in ['dir', 'zip', 'hdf5', 'h5']:
        if '--gene-labels' not in sys.argv:
            args.gene_labels = None
        else:
            parser.error(
                "Cannot handle --gene-labels with {} file".format(filetype))
    if filetype not in ['hdf5', 'h5']:
        if '--genome' not in sys.argv:
            args.genome = None
        else:
            parser.error(
                "Cannot handle --genome with {} file".format(filetype))
    if filetype not in ['fcs']:
        if '--metadata-channels' not in sys.argv:
            args.metadata_channels = None
        else:
            parser.error(
                "Cannot handle --metadata-channels with {} file".format(
                    filetype))

    # check for inappropriately set parameters
    if not args.transform == 'log':
        if '--pseudocount' in sys.argv:
            parser.error(
                "Cannot handle --pseudocount with --transform {}".format(
                    args.transform))
        else:
            args.pseudocount = None
    if not args.transform == 'arcsinh':
        if '--cofactor' in sys.argv:
            parser.error(
                "Cannot handle --cofactor with --transform {}".format(
                    args.transform))
        else:
            args.cofactor = None

    return args
def _cimpute(self, data):
    """Main function of C-Impute

    Parameters
    ----------
    data : matrix, shape (m x n)
        The raw reads count matrix

    Returns
    -------
    imputed_data: matrix, shape (m x n)
        The imputed matrix
    """
    # tasklogger.log_info('reading data...')
    # read data
    raw_data = data.fillna(0)
    if self.normalize:
        tasklogger.log_info('normalizing data by library size...')
        # normalize by library size
        norm_data = (raw_data * np.power(10, 6) /
                     raw_data.sum().replace(0, 1)).values
    else:
        norm_data = raw_data.values

    tasklogger.log_info('preprocessing data...')
    # remove zero sum genes and cells
    filtered_rows_indexes = np.where(np.all(norm_data == 0, axis=1))
    filtered_rows_data = np.delete(norm_data, filtered_rows_indexes[0],
                                   axis=0)
    filtered_columns_indexes = np.where(
        np.all(filtered_rows_data == 0, axis=0))
    filtered_data = np.delete(filtered_rows_data,
                              filtered_columns_indexes[0], axis=1)

    # log(x + 1.01)
    if self.normalize:
        log_data = np.log10(filtered_data + 1.01)
    else:
        log_data = filtered_data + self.ZERO_VALUE

    tasklogger.log_info('performing pca...')
    # pca
    pca = PCA()
    pca_data = pca.fit_transform(log_data.T)
    selected_pca_data = pca_data[:, :self._cal_explained_component_number(
        pca.explained_variance_ratio_)].T

    tasklogger.log_info('detecting outlier cells...')
    # remove outlier cells
    # 1. calculate distance matrix
    dist_matrix = euclidean_distances(selected_pca_data.T,
                                      selected_pca_data.T)
    dist_matrix[dist_matrix == 0.0] = np.inf
    # 2. get min distance vector for each cell
    min_dist_vector = dist_matrix.min(axis=0)
    # 3. find outlier cells
    outlier_indexes = self._detect_outliers(min_dist_vector)
    tmp_remained_data = np.delete(log_data, outlier_indexes[0], axis=1)
    remained_pca_data = np.delete(selected_pca_data, outlier_indexes[0],
                                  axis=1)
    # remove rows in which all values are zero
    all_zeros_rows_indexes = np.where(
        np.all(tmp_remained_data == self.ZERO_VALUE, axis=1))
    remained_data = np.delete(tmp_remained_data, all_zeros_rows_indexes[0],
                              axis=0)

    tasklogger.log_info('calculating the affinity matrix...')
    # calculate affinity matrix
    remained_dist_matrix = euclidean_distances(remained_pca_data.T,
                                               remained_pca_data.T)
    remained_dist_matrix[remained_dist_matrix == 0.0] = np.inf
    if self.n >= int(remained_dist_matrix.shape[0] / 2):
        self.n = int(remained_dist_matrix.shape[0] / 2)
    nth_smallest_dist_index = self.n - 1
    nth_smallest_dist_vector = np.partition(
        remained_dist_matrix, nth_smallest_dist_index,
        axis=1)[:, nth_smallest_dist_index]
    exp_matrix = np.exp(-remained_dist_matrix /
                        (2 * np.power(nth_smallest_dist_vector, 2)))
    larged_indexes = np.column_stack(
        np.where(remained_dist_matrix <= nth_smallest_dist_vector))
    remained_dist_matrix[
        remained_dist_matrix > nth_smallest_dist_vector] = 0
    for index in larged_indexes:
        remained_dist_matrix[index[0]][index[1]] = exp_matrix[index[0]][
            index[1]]

    tasklogger.log_info(
        'calculating the dropout probability matrix using EM algorithm...')
    # EM algorithm
    D = self._EM_algorithm(remained_data)

    tasklogger.log_info(
        'calculating the weight matrix using non-negative least squares '
        'lasso regression...')
    imputed_data = np.zeros(remained_data.shape)
    for index, row in enumerate(remained_dist_matrix):
        tasklogger.log_info('imputing gene expression for cell ' +
                            str(index))
        D_j = D[:, index]
        X_j = remained_data[:, index]
        Y = (1 - D_j) * X_j
        non_self_dist = np.delete(row, index)  # N-1 x 1
        non_self_data = np.delete(remained_data, index, axis=1)  # M x N-1
        non_self_D = np.delete(D, index, axis=1)
        diag = np.diag(non_self_dist)  # N-1 x N-1
        X = np.matmul(diag, non_self_data.T).T * (1 - non_self_D)  # M x N-1
        lasso = Lasso(alpha=self.alpha, positive=True, max_iter=3000)
        lasso.fit(X, Y)
        imputed_data[:, index] = lasso.predict(X)

    tasklogger.log_info('processing imputed data...')
    imputed_indexes = np.column_stack(np.where(D >= self.c_drop))
    for index in imputed_indexes:
        remained_data[index[0]][index[1]] = imputed_data[index[0]][index[1]]

    tasklogger.log_info('recovering unimputed data...')
    # recover deleted rows and columns
    # 1. insert rows
    for index in all_zeros_rows_indexes[0]:
        remained_data = np.insert(remained_data, index,
                                  tmp_remained_data[index], axis=0)
    # 2. insert columns
    for index in outlier_indexes[0]:
        remained_data = np.insert(remained_data, index, log_data[:, index],
                                  axis=1)
    # 3. recover original values
    remained_data[remained_data < self.ZERO_VALUE] = self.ZERO_VALUE
    if self.normalize:
        remained_data = np.power(10, remained_data) - 1.01
    else:
        remained_data = remained_data - self.ZERO_VALUE
    # 4. insert columns
    for index in filtered_columns_indexes[0]:
        remained_data = np.insert(remained_data, index,
                                  filtered_rows_data[:, index], axis=1)
    # 5. insert rows
    for index in filtered_rows_indexes[0]:
        remained_data = np.insert(remained_data, index, norm_data[index],
                                  axis=0)
    # 6. recover original scale
    if self.normalize:
        remained_data = (remained_data *
                         raw_data.sum().replace(0, 1).values /
                         np.power(10, 6))

    tasklogger.log_info('generating the final imputed matrix...')
    # recover index and column names
    final_imputed_data = pd.DataFrame(remained_data)
    final_imputed_data.index = raw_data.index
    final_imputed_data.columns = raw_data.columns
    # set precision
    final_imputed_data = round(final_imputed_data, 3)
    return final_imputed_data