def get_data(numgroups):
    with localconverter(ro.default_converter + pandas2ri.converter):
        if numgroups == 2:
            r.source('~/Documents/rscripts/splatter-2.R')
        elif numgroups == 6:
            r.source('~/Documents/rscripts/splatter-6.R')
        counts = r2py(r['counts'])      # cell-by-gene dataframe
        cellinfo = r2py(r['cellinfo'])  # Cell, Batch, Group
        geneinfo = r2py(r['geneinfo'])  # Gene
        sim = sc.AnnData(counts.values, obs=cellinfo, var=geneinfo)
        sim.obs_names = cellinfo.Cell
        sim.var_names = geneinfo.Gene
        if numgroups == 2:
            # omitted in the 6-group case so we can generalize to different
            # dropout percentages
            sc.pp.filter_genes(sim, min_counts=1)
        truecounts = r2py(r['truecounts'])
        dropout = r2py(r['dropout'])
        print("percent dropout: {}".format(
            np.sum(dropout.values) / (sim.n_obs * sim.n_vars)))
        sim_true = sc.AnnData(truecounts.values, obs=cellinfo, var=geneinfo)
        sim_true.obs_names = cellinfo.Cell
        sim_true.var_names = geneinfo.Gene
        sim_true = sim_true[:, sim.var_names]
    return [sim, sim_true]
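# Example usage (a minimal sketch; assumes rpy2 is set up and that the
# splatter-*.R scripts define `counts`, `cellinfo`, `geneinfo`, `truecounts`
# and `dropout` in the R global environment, as get_data() expects):
#
#     >>> sim, sim_true = get_data(numgroups=2)
#     >>> sim        # observed (dropout-corrupted) counts
#     >>> sim_true   # true counts, restricted to the genes kept in `sim`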
def test_api():
    # reference adata
    X = np.random.normal(size=(1000, 10))
    adata = sc.AnnData(X=X)
    adata.obs['cell_type'] = list("ABCDE") * 200
    adata.obs['condition'] = list("MNOP") * 250
    adata.raw = adata
    adata.obs['size_factors'] = 1.0
    model = sca.models.scArches(10, list('MNOP'))
    model.train(adata, "condition", 0.8)
    print(model.network_kwargs)

    # query adata
    X = np.random.normal(size=(1000, 10))
    adata = sc.AnnData(X=X)
    adata.obs['cell_type'] = list("ABCL") * 250
    adata.obs['condition'] = list("QRST") * 250
    adata.raw = adata
    adata.obs['size_factors'] = 1.0
    new_model = sca.operate(model, "new_task",
                            adata.obs['condition'].unique().tolist())
    print(new_model.network_kwargs)
    new_model.train(adata, "condition", 0.8, n_epochs=1)
def latent_as_anndata(self):
    if type(self.outer_model) is TOTALVI:
        latent = self.outer_model.get_latent_representation(self.adata)
    else:
        if self.modified:
            latents = self.model.sample_from_posterior_z(
                self.x_tensor, y=self.label_tensor,
                batch_index=self.batch_tensor)
        else:
            latents = self.model.sample_from_posterior_z(
                self.x_tensor, y=self.label_tensor)
        if self.annotated:
            latent = latents.cpu().detach().numpy()
            latent2, _, _ = self.model.encoder_z2_z1(
                latents, self.label_tensor)
            latent2 = latent2.cpu().detach().numpy()
            post_adata_2 = sc.AnnData(latent2)
            post_adata_2.obs['cell_type'] = self.cell_types
            post_adata_2.obs['batch'] = self.batch_names
            self.post_adata_2 = post_adata_2
        else:
            latent = latents.cpu().detach().numpy()
    post_adata = sc.AnnData(latent)
    post_adata.obs['cell_type'] = self.cell_types
    post_adata.obs['batch'] = self.batch_names
    return post_adata
def check_rep_results(func, X, **kwargs):
    """Check that a computation adds values to / mutates the AnnData object
    consistently, whether it operates on X, a layer, or an obsm entry."""
    # Gen data
    adata_X = sc.AnnData(
        X=X.copy(),
        layers={"layer": np.zeros(shape=X.shape, dtype=X.dtype)},
        obsm={"obsm": np.zeros(shape=X.shape, dtype=X.dtype)},
    )
    adata_layer = sc.AnnData(
        X=np.zeros(shape=X.shape, dtype=X.dtype),
        layers={"layer": X.copy()},
        obsm={"obsm": np.zeros(shape=X.shape, dtype=X.dtype)},
    )
    adata_obsm = sc.AnnData(
        X=np.zeros(shape=X.shape, dtype=X.dtype),
        layers={"layer": np.zeros(shape=X.shape, dtype=X.dtype)},
        obsm={"obsm": X.copy()},
    )

    # Apply function
    func(adata_X, **kwargs)
    func(adata_layer, layer="layer", **kwargs)
    func(adata_obsm, obsm="obsm", **kwargs)

    # Reset X
    adata_X.X = np.zeros(shape=X.shape, dtype=X.dtype)
    adata_layer.layers["layer"] = np.zeros(shape=X.shape, dtype=X.dtype)
    adata_obsm.obsm["obsm"] = np.zeros(shape=X.shape, dtype=X.dtype)

    # Check equality
    assert_equal(adata_X, adata_layer)
    assert_equal(adata_X, adata_obsm)
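# A minimal usage sketch for the helper above, assuming the function under
# test accepts `layer=` / `obsm=` keywords the way sc.pp.log1p does in recent
# scanpy versions (an assumption, not something this file guarantees):
#
#     >>> X = np.random.exponential(size=(20, 10)).astype(np.float32)
#     >>> check_rep_results(sc.pp.log1p, X)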
def test_obs_df():
    adata = sc.AnnData(
        X=np.ones((2, 2)),
        obs=pd.DataFrame({"obs1": [0, 1], "obs2": ["a", "b"]},
                         index=["cell1", "cell2"]),
        var=pd.DataFrame({"gene_symbols": ["genesymbol1", "genesymbol2"]},
                         index=["gene1", "gene2"]),
        obsm={"eye": np.eye(2), "sparse": sparse.csr_matrix(np.eye(2))},
        layers={"double": np.ones((2, 2)) * 2},
    )
    adata.raw = sc.AnnData(
        X=np.zeros((2, 2)),
        var=pd.DataFrame({"gene_symbols": ["raw1", "raw2"]},
                         index=["gene1", "gene2"]),
    )
    assert np.all(np.equal(
        sc.get.obs_df(adata, keys=["gene2", "obs1"],
                      obsm_keys=[("eye", 0), ("sparse", 1)]),
        pd.DataFrame({"gene2": [1, 1], "obs1": [0, 1],
                      "eye-0": [1, 0], "sparse-1": [0, 1]},
                     index=adata.obs_names),
    ))
    assert np.all(np.equal(
        sc.get.obs_df(adata, keys=["genesymbol2", "obs1"],
                      obsm_keys=[("eye", 0), ("sparse", 1)],
                      gene_symbols="gene_symbols"),
        pd.DataFrame({"genesymbol2": [1, 1], "obs1": [0, 1],
                      "eye-0": [1, 0], "sparse-1": [0, 1]},
                     index=adata.obs_names),
    ))
    assert np.all(np.equal(
        sc.get.obs_df(adata, keys=["gene2", "obs1"], layer="double"),
        pd.DataFrame({"gene2": [2, 2], "obs1": [0, 1]},
                     index=adata.obs_names),
    ))
    assert np.all(np.equal(
        sc.get.obs_df(adata, keys=["raw2", "obs1"],
                      gene_symbols="gene_symbols", use_raw=True),
        pd.DataFrame({"raw2": [0, 0], "obs1": [0, 1]},
                     index=adata.obs_names),
    ))
    badkeys = ["badkey1", "badkey2"]
    with pytest.raises(KeyError) as badkey_err:
        sc.get.obs_df(adata, keys=badkeys)
    with pytest.raises(AssertionError):
        sc.get.obs_df(adata, keys=["gene1"], use_raw=True, layer="double")
    assert all(badkey_err.match(k) for k in badkeys)
def do_latent_evaluation(
    spliced_net, sc_dual_full_dataset, outdir: str, prefix: str = ""
):
    """Pull out latent space and write to file"""
    logging.info("Inferring latent representations")
    encoded_from_rna, encoded_from_atac = spliced_net.get_encoded_layer(
        sc_dual_full_dataset
    )
    if hasattr(sc_dual_full_dataset.dataset_x, "data_raw"):
        encoded_from_rna_adata = sc.AnnData(
            encoded_from_rna,
            obs=sc_dual_full_dataset.dataset_x.data_raw.obs.copy(deep=True),
        )
        encoded_from_rna_adata.write(
            os.path.join(outdir, f"{prefix}_rna_encoded_adata.h5ad".strip("_"))
        )
    if hasattr(sc_dual_full_dataset.dataset_y, "data_raw"):
        encoded_from_atac_adata = sc.AnnData(
            encoded_from_atac,
            obs=sc_dual_full_dataset.dataset_y.data_raw.obs.copy(deep=True),
        )
        encoded_from_atac_adata.write(
            os.path.join(outdir, f"{prefix}_atac_encoded_adata.h5ad".strip("_"))
        )
def generate_simulated_pca(path, actual_data, clust_typ, source_cell, sim_data,
                           first_cell):
    target = actual_data[actual_data.obs["clusters"] == clust_typ]
    target = sc.AnnData(
        target.X,
        obs={"cell_type": ["Target_" + clust_typ] * len(target)},
        var={"var_names": target.var_names})
    source = actual_data[actual_data.obs["clusters"] == source_cell]
    source = sc.AnnData(
        source.X,
        obs={"cell_type": ["Source_" + source_cell] * len(source)},
        var={"var_names": source.var_names})
    predicted = sc.AnnData(sim_data.X,
                           obs={"cell_type": ["Predicted"] * len(sim_data)},
                           var={"var_names": sim_data.var_names})
    combined_data = source.concatenate(target)
    combined_data = combined_data.concatenate(predicted)
    sc.pp.neighbors(combined_data)
    sc.tl.pca(combined_data, svd_solver='arpack')
    sc.pl.pca(combined_data,
              color=["cell_type"],
              legend_fontsize=12,
              palette=['r', 'k', 'y'],
              frameon=True,
              s=35,
              save="_" + first_cell + "_to_" + clust_typ + "_celltypes.pdf")
def group_cells(data1, data2):
    meta_cells = data1.obs['ClusterID'].unique()
    meta1 = []
    meta2 = []
    weight = []
    for cluster in meta_cells:
        idx = np.where(data1.obs['ClusterID'] == cluster)
        bc_set = data1.obs['ClusterID'].index[idx]
        try:
            meta1.append(data1.layers['norm_data'][idx].mean(axis=0))
        except KeyError:  # no 'norm_data' layer; fall back to .X
            meta1.append(data1.X[idx].mean(axis=0))
        meta2.append(data2[bc_set, ].X.mean(axis=0))
        weight.append(len(idx[0]))
    df1 = pd.DataFrame(np.array(meta1), columns=data1.var_names,
                       index=meta_cells)
    df2 = pd.DataFrame(np.array(meta2), columns=data2.var_names,
                       index=meta_cells)
    adata1 = sc.AnnData(df1)
    adata2 = sc.AnnData(df2)
    adata1.obs['Weights'] = weight
    adata2.obs['Weights'] = weight
    return adata1, adata2
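# Example usage (a sketch; assumes `data1` carries per-cell cluster labels in
# .obs['ClusterID'] and that `data2` contains the same cell barcodes):
#
#     >>> meta1, meta2 = group_cells(data1, data2)
#     >>> meta1.obs['Weights'].sum() == data1.n_obs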
def test_normalize_total(typ):
    adata = sc.AnnData(typ(X_total, dtype='float32'))
    sc.pp.normalize_total(adata, key_added='n_counts')
    assert np.allclose(np.ravel(adata.X.sum(axis=1)), [3., 3., 3.])
    sc.pp.normalize_total(adata, target_sum=1, key_added='n_counts2')
    assert np.allclose(np.ravel(adata.X.sum(axis=1)), [1., 1., 1.])

    adata = sc.AnnData(typ(X_frac, dtype='float32'))
    sc.pp.normalize_total(adata, fraction=0.7)
    assert np.allclose(np.ravel(adata.X[:, 1:3].sum(axis=1)), [1., 1., 1.])
def adatas():
    pbmc = sc.datasets.pbmc68k_reduced()
    n_split = 500
    adata_ref = sc.AnnData(pbmc.X[:n_split, :], obs=pbmc.obs.iloc[:n_split])
    adata_new = sc.AnnData(pbmc.X[n_split:, :])

    sc.pp.pca(adata_ref)
    sc.pp.neighbors(adata_ref)
    sc.tl.umap(adata_ref)
    return adata_ref, adata_new
def test_regress_out_constants_equivalent():
    # Tests that constant values don't change results
    # (since support for constant values is implemented by us)
    from sklearn.datasets import make_blobs

    X, cat = make_blobs(100, 20)
    a = sc.AnnData(np.hstack([X, np.zeros((100, 5))]),
                   obs={"cat": pd.Categorical(cat)})
    b = sc.AnnData(X, obs={"cat": pd.Categorical(cat)})

    sc.pp.regress_out(a, "cat")
    sc.pp.regress_out(b, "cat")

    np.testing.assert_equal(a[:, b.var_names].X, b.X)
def generate_simulated_reg_plots(path, actual_data, clust_typ, cells):
    os.chdir(path)
    actual_data_temp = actual_data[actual_data.obs["cell_type"] == clust_typ]
    reg_mean_vals = []
    for file in glob.glob("*.h5ad"):
        print(file)
        adata = sc.read(file)
        pred_data = sc.AnnData(adata.X,
                               obs={"comparison_typ": ["pred"] * len(adata)},
                               var={"var_names": adata.var_names})
        actual_data_temp = sc.AnnData(
            actual_data_temp.X,
            obs={"comparison_typ": ["actual"] * len(actual_data_temp)},
            var={"var_names": actual_data_temp.var_names})
        first_cell = file[0:file.find('.')]
        plot_data = actual_data_temp.concatenate(pred_data)
        top_100_gene_list = list(
            actual_data.uns["rank_genes_groups"]['names'][clust_typ])
        reg_val = reg_mean_plot(plot_data,
                                condition_key="comparison_typ",
                                axis_keys={"x": "actual", "y": "pred"},
                                path_to_save="./reg_mean_" + file + "_TO_" +
                                clust_typ + ".png",
                                legend=False,
                                labels={"x": "actual", "y": "pred"},
                                show=False,
                                gene_list=top_100_gene_list[:5],
                                top_100_genes=top_100_gene_list,
                                fontsize=14,
                                textsize=14)
        reg_mean_vals.append(list([first_cell, reg_val[0], reg_val[1]]))
        if reg_val[1] >= 0.40:
            source_cell = [string for string in cells if string in file]
            source_cell = source_cell[0]
            generate_simulated_umaps(path, actual_data, clust_typ, source_cell,
                                     adata, first_cell)
    return reg_mean_vals
def merge_matrix(ad, obskeys=None, use_raw=False, keep_only_mutual=False):
    '''Merge the matrices stored in ad.

    ad: dictionary of AnnData objects to merge
    obskeys: list of .obs columns to merge across the AnnData objects
    use_raw: if True, merge from .raw.X
    keep_only_mutual: if True, keep only .obs columns present in every sample'''
    smp_list = list(ad.keys())
    obs_dict = defaultdict(list)
    obs_names = []
    for smp in smp_list:
        ad[smp].obs['name'] = smp

    if not obskeys:
        obskey_list = []
        obskeys = []
        for sample in smp_list:
            obskey_list.extend(list(ad[sample].obs.columns))
        for (obskey, number) in Counter(obskey_list).items():
            if number == len(smp_list):
                obskeys.append(obskey)
            elif not keep_only_mutual:
                # fill missing columns with 'n/a' so the key can be kept
                for sample in smp_list:
                    if obskey not in ad[sample].obs.columns:
                        ad[sample].obs[obskey] = 'n/a'
                obskeys.append(obskey)

    for sample in smp_list:
        obs_names.extend(list(ad[sample].obs_names))
        for key in obskeys:
            obs_dict[key].extend(list(ad[sample].obs[key]))

    from scipy.sparse import vstack
    if use_raw:
        stack = vstack([ad[x].raw.X for x in smp_list])  # stack data
        adata = sc.AnnData(stack, var=ad[smp_list[0]].raw.var)
    else:
        stack = vstack([ad[x].X for x in smp_list])  # stack data
        adata = sc.AnnData(stack, var=ad[smp_list[0]].var)

    adata.obs_names = obs_names
    print(len(adata))
    for obs_col in obs_dict:
        print(obs_col)
        adata.obs[obs_col] = obs_dict[obs_col]
    return adata
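# Example usage (a sketch; assumes every AnnData in the dictionary shares the
# same var space, which merge_matrix assumes but does not check):
#
#     >>> merged = merge_matrix({'sample1': adata1, 'sample2': adata2})
#     >>> merged.obs['name'].value_counts()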
def test_qc_metrics_format():
    a = np.random.binomial(100, .005, (1000, 1000))
    init_var = pd.DataFrame({
        "mito": np.concatenate((np.ones(100, dtype=bool),
                                np.zeros(900, dtype=bool)))
    })
    adata_dense = sc.AnnData(X=a, var=init_var.copy())
    sc.pp.calculate_qc_metrics(adata_dense, qc_vars=["mito"], inplace=True)

    for fmt in [sparse.csr_matrix, sparse.csc_matrix, sparse.coo_matrix]:
        adata = sc.AnnData(X=fmt(a), var=init_var.copy())
        sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)
        assert np.allclose(adata.obs, adata_dense.obs)
        for col in adata.var:  # np.allclose doesn't like mix of types
            assert np.allclose(adata.var[col], adata_dense.var[col])
def regress_batch_v2(adata, batch_key, confounder_key):
    '''Batch regression tool.

    batch_key: list of observation categories to be regressed out
    confounder_key: list of observation categories to be kept

    Returns ndata with corrected X.'''
    from sklearn.linear_model import Ridge

    print('fitting linear model...')
    dummy = pd.get_dummies(adata.obs[batch_key + confounder_key],
                           drop_first=False)
    X_exp = adata.X  # scaled data
    if scipy.sparse.issparse(X_exp):
        X_exp = X_exp.todense()
    LR = Ridge(fit_intercept=False, alpha=1.0)
    LR.fit(dummy, X_exp)

    if len(batch_key) > 1:
        batch_index = np.logical_or.reduce(
            np.vstack([dummy.columns.str.startswith(x) for x in batch_key]))
    else:
        batch_index = np.vstack(
            [dummy.columns.str.startswith(x) for x in batch_key])[0]

    print('correcting batch...')
    dm = np.array(dummy)[:, batch_index]
    X_explained = dm.dot(LR.coef_[:, batch_index].T)
    X_remain = X_exp - X_explained
    ndata = sc.AnnData(X_remain)
    ndata.obs = adata.obs
    ndata.var = adata.var
    return ndata, X_explained
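# Example usage (a sketch; the .obs column names are hypothetical and should
# match categorical columns in your own AnnData):
#
#     >>> ndata, X_explained = regress_batch_v2(
#     ...     adata, batch_key=['batch'], confounder_key=['cell_type'])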
def calcular_leiden(array, res, subres, seed):
    print('Computing leiden')
    adata = sc.AnnData(X=np.nan_to_num(array))
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata, resolution=res, random_state=seed)
    if subres > 0:
        # re-cluster each Leiden cluster at the finer sub-resolution
        array_return = np.zeros(len(adata))
        clusters = list(dict.fromkeys(adata.obs['leiden'].values))
        n_clusters = 0
        for cluster in clusters:
            index_cluster = np.argwhere(
                adata.obs['leiden'] == cluster).flatten()
            subadata = adata[index_cluster].copy()
            subadata.X = np.nan_to_num(subadata.X)
            sc.pp.neighbors(subadata)
            sc.tl.leiden(subadata, resolution=subres, random_state=seed)
            array_return[index_cluster] = (
                subadata.obs['leiden'].values.astype(int) + n_clusters)
            n_clusters += len(list(dict.fromkeys(subadata.obs['leiden'])))
        return array_return
    else:
        return adata.obs['leiden'].values.astype(int)
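# Example usage (a sketch): cluster a random matrix at resolution 1.0, then
# re-cluster each resulting Leiden cluster at resolution 0.5.
#
#     >>> arr = np.random.normal(size=(500, 20))
#     >>> labels = calcular_leiden(arr, res=1.0, subres=0.5, seed=0)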
def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs):
    """Check that a computation adds values to / mutates the AnnData object
    consistently across representations."""
    # Gen data
    empty_X = np.zeros(shape=X.shape, dtype=X.dtype)
    adata = sc.AnnData(
        X=empty_X.copy(),
        layers={"layer": empty_X.copy()},
        obsm={"obsm": empty_X.copy()},
    )

    adata_X = adata.copy()
    adata_X.X = X.copy()

    adatas_proc = {}
    for field in fields:
        cur = adata.copy()
        sc.get._set_obs_rep(cur, X.copy(), **{field: field})
        adatas_proc[field] = cur

    # Apply function
    func(adata_X, **kwargs)
    for field in fields:
        func(adatas_proc[field], **{field: field}, **kwargs)

    # Reset X
    adata_X.X = empty_X.copy()
    for field in fields:
        sc.get._set_obs_rep(adatas_proc[field], empty_X.copy(),
                            **{field: field})

    for field_a, field_b in permutations(fields, 2):
        assert_equal(adatas_proc[field_a], adatas_proc[field_b])
    for field in fields:
        assert_equal(adata_X, adatas_proc[field])
def check_rep_mutation(func, X, *, fields=["layer", "obsm"], **kwargs):
    """Check that only the array meant to be modified is modified."""
    adata = sc.AnnData(X=X.copy(), dtype=X.dtype)
    for field in fields:
        sc.get._set_obs_rep(adata, X, **{field: field})
    X_array = asarray(X)

    adata_X = func(adata, copy=True, **kwargs)
    adatas_proc = {
        field: func(adata, copy=True, **{field: field}, **kwargs)
        for field in fields
    }

    # Modified fields
    for field in fields:
        result_array = asarray(
            sc.get._get_obs_rep(adatas_proc[field], **{field: field}))
        np.testing.assert_array_equal(asarray(adata_X.X), result_array)

    # Unmodified fields
    for field in fields:
        np.testing.assert_array_equal(X_array, asarray(adatas_proc[field].X))
        np.testing.assert_array_equal(
            X_array, asarray(sc.get._get_obs_rep(adata_X, **{field: field})))
    for field_a, field_b in permutations(fields, 2):
        result_array = asarray(
            sc.get._get_obs_rep(adatas_proc[field_a], **{field_b: field_b}))
        np.testing.assert_array_equal(X_array, result_array)
def data_process(dta, min_genes=100, min_cells=10, mt_pct=10, npcs=50,
                 oversd=None):
    adata = sc.AnnData(dta)
    adata.var_names_make_unique()

    # quality control
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
                               inplace=True)
    if oversd is not None:
        # drop cells more than `oversd` standard deviations above the mean
        # gene count
        mu = np.mean(adata.obs.n_genes_by_counts)
        sd = np.std(adata.obs.n_genes_by_counts)
        thres = mu + oversd * sd
        adata = adata[adata.obs.n_genes_by_counts < thres, :]
    adata = adata[adata.obs.pct_counts_mt < mt_pct, :]

    # normalization
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # find highly variable genes
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3,
                                min_disp=0.5)
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]

    # pca
    sc.tl.pca(adata, svd_solver='arpack', n_comps=npcs)
    return adata
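# Example usage (a sketch; `counts` stands for any cells-x-genes matrix or
# dataframe accepted by sc.AnnData, with gene symbols as column names):
#
#     >>> processed = data_process(counts, min_genes=200, mt_pct=5, oversd=3)
#     >>> processed.obsm['X_pca'].shape[1]
#     50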
def visualize_trained_network_results(data_dict, z_dim=100, subsample=None,
                                      arch_style=1):
    plt.close("all")
    data_name = data_dict['name']
    metadata_path = data_dict['metadata']
    cell_type_key = data_dict['cell_type']
    spec_cell_type = data_dict.get("spec_cell_types", None)

    data = sc.read(
        f"../data/{data_name}/anna/processed_adata_Cusanovich_brain_May29_2019_5000.h5ad"
    )
    data.X += abs(data.X.min())  # shift values to be non-negative
    if subsample is not None:
        data = data[:subsample]

    cell_types = data.obs[cell_type_key].unique().tolist()
    path_to_save = f"../results/VAE/{data_name}/{arch_style}-{z_dim}/Visualizations/"
    os.makedirs(path_to_save, exist_ok=True)
    sc.settings.figdir = os.path.abspath(path_to_save)

    train_data = data.copy()
    network = trvae.VAE(
        x_dimension=data.shape[1],
        z_dimension=z_dim,
        arch_style=arch_style,
        model_path=f"../models/VAE/{data_name}-{arch_style}/{z_dim}/",
    )
    network.restore_model()

    if sparse.issparse(data.X):
        data.X = data.X.A
    feed_data = data.X
    latent = network.to_z_latent(feed_data)
    latent = sc.AnnData(X=latent)
    latent.obs[cell_type_key] = data.obs[cell_type_key].values

    color = [cell_type_key]
    sc.pp.neighbors(train_data)
    sc.tl.umap(train_data)
    sc.pl.umap(train_data, color=color,
               save=f'_{data_name}_train_data.pdf', show=False)

    sc.pp.neighbors(latent)
    sc.tl.umap(latent)
    sc.pl.umap(latent, color=color,
               save=f"_{data_name}_latent.pdf", show=False)
    plt.close("all")
def test_linear_works():
    X, y = make_regression(
        n_samples=1000,
        n_features=100,
        n_informative=10,
        n_targets=1,
    )
    # quantiles for y -> string day labels
    yq = pd.qcut(y, 3, labels=["0", "1", "2"])
    obs = pd.DataFrame({"day": yq})
    obs.index = obs.index.map(str)
    adata = sc.AnnData(X=X, obs=obs)
    _ = parallel_runs(
        adata,
        n_processes=4,
        n_bootstraps=32,
        X_noise=0.01,
        y_noise=0.5,
        alpha=0.9,
        lambda_path=np.geomspace(10, 0.01, num=10),
        target_col="day",
        target_map={"0": 0, "1": 1, "2": 2},
    )
def run_pca(data, n_components=300, use_hvg=True):
    """Run PCA.

    :param data: Dataframe of cells X genes. Typically multiscale space
        diffusion components
    :param n_components: Number of principal components
    :return: PCA projections of the data and the explained variance
    """
    if isinstance(data, sc.AnnData):
        ad = data
    else:
        ad = sc.AnnData(data.values)

    # Run PCA
    if not use_hvg:
        n_comps = n_components
    else:
        # choose the number of components that explains 85% of the variance
        # (requires highly variable genes to have been annotated already)
        sc.pp.pca(ad, n_comps=1000, use_highly_variable=True,
                  zero_center=False)
        try:
            n_comps = np.where(
                np.cumsum(ad.uns['pca']['variance_ratio']) > 0.85)[0][0]
        except IndexError:
            n_comps = n_components

    # Rerun with the selected number of components
    sc.pp.pca(ad, n_comps=n_comps, use_highly_variable=use_hvg,
              zero_center=False)

    # Return PCA projections as a dataframe
    pca_projections = pd.DataFrame(ad.obsm['X_pca'], index=ad.obs_names)
    return pca_projections, ad.uns['pca']['variance_ratio']
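# Example usage (a sketch; `dm_res` is a hypothetical cells-x-components
# dataframe, e.g. multiscale diffusion components as the docstring suggests):
#
#     >>> pca_projections, variance_ratio = run_pca(dm_res, use_hvg=False)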
def scanpy_first():
    # results_file = 'scanpy_output/scanpy_output.h5ad'
    # file_path = '../dataset/scanpy_data/'
    # file_path = 'human_brain_output/scIGANs-brainTags.csv-src_label.txt-100-15-16-5.0-2.0.csv'
    file_path = '../dataset/human_brain/brainTags.csv'
    # label_path = '../dataset/pollen_labels.txt'
    # label_set = pd.read_table(label_path, header=None, index_col=False)
    # src_label = pd.Categorical(label_set.iloc[:, 1]).codes
    adata = sc.AnnData(
        pd.read_csv(file_path, header=0, index_col=0).transpose())
    # adata = sc.read_10x_mtx(file_path, var_names='gene_symbols', cache=True)
    # adata.var_names_make_unique()  # unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3,
                                min_disp=0.5)
    # sc.pl.highly_variable_genes(adata)
    # keep the raw data
    adata.raw = adata

    sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)
    sc.tl.leiden(adata)
    sc.tl.umap(adata)
    sc.pl.umap(adata, color=['leiden'])
def do_evaluation_atac_from_rna(
    spliced_net,
    sc_dual_full_dataset,
    gene_names: str,
    atac_names: str,
    outdir: str,
    ext: str,
    marker_genes: List[str],
    prefix: str = "",
):
    ### RNA > ATAC
    logging.info("Inferring ATAC from RNA")
    sc_rna_atac_full_preds = spliced_net.translate_1_to_2(sc_dual_full_dataset)
    sc_rna_atac_full_preds_anndata = sc.AnnData(
        scipy.sparse.csr_matrix(sc_rna_atac_full_preds),
        obs=sc_dual_full_dataset.dataset_x.data_raw.obs,
    )
    sc_rna_atac_full_preds_anndata.var_names = atac_names

    logging.info("Writing ATAC from RNA")
    sc_rna_atac_full_preds_anndata.write(
        os.path.join(outdir, f"{prefix}_rna_atac_adata.h5ad".strip("_"))
    )

    if hasattr(sc_dual_full_dataset.dataset_y, "data_raw") and ext is not None:
        logging.info("Plotting ATAC from RNA")
        plot_utils.plot_auroc(
            utils.ensure_arr(
                sc_dual_full_dataset.dataset_y.data_raw.X).flatten(),
            utils.ensure_arr(sc_rna_atac_full_preds).flatten(),
            title_prefix=f"{DATASET_NAME} RNA > ATAC".strip(),
            fname=os.path.join(outdir,
                               f"{prefix}_rna_atac_auroc.{ext}".strip("_")),
        )
def simulate_multiple_cell(path, data, model, z_dim, feature):
    variable_names = data.var_names
    data_latent = model.to_latent(data.X)
    latent_df = pd.DataFrame(data_latent)
    latent_df[feature] = list(data.obs[feature])
    cells = list(set(data.obs[feature]))
    try:
        os.makedirs(path + "/gene_heatmaps/")
    except OSError:
        pass
    x_dim = data.shape[1]
    for cell in cells:
        data_ast = latent_df[latent_df[feature] == cell]
        # take the first cell's latent coordinates, all z_dim of them
        # (the original hard-coded columns [0, 1, 2, 3, 4], which only
        # worked for z_dim == 5)
        cell_one = data_ast.iloc[[0], :z_dim]
        for dim in range(z_dim):
            increment_range = np.arange(min(data_latent[:, dim]),
                                        max(data_latent[:, dim]), 0.01)
            result_array = np.empty((0, x_dim))
            for inc in increment_range:
                # copy so each increment perturbs the original cell rather
                # than the previously mutated one
                cell_latent = cell_one.copy()
                cell_latent.iloc[:, dim] = inc
                cell_recon = model.reconstruct(cell_latent)
                result_array = np.append(result_array, cell_recon, axis=0)
            result_adata = sc.AnnData(result_array,
                                      obs={"inc_vals": increment_range},
                                      var={"var_names": variable_names})
            result_adata.write(path + "/gene_heatmaps/" + str(cell) + "_" +
                               str(dim) + ".h5ad")
def simulate_one_cell(path, data, cell, model, z_dim, feature):
    variable_names = data.var_names
    data_latent = model.to_latent(data.X)
    try:
        os.makedirs(path + "/gene_heatmaps/")
    except OSError:
        pass
    x_dim = data.shape[1]
    data_ast = data[data.obs[feature] == cell]
    cell_one = data_ast[0, :].X
    cell_one = np.reshape(cell_one, (1, x_dim))
    cell_one = model.to_latent(cell_one)
    for dim in range(z_dim):
        increment_range = np.arange(min(data_latent[:, dim]),
                                    max(data_latent[:, dim]), 0.01)
        result_array = np.empty((0, x_dim))
        for inc in increment_range:
            # copy so each increment perturbs the original latent vector
            # rather than the previously mutated one
            cell_latent = cell_one.copy()
            cell_latent[:, dim] = inc
            cell_recon = model.reconstruct(cell_latent)
            result_array = np.append(result_array, cell_recon, axis=0)
        result_adata = sc.AnnData(result_array,
                                  obs={"inc_vals": increment_range},
                                  var={"var_names": variable_names})
        result_adata.write(path + "/gene_heatmaps/" + str(cell) + "_" +
                           str(dim) + ".h5ad")
def test_rank_genes_groups_df():
    a = np.zeros((20, 3))
    a[:10, 0] = 5
    adata = sc.AnnData(
        a,
        obs=pd.DataFrame(
            {"celltype": list(chain(repeat("a", 10), repeat("b", 10)))},
            index=[f"cell{i}" for i in range(a.shape[0])]),
        var=pd.DataFrame(index=[f"gene{i}" for i in range(a.shape[1])]),
    )
    sc.tl.rank_genes_groups(adata, groupby="celltype", method="wilcoxon")
    dedf = sc.get.rank_genes_groups_df(adata, "a")
    assert dedf["pvals"].value_counts()[1.] == 2
    assert sc.get.rank_genes_groups_df(adata, "a", log2fc_max=.1).shape[0] == 2
    assert sc.get.rank_genes_groups_df(adata, "a", log2fc_min=.1).shape[0] == 1
    assert sc.get.rank_genes_groups_df(adata, "a", pval_cutoff=.9).shape[0] == 1

    del adata.uns["rank_genes_groups"]
    sc.tl.rank_genes_groups(adata, groupby="celltype", method="wilcoxon",
                            key_added="different_key")
    with pytest.raises(KeyError):
        sc.get.rank_genes_groups_df(adata, "a")
    dedf2 = sc.get.rank_genes_groups_df(adata, "a", key="different_key")
    pd.testing.assert_frame_equal(dedf, dedf2)
def impute_neighbor(bdata, n_neighbor=10):
    from scipy.spatial import cKDTree
    import multiprocessing as mp
    import scipy

    n_jobs = mp.cpu_count()

    # Get neighborhood structure based on the UMAP embedding
    ckd = cKDTree(bdata.obsm["X_umap"])
    ckdout = ckd.query(x=bdata.obsm["X_umap"], k=n_neighbor, n_jobs=n_jobs)
    indices = ckdout[1]

    # Average raw expression over each cell's neighbors, in chunks of
    # 10000 cells to limit memory use
    sum_list = []
    for i in range(0, bdata.raw.X.shape[0], 10000):
        start = i
        end = min(i + 10000, bdata.raw.X.shape[0])
        X_list = [
            bdata.raw.X[indices[start:end, j]] for j in range(n_neighbor)
        ]
        X_sum = scipy.sparse.csr_matrix(np.sum(X_list) / n_neighbor)
        sum_list.append(X_sum)
        print(i)
    imputed = scipy.sparse.vstack(sum_list)

    idata = sc.AnnData(imputed)
    idata.obs = bdata.obs.copy()
    idata.var = bdata.raw.var.copy()
    idata.obsm = bdata.obsm.copy()
    idata.uns = bdata.uns.copy()
    return idata
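# Example usage (a sketch; assumes `bdata` already carries a UMAP embedding in
# .obsm['X_umap'] and raw counts in .raw, as impute_neighbor requires):
#
#     >>> idata = impute_neighbor(bdata, n_neighbor=10)
#     >>> idata.shape == (bdata.n_obs, bdata.raw.n_vars)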
def blobs(n_variables=11, n_centers=5, cluster_std=1.0, n_observations=640):
    """Gaussian Blobs.

    Parameters
    ----------
    n_variables : `int`, optional (default: 11)
        Dimension of feature space.
    n_centers : `int`, optional (default: 5)
        Number of cluster centers.
    cluster_std : `float`, optional (default: 1.0)
        Standard deviation of clusters.
    n_observations : `int`, optional (default: 640)
        Number of observations. By default, this is the same observation
        number as in ``sc.datasets.krumsiek11()``.

    Returns
    -------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix containing an observation annotation 'blobs'
        that indicates cluster identity.
    """
    import sklearn.datasets

    X, y = sklearn.datasets.make_blobs(n_samples=n_observations,
                                       n_features=n_variables,
                                       centers=n_centers,
                                       cluster_std=cluster_std,
                                       random_state=0)
    return sc.AnnData(X, obs={'blobs': y.astype(str)})
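# Example usage (runs as-is, since blobs() only depends on scikit-learn):
#
#     >>> adata = blobs(n_variables=11, n_centers=5)
#     >>> adata.obs['blobs'].value_counts()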
def test_logistic_works():
    X, y, coef = make_regression(n_samples=1000,
                                 n_features=10,
                                 n_informative=1,
                                 n_targets=1,
                                 coef=True)
    y = expit(y / 100)
    obs = pd.DataFrame({"day": y})
    obs["day"] = pd.qcut(obs["day"], 2, labels=["0", "1"])
    obs.index = obs.index.map(str)
    var = pd.DataFrame({"Gene name": [str(i) for i in range(X.shape[1])]})
    var.index = var.index.astype(str)
    adata = sc.AnnData(X=X, obs=obs, var=var)
    _ = parallel_runs(
        adata,
        n_processes=4,
        n_bootstraps=32,
        X_noise=0.01,
        alpha=0.9,
        lambda_path=np.geomspace(10, 0.01, num=10),
        target_col="day",
    )