def test_scanorama_correct_scanpy():
    """Smoke-test scanorama.correct_scanpy on Scanpy AnnData inputs.

    Checks that the call completes, that output dimensions line up with
    the inputs, and that per-cell / per-gene metadata keep their
    original order.
    """
    from anndata import AnnData
    import pandas as pd

    datasets, genes_list = data_gen()
    adatas = []
    for matrix, genes in zip(datasets, genes_list):
        adata = AnnData(matrix)
        adata.obs = pd.DataFrame(list(range(matrix.shape[0])), columns=['obs1'])
        adata.var = pd.DataFrame(genes, columns=['var1'])
        adata.var_names = genes
        adatas.append(adata)

    corrected = scanorama.correct_scanpy(adatas)

    for cor, orig in zip(corrected, adatas):
        # Cell count per dataset is preserved; genes are reduced to the
        # set shared by all datasets (same width as the first output).
        assert cor.X.shape[0] == orig.X.shape[0]
        assert cor.X.shape[1] == adatas[0].X.shape[1]
        # Metadata must survive the correction in the original order.
        assert list(cor.obs['obs1']) == list(orig.obs['obs1'])
        assert list(cor.var['var1']) == list(adatas[0].var['var1'])
def correct_scanorama(dataset_list, cell_metadata):
    """Run Scanorama on a list of AnnData batches, merge the corrected
    batches on their shared genes, attach the supplied cell metadata and
    write the result to an h5ad file."""
    # Batch-correct every dataset with Scanorama.
    corrected = scanorama.correct_scanpy(dataset_list)
    # Merge the corrected batches; inner join keeps only shared genes.
    first, rest = corrected[0], corrected[1:]
    corrected_dataset = first.concatenate(rest, join="inner", batch_key='Batch')
    print("Scanorama worked!")
    # Replace the merged obs table with the caller-supplied metadata.
    corrected_dataset.obs = cell_metadata
    save_h5ad(corrected_dataset)
def runScanorama(adata, batch, hvg=None):
    """Batch-correct `adata` with Scanorama; the corrected embedding is
    stored in obsm['X_emb'] of the returned merged object."""
    import scanorama

    checkSanity(adata, batch, hvg)
    batches = splitBatches(adata.copy(), batch)
    # Old-style Scanorama API: with return_dimred=True the call yields
    # (per-batch embeddings, per-batch corrected AnnData objects).
    emb, corrected = scanorama.correct_scanpy(batches, return_dimred=True)
    merged = corrected[0].concatenate(corrected[1:])
    merged.obsm['X_emb'] = np.concatenate(emb, axis=0)
    return merged
def correct_scanorama(dataset_list, cell_metadata):
    """Batch-correct `dataset_list` with Scanorama, merge the corrected
    batches, attach `cell_metadata` and persist the object as h5ad."""
    # run Scanorama
    corrected = scanorama.correct_scanpy(dataset_list)
    # merge Scanorama corrected object; inner join keeps shared genes only,
    # batch label column name comes from the CLI arguments.
    merged = corrected[0].concatenate(
        corrected[1:],
        join='inner',
        batch_key=args.batch_key,
    )
    # append metadata
    merged.obs = cell_metadata
    save_h5ad(merged)
def correction(self):
    """Batch-correct self.adata in place with Scanorama and report timing."""
    print("Start Scanorama...\n")
    start = time.time()
    working = self.adata.copy()
    # One view per batch value, in the order the categories first appear.
    per_batch = []
    for label in working.obs[self.batch].unique():
        per_batch.append(working[working.obs[self.batch] == label])
    corrected = scanorama.correct_scanpy(per_batch, return_dimred=True)
    # Outer join keeps the union of genes across batches.
    merged = corrected[0].concatenate(corrected[1:], join="outer")
    # NOTE(review): assumes concatenation preserves the original cell
    # order so the old obs table still lines up — confirm.
    merged.obs = working.obs
    self.adata = merged
    print(f"Scanorama has taken {round(time.time() - start, 2)} seconds")
def correct(datasets):
    """Run Scanorama on `datasets`, merge the corrected batches and
    write the merged object to the path given by args.output."""
    # Scanorama batch correction.
    corrected = scanorama.correct_scanpy(datasets)
    # Merge on shared genes, tagging each cell with its batch of origin.
    merged = corrected[0].concatenate(
        corrected[1:],
        join="inner",
        batch_key='Batch',
    )
    print("Scanorama worked!")
    # Persist the corrected, merged object.
    merged.write(args.output)
    print("Corrected object saved!")
def runScanorama(adata, batch, hvg=None):
    """Batch-correct `adata` with Scanorama, keeping the original batch
    categories and cell names; the embedding is exposed as obsm['X_emb']."""
    import scanorama

    checkSanity(adata, batch, hvg)
    batches, categories = splitBatches(adata.copy(), batch, return_categories=True)
    # New-style Scanorama API: the corrected AnnData objects carry the
    # embedding in obsm['X_scanorama'].
    corrected = scanorama.correct_scanpy(batches, return_dimred=True)
    result = anndata.AnnData.concatenate(
        *corrected,
        batch_key=batch,
        batch_categories=categories,
        index_unique=None,
    )
    # Mirror the Scanorama embedding under the generic key used downstream.
    result.obsm['X_emb'] = result.obsm['X_scanorama']
    return result
def combineAdataUseScanorama(adataLs, batchKey, batchCateLs, subSample=False, subSampleCounts=0):
    """
    Integrate several AnnData objects with Scanorama.

    adataLs: [adata1, adata2] — AnnData objects to integrate.
    batchKey: obs column name under which the batch label is stored.
    batchCateLs: name of each batch; must match the order of adataLs.
    subSample: if True, downsample every batch to a common cell count.
    subSampleCounts: optional cap on the downsampled size (0 = no cap).
    return: the integrated AnnData.
    """
    import scanorama

    # Work on copies so the caller's objects stay untouched.
    adataLs = [x.copy() for x in adataLs]
    if subSample:
        # Downsample every batch to the size of the smallest one,
        # optionally capped by subSampleCounts.
        sampleSize = min([x.shape[0] for x in adataLs])
        if subSampleCounts:
            sampleSize = min(sampleSize, subSampleCounts)
        logger.info(f"sample size: {sampleSize}")
        [sc.pp.subsample(x, n_obs=sampleSize) for x in adataLs]
    # Per-batch preprocessing: library-size normalization, log1p,
    # then seurat-flavour HVG selection (top 2000 genes).
    for adata in adataLs:
        sc.pp.normalize_total(adata, inplace=True)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000, inplace=True)
    print(f"↓↓↓↓↓↓↓↓↓{batchCateLs}↓↓↓↓↓↓↓↓")
    combineScanoramaLs = scanorama.correct_scanpy(adataLs, return_dimred=True)
    print(f"↑↑↑↑↑↑↑↑↑{batchCateLs}↑↑↑↑↑↑↑↑")
    combineAdata = sc.concat(combineScanoramaLs, label=batchKey, index_unique="-", keys=batchCateLs)
    # Build the neighbour graph on the Scanorama embedding, then UMAP.
    sc.pp.neighbors(combineAdata, n_pcs=50, use_rep="X_scanorama")
    sc.tl.umap(combineAdata)
    return combineAdata
def scanorama_merge(adata_trains, adata_pred, keepdimensionality):
    """
    corrects datasets using scanorama and merge training datasets subsequently

    This function reads a list of training datasets (at least one) and one
    testing dataset from .h5ad files and returns a merged and corrected
    training dataset anndata object, and a corrected testing anndata object.

    parameters
    ----------
    adata_trains: `list`
        list of training dataset adata objects
    adata_pred: AnnData
        testing dataset anndata object
    keepdimensionality: `bool`
        determines if we should use all common genes or if we should reduce
        dimensionality to 100. False not currently implemented
    train_datasets: `list`
        names of train datasets

    returns
    -------
    AnnData
        A concatenated and corrected anndata object of all training datasets.
    AnnData
        An anndata object containing corrected testing dataset.
    """
    # Keep a handle on the original test obs before it is overwritten below.
    adata_pred_obssave = adata_pred
    nonmerged_adata_train = naive_merge(
        adata_trains)  # to have merged obs ready
    all_adata = naive_merge([nonmerged_adata_train, adata_pred])
    # Temporarily append the test set so Scanorama corrects it jointly
    # with the training sets; it is popped off again at the end.
    adata_trains.append(adata_pred)
    print('using scanorama rn')
    # Old-style Scanorama API: (embeddings, corrected matrices).
    integrated, corrected = scan.correct_scanpy(adata_trains, return_dimred=True)
    print('integrating training set')
    # With more than one training set, concatenate all corrected training
    # entries (everything but the last, which is the test set).
    if len(adata_trains) != 2:
        adata_train = sc.AnnData.concatenate(*corrected[:-1])
    else:
        adata_train = corrected[0]
    # NOTE(review): assumes the corrected/concatenated cells stay in the
    # same order as naive_merge produced — confirm before trusting obs.
    adata_train.obs = nonmerged_adata_train.obs
    adata_train.var = all_adata.var
    # Rebuild the corrected test object and restore its original metadata.
    adata_pred = sc.AnnData(corrected[-1])
    adata_pred.obs = adata_pred_obssave.obs
    adata_pred.var = all_adata.var
    # Undo the in-place append so the caller's list is unchanged.
    adata_trains.pop()
    return adata_train, adata_pred
# Plot UMAP and T-SNE before correction umapplot(adata, color_by=[args.celltype, args.batch], save_file_prefix=f"scanorama_umap_{args.adata}_before_cor") # tsneplot(adata, color_by=[args.celltype, args.batch], save_file_prefix=f"tsne_{args.adata}_before_cor") # Correction print("Starting Scanorama...") start = time.time() adata_scanorama = adata.copy() adata_list = [ adata_scanorama[adata_scanorama.obs[args.batch] == i] for i in adata_scanorama.obs[args.batch].unique() ] corrected = scanorama.correct_scanpy(adata_list, return_dimred=True) corrected_merged_dge = corrected[0].concatenate(*corrected[1:]) corrected_merged_dge.obs = adata_scanorama.obs print(f"Scanorama has taken {time.time() - start} seconds") # Plot UMAP after correction sc.pp.neighbors(corrected_merged_dge, n_neighbors=10, n_pcs=20) sc.tl.umap(corrected_merged_dge) sc.pl.umap(corrected_merged_dge, color=[args.celltype, args.batch], show=False) resname = f"./visualization/scanorama_umap_{args.adata}_after_cor.png" plt.savefig(resname, dpi=100) # Save corrected adata if not os.path.exists(f"./{args.adata[:6]}"): os.makedirs(f"./{args.adata[:6]}") corrected_merged_dge.write_h5ad(
import numpy as np
from scbean.tools import utils as tl
import scanpy as sc
import pandas as pd
import scanorama
import argparse

# Input/output locations (hard-coded local paths).
base_path = '/Users/zhongyuanke/data/'
file1 = 'dropviz/mouse_brain_dropviz_filtered.h5ad'
file2 = 'nuclei/adata_nuclei_filtered.h5ad'
scan_path = 'results/scan_mouse.h5ad'
# NOTE(review): base_path is defined but the reads below use the
# relative file1/file2 paths — confirm intended.

# -------------train---------------------
adata1 = tl.read_sc_data(file1, fmt='h5ad')
adata2 = tl.read_sc_data(file2, fmt='h5ad')
# orig_label =adata_orig.obs['label']
print(adata1)
print(adata2)
datas = [adata1, adata2]
# Scanorama correction with a 16-dimensional embedding.
corrected = scanorama.correct_scanpy(datas, return_dimred=True, dimred=16)
adata_corrected = corrected[0].concatenate(corrected[1])
print(adata_corrected)
# Neighbour graph on the Scanorama embedding, then UMAP.
sc.pp.neighbors(adata_corrected, use_rep='X_scanorama')
sc.tl.umap(adata_corrected)
adata_corrected.write_h5ad(scan_path)
import scanpy.api as sc
from umap import UMAP
import scanorama
import sys

# NOTE(review): os, np and sp are used below but imported outside this
# excerpt — confirm they are available at module level.
script_path = os.path.dirname(os.path.realpath(__file__))
output_dir = os.path.join(script_path, '../../Figures') + '/'
# Load the velocyto-derived AnnData objects for the two strains.
adata_scv_pru = sc.read_h5ad(output_dir + '../Data/pru/adata_sc_velocyto.h5ad')
adata_scv_me49 = sc.read_h5ad(output_dir + '../Data/011_me49/adata_sc_velocyto.h5ad')
adatas = [adata_scv_me49.copy(), adata_scv_pru.copy()]
# Old-style Scanorama API: (integrated embeddings, corrected matrices).
integrated, corrected = scanorama.correct_scanpy(adatas, return_dimred=True)
merged_x = np.concatenate(integrated)
umap_merged_x = UMAP(n_components=2, random_state=4, min_dist=0.3, n_neighbors=50).fit_transform(merged_x)
adatas = corrected[0].concatenate(corrected[1])
# Strip the batch suffix that concatenate() appended to each barcode.
adatas.obs_names = [x.split('-')[0] for x in adatas.obs_names]
adatas.obsm['X_corrected'] = merged_x
adatas.obsm['X_corrected_umap'] = umap_merged_x
# Keep the raw (uncorrected) expression around as a sparse layer.
adatas.layers['original_mat'] = sp.sparse.csr_matrix(
    np.concatenate([adata_scv_me49.X.A, adata_scv_pru.X.A]))
# Recover the batch label from the barcode prefix.
batch = ['ME49' if '10099011' in x else 'Pru' for x in adatas.obs_names]
adatas.obs['batch'] = batch
## Save scanorama results
adatas.write_h5ad(filename=output_dir + '../Data/pru/adata_integrated_0506_me49.h5ad', compression='gzip')
help="base path") opt = parser.parse_args() base_path = opt.base_path file1 = base_path + 'blood_5w.h5ad' file2 = base_path + 'bone_5w.h5ad' time_list = [] adata1 = sc.read_h5ad(base_path + 'blood_5w.h5ad') adata2 = sc.read_h5ad(base_path + 'bone_5w.h5ad') print(adata1) print(adata2) data_list = [adata1, adata2] t0 = time.time() integrated, corrected = scanorama.correct_scanpy(data_list, return_dimred=True) t1 = time.time() print("Total time running DAVAE 10w cells: %s seconds" % (str(t1 - t0))) time_list.append(t1 - t0) adata1 = sc.read_h5ad(base_path + 'blood_10w.h5ad') adata2 = sc.read_h5ad(base_path + 'bone_10w.h5ad') data_list = [adata1, adata2] t0 = time.time() integrated, corrected = scanorama.correct_scanpy(data_list, return_dimred=True) t1 = time.time() print("Total time running DAVAE 20w cells: %s seconds" % (str(t1 - t0))) time_list.append(t1 - t0) adata1 = sc.read_h5ad(base_path + 'blood_20w.h5ad')
def combineAdataUseScanoramaOld(
    adataLs, batchKey, batchCateLs, subSample=False, subSampleCounts=0
):
    """
    Integrate several AnnData objects with Scanorama (older workflow:
    joint normalization/HVG selection on the concatenated object, then
    a per-batch split before correction).

    adataLs: [adata1, adata2] — AnnData objects to integrate.
    batchKey: obs column name under which the batch label is stored.
    batchCateLs: name of each batch; must match the order of adataLs.
    subSample: if True, downsample every batch to a common cell count.
    subSampleCounts: optional cap on the downsampled size (0 = no cap).
    return: the integrated AnnData.
    """
    import scanorama

    # Work on copies so the caller's objects stay untouched.
    adataLs = [x.copy() for x in adataLs]
    if subSample:
        # Downsample every batch to the size of the smallest one,
        # optionally capped by subSampleCounts.
        sampleSize = min([x.shape[0] for x in adataLs])
        if subSampleCounts:
            sampleSize = min(sampleSize, subSampleCounts)
        logger.info(f"sample size: {sampleSize}")
        [sc.pp.subsample(x, n_obs=sampleSize) for x in adataLs]
    # Concatenate first so normalization and HVG selection see all batches.
    combineAdata = adataLs[0].concatenate(
        adataLs[1:], batch_key=batchKey, batch_categories=batchCateLs
    )
    sc.pp.normalize_per_cell(combineAdata, counts_per_cell_after=1e4)
    sc.pp.log1p(combineAdata)
    sc.pp.highly_variable_genes(
        combineAdata, min_mean=0.0125, max_mean=3, min_disp=1.5, batch_key=batchKey
    )
    sc.pl.highly_variable_genes(combineAdata)
    # Names of the highly-variable genes selected above.
    varGenes = combineAdata.var.highly_variable
    varGenes = varGenes[varGenes].keys()
    varGenes = list(varGenes)
    # Re-split by batch, restricted to the shared HVGs.
    alldata = {}
    for oneBatchName in combineAdata.obs[batchKey].unique():
        alldata[oneBatchName] = combineAdata[
            combineAdata.obs[batchKey] == oneBatchName, varGenes
        ]
    combineAdataLs = list(alldata.values())
    print(f"↓↓↓↓↓↓↓↓↓{batchCateLs}↓↓↓↓↓↓↓↓")
    combineScanoramaLs = scanorama.correct_scanpy(combineAdataLs, return_dimred=True)
    print(f"↑↑↑↑↑↑↑↑↑{batchCateLs}↑↑↑↑↑↑↑↑")
    combineAdata = sc.concat(combineScanoramaLs)
    # (earlier experimental post-processing removed; see version control history)
    # Neighbour graph on the Scanorama embedding, then UMAP.
    sc.pp.neighbors(combineAdata, n_pcs=50, use_rep="X_scanorama")
    sc.tl.umap(combineAdata)
    return combineAdata
def correct(adatas):
    """Batch-correct `adatas` with Scanorama and return the corrected
    AnnData objects; the low-dimensional embeddings are discarded.

    Parameters
    ----------
    adatas : list of AnnData
        Datasets to correct.

    Returns
    -------
    list of AnnData
        The Scanorama-corrected datasets, one per input.
    """
    # correct_scanpy with return_dimred=True yields (embeddings, corrected);
    # only the corrected matrices are needed, so mark the first slot unused.
    _integrated, correcteds = scanorama.correct_scanpy(adatas, return_dimred=True)
    return correcteds
def integrate_and_correct(adatas, assay="counts"): correcteds = scanorama.correct_scanpy(adatas) correct = correcteds.pop(0) corrected = correct.concatenate(*correcteds, batch_key="batch") return corrected
def scanorama_bc(adatas, n_comps, save_folder, possible_batch_effects, batch_key='library_id'):
    """Apply Scanorama Batch correction

    Scanorama enables batch-correction and integration of heterogeneous
    scRNA-seq datasets.

    Parameters
    ----------
    adatas : annData
        Combined object; must carry a 'highly_variable' var column and a
        categorical obs[batch_key].
    n_comps : int
        No. principal components used when scoring the uncorrected matrix.
    save_folder : str
        Folder where scoring and plotting output is written.
    possible_batch_effects : list, str
        Covariates whose variance contribution is scored after correction.
    batch_key : str
        obs column identifying the batch (default 'library_id').

    Returns
    -------
    annData
        `adatas` with the corrected embedding in obsm['X_emb'] and X
        converted to CSR sparse.
    """
    # 1. Score uncorrectd matrix
    _score_uncorrectd_data(adatas=adatas, n_comps=n_comps, save_folder=save_folder,
                           batch_key=batch_key, possible_batch_effects=possible_batch_effects)

    # 2. Apply scanoroma batch correction method
    # 2.1 Split list into annData objects (highly-variable genes only)
    split = _split_batches(adatas[:, adatas.var['highly_variable']].copy(), batch=batch_key)
    # 2.2 run scanorama batch correction; old-style API returns
    # (per-batch embeddings, per-batch corrected objects) for return_dimred=True
    kwargs = {"return_dimred": True}
    emb, corrected = scanorama.correct_scanpy(split, **kwargs)
    # concatenate corrected adatas and stack the per-batch embeddings
    emb = np.concatenate(emb, axis=0)
    adata_cor = ann.AnnData.concatenate(*corrected, batch_key=batch_key,
                                        batch_categories=adatas.obs[batch_key].cat.categories,).copy()
    adatas.obsm['X_emb'] = emb

    # 2.3 Score correct matrix
    # 2.3.2 Determine No. PCs on the corrected embedding
    pca = pc_determination.pcs_combs(adatas.obsm['X_emb'], save_folder, raw=False,
                                     type_dataset="No_HVG_corrected", use_highly_variable=False,
                                     copy=True, return_info=True)

    # 2.4 Calculate variance after batch correction - might be that the variance increased within a covariate
    dict_r2var = dict()
    # NOTE(review): assumes the concatenation above kept the cells in the
    # same order as `adatas`, so these obs columns line up — confirm.
    adata_cor.obs[batch_key] = adatas.obs[batch_key].values
    # Score variance contribution by batch
    score_variance(adata=adata_cor, obs=batch_key, pca=pca, r2var=dict_r2var)
    for poss_be in possible_batch_effects:
        adata_cor.obs[poss_be] = adatas.obs[poss_be].values
        # Score variance contribution by other covariate
        score_variance(adata=adata_cor, obs=poss_be, pca=pca, r2var=dict_r2var)
    print(dict_r2var)

    # Compute Visualisation of corrected matrix
    # Interactive prompt; falls back to 50 PCs on non-numeric input.
    try:
        n_comps = int(input("Please provide the No. principal components (default 50): "))
    except ValueError:
        n_comps = int(50)
    sc.pp.pca(adata_cor, n_comps=n_comps, use_highly_variable=False, svd_solver='arpack')
    sc.pp.neighbors(adata_cor)
    sc.tl.umap(adata_cor)
    # Plot
    for poss_be in possible_batch_effects:
        plt_pp_plots.plot_batch_correction(adata_cor, save_folder, batch_key="bc_matrix",
                                           possible_batch_effect=poss_be)

    # Compute Visualisation of corrected embedding
    sc.pp.neighbors(adatas, use_rep='X_emb')
    sc.tl.umap(adatas)
    for poss_be in possible_batch_effects:
        plt_pp_plots.plot_batch_correction(adatas, save_folder, batch_key="bc_embedding",
                                           possible_batch_effect=poss_be)

    adatas.X = sparse.csr_matrix(adatas.X)

    return adatas
opt = parser.parse_args()
base_path = opt.base_path
file1 = base_path + 'blood_' + opt.type + '.h5ad'
file2 = base_path + 'bone_' + opt.type + '.h5ad'
adata1 = sc.read_h5ad(file1)
adata2 = sc.read_h5ad(file2)
# adata = anndata.AnnData(X=adata.X.A,obs=adata.obs, var=adata.var)
print(adata1)
print(adata2)
t0 = time.time()
data_list = [adata1, adata2]
# approx=False disables approximate nearest-neighbour search —
# presumably for an exact-mode benchmark; see Scanorama docs to confirm.
integrated, corrected = scanorama.correct_scanpy(data_list, return_dimred=True, approx=False)
# integrated, corrected = scanorama.correct_scanpy(data_list, return_dimred=True)
t1 = time.time()
# (commented-out memory diagnostics, translated from Chinese)
# info = psutil.virtual_memory()
# print('memory used:', psutil.Process(os.getpid()).memory_info().rss/1024/1024/1024, 'GB')
# print('total memory:', info.total/1024/1024/1024, 'GB')
# print('memory percent:', info.percent)
# print('cpu count:', psutil.cpu_count())
t = (t1 - t0) / 60
print("Total time running: %s min" % (str(t)))
# result = [psutil.Process(os.getpid()).memory_info().rss/1024/1024/1024, t1-t0]
# result = np.array(result)