def scrublet_predictions(self, vlm, input_dir, doublet_rate=0.06):
    """Run Scrublet on a 10x matrix directory and attach per-cell doublet
    scores/predictions to a velocyto loom object.

    Parameters
    ----------
    vlm
        Loom object; must expose a ``ca['CellID']`` column attribute whose
        values match the barcodes file.
    input_dir : str
        Directory containing ``matrix.mtx`` and ``barcodes.tsv``.
    doublet_rate : float
        Expected doublet rate passed to Scrublet (default 0.06).

    Returns
    -------
    vlm with ``doublet_scores`` and ``doublet_predictions`` column attributes.
    """
    import scrublet as scr
    import scipy.io
    print('Loading counts matrix {}/matrix.mtx'.format(input_dir), file=sys.stderr)
    counts_matrix = scipy.io.mmread(input_dir + '/matrix.mtx').T.tocsc()
    print("Loading barcodes {}/barcodes.tsv".format(input_dir), file=sys.stderr)
    # BUG FIX: the path was missing the '/' separator and the delimiter was
    # the literal character 't' instead of a tab.
    barcodes = np.array(
        scr.load_genes(input_dir + '/barcodes.tsv', delimiter='\t', column=0))
    print("Initializing scrublet object", file=sys.stderr)
    scrub = scr.Scrublet(
        counts_matrix, expected_doublet_rate=doublet_rate)  # whole counts matrix
    print("Computing doublet predictions", file=sys.stderr)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2, min_cells=3, min_gene_variability_pctl=85,
        n_prin_comps=30)
    # Collapse barcodes, scores, and predictions into a dict keyed by barcode.
    doublet_dict = {
        barcode: [doublet_scores[i], predicted_doublets[i]]
        for i, barcode in enumerate(barcodes)
    }
    # Add doublet score and doublet prediction as column attributes, ordered
    # to match the loom object's CellID column.
    vlm.ca["doublet_scores"] = np.array(
        [doublet_dict[barcode][0] for barcode in vlm.ca['CellID']])
    vlm.ca["doublet_predictions"] = np.array(
        [doublet_dict[barcode][1] for barcode in vlm.ca['CellID']])
    return vlm
def run_scrublet(tenx_h5, doublet_rate=0.06, npca=40, save_to=None):
    """Run Scrublet on a 10x input (either an .h5 file or a directory with
    gzipped mtx/barcodes) and save doublet scores, a histogram figure and the
    detected threshold under the ``save_to`` prefix.

    Parameters
    ----------
    tenx_h5 : str
        Path to a 10x ``.h5`` file, or a directory containing
        ``matrix.mtx.gz`` and ``barcodes.tsv.gz``.
    doublet_rate : float
        Expected doublet rate passed to Scrublet.
    npca : int
        Number of principal components for Scrublet's embedding.
    save_to : str
        Required output prefix; files are written as ``<save_to>doublets.csv``
        etc.

    Raises
    ------
    ValueError if ``save_to`` is not given.
    """
    if not save_to:
        raise ValueError(
            "Please, specify prefix path where to save results to")
    if tenx_h5.endswith(".h5"):
        ds = sc.read_10x_h5(tenx_h5)
        counts_matrix = ds.X.tocsc().astype(np.longlong)
        obs = ds.obs.reset_index()
        obs.columns = ["0"]  # match the header-less barcode table below
    else:
        counts_matrix = scipy.io.mmread(
            gzip.open(tenx_h5 + '/matrix.mtx.gz')).T.tocsc()
        obs = pd.read_table(gzip.open(tenx_h5 + '/barcodes.tsv.gz'),
                            header=None)
    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=doublet_rate)
    doublet_scores, doublets = scrub.scrub_doublets(
        min_counts=2, min_cells=3, min_gene_variability_pctl=85,
        n_prin_comps=npca)
    save_dir = os.path.dirname(save_to)
    # BUG FIX: when save_to has no directory component, os.path.dirname()
    # returns '' and os.makedirs('') raises; only create a real directory.
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)
    obs['doublet'] = doublet_scores
    obs.to_csv(save_to + 'doublets.csv')
    scrub.plot_histogram()
    plt.savefig(save_to + 'doublet_hist.pdf')
    # Never overwrite a previously recorded threshold.
    if not os.path.exists(save_to + 'threshold.txt'):
        with open(save_to + 'threshold.txt', 'w') as f:
            f.write(str(scrub.threshold_))
def run_scrublet(adata, neotic_ratio=.5):
    '''Score doublets with Scrublet and call them at a quantile threshold
    derived from the expected doublet rate.'''
    import scrublet as scr
    from scipy.stats import rankdata

    # Expected doublet fraction scales with cell count (1% per 1000 cells,
    # weighted by neotic_ratio).
    expected_doublet_th = adata.shape[0] / 1000 * .01 * neotic_ratio
    raw_view = adata.raw.copy()
    raw_view = raw_view[:, raw_view.var.index.isin(adata.var_names.tolist())]
    expm1_counts = raw_view.X.expm1()
    del raw_view
    scrub = scr.Scrublet(expm1_counts,
                         expected_doublet_rate=expected_doublet_th)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        distance_metric='cosine',
        mean_center=False,
        n_prin_comps=50,
        log_transform=True,
        min_gene_variability_pctl=0)
    scrub.plot_histogram()
    # Override Scrublet's automatic call: threshold directly at the
    # (1 - expected rate) quantile of the observed scores.
    predicted_doublets = scrub.call_doublets(
        threshold=np.quantile(doublet_scores, 1 - expected_doublet_th))
    print('total predicted doublets:', sum(predicted_doublets))
    print('predicted doublets ratio:',
          sum(predicted_doublets) / len(predicted_doublets))
    adata.obs['doublet_score'] = doublet_scores
    adata.obs['doublet'] = predicted_doublets
    adata.obs['doublet_quantile'] = (
        rankdata(doublet_scores) / len(doublet_scores))
    return adata
def annotate_doublets(mtx_fpath, feature_fpath, expected_doublet_rate2=0.06,
                      make_plots=False):
    """Run Scrublet on a matrix-market counts file.

    Parameters
    ----------
    mtx_fpath : str
        Path to matrix.mtx (transposed to cells x genes internally).
    feature_fpath : str
        Path to the features/genes TSV (gene names in column 1).
    expected_doublet_rate2 : float
        Expected doublet rate passed to Scrublet.
    make_plots : bool
        If True, configure matplotlib fonts and draw the score histogram and
        a UMAP embedding. GENERALIZATION: this code was previously dead
        behind hard-coded ``if False:`` guards; default False preserves the
        original behavior.

    Returns
    -------
    list ``[doublet_scores, predicted_doublets]``.
    """
    if make_plots:
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.sans-serif'] = 'Arial'
        plt.rc('font', size=14)
        plt.rcParams['pdf.fonttype'] = 42
    counts_matrix = scipy.io.mmread(mtx_fpath).T.tocsc()
    genes = np.array(scr.load_genes(feature_fpath, delimiter='\t', column=1))
    print('Counts matrix shape: {} rows, {} columns'.format(
        counts_matrix.shape[0], counts_matrix.shape[1]))
    print('Number of genes in gene list: {}'.format(len(genes)))
    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate2)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2, min_cells=3, min_gene_variability_pctl=85,
        n_prin_comps=30)
    if make_plots:
        scrub.plot_histogram()
        print('Running UMAP...')
        scrub.set_embedding(
            'UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        print('Done.')
        scrub.plot_embedding('UMAP', order_points=True)
    return ([doublet_scores, predicted_doublets])
def Bertie_preclustered(adata,batch_key='batch',cluster_key='louvain'):
    # Per-batch Scrublet scoring with cluster-level doublet assessment (the
    # "Bertie" approach): score each cell, take the median score of its
    # pre-computed cluster, then test cluster medians against a normal null
    # centred on the batch-wide median (scale = 1.4826 * one-sided MAD),
    # correcting the resulting p-values with Benjamini-Hochberg.
    # Writes adata.obs['doublet_scores'] and adata.obs['bh_pval']; returns adata.
    import scrublet as scr
    scorenames = ['scrublet_score','scrublet_cluster_score','bh_pval']
    adata.obs['doublet_scores']=0
    def bh(pvalues):
        '''
        Computes the Benjamini-Hochberg FDR correction.

        Input:
            * pvals - vector of p-values to correct
        '''
        n = int(pvalues.shape[0])
        new_pvalues = np.empty(n)
        # sort (p-value, original index) pairs descending
        values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]
        values.sort()
        values.reverse()
        new_values = []
        for i, vals in enumerate(values):
            rank = n - i
            pvalue, index = vals
            new_values.append((n/rank) * pvalue)
        # enforce monotonicity of the adjusted p-values
        for i in range(0, int(n)-1):
            if new_values[i] < new_values[i+1]:
                new_values[i+1] = new_values[i]
        # scatter the adjusted values back to the original order
        for i, vals in enumerate(values):
            pvalue, index = vals
            new_pvalues[index] = new_values[i]
        return new_pvalues
    for i in np.unique(adata.obs[batch_key]):
        adata_sample = adata[adata.obs[batch_key]==i,:]
        scrub = scr.Scrublet(adata_sample.X)
        doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
        adata_sample.obs['scrublet_score'] = doublet_scores
        # copy() so subsequent .obs assignments hit a real object, not a view
        adata_sample=adata_sample.copy()
        # per-cluster median scrublet score
        for clus in np.unique(adata_sample.obs[cluster_key]):
            adata_sample.obs.loc[adata_sample.obs[cluster_key]==clus, 'scrublet_cluster_score'] = \
                np.median(adata_sample.obs.loc[adata_sample.obs[cluster_key]==clus, 'scrublet_score'])
        med = np.median(adata_sample.obs['scrublet_cluster_score'])
        mask = adata_sample.obs['scrublet_cluster_score']>med
        # MAD computed from the above-median tail only:
        # let's do a one-sided test. the Bertie write-up does not address this
        # but it makes sense
        mad = np.median(adata_sample.obs['scrublet_cluster_score'][mask]-med)
        pvals = 1-scipy.stats.norm.cdf(adata_sample.obs['scrublet_cluster_score'], loc=med, scale=1.4826*mad)
        adata_sample.obs['bh_pval'] = bh(pvals)
        # create results data frame for single sample and copy stuff over from
        # the adata object
        scrublet_sample = pd.DataFrame(0, index=adata_sample.obs_names, columns=scorenames)
        for meta in scorenames:
            scrublet_sample[meta] = adata_sample.obs[meta]
        # write out complete sample scores
        #scrublet_sample.to_csv('scrublet-scores/'+i+'.csv')
        scrub.plot_histogram();
        #plt.savefig('limb/sample_'+i+'_doulet_histogram.pdf')
        # copy the per-batch results back into the full object
        adata.obs.loc[adata.obs[batch_key]==i,'doublet_scores']=doublet_scores
        adata.obs.loc[adata.obs[batch_key]==i,'bh_pval'] = bh(pvals)
        del adata_sample
    return adata
def run_scrublet(adata, resolution_function=None):
    """Score doublets with Scrublet, aggregate scores over fine-grained
    Leiden (sub)clusters, and flag outlier clusters via ``test_outlier``.

    Adds to ``adata.obs``: ``scrublet_score``, ``scrublet_leiden`` (the
    clustering used, exported for SoupX), ``cluster_scrublet_score``,
    ``doublet_pval`` and ``doublet_bh_pval``. Returns None; if Scrublet
    itself fails, the function aborts and leaves ``adata`` untouched.

    Parameters
    ----------
    resolution_function : callable or None
        Maps a cluster size to the Leiden resolution used for sub-clustering;
        defaults to ``max((log10(n) - 1)^2, 0.1)``.
    """
    old_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 1
    if resolution_function is None:
        resolution_function = lambda x: np.maximum(
            np.maximum(np.log10(x) - 1, 0)**2, 0.1)
    scrub = scr.Scrublet(adata.X)
    # this has the potential to brick for poor quality data
    # if so, abort it and everything downstream
    try:
        # BUG FIX: the original unpacked into `ds, pd`, shadowing the pandas
        # alias `pd` inside this function.
        doublet_scores, _ = scrub.scrub_doublets(verbose=False)
    except Exception:
        # BUG FIX: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit and returned without restoring the
        # scanpy verbosity it had modified.
        sc.settings.verbosity = old_verbosity
        return
    adata.obs['scrublet_score'] = doublet_scores
    adata_copy = adata.copy()
    sc.pp.filter_genes(adata_copy, min_cells=3)
    sc.pp.normalize_total(adata_copy, target_sum=1e4)
    sc.pp.log1p(adata_copy)
    sc.pp.highly_variable_genes(adata_copy, min_mean=0.0125, max_mean=3,
                                min_disp=0.5, subset=True)
    sc.pp.scale(adata_copy, zero_center=False)
    sc.pp.pca(adata_copy, svd_solver='arpack', zero_center=False)
    sc.pp.neighbors(adata_copy, n_pcs=30)
    sc.tl.umap(adata_copy)
    sc.tl.leiden(adata_copy, resolution=1)
    # Refine each Leiden cluster at a size-dependent resolution.
    for clst in np.unique(adata_copy.obs['leiden']):
        clst_size = sum(adata_copy.obs['leiden'] == clst)
        sc.tl.leiden(adata_copy, restrict_to=('leiden', [clst]),
                     resolution=resolution_function(clst_size),
                     key_added='leiden_R')
        adata_copy.obs['leiden'] = adata_copy.obs['leiden_R']
    # Median scrublet score per (sub)cluster.
    clst_meds = []
    for clst in np.unique(adata_copy.obs['leiden']):
        k = adata_copy.obs['leiden'] == clst
        clst_med = np.median(adata_copy.obs.loc[k, 'scrublet_score'])
        adata_copy.obs.loc[k, 'cluster_scrublet_score'] = clst_med
        clst_meds.append(clst_med)
    clst_meds = np.array(clst_meds)
    pvals, bh_pvals = test_outlier(clst_meds)
    for i, clst in enumerate(np.unique(adata_copy.obs['leiden'])):
        k = adata_copy.obs['leiden'] == clst
        adata_copy.obs.loc[k, 'pval'] = pvals[i]
        adata_copy.obs.loc[k, 'bh_pval'] = bh_pvals[i]
    sc.settings.verbosity = old_verbosity
    # need to also export the clustering, for soupx purposes
    adata.obs['scrublet_leiden'] = adata_copy.obs['leiden']
    adata.obs['scrublet_score'] = adata_copy.obs['scrublet_score']
    adata.obs['cluster_scrublet_score'] = adata_copy.obs[
        'cluster_scrublet_score']
    adata.obs['doublet_pval'] = adata_copy.obs['pval']
    adata.obs['doublet_bh_pval'] = adata_copy.obs['bh_pval']
    del adata_copy
def main():
    """Parse CLI options, run Scrublet on a 10x matrix directory, and write a
    per-cell doublet table (cellID, score, Scrublet's own call, and a
    quantile-based call derived from the expected rate)."""
    # parse command line options
    parser = OptionParser()
    parser.add_option("--inputDir", "-i", dest="input_dir", default=None,
                      help=("Directory of input matrix in 10x cellranger format"))
    parser.add_option("--outFile", "-o", dest="out_file", default=None,
                      help=("Path for output file [default: $i/scrublet_table.tsv]"))
    parser.add_option("--cellranger2", "-2", dest="cellranger2",
                      action="store_true", default=False,
                      help="Use it for cellranger v2 instead of v3")
    parser.add_option("--expected_rate", "-r", dest="expected_rate",
                      default=None,
                      help="Expected doublet rate: [default: n_cell/100K].")
    parser.add_option("--homotypicP", dest="homotypic_prop", default=0.15,
                      type=float,
                      help="Proportion of homotypic doublets: [default: %default].")
    (options, args) = parser.parse_args()
    dat_path = os.path.abspath(options.input_dir)
    version3 = options.cellranger2 == False
    mat_dat, gene_ids, cell_ids = load_10X(dat_path, min_counts=None,
                                           min_cells=None, version3=version3)
    n_cell = mat_dat.shape[1]
    # Default expected rate: one doublet per 100k cells loaded, capped at 50%.
    if options.expected_rate is None:
        expected_rate = n_cell / 100000.0
    else:
        expected_rate = float(options.expected_rate)
    expected_rate = min(expected_rate, 0.5)
    homotypic_prop = min(max(options.homotypic_prop, 0.01), 0.99)
    print("Files loaded: %d cells." %(n_cell))
    print("Expected doublet rate: %.3f" %(expected_rate))
    scrub = scr.Scrublet(mat_dat.transpose(),
                         expected_doublet_rate=expected_rate)
    raw_scores, raw_doublet = scrub.scrub_doublets(n_prin_comps=30)
    simu_scores = scrub.doublet_scores_sim_
    # when there is no suggested threshold
    if raw_doublet is None:
        raw_doublet = np.array([None] * len(raw_scores))
    # Alternative call: label the top (1 - homotypic) * expected fraction.
    _cutoff = np.quantile(raw_scores,
                          1 - (1 - homotypic_prop) * expected_rate)
    label_frac = raw_scores >= _cutoff
    if options.out_file is None:
        out_file = dat_path + "/scrublet_table.tsv"
    else:
        out_file = options.out_file
    # FIX: use a context manager so the file handle is closed even if a
    # write fails part-way through.
    with open(out_file, "w") as fid:
        fid.writelines("cellID\tscore\tlabel_raw\tlabel_frac\n")
        for i in range(len(cell_ids)):
            out_list = [cell_ids[i], "%.3f" %raw_scores[i],
                        str(raw_doublet[i]), str(label_frac[i])]
            fid.writelines("\t".join(out_list) + "\n")
def run_scrublet_atac(input_dir):
    """Run Scrublet on the ATAC counts matrix in ``input_dir`` and write the
    predicted-doublet mask and per-cell scores alongside the input."""
    mat = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
    n_rows, n_cols = mat.shape
    print('Counts matrix shape: {} rows, {} columns'.format(n_rows, n_cols))
    scrub = scr.Scrublet(mat, expected_doublet_rate=0.05)
    scrub.scrub_doublets(min_counts=2, min_cells=3,
                         min_gene_variability_pctl=85, n_prin_comps=30)
    # Results are read back from the fitted object's attributes.
    np.savetxt(input_dir + 'predicted_doublet_mask.txt',
               scrub.predicted_doublets_, fmt='%s')
    np.savetxt(input_dir + 'doublet_scores.txt',
               scrub.doublet_scores_obs_, fmt='%.4f')
def run_scrublet_rna(input_dir):
    """Run Scrublet on the RNA counts matrix in ``input_dir`` and write the
    predicted-doublet mask and per-cell scores alongside the input."""
    mat = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
    gene_names = np.array(
        scr.load_genes(input_dir + 'features.tsv', delimiter='\t', column=1))
    print('Counts matrix shape: {} rows, {} columns'.format(
        mat.shape[0], mat.shape[1]))
    print('Number of genes in gene list: {}'.format(len(gene_names)))
    scrub = scr.Scrublet(mat, expected_doublet_rate=0.05)
    scrub.scrub_doublets(min_counts=2, min_cells=3,
                         min_gene_variability_pctl=85, n_prin_comps=30)
    # Results are read back from the fitted object's attributes.
    np.savetxt(input_dir + 'predicted_doublet_mask.txt',
               scrub.predicted_doublets_, fmt='%s')
    np.savetxt(input_dir + 'doublet_scores.txt',
               scrub.doublet_scores_obs_, fmt='%.4f')
def doublet(adata, key='Sample'):
    '''detecting doublet using scrublet per key'''
    score_by_cell = {}
    call_by_cell = {}
    # Score each sample's cells independently, then map results back by name.
    for sample_id in set(adata.obs[key]):
        print(sample_id)
        subset = adata[adata.obs[key] == sample_id].copy()
        scrub = scr.Scrublet(subset.X)
        scores, calls = scrub.scrub_doublets(verbose=False)
        for cell, score, call in zip(subset.obs_names, scores, calls):
            score_by_cell[cell] = score
            call_by_cell[cell] = call
    adata.obs['doublet_score'] = [
        score_by_cell[obs_name] for obs_name in list(adata.obs_names)]
    adata.obs['doublet_predict'] = [
        call_by_cell[obs_name] for obs_name in list(adata.obs_names)]
def run_scrublet(sample_name, counts_matrix):
    """Run Scrublet on ``counts_matrix`` and write per-cell results.

    Outputs ``<args.sample_name>-scrublet_hist.png`` and
    ``<args.sample_name>-scrublet_table.csv``; on failure a placeholder image
    with the error text is written instead. Returns 0 on success and -1 when
    the matrix is empty.
    """
    print('run_scrublet.py: run_scrublet: begin')
    warnings.showwarning = handle_warning
    # Empty matrix: emit a placeholder histogram image and bail out.
    if(numpy.size(counts_matrix, 0) == 0 or numpy.size(counts_matrix, 1) == 0):
        filename = args.sample_name + "-scrublet_hist.png"
        image = Image.new(mode = "RGB", size = (800,600), color = "white")
        draw = ImageDraw.Draw(image)
        draw.text((50,50), "Scrublet failed. This is generally because there aren't enough cells with sufficient reads.\n", fill = "black")
        # BUG FIX: the placeholder image was drawn but never written to disk.
        image.save(filename)
        return(-1)
    # Transpose to the orientation Scrublet expects (rows: cells, cols: genes).
    if(not scipy.sparse.isspmatrix_csc(counts_matrix)):
        counts_matrix = counts_matrix.T.tocsc()
    else:
        # NOTE(review): transposing a CSC matrix yields CSR here; presumably
        # acceptable downstream — confirm this branch is exercised.
        counts_matrix = counts_matrix.T
    scrub = scr.Scrublet(counts_matrix)
    try:
        doublet_scores, predicted_doublets = scrub.scrub_doublets()
        scrub.plot_histogram()[0].savefig(args.sample_name + "-scrublet_hist.png")
        all_scores = numpy.vstack((doublet_scores, predicted_doublets))
        all_scores = numpy.transpose(all_scores)
        numpy.savetxt(args.sample_name + "-scrublet_table.csv", all_scores,
                      delimiter=",", fmt='%.8e,%d')
    except (ZeroDivisionError, FloatingPointError, ValueError) as eobj:
        # Hard failure inside Scrublet: write an explanatory image and an
        # all-NA table so downstream steps still find their inputs.
        tb_str = traceback.format_exc()
        print('%s' % ( tb_str ), file=sys.stderr)
        temp = numpy.array(["NA"] * numpy.size(counts_matrix, 0))
        all_scores = numpy.vstack((temp, temp))
        all_scores = numpy.transpose(all_scores)
        filename = args.sample_name + "-scrublet_hist.png"
        image = Image.new(mode = "RGB", size = (800,600), color = "white")
        draw = ImageDraw.Draw(image)
        draw.text((50,50), "Scrublet failed. This is generally because there aren't enough cells with sufficient reads.\n\nFailure message:\n\n" + tb_str, fill = "black")
        image.save(filename)
        numpy.savetxt(args.sample_name + "-scrublet_table.csv", all_scores,
                      fmt="%s", delimiter=",")
    except (AttributeError) as eobj:
        # Automatic threshold detection failed; retry with a fixed threshold.
        # NOTE(review): assumes doublet_scores was assigned before the
        # AttributeError was raised — confirm against scrublet internals.
        tb_str = traceback.format_exc()
        print('%s' % ( tb_str ), file=sys.stderr)
        predicted_doublets = scrub.call_doublets(threshold=0.15)
        scrub.plot_histogram()[0].savefig(args.sample_name + "-scrublet_hist.png")
        all_scores = numpy.vstack((doublet_scores, predicted_doublets))
        all_scores = numpy.transpose(all_scores)
        numpy.savetxt(args.sample_name + "-scrublet_table.csv", all_scores,
                      delimiter=",", header='doublet_score,doublet')
    print('run_scrublet.py: run_scrublet: end')
    return( 0 )
def score_doublets(mtx, doublet_rate):
    """Score doublets with Scrublet, overriding its threshold when the
    automatic call is clearly too conservative.

    Parameters
    ----------
    mtx
        Counts matrix, transposed internally for Scrublet.
    doublet_rate : float
        Expected doublet fraction.

    Returns
    -------
    Boolean doublet calls from ``call_doublets``.
    """
    scrub = scr.Scrublet(mtx.T, expected_doublet_rate=doublet_rate)
    doublet_scores, predicted_doublets = scrub.scrub_doublets()

    def manual_threshold(scores):
        # Threshold at the lowest score within the top `doublet_rate`
        # fraction of cells.
        top_n = int(doublet_rate * len(scores))
        sorted_scores = np.sort(scores)
        if top_n <= 0:
            # BUG FIX: with very few cells top_n is 0, the slice below is
            # empty and .min() raises; fall back to the maximum score so
            # effectively no extra cells are labelled.
            return sorted_scores[-1]
        threshold = sorted_scores[len(scores) - top_n:].min()
        return threshold

    # scrublet can be conservative -- making sure I get most doublets
    if scrub.threshold_ > 0.3 and sum(
            predicted_doublets) < (doublet_rate * len(doublet_scores)) / 2:
        threshold = manual_threshold(doublet_scores)
        doublets = scrub.call_doublets(threshold=threshold)
    else:
        doublets = scrub.call_doublets()
    return doublets
def identify_doublets(data, **kw):
    """Detect doublets in single-cell RNA-seq data

    https://github.com/AllonKleinLab/scrublet
    """
    import scrublet as scr
    working = data.copy()
    totals = working.X.sum(0)
    # Sparse matrix sums come back as np.matrix; flatten to a 1-D array.
    if hasattr(totals, 'A'):
        totals = totals.A.squeeze()
    # Drop near-silent genes before scoring.
    working = working[:, totals > 3]
    scrub = scr.Scrublet(working.X, **kw)
    scores, calls = scrub.scrub_doublets()
    if calls is None:
        # No automatic threshold was found; fall back to a fixed cutoff.
        calls = scrub.call_doublets(threshold=0.34)
    data.obs['doublet_score'] = scores
    data.obs['predicted_doublets'] = calls
    return data
def detectDoublet(args):
    """Run Scrublet, then fit a 2-component Bayesian Gaussian mixture to the
    simulated doublet scores to convert observed scores into doublet
    probabilities. Writes four tab-separated lines to ``args.output``:
    probabilities, threshold, observed scores, simulated scores."""
    counts_matrix = readMatrix(args.input, binary=False)
    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=0.06,
                         sim_doublet_ratio=3,
                         n_neighbors=25)
    doublet_scores, _ = scrub.scrub_doublets(
        min_counts=1,
        min_cells=3,
        min_gene_variability_pctl=85,
        mean_center=True,
        normalize_variance=True,
        n_prin_comps=min(30, counts_matrix.get_shape()[0] // 10))

    # Fit a Gaussian mixture model to the simulated doublet scores.
    sim_scores = np.array([scrub.doublet_scores_sim_]).T
    gmm = BayesianGaussianMixture(n_components=2, max_iter=1000,
                                  random_state=2394).fit(sim_scores)
    # The component with the larger mean represents doublets.
    top = np.argmax(gmm.means_)
    probs_sim = gmm.predict_proba(sim_scores)[:, top]
    # Threshold = smallest simulated score confidently (>0.5) called doublet.
    confident = sim_scores[np.argwhere(probs_sim > 0.5)].flatten()
    if confident.size == 0:
        threshold = np.amax(sim_scores.flatten())
    else:
        threshold = min(confident)
    obs_scores = np.array([doublet_scores]).T
    probs = gmm.predict_proba(obs_scores)[:, top].tolist()
    with open(args.output, 'w') as fl:
        fl.write('\t'.join(map(str, probs)))
        fl.write("\n")
        fl.write(str(threshold))
        fl.write("\n")
        fl.write('\t'.join(map(str, (doublet_scores.tolist()))))
        fl.write("\n")
        fl.write('\t'.join(map(str, scrub.doublet_scores_sim_)))
def scrublet(adata, expected_rate=0.06, doublet_score=None):
    """Run Scrublet on ``adata.X`` and store scores and calls in ``.obs``.

    Parameters
    ----------
    expected_rate : float
        Expected doublet rate passed to Scrublet.
    doublet_score : float or None
        Optional manual score cutoff. If None, Scrublet's own calls are kept.

    Side effects: writes ``adata.obs['doublet_scores']`` and
    ``adata.obs['predicted_doublets']``, prints a summary, and draws the
    score histogram.
    """
    import scrublet as scr
    import numpy as np
    scrub = scr.Scrublet(adata.X, expected_doublet_rate=expected_rate)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2, min_cells=3, min_gene_variability_pctl=85,
        n_prin_comps=30)
    adata.obs['doublet_scores'] = doublet_scores
    adata.obs['predicted_doublets'] = predicted_doublets
    # BUG FIX: the original unconditionally compared scores to the default
    # None (`scores > None` raises TypeError on Python 3). Only override
    # Scrublet's own calls when a manual cutoff is supplied.
    if doublet_score is not None:
        adata.obs['predicted_doublets'] = (
            adata.obs['doublet_scores'] > doublet_score)
    print('Nr of predicted doublets ',
          np.sum(adata.obs['predicted_doublets']))
    print('Doublets indices saved in adata.obs["predicted_doublets"]')
    scrub.plot_histogram()
def scrublet_py(i, j, val, dim, expected_doublet_rate, min_counts, min_cells,
                min_gene_variability_pctl, n_prin_comps, sim_doublet_ratio,
                n_neighbors):
    """Build a CSC matrix from COO triplets (i, j, val) of shape ``dim`` and
    run Scrublet on it. The explicit int() coercions accommodate callers that
    pass numeric parameters as floats (e.g. an R/reticulate bridge).

    Returns (doublet_scores, predicted_doublets).
    """
    import matplotlib
    matplotlib.use('agg')  # headless backend: no display available
    import scrublet as scr
    import scipy.io
    import numpy as np
    import os
    from scipy.sparse import csc_matrix

    counts = csc_matrix((val, (i, j)), shape=dim)
    scrub = scr.Scrublet(counts,
                         expected_doublet_rate=expected_doublet_rate,
                         sim_doublet_ratio=int(sim_doublet_ratio),
                         n_neighbors=int(n_neighbors))
    scores, calls = scrub.scrub_doublets(
        min_counts=int(min_counts),
        min_cells=int(min_cells),
        min_gene_variability_pctl=min_gene_variability_pctl,
        n_prin_comps=int(n_prin_comps))
    return (scores, calls)
def dedoublets(adata, edr=0.1, npc=30, pctl=85, pl=False, f_out_fig=None, dpi=300):
    """Score doublets with Scrublet and return a copy of ``adata`` with the
    predicted doublets removed; optionally save the score histogram.

    Note: a temporary 'doublets' column is written to the *input* object's
    ``.obs`` before the filtered copy is taken (original behavior preserved).
    """
    scrub = scr.Scrublet(adata.X, expected_doublet_rate=edr)
    scores, is_doublet = scrub.scrub_doublets(
        min_gene_variability_pctl=pctl, n_prin_comps=npc)
    # 1. remove doublets
    adata.obs['doublets'] = is_doublet
    kept = adata[adata.obs['doublets'] == False, :].copy()
    # 2. drop the temporary doublets column from the filtered copy
    kept.obs = kept.obs.drop('doublets', axis=1)
    # 3. optional diagnostic plot
    if pl:
        scrub.plot_histogram()
        plt.savefig(f_out_fig, dpi=dpi)
        plt.close()
    return kept
def anndata_from_mtx(outpath, name):
    """Load a 10x-style mtx/genes/barcodes trio from ``outpath`` into an
    AnnData object, annotate Scrublet doublet scores plus a minimum-based
    threshold, and write the result to ``<outpath>/<name>_raw.h5ad``."""
    import numpy as np
    import pandas as pd
    import scanpy.api as sc
    import scrublet as scr
    from scipy import sparse
    from skimage.filters import threshold_minimum

    sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.logging.print_versions()
    if not name:
        name = 'scanpy'
    results_file = os.path.join(outpath, name + '_raw.h5ad')
    sc.settings.set_figure_params(dpi=80)

    adata = sc.read(os.path.join(outpath, 'matrix.mtx'), cache=False).T  # transpose the data
    adata.var_names = pd.read_csv(os.path.join(outpath, 'genes.tsv'),
                                  header=None, sep='\t')[0]
    adata.obs_names = pd.read_csv(os.path.join(outpath, 'barcodes.tsv'),
                                  header=None, sep='\t')[0]
    adata.var_names_make_unique()

    cells_by_genes = sparse.csc_matrix(adata.X)
    # Expected doublet rate grows linearly with the number of cells.
    scrub = scr.Scrublet(
        cells_by_genes,
        expected_doublet_rate=round(cells_by_genes.shape[0] / 125000, 4))
    scrub.scrub_doublets(min_counts=2, min_cells=3,
                         min_gene_variability_pctl=85, n_prin_comps=30)
    # Threshold at the minimum between the two modes of the simulated scores.
    adata.uns['doublet_threshold'] = threshold_minimum(scrub.doublet_scores_sim_)
    adata.obs['doublet_score'] = scrub.doublet_scores_obs_
    adata.write_h5ad(results_file)
    return adata
def scrublet_c(sample, inDir, outDir, expected_doublet_rate,
               sim_doublet_ratio, ratio_df, out_df):
    """Run Scrublet for one sample, save histogram and UMAP figures under
    ``outDir``, and record the detected doublet rate plus per-cell results in
    the two supplied data frames. Returns (ratio_df, out_df)."""
    print(sample, "start scrublet")
    matrix = scipy.io.mmread(os.path.join(inDir, 'matrix.mtx')).T.tocsc()
    gene_names = np.array(
        scr.load_genes(os.path.join(inDir, 'genes.tsv'),
                       delimiter='\t', column=1))
    scrub = scr.Scrublet(matrix,
                         expected_doublet_rate=expected_doublet_rate,
                         sim_doublet_ratio=sim_doublet_ratio)
    scores, calls = scrub.scrub_doublets(
        min_counts=2, min_cells=3, min_gene_variability_pctl=85,
        n_prin_comps=30)
    scrub.plot_histogram()
    plt.savefig(
        os.path.join(
            outDir,
            "{0}_scrublet_doublet_score_histogram.pdf".format(sample)))
    print(sample, 'Running scrublet UMAP...')
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    print(sample, 'scrublet Done.')
    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(os.path.join(outDir, "{0}_scrublet_UMAP.pdf".format(sample)))
    print(sample, "Done scrublet")
    ratio_df.loc['scrublet', sample] = scrub.detected_doublet_rate_
    out_df['scrublet_doublet_scores'] = scores
    out_df['scrublet_doublets'] = calls
    return ratio_df, out_df
def detect(self):
    """Load the AnnData (h5ad, falling back to a 10x mtx directory), run
    Scrublet on the raw counts, and store per-cell score/error columns in
    ``self.adata.obs``; keeps the fitted object on ``self.scrub``."""
    try:
        self.adata = sc.read_h5ad(self._adata)
    except Exception:
        # BUG FIX: narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Non-h5ad inputs fall back to 10x mtx.
        self.adata = sc.read_10x_mtx(self._adata)
    print("### Initialize Scruble")
    counts_matrix = self.adata.raw.X
    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=self._expected_doublet_rate,
                         sim_doublet_ratio=1.0)
    print("### Detect ,Nomalize,PCA")
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=self._min_counts,
        min_cells=self._min_cells,
        min_gene_variability_pctl=85,
        n_prin_comps=self._n_prin_comps)
    self.adata.obs["doublet_scores_obs"] = doublet_scores
    self.adata.obs["doublet_errors_obs"] = scrub.doublet_errors_obs_
    self.adata.obs["doublet_errors_sim"] = scrub.doublet_errors_sim_
    self.adata.obs["doublet_scores_sim"] = scrub.doublet_scores_sim_
    self.scrub = scrub
# --- Script fragment: run Scrublet on a pre-combined human counts matrix and
# save score-histogram figures (hard-coded cluster file paths). ---
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42
#filtered
#input_dir = '/share/ScratchGeneral/briglo/scRNA/venchi/data/hg19/VENCHI_SampleBCITE/outs/filtered_feature_bc_matrix/'
input_dir = '/share/ScratchGeneral/briglo/scRNA/venchi/outputs/seurat/'
counts_matrix = scipy.io.mmread(input_dir + 'combined.human.mtx').T.tocsc()
genes = np.array(scr.load_genes(input_dir + 'genes.tsv', delimiter='\t', column=0))
print('Counts matrix shape: {} rows, {} columns'.format(counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))
#Counts matrix shape: 12865 rows, 32738 columns
#Number of genes in gene list: 32738
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)
doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2, min_cells=3, min_gene_variability_pctl=85, n_prin_comps=30, get_doublet_neighbor_parents=True)
scrub.plot_histogram();
plt.savefig('/share/ScratchGeneral/briglo/scRNA/venchi/plt.png')
# NOTE(review): a manual threshold is applied and the histogram re-saved to
# the SAME path, overwriting the first figure — confirm this is intended.
scrub.call_doublets(threshold=0.24)
scrub.plot_histogram();
plt.savefig('/share/ScratchGeneral/briglo/scRNA/venchi/plt.png')
print('Running UMAP...')
# scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
# --- Script fragment: filter low-count genes from a 10x PBMC matrix and run
# Scrublet on the result. Note the unconventional `rc` alias for scrublet. ---
import scrublet as rc
import matplotlib.pyplot as plt
import scipy.io
from scipy.sparse import csc_matrix
import numpy as np
wd = "/restricted/projectnb/camplab/home/syyang/contamination/data/pbmc/4k/"
counts = scipy.io.mmread( wd + 'data/matrix.mtx' )
# Keep genes (rows) that have a count > 2 in more than 2 cells.
geneIndex = ((counts > 2).sum( axis = 1 ) > 2 )
counts_filter = counts.toarray()[ np.array(geneIndex).reshape(-1), : ]
print( counts_filter.shape)
# Transpose to cells x genes, the orientation Scrublet expects.
counts_csc = csc_matrix( counts_filter.T )
# Subset the gene-name array with the same filter mask.
genes = np.array( rc.load_genes( wd + 'data/genes.tsv', delimiter='\t', column=1)) [ np.array(geneIndex).reshape(-1) ]
scrub = rc.Scrublet(counts_csc, expected_doublet_rate=0.06)
doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2, min_cells=3, min_gene_variability_pctl=85, n_prin_comps=30)
np.save( "doublet_scores.npy", doublet_scores )
# `* 1` converts the boolean mask to 0/1 integers before saving.
np.save( "predicted_doublets.npy", predicted_doublets * 1 )
# --- Script fragment: combine two 10x libraries and score doublets per
# library with Scrublet. ---
data = libraries['SIGAG4'].concatenate([libraries['SIGAH4']])
data.obs['organ'] = 'PB'
# Predict and remove putative doublets
scrub = {}
doublet_scores = {}
predicted_doublets = {}
# expected multiplet rate
# NOTE(review): `emr` is declared but never passed to Scrublet below —
# confirm whether expected_doublet_rate was meant to be supplied.
emr = {'SIGAG4': 0.076, 'SIGAH4': 0.076}
for sample in samples:
    print(sample)
    scrub[sample] = scr.Scrublet(
        data[np.array(data.obs['library'] == sample), :].X)
    doublet_scores[sample], predicted_doublets[sample] = scrub[
        sample].scrub_doublets()
print('\n\n')
# Report the number of called doublets per library.
for sample in samples:
    print(sample, ':', sum(predicted_doublets[sample]))
sample = 'SIGAG4'
scrub[sample].plot_histogram()
sample = 'SIGAH4'
scrub[sample].plot_histogram()
data_doublets = os.path.join(sc.settings.writedir, '..', 'doublets')
# NOTE(review): the body of this `if` continues beyond this excerpt.
if not os.path.exists(data_doublets):
raise ArgumentError("You need to supply a working directory, a sample metadata file and a genome build!") # return argument values return wd,sampleID,genome_builds working_dir,sampleID,genomes= parse_arguments(sys.argv) ## Perform doublet detection for each sample sequencially for genome in genomes.split(','): if os.path.isfile(working_dir+'/count/'+sampleID+'/outs/filtered_feature_bc_matrix/matrix.mtx.gz'): matrix_path = working_dir+'/count/'+sampleID+'/outs/filtered_feature_bc_matrix/matrix.mtx.gz' raw_counts = mmread(matrix_path).T.tocsc() scrub = scr.Scrublet(raw_counts, expected_doublet_rate=0.06) doublet_scores, predicted_doublets = scrub.scrub_doublets() output_dir=working_dir+'/count/'+sampleID+'/outs/analysis/doubletdetection' if not os.path.isdir(output_dir): os.makedirs(output_dir) output_doublets = open(output_dir+'/'+sampleID+'_'+genome+'_scrublet_doublets.txt','w') if os.path.isfile(working_dir+'/count/'+sampleID+'/outs/filtered_feature_bc_matrix/barcodes.tsv.gz'): barcode_path = working_dir+'/count/'+sampleID+'/outs/filtered_feature_bc_matrix/barcodes.tsv.gz' barcodesList=list() for line in gzip.open(barcode_path, 'rb'): barcodesList.append(line.rstrip())
def scrublet_simulate_doublets(
    adata: AnnData,
    layer=None,
    sim_doublet_ratio: float = 2.0,
    synthetic_doublet_umi_subsampling: float = 1.0,
    random_seed: int = 0,
) -> AnnData:
    """\
    Simulate doublets by adding the counts of random observed transcriptome pairs.

    Parameters
    ----------
    adata
        Annotated data matrix (``n_obs`` × ``n_vars``); rows are cells,
        columns are genes. Genes should already be filtered for expression
        and variability.
    layer
        Layer of ``adata`` holding raw values, or 'X' if values are in ``.X``.
    sim_doublet_ratio
        Number of doublets to simulate relative to the number of observed
        transcriptomes.
    synthetic_doublet_umi_subsampling
        UMI sampling rate when creating synthetic doublets. At 1.0 each
        doublet is the plain sum of two randomly sampled observed
        transcriptomes; below 1.0 the summed counts are subsampled at this
        rate.

    Returns
    -------
    An :class:`~anndata.AnnData` with simulated doublets in ``.X``, doublet
    parent pairs in ``.obsm['doublet_parents']`` and the parameters used in
    ``.uns['scrublet']['parameters']``.

    See also
    --------
    :func:`~scanpy.external.pp.scrublet`: Main way of running Scrublet.
    :func:`~scanpy.external.pl.scrublet_score_distribution`: Plot the doublet
    score histogram.
    """
    # NOTE(review): `random_seed` is accepted but not forwarded to Scrublet
    # here (original behavior preserved).
    try:
        import scrublet as sl
    except ImportError:
        raise ImportError(
            'Please install scrublet: `pip install scrublet` or `conda install scrublet`.'
        )
    raw_counts = _get_obs_rep(adata, layer=layer)
    scrub = sl.Scrublet(raw_counts)
    scrub.simulate_doublets(
        sim_doublet_ratio=sim_doublet_ratio,
        synthetic_doublet_umi_subsampling=synthetic_doublet_umi_subsampling,
    )
    simulated = AnnData(scrub._E_sim)
    simulated.obs['n_counts'] = scrub._total_counts_sim
    simulated.obsm['doublet_parents'] = scrub.doublet_parents_
    simulated.uns['scrublet'] = {
        'parameters': {'sim_doublet_ratio': sim_doublet_ratio}
    }
    return simulated
def _scrublet_call_doublets(
    adata_obs: AnnData,
    adata_sim: AnnData,
    n_neighbors: Optional[int] = None,
    expected_doublet_rate: float = 0.05,
    stdev_doublet_rate: float = 0.02,
    mean_center: bool = True,
    normalize_variance: bool = True,
    n_prin_comps: int = 30,
    use_approx_neighbors: bool = True,
    knn_dist_metric: str = 'euclidean',
    get_doublet_neighbor_parents: bool = False,
    threshold: Optional[float] = None,
    random_state: int = 0,
    verbose: bool = True,
) -> AnnData:
    """\
    Core function for predicting doublets using Scrublet [Wolock19]_.

    Predict cell doublets using a nearest-neighbor classifier of observed
    transcriptomes and simulated doublets. This is a wrapper around the core
    functions of `Scrublet <https://github.com/swolock/scrublet>`__ to allow
    for flexibility in applying Scanpy filtering operations upstream. Unless
    you know what you're doing you should use the main scrublet() function.

    .. note::
        More information and bug reports
        `here <https://github.com/swolock/scrublet>`__.

    Parameters
    ----------
    adata_obs
        The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
        correspond to cells and columns to genes. Should be normalised with
        scanpy.pp.normalize_total() and filtered to include only highly
        variable genes.
    adata_sim
        Anndata object generated by
        sc.external.pp.scrublet_simulate_doublets(), with same number of vars
        as adata_obs. This should have been built from adata_obs after
        filtering genes and cells and selecting highly-variable genes.
    n_neighbors
        Number of neighbors used to construct the KNN graph of observed
        transcriptomes and simulated doublets. If ``None``, this is
        automatically set to ``np.round(0.5 * np.sqrt(n_obs))``.
    expected_doublet_rate
        The estimated doublet rate for the experiment.
    stdev_doublet_rate
        Uncertainty in the expected doublet rate.
    mean_center
        If True, center the data such that each gene has a mean of 0.
        `sklearn.decomposition.PCA` will be used for dimensionality
        reduction.
    normalize_variance
        If True, normalize the data such that each gene has a variance of 1.
        `sklearn.decomposition.TruncatedSVD` will be used for dimensionality
        reduction, unless `mean_center` is True.
    n_prin_comps
        Number of principal components used to embed the transcriptomes prior
        to k-nearest-neighbor graph construction.
    use_approx_neighbors
        Use approximate nearest neighbor method (annoy) for the KNN
        classifier.
    knn_dist_metric
        Distance metric used when finding nearest neighbors. For list of
        valid values, see the documentation for annoy (if
        `use_approx_neighbors` is True) or sklearn.neighbors.NearestNeighbors
        (if `use_approx_neighbors` is False).
    get_doublet_neighbor_parents
        If True, return the parent transcriptomes that generated the doublet
        neighbors of each observed transcriptome. This information can be
        used to infer the cell states that generated a given doublet state.
    threshold
        Doublet score threshold for calling a transcriptome a doublet. If
        `None`, this is set automatically by looking for the minimum between
        the two modes of the `doublet_scores_sim_` histogram. It is best
        practice to check the threshold visually using the
        `doublet_scores_sim_` histogram and/or based on co-localization of
        predicted doublets in a 2-D embedding.
    random_state
        Initial state for doublet simulation and nearest neighbors.
    verbose
        If True, print progress updates.

    Returns
    -------
    ``adata_obs``, modified in place and returned, with fields added:

    ``.obs['doublet_score']``
        Doublet scores for each observed transcriptome
    ``.obs['predicted_doublet']``
        Boolean indicating predicted doublet status (``False`` for all cells
        when no threshold could be determined)
    ``.uns['scrublet']['doublet_scores_sim']``
        Doublet scores for each simulated doublet transcriptome
    ``.uns['scrublet']['doublet_parents']``
        Pairs of ``.obs_names`` used to generate each simulated doublet
        transcriptome
    ``.uns['scrublet']['parameters']``
        Dictionary of Scrublet parameters
    """
    try:
        import scrublet as sl
    except ImportError:
        raise ImportError(
            'Please install scrublet: `pip install scrublet` or `conda install scrublet`.'
        )

    # Estimate n_neighbors if not provided, and create scrublet object.
    if n_neighbors is None:
        n_neighbors = int(round(0.5 * np.sqrt(adata_obs.shape[0])))

    # Note: Scrublet() will sparse adata_obs.X if it's not already, but this
    # matrix won't get used if we pre-set the normalised slots.
    scrub = sl.Scrublet(
        adata_obs.X,
        n_neighbors=n_neighbors,
        expected_doublet_rate=expected_doublet_rate,
        stdev_doublet_rate=stdev_doublet_rate,
        random_state=random_state,
    )

    # Ensure normalised matrix sparseness as Scrublet does
    # https://github.com/swolock/scrublet/blob/67f8ecbad14e8e1aa9c89b43dac6638cebe38640/src/scrublet/scrublet.py#L100
    # NOTE(review): writing to the private slots _E_obs_norm/_E_sim_norm
    # bypasses Scrublet's own normalisation — this couples us to Scrublet
    # internals; confirm against the pinned scrublet version.
    scrub._E_obs_norm = sparse.csc_matrix(adata_obs.X)
    scrub._E_sim_norm = sparse.csc_matrix(adata_sim.X)
    scrub.doublet_parents_ = adata_sim.obsm['doublet_parents']

    # Call scrublet-specific preprocessing where specified. The three
    # pipeline_* helpers mutate `scrub` in place; order of checks makes
    # z-scoring take precedence when both options are requested.
    if mean_center and normalize_variance:
        sl.pipeline_zscore(scrub)
    elif mean_center:
        sl.pipeline_mean_center(scrub)
    elif normalize_variance:
        sl.pipeline_normalize_variance(scrub)

    # Do PCA. Scrublet fits to the observed matrix and decomposes both
    # observed and simulated based on that fit, so we'll just let it do its
    # thing rather than trying to use Scanpy's PCA wrapper of the same
    # functions.
    if mean_center:
        logg.info('Embedding transcriptomes using PCA...')
        sl.pipeline_pca(
            scrub, n_prin_comps=n_prin_comps, random_state=scrub.random_state
        )
    else:
        # Truncated SVD avoids densifying the sparse matrix when not
        # mean-centering.
        logg.info('Embedding transcriptomes using Truncated SVD...')
        sl.pipeline_truncated_svd(
            scrub, n_prin_comps=n_prin_comps, random_state=scrub.random_state
        )

    # Score the doublets with the KNN classifier.
    scrub.calculate_doublet_scores(
        use_approx_neighbors=use_approx_neighbors,
        distance_metric=knn_dist_metric,
        get_doublet_neighbor_parents=get_doublet_neighbor_parents,
    )

    # Actually call doublets (sets threshold_/predicted_doublets_ on success).
    scrub.call_doublets(threshold=threshold, verbose=verbose)

    # Store results in AnnData for return
    adata_obs.obs['doublet_score'] = scrub.doublet_scores_obs_

    # Store doublet Scrublet metadata. sim_doublet_ratio is read back from
    # adata_sim if scrublet_simulate_doublets() recorded it, else None.
    adata_obs.uns['scrublet'] = {
        'doublet_scores_sim': scrub.doublet_scores_sim_,
        'doublet_parents': adata_sim.obsm['doublet_parents'],
        'parameters': {
            'expected_doublet_rate': expected_doublet_rate,
            'sim_doublet_ratio': (
                adata_sim.uns.get('scrublet', {})
                .get('parameters', {})
                .get('sim_doublet_ratio', None)
            ),
            'n_neighbors': n_neighbors,
            'random_state': random_state,
        },
    }

    # If threshold hasn't been located successfully then we couldn't make any
    # predictions. The user will get a warning from Scrublet, but we need to
    # set the boolean so that any downstream filtering on
    # predicted_doublet=False doesn't incorrectly filter cells. The user can
    # still use this object to generate the plot and derive a threshold
    # manually.
    if hasattr(scrub, 'threshold_'):
        adata_obs.uns['scrublet']['threshold'] = scrub.threshold_
        adata_obs.obs['predicted_doublet'] = scrub.predicted_doublets_
    else:
        adata_obs.obs['predicted_doublet'] = False

    if get_doublet_neighbor_parents:
        adata_obs.uns['scrublet'][
            'doublet_neighbor_parents'] = scrub.doublet_neighbor_parents_

    return adata_obs
# CLI driver: run Scrublet on a raw 10X matrix directory and write per-barcode
# doublet predictions to a TSV. Relies on `parser`, `scr`, `scipy`, `np` and
# `sys` being set up earlier in the file.
parser.add_argument('-i', '--input', help='raw 10X file directory for input', type=str)
parser.add_argument('-o', '--output', help='output directory', type=str, default="./")
parser.add_argument('-n', '--name', help='name of output files', type=str, default="name")
parser.add_argument('-r', '--doublet', help='expected doublet rate, default=0.06', type=float, default=0.06)
# NOTE(review): argparse `type=bool` is truthy for ANY non-empty string
# ("False" -> True); a store_true flag is the usual fix — confirm intent.
parser.add_argument('-e', '--embed', help='plot UMAP and TSNE. True or False.', type=bool, default=False)
args = parser.parse_args()

#load counts matrix, genes, barcodes
print("Loading counts matrix %s" % args.input + '/matrix.mtx', file=sys.stderr)
counts_matrix = scipy.io.mmread(args.input + '/matrix.mtx').T.tocsc()
print("Loading barcodes %s" % args.input + '/barcodes.tsv', file=sys.stderr)
# NOTE(review): unlike matrix.mtx above, no '/' separates args.input and
# 'barcodes.tsv', and delimiter='t' splits on the letter t — presumably
# '/barcodes.tsv' and '\t' were intended; verify before relying on this path.
barcodes = np.array(scr.load_genes(args.input + 'barcodes.tsv', delimiter='t', column=0))

#initialize scrublet object
print("Initializing scrublet object", file=sys.stderr)
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=args.doublet)  #whole counts matrix

# Run the full Scrublet pipeline: filtering, normalisation, PCA, KNN scoring.
print("Computing doublet predictions", file=sys.stderr)
doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2, min_cells=3, min_gene_variability_pctl=85, n_prin_comps=30)

#write scrublet output to file:
print("Writing doublet predictions to %s" % args.output + "/" + args.name + "_predicted_doublets.tsv", file=sys.stderr)
with open(args.output + "/" + args.name + "_predicted_doublets.tsv", 'w') as outfile:
    outfile.write("\t".join(["barcode", "doublet_score", "doublet_prediction"])+"\n")
    # Encode the boolean prediction as "0"/"1" for the TSV.
    for barcode, score, prediction in zip(barcodes, doublet_scores, predicted_doublets):
        # (source truncated below: the True branch of this conditional is
        # missing from this chunk)
        if prediction == False:
            doublet = "0"
        else:
data.obs['organ'] = 'PB' # Predict and remove putative doublets scrub = {} doublet_scores = {} predicted_doublets = {} # expected multiplet rate emr = {'SIGAE2': 0.069, 'SIGAF2': 0.076, 'SIGAG2': 0.076} for sample in samples: print(sample) scrub[sample] = scr.Scrublet( data[np.array(data.obs['library'] == sample), :].X, expected_doublet_rate=emr[sample]) doublet_scores[sample], predicted_doublets[sample] = scrub[ sample].scrub_doublets() print('\n\n') for sample in samples: print(sample, ':', sum(predicted_doublets[sample])) sample = 'SIGAE2' scrub[sample].plot_histogram() predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.3) sum(predicted_doublets[sample]) sample = 'SIGAF2' scrub[sample].plot_histogram()
################################################################################ # Processing... if args.use_variable_features: print("Subsetting the variable features from the counts matrix...") if args.h5ad_with_variable_features_info is None: raise Exception("VSN ERROR: Expecting --h5ad-with-variable-features-info argument to be set since --use-variable-features argument is set to True.") FILE_PATH_H5AD_WITH_HVG_INFO = args.h5ad_with_variable_features_info adata_hvg = sc.read_h5ad(filename=FILE_PATH_H5AD_WITH_HVG_INFO.name) counts_matrix = adata_raw.X[:, np.array(adata_hvg.var['highly_variable'])] else: counts_matrix = adata_raw.X scrub = scr.Scrublet(counts_matrix) adata_raw.obs['doublet_scores'], adata_raw.obs['predicted_doublets'] = scrub.scrub_doublets( synthetic_doublet_umi_subsampling=args.synthetic_doublet_umi_subsampling, use_approx_neighbors=True, distance_metric='euclidean', get_doublet_neighbor_parents=False, min_counts=args.min_counts, min_cells=args.min_cells, min_gene_variability_pctl=args.min_gene_variability_pctl, log_transform=args.log_transform, mean_center=args.mean_center, normalize_variance=args.normalize_variance, n_prin_comps=args.n_prin_comps, verbose=True ) # Rename the columns
# Plot styling for the figures produced below (42 = TrueType fonts in PDFs,
# keeps text editable in vector editors).
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42

## Basic run with scrublet
# NOTE(review): os.path.join with one argument does not add a trailing
# separator, so the concatenations below only work if sys.argv[1] already
# ends in '/' — confirm callers pass it that way.
input_dir = os.path.join(sys.argv[1])
counts_matrix = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
genes = np.array(
    scr.load_genes(input_dir + 'genes.tsv', delimiter='\t', column=1))

# Use with the raw data
print('Counts matrix shape: {} rows, {} columns'.format(
    counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

scrub = scr.Scrublet(counts_matrix,
                     expected_doublet_rate=0.15,
                     sim_doublet_ratio=2)
# NOTE(review): `var_number` must be defined earlier in the file — not
# visible in this chunk.
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=150,
    min_gene_variability_pctl=var_number,
    n_prin_comps=30)
# Re-call with a manually chosen score threshold.
scrub.call_doublets(threshold=0.40)

outdir = sys.argv[2]
scrub.plot_histogram()
plt.savefig(os.path.join(outdir, 'figure1.png'))

print('Running UMAP...')
# Embed the observed manifold with UMAP (10 neighbors) for visualisation.
scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10,
                                         min_dist=0.3))