def annotate_doublets(mtx_fpath, feature_fpath, expected_doublet_rate2=0.06,
                      plot=False):
    """Score cells for doublets with Scrublet and return the calls.

    Args:
        mtx_fpath: Path of the Matrix Market counts file. The matrix is read
            with ``scipy.io.mmread``, transposed, and converted to CSC.
        feature_fpath: Path of the tab-delimited gene file; column index 1 is
            loaded as the gene list (used here only for the size report).
        expected_doublet_rate2: Value passed to ``scr.Scrublet`` as
            ``expected_doublet_rate``.
        plot: If true, additionally draw the doublet-score histogram and a
            UMAP embedding of the observed cells. Off by default.

    Returns:
        list: ``[doublet_scores, predicted_doublets]`` as returned by
        ``Scrublet.scrub_doublets``.
    """
    counts_matrix = scipy.io.mmread(mtx_fpath).T.tocsc()
    genes = np.array(scr.load_genes(feature_fpath, delimiter='\t', column=1))

    print('Counts matrix shape: {} rows, {} columns'.format(
        counts_matrix.shape[0], counts_matrix.shape[1]))
    print('Number of genes in gene list: {}'.format(len(genes)))

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate2)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    if plot:
        # Optional diagnostics; the embedding is only computed when it will
        # actually be drawn.
        scrub.plot_histogram()
        print('Running UMAP...')
        scrub.set_embedding(
            'UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        print('Done.')
        scrub.plot_embedding('UMAP', order_points=True)

    return [doublet_scores, predicted_doublets]
def set_embedding(self):
    """Compute UMAP and tSNE embeddings on ``self.scrub`` and pickle it.

    Both embeddings are built from ``self.scrub.manifold_obs_`` and stored on
    the Scrublet object under the names ``'UMAP'`` and ``'tSNE'``. The updated
    object is then pickled to ``scrub.pkl`` inside ``self._outdir``.
    """
    print("#### Run UMAP")
    self.scrub.set_embedding(
        'UMAP', scr.get_umap(self.scrub.manifold_obs_, 30, min_dist=0.3))

    # tSNE is the slow step of this method.
    print('#### Running tSNE...')
    self.scrub.set_embedding(
        'tSNE',
        scr.get_tsne(self.scrub.manifold_obs_, angle=0.9, verbose=True))
    print('Done.')

    # The context manager closes the file; no explicit close() is needed.
    with open(os.path.join(self._outdir, "scrub.pkl"), "wb") as f:
        pickle.dump(self.scrub, f)
def scrublet_c(sample, inDir, outDir, expected_doublet_rate,
               sim_doublet_ratio, ratio_df, out_df):
    """Run Scrublet on one sample, save its plots, and record the results.

    Args:
        sample: Sample label; used in progress messages, in the output file
            names, and as the column key written into ``ratio_df``.
        inDir: Directory containing ``matrix.mtx`` and ``genes.tsv``.
        outDir: Directory that receives the two PDF plots.
        expected_doublet_rate: Passed through to ``scr.Scrublet``.
        sim_doublet_ratio: Passed through to ``scr.Scrublet``.
        ratio_df: Table updated in place at ``.loc['scrublet', sample]`` with
            the detected doublet rate.
        out_df: Table updated in place with the columns
            ``scrublet_doublet_scores`` and ``scrublet_doublets``.

    Returns:
        tuple: ``(ratio_df, out_df)``, the same objects that were passed in.
    """
    print(sample, "start scrublet")
    counts_matrix = scipy.io.mmread(os.path.join(inDir,
                                                 'matrix.mtx')).T.tocsc()
    # NOTE(review): genes is not referenced again in this function; the load
    # is kept so a missing or unreadable genes.tsv still raises here.
    genes = np.array(
        scr.load_genes(os.path.join(inDir, 'genes.tsv'),
                       delimiter='\t',
                       column=1))

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate,
                         sim_doublet_ratio=sim_doublet_ratio)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    scrub.plot_histogram()
    plt.savefig(
        os.path.join(
            outDir,
            "{0}_scrublet_doublet_score_histogram.pdf".format(sample)))
    # Release the figure: this function runs once per sample and open
    # figures would otherwise pile up in pyplot's registry.
    plt.close()

    print(sample, 'Running scrublet UMAP...')
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    print(sample, 'scrublet Done.')

    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(os.path.join(outDir, "{0}_scrublet_UMAP.pdf".format(sample)))
    plt.close()

    print(sample, "Done scrublet")
    ratio_df.loc['scrublet', sample] = scrub.detected_doublet_rate_
    out_df['scrublet_doublet_scores'] = doublet_scores
    out_df['scrublet_doublets'] = predicted_doublets
    return ratio_df, out_df
# Write scrublet output to file. Every output shares the same
# "<output>/<name>" prefix, so build it once.
scrublet_out_prefix = args.output + "/" + args.name

predictions_path = scrublet_out_prefix + "_predicted_doublets.tsv"
print("Writing doublet predictions to %s" % predictions_path, file=sys.stderr)
with open(predictions_path, 'w') as outfile:
    outfile.write(
        "\t".join(["barcode", "doublet_score", "doublet_prediction"]) + "\n")
    for barcode, score, prediction in zip(barcodes, doublet_scores,
                                          predicted_doublets):
        # Encode the boolean call as "1" (doublet) or "0" (not a doublet).
        doublet = "1" if prediction else "0"
        outfile.write("\t".join([barcode, str(score), doublet]) + "\n")

histogram_path = scrublet_out_prefix + "_score_histogram.pdf"
print("Plotting doublet score histogram to %s" % histogram_path,
      file=sys.stderr)
scrub.plot_histogram()
plt.savefig(histogram_path)

# Embeddings are optional; they are only computed and plotted on request.
if args.embed:
    print("Running UMAP", file=sys.stderr)
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    umap_path = scrublet_out_prefix + "_UMAP.pdf"
    print("Plotting UMAP to %s" % umap_path, file=sys.stderr)
    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(umap_path)

    print("Running TSNE", file=sys.stderr)
    scrub.set_embedding('tSNE', scr.get_tsne(scrub.manifold_obs_, angle=0.9))
    tsne_path = scrublet_out_prefix + "_TSNE.pdf"
    print("Plotting TSNE to %s" % tsne_path, file=sys.stderr)
    scrub.plot_embedding('tSNE', order_points=True)
    plt.savefig(tsne_path)
predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.25) sum(predicted_doublets[sample]) sample = 'SIGAG2' scrub[sample].plot_histogram() predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.35) sum(predicted_doublets[sample]) for sample in samples: print(sample, ':', sum(predicted_doublets[sample])) # Iteratively adjust thresholds according to plot results sample = 'SIGAE2' scrub[sample].set_embedding( 'UMAP', scr.get_umap(scrub[sample].manifold_obs_, 10, min_dist=0.3)) scrub[sample].plot_embedding('UMAP', order_points=True) #scrub[sample].predicted_doublets_ = scrub[sample].call_doublets(threshold=0.3) #scrub[sample].call_doublets(threshold=0.3).sum() #predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.3) sample = 'SIGAF2' scrub[sample].set_embedding( 'UMAP', scr.get_umap(scrub[sample].manifold_obs_, 10, min_dist=0.25)) scrub[sample].plot_embedding('UMAP', order_points=True) #scrub[sample].predicted_doublets_ = scrub[sample].call_doublets(threshold=0.25) #scrub[sample].call_doublets(threshold=0.25).sum() #predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.25) sample = 'SIGAG2' scrub[sample].set_embedding(
# Per-cell results: the score plus two sets of doublet calls.
out_df = pd.DataFrame({
    'Cell.ID': cells['Cell.ID'],
    'scrublet_doublet_score': doublet_scores,
    'scrublet_doublet_call1': predicted_doublets,
    'scrublet_doublet_call2': predicted_doublets_025
})
# NOTE(review): the file is named .csv but is written tab-separated.
out_df.to_csv('%s.scrublet_out.csv' % (prefix), sep="\t", index=False)

# Two-page PDF report: score histogram, then UMAP embedding.
with PdfPages('%s.scrublet_out.pdf' % (prefix)) as pdf:
    scrub.plot_histogram()
    pdf.savefig()

    # Computing the embedding draws nothing, so no page is saved until the
    # embedding has been plotted (saving here would repeat the histogram).
    # tSNE (scr.get_tsne) and force layout (scr.get_force_layout) are slower
    # alternatives that could be added the same way.
    print('Running UMAP...')
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    print('Done.')

    scrub.plot_embedding('UMAP', order_points=True)
    pdf.savefig()
def main():
    """Command-line entry point: run Scrublet on one sample and save results.

    Reads the counts matrix given by ``--mtx`` and the gene list given by
    ``--feature``, runs ``Scrublet.scrub_doublets``, re-calls doublets with a
    fixed threshold of 0.1, and writes into ``--outdir``:

    * ``<sampleName>_Scrublet_Histogram.png``
    * ``<sampleName>_Scrublet_UMAP.png``
    * ``<sampleName>.predictDoublet_scrublet.txt`` (one call per line)

    A tab-separated summary (expected, detected and overall doublet rates,
    threshold, number of PCs) is emitted through ``logging``.
    """
    import os

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='author:\t{0}\nmail:\t{1}'.format(__author__, __mail__)
        if False else 'author: {0} mail: {1}'.format(__author__, __mail__))
    parser.add_argument('-m',
                        '--mtx',
                        help='cellranger分析结果中的matrix.mtx',
                        dest='mtx',
                        required=True)
    parser.add_argument('-f',
                        '--feature',
                        help='cellranger分析结果中的feature.csv或genes.csv',
                        dest='feature',
                        required=True)
    parser.add_argument('-o',
                        '--outdir',
                        help='结果输出目录',
                        dest='outdir',
                        required=True)
    parser.add_argument('-s',
                        '--sampleName',
                        help='样本名',
                        dest='sampleName',
                        required=True)
    parser.add_argument('-e',
                        '--expectedDoubletRate',
                        help='细胞结团率',
                        dest='expectedDoubletRate',
                        type=float,
                        required=True)
    parser.add_argument('-p',
                        '--pc',
                        help='PC值',
                        dest='pc',
                        type=int,
                        default=30)
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s - %(message)s"
    )
    logging.info("开始分析")

    pc = args.pc
    expectedDoubletRate = args.expectedDoubletRate
    sampleName = args.sampleName

    # Make sure the output directory exists before anything is written.
    os.makedirs(args.outdir, exist_ok=True)

    # Load counts matrix and gene list.
    counts_matrix = scipy.io.mmread(args.mtx).T.tocsc()
    # NOTE(review): genes is not referenced again in this function; the load
    # is kept so an unreadable --feature file still raises here.
    genes = np.array(scr.load_genes(args.feature, delimiter='\t', column=1))

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expectedDoubletRate)

    # Run the default pipeline.
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=pc)

    # Re-call doublets with a fixed score threshold; everything written below
    # is read from the scrub object after this call.
    scrub.call_doublets(threshold=0.1)

    # Doublet score histograms for observed transcriptomes and simulated
    # doublets.
    scrub.plot_histogram()
    plt.savefig(
        os.path.join(args.outdir, sampleName + '_Scrublet_Histogram.png'))
    plt.close()  # release the figure once it is on disk

    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(os.path.join(args.outdir, sampleName + '_Scrublet_UMAP.png'))
    plt.close()

    # Log a summary: expected doublet rate, detected doublet rate, overall
    # doublet rate, doublet threshold and number of PCs.
    logging.info(
        "patientID\texpected_doublet_rate\tdetected_doublet_rate\toverall_doublet_rate\tthreshold\tPC\n"
    )
    logging.info("%s\t%.4f\t%.4f\t%.4f\t%.4f\t%s\n", sampleName,
                 scrub.expected_doublet_rate, scrub.detected_doublet_rate_,
                 scrub.overall_doublet_rate_, scrub.threshold_, pc)

    # Output the doublet status of every single cell, one per line.
    calls_path = os.path.join(args.outdir,
                              sampleName + ".predictDoublet_scrublet.txt")
    with open(calls_path, "w") as fo:
        fo.writelines("%s\n" % (call) for call in scrub.predicted_doublets_)
# Read the filtered counts matrix, transpose it and store it as CSC.
filtered_mtx_path = inputfolder + '/' + jobname + '_filtered.mtx'
counts_matrix = scipy.io.mmread(filtered_mtx_path).T.tocsc()
n_matrix_rows, n_matrix_cols = counts_matrix.shape
print('Counts matrix shape: {} rows, {} columns'.format(n_matrix_rows,
                                                        n_matrix_cols))

# Score the cells, then re-call doublets with the configured threshold.
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=exp_db_rate)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=npcs)
scrub.call_doublets(threshold=db_thr)

# Doublet scores of the observed cells.
observed_scores_path = outputfolder + '/' + jobname + '_duplets_score.csv'
savetxt(observed_scores_path, scrub.doublet_scores_obs_, delimiter=',')

# Doublet scores of the simulated doublets.
simulated_scores_path = (outputfolder + '/' + jobname +
                         '_sim_duplets_score.csv')
savetxt(simulated_scores_path, scrub.doublet_scores_sim_, delimiter=',')

# UMAP coordinates of the observed cells.
print('Running UMAP...')
umap = scr.get_umap(scrub.manifold_obs_, n_neighbors=10, min_dist=0.1)
umap_path = outputfolder + '/' + jobname + '_umap_scrublet.csv'
savetxt(umap_path, umap, delimiter=',')
print('Done.')