def annotate_doublets(mtx_fpath, feature_fpath, expected_doublet_rate2=0.06,
                      *, plot=False):
    """Score the cells of one count matrix for doublets with Scrublet.

    Args:
        mtx_fpath: Path of a Matrix Market file. It is read with
            ``scipy.io.mmread``, transposed and converted to CSC before it
            is handed to Scrublet.
        feature_fpath: Tab-delimited feature/gene file. Column 1 is loaded
            with ``scr.load_genes``; the result is only used for the
            printed gene count.
        expected_doublet_rate2: Forwarded to ``scr.Scrublet`` as
            ``expected_doublet_rate``.
        plot: Keyword-only. When true, apply the matplotlib font settings
            and draw the score histogram plus a UMAP embedding. This code
            used to be disabled with ``if False:``, so the default of
            ``False`` keeps the previous behaviour.

    Returns:
        A two-element list ``[doublet_scores, predicted_doublets]`` holding
        the two values returned by ``Scrublet.scrub_doublets``.
    """
    if plot:
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.sans-serif'] = 'Arial'
        plt.rc('font', size=14)
        # Font type 42 embeds TrueType fonts in PDF output.
        plt.rcParams['pdf.fonttype'] = 42

    counts_matrix = scipy.io.mmread(mtx_fpath).T.tocsc()
    genes = np.array(scr.load_genes(feature_fpath, delimiter='\t', column=1))

    print('Counts matrix shape: {} rows, {} columns'.format(
        counts_matrix.shape[0], counts_matrix.shape[1]))
    print('Number of genes in gene list: {}'.format(len(genes)))

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate2)

    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    if plot:
        scrub.plot_histogram()

        print('Running UMAP...')
        scrub.set_embedding(
            'UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        print('Done.')

        scrub.plot_embedding('UMAP', order_points=True)

    return [doublet_scores, predicted_doublets]
    def set_embedding(self):
        """Compute UMAP and tSNE embeddings and pickle the Scrublet object.

        Both embeddings are computed from ``self.scrub.manifold_obs_`` and
        stored on ``self.scrub`` under the names ``'UMAP'`` and ``'tSNE'``.
        The updated object is then written to ``scrub.pkl`` inside
        ``self._outdir``.
        """
        print("#### Run UMAP")
        self.scrub.set_embedding(
            'UMAP', scr.get_umap(self.scrub.manifold_obs_, 30, min_dist=0.3))

        # tSNE is the slow step here (per the note that used to mark it as
        # optional); it is always run.
        print('#### Running tSNE...')
        self.scrub.set_embedding(
            'tSNE',
            scr.get_tsne(self.scrub.manifold_obs_, angle=0.9, verbose=True))
        print('Done.')

        # The ``with`` statement closes the file on exit, so no explicit
        # close() call is needed afterwards.
        with open(os.path.join(self._outdir, "scrub.pkl"), "wb") as f:
            pickle.dump(self.scrub, f)
Пример #3
0
def scrublet_c(sample, inDir, outDir, expected_doublet_rate, sim_doublet_ratio,
               ratio_df, out_df):
    """Run Scrublet on one sample, save diagnostic plots and record results.

    Args:
        sample: Sample label; used in progress messages, in the output file
            names and as the column of ``ratio_df`` that is filled in.
        inDir: Directory containing ``matrix.mtx``.
        outDir: Directory that receives the two PDF plots.
        expected_doublet_rate: Forwarded to ``scr.Scrublet``.
        sim_doublet_ratio: Forwarded to ``scr.Scrublet``.
        ratio_df: Table updated in place at ``.loc['scrublet', sample]``
            with ``scrub.detected_doublet_rate_``.
        out_df: Table updated in place with the columns
            ``scrublet_doublet_scores`` and ``scrublet_doublets``.

    Returns:
        The tuple ``(ratio_df, out_df)``; both are the objects passed in.
    """
    print(sample, "start scrublet")
    # The gene list is not needed for scoring, so only the count matrix is
    # read; a missing ``genes.tsv`` no longer aborts the run.
    counts_matrix = scipy.io.mmread(os.path.join(inDir,
                                                 'matrix.mtx')).T.tocsc()

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate,
                         sim_doublet_ratio=sim_doublet_ratio)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    scrub.plot_histogram()
    plt.savefig(
        os.path.join(
            outDir, "{0}_scrublet_doublet_score_histogram.pdf".format(sample)))
    # Close the figure once it is on disk; otherwise every processed sample
    # leaves open figures behind.
    plt.close()

    print(sample, 'Running scrublet UMAP...')
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    print(sample, 'scrublet Done.')

    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(os.path.join(outDir, "{0}_scrublet_UMAP.pdf".format(sample)))
    plt.close()
    print(sample, "Done scrublet")

    ratio_df.loc['scrublet', sample] = scrub.detected_doublet_rate_
    out_df['scrublet_doublet_scores'] = doublet_scores
    out_df['scrublet_doublets'] = predicted_doublets

    return ratio_df, out_df
Пример #4
0
# Write the Scrublet results to disk. Every output file shares the prefix
# "<args.output>/<args.name>".
out_prefix = args.output + "/" + args.name

predictions_path = out_prefix + "_predicted_doublets.tsv"
print("Writing doublet predictions to %s" % predictions_path, file=sys.stderr)
with open(predictions_path, 'w') as outfile:
    header = ["barcode", "doublet_score", "doublet_prediction"]
    outfile.write("\t".join(header) + "\n")
    for barcode, score, prediction in zip(barcodes, doublet_scores, predicted_doublets):
        # "0" is written when the prediction equals False, "1" otherwise.
        doublet = "0" if prediction == False else "1"
        outfile.write("\t".join([barcode, str(score), doublet]) + "\n")

histogram_path = out_prefix + "_score_histogram.pdf"
print("Plotting doublet score histogram to %s" % histogram_path, file=sys.stderr)
f = scrub.plot_histogram()
plt.savefig(histogram_path)

# The embeddings are optional and only produced when requested.
if args.embed == True:
    umap_path = out_prefix + "_UMAP.pdf"
    print("Running UMAP", file=sys.stderr)
    scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    print("Plotting UMAP to %s" % umap_path, file=sys.stderr)
    f = scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(umap_path)

    tsne_path = out_prefix + "_TSNE.pdf"
    print("Running TSNE", file=sys.stderr)
    scrub.set_embedding('tSNE', scr.get_tsne(scrub.manifold_obs_, angle=0.9))
    print("Plotting TSNE to %s" % tsne_path, file=sys.stderr)
    f = scrub.plot_embedding('tSNE', order_points=True)
    plt.savefig(tsne_path)


# Interactive threshold tuning, one sample at a time. `scrub` and
# `predicted_doublets` are indexed by sample name.
# NOTE(review): `sample` is bound before this excerpt starts; confirm which
# sample the first call below applies to.

# Re-call doublets with a manually chosen threshold. The bare `sum(...)`
# expression only shows a value in an interactive session (notebook/REPL);
# it has no effect when run as a script.
predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.25)
sum(predicted_doublets[sample])

# Same procedure for SIGAG2, with the histogram drawn first and a threshold
# of 0.35.
sample = 'SIGAG2'
scrub[sample].plot_histogram()
predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.35)
sum(predicted_doublets[sample])

# Print the sum of the stored predictions for every sample.
for sample in samples:
    print(sample, ':', sum(predicted_doublets[sample]))

# Iteratively adjust thresholds according to the plot results: compute a
# UMAP embedding per sample and plot it. The commented-out lines below each
# plot are the alternative thresholds that were tried.

sample = 'SIGAE2'
scrub[sample].set_embedding(
    'UMAP', scr.get_umap(scrub[sample].manifold_obs_, 10, min_dist=0.3))
scrub[sample].plot_embedding('UMAP', order_points=True)
#scrub[sample].predicted_doublets_ = scrub[sample].call_doublets(threshold=0.3)
#scrub[sample].call_doublets(threshold=0.3).sum()
#predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.3)

# Note that this sample uses min_dist=0.25 rather than 0.3.
sample = 'SIGAF2'
scrub[sample].set_embedding(
    'UMAP', scr.get_umap(scrub[sample].manifold_obs_, 10, min_dist=0.25))
scrub[sample].plot_embedding('UMAP', order_points=True)
#scrub[sample].predicted_doublets_ = scrub[sample].call_doublets(threshold=0.25)
#scrub[sample].call_doublets(threshold=0.25).sum()
#predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.25)

sample = 'SIGAG2'
scrub[sample].set_embedding(
Пример #6
0
# Collect the per-cell Scrublet results in one table: the cell identifier,
# the doublet score and two sets of doublet calls.
out_df = pd.DataFrame({
    'Cell.ID': cells['Cell.ID'],
    'scrublet_doublet_score': doublet_scores,
    'scrublet_doublet_call1': predicted_doublets,
    'scrublet_doublet_call2': predicted_doublets_025
})

# NOTE: the file is tab-separated despite its ".csv" extension; the name is
# left unchanged so existing consumers keep working.
out_df.to_csv('%s.scrublet_out.csv' % (prefix), sep="\t", index=False)

# Two-page PDF report: score histogram, then the UMAP embedding.
with PdfPages('%s.scrublet_out.pdf' % (prefix)) as pdf:
    scrub.plot_histogram()
    pdf.savefig()

    print('Running UMAP...')
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    # No pdf.savefig() at this point: computing an embedding draws nothing,
    # so saving here would only append the histogram page a second time.
    # Other, slower embeddings (scr.get_tsne, scr.get_force_layout) can be
    # registered the same way and plotted below with plot_embedding().
    print('Done.')

    scrub.plot_embedding('UMAP', order_points=True)
    pdf.savefig()
Пример #7
0
def main():
    """Command-line entry point: run Scrublet on one sample.

    Reads the count matrix and feature file given on the command line, runs
    ``Scrublet.scrub_doublets``, re-calls doublets with the requested
    threshold, and writes into the output directory:

    * ``<sample>_Scrublet_Histogram.png`` and ``<sample>_Scrublet_UMAP.png``
    * ``<sample>.predictDoublet_scrublet.txt`` with one doublet call per line

    A tab-separated summary (rates, threshold, number of PCs) is emitted
    through ``logging``.
    """
    import os  # local import: keeps this block independent of file imports

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='author:	{0}	mail:	{1}'.format(__author__, __mail__))
    parser.add_argument('-m',
                        '--mtx',
                        help='cellranger分析结果中的matrix.mtx',
                        dest='mtx',
                        required=True)
    parser.add_argument('-f',
                        '--feature',
                        help='cellranger分析结果中的feature.csv或genes.csv',
                        dest='feature',
                        required=True)
    parser.add_argument('-o',
                        '--outdir',
                        help='结果输出目录',
                        dest='outdir',
                        required=True)
    parser.add_argument('-s',
                        '--sampleName',
                        help='样本名',
                        dest='sampleName',
                        required=True)
    parser.add_argument('-e',
                        '--expectedDoubletRate',
                        help='细胞结团率',
                        dest='expectedDoubletRate',
                        type=float,
                        required=True)
    parser.add_argument('-p',
                        '--pc',
                        help='PC值',
                        dest='pc',
                        type=int,
                        default=30)
    # Previously hard-coded to 0.1; the default keeps that behaviour.
    parser.add_argument('-t',
                        '--threshold',
                        help='doublet score阈值,默认0.1',
                        dest='threshold',
                        type=float,
                        default=0.1)
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG,
        format=
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s - %(message)s"
    )
    logging.info("开始分析")

    pc = args.pc
    expectedDoubletRate = args.expectedDoubletRate
    sampleName = args.sampleName

    # Create the output directory up front so a missing directory does not
    # abort the run only after the expensive computation has finished.
    os.makedirs(args.outdir, exist_ok=True)

    # Load counts matrix and gene list.
    counts_matrix = scipy.io.mmread(args.mtx).T.tocsc()
    # NOTE(review): the gene names are not used below; the load is kept so
    # that an unreadable feature file is still reported.
    genes = np.array(scr.load_genes(args.feature, delimiter='\t', column=1))
    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expectedDoubletRate)

    # Run the default pipeline.
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=pc)

    # Re-call doublets with the requested threshold, then plot the doublet
    # score histograms and the UMAP embedding.
    scrub.call_doublets(threshold=args.threshold)
    scrub.plot_histogram()
    plt.savefig(
        os.path.join(args.outdir, sampleName + '_Scrublet_Histogram.png'))
    plt.close()  # release the figure once it has been written

    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(os.path.join(args.outdir, sampleName + '_Scrublet_UMAP.png'))
    plt.close()

    # Log a tab-separated summary: expected doublet rate, detected doublet
    # rate, overall doublet rate, threshold and number of PCs.
    logging.info(
        "patientID\texpected_doublet_rate\tdetected_doublet_rate\toverall_doublet_rate\tthreshold\tPC\n"
    )
    logging.info(
        "%s\t%.4f\t%.4f\t%.4f\t%.4f\t%s\n" %
        (sampleName, scrub.expected_doublet_rate, scrub.detected_doublet_rate_,
         scrub.overall_doublet_rate_, scrub.threshold_, pc))

    # Write the doublet status of every single cell, one value per line.
    calls_path = os.path.join(args.outdir,
                              sampleName + ".predictDoublet_scrublet.txt")
    with open(calls_path, "w") as fo:
        fo.writelines("%s\n" % (call) for call in scrub.predicted_doublets_)
Пример #8
0
# Read the filtered Matrix Market file, transpose it and convert it to CSC.
mtx_path = inputfolder + '/' + jobname + '_filtered.mtx'
counts_matrix = scipy.io.mmread(mtx_path).T.tocsc()

n_rows, n_cols = counts_matrix.shape
print('Counts matrix shape: {} rows, {} columns'.format(n_rows, n_cols))

# Score the cells, then re-call doublets with the configured threshold.
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=exp_db_rate)

doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=npcs)

scrub.call_doublets(threshold=db_thr)

# All result files share the prefix "<outputfolder>/<jobname>".
out_prefix = outputfolder + '/' + jobname

# Doublet scores of the observed cells.
savetxt(out_prefix + '_duplets_score.csv',
        scrub.doublet_scores_obs_,
        delimiter=',')
# Doublet scores of the simulated doublets.
savetxt(out_prefix + '_sim_duplets_score.csv',
        scrub.doublet_scores_sim_,
        delimiter=',')

# UMAP coordinates computed from scrub.manifold_obs_.
print('Running UMAP...')
umap = scr.get_umap(scrub.manifold_obs_, n_neighbors=10, min_dist=0.1)
savetxt(out_prefix + '_umap_scrublet.csv', umap, delimiter=',')

print('Done.')