예제 #1
0
    def scrublet_predictions(self, vlm, input_dir, doublet_rate=0.06):
        """Run Scrublet on a 10X-style directory and annotate ``vlm`` with it.

        Reads ``matrix.mtx`` and ``barcodes.tsv`` from ``input_dir``, runs the
        Scrublet pipeline on the whole (transposed) counts matrix and stores
        the results on ``vlm`` as the column attributes ``doublet_scores``
        and ``doublet_predictions``, ordered like ``vlm.ca['CellID']``.

        Args:
            vlm: Object exposing a ``ca`` mapping of column attributes that
                already contains ``'CellID'``.  Presumably these IDs are the
                same strings as the barcodes file holds -- confirm upstream.
            input_dir: Directory containing ``matrix.mtx`` and
                ``barcodes.tsv`` (given without a trailing slash).
            doublet_rate: Expected doublet rate passed to ``scr.Scrublet``.

        Returns:
            The same ``vlm`` object, after the two attributes were added.

        Raises:
            KeyError: If a ``CellID`` has no matching entry in the barcodes
                file.
        """
        import scrublet as scr
        import scipy.io

        print('Loading counts matrix {}/matrix.mtx'.format(input_dir),
              file=sys.stderr)
        counts_matrix = scipy.io.mmread(input_dir + '/matrix.mtx').T.tocsc()

        print("Loading barcodes {}/barcodes.tsv".format(input_dir),
              file=sys.stderr)
        # The path needs the same '/' separator as the matrix path (and as the
        # message above announces), and the file is tab separated: splitting
        # on the letter 't' only worked while no barcode contained one.
        barcodes = np.array(
            scr.load_genes(input_dir + '/barcodes.tsv', delimiter='\t',
                           column=0))

        print("Initializing scrublet object", file=sys.stderr)
        scrub = scr.Scrublet(
            counts_matrix,
            expected_doublet_rate=doublet_rate)  # whole counts matrix
        print("Computing doublet predictions", file=sys.stderr)
        doublet_scores, predicted_doublets = scrub.scrub_doublets(
            min_counts=2,
            min_cells=3,
            min_gene_variability_pctl=85,
            n_prin_comps=30)

        # Map each barcode to its row so both result vectors can be reordered
        # to the CellID order with a single lookup per cell.
        row_of = {barcode: row for row, barcode in enumerate(barcodes)}
        rows = [row_of[cell_id] for cell_id in vlm.ca['CellID']]

        # Add doublet score and doublet prediction as column attributes.
        vlm.ca["doublet_scores"] = np.array(
            [doublet_scores[row] for row in rows])
        vlm.ca["doublet_predictions"] = np.array(
            [predicted_doublets[row] for row in rows])
        return vlm
def annotate_doublets(mtx_fpath, feature_fpath, expected_doublet_rate2=0.06):
    """Run the Scrublet pipeline on one matrix file and return its results.

    Args:
        mtx_fpath: Path of the Matrix Market file; it is transposed and
            converted to CSC before being handed to Scrublet.
        feature_fpath: Tab-separated feature file; column 1 is read as the
            gene list (only its length is reported).
        expected_doublet_rate2: Expected doublet rate passed to
            ``scr.Scrublet``.

    Returns:
        A two-item list ``[doublet_scores, predicted_doublets]`` holding the
        values returned by ``scrub_doublets``.
    """
    # Disabled toggle: matplotlib font / PDF settings.
    if False:
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.sans-serif'] = 'Arial'
        plt.rc('font', size=14)
        plt.rcParams['pdf.fonttype'] = 42

    raw_matrix = scipy.io.mmread(mtx_fpath)
    counts_matrix = raw_matrix.T.tocsc()
    gene_names = np.array(
        scr.load_genes(feature_fpath, delimiter='\t', column=1))

    n_rows, n_cols = counts_matrix.shape[0], counts_matrix.shape[1]
    print('Counts matrix shape: {} rows, {} columns'.format(n_rows, n_cols))
    print('Number of genes in gene list: {}'.format(len(gene_names)))

    detector = scr.Scrublet(
        counts_matrix, expected_doublet_rate=expected_doublet_rate2)
    scores, calls = detector.scrub_doublets(min_counts=2,
                                            min_cells=3,
                                            min_gene_variability_pctl=85,
                                            n_prin_comps=30)

    # Disabled toggle: diagnostic histogram and UMAP embedding plots.
    if False:
        detector.plot_histogram()

        print('Running UMAP...')
        detector.set_embedding(
            'UMAP', scr.get_umap(detector.manifold_obs_, 10, min_dist=0.3))
        print('Done.')

        detector.plot_embedding('UMAP', order_points=True)

    return [scores, calls]
def run_scrublet_rna(input_dir):
    """Run Scrublet on ``input_dir`` and save its results next to the input.

    ``input_dir`` is used as a plain string prefix, so it has to end with a
    path separator.  ``matrix.mtx`` and ``features.tsv`` are read from it, and
    ``predicted_doublet_mask.txt`` and ``doublet_scores.txt`` are written to
    it.  Nothing is returned.
    """
    counts_matrix = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
    gene_names = np.array(
        scr.load_genes(input_dir + 'features.tsv', delimiter='\t', column=1))

    n_rows, n_cols = counts_matrix.shape[0], counts_matrix.shape[1]
    print('Counts matrix shape: {} rows, {} columns'.format(n_rows, n_cols))
    print('Number of genes in gene list: {}'.format(len(gene_names)))

    detector = scr.Scrublet(counts_matrix, expected_doublet_rate=0.05)
    pipeline_options = dict(min_counts=2,
                            min_cells=3,
                            min_gene_variability_pctl=85,
                            n_prin_comps=30)
    # The returned pair is not used; the results are read back from the
    # Scrublet object when writing the files below.
    _scores, _calls = detector.scrub_doublets(**pipeline_options)

    mask_path = input_dir + 'predicted_doublet_mask.txt'
    scores_path = input_dir + 'doublet_scores.txt'
    np.savetxt(mask_path, detector.predicted_doublets_, fmt='%s')
    np.savetxt(scores_path, detector.doublet_scores_obs_, fmt='%.4f')
예제 #4
0
def scrublet_c(sample, inDir, outDir, expected_doublet_rate, sim_doublet_ratio,
               ratio_df, out_df):
    """Run Scrublet for one sample, save its plots and record the results.

    Args:
        sample: Sample label; used in log lines, in the plot file names and
            as the column of ``ratio_df`` that is filled in.
        inDir: Directory containing ``matrix.mtx`` and ``genes.tsv``.
        outDir: Directory that receives the histogram and UMAP PDFs.
        expected_doublet_rate: Forwarded to ``scr.Scrublet``.
        sim_doublet_ratio: Forwarded to ``scr.Scrublet``.
        ratio_df: Table updated in place at ``.loc['scrublet', sample]`` with
            the detected doublet rate.
        out_df: Table that gets the columns ``scrublet_doublet_scores`` and
            ``scrublet_doublets``.

    Returns:
        The tuple ``(ratio_df, out_df)``.
    """
    print(sample, "start scrublet")

    matrix_path = os.path.join(inDir, 'matrix.mtx')
    genes_path = os.path.join(inDir, 'genes.tsv')
    counts_matrix = scipy.io.mmread(matrix_path).T.tocsc()
    gene_names = np.array(
        scr.load_genes(genes_path, delimiter='\t', column=1))

    detector = scr.Scrublet(counts_matrix,
                            expected_doublet_rate=expected_doublet_rate,
                            sim_doublet_ratio=sim_doublet_ratio)
    scores, calls = detector.scrub_doublets(min_counts=2,
                                            min_cells=3,
                                            min_gene_variability_pctl=85,
                                            n_prin_comps=30)

    # Doublet-score histogram.
    detector.plot_histogram()
    histogram_name = "{0}_scrublet_doublet_score_histogram.pdf".format(sample)
    plt.savefig(os.path.join(outDir, histogram_name))

    # UMAP embedding of the observed cells.
    print(sample, 'Running scrublet UMAP...')
    umap_coords = scr.get_umap(detector.manifold_obs_, 10, min_dist=0.3)
    detector.set_embedding('UMAP', umap_coords)
    print(sample, 'scrublet Done.')

    detector.plot_embedding('UMAP', order_points=True)
    umap_name = "{0}_scrublet_UMAP.pdf".format(sample)
    plt.savefig(os.path.join(outDir, umap_name))
    print(sample, "Done scrublet")

    ratio_df.loc['scrublet', sample] = detector.detected_doublet_rate_
    out_df['scrublet_doublet_scores'] = scores
    out_df['scrublet_doublets'] = calls

    return ratio_df, out_df
예제 #5
0
import sys
import argparse

def _parse_bool(text):
    """Convert a command-line token such as 'True' or 'false' to a bool.

    ``type=bool`` cannot be used for this: ``bool('False')`` is True because
    every non-empty string is truthy.
    """
    token = text.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected True or False, got {!r}'.format(text))


parser = argparse.ArgumentParser()
# --input is required: without it the path concatenations below would fail
# with an unhelpful TypeError on None.
parser.add_argument('-i', '--input', help='raw 10X file directory for input', type=str, required=True)
parser.add_argument('-o', '--output', help='output directory', type=str, default="./")
parser.add_argument('-n', '--name', help='name of output files', type=str, default="name")
parser.add_argument('-r', '--doublet', help='expected doublet rate, default=0.06', type=float, default=0.06)
parser.add_argument('-e', '--embed', help='plot UMAP and TSNE. True or False.', type=_parse_bool, default=False)
args = parser.parse_args()

# Load the counts matrix and the barcodes.
matrix_path = args.input + '/matrix.mtx'
# Same '/' separator as the matrix path (the original concatenated the file
# name directly onto the directory name).
barcodes_path = args.input + '/barcodes.tsv'

print("Loading counts matrix %s" % matrix_path, file=sys.stderr)
counts_matrix = scipy.io.mmread(matrix_path).T.tocsc()
print("Loading barcodes %s" % barcodes_path, file=sys.stderr)
# The barcodes file is tab separated; 't' would split on the letter t.
barcodes = np.array(scr.load_genes(barcodes_path, delimiter='\t', column=0))

# Initialize the scrublet object on the whole counts matrix.
print("Initializing scrublet object", file=sys.stderr)
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=args.doublet)

print("Computing doublet predictions", file=sys.stderr)
doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2,
                                                          min_cells=3,
                                                          min_gene_variability_pctl=85,
                                                          n_prin_comps=30)

# Write scrublet output to file (header line).
predictions_path = args.output + "/" + args.name + "_predicted_doublets.tsv"
print("Writing doublet predictions to %s" % predictions_path, file=sys.stderr)
with open(predictions_path, 'w') as outfile:
    outfile.write("\t".join(["barcode", "doublet_score", "doublet_prediction"]) + "\n")
예제 #6
0
import scrublet as scr
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import os

# Matplotlib defaults for the figures: sans-serif Arial at size 14, and
# pdf.fonttype 42.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
    'pdf.fonttype': 42,
})

# Input location.  An alternative ("filtered") directory noted by the author:
#   /share/ScratchGeneral/briglo/scRNA/venchi/data/hg19/VENCHI_SampleBCITE/outs/filtered_feature_bc_matrix/
input_dir = '/share/ScratchGeneral/briglo/scRNA/venchi/outputs/seurat/'

# Matrix Market file, transposed and converted to CSC for Scrublet.
counts_matrix = scipy.io.mmread(input_dir + 'combined.human.mtx').T.tocsc()
genes = np.array(
    scr.load_genes(input_dir + 'genes.tsv', delimiter='\t', column=0))

print('Counts matrix shape: {} rows, {} columns'.format(
    counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

# Output recorded by the author for this dataset:
#   Counts matrix shape: 12865 rows, 32738 columns
#   Number of genes in gene list: 32738

scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)

doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=30,
    get_doublet_neighbor_parents=True)
예제 #7
0
import scrublet as rc 
import matplotlib.pyplot as plt 
import scipy.io 
from scipy.sparse import csc_matrix 
import numpy as np 

wd = "/restricted/projectnb/camplab/home/syyang/contamination/data/pbmc/4k/"

# Sparse counts matrix as stored in the Matrix Market file.
counts = scipy.io.mmread(wd + 'data/matrix.mtx')

# Keep the rows that have more than two entries with a count above 2.
geneIndex = ((counts > 2).sum(axis=1) > 2)
# Flatten the mask once; it selects both the matrix rows and the gene names.
gene_mask = np.array(geneIndex).reshape(-1)

# Select the rows on the sparse matrix instead of calling toarray() first:
# densifying the whole matrix only to slice it wastes a large amount of
# memory.  The selected values are the same.
counts_filter = counts.tocsr()[gene_mask, :]

print(counts_filter.shape)

# Transposed CSC matrix that is handed to Scrublet.
counts_csc = csc_matrix(counts_filter.T)
genes = np.array(rc.load_genes(wd + 'data/genes.tsv', delimiter='\t', column=1))[gene_mask]


scrub = rc.Scrublet(counts_csc, expected_doublet_rate=0.06)


doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2, min_cells=3, min_gene_variability_pctl=85, n_prin_comps=30)


np.save("doublet_scores.npy", doublet_scores)
# Multiplying by 1 stores the boolean calls as integers.
np.save("predicted_doublets.npy", predicted_doublets * 1)
예제 #8
0
print('Arguments:', len(sys.argv))
print('List:', str(sys.argv))
# Three positional arguments are read below; fail with a usage message
# instead of a bare IndexError when one is missing.
if len(sys.argv) < 4:
    sys.exit('usage: {} <input_dir> <outdir> <min_gene_variability_pctl>'
             .format(sys.argv[0]))
var_number = float(sys.argv[3])
print(var_number)

plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42

## Basic run with scrublet
# os.path.join() with a single argument returns it unchanged, so a directory
# given without a trailing separator produced names like "dirmatrix.mtx".
# Joining with '' appends the separator only when it is missing.
input_dir = os.path.join(sys.argv[1], '')
counts_matrix = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
genes = np.array(
    scr.load_genes(input_dir + 'genes.tsv', delimiter='\t',
                   column=1))  # Use with the raw data
print('Counts matrix shape: {} rows, {} columns'.format(
    counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))
scrub = scr.Scrublet(counts_matrix,
                     expected_doublet_rate=0.15,
                     sim_doublet_ratio=2)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=150,
    min_gene_variability_pctl=var_number,
    n_prin_comps=30)

# Re-call doublets with a fixed score threshold of 0.40.
scrub.call_doublets(threshold=0.40)

outdir = sys.argv[2]
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import os

# Matplotlib defaults for the figures: sans-serif Arial at size 14, and
# pdf.fonttype 42.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
    'pdf.fonttype': 42,
})

# Sample label and the fixed input / output locations.
tag = 'FFT4G_10x'
output_dir = '/home/jovyan/snSeq_QCandAnalysis/scrublet'
input_dir = '/home/jovyan/data/snQCandAnalysis/FFT4G_10x/filtered_feature_bc_matrix'

# Gzipped Matrix Market file, transposed and converted to CSC for Scrublet.
counts_matrix = scipy.io.mmread(input_dir + '/matrix.mtx.gz').T.tocsc()
genes = np.array(
    scr.load_genes(input_dir + '/features.tsv', delimiter='\t', column=1))

print('Counts matrix shape: {} rows, {} columns'.format(
    counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=30)

# Turn the doublet calls into integers.
predicted_doublets = (predicted_doublets * 1).astype(int)

# Doublet rates reported by Scrublet, rounded to four decimals.
detected_doublets_rate = round(scrub.detected_doublet_rate_, 4)
overall_doublets_rate = round(scrub.overall_doublet_rate_, 4)

np.savetxt(output_dir + '/' + tag + '_' + 'doublets_scores.txt',
예제 #10
0
# The only argument is the file prefix shared by the three input files
# (<prefix>.matrix.mtx, <prefix>.genes.tsv, <prefix>.barcodes.tsv).
if len(sys.argv) == 1:
    print('input file prefix: python scrublet_doublet.py FEL011_S')
    # sys.exit() instead of the interactive helper exit(), which only exists
    # when the site module is loaded; a missing argument is reported with a
    # non-zero status.
    sys.exit(1)

prefix = sys.argv[1]
print(prefix)

# The font family is left at the matplotlib default; only the size and the
# PDF font type are set.
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42

counts_matrix = scipy.io.mmread(prefix + '.matrix.mtx').T.tocsc()
genes = np.array(
    scr.load_genes(prefix + '.genes.tsv', delimiter='\t', column=0))
# Barcodes file read without a header row; its single column is named Cell.ID.
cells = pd.read_table(prefix + '.barcodes.tsv', header=None)
cells.columns = ["Cell.ID"]

print('Counts matrix shape: {} rows, {} columns'.format(
    counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)
예제 #11
0
def main():
    """Command-line entry point: run Scrublet on one sample and save results.

    Parses the options, runs the Scrublet pipeline on the given matrix,
    re-calls doublets with a score threshold of 0.1, saves the histogram and
    UMAP figures as PNG files, logs a one-line summary of the doublet rates
    and writes one predicted-doublet flag per line to
    ``<outdir>/<sampleName>.predictDoublet_scrublet.txt``.
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='author:	{0}	mail:	{1}'.format(__author__, __mail__))

    # Required string options: (short flag, long flag, dest, help text).
    required_options = (
        ('-m', '--mtx', 'mtx', 'cellranger分析结果中的matrix.mtx'),
        ('-f', '--feature', 'feature',
         'cellranger分析结果中的feature.csv或genes.csv'),
        ('-o', '--outdir', 'outdir', '结果输出目录'),
        ('-s', '--sampleName', 'sampleName', '样本名'),
    )
    for short_flag, long_flag, dest, help_text in required_options:
        cli.add_argument(short_flag,
                         long_flag,
                         help=help_text,
                         dest=dest,
                         required=True)
    cli.add_argument('-e',
                     '--expectedDoubletRate',
                     help='细胞结团率',
                     dest='expectedDoubletRate',
                     type=float,
                     required=True)
    cli.add_argument('-p',
                     '--pc',
                     help='PC值',
                     dest='pc',
                     type=int,
                     default=30)
    args = cli.parse_args()

    log_format = ("%(asctime)s - %(filename)s[line:%(lineno)d] - "
                  "%(levelname)s - %(message)s")
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    logging.info("开始分析")

    n_pcs = args.pc
    sample = args.sampleName
    # Every output file name starts with "<outdir>/<sampleName>".
    out_prefix = args.outdir + "/" + sample

    # Load the counts matrix and the gene list.
    counts_matrix = scipy.io.mmread(args.mtx).T.tocsc()
    gene_names = np.array(
        scr.load_genes(args.feature, delimiter='\t', column=1))
    detector = scr.Scrublet(counts_matrix,
                            expected_doublet_rate=args.expectedDoubletRate)

    # Run the default pipeline; the results are read back from the Scrublet
    # object further down.
    _scores, _calls = detector.scrub_doublets(min_counts=2,
                                              min_cells=3,
                                              min_gene_variability_pctl=85,
                                              n_prin_comps=n_pcs)

    # Re-call doublets with a fixed threshold, then plot the score histogram.
    detector.call_doublets(threshold=0.1)
    detector.plot_histogram()
    plt.savefig(out_prefix + '_Scrublet_Histogram.png')

    umap_coords = scr.get_umap(detector.manifold_obs_, 10, min_dist=0.3)
    detector.set_embedding('UMAP', umap_coords)
    detector.plot_embedding('UMAP', order_points=True)
    plt.savefig(out_prefix + '_Scrublet_UMAP.png')

    # Log the summary: expected / detected / overall doublet rate, threshold
    # and number of principal components.
    summary_header = (
        "patientID\texpected_doublet_rate\tdetected_doublet_rate\toverall_doublet_rate\tthreshold\tPC\n"
    )
    summary_row = "%s\t%.4f\t%.4f\t%.4f\t%.4f\t%s\n" % (
        sample, detector.expected_doublet_rate,
        detector.detected_doublet_rate_, detector.overall_doublet_rate_,
        detector.threshold_, n_pcs)
    logging.info(summary_header)
    logging.info(summary_row)

    # Write the doublet status of every single cell, one value per line.
    with open(out_prefix + ".predictDoublet_scrublet.txt", "w+") as fo:
        fo.writelines("%s\n" % (flag) for flag in detector.predicted_doublets_)
import os
import time
import sys
# The trailing "/" lets every file name below be appended directly.
input_dir = sys.argv[1] + "/"

# The raw counts matrix (E) should be a scipy sparse CSC matrix
# with cells as rows and genes as columns

# Cache the converted matrix as .npz so that later runs skip the slower
# Matrix Market parse.
npz_path = input_dir + 'gene_count.npz'
if os.path.isfile(npz_path):
    E = scipy.sparse.load_npz(npz_path)
else:
    E = scipy.io.mmread(input_dir + 'gene_count.mtx').T.tocsc()
    scipy.sparse.save_npz(npz_path, E, compressed=True)


genes = np.array(scr.load_genes(input_dir + 'df_gene.tsv', delimiter='\t', column=1))

print('Expression matrix shape: {} rows, {} columns'.format(E.shape[0], E.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

scrub = scr.Scrublet(E, expected_doublet_rate=0.05)
doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2,
                                                          min_cells=3,
                                                          min_gene_variability_pctl=85,
                                                          n_prin_comps=30)
# Re-call doublets with a fixed score threshold of 0.22.
scrub.call_doublets(threshold=0.22)
scrub.plot_histogram()
plt.savefig(input_dir + "hist1.png")

print('Running UMAP...')
scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
# Draw the embedding before saving: without this call savefig() wrote the
# still-current histogram figure to umap.png a second time.
scrub.plot_embedding('UMAP', order_points=True)
plt.savefig(input_dir + "umap.png")