Пример #1
0
def snpCalling(args):
    """Run the SNP-calling step for one sample.

    The step reads the matched cell barcodes, resolves the genome
    annotation and FASTA from ``args.genomeDir``, converts the gene list,
    splits the BAM, calls SNPs on the split output and writes the summary.

    Args:
        args: parsed command-line namespace providing ``sample``, ``outdir``,
            ``thread``, ``match_dir``, ``bam``, ``genomeDir``, ``gene_list``
            and ``min_query_length``.
    """
    sample = args.sample
    outdir = args.outdir
    thread = int(args.thread)
    match_dir = args.match_dir
    bam = args.bam
    genomeDir = args.genomeDir
    gene_list_file = args.gene_list
    min_query_length = args.min_query_length

    # cell barcodes of the matched library (the cell count is not used here)
    barcodes, _nCell = read_barcode_file(match_dir)

    # Create the output directory without going through a shell: the old
    # `mkdir -p %s` string broke on paths containing spaces or shell
    # metacharacters and silently ignored failures.
    os.makedirs(outdir, exist_ok=True)

    # genome files; fa=True additionally returns the FASTA path
    _refFlat, gtf, fasta = glob_genomeDir(genomeDir, fa=True)

    # convert the gene list using the GTF annotation
    gene_id_name_dic = convert(gene_list_file, gtf)

    # split bam
    index_file, count_file = split_bam(bam, barcodes, outdir, sample,
                                       gene_id_name_dic, min_query_length)

    # snp
    call_all_snp(index_file, outdir, thread, fasta)

    # summary
    summary(index_file, count_file, outdir, sample)
Пример #2
0
def STAR(args):
    """Align reads with STAR, collect picard RNA-seq metrics and build the report.

    Args:
        args: parsed command-line namespace; this step reads ``genomeDir``,
            ``outdir``, ``sample``, ``thread``, ``fq``, ``out_unmapped`` and
            ``assay``.

    Raises:
        subprocess.CalledProcessError: if the STAR command exits non-zero.
    """
    # Only the refFlat annotation is used in this step (picard REF_FLAT).
    refFlat, _gtf = glob_genomeDir(args.genomeDir)

    # Create the output directory without going through a shell: the old
    # `mkdir -p %s` string broke on paths containing spaces or shell
    # metacharacters and silently ignored failures.
    os.makedirs(args.outdir, exist_ok=True)

    # run STAR
    outPrefix = args.outdir + '/' + args.sample + '_'
    cmd = ['STAR', '--runThreadN', str(args.thread), '--genomeDir', args.genomeDir,
           '--readFilesIn', args.fq, '--readFilesCommand', 'zcat', '--outFilterMultimapNmax',
           '1', '--outFileNamePrefix', outPrefix, '--outSAMtype', 'BAM', 'SortedByCoordinate']
    if args.out_unmapped:
        # optionally also write the unmapped reads
        cmd += ['--outReadsUnmapped', 'Fastx']
    STAR.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)

    STAR.logger.info('picard start...')
    outBam = outPrefix + 'Aligned.sortedByCoord.out.bam'
    region_txt = args.outdir + '/' + args.sample + '_region.log'
    cmd = [
        'picard',
        '-Xmx4G',
        '-XX:ParallelGCThreads=4',
        'CollectRnaSeqMetrics',
        'I=%s' % (outBam),
        'O=%s' % (region_txt),
        'REF_FLAT=%s' % (refFlat),
        'STRAND=NONE',
        'VALIDATION_STRINGENCY=SILENT']
    STAR.logger.info('%s' % (' '.join(cmd)))
    # picard is run best-effort (no exception on failure); its combined
    # stdout/stderr is captured and forwarded to the log.
    res = subprocess.run(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    STAR.logger.info(res.stdout)
    if res.returncode != 0:
        # Surface the failure instead of dropping the exit code silently.
        STAR.logger.warning('picard exited with code %s', res.returncode)
    STAR.logger.info('picard done.')

    # summarise the STAR log together with the picard region metrics
    plot = format_stat(outPrefix + 'Log.final.out', region_txt, args.sample)
    t = reporter(
        name='STAR',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..',
        plot=plot)
    t.get_report()
Пример #3
0
def count_capture_rna(args):
    """Run the counting step of the capture_rna assay for one sample.

    Writes the count-detail table and the probe gene-count table, calls
    cells, builds the expression matrix against the matched library's
    barcodes, downsamples, and produces the summary and report.

    Args:
        args: parsed command-line namespace; this step reads ``genomeDir``,
            ``outdir``, ``sample``, ``bam``, ``cells``, ``match_dir`` and
            ``assay``.
    """
    # genome annotation -> gene id/name conversion
    _refFlat, gtf = glob_genomeDir(args.genomeDir)
    id_name = gene_convert(gtf)

    # Create the output directory without going through a shell: the old
    # `mkdir -p %s` string broke on paths containing spaces or shell
    # metacharacters and silently ignored failures.
    os.makedirs(args.outdir, exist_ok=True)

    # UMI correction; writes a table with the header: Barcode geneID UMI count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    df_probe = bam2table(args.bam, count_detail_file, id_name)
    df_probe.to_csv(f'{args.outdir}/{args.sample}_probe_gene_count.tsv',
                    sep='\t',
                    index=False)

    df = pd.read_table(count_detail_file, header=0)

    # call cells (threshold and cell number are returned but unused here)
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, _threshold, _cell_num,
     CB_describe) = call_cells(df, args.cells, pdf, marked_counts_file)

    # match barcode
    sc_cell_barcodes, sc_cell_number = read_barcode_file(args.match_dir)

    # write the expression matrix
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome,
     match_cell_str,
     match_UMI_median) = expression_matrix(df, validated_barcodes, args.outdir,
                                           args.sample, id_name,
                                           sc_cell_barcodes, sc_cell_number)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes,
                            downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, match_cell_str,
                match_UMI_median, stat_file, args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(assay=args.assay,
                 name='count_capture_rna',
                 sample=args.sample,
                 stat_file=stat_file,
                 outdir=args.outdir + '/..')
    t.get_report()
Пример #4
0
 def setUp(self):
     """Load the capture_rna fixture data used by the tests of this class."""
     # All relative paths below are resolved against this directory.
     os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/0910_panel/')

     # plain settings
     self.sample = 'S20071508_D_TS'
     self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200729/S20071508_D_ZL'

     # count detail table produced by a previous run
     detail_path = './/S20071508_D_TS/05.count_capture_rna/S20071508_D_TS_count_detail.txt'
     self.df = pd.read_table(detail_path, header=0)

     # barcodes of the matched library
     matched = read_barcode_file(self.match_dir)
     self.sc_cell_barcodes, self.sc_cell_number = matched

     self.outdir = f'{self.sample}/05.count_capture_rna/'
     self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'

     # validated barcodes from the existing matrix (second return value unused)
     barcodes_tsv = f'{self.sample}/05.count_capture_rna/{self.sample}_matrix_10X/barcodes.tsv'
     self.validated_barcodes, _ = read_one_col(barcodes_tsv)

     _refFlat, self.gtf = glob_genomeDir(self.genomeDir)
     self.assay = 'capture_rna'
Пример #5
0
 def setUp(self):
     """Set up the paths and settings shared by the snpCalling tests."""
     # Relative fixture paths below are resolved against this directory.
     os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/snp')

     self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'
     self.gene_list_file = './gene_list.tsv'
     self.index_file = './S20070818_TS/05.snpCalling/S20070818_TS_cell_index.tsv'
     self.thread = 20
     self.outdir = './S20070818_TS/05.snpCalling/'

     # fa=True additionally yields the FASTA path; refFlat is not needed.
     genome_files = glob_genomeDir(self.genomeDir, fa=True)
     _refFlat, self.gtf, self.fasta = genome_files

     self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200717/S20070818_ZL/'
     self.sample = 'S20070818_TS'
     self.count_file = './S20070818_TS/05.snpCalling/S20070818_TS_count.tsv'
Пример #6
0
 def picard(self):
     """Run picard CollectRnaSeqMetrics on ``self.STAR_bam``.

     Stores the genome annotation paths on the instance and writes the
     metrics to ``self.picard_region_log``. The combined stdout/stderr of
     picard is captured; the exit code is not checked.
     """
     genome_files = glob_genomeDir(self.genomeDir)
     self.refFlat, self.gtf = genome_files
     self.picard_region_log = f'{self.outdir}/{self.sample}_region.log'

     # JVM options first, then the picard tool name and its KEY=value arguments
     jvm_opts = ['-Xmx20G', '-XX:ParallelGCThreads=4']
     tool_args = [
         f'I={self.STAR_bam}',
         f'O={self.picard_region_log}',
         f'REF_FLAT={self.refFlat}',
         'STRAND=NONE',
         'VALIDATION_STRINGENCY=SILENT',
     ]
     cmd = ['picard'] + jvm_opts + ['CollectRnaSeqMetrics'] + tool_args

     # stderr is merged into the captured stdout
     res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
Пример #7
0
def featureCounts(args):
    """Run featureCounts on the input BAM, name-sort the result and report.

    Args:
        args: parsed command-line namespace; this step reads ``genomeDir``,
            ``outdir``, ``sample``, ``thread``, ``gtf_type``, ``input`` and
            ``assay``.

    Raises:
        subprocess.CalledProcessError: if featureCounts, the samtools lookup
            or ``samtools sort`` exits non-zero.
    """
    # genome annotation (only the GTF is needed here)
    _refFlat, gtf = glob_genomeDir(args.genomeDir)

    # os.mkdir failed when parent directories were missing and could race
    # with the existence check; makedirs(exist_ok=True) handles both and
    # matches the `mkdir -p` behaviour of the sibling steps.
    os.makedirs(args.outdir, exist_ok=True)

    # run featureCounts
    outPrefix = args.outdir + '/' + args.sample
    cmd = ['featureCounts', '-a', gtf, '-o', outPrefix, '-R', 'BAM',
           '-T', str(args.thread), '-t', args.gtf_type, args.input]
    featureCounts.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)

    # make sure samtools is on PATH before sorting
    subprocess.check_call(['which', 'samtools'])

    # sort by read name (BC and umi)
    featureCounts.logger.info('samtools sort ...!')
    bam_basename = os.path.basename(args.input)
    # presumably the BAM written by `featureCounts -R BAM` — confirm naming
    featureCounts_bam = args.outdir + '/' + bam_basename + '.featureCounts.bam'
    cmd = [
        'samtools',
        'sort',
        '-n',
        '-@',
        '3',
        '-o',
        outPrefix + '_name_sorted.bam',
        featureCounts_bam]
    featureCounts.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)
    featureCounts.logger.info('samtools sort done.')

    format_stat(args.outdir + '/' + args.sample + '.summary', args.sample)
    t = reporter(
        name='featureCounts',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..')
    t.get_report()
Пример #8
0
def count(args):
    """Run the counting step for one sample.

    Writes the count-detail table, calls cells, builds the expression
    matrix, downsamples, and produces the summary and report.

    Args:
        args: parsed command-line namespace; this step reads ``genomeDir``,
            ``outdir``, ``sample``, ``bam``, ``cells`` and ``assay``.
    """
    # genome annotation (only the GTF is needed here)
    _refFlat, gtf = glob_genomeDir(args.genomeDir)

    # Create the output directory without going through a shell: the old
    # `mkdir -p %s` string broke on paths containing spaces or shell
    # metacharacters and silently ignored failures.
    os.makedirs(args.outdir, exist_ok=True)

    # UMI correction; writes a table with the header: Barcode geneID UMI count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    bam2table(args.bam, count_detail_file)

    df = pd.read_table(count_detail_file, header=0)

    # call cells (threshold and cell number are returned but unused here)
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, _threshold, _cell_num,
     CB_describe) = call_cells(df, args.cells, pdf, marked_counts_file)

    # write the expression matrix
    (CB_total_Genes, CB_reads_count,
     reads_mapped_to_transcriptome) = expression_matrix(
         df, validated_barcodes, args.outdir, args.sample, gtf)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes,
                            downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, stat_file,
                args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(assay=args.assay,
                 name='count',
                 sample=args.sample,
                 stat_file=stat_file,
                 outdir=args.outdir + '/..')
    t.get_report()
Пример #9
0
 def setUp(self):
     """Set up fixture paths and build the ``analysis_variant`` step under test."""
     # Relative fixture paths below are resolved against this directory.
     os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/snp')

     self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'
     self.gene_list_file = './gene_list.tsv'
     self.index_file = './S20070818_TS/05.snpCalling/S20070818_TS_cell_index.tsv'
     self.thread = 20
     self.outdir = './S20070818_TS/05.snpCalling/'
     self.analysis_outdir = './S20070818_TS/06.analysis_snp'

     # fa=True additionally yields the FASTA path; refFlat is not needed.
     genome_files = glob_genomeDir(self.genomeDir, fa=True)
     _refFlat, self.gtf, self.fasta = genome_files

     self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200717/S20070818_ZL/'
     self.sample = 'S20070818_TS'
     self.count_file = './S20070818_TS/05.snpCalling/S20070818_TS_count.tsv'
     self.vcf_file = './S20070818_TS/05.snpCalling/S20070818_TS_anno.vcf'
     self.assay = 'snp'
     self.annovar_config = '/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/soft/annovar/annovar.config'

     # Positional arguments, in the order the step is constructed with.
     step_args = (
         self.analysis_outdir,
         self.sample,
         self.match_dir,
         self.vcf_file,
         self.index_file,
         self.assay,
         self.annovar_config,
     )
     self.step_analysis_variant = analysis_variant(*step_args)
Пример #10
0
 def test_convert(self):
     """Convert the fixture gene list with the genome GTF and print the result."""
     # Only the GTF path is needed; the refFlat path is discarded.
     _unused_refflat, annotation_gtf = glob_genomeDir(self.genomeDir)
     converted = convert(self.gene_list_file, annotation_gtf)
     print(converted)
Пример #11
0
def count(args):
    """Run the counting step for one sample, with optional low-UMI cell rescue.

    Writes the count-detail table, exports the matrix of all barcodes,
    calls cells (optionally adjusting the threshold via ``rescue_cells``),
    exports the matrix of validated cells, downsamples, and produces the
    summary and report.

    Args:
        args: parsed command-line namespace; this step reads ``outdir``,
            ``sample``, ``assay``, ``cells``, ``rescue``, ``genomeDir`` and
            ``bam``.
    """
    # args
    outdir = args.outdir
    sample = args.sample
    assay = args.assay
    cells = args.cells
    rescue = args.rescue

    # genome annotation (only the GTF is needed here)
    _refFlat, gtf_file = glob_genomeDir(args.genomeDir)

    # Create the output directory without going through a shell: the old
    # `mkdir -p %s` string broke on paths containing spaces or shell
    # metacharacters and silently ignored failures.
    os.makedirs(outdir, exist_ok=True)

    # UMI correction; writes a table with the header: Barcode geneID UMI count
    count_detail_file = outdir + '/' + sample + '_count_detail.txt.gz'
    bam2table(args.bam, count_detail_file)

    df = pd.read_table(count_detail_file, header=0)

    # export all matrix
    dir_name = 'all_matrix'
    matrix_10X(df, outdir, sample, gtf_file, dir_name=dir_name)

    # call cells
    pdf = outdir + '/barcode_filter_magnitude.pdf'
    df_sum, threshold = call_cells(df, cells, pdf)

    # rescue low UMI cells: replaces the threshold using the all-barcode matrix
    if rescue:
        matrix_dir = f"{outdir}/{sample}_{dir_name}/"
        threshold = rescue_cells(outdir, sample, matrix_dir, threshold)

    # get cell stats
    marked_counts_file = outdir + '/' + sample + '_counts.txt'
    validated_barcodes, CB_describe = get_cell_stats(df_sum, threshold,
                                                     marked_counts_file)

    # export cell matrix
    matrix_10X(df,
               outdir,
               sample,
               gtf_file,
               dir_name='matrix_10X',
               validated_barcodes=validated_barcodes)
    (CB_total_Genes, CB_reads_count,
     reads_mapped_to_transcriptome) = expression_matrix(
         df, validated_barcodes, outdir, sample, gtf_file)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = outdir + '/' + sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes,
                            downsample_file)

    # summary
    stat_file = outdir + '/stat.txt'
    get_summary(df, sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, stat_file,
                outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, outdir + '/..')

    t = reporter(assay=assay,
                 name='count',
                 sample=sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()