def snpCalling(args):
    """Run the SNP-calling step for one sample.

    Reads the barcodes from ``args.match_dir``, resolves the genome files,
    converts the gene list, then hands off to ``split_bam``,
    ``call_all_snp`` and ``summary`` in that order.

    Args:
        args: parsed command-line namespace providing ``sample``, ``outdir``,
            ``thread``, ``match_dir``, ``bam``, ``genomeDir``, ``gene_list``
            and ``min_query_length``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    sample = args.sample
    outdir = args.outdir
    thread = int(args.thread)
    match_dir = args.match_dir
    bam = args.bam
    genomeDir = args.genomeDir
    gene_list_file = args.gene_list
    min_query_length = args.min_query_length

    # process args
    barcodes, _nCell = read_barcode_file(match_dir)

    # check dir: create it without going through a shell, so paths with
    # spaces or shell metacharacters work and failures are not ignored.
    os.makedirs(outdir, exist_ok=True)

    # get genome file
    _refFlat, gtf, fasta = glob_genomeDir(genomeDir, fa=True)

    # convert gene
    gene_id_name_dic = convert(gene_list_file, gtf)

    # split bam
    index_file, count_file = split_bam(
        bam, barcodes, outdir, sample, gene_id_name_dic, min_query_length)

    # snp
    call_all_snp(index_file, outdir, thread, fasta)

    # summary
    summary(index_file, count_file, outdir, sample)
def STAR(args):
    """Align reads with STAR, run picard CollectRnaSeqMetrics and report.

    Args:
        args: parsed command-line namespace providing ``genomeDir``,
            ``outdir``, ``sample``, ``thread``, ``fq``, ``out_unmapped``
            and ``assay``.

    Raises:
        OSError: if the output directory cannot be created.
        subprocess.CalledProcessError: if the STAR command exits non-zero.
    """
    # NOTE(review): ``STAR.logger`` is an attribute on this function object;
    # it is presumably attached elsewhere (e.g. by a decorator) -- confirm.

    # check
    refFlat, _gtf = glob_genomeDir(args.genomeDir)

    # check dir: create it without a shell so unusual paths are handled and
    # failures surface immediately.
    os.makedirs(args.outdir, exist_ok=True)

    # run STAR
    outPrefix = args.outdir + '/' + args.sample + '_'
    cmd = [
        'STAR',
        '--runThreadN', str(args.thread),
        '--genomeDir', args.genomeDir,
        '--readFilesIn', args.fq,
        '--readFilesCommand', 'zcat',
        '--outFilterMultimapNmax', '1',
        '--outFileNamePrefix', outPrefix,
        '--outSAMtype', 'BAM', 'SortedByCoordinate']
    # Unmapped reads are only written when explicitly requested.
    if args.out_unmapped:
        cmd += ['--outReadsUnmapped', 'Fastx']
    STAR.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)

    STAR.logger.info('picard start...')
    outBam = outPrefix + 'Aligned.sortedByCoord.out.bam'
    region_txt = args.outdir + '/' + args.sample + '_region.log'
    cmd = [
        'picard',
        '-Xmx4G',
        '-XX:ParallelGCThreads=4',
        'CollectRnaSeqMetrics',
        'I=%s' % (outBam),
        'O=%s' % (region_txt),
        'REF_FLAT=%s' % (refFlat),
        'STRAND=NONE',
        'VALIDATION_STRINGENCY=SILENT']
    STAR.logger.info('%s' % (' '.join(cmd)))
    # stderr is merged into stdout so the whole picard output is logged.
    # NOTE(review): picard's exit status is not checked here.
    res = subprocess.run(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    STAR.logger.info(res.stdout)
    STAR.logger.info('picard done.')

    plot = format_stat(
        args.outdir + '/' + args.sample + '_Log.final.out',
        region_txt,
        args.sample)

    t = reporter(
        name='STAR',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..',
        plot=plot)
    t.get_report()
def count_capture_rna(args):
    """Run the counting step for a capture-RNA sample and build its report.

    Args:
        args: parsed command-line namespace providing ``genomeDir``,
            ``outdir``, ``sample``, ``bam``, ``cells``, ``match_dir`` and
            ``assay``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)
    id_name = gene_convert(gtf)

    # check and create the output directory (no shell involved, so paths
    # with spaces work and failures are not silently ignored)
    os.makedirs(args.outdir, exist_ok=True)

    # UMI correction; output a table with the header
    # Barcode geneID UMI count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    df_probe = bam2table(args.bam, count_detail_file, id_name)
    df_probe.to_csv(
        f'{args.outdir}/{args.sample}_probe_gene_count.tsv',
        sep='\t', index=False)
    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, _threshold, _cell_num, CB_describe) = call_cells(
        df, args.cells, pdf, marked_counts_file)

    # match barcode
    sc_cell_barcodes, sc_cell_number = read_barcode_file(args.match_dir)

    # output matrix
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome,
     match_cell_str, match_UMI_median) = expression_matrix(
        df, validated_barcodes, args.outdir, args.sample, id_name,
        sc_cell_barcodes, sc_cell_number)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(
        count_detail_file, validated_barcodes, downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(
        df, args.sample, Saturation, CB_describe, CB_total_Genes,
        CB_reads_count, reads_mapped_to_transcriptome, match_cell_str,
        match_UMI_median, stat_file, args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(
        assay=args.assay,
        name='count_capture_rna',
        sample=args.sample,
        stat_file=stat_file,
        outdir=args.outdir + '/..')
    t.get_report()
def setUp(self):
    """Prepare the fixture: chdir to the test data and load its inputs.

    The working directory is changed first because several of the paths
    below are relative to it.
    """
    os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/0910_panel/')

    # Plain configuration values.
    self.sample = 'S20071508_D_TS'
    self.assay = 'capture_rna'
    self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'
    self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200729/S20071508_D_ZL'
    self.outdir = f'{self.sample}/05.count_capture_rna/'

    # Count-detail table produced by a previous run of the step.
    detail_path = './/S20071508_D_TS/05.count_capture_rna/S20071508_D_TS_count_detail.txt'
    self.df = pd.read_table(detail_path, header=0)

    # Barcodes of the matched library.
    matched = read_barcode_file(self.match_dir)
    self.sc_cell_barcodes, self.sc_cell_number = matched

    # Barcodes listed in the step's own 10X-style matrix directory.
    barcodes_tsv = f'{self.sample}/05.count_capture_rna/{self.sample}_matrix_10X/barcodes.tsv'
    self.validated_barcodes, _ = read_one_col(barcodes_tsv)

    # Only the GTF is kept from the genome directory lookup.
    _refFlat, self.gtf = glob_genomeDir(self.genomeDir)
def setUp(self):
    """Prepare the snp fixture: chdir to the test data and set the inputs.

    The working directory is changed first because the sample paths
    below are relative to it.
    """
    os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/snp')

    # Run parameters.
    self.sample = 'S20070818_TS'
    self.thread = 20

    # Reference data and the matched library.
    self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'
    self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200717/S20070818_ZL/'
    self.gene_list_file = './gene_list.tsv'

    # Files belonging to the snpCalling step.
    self.outdir = './S20070818_TS/05.snpCalling/'
    self.index_file = './S20070818_TS/05.snpCalling/S20070818_TS_cell_index.tsv'
    self.count_file = './S20070818_TS/05.snpCalling/S20070818_TS_count.tsv'

    # The GTF and FASTA are kept; the refFlat entry is not needed here.
    genome_files = glob_genomeDir(self.genomeDir, fa=True)
    _refFlat, self.gtf, self.fasta = genome_files
def picard(self):
    """Run picard CollectRnaSeqMetrics on ``self.STAR_bam``.

    Sets ``self.refFlat``, ``self.gtf`` and ``self.picard_region_log``.
    The picard output is captured (stderr merged into stdout) and its
    exit status is not inspected.
    """
    genome_files = glob_genomeDir(self.genomeDir)
    self.refFlat, self.gtf = genome_files
    self.picard_region_log = f'{self.outdir}/{self.sample}_region.log'

    jvm_options = ['-Xmx20G', '-XX:ParallelGCThreads=4']
    metric_args = [
        f'I={self.STAR_bam}',
        f'O={self.picard_region_log}',
        f'REF_FLAT={self.refFlat}',
        'STRAND=NONE',
        'VALIDATION_STRINGENCY=SILENT',
    ]
    command = ['picard', *jvm_options, 'CollectRnaSeqMetrics', *metric_args]

    subprocess.run(command, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
def featureCounts(args):
    """Run featureCounts on the input BAM, name-sort the result and report.

    Args:
        args: parsed command-line namespace providing ``genomeDir``,
            ``outdir``, ``sample``, ``thread``, ``gtf_type``, ``input``
            and ``assay``.

    Raises:
        OSError: if the output directory cannot be created.
        subprocess.CalledProcessError: if featureCounts, the ``which
            samtools`` availability check, or ``samtools sort`` exits
            non-zero.
    """
    # NOTE(review): ``featureCounts.logger`` is an attribute on this function
    # object; it is presumably attached elsewhere (e.g. by a decorator).

    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)

    # check dir: makedirs also creates missing parent directories and does
    # not fail if the directory appears between a check and the creation.
    os.makedirs(args.outdir, exist_ok=True)

    # run featureCounts
    outPrefix = args.outdir + '/' + args.sample
    cmd = [
        'featureCounts',
        '-a', gtf,
        '-o', outPrefix,
        '-R', 'BAM',
        '-T', str(args.thread),
        '-t', args.gtf_type,
        args.input]
    featureCounts.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)

    # Fail early with a clear error if samtools is not on PATH.
    subprocess.check_call(['which', 'samtools'])

    # sort by name:BC and umi
    featureCounts.logger.info('samtools sort ...!')
    bam_basename = os.path.basename(args.input)
    cmd = [
        'samtools',
        'sort',
        '-n',
        '-@', '3',
        '-o', outPrefix + '_name_sorted.bam',
        args.outdir + '/' + bam_basename + '.featureCounts.bam']
    featureCounts.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)
    featureCounts.logger.info('samtools sort done.')

    format_stat(args.outdir + '/' + args.sample + '.summary', args.sample)

    t = reporter(
        name='featureCounts',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..')
    t.get_report()
def count(args):
    """Run the counting step for one sample and build its report.

    Args:
        args: parsed command-line namespace providing ``genomeDir``,
            ``outdir``, ``sample``, ``bam``, ``cells`` and ``assay``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)

    # check and create the output directory (no shell involved, so paths
    # with spaces work and failures are not silently ignored)
    os.makedirs(args.outdir, exist_ok=True)

    # UMI correction; output a table with the header
    # Barcode geneID UMI count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    bam2table(args.bam, count_detail_file)
    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, _threshold, _cell_num, CB_describe) = call_cells(
        df, args.cells, pdf, marked_counts_file)

    # output matrix
    (CB_total_Genes, CB_reads_count,
     reads_mapped_to_transcriptome) = expression_matrix(
        df, validated_barcodes, args.outdir, args.sample, gtf)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(
        count_detail_file, validated_barcodes, downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(
        df, args.sample, Saturation, CB_describe, CB_total_Genes,
        CB_reads_count, reads_mapped_to_transcriptome, stat_file,
        args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(
        assay=args.assay,
        name='count',
        sample=args.sample,
        stat_file=stat_file,
        outdir=args.outdir + '/..')
    t.get_report()
def setUp(self):
    """Prepare the snp-analysis fixture and build the step under test.

    The working directory is changed first because the sample paths
    below are relative to it.
    """
    os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/snp')

    # Run parameters.
    self.sample = 'S20070818_TS'
    self.assay = 'snp'
    self.thread = 20

    # Reference data, tool configuration and the matched library.
    self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'
    self.annovar_config = '/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/soft/annovar/annovar.config'
    self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200717/S20070818_ZL/'
    self.gene_list_file = './gene_list.tsv'

    # Files belonging to the snpCalling step.
    self.outdir = './S20070818_TS/05.snpCalling/'
    self.index_file = './S20070818_TS/05.snpCalling/S20070818_TS_cell_index.tsv'
    self.count_file = './S20070818_TS/05.snpCalling/S20070818_TS_count.tsv'
    self.vcf_file = './S20070818_TS/05.snpCalling/S20070818_TS_anno.vcf'

    # Output directory of the analysis step.
    self.analysis_outdir = './S20070818_TS/06.analysis_snp'

    # The GTF and FASTA are kept; the refFlat entry is not needed here.
    genome_files = glob_genomeDir(self.genomeDir, fa=True)
    _refFlat, self.gtf, self.fasta = genome_files

    # Object exercised by the tests; arguments are passed positionally.
    step_args = (
        self.analysis_outdir,
        self.sample,
        self.match_dir,
        self.vcf_file,
        self.index_file,
        self.assay,
        self.annovar_config,
    )
    self.step_analysis_variant = analysis_variant(*step_args)
def test_convert(self):
    """Print what ``convert`` returns for the fixture gene list and GTF."""
    genome_files = glob_genomeDir(self.genomeDir)
    # Only the GTF is needed; the refFlat entry is discarded.
    _refFlat, gtf_path = genome_files
    converted = convert(self.gene_list_file, gtf_path)
    print(converted)
def count(args):
    """Run the counting step, optionally rescuing low-UMI cells, and report.

    Args:
        args: parsed command-line namespace providing ``outdir``, ``sample``,
            ``assay``, ``cells``, ``rescue``, ``genomeDir`` and ``bam``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    # args
    outdir = args.outdir
    sample = args.sample
    assay = args.assay
    cells = args.cells
    rescue = args.rescue

    # check
    _refFlat, gtf_file = glob_genomeDir(args.genomeDir)

    # check and create the output directory (no shell involved, so paths
    # with spaces work and failures are not silently ignored)
    os.makedirs(outdir, exist_ok=True)

    # UMI correction; output a table with the header
    # Barcode geneID UMI count
    count_detail_file = outdir + '/' + sample + '_count_detail.txt.gz'
    bam2table(args.bam, count_detail_file)
    df = pd.read_table(count_detail_file, header=0)

    # export all matrix
    dir_name = 'all_matrix'
    matrix_10X(df, outdir, sample, gtf_file, dir_name=dir_name)

    # call cells
    pdf = outdir + '/barcode_filter_magnitude.pdf'
    df_sum, threshold = call_cells(df, cells, pdf)

    # rescue low UMI cells: the threshold is replaced by the one
    # returned from rescue_cells
    if rescue:
        matrix_dir = f"{outdir}/{sample}_{dir_name}/"
        threshold = rescue_cells(outdir, sample, matrix_dir, threshold)

    # get cell stats
    marked_counts_file = outdir + '/' + sample + '_counts.txt'
    validated_barcodes, CB_describe = get_cell_stats(
        df_sum, threshold, marked_counts_file)

    # export cell matrix
    matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X',
               validated_barcodes=validated_barcodes)
    (CB_total_Genes, CB_reads_count,
     reads_mapped_to_transcriptome) = expression_matrix(
        df, validated_barcodes, outdir, sample, gtf_file)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = outdir + '/' + sample + '_downsample.txt'
    Saturation = downsample(
        count_detail_file, validated_barcodes, downsample_file)

    # summary
    stat_file = outdir + '/stat.txt'
    get_summary(
        df, sample, Saturation, CB_describe, CB_total_Genes,
        CB_reads_count, reads_mapped_to_transcriptome, stat_file,
        outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, outdir + '/..')

    t = reporter(
        assay=assay,
        name='count',
        sample=sample,
        stat_file=stat_file,
        outdir=outdir + '/..')
    t.get_report()