def report(self):
    t = reporter(
        name='analysis_snp',
        assay=self.assay,
        sample=self.sample,
        outdir=self.outdir + '/..')
    t.get_report()
def analysis_capture_virus(args):
    # check dir
    outdir = args.outdir
    sample = args.sample
    virus_file = args.virus_file
    match_dir = args.match_dir
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # report
    tsne_df_file = glob.glob(f'{match_dir}/*analysis*/*tsne_coord.tsv')[0]
    marker_df_file = glob.glob(f'{match_dir}/*analysis*/*markers.tsv')[0]
    tsne_df = pd.read_csv(tsne_df_file, sep="\t")
    marker_df = pd.read_csv(marker_df_file, sep="\t")
    virus_df = pd.read_csv(virus_file, sep="\t")
    report_prepare(outdir, tsne_df, marker_df, virus_df)

    t = reporter(
        name='analysis_capture_virus',
        assay=args.assay,
        sample=args.sample,
        outdir=args.outdir + '/..')
    t.get_report()
def cutadapt(args):
    # check dir
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # run cutadapt
    adapt = []
    for a in args.adapt:
        adapt.append('-a')
        adapt.append(a)
    out_fq2 = args.outdir + '/' + args.sample + '_clean_2.fq.gz'
    cmd = ['cutadapt'] + adapt + [
        '-n', str(len(args.adapt)),
        '-j', str(args.thread),
        '-m', str(args.minimum_length),
        '--nextseq-trim=' + str(args.nextseq_trim),
        '--overlap', str(args.overlap),
        '-l', str(args.insert),
        '-o', out_fq2,
        args.fq,
    ]
    cutadapt.logger.info(' '.join(cmd))
    res = subprocess.run(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    with open(args.outdir + '/cutadapt.log', 'wb') as fh:
        fh.write(res.stdout)
    format_stat(args.outdir + '/cutadapt.log', args.sample)

    t = reporter(
        name='cutadapt',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..')
    t.get_report()
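# For orientation: with hypothetical arguments --adapt polyA=A{18} p5=AGATCGGAAGAGC,
# --thread 4, --minimum_length 20, --nextseq-trim 20, --overlap 10 and --insert 150,
# the command assembled above would resemble (values illustrative, not defaults):
#   cutadapt -a polyA=A{18} -a p5=AGATCGGAAGAGC -n 2 -j 4 -m 20 \
#       --nextseq-trim=20 --overlap 10 -l 150 -o sample_clean_2.fq.gz sample_2.fq.gz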
def analysis_smk(args):
    logger1.info('smk analysis ...!')
    # check dir
    outdir = args.outdir
    sample = args.sample
    tsne_tag_file = args.tsne_tag_file
    match_dir = args.match_dir
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # report
    tsne_df_file = glob.glob(f'{match_dir}/*analysis*/*tsne_coord.tsv')[0]
    marker_df_file = glob.glob(f'{match_dir}/*analysis*/*markers.tsv')[0]
    tsne_df = pd.read_csv(tsne_df_file, sep="\t")
    marker_df = pd.read_csv(marker_df_file, sep="\t")
    tsne_tag_df = pd.read_csv(tsne_tag_file, sep="\t", index_col=0)
    report_prepare(outdir, tsne_df, marker_df, tsne_tag_df)

    t = reporter(
        name='analysis_smk',
        assay=args.assay,
        sample=sample,
        outdir=args.outdir + '/..')
    t.get_report()
def sample_info(args):
    sample = args.sample
    ASSAY = ASSAY_DICT[args.assay]
    version = __VERSION__
    outdir = args.outdir
    chemistry = args.chemistry
    if not chemistry:
        chemistry = "Customized"
    # transcriptome = args.genomeDir.split("/")[-1]

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    stat = pd.DataFrame(
        {
            "item": ["Sample ID", "Assay", "Chemistry", "Software Version"],
            "count": [sample, ASSAY, chemistry, version],
        },
        columns=["item", "count"])
    stat_file = outdir + "/stat.txt"
    stat.to_csv(stat_file, sep=":", header=None, index=False)

    t = reporter(
        name='sample',
        assay=args.assay,
        sample=args.sample,
        stat_file=stat_file,
        outdir=outdir + '/..')
    t.get_report()
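# The resulting stat.txt is a colon-separated, two-column file consumed by the reporter,
# e.g. (values illustrative):
#   Sample ID:sample1
#   Assay:Single Cell RNA-Seq
#   Chemistry:Customized
#   Software Version:1.0.0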
def analysis_rna_virus(args):
    # check dir
    outdir = args.outdir
    sample = args.sample
    matrix_file = args.matrix_file
    virus_file = args.virus_file
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # run_R
    seurat(sample, outdir, matrix_file)

    # report
    tsne_df_file = glob.glob("{outdir}/*tsne_coord.tsv".format(outdir=outdir))[0]
    marker_df_file = glob.glob("{outdir}/*markers.tsv".format(outdir=outdir))[0]
    tsne_df = pd.read_csv(tsne_df_file, sep="\t")
    marker_df = pd.read_csv(marker_df_file, sep="\t")
    virus_df = pd.read_csv(virus_file, sep="\t")
    report_prepare(outdir, tsne_df, marker_df, virus_df)

    t = reporter(
        name='analysis_rna_virus',
        assay=args.assay,
        sample=args.sample,
        outdir=args.outdir + '/..')
    t.get_report()
def report(self):
    t = reporter(
        name=self.step_name,
        assay=self.assay,
        sample=self.sample,
        stat_file=self.stats_file,
        outdir=self.outdir + '/..',
        plot=self.plot)
    t.get_report()
def run(self):
    self.read_to_dic()
    self.tag_count()
    t = reporter(
        name='mapping_smk',
        assay="smk",
        sample=self.sample,
        stat_file=self.stat_file,
        outdir=self.outdir + '/..')
    t.get_report()
def STAR(args):
    # check
    refFlat, gtf = glob_genomeDir(args.genomeDir)

    # check dir
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # run STAR
    outPrefix = args.outdir + '/' + args.sample + '_'
    cmd = ['STAR', '--runThreadN', str(args.thread),
           '--genomeDir', args.genomeDir,
           '--readFilesIn', args.fq,
           '--readFilesCommand', 'zcat',
           '--outFilterMultimapNmax', '1',
           '--outFileNamePrefix', outPrefix,
           '--outSAMtype', 'BAM', 'SortedByCoordinate']
    if args.out_unmapped:
        cmd += ['--outReadsUnmapped', 'Fastx']
    STAR.logger.info(' '.join(cmd))
    subprocess.check_call(cmd)

    STAR.logger.info('picard start...')
    outBam = outPrefix + 'Aligned.sortedByCoord.out.bam'
    region_txt = args.outdir + '/' + args.sample + '_region.log'
    cmd = [
        'picard', '-Xmx4G', '-XX:ParallelGCThreads=4', 'CollectRnaSeqMetrics',
        'I=%s' % (outBam),
        'O=%s' % (region_txt),
        'REF_FLAT=%s' % (refFlat),
        'STRAND=NONE',
        'VALIDATION_STRINGENCY=SILENT']
    STAR.logger.info(' '.join(cmd))
    res = subprocess.run(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    STAR.logger.info(res.stdout)
    STAR.logger.info('picard done.')

    plot = format_stat(
        args.outdir + '/' + args.sample + '_Log.final.out', region_txt, args.sample)
    t = reporter(
        name='STAR',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..',
        plot=plot)
    t.get_report()
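# The picard step above collects region-level RNA-seq metrics; the invocation is
# equivalent to running (paths illustrative):
#   picard -Xmx4G -XX:ParallelGCThreads=4 CollectRnaSeqMetrics \
#       I=sample_Aligned.sortedByCoord.out.bam O=sample_region.log \
#       REF_FLAT=anno.refFlat STRAND=NONE VALIDATION_STRINGENCY=SILENT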
def report(self):
    self.stat_file = f'{self.outdir}/stat.txt'
    self.stats.to_csv(self.stat_file, sep=':', header=False)
    t = reporter(
        name='count_cite',
        assay=self.assay,
        sample=self.sample,
        stat_file=self.stat_file,
        outdir=self.outdir + '/..')
    t.get_report()
def report(self):
    t = reporter(
        name='demultiplex',
        assay='single-vdj',
        sample=self.samplename,
        outdir=self.outdir + '/..',
        stat_file=f'{self.outdir}/stat.txt',
        html_flag=False,
    )
    t.get_report()
def count_capture_rna(args):
    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)
    id_name = gene_convert(gtf)

    # check and create the output directory
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # UMI correction; writes a table with header: Barcode, geneID, UMI, count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    df_probe = bam2table(args.bam, count_detail_file, id_name)
    df_probe.to_csv(f'{args.outdir}/{args.sample}_probe_gene_count.tsv', sep='\t', index=False)
    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, threshold, cell_num, CB_describe) = call_cells(
        df, args.cells, pdf, marked_counts_file)

    # match barcode
    sc_cell_barcodes, sc_cell_number = read_barcode_file(args.match_dir)

    # write expression matrix
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome,
     match_cell_str, match_UMI_median) = expression_matrix(
        df, validated_barcodes, args.outdir, args.sample, id_name,
        sc_cell_barcodes, sc_cell_number)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes, downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, match_cell_str,
                match_UMI_median, stat_file, args.outdir + '/../')
    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(
        assay=args.assay,
        name='count_capture_rna',
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..')
    t.get_report()
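# count_detail.txt written by bam2table above is tab-separated with one row per
# (barcode, gene, UMI) triple, e.g. (values illustrative):
#   Barcode             geneID            UMI       count
#   AACGTGATAACGTGAT    ENSG00000243485   ATCGAATC  3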
def report(self, stat=True):
    if stat:
        stat_file = self.outdir + "/stat.txt"
    else:
        stat_file = ''
    t = reporter(
        name=self.step,
        assay=self.assay,
        sample=self.sample,
        outdir=self.outdir + '/..',
        stat_file=stat_file)
    t.get_report()
def featureCounts(args):
    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)

    # check dir
    if not os.path.exists(args.outdir):
        os.mkdir(args.outdir)

    # run featureCounts
    outPrefix = args.outdir + '/' + args.sample
    cmd = ['featureCounts', '-a', gtf, '-o', outPrefix, '-R', 'BAM',
           '-T', str(args.thread), '-t', args.gtf_type, args.input]
    featureCounts.logger.info(' '.join(cmd))
    subprocess.check_call(cmd)
    subprocess.check_call(['which', 'samtools'])

    # sort by name: BC and UMI
    featureCounts.logger.info('samtools sort ...!')
    bam_basename = os.path.basename(args.input)
    cmd = [
        'samtools', 'sort', '-n', '-@', '3',
        '-o', outPrefix + '_name_sorted.bam',
        args.outdir + '/' + bam_basename + '.featureCounts.bam']
    featureCounts.logger.info(' '.join(cmd))
    subprocess.check_call(cmd)
    featureCounts.logger.info('samtools sort done.')

    format_stat(args.outdir + '/' + args.sample + '.summary', args.sample)
    t = reporter(
        name='featureCounts',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..')
    t.get_report()
def count(args):
    # check
    refFlat, gtf = glob_genomeDir(args.genomeDir)

    # check and create the output directory
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # UMI correction; writes a table with header: Barcode, geneID, UMI, count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    bam2table(args.bam, count_detail_file)
    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, threshold, cell_num, CB_describe) = call_cells(
        df, args.cells, pdf, marked_counts_file)

    # write expression matrix
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome) = expression_matrix(
        df, validated_barcodes, args.outdir, args.sample, gtf)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes, downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, stat_file,
                args.outdir + '/../')
    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(
        assay=args.assay,
        name='count',
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..')
    t.get_report()
def analysis(args):
    # check dir
    outdir = args.outdir
    sample = args.sample
    matrix_file = args.matrix_file
    save_rds = args.save_rds
    type_marker_tsv = args.type_marker_tsv

    auto_assign_bool = False
    if type_marker_tsv and type_marker_tsv != 'None':
        auto_assign_bool = True
    if auto_assign_bool:
        # auto-assignment presumably consumes the saved Seurat object, so force rds output
        save_rds = True

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # run_R
    seurat(sample, outdir, matrix_file, save_rds)

    # auto_assign
    if auto_assign_bool:
        auto_assign(sample, outdir, type_marker_tsv)

    # report
    tsne_df_file = f'{outdir}/{sample}_tsne_coord.tsv'
    marker_df_file = f'{outdir}/{sample}_markers.tsv'
    tsne_df = pd.read_csv(tsne_df_file, sep="\t")
    marker_df = pd.read_csv(marker_df_file, sep="\t")
    report_prepare(outdir, tsne_df, marker_df)

    stat_file = outdir + "/stat.txt"
    assay = __ASSAY__
    t = reporter(
        name='analysis',
        assay=assay,
        sample=args.sample,
        outdir=args.outdir + '/..',
        stat_file=stat_file)
    t.get_report()
def sample_info(args):
    sample = args.sample
    ASSAY = ASSAY_DICT[args.assay]
    version = __VERSION__
    outdir = args.outdir

    # get chemistry
    if args.chemistry == 'auto':
        fq1 = args.fq1
        ch = Chemistry(fq1)
        chemistry = ch.check_chemistry()
    else:
        chemistry = args.chemistry

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    stat = pd.DataFrame(
        {
            "item": ["Sample ID", "Assay", "Chemistry", "Software Version"],
            "count": [sample, ASSAY, chemistry, version],
        },
        columns=["item", "count"])
    stat_file = outdir + "/stat.txt"
    stat.to_csv(stat_file, sep=":", header=None, index=False)

    t = reporter(
        name='sample',
        assay=args.assay,
        sample=args.sample,
        stat_file=stat_file,
        outdir=outdir + '/..')
    t.get_report()
    return chemistry
def test_report(self):
    t = reporter(
        assay=self.assay,
        name='count_capture_rna',
        sample=self.sample,
        stat_file=self.outdir + '/stat.txt',
        outdir=self.outdir + '/..')
    t.get_report()
def summary(index_file, count_file, outdir, sample):
    # init
    number = 0
    Number_of_Match_Cells_with_SNP = 0
    SNP_count_dict = defaultdict(int)
    coord_gene_dict = defaultdict(dict)

    # read index
    df_index, df_valid = read_index(index_file)

    # out vcf
    out_vcf = open(f'{outdir}/{sample}.vcf', 'wt')
    for index in df_valid.index:
        vcf_coords_dict = {}
        number += 1
        cell_vcf_file = f'{outdir}/cells/cell{index}/cell{index}_norm.vcf'

        # vcf coords
        with open(cell_vcf_file, 'rt') as f:
            for line in f:
                if line.startswith("#"):
                    # add vcf header (bam header is added below)
                    if number == 1:
                        new_line = process_vcf_header(line, sample)
                        if new_line:
                            out_vcf.write(new_line)
                    continue
                if line:
                    items = line.split('\t')
                    items[7] += f';CELL={index}'
                    new_line = '\t'.join(items)
                    out_vcf.write(new_line)
                    chrom = str(items[0])
                    pos = int(items[1])
                    if chrom not in vcf_coords_dict:
                        vcf_coords_dict[chrom] = set([pos])
                    else:
                        vcf_coords_dict[chrom].add(pos)
                    SNP_count_dict[index] += 1

        # add bam header
        if number == 1:
            cell_bam_file = f'{outdir}/cells/cell{index}/cell{index}_sorted.bam'
            cell_bam = pysam.AlignmentFile(cell_bam_file, "rb")
            header = cell_bam.header
            out_bam = pysam.AlignmentFile(f'{outdir}/{sample}.bam', "wb", header=header)

        # add bam
        if len(vcf_coords_dict) > 0:
            Number_of_Match_Cells_with_SNP += 1
            cell_bam_file = f'{outdir}/cells/cell{index}/cell{index}_sorted.bam'
            cell_bam = pysam.AlignmentFile(cell_bam_file, "rb")
            for read in cell_bam:
                bam_ref = str(read.reference_name)
                gene_name = read.get_tag('GN')
                aligned_pairs = read.get_aligned_pairs()
                align_dict = {}
                for pair in aligned_pairs:
                    ref_pos = pair[1]
                    read_pos = pair[0]
                    # use `is not None` rather than truthiness: reference position 0 is valid
                    if ref_pos is not None:
                        align_dict[ref_pos] = read_pos
                if bam_ref in vcf_coords_dict:
                    read_flag = False
                    for pos in vcf_coords_dict[bam_ref]:
                        if pos in align_dict:
                            read_flag = True
                            coord_gene_dict[bam_ref][pos] = gene_name
                    if read_flag:
                        out_bam.write(read)

    out_vcf.close()
    out_bam.close()
    pysam.sort("-o", f'{outdir}/{sample}_sorted.bam', f'{outdir}/{sample}.bam')
    cmd = f'samtools index {outdir}/{sample}_sorted.bam'
    os.system(cmd)

    # annotate vcf
    anno_vcf = open(f'{outdir}/{sample}_anno.vcf', 'wt')
    with open(f'{outdir}/{sample}.vcf', 'rt') as vcf:
        for line in vcf:
            if line.startswith('#'):
                anno_vcf.write(line)
                continue
            items = line.split('\t')
            chrom = str(items[0])
            pos = int(items[1])
            gene_name = coord_gene_dict[chrom][pos]
            items[7] += f';GENE={gene_name}'
            new_line = '\t'.join(items)
            anno_vcf.write(new_line)
    anno_vcf.close()

    # rm
    # os.remove(f'{outdir}/{sample}.vcf')
    # os.remove(f'{outdir}/{sample}.bam')

    # stat
    stats = pd.Series()
    n_match_cell = len(df_index.index)
    df_count = pd.read_csv(count_file, sep='\t')

    df_count_read = df_count.groupby('barcode').agg({'read_count': sum})
    read_total = sum(df_count_read['read_count'])
    Mean_Reads_per_Cell = round((read_total / n_match_cell), 2)
    stats = stats.append(pd.Series(
        Mean_Reads_per_Cell, index=['Mean Reads per Cell']))

    df_count_UMI = df_count.groupby('barcode').agg({'UMI': 'count'})
    UMI_total = sum(df_count_UMI['UMI'])
    Mean_UMIs_per_Cell = round((UMI_total / n_match_cell), 2)
    stats = stats.append(pd.Series(
        Mean_UMIs_per_Cell, index=['Mean UMIs per Cell']))

    stats = stats.append(pd.Series(
        format_stat(Number_of_Match_Cells_with_SNP, n_match_cell),
        index=['Number of Cells with Variants']))

    SNP_counts = list(SNP_count_dict.values())
    Mean_SNP_per_Cell = round(np.mean(SNP_counts), 3)
    stats = stats.append(pd.Series(
        Mean_SNP_per_Cell, index=['Mean Variants per Cell with Variants']))

    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(
        name='snpCalling',
        assay='snp',
        sample=sample,
        stat_file=stat_file,
        outdir=outdir + '/..')
    t.get_report()
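# After summary() runs, each record in {sample}_anno.vcf carries the two INFO fields
# added above, e.g. (coordinates and values illustrative):
#   chr1  12345  .  A  G  60  PASS  DP=10;CELL=3;GENE=TP53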
def count(args):
    # args
    outdir = args.outdir
    sample = args.sample
    assay = args.assay
    cells = args.cells
    rescue = args.rescue

    # check
    refFlat, gtf_file = glob_genomeDir(args.genomeDir)

    # check and create the output directory
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # UMI correction; writes a table with header: Barcode, geneID, UMI, count
    count_detail_file = outdir + '/' + sample + '_count_detail.txt.gz'
    bam2table(args.bam, count_detail_file)
    df = pd.read_table(count_detail_file, header=0)

    # export all matrix
    dir_name = 'all_matrix'
    matrix_10X(df, outdir, sample, gtf_file, dir_name=dir_name)

    # call cells
    pdf = outdir + '/barcode_filter_magnitude.pdf'
    df_sum, threshold = call_cells(df, cells, pdf)

    # rescue low UMI cells
    if rescue:
        matrix_dir = f"{outdir}/{sample}_{dir_name}/"
        threshold = rescue_cells(outdir, sample, matrix_dir, threshold)

    # get cell stats
    marked_counts_file = outdir + '/' + sample + '_counts.txt'
    validated_barcodes, CB_describe = get_cell_stats(df_sum, threshold, marked_counts_file)

    # export cell matrix
    matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X',
               validated_barcodes=validated_barcodes)
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome) = expression_matrix(
        df, validated_barcodes, outdir, sample, gtf_file)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = outdir + '/' + sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes, downsample_file)

    # summary
    stat_file = outdir + '/stat.txt'
    get_summary(df, sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, stat_file,
                outdir + '/../')
    report_prepare(marked_counts_file, downsample_file, outdir + '/..')

    t = reporter(
        assay=assay,
        name='count',
        sample=sample,
        stat_file=stat_file,
        outdir=outdir + '/..')
    t.get_report()
def barcode(args):
    # check dir
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % args.outdir)

    bc_pattern = __PATTERN_DICT__[args.chemistry]
    if bc_pattern:
        (linker, whitelist) = get_scope_bc(args.chemistry)
    else:
        bc_pattern = args.pattern
        linker = args.linker
        whitelist = args.whitelist
    if (not linker) or (not whitelist) or (not bc_pattern):
        barcode.logger.error("invalid chemistry or [pattern,linker,whitelist]")
        sys.exit()

    # parse pattern to dict, e.g. C8L10C8L10C8U8 ->
    # defaultdict(<type 'list'>, {'C': [[0, 8], [18, 26], [36, 44]],
    #                             'U': [[44, 52]], 'L': [[8, 18], [26, 36]]})
    pattern_dict = parse_pattern(bc_pattern)

    # check linker
    check_seq(linker, pattern_dict, "L")

    bool_T = 'T' in pattern_dict
    bool_L = 'L' in pattern_dict
    C_len = sum(item[1] - item[0] for item in pattern_dict['C'])

    barcode_qual_Counter = Counter()
    umi_qual_Counter = Counter()
    C_U_base_Counter = Counter()
    args.lowQual = ord2chr(args.lowQual)

    # generate mismatch-tolerant lookup dicts: substitute bases in the raw sequence
    # with A,T,C,G (1 mismatch for barcodes, 2 for linkers)
    barcode_dict = generate_seq_dict(whitelist, n=1)
    linker_dict = generate_seq_dict(linker, n=2)

    fq1_list = args.fq1.split(",")
    fq2_list = args.fq2.split(",")
    # merge multiple fastq files
    if len(fq1_list) > 1:
        barcode.logger.info("merge fastq with same sample name...")
        fastq_dir = args.outdir + "/../merge_fastq"
        if not os.path.exists(fastq_dir):
            os.system('mkdir -p %s' % fastq_dir)
        fastq1_file = f"{fastq_dir}/{args.sample}_1.fq.gz"
        fastq2_file = f"{fastq_dir}/{args.sample}_2.fq.gz"
        fq1_files = " ".join(fq1_list)
        fq2_files = " ".join(fq2_list)
        fq1_cmd = f"cat {fq1_files} > {fastq1_file}"
        fq2_cmd = f"cat {fq2_files} > {fastq2_file}"
        barcode.logger.info(fq1_cmd)
        os.system(fq1_cmd)
        barcode.logger.info(fq2_cmd)
        os.system(fq2_cmd)
        barcode.logger.info("merge fastq done.")
    else:
        fastq1_file = args.fq1
        fastq2_file = args.fq2

    fh1 = xopen(fastq1_file)
    fh2 = xopen(fastq2_file)
    out_fq2 = args.outdir + '/' + args.sample + '_2.fq.gz'
    fh3 = xopen(out_fq2, 'w')

    (total_num, clean_num, no_polyT_num, lowQual_num,
     no_linker_num, no_barcode_num) = (0, 0, 0, 0, 0, 0)
    Barcode_dict = defaultdict(int)

    if args.nopolyT:
        fh1_without_polyT = xopen(args.outdir + '/noPolyT_1.fq', 'w')
        fh2_without_polyT = xopen(args.outdir + '/noPolyT_2.fq', 'w')
    if args.noLinker:
        fh1_without_linker = xopen(args.outdir + '/noLinker_1.fq', 'w')
        fh2_without_linker = xopen(args.outdir + '/noLinker_2.fq', 'w')

    bool_probe = False
    if args.probe_file and args.probe_file != 'None':
        bool_probe = True
        count_dic = genDict(dim=3)
        valid_count_dic = genDict(dim=2)
        probe_dic = read_fasta(args.probe_file)
        reads_without_probe = 0

    g1 = read_fastq(fh1)
    g2 = read_fastq(fh2)

    while True:
        try:
            (header1, seq1, qual1) = next(g1)
            (header2, seq2, qual2) = next(g2)
        except StopIteration:
            # both generators are exhausted together; any other error should propagate
            break
        if total_num > 0 and total_num % 1000000 == 0:
            barcode.logger.info(
                f'processed reads: {format_number(total_num)}. '
                f'valid reads: {format_number(clean_num)}.')
        total_num += 1

        # polyT filter
        if bool_T:
            polyT = seq_ranges(seq1, pattern_dict['T'])
            if no_polyT(polyT):
                no_polyT_num += 1
                if args.nopolyT:
                    fh1_without_polyT.write('@%s\n%s\n+\n%s\n' % (header1, seq1, qual1))
                    fh2_without_polyT.write('@%s\n%s\n+\n%s\n' % (header2, seq2, qual2))
                continue

        # lowQual filter
        C_U_quals_ascii = seq_ranges(qual1, pattern_dict['C'] + pattern_dict['U'])
        # C_U_quals_ord = [ord(q) - 33 for q in C_U_quals_ascii]
        if low_qual(C_U_quals_ascii, args.lowQual, args.lowNum):
            lowQual_num += 1
            continue

        # linker filter
        barcode_arr = [seq_ranges(seq1, [i]) for i in pattern_dict['C']]
        raw_cb = ''.join(barcode_arr)
        if bool_L:
            linker = seq_ranges(seq1, pattern_dict['L'])
            if no_linker(linker, linker_dict):
                no_linker_num += 1
                if args.noLinker:
                    fh1_without_linker.write('@%s\n%s\n+\n%s\n' % (header1, seq1, qual1))
                    fh2_without_linker.write('@%s\n%s\n+\n%s\n' % (header2, seq2, qual2))
                continue

        # barcode filter
        res = no_barcode(barcode_arr, barcode_dict)
        if res is True:
            no_barcode_num += 1
            continue
        elif res == "correct":
            cb = raw_cb
        else:
            cb = res

        umi = seq_ranges(seq1, pattern_dict['U'])
        Barcode_dict[cb] += 1
        clean_num += 1
        read_name_probe = 'None'

        if bool_probe:
            # valid count
            valid_count_dic[cb][umi] += 1
            # output probe UMI and read count
            find_probe = False
            for probe_name in probe_dic:
                probe_seq = probe_dic[probe_name]
                probe_seq = probe_seq.upper()
                if seq1.find(probe_seq) != -1:
                    count_dic[probe_name][cb][umi] += 1
                    read_name_probe = probe_name
                    find_probe = True
                    break
            if not find_probe:
                reads_without_probe += 1

        barcode_qual_Counter.update(C_U_quals_ascii[:C_len])
        umi_qual_Counter.update(C_U_quals_ascii[C_len:])
        C_U_base_Counter.update(raw_cb + umi)

        # new readID: @barcode_umi_probe_readIndex
        fh3.write(f'@{cb}_{umi}_{read_name_probe}_{total_num}\n{seq2}\n+\n{qual2}\n')

    fh3.close()

    # logging
    if total_num % 1000000 != 0:
        barcode.logger.info(
            f'processed reads: {format_number(total_num)}. '
            f'valid reads: {format_number(clean_num)}.')
    if clean_num == 0:
        raise Exception('no valid reads found! please check the --chemistry parameter.')

    if bool_probe:
        # total probe summary
        total_umi = 0
        total_valid_read = 0
        for cb in valid_count_dic:
            total_umi += len(valid_count_dic[cb])
            total_valid_read += sum(valid_count_dic[cb].values())
        barcode.logger.info("total umi:" + str(total_umi))
        barcode.logger.info("total valid read:" + str(total_valid_read))
        barcode.logger.info("reads without probe:" + str(reads_without_probe))

        # probe summary
        count_list = []
        for probe_name in probe_dic:
            UMI_count = 0
            read_count = 0
            if probe_name in count_dic:
                for cb in count_dic[probe_name]:
                    UMI_count += len(count_dic[probe_name][cb])
                    read_count += sum(count_dic[probe_name][cb].values())
            count_list.append({
                "probe_name": probe_name,
                "UMI_count": UMI_count,
                "read_count": read_count})
        df_count = pd.DataFrame(count_list, columns=["probe_name", "read_count", "UMI_count"])

        def format_percent(x):
            return str(round(x * 100, 2)) + "%"

        df_count["read_fraction"] = (
            df_count["read_count"] / total_valid_read).apply(format_percent)
        df_count["UMI_fraction"] = (
            df_count["UMI_count"] / total_umi).apply(format_percent)
        df_count.sort_values(by="UMI_count", inplace=True, ascending=False)
        df_count_file = args.outdir + '/' + args.sample + '_probe_count.tsv'
        df_count.to_csv(df_count_file, sep="\t", index=False)

    # stat
    BarcodesQ30 = sum(
        [barcode_qual_Counter[k] for k in barcode_qual_Counter if k >= ord2chr(30)]
    ) / float(sum(barcode_qual_Counter.values())) * 100
    UMIsQ30 = sum(
        [umi_qual_Counter[k] for k in umi_qual_Counter if k >= ord2chr(30)]
    ) / float(sum(umi_qual_Counter.values())) * 100

    global stat_info

    def cal_percent(x):
        return "{:.2%}".format((x + 0.0) / total_num)

    with open(args.outdir + '/stat.txt', 'w') as fh:
        # stat_info is a global template of the form:
        #   Raw Reads: %s
        #   Valid Reads: %s(%s)
        #   Q30 of Barcodes: %.2f%%
        #   Q30 of UMIs: %.2f%%
        stat_info = stat_info % (
            format_number(total_num), format_number(clean_num),
            cal_percent(clean_num), BarcodesQ30, UMIsQ30)
        stat_info = re.sub(r'^\s+', r'', stat_info, flags=re.M)
        fh.write(stat_info)

    barcode.logger.info('fastqc ...!')
    cmd = ['fastqc', '-t', str(args.thread), '-o', args.outdir, out_fq2]
    barcode.logger.info(' '.join(cmd))
    subprocess.check_call(cmd)
    barcode.logger.info('fastqc done!')

    t = reporter(
        name='barcode',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..')
    t.get_report()
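# A minimal, self-contained sketch of the mismatch-tolerant lookup idea behind
# generate_seq_dict(whitelist, n=1) above (hypothetical helper, not the pipeline's
# implementation): map every single-base substitution of each barcode back to the
# original whitelist sequence.
def one_mismatch_dict(seqs):
    mismatch = {}
    for seq in seqs:
        mismatch[seq] = seq
        for i in range(len(seq)):
            for base in 'ACGT':
                if base != seq[i]:
                    mismatch[seq[:i] + base + seq[i + 1:]] = seq
    return mismatch

# e.g. one_mismatch_dict(['ACGT'])['AGGT'] -> 'ACGT'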
def summary(fq, alignments, type, outdir, sample, assay, debug):
    chains = CHAINS[type]
    '''
    # out files
    UMI_unfiltered_file = f'{outdir}/{sample}_UMI_unfiltered.tsv'
    UMI_filtered1_file = f'{outdir}/{sample}_UMI_filtered1.tsv'
    UMI_filtered2_file = f'{outdir}/{sample}_UMI_filtered2.tsv'
    '''
    UMI_count_unfiltered_file = f'{outdir}/{sample}_UMI_count_unfiltered.tsv'
    UMI_count_filtered1_file = f'{outdir}/{sample}_UMI_count_filtered1.tsv'

    # read fq; read names follow the convention written by the barcode step:
    # @{barcode}_{UMI}_{probe}_{readIndex}
    read2 = gzip.open(fq, "rt")
    index = 0
    read_row_list = []
    while True:
        line1 = read2.readline()
        line2 = read2.readline()
        line3 = read2.readline()
        line4 = read2.readline()
        if not line4:
            break
        attr = str(line1).strip("@").split("_")
        barcode = str(attr[0])
        umi = str(attr[1])
        dic = {"readId": index, "barcode": barcode, "UMI": umi}
        read_row_list.append(dic)
        index += 1
    df_read = pd.DataFrame(read_row_list, columns=["readId", "barcode", "UMI"])
    mapping_vdj.logger.info("fq reads to dataframe done.")
    read2.close()
    total_read = df_read.shape[0]

    # init row list
    mapping_summary_row_list = []

    # mapped
    alignment = pd.read_csv(alignments, sep="\t")
    alignment.readId = alignment.readId.astype(int)
    align_read = alignment.shape[0]
    df_read.readId = df_read.readId.astype(int)
    df_align = pd.merge(df_read, alignment, on="readId", how="right")
    mapping_summary_row_list.append({
        "item": "Reads Mapped to Any VDJ Gene",
        "count": align_read,
        "total_count": total_read,
    })

    # CDR3
    df_CDR3 = df_align[~pd.isnull(df_align["aaSeqCDR3"])]
    align_read_with_CDR3 = df_CDR3.shape[0]
    mapping_summary_row_list.append({
        "item": "Reads with CDR3",
        "count": align_read_with_CDR3,
        "total_count": total_read,
    })

    # correct CDR3
    df_correct_CDR3 = df_CDR3[~(df_CDR3["aaSeqCDR3"].str.contains(r"\*"))]
    align_read_with_correct_CDR3 = df_correct_CDR3.shape[0]
    mapping_summary_row_list.append({
        "item": "Reads with Correct CDR3",
        "count": align_read_with_correct_CDR3,
        "total_count": total_read,
    })

    # VDJ
    df_VJ = df_correct_CDR3[
        (~pd.isnull(df_correct_CDR3['bestVGene'])) &
        (~pd.isnull(df_correct_CDR3['bestJGene']))
    ]
    # keep reads whose V and J genes belong to the same chain
    df_VJ = df_VJ[df_VJ.bestVGene.str[:3] == df_VJ.bestJGene.str[:3]]
    df_VJ["chain"] = df_VJ.bestVGene.str[:3]
    df_VJ["VJ_pair"] = df_VJ["bestVGene"] + "_" + df_VJ["bestJGene"]
    Reads_Mapped_Confidently_to_VJ_Gene = df_VJ.shape[0]
    mapping_summary_row_list.append({
        "item": "Reads Mapped Confidently to VJ Gene",
        "count": Reads_Mapped_Confidently_to_VJ_Gene,
        "total_count": total_read,
    })

    # chain
    for chain in chains:
        df_chain = df_VJ[df_VJ.chain == chain]
        Reads_Mapped_to_chain = df_chain.shape[0]
        mapping_summary_row_list.append({
            "item": f"Reads Mapped to {chain}",
            "count": Reads_Mapped_to_chain,
            "total_count": total_read,
        })

    # unique UMI
    df_UMI = df_VJ.drop_duplicates(subset=["barcode", "UMI"], keep="first")

    # filter1: keep top 1 in each combination
    groupby_elements = [
        'barcode',
        'chain',
        'bestVGene',
        'bestJGene',
        'aaSeqCDR3',
        'nSeqCDR3',
    ]
    df_UMI_count = df_UMI.groupby(groupby_elements, as_index=False).agg({"UMI": "count"})
    df_UMI_count = df_UMI_count.sort_values("UMI", ascending=False)
    # out unfiltered
    df_UMI_count.to_csv(UMI_count_unfiltered_file, sep="\t", index=False)

    df_UMI_count_filter1 = df_UMI_count.groupby(
        ["barcode", "chain"], as_index=False).head(1)
    # out filtered1
    df_UMI_count_filter1.to_csv(UMI_count_filtered1_file, sep="\t", index=False)

    if debug:
        unique_UMI = df_UMI.shape[0]
        mapping_summary_row_list.append({
            "item": "UMI unique count",
            "count": unique_UMI,
            "total_count": align_read_with_correct_CDR3,
        })
        UMI_after_Contamination_Filtering = df_UMI_count_filter1.UMI.sum()
        mapping_summary_row_list.append({
            "item": "UMI after Contamination Filtering",
            "count": UMI_after_Contamination_Filtering,
            "total_count": unique_UMI,
        })

    # stat file
    df = pd.DataFrame(mapping_summary_row_list, columns=["item", "count", "total_count"])
    stat_file = f'{outdir}/stat.txt'
    gen_stat(df, stat_file)

    # report
    STEP = 'mapping_vdj'
    name = f'{type}_{STEP}'
    t = reporter(
        name=name,
        sample=sample,
        stat_file=stat_file,
        outdir=outdir + '/..',
        assay=assay,
    )
    t.get_report()
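# A toy illustration (hypothetical data) of the filter1 rule above: after sorting by
# UMI support, groupby(...).head(1) keeps only the best-supported V/J/CDR3 combination
# per (barcode, chain).
import pandas as pd

toy = pd.DataFrame({
    'barcode': ['AAA', 'AAA', 'AAA'],
    'chain': ['TRA', 'TRA', 'TRB'],
    'aaSeqCDR3': ['CAVRD', 'CAVKD', 'CASSL'],
    'UMI': [5, 2, 7],
}).sort_values('UMI', ascending=False)
print(toy.groupby(['barcode', 'chain'], as_index=False).head(1))
# keeps CASSL (TRB, 7 UMIs) and CAVRD (TRA, 5 UMIs); CAVKD is dropped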
def count_vdj(args):
    sample = args.sample
    match_dir = args.match_dir
    UMI_min = args.UMI_min
    outdir = args.outdir
    UMI_count_filter1_file = args.UMI_count_filter1_file
    type = args.type
    debug = args.debug
    iUMI = int(args.iUMI)
    chains = CHAINS[type]

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    # out files
    cell_confident_file = f"{outdir}/{sample}_cell_confident.tsv"
    cell_confident_count_file = f"{outdir}/{sample}_cell_confident_count.tsv"
    clonetypes_file = f"{outdir}/{sample}_clonetypes.tsv"
    match_clonetypes_file = f"{outdir}/{sample}_match_clonetypes.tsv"
    top10_clonetypes_file = f"{outdir}/{sample}_top10_clonetypes.tsv"
    match_top10_clonetypes_file = f"{outdir}/{sample}_match_top10_clonetypes.tsv"

    # read file
    df_UMI_count_filter1 = pd.read_csv(UMI_count_filter1_file, sep='\t')
    if (not match_dir) or (match_dir == "None"):
        match_bool = False
    else:
        match_bool = True
    if match_bool:
        match_cell_barcodes, match_cell_number = read_barcode_file(match_dir)

    cell_summary_row_list = []

    # cell calling: keep barcodes with summed UMI >= UMI_min
    df_UMI_sum = df_UMI_count_filter1.groupby(['barcode'], as_index=False).agg({"UMI": "sum"})
    if UMI_min == "auto":
        rank = 20
        df_UMI_sum_sorted = df_UMI_sum.sort_values(["UMI"], ascending=False)
        rank_UMI = df_UMI_sum_sorted.iloc[rank, :]["UMI"]
        UMI_min = int(rank_UMI / 10)
    else:
        UMI_min = int(UMI_min)
    df_UMI_cell = df_UMI_sum[df_UMI_sum.UMI >= UMI_min]
    df_UMI_sum["mark"] = df_UMI_sum["UMI"].apply(lambda x: "CB" if (x >= UMI_min) else "UB")
    report_prepare(df_UMI_sum, outdir + "/../")

    cell_barcodes = set(df_UMI_cell.barcode)
    cell_number = len(cell_barcodes)
    cell_summary_row_list.append({
        "item": "Estimated Number of Cells",
        "count": cell_number,
        "total_count": cell_number,
    })

    # df_UMI_count_filter1 in cell
    df_cell = df_UMI_count_filter1[df_UMI_count_filter1.barcode.isin(cell_barcodes)]

    # filter2: keep rows with UMI >= iUMI for identical receptor type and CDR3 combinations
    df_cell_UMI_count_filter2 = df_cell[df_cell.UMI >= iUMI]

    # cell confident
    df_cell_confident = df_cell_UMI_count_filter2[
        df_cell_UMI_count_filter2["chain"].isin(chains)]
    df_cell_confident = df_cell_confident.sort_values("UMI", ascending=False)
    df_cell_confident = df_cell_confident.groupby(
        ["barcode", "chain"], as_index=False).head(1)

    # count
    df_cell_confident_count = df_cell_confident.set_index(["barcode", "chain"])
    df_cell_confident_count = df_cell_confident_count.unstack()
    df_cell_confident_count.columns = ['_'.join(col) for col in df_cell_confident_count]
    df_cell_confident_count = df_cell_confident_count.reset_index()
    df_cell_confident_count.fillna(inplace=True, value="NA")

    # clonetypes
    seqs = ["aaSeqCDR3", "nSeqCDR3"]
    cols = []
    for chain in chains:
        for seq in seqs:
            cols.append("_".join([seq, chain]))
    for col in cols:
        if col not in list(df_cell_confident_count.columns):
            df_cell_confident_count[col] = "NA"

    df_clonetypes = df_cell_confident_count.copy()
    df_clonetypes = df_clonetypes.groupby(cols, as_index=False).agg({"barcode": "count"})
    # put NA last
    df_clonetypes.replace('NA', np.nan, inplace=True)
    df_clonetypes.sort_values(
        ["barcode"] + cols, ascending=False, na_position='last', inplace=True)
    df_clonetypes.replace(np.nan, 'NA', inplace=True)

    total_CDR3_barcode_number = sum(df_clonetypes.barcode)
    df_clonetypes["percent"] = df_clonetypes.barcode / total_CDR3_barcode_number * 100
    df_clonetypes["percent"] = df_clonetypes["percent"].apply(lambda x: round(x, 2))

    # add clonetype ID
    df_clonetypes = df_clonetypes.reset_index()
    df_clonetypes["clonetype_ID"] = pd.Series(df_clonetypes.index) + 1
    df_clonetypes.drop(columns=["index"], inplace=True)

    # order
    order = ["clonetype_ID"] + cols + ["barcode", "percent"]
    df_clonetypes = df_clonetypes[order]
    df_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True)
    # out clonetypes
    df_clonetypes.to_csv(clonetypes_file, sep="\t", index=False)

    if type == "TCR":
        UMI_col_dic = {"TRA": "UMI_TRA", "TRB": "UMI_TRB"}
        for chain in UMI_col_dic:
            UMI_col_name = UMI_col_dic[chain]
            if UMI_col_name in df_cell_confident_count.columns:
                df_cell_confident_count[UMI_col_name].replace("NA", 0, inplace=True)
                Median_chain_UMIs_per_Cell = np.median(
                    df_cell_confident_count[UMI_col_name])
            else:
                Median_chain_UMIs_per_Cell = 0
            cell_summary_row_list.append({
                "item": "Median {chain} UMIs per Cell".format(chain=chain),
                "count": Median_chain_UMIs_per_Cell,
                "total_count": np.nan,
            })

        df_TRA_TRB = df_cell_confident_count[
            (df_cell_confident_count.aaSeqCDR3_TRA != "NA") &
            (df_cell_confident_count.aaSeqCDR3_TRB != "NA")]
        cell_with_confident_TRA_and_TRB = df_TRA_TRB.shape[0]
        cell_summary_row_list.append({
            "item": "Cell with TRA and TRB",
            "count": cell_with_confident_TRA_and_TRB,
            "total_count": cell_number,
        })

        # df cell barcode filter:
        # intersect cell_barcodes from scRNA-Seq with barcodes from TCR-Seq
        if match_bool:
            cell_with_match_barcode = match_cell_barcodes.intersection(cell_barcodes)
            cell_with_match_barcode_number = len(cell_with_match_barcode)

            df_match = df_cell_confident_count[
                df_cell_confident_count.barcode.isin(match_cell_barcodes)]

            df_match_TRA_TRB = df_match[
                (df_match.aaSeqCDR3_TRA != "NA") &
                (df_match.aaSeqCDR3_TRB != "NA")]
            match_cell_with_TRA_and_TRB = df_match_TRA_TRB.shape[0]
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match",
                "count": cell_with_match_barcode_number,
                "total_count": cell_number,
            })
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match, TRA and TRB",
                "count": match_cell_with_TRA_and_TRB,
                "total_count": cell_number,
            })

    # BCR
    elif type == "BCR":
        UMI_col_dic = {"IGH": "UMI_IGH", "IGL": "UMI_IGL", "IGK": "UMI_IGK"}
        for chain in UMI_col_dic:
            UMI_col_name = UMI_col_dic[chain]
            if UMI_col_name in df_cell_confident_count.columns:
                df_cell_confident_count[UMI_col_name].replace("NA", 0, inplace=True)
                df_cell_confident_count_over_zero = df_cell_confident_count[
                    df_cell_confident_count[UMI_col_name] > 0]
                Median_chain_UMIs_per_Cell = np.median(
                    df_cell_confident_count_over_zero[UMI_col_name])
            else:
                Median_chain_UMIs_per_Cell = 0
            cell_summary_row_list.append({
                "item": "Median {chain} UMIs per Cell".format(chain=chain),
                "count": Median_chain_UMIs_per_Cell,
                "total_count": np.nan,
            })

        df_heavy_and_light = df_cell_confident_count[
            (df_cell_confident_count.aaSeqCDR3_IGH != "NA") &
            ((df_cell_confident_count.aaSeqCDR3_IGL != "NA") |
             (df_cell_confident_count.aaSeqCDR3_IGK != "NA"))]
        Cell_with_Heavy_and_Light_Chain = df_heavy_and_light.shape[0]
        cell_summary_row_list.append({
            "item": "Cell with Heavy and Light Chain",
            "count": Cell_with_Heavy_and_Light_Chain,
            "total_count": cell_number,
        })

        # df cell barcode filter:
        # intersect cell_barcodes from normal scRNA-Seq with barcodes from BCR-Seq
        if match_bool:
            cell_with_match_barcode = match_cell_barcodes.intersection(cell_barcodes)
            cell_with_match_barcode_number = len(cell_with_match_barcode)

            df_match = df_cell_confident_count[
                df_cell_confident_count.barcode.isin(match_cell_barcodes)]

            # median match UMI
            df_match_heavy_light = df_match[
                (df_match.aaSeqCDR3_IGH != "NA") &
                ((df_match.aaSeqCDR3_IGL != "NA") |
                 (df_match.aaSeqCDR3_IGK != "NA"))]
            match_cell_with_heavy_and_light = df_match_heavy_light.shape[0]
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match",
                "count": cell_with_match_barcode_number,
                "total_count": cell_number,
            })
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match, Heavy and Light Chain",
                "count": match_cell_with_heavy_and_light,
                "total_count": cell_number,
            })

    if match_bool:
        # df_match_clonetypes
        df_match_clonetypes = df_match.groupby(cols, as_index=False).agg({"barcode": "count"})
        total_match_CDR3_barcode_number = sum(df_match_clonetypes.barcode)
        df_match_clonetypes["percent"] = (
            df_match_clonetypes.barcode / total_match_CDR3_barcode_number * 100)
        df_match_clonetypes["percent"] = df_match_clonetypes["percent"].apply(
            lambda x: round(x, 2))
        df_match_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True)
        df_match_clonetypes = df_match_clonetypes.merge(
            df_clonetypes, on=cols, how='left', suffixes=('', '_y'))
        # order and drop duplicated cols
        order = ["clonetype_ID"] + cols + ["barcode_count", "percent"]
        df_match_clonetypes = df_match_clonetypes[order]
        df_match_clonetypes.sort_values(
            ["barcode_count", "clonetype_ID"], ascending=[False, True], inplace=True)
        df_match_clonetypes.to_csv(match_clonetypes_file, sep="\t", index=False)

    df_mergeID = pd.merge(df_cell_confident_count, df_clonetypes, how="left", on=cols)
    df_mergeID.sort_values(["clonetype_ID", "barcode"], inplace=True)
    # output df_cell_confident_count
    df_mergeID.to_csv(cell_confident_count_file, sep="\t", index=False)

    df_mergeID = df_mergeID[["barcode", "clonetype_ID"]]
    df_cell_confident_with_ID = pd.merge(df_cell_confident, df_mergeID, how="left", on="barcode")
    df_cell_confident_with_ID.sort_values(["clonetype_ID", "barcode", "chain"], inplace=True)
    # output df_cell_confident
    df_cell_confident_with_ID.to_csv(cell_confident_file, sep="\t", index=False)

    # summary file
    cell_summary = pd.DataFrame(
        cell_summary_row_list, columns=["item", "count", "total_count"])
    cell_summary["count"] = cell_summary["count"].apply(int)
    cell_summary["percent"] = (
        cell_summary["count"] / cell_summary.total_count.astype("float") * 100)
    cell_summary["percent"] = cell_summary["percent"].apply(lambda x: round(x, 2))
    cell_summary["count"] = cell_summary["count"].apply(format_number)

    def percent_str_func(row):
        need_percent = bool(re.search("Cell with", row["item"], flags=re.IGNORECASE))
        if need_percent:
            return "(" + str(row["percent"]) + "%)"
        else:
            return ""

    cell_summary["percent_str"] = cell_summary.apply(
        lambda row: percent_str_func(row), axis=1)

    # stat file
    def gen_stat(summary, stat_file):
        stat = summary
        stat["new_count"] = stat["count"].astype(str) + stat["percent_str"]
        stat = stat.loc[:, ["item", "new_count"]]
        stat.to_csv(stat_file, sep=":", header=None, index=False)

    cell_stat_file = "{}/stat.txt".format(outdir)
    gen_stat(cell_summary, cell_stat_file)

    name = type + '_count_vdj'
    t = reporter(
        name=name,
        sample=args.sample,
        stat_file=cell_stat_file,
        outdir=outdir + '/..',
        assay=args.assay,
        parameters={"iUMI": iUMI},
    )
    t.get_report()

    # clonetypes table
    def format_table(df_clonetypes, top10_clonetypes_file):
        top10_clonetypes_df = df_clonetypes.head(10)
        top10_clonetypes_df = top10_clonetypes_df.reset_index(drop=True)
        top10_clonetypes_df.index = top10_clonetypes_df.index + 1
        top10_clonetypes_df["percent"] = top10_clonetypes_df["percent"].apply(
            lambda x: str(x) + "%")
        seqs = ["aaSeqCDR3"]
        cols = []
        for chain in chains:
            for seq in seqs:
                cols.append("_".join([seq, chain]))
        top10_cols = ["clonetype_ID"] + cols + ["barcode_count", "percent"]
        top10_clonetypes_df = top10_clonetypes_df[top10_cols]
        top10_clonetypes_df.to_csv(top10_clonetypes_file, sep="\t", index=False)
        table_header = ["Clonetype_ID"] + cols + ["Frequency", "Percent"]
        return table_header

    table_header = format_table(df_clonetypes, top10_clonetypes_file)
    use_top10_clonetypes_file = top10_clonetypes_file
    section_header = 'Top10 clonetypes'
    if match_bool:
        format_table(df_match_clonetypes, match_top10_clonetypes_file)
        use_top10_clonetypes_file = match_top10_clonetypes_file
        section_header = 'Match Top10 clonetypes'

    t = reporter(
        name="clonetypes",
        sample=args.sample,
        table_file=use_top10_clonetypes_file,
        table_header=table_header,
        outdir=outdir + '/..',
        assay=args.assay,
        parameters={'section_header': section_header},
    )
    t.get_report()

    # other_metrics_file
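# A minimal sketch (hypothetical helper, not part of the pipeline) of the "auto"
# cell-calling threshold used in count_vdj above: one tenth of the UMI count of the
# barcode at rank 20 (the 21st barcode) when sorted by total UMI, descending.
def auto_umi_min(umi_counts, rank=20):
    return int(sorted(umi_counts, reverse=True)[rank] / 10)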
def count_smk(args):
    read_file = args.read_file
    match_dir = args.match_dir
    tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
    UMI_min = args.UMI_min
    SNR_min = args.SNR_min
    dim = int(args.dim)
    combine_cluster = args.combine_cluster
    outdir = args.outdir
    sample = args.sample
    assay = args.assay

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # stat rows
    stats = pd.Series()

    # process
    match_barcode, cell_total = read_barcode_file(match_dir)

    UMI_tag_file = f'{outdir}/{sample}_umi_tag.tsv'
    tsne_tag_file = f'{outdir}/{sample}_tsne_tag.tsv'
    cluster_count_file = f'{outdir}/{sample}_cluster_count.tsv'
    cluster_plot = f'{outdir}/{sample}_cluster_plot.pdf'
    if combine_cluster:
        combine_cluster_count_file = f'{outdir}/{sample}_combine_cluster_count.tsv'
        combine_cluster_plot = f'{outdir}/{sample}_combine_cluster_plot.pdf'

    df_read_count = pd.read_csv(read_file, sep="\t", index_col=0)
    mapped_read = df_read_count['read_count'].sum()

    # in cell
    df_read_count_in_cell = df_read_count[df_read_count.index.isin(match_barcode)]
    mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())
    stats = stats.append(pd.Series(
        format_stat(mapped_read_in_cell, mapped_read),
        index=['Mapped Reads in Cells']))

    # UMI matrix: barcode x SMK tag
    df_UMI_in_cell = df_read_count_in_cell.reset_index().groupby(
        ['barcode', 'SMK_barcode_name']).agg({'UMI': 'count'})
    df_UMI_in_cell = df_UMI_in_cell.reset_index()
    df_UMI_in_cell = df_UMI_in_cell.pivot(
        index='barcode', columns='SMK_barcode_name', values='UMI')
    df_cell = pd.DataFrame(index=match_barcode)
    df_UMI_cell = pd.merge(
        df_cell, df_UMI_in_cell, how="left", left_index=True, right_index=True)

    # fillna
    df_UMI_cell.fillna(0, inplace=True)
    df_UMI_cell = df_UMI_cell.astype(int)

    # UMI
    UMIs = df_UMI_cell.apply(sum, axis=1)
    median = round(np.median(UMIs), 2)
    mean = round(np.mean(UMIs), 2)
    stats = stats.append(pd.Series(str(median), index=['Median UMI per Cell']))
    stats = stats.append(pd.Series(str(mean), index=['Mean UMI per Cell']))

    UMI_min = get_UMI_min(df_UMI_cell, UMI_min)
    count_smk.logger.info(f'UMI_min: {UMI_min}')
    SNR_min = get_SNR_min(df_UMI_cell, dim, SNR_min, UMI_min)
    count_smk.logger.info(f'SNR_min: {SNR_min}')
    df_UMI_cell["tag"] = df_UMI_cell.apply(
        tag_type, UMI_min=UMI_min, SNR_min=SNR_min, dim=dim, axis=1)
    df_UMI_cell.to_csv(UMI_tag_file, sep="\t")

    df_tsne = pd.read_csv(tsne_file, sep="\t", index_col=0)
    df_tsne_tag = pd.merge(
        df_tsne, df_UMI_cell, how="left", left_index=True, right_index=True)

    if combine_cluster:
        df_combine_cluster = pd.read_csv(combine_cluster, sep="\t", header=None)
        df_combine_cluster.columns = ["cluster", "combine_cluster"]
        # merge on the cluster column, then restore the barcode index
        # (pandas does not allow `on=` together with `left_index=True`)
        df_tsne_combine_cluster_tag = pd.merge(
            df_tsne_tag, df_combine_cluster, on=["cluster"], how="left"
        ).set_index(df_tsne_tag.index)
        df_tsne_combine_cluster_tag.to_csv(tsne_tag_file, sep="\t")
    else:
        df_tsne_tag.to_csv(tsne_tag_file, sep="\t")

    write_and_plot(
        df=df_tsne_tag,
        column_name="cluster",
        count_file=cluster_count_file,
        plot_file=cluster_plot)
    if combine_cluster:
        write_and_plot(
            df=df_tsne_combine_cluster_tag,
            column_name="combine_cluster",
            count_file=combine_cluster_count_file,
            plot_file=combine_cluster_plot)

    df_tag_count = df_UMI_cell["tag"].value_counts().reset_index()
    df_tag_count.columns = ["item", "count"]
    for index, row in df_tag_count.iterrows():
        stats = stats.append(pd.Series(
            format_stat(row['count'], cell_total),
            index=[row['item'] + ' Cells']))

    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(
        name='count_smk',
        assay=assay,
        sample=sample,
        stat_file=stat_file,
        outdir=outdir + '/..')
    t.get_report()