def __init__(self, args, display_title=None):
    """Load matched barcodes and target genes, then register summary metrics.

    Args:
        args: Parsed arguments; ``match_dir``, ``panel`` and ``gene_list``
            are read here.
        display_title: Forwarded unchanged to ``Step.__init__``.

    Exits:
        Calls ``sys.exit`` with an explanatory message when no target gene
        could be obtained from ``--panel`` or ``--gene_list``.
    """
    Step.__init__(self, args, display_title=display_title)

    # set
    self.match_barcode_list, self.n_cell = utils.read_barcode_file(args.match_dir)
    # Set copy of the barcode list (built once here).
    self.match_barcode = set(self.match_barcode_list)

    if args.panel:
        self.gene_list = utils.get_gene_region_from_bed(args.panel)[0]
        self.n_gene = len(self.gene_list)
    elif args.gene_list:
        self.gene_list, self.n_gene = utils.read_one_col(args.gene_list)
    else:
        # Neither option was supplied: do not hand an empty path to the
        # file reader, so that the explicit message below is what the
        # user actually sees.
        self.gene_list, self.n_gene = [], 0

    if not self.gene_list:
        sys.exit("You must provide either --panel or --gene_list!")

    self.count_dict = utils.genDict(dim=3, valType=int)

    self.add_metric(
        name="Number of Target Genes",
        value=self.n_gene,
    )
    self.add_metric(
        name="Number of Cells",
        value=self.n_cell,
    )

    # out file
    # NOTE(review): self.out_prefix is not assigned in this method;
    # presumably provided by Step.__init__ -- confirm.
    self.out_bam_file = f'{self.out_prefix}_filtered.bam'
    self.out_bam_file_sorted = f'{self.out_prefix}_filtered_sorted.bam'
def __init__(self, args, display_title=None):
    """Prepare chain columns, the optional matched-barcode set and output paths.

    Args:
        args: Parsed arguments; ``type``, ``match_dir``, ``matrix_dir`` and
            ``iUMI`` are read here.
        display_title: Forwarded unchanged to ``Step.__init__``.
    """
    Step.__init__(self, args, display_title=display_title)

    def _is_given(value):
        # An option counts as supplied when it is truthy and is not the
        # literal text 'None' (ignoring surrounding whitespace).
        return value and value.strip() != 'None'

    # set
    self.chains = CHAINS[args.type]
    # One column name per (chain, sequence header) pair, chain-major order.
    self.cols = [
        "_".join([header, chain])
        for chain in self.chains
        for header in SEQUENCES_HEADER
    ]

    # match_dir takes precedence over matrix_dir as the barcode source.
    matched = False
    if _is_given(args.match_dir):
        barcodes, _match_cell_number = utils.read_barcode_file(
            args.match_dir)
        matched = True
    elif _is_given(args.matrix_dir):
        barcodes = utils.get_barcodes_from_matrix_dir(
            args.matrix_dir)
        matched = True

    self.match_bool = matched
    if matched:
        # match_cell_barcodes is only defined when a barcode source exists.
        self.match_cell_barcodes = set(barcodes)

    # out files
    self.cell_confident_file = f"{self.out_prefix}_cell_confident.tsv"
    self.cell_confident_count_file = f"{self.out_prefix}_cell_confident_count.tsv"
    self.clonetypes_file = f"{self.out_prefix}_clonetypes.tsv"
    self.match_clonetypes_file = f"{self.out_prefix}_match_clonetypes.tsv"

    # add args data
    self.add_data(iUMI=args.iUMI)
def __init__(self, args, display_title=None):
    """Read barcode/linker fasta files and validate the fastq pattern.

    Args:
        args: Parsed arguments; ``fq``, ``fq_pattern``, ``linker_fasta`` and
            ``barcode_fasta`` are read here.
        display_title: Forwarded unchanged to ``Step.__init__``.

    Raises:
        Exception: If the barcode length from the fasta differs from the
            length of the first "C" segment of the parsed pattern.
    """
    Step.__init__(self, args, display_title=display_title)

    # read args
    self.fq = args.fq
    self.fq_pattern = args.fq_pattern
    self.linker_fasta = args.linker_fasta
    self.barcode_fasta = args.barcode_fasta

    # process
    self.barcode_dict, self.barcode_length = utils.read_fasta(self.barcode_fasta, equal=True)

    # The linker is optional: a falsy value or the literal 'None' disables it.
    linker_disabled = not self.linker_fasta or self.linker_fasta == 'None'
    if linker_disabled:
        self.linker_dict, self.linker_length = {}, 0
    else:
        self.linker_dict, self.linker_length = utils.read_fasta(self.linker_fasta, equal=True)

    self.pattern_dict = parse_pattern(self.fq_pattern)

    # check barcode length
    first_c_segment = self.pattern_dict["C"][0]
    # end - start
    pattern_barcode_length = first_c_segment[1] - first_c_segment[0]
    if pattern_barcode_length != self.barcode_length:
        raise Exception(
            f'''barcode fasta length {self.barcode_length} != pattern barcode length {pattern_barcode_length}'''
        )

    self.res_dic = utils.genDict()
    self.res_sum_dic = utils.genDict(dim=2)
    self.match_barcode = []

    # out files
    self.read_count_file = f'{self.outdir}/{self.sample}_read_count.tsv'
    self.UMI_count_file = f'{self.outdir}/{self.sample}_UMI_count.tsv'
    self.stat_file = f'{self.outdir}/stat.txt'
def __init__(self, args):
    """Record the inputs and derive the substitution-statistics output path.

    Args:
        args: Parsed arguments; ``sample``, ``bam`` and ``outdir`` are read.
    """
    Step.__init__(self, args)

    # input files
    self.sample = args.sample
    self.bam_file = args.bam
    self.outdir = args.outdir

    # output files
    stat_name = self.sample+'.substitution.txt'
    self.outstat = os.path.join(self.outdir, stat_name)
def __init__(self, args, display_title=None):
    """Resolve the GTF annotation and name the featureCounts output files.

    Args:
        args: Parsed arguments; ``featureCounts_param`` is read directly,
            ``genomeDir`` and ``input`` are read through ``self.args``.
        display_title: Forwarded unchanged to ``Step.__init__``.
    """
    Step.__init__(self, args, display_title=display_title)

    # set
    # NOTE(review): self.args is not assigned here; presumably stored by
    # Step.__init__ -- confirm.
    genome_info = Mkref_rna.parse_genomeDir(self.args.genomeDir)
    self.gtf = genome_info['gtf']
    self.featureCounts_param = args.featureCounts_param

    # out files
    # The featureCounts BAM is named after the input file's basename.
    input_basename = os.path.basename(self.args.input)
    self.featureCounts_bam = f'{self.outdir}/{input_basename}.featureCounts.bam'
    self.name_sorted_bam = f'{self.out_prefix}_name_sorted.bam'
    self.featureCount_log_file = f'{self.out_prefix}.summary'
def __init__(self, args, display_title=None):
    """Prepare per-tag barcode sets and the resources for each split mode.

    Nothing beyond ``Step.__init__`` is done unless at least one of
    ``split_matrix``, ``split_fastq`` or ``split_vdj`` is requested.

    Args:
        args: Parsed arguments; reads ``split_matrix``, ``split_fastq``,
            ``split_vdj``, ``umi_tag_file``, ``outdir``, ``match_dir``,
            ``matrix_dir`` and ``vdj_dir``.
        display_title: Forwarded unchanged to ``Step.__init__``.

    Raises:
        ValueError: If ``split_matrix`` is set but neither ``match_dir``
            nor ``matrix_dir`` is given.
        IndexError: If one of the glob patterns matches no file.
        OSError: If an output directory cannot be created.
    """
    Step.__init__(self, args, display_title=display_title)

    if not (args.split_matrix or args.split_fastq or args.split_vdj):
        return

    # set
    df_umi_tag = pd.read_csv(args.umi_tag_file, sep='\t', index_col=0)
    # Turn the index into a regular 'barcode' column so it can be collected per tag.
    df_umi_tag = df_umi_tag.rename_axis('barcode').reset_index()
    self.tag_barcode_dict = {
        tag: set(row["barcode"].tolist())
        for tag, row in df_umi_tag.groupby("tag")
    }

    if args.split_matrix:
        self.matrix_outdir = f'{args.outdir}/matrix/'
        if args.match_dir:
            matrix_10X_dir = glob.glob(
                f'{args.match_dir}/05.count/*_matrix_10X*')[0]
        elif args.matrix_dir:
            matrix_10X_dir = args.matrix_dir
        else:
            raise ValueError("--match_dir or --matrix_dir is required.")
        self.raw_mat, self.raw_features_path, self.raw_barcodes = read_raw_matrix(
            matrix_10X_dir)

    if args.split_fastq:
        self.rna_fq_file = glob.glob(
            f'{args.match_dir}/*barcode/*_2.fq*')[0]

        fastq_outdir = f'{args.outdir}/fastqs/'
        # Create the directory without going through a shell: safe for paths
        # containing spaces/metacharacters, and failures raise instead of
        # being silently ignored.
        os.makedirs(fastq_outdir, exist_ok=True)

        # One R1 and one R2 writer per tag. The handles are kept open on the
        # instance; they are not closed in this method.
        self.r2_fastq_files_handle = {}
        self.r1_fastq_files_handle = {}
        for tag in self.tag_barcode_dict:
            r2_fastq_file_name = f'{fastq_outdir}/{tag}_2.fq'
            self.r2_fastq_files_handle[tag] = open(r2_fastq_file_name, 'w')
            r1_fastq_file_name = f'{fastq_outdir}/{tag}_1.fq'
            self.r1_fastq_files_handle[tag] = open(r1_fastq_file_name, 'w')

        self.tag_read_index_dict = defaultdict(set)

    if args.split_vdj:
        self.cell_confident_vdj = glob.glob(
            f'{args.vdj_dir}/*count_vdj/*cell_confident.tsv*')[0]
        self.vdj_outdir = f'{args.outdir}/vdj/'
        # exist_ok=True makes a separate existence check unnecessary.
        os.makedirs(self.vdj_outdir, exist_ok=True)
def __init__(self, args):
    """Record the inputs and derive the output paths from outdir and sample.

    Args:
        args: Parsed arguments; ``outdir``, ``sample``, ``bam``, ``bg``,
            ``bg_cov`` and ``cell_keep`` are read.
    """
    Step.__init__(self, args)

    # input files
    self.outdir = args.outdir
    self.sample = args.sample
    self.bam_file = args.bam
    self.snp_file = args.bg
    self.bg_cov = args.bg_cov
    self.cell_keep = args.cell_keep

    # output files
    read_table_name = self.sample + '.corrected_gene_cell_UMI_read.txt'
    rds_name = self.sample + '.TC_matrix.rds'
    self.outread = os.path.join(self.outdir, read_table_name)
    self.outrds = os.path.join(self.outdir, rds_name)
    # Bare "<outdir>/<sample>" prefix.
    self.outpre = os.path.join(self.outdir, self.sample)
def __init__(self, args):
    """Record the inputs and thresholds and derive the two output paths.

    Args:
        args: Parsed arguments; ``sample``, ``tsne``, ``mat``, ``rep``,
            ``mincell`` and ``topgene`` are read.
    """
    Step.__init__(self, args)

    # input files
    self.sample = args.sample
    self.tsnefile = args.tsne
    self.matfile = args.mat
    self.repfile = args.rep
    self.mincell = args.mincell
    self.topgene = args.topgene

    # output files
    # NOTE(review): self.outdir is not assigned in this method; presumably
    # provided by Step.__init__ -- confirm.
    dot_name = self.sample + '.rep_in_tsne.txt'
    table_name = self.sample + '.rep_in_tsne_top10.txt'
    self.outdot = os.path.join(self.outdir, dot_name)
    self.outtbl = os.path.join(self.outdir, table_name)
def test_stat_to_metric(self):
    """Run ``Step.stat_to_metric`` for a fixed sample and print its metrics.

    The working directory is changed to a hard-coded path, so this only
    runs where that directory exists.
    """
    os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/multi_tests/rna')

    args_dict = {
        'sample': 'test1',
        'assay': 'rna',
        'thread': 1,
        'outdir': 'test1/06.analysis',
        'debug': True,
    }
    # Lightweight stand-in for parsed arguments: one field per dict key.
    Args = namedtuple('Args', list(args_dict))
    step_args = Args(**args_dict)

    step = Step(step_args, 'analysis')
    step.stat_to_metric()
    # NOTE(review): inside a class body Python name-mangles `__content_dict`
    # with the *enclosing* class name -- confirm this attribute resolves.
    print(step.__content_dict['metric'])
def __init__(self, args):
    """Record the inputs and derive the PosTag output paths.

    Args:
        args: Parsed arguments; ``outdir``, ``sample``, ``strand``, ``bam``,
            ``cell`` and ``thread`` are read.
    """
    Step.__init__(self, args)

    # input files
    bam_name = args.sample + '.bam'
    self.ifile = os.path.join(args.outdir, bam_name)
    self.sample = args.sample
    self.strandednessfile = args.strand
    self.inbam = args.bam
    self.bcfile = args.cell
    self.outdir = args.outdir
    self.thread = args.thread

    # output files
    postag_bam_name = args.sample + '.PosTag.bam'
    postag_csv_name = args.sample + '.PosTag.csv'
    self.outfile_bam = os.path.join(args.outdir, postag_bam_name)
    self.outfile_csv = os.path.join(args.outdir, postag_csv_name)
def __init__(self, args, display_title=None):
    """Choose the read-type label and chains, then name the output files.

    Args:
        args: Parsed arguments; ``not_consensus`` and ``type`` are read.
        display_title: Forwarded unchanged to ``Step.__init__``.
    """
    Step.__init__(self, args, display_title=display_title)

    # set
    # The label is "UMIs" unless not_consensus is set, in which case 'Reads'.
    self.read_type = 'Reads' if args.not_consensus else "UMIs"
    self.chains = CHAINS[args.type]

    # out files
    self.UMI_count_unfiltered_file = f'{self.out_prefix}_UMI_count_unfiltered.tsv'
    self.UMI_count_filtered_file = f'{self.out_prefix}_UMI_count_filtered.tsv'
    self.mixcr_report = f"{self.out_prefix}_align.txt"
    self.not_align_fq = f"{self.out_prefix}_not_align.fq"
    self.read2_vdjca = f"{self.out_prefix}_read2.vdjca"
    self.alignments = f"{self.out_prefix}_alignments.txt"
def __init__(self, args, display_title=None):
    """Parse the fastq lists, resolve chemistry and initialise counters/outputs.

    Args:
        args: Parsed arguments; reads ``fq1``, ``fq2``, ``chemistry``,
            ``pattern``, ``linker``, ``whitelist``, ``lowNum``, ``lowQual``,
            ``allowNoPolyT``, ``allowNoLinker``, ``nopolyT``, ``noLinker``,
            ``output_R1`` and ``gzip``.
        display_title: Forwarded unchanged to ``Step.__init__``.

    Raises:
        Exception: If ``fq1`` and ``fq2`` list a different number of files.
    """
    Step.__init__(self, args, display_title=display_title)

    # fq1 / fq2 are comma-separated lists that must pair up one-to-one.
    self.fq1_list = args.fq1.split(",")
    self.fq2_list = args.fq2.split(",")
    self.fq_number = len(self.fq1_list)
    if len(self.fq2_list) != self.fq_number:
        raise Exception('fastq1 and fastq2 do not have same file number!')

    if args.chemistry != 'auto':
        # An explicit chemistry is applied to every fastq pair.
        self.chemistry_list = [args.chemistry] * self.fq_number
    else:
        detector = Chemistry(args.fq1)
        self.chemistry_list = detector.check_chemistry()

    # counters
    self.barcode_corrected_num = 0
    self.linker_corrected_num = 0
    self.total_num = 0
    self.clean_num = 0
    self.no_polyT_num = 0
    self.lowQual_num = 0
    self.no_linker_num = 0
    self.no_barcode_num = 0
    self.barcode_qual_Counter = Counter()
    self.umi_qual_Counter = Counter()

    # options copied from args
    self.pattern = args.pattern
    self.linker = args.linker
    self.whitelist = args.whitelist
    self.lowNum = args.lowNum
    self.lowQual = args.lowQual
    self.allowNoPolyT = args.allowNoPolyT
    self.allowNoLinker = args.allowNoLinker
    self.nopolyT = args.nopolyT  # true == output nopolyT reads
    self.noLinker = args.noLinker
    self.output_R1 = args.output_R1

    # out file
    # Only the two main outputs receive the optional ".gz" suffix.
    suffix = ".gz" if args.gzip else ""
    self.out_fq2 = f'{self.out_prefix}_2.fq{suffix}'
    self.out_fq1 = f'{self.out_prefix}_1.fq{suffix}'
    if self.nopolyT:
        self.nopolyT_1 = f'{self.out_prefix}_noPolyT_1.fq'
        self.nopolyT_2 = f'{self.out_prefix}_noPolyT_2.fq'
    if self.noLinker:
        self.noLinker_1 = f'{self.out_prefix}_noLinker_1.fq'
        self.noLinker_2 = f'{self.out_prefix}_noLinker_2.fq'
def __init__(self, args, step_name):
    """Record the inputs and select chain settings for the sequencing type.

    Args:
        args: Parsed arguments; ``outdir``, ``sample``, ``Seqtype``,
            ``all_rep`` and ``fa`` are read.
        step_name: Forwarded unchanged to ``Step.__init__``.
    """
    Step.__init__(self, args, step_name)

    self.outdir = args.outdir
    self.sample = args.sample
    self.Seqtype = args.Seqtype
    self.all_rep = args.all_rep
    self.fa = args.fa

    # Any Seqtype other than 'TCR' or 'BCR' leaves string, chain and
    # paired_groups unset.
    if self.Seqtype == 'TCR':
        self.string, self.chain, self.paired_groups = (
            't',
            ['TRA', 'TRB'],
            ['TRA_TRB'],
        )
    elif self.Seqtype == 'BCR':
        self.string, self.chain, self.paired_groups = (
            'b',
            ['IGH', 'IGL', 'IGK'],
            ['IGH_IGL', 'IGH_IGK'],
        )
def __init__(self, args):
    """Load barcodes and reference paths, then name the intermediate files.

    Args:
        args: Parsed arguments; ``match_dir``, ``genomeDir`` and ``panel``
            are read.
    """
    Step.__init__(self, args)

    # set
    self.barcodes, _num = utils.read_barcode_file(args.match_dir)
    genome_info = Mkref_rna.parse_genomeDir(args.genomeDir)
    self.fasta = genome_info['fasta']
    # Starts as None; not assigned anywhere else in this method.
    self.df_vcf = None
    self.panel = args.panel
    self.bed = utils.get_bed_file_path(self.panel)

    # out
    self.splitN_bam = f'{self.out_prefix}_splitN.bam'
    self.splitN_bam_name_sorted = f'{self.out_prefix}_splitN_name_sorted.bam'
    self.raw_bcf_file = f'{self.out_prefix}_raw.bcf'
    self.raw_vcf_file = f'{self.out_prefix}_raw.vcf'
    self.fixed_header_vcf = f'{self.out_prefix}_fixed.vcf'
    self.norm_vcf_file = f'{self.out_prefix}_norm.vcf'
def __init__(self, args, display_title=None):
    """Store cell-calling options, load the GTF lookup and name the outputs.

    Args:
        args: Parsed arguments; ``force_cell_num``, ``cell_calling_method``,
            ``expected_cell_num``, ``bam`` and ``genomeDir`` are read.
        display_title: Forwarded unchanged to ``Step.__init__``.
    """
    Step.__init__(self, args, display_title=display_title)

    self.force_cell_num = args.force_cell_num
    self.cell_calling_method = args.cell_calling_method
    # Coerced to int at this point.
    self.expected_cell_num = int(args.expected_cell_num)
    self.bam = args.bam

    # set
    genome_info = Mkref_rna.parse_genomeDir(args.genomeDir)
    self.gtf_file = genome_info['gtf']
    self.gtf_dict = utils.Gtf_dict(self.gtf_file)
    self.downsample_dict = {}

    # output files
    self.count_detail_file = f'{self.outdir}/{self.sample}_count_detail.txt'
    self.marked_count_file = f'{self.outdir}/{self.sample}_counts.txt'
    self.raw_matrix_10X_dir = f'{self.outdir}/{self.sample}_all_matrix'
    self.cell_matrix_10X_dir = f'{self.outdir}/{self.sample}_matrix_10X'
    self.downsample_file = f'{self.outdir}/{self.sample}_downsample.txt'
def __init__(self, args, display_title=None):
    """Load read counts and matched barcodes, then name the output files.

    Args:
        args: Parsed arguments; reads ``read_count_file``, ``UMI_min``,
            ``SNR_min``, ``combine_cluster``, ``dim``, ``coefficient``,
            ``match_dir``, ``matrix_dir`` and ``tsne_file``.
        display_title: Forwarded unchanged to ``Step.__init__``.

    Raises:
        ValueError: If neither ``match_dir`` nor ``matrix_dir`` is given.
    """
    Step.__init__(self, args, display_title=display_title)

    self.read_count_file = args.read_count_file
    self.UMI_min = args.UMI_min
    self.SNR_min = args.SNR_min
    self.combine_cluster = args.combine_cluster
    # dim and coefficient are coerced to numeric types here.
    self.dim = int(args.dim)
    self.coefficient = float(args.coefficient)

    # read
    self.df_read_count = pd.read_csv(self.read_count_file, sep="\t", index_col=0)

    # match_dir takes precedence over matrix_dir when both are given.
    if args.match_dir:
        match_info = utils.parse_match_dir(args.match_dir)
        self.match_barcode = match_info['match_barcode']
        self.n_match_barcode = match_info['n_match_barcode']
        self.tsne_file = match_info['tsne_coord']
        self.matrix_dir = match_info['matrix_dir']
    elif args.matrix_dir:
        # Barcodes are taken from the first column of a header-less file.
        barcode_table = pd.read_csv(f'{args.matrix_dir}/barcodes.tsv', header=None)
        self.match_barcode = barcode_table[0].tolist()
        self.n_match_barcode = len(self.match_barcode)
        self.tsne_file = args.tsne_file
        self.matrix_dir = args.matrix_dir
    else:
        raise ValueError("--match_dir or --matrix_dir is required.")

    # init
    self.no_noise = False

    # out files
    self.UMI_tag_file = f'{self.outdir}/{self.sample}_umi_tag.tsv'
    self.tsne_tag_file = f'{self.outdir}/{self.sample}_tsne_tag.tsv'
    self.cluster_count_file = f'{self.outdir}/{self.sample}_cluster_count.tsv'
    self.cluster_plot = f'{self.outdir}/{self.sample}_cluster_plot.pdf'
    if self.combine_cluster:
        # The combined-cluster outputs only exist when combine_cluster is set.
        self.combine_cluster_count_file = f'{self.outdir}/{self.sample}_combine_cluster_count.tsv'
        self.combine_cluster_plot = f'{self.outdir}/{self.sample}_combine_cluster_plot.pdf'
def __init__(self, args):
    """Store the assay description, software version and chemistry.

    Args:
        args: Parsed arguments; ``chemistry`` is read.
    """
    Step.__init__(self, args)

    # NOTE(review): self.assay is not assigned in this method; presumably
    # provided by Step.__init__ -- confirm.
    assay_text = utils.get_assay_text(self.assay)
    self.assay_description = assay_text
    self.version = __VERSION__
    self.chemistry = args.chemistry