def __init__(
    self,
    sample,
    outdir,
    assay,
    match_dir,
    step,
):
    self.sample = sample
    self.outdir = outdir
    self.assay = assay
    self.match_dir = match_dir
    self.step = step

    if self.match_dir:
        match_dict = parse_match_dir(match_dir)
        tsne_df_file = match_dict['tsne_coord']
        marker_df_file = match_dict['markers']
        self.tsne_df = pd.read_csv(tsne_df_file, sep="\t")
        self.marker_df = pd.read_csv(marker_df_file, sep="\t")
        self.tsne_df.rename(columns={"Unnamed: 0": "barcode"}, inplace=True)
        self.cluster_tsne = cluster_tsne_list(self.tsne_df)

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)
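# Every snippet in this section relies on a parse_match_dir() helper that maps a
# match_dir to a dict; the keys used here are 'tsne_coord', 'markers',
# 'match_barcode', 'n_match_barcode', 'matrix_dir', 'rds' and 'df_tsne'.
# A minimal sketch of such a helper (hypothetical name and glob patterns;
# the real match_dir layout is an assumption, not taken from this code):
import glob

def parse_match_dir_sketch(match_dir):
    match_dict = {}
    tsne_files = glob.glob(f'{match_dir}/*analysis*/*tsne_coord.tsv')
    marker_files = glob.glob(f'{match_dir}/*analysis*/*markers.tsv')
    if tsne_files:
        match_dict['tsne_coord'] = tsne_files[0]
    if marker_files:
        match_dict['markers'] = marker_files[0]
    return match_dict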
def __init__(self, args, display_title='Filtering'):
    super().__init__(args, display_title)

    # data
    with open(args.raw_read_count_file) as f:
        self.count_dict = json.load(f)

    self.raw_umi = 0
    self.total_corrected_umi = 0
    self.del_umi = 0
    self.read_threshold_dict = {}
    self.umi_threshold_dict = {}  # if not set explicitly, use 1 as default

    self.barcode_ref_umi_dict = utils.genDict(dim=2)
    self.ref_barcode_umi_dict = utils.genDict(dim=2)

    match_dir_dict = utils.parse_match_dir(args.match_dir)
    self.df_tsne = match_dir_dict['df_tsne']
    self.df_filter_tsne = self.df_tsne.copy()

    # out
    self.corrected_read_count_file = f'{self.out_prefix}_corrected_read_count.json'
    self.filter_read_count_file = f'{self.out_prefix}_filtered_read_count.json'
    self.filter_tsne_file = f'{self.out_prefix}_filtered_UMI_tsne.csv'
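# barcode_ref_umi_dict and ref_barcode_umi_dict above are two-level nested
# counters. A minimal sketch of a genDict-style helper (a hypothetical
# re-implementation, not necessarily the utils.genDict used here): a recursive
# defaultdict that bottoms out in int, so leaf counts start at 0 and missing
# keys are created on first access.
from collections import defaultdict

def gen_dict_sketch(dim=3):
    if dim == 1:
        return defaultdict(int)
    return defaultdict(lambda: gen_dict_sketch(dim - 1))

# usage: counts = gen_dict_sketch(dim=2); counts[barcode][ref] += 1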
def __init__(self, args, display_title):
    super().__init__(args, display_title)

    self.df_read_count = pd.read_csv(args.read_count_file, sep='\t', index_col=0)

    self.tsne_dict = utils.parse_match_dir(args.match_dir)
    self.match_barcode = self.tsne_dict['match_barcode']

    # out
    self.mtx = f'{self.out_prefix}_citeseq.mtx.gz'
def read_match_dir(self):
    """
    If match_dir is not self, read match_dir at init.
    If it is self, read at run_analysis (Seurat needs to run first).
    """
    if self.match_dir:
        match_dict = utils.parse_match_dir(self.match_dir)
        tsne_df_file = match_dict['tsne_coord']
        self.df_tsne = pd.read_csv(tsne_df_file, sep="\t")
        self.df_tsne.rename(columns={"Unnamed: 0": "barcode"}, inplace=True)
        self.df_marker_file = match_dict['markers']
        self.read_format_df_marker()
def analysis_cite(args):
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % args.outdir)

    rds = parse_match_dir(args.match_dir)['rds']
    app = CITESEQ_DIR + "/analysis_cite.R"
    cmd = (
        f'Rscript {app} '
        f'--rds {rds} '
        f'--citeseq_mtx {args.citeseq_mtx} '
        f'--outdir {args.outdir} '
        f'--sample {args.sample} '
    )
    os.system(cmd)
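# analysis_cite() above shells out via os.system() with a single command string.
# An equivalent sketch using subprocess.run with an argument list (assumes the
# same analysis_cite.R interface and the CITESEQ_DIR constant shown above);
# check=True raises if Rscript exits non-zero.
import subprocess

def run_analysis_cite(rds, citeseq_mtx, outdir, sample):
    app = CITESEQ_DIR + "/analysis_cite.R"
    cmd = [
        'Rscript', app,
        '--rds', rds,
        '--citeseq_mtx', citeseq_mtx,
        '--outdir', outdir,
        '--sample', sample,
    ]
    subprocess.run(cmd, check=True)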
def __init__(self, args, display_title=None):
    Step.__init__(self, args, display_title=display_title)

    self.read_count_file = args.read_count_file
    self.UMI_min = args.UMI_min
    self.SNR_min = args.SNR_min
    self.combine_cluster = args.combine_cluster
    self.dim = int(args.dim)
    self.coefficient = float(args.coefficient)

    # read
    self.df_read_count = pd.read_csv(self.read_count_file, sep="\t", index_col=0)

    if args.match_dir:
        match_dict = utils.parse_match_dir(args.match_dir)
        self.match_barcode = match_dict['match_barcode']
        self.n_match_barcode = match_dict['n_match_barcode']
        self.tsne_file = match_dict['tsne_coord']
        self.matrix_dir = match_dict['matrix_dir']
    elif args.matrix_dir:
        df_barcode = pd.read_csv(f'{args.matrix_dir}/barcodes.tsv', header=None)
        self.match_barcode = df_barcode[0].tolist()
        self.n_match_barcode = len(self.match_barcode)
        self.tsne_file = args.tsne_file
        self.matrix_dir = args.matrix_dir
    else:
        raise ValueError("--match_dir or --matrix_dir is required.")

    # init
    self.no_noise = False

    # out files
    self.UMI_tag_file = f'{self.outdir}/{self.sample}_umi_tag.tsv'
    self.tsne_tag_file = f'{self.outdir}/{self.sample}_tsne_tag.tsv'
    self.cluster_count_file = f'{self.outdir}/{self.sample}_cluster_count.tsv'
    self.cluster_plot = f'{self.outdir}/{self.sample}_cluster_plot.pdf'
    if self.combine_cluster:
        self.combine_cluster_count_file = f'{self.outdir}/{self.sample}_combine_cluster_count.tsv'
        self.combine_cluster_plot = f'{self.outdir}/{self.sample}_combine_cluster_plot.pdf'
def __init__(self, args, display_title='Count'):
    super().__init__(args, display_title)

    # set
    self.min_query_length = int(args.min_query_length)
    self.capture_bam = args.capture_bam

    # read barcodes
    match_dir_dict = utils.parse_match_dir(args.match_dir)
    self.match_barcode = match_dir_dict['match_barcode']
    self.n_match_barcode = match_dir_dict['n_match_barcode']
    self.add_metric(
        name=HELP_INFO_DICT['matched_barcode_number']['display'],
        value=self.n_match_barcode,
        help_info=HELP_INFO_DICT['matched_barcode_number']['info'],
    )

    # data
    self.total_corrected_umi = 0
    self.count_dict = utils.genDict(dim=3)

    # out
    self.raw_read_count_file = f'{self.out_prefix}_raw_read_count.json'
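# count_dict above is a three-level counter (barcode -> reference -> UMI is an
# assumed nesting) that ends up in raw_read_count_file and is re-loaded with
# json.load() in the filtering step earlier in this section. A sketch of the
# accumulate-and-dump pattern, with a hypothetical helper name:
import json

def dump_raw_read_count(count_dict, raw_read_count_file):
    # nested defaultdicts are dict subclasses, so they serialize like plain dicts
    with open(raw_read_count_file, 'w') as f:
        json.dump(count_dict, f, indent=4)

# accumulation inside the counting loop: count_dict[barcode][ref][umi] += 1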
def count_fusion(args):
    outdir = args.outdir
    sample = args.sample
    bam = args.bam
    flanking_base = int(args.flanking_base)
    fusion_pos_file = args.fusion_pos
    match_dir = args.match_dir
    UMI_min = int(args.UMI_min)

    # check dir
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    fusion_pos = read_pos(fusion_pos_file)
    out_prefix = outdir + "/" + sample

    # barcode
    match_barcode, _n_barcode = read_barcode_file(match_dir)

    # tsne
    match_tsne_file = parse_match_dir(match_dir)['tsne_coord']
    df_tsne = pd.read_csv(match_tsne_file, sep="\t", index_col=0)

    # out
    out_read_count_file = out_prefix + "_fusion_read_count.tsv"
    out_umi_count_file = out_prefix + "_fusion_UMI_count.tsv"
    out_barcode_count_file = out_prefix + "_fusion_barcode_count.tsv"
    out_tsne_file = out_prefix + "_fusion_tsne.tsv"

    # process bam
    samfile = pysam.AlignmentFile(bam, "rb")
    header = samfile.header
    new_bam = pysam.AlignmentFile(out_prefix + "_fusion.bam", "wb", header=header)
    count_dic = genDict(dim=3)
    for read in samfile:
        tag = read.reference_name
        read_start = int(read.reference_start)
        read_length = len(read.query_sequence)
        attr = read.query_name.split('_')
        barcode = attr[0]
        umi = attr[1]
        if tag in fusion_pos.keys():
            if barcode in match_barcode:
                if is_fusion(pos=fusion_pos[tag], read_start=read_start,
                             read_length=read_length, flanking_base=flanking_base):
                    new_bam.write(read)
                    count_dic[barcode][tag][umi] += 1
    new_bam.close()

    # write dic to pandas df
    rows = []
    for barcode in count_dic:
        for tag in count_dic[barcode]:
            for umi in count_dic[barcode][tag]:
                rows.append([barcode, tag, umi, count_dic[barcode][tag][umi]])
    df_read = pd.DataFrame(rows)
    df_read.rename(
        columns={0: "barcode", 1: "tag", 2: "UMI", 3: "read_count"},
        inplace=True)
    df_read.to_csv(out_read_count_file, sep="\t", index=False)

    if not rows:
        count_fusion.logger.error('***** NO FUSION FOUND! *****')
    else:
        df_umi = df_read.groupby(["barcode", "tag"]).agg({"UMI": "count"})
        df_umi = df_umi[df_umi["UMI"] >= UMI_min]
        df_umi.to_csv(out_umi_count_file, sep="\t")
        df_umi.reset_index(inplace=True)

        df_barcode = df_umi.groupby(["tag"]).agg({"barcode": "count"})
        n_match_barcode = len(match_barcode)
        # add zero count tag
        for tag in fusion_pos.keys():
            if tag not in df_barcode.barcode:
                new_row = pd.Series(data={'barcode': 0}, name=tag)
                df_barcode = df_barcode.append(new_row, ignore_index=False)
        df_barcode["percent"] = df_barcode["barcode"] / n_match_barcode
        df_barcode.to_csv(out_barcode_count_file, sep="\t")

        df_pivot = df_umi.pivot(index="barcode", columns="tag", values="UMI")
        df_pivot.fillna(0, inplace=True)
        df_tsne_fusion = pd.merge(
            df_tsne, df_pivot, right_index=True, left_index=True, how="left")
        df_tsne_fusion.fillna(0, inplace=True)
        df_tsne_fusion.to_csv(out_tsne_file, sep="\t")

        # plot
        count_fusion.logger.info("plot fusion...!")
        app = fusionDir + "/plot_fusion.R"
        cmd = f"Rscript {app} --tsne_fusion {out_tsne_file} --outdir {outdir}"
        os.system(cmd)
        count_fusion.logger.info("plot done.")
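# is_fusion() above is called with the fusion breakpoint position, the read
# start, the read length and a flanking_base margin. A plausible sketch of such
# a check (an assumption; the real implementation may differ): keep reads whose
# alignment covers the breakpoint with at least flanking_base bases on both sides.
def is_fusion_sketch(pos, read_start, read_length, flanking_base):
    left_covered = read_start <= pos - flanking_base
    right_covered = read_start + read_length >= pos + flanking_base
    return left_covered and right_covered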
def __init__(self, args, display_title):
    super().__init__(args, display_title)

    # data
    self.tsne_dict = utils.parse_match_dir(args.match_dir)
def count_mut(args):
    outdir = args.outdir
    sample = args.sample
    bam = args.bam
    shift_base = int(args.shift_base)
    mut_file = args.mut_file
    match_dir = args.match_dir

    # check dir
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    mut_dic = read_mut(mut_file)
    out_prefix = outdir + "/" + sample

    # tsne
    match_dict = parse_match_dir(match_dir)
    df_tsne = pd.read_csv(match_dict['tsne_coord'], sep="\t", index_col=0)

    # out
    out_read_file = out_prefix + "_mut_read.tsv"
    out_read_count_file = out_prefix + "_mut_read_count.tsv"
    out_umi_count_file = out_prefix + "_mut_UMI_count.tsv"
    out_insertion_barcode_count_file = out_prefix + "_mut_insertion_barcode_count.tsv"
    out_tsne_file = out_prefix + "_mut_tsne.tsv"

    # process bam
    samfile = pysam.AlignmentFile(bam, "rb")
    header = samfile.header
    new_bam = pysam.AlignmentFile(out_prefix + "_mut.bam", "wb", header=header)
    rows = []
    for read in samfile:
        tag = read.reference_name
        attr = read.query_name.split('_')
        barcode = attr[0]
        umi = attr[1]
        seq = read.query_sequence
        cigar = read.cigar
        ref_pos = read.reference_start
        read_pos = read.query_alignment_start
        for (cigarType, cigarLength) in cigar:
            if cigarType == 0:  # match
                ref_pos += cigarLength
                read_pos += cigarLength
            elif cigarType == 1:  # insertion
                insert_seq = seq[read_pos:read_pos + cigarLength]
                insert_seq_length = len(insert_seq)
                rows.append({
                    "barcode": barcode,
                    "UMI": umi,
                    "gene": tag,
                    "type": "insertion",
                    "seq": insert_seq,
                    "ref_pos": ref_pos,
                    "read_pos": read_pos,
                    "seq_length": insert_seq_length,
                })
                read_pos += cigarLength
            elif cigarType == 2:  # deletion
                ref_pos += cigarLength

    df = pd.DataFrame(rows)
    df = df[["barcode", "UMI", "gene", "type", "ref_pos", "read_pos", "seq", "seq_length"]]

    def is_valid(row):
        gene = str(row["gene"])
        ref_pos = int(row["ref_pos"])
        mut_type = str(row["type"])
        seq_length = int(row["seq_length"])
        seq = str(row["seq"])
        if gene not in mut_dic.keys():
            return False
        if mut_type not in mut_dic[gene].keys():
            return False
        if abs(ref_pos - mut_dic[gene][mut_type]["pos"]) > shift_base:
            return False
        if editdistance.eval(seq, mut_dic[gene][mut_type]["seq"]) > shift_base:
            return False
        return True

    df["is_valid"] = df.apply(func=is_valid, axis=1)
    df.to_csv(out_read_file, sep="\t")

    df_valid = df[df["is_valid"]]
    df_read_count = df_valid.groupby(
        ["gene", "type", "barcode", "UMI"]).agg({"UMI": "count"})
    df_read_count.columns = ["read_count"]
    df_read_count.reset_index(inplace=True)
    df_read_count.to_csv(out_read_count_file, sep="\t")

    df_UMI_count = df_read_count.groupby(
        ["gene", "type", "barcode"]).agg({"UMI": "count"})
    df_UMI_count.columns = ["UMI_count"]
    df_UMI_count.to_csv(out_umi_count_file, sep="\t")

    df_temp = df_UMI_count.reset_index()
    if df_temp.shape[0] == 0:
        count_mut.logger.warning('NO VALID INSERTION FOUND!')
    else:
        df_insertion = df_temp[df_temp["type"] == "insertion"]
        df_insertion_barcode_count = df_insertion.pivot(
            index="barcode", columns="gene", values="UMI_count")
        df_insertion_barcode_count.to_csv(
            out_insertion_barcode_count_file, sep="\t")
        df_tsne_mut = pd.merge(
            df_tsne, df_insertion_barcode_count,
            right_index=True, left_index=True, how="left")
        df_tsne_mut.fillna(0, inplace=True)
        df_tsne_mut.to_csv(out_tsne_file, sep="\t")

        # plot
        app = parentDir + "/plot.R"
        cmd = f"Rscript {app} --tsne_mut {out_tsne_file} --outdir {outdir}"
        count_mut.logger.info(cmd)
        os.system(cmd)
        count_mut.logger.info("plot done.")
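# count_mut() expects read_mut() to return a nested dict shaped like
# mut_dic[gene][mut_type] = {"pos": <reference position>, "seq": <expected seq>},
# which is how is_valid() consumes it above. A minimal sketch assuming a
# hypothetical tab-separated mut_file with columns gene, type, pos and seq:
import csv
from collections import defaultdict

def read_mut_sketch(mut_file):
    mut_dic = defaultdict(dict)
    with open(mut_file) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            mut_dic[row['gene']][row['type']] = {
                'pos': int(row['pos']),
                'seq': row['seq'],
            }
    return mut_dic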