def estimate_bias_vom(args):
    """Create the four strand/kind models and then compute the bias.

    Reads the regions from ``args.regions_file``, passes them to
    ``create_signal``, calls ``create_model`` once for each of the
    ``f_obs``, ``f_exp``, ``r_obs`` and ``r_exp`` infixes (prefixed with
    ``args.k_nb``) and finally calls ``compute_bias``.

    Side effects visible in this function: the default test FASTA is copied
    into ``args.output_location``, the process working directory is changed
    to that folder and printed, and ``test.fa`` is removed from it once the
    models have been created.
    """
    region_set = GenomicRegionSet("regions")
    region_set.read(args.regions_file)
    create_signal(args, region_set)

    data = HmmData()
    learn_dependency_model = data.get_dependency_model()
    slim_dimont_predictor = data.get_slim_dimont_predictor()
    shutil.copy(data.get_default_test_fa(), args.output_location)

    os.chdir(args.output_location)
    print(os.getcwd())

    # One model per (strand, observed/expected) combination, in this order.
    for template in ("{}_f_obs", "{}_f_exp", "{}_r_obs", "{}_r_exp"):
        infix = template.format(str(args.k_nb))
        model_fasta = os.path.join(args.output_location, infix + ".fa")
        create_model(args, model_fasta, infix,
                     learn_dependency_model, slim_dimont_predictor)

    os.remove(os.path.join(args.output_location, "test.fa"))
    compute_bias(args)
def estimate_bias_vom(args):
    """Create the four strand/kind models and then compute the bias.

    Steps: read ``args.regions_file`` into a region set, hand it to
    ``create_signal``, call ``create_model`` for the ``f_obs``, ``f_exp``,
    ``r_obs`` and ``r_exp`` infixes (each prefixed with ``args.k_nb``), then
    call ``compute_bias``.

    Side effects visible in this function: the default test FASTA is copied
    into ``args.output_location``, the working directory is switched to that
    folder and printed, and ``test.fa`` is deleted from it afterwards.
    """
    region_set = GenomicRegionSet("regions")
    region_set.read(args.regions_file)
    create_signal(args, region_set)

    data = HmmData()
    learn_dependency_model = data.get_dependency_model()
    slim_dimont_predictor = data.get_slim_dimont_predictor()
    default_test_fasta = data.get_default_test_fa()
    shutil.copy(default_test_fasta, args.output_location)

    out_dir = args.output_location
    os.chdir(out_dir)
    print(os.getcwd())

    k_nb = str(args.k_nb)
    infixes = [template.format(k_nb)
               for template in ("{}_f_obs", "{}_f_exp", "{}_r_obs", "{}_r_exp")]
    for infix in infixes:
        create_model(args, os.path.join(out_dir, infix + ".fa"), infix,
                     learn_dependency_model, slim_dimont_predictor)

    os.remove(os.path.join(out_dir, "test.fa"))
    compute_bias(args)
def diff_analysis_run(args):
    """Run the differential analysis over an arbitrary number of conditions.

    Attributes read from ``args``: ``output_location``, ``mpbs_files``,
    ``reads_files``, ``conditions`` (comma separated lists of equal length),
    ``colors`` (optional comma separated list), ``window_size``, ``nc``
    (number of worker processes; ``1`` disables multiprocessing),
    ``organism``, ``bc`` (use bias-corrected instead of raw signal),
    ``forward_shift``, ``reverse_shift`` and ``output_profiles``.

    Line plots are written below ``<output_location>/Lineplots``. A scatter
    plot is only produced when exactly two conditions are given.

    Raises:
        AssertionError: if the numbers of motif files, read files and
            condition names differ.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    mpbs_files = args.mpbs_files.strip().split(",")
    reads_files = args.reads_files.strip().split(",")
    conditions = args.conditions.strip().split(",")

    if args.colors is not None:
        colors = args.colors.strip().split(",")
    else:
        colors = [
            "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33",
            "#a65628", "#f781bf", "#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3",
            "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3", "#8dd3c7", "#ffffb3",
            "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5",
            "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", "#e6ab02",
            "#a6761d", "#666666", "#7fc97f", "#beaed4", "#fdc086", "#ffff99",
            "#386cb0", "#f0027f", "#bf5b17", "#666666"
        ]

    # check if they have same length
    assert len(mpbs_files) == len(reads_files) == len(conditions), \
        "Number of motif, read and condition names are not same: {}, {}, {}".format(
            len(mpbs_files), len(reads_files), len(conditions))

    # Check if the index file exists
    for reads_file in reads_files:
        if not os.path.exists("{}.bai".format(reads_file)):
            pysam.index(reads_file)

    # All motif files are read into one region set.
    mpbs = GenomicRegionSet("Motif Predicted Binding Sites of All Conditions")
    for mpbs_file in mpbs_files:
        mpbs.read(mpbs_file)

    mpbs.sort()
    mpbs.remove_duplicates()
    mpbs_name_list = list(set(mpbs.get_names()))

    signals = np.zeros(shape=(len(conditions), len(mpbs_name_list), args.window_size),
                       dtype=np.float32)

    print((" {} cpus are detected and {} of them will be used...\n".format(
        cpu_count(), args.nc)))

    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    # get motif length, number and pwm matrix.
    # These depend only on the motif, not on the condition, so they are
    # computed exactly once per motif. Everything below indexes these lists
    # by motif position, i.e. they must have len(mpbs_name_list) entries.
    motif_len = list()
    motif_num = list()
    motif_pwm = list()
    for mpbs_name in mpbs_name_list:
        mpbs_regions = mpbs.by_names([mpbs_name])
        motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial)
        motif_num.append(len(mpbs_regions))
        motif_pwm.append(get_pwm(fasta, mpbs_regions, args.window_size))

    print("generating signal for each motif and condition...\n")

    if args.bc:
        # differential analysis using bias corrected signal
        hmm_data = HmmData()
        table_forward = hmm_data.get_default_bias_table_F_ATAC()
        table_reverse = hmm_data.get_default_bias_table_R_ATAC()
        bias_table = BiasTable().load_table(table_file_name_F=table_forward,
                                            table_file_name_R=table_reverse)
        signal_function = get_bc_signal
        extra_arguments = (bias_table,)
    else:
        # differential analysis using raw signal
        signal_function = get_raw_signal
        extra_arguments = ()

    def build_arguments(condition_index, mpbs_name):
        # Argument tuple consumed by get_bc_signal / get_raw_signal.
        return (mpbs.by_names([mpbs_name]), reads_files[condition_index],
                args.organism, args.window_size, args.forward_shift,
                args.reverse_shift) + extra_arguments

    if args.nc == 1:
        # do not use multi-processing
        for i in range(len(conditions)):
            for j, mpbs_name in enumerate(mpbs_name_list):
                arguments = build_arguments(i, mpbs_name)
                if args.bc:
                    # A failing motif is logged and its row stays all zeros.
                    try:
                        signals[i, j, :] = get_bc_signal(arguments)
                    except Exception:
                        logging.exception("get bias corrected signal failed")
                else:
                    signals[i, j, :] = get_raw_signal(arguments)
    else:
        # use multi-processing
        for i, condition in enumerate(conditions):
            print(("generating signal for condition {} \n".format(condition)))
            with Pool(processes=args.nc) as pool:
                arguments_list = [build_arguments(i, mpbs_name)
                                  for mpbs_name in mpbs_name_list]
                res = pool.map(signal_function, arguments_list)
                signals[i] = np.array(res)

    print("signal generation is done!\n")

    # compute normalization factor for each condition
    factors = compute_factors(signals)
    output_factor(args, factors, conditions)

    # normalize signals by factor and number of motifs
    for i in range(len(conditions)):
        for j in range(len(mpbs_name_list)):
            signals[i, j, :] = signals[i, j, :] / (factors[i] * motif_num[j])

    if args.output_profiles:
        output_profiles(mpbs_name_list, signals, conditions, args.output_location)

    print("generating line plot for each motif...\n")
    plot_arguments = [
        (mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i],
         output_location, args.window_size, colors)
        for i, mpbs_name in enumerate(mpbs_name_list)
    ]
    if args.nc == 1:
        for arguments in plot_arguments:
            output_line_plot(arguments)
    else:
        with Pool(processes=args.nc) as pool:
            pool.map(output_line_plot, plot_arguments)

    ps_tc_results = [
        get_ps_tc_results(signals[:, i, :], motif_len[i], args.window_size)
        for i in range(len(mpbs_name_list))
    ]

    # find the significant motifs and generate a scatter plot if two conditions are given
    if len(conditions) == 2:
        ps_tc_results = scatter_plot(args, ps_tc_results, mpbs_name_list, conditions)

    output_stat_results(ps_tc_results, conditions, mpbs_name_list, motif_num, args)
def diff_analysis_run(args):
    """Run the differential analysis between two conditions.

    Attributes read from ``args``: ``output_location``, ``reads_file1``,
    ``reads_file2``, ``mpbs_file1``, ``mpbs_file2``, ``nc`` (number of
    worker processes), ``bc`` (use bias-corrected instead of raw signal),
    ``organism``, ``window_size``, ``forward_shift``, ``reverse_shift``,
    ``factor1``, ``factor2``, ``output_profiles``, ``condition1``,
    ``condition2`` and ``standardize``. ``args.factor1`` / ``args.factor2``
    are overwritten with computed values when either of them is ``None``.

    Line plots are written below ``<output_location>/Lineplots``.

    Raises:
        Exception: whatever the signal workers raise. The error is logged
            and re-raised, because nothing below can run without the
            signals.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    # Check if the index file exists
    for reads_file in (args.reads_file1, args.reads_file2):
        if not os.path.exists("{}.bai".format(reads_file)):
            pysam.index(reads_file)

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(args.mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(args.mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()
    mpbs.remove_duplicates()
    mpbs_name_list = list(set(mpbs.get_names()))

    signal_dict_by_tf_1 = dict()
    signal_dict_by_tf_2 = dict()
    motif_len_dict = dict()
    motif_num_dict = dict()
    pwm_dict_by_tf = dict()

    # The context manager guarantees the worker processes are released even
    # when one of the steps below raises.
    with Pool(processes=args.nc) as pool:
        common_arguments = (args.mpbs_file1, args.mpbs_file2,
                            args.reads_file1, args.reads_file2,
                            args.organism, args.window_size,
                            args.forward_shift, args.reverse_shift)

        if args.bc:
            # differential analysis using bias corrected signal
            hmm_data = HmmData()
            table_F = hmm_data.get_default_bias_table_F_ATAC()
            table_R = hmm_data.get_default_bias_table_R_ATAC()
            bias_table1 = BiasTable().load_table(table_file_name_F=table_F,
                                                 table_file_name_R=table_R)
            bias_table2 = BiasTable().load_table(table_file_name_F=table_F,
                                                 table_file_name_R=table_R)

            mpbs_list = [(mpbs_name,) + common_arguments + (bias_table1, bias_table2)
                         for mpbs_name in mpbs_name_list]
            try:
                res = pool.map(get_bc_signal, mpbs_list)
            except Exception:
                logging.exception("get bias corrected signal failed")
                # Without `res` the code below would only fail with a
                # confusing NameError, so surface the real error instead.
                raise
        else:
            # differential analysis using raw signal
            mpbs_list = [(mpbs_name,) + common_arguments
                         for mpbs_name in mpbs_name_list]
            try:
                res = pool.map(get_raw_signal, mpbs_list)
            except Exception:
                logging.exception("get raw signal failed")
                raise

        # Each worker result is indexed positionally:
        # 0/1 signal of condition 1/2, 2 motif length, 3 pwm, 4 motif number.
        for idx, mpbs_name in enumerate(mpbs_name_list):
            signal_dict_by_tf_1[mpbs_name] = res[idx][0]
            signal_dict_by_tf_2[mpbs_name] = res[idx][1]
            motif_len_dict[mpbs_name] = res[idx][2]
            pwm_dict_by_tf[mpbs_name] = res[idx][3]
            motif_num_dict[mpbs_name] = res[idx][4]

        if args.factor1 is None or args.factor2 is None:
            args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1,
                                                         signal_dict_by_tf_2)
            # NOTE(review): indentation was lost in the reviewed source; the
            # factors are assumed to be written only when they were computed
            # here - confirm against the repository.
            output_factor(args, args.factor1, args.factor2)

        if args.output_profiles:
            output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location,
                            args.condition1)
            output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location,
                            args.condition2)

        plots_list = [
            (mpbs_name, motif_num_dict[mpbs_name],
             signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
             args.factor1, args.factor2, args.condition1, args.condition2,
             pwm_dict_by_tf[mpbs_name], output_location, args.window_size,
             args.standardize)
            for mpbs_name in mpbs_name_list
        ]
        pool.map(line_plot, plots_list)

    ps_tc_results_by_tf = dict()
    for mpbs_name in mpbs_name_list:
        ps_tc_results_by_tf[mpbs_name] = get_ps_tc_results(
            signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
            args.factor1, args.factor2,
            motif_num_dict[mpbs_name], motif_len_dict[mpbs_name])

    ps_tc_results_by_tf = scatter_plot(args, ps_tc_results_by_tf)
    output_stat_results(args, ps_tc_results_by_tf, motif_num_dict)
def get_bc_tracks(args):
    """Write bias-corrected signal tracks for a set of regions as wig/bigWig.

    ``args.input_files`` must hold exactly two entries: the reads (BAM) file
    and the regions file. The regions are merged and, for each of them, the
    bias-corrected signal is appended to ``<output_prefix>.wig`` or, when
    ``args.strand_specific`` is set, to ``<output_prefix>_forward.wig`` and
    ``<output_prefix>_reverse.wig`` (reverse values are negated) inside
    ``args.output_location``.

    Other attributes read from ``args``: ``organism``, ``bias_table``
    (optional ``"forward,reverse"`` pair of table files; the default ATAC
    tables are used otherwise), ``forward_shift``, ``reverse_shift``,
    ``norm`` and ``bigWig``. With ``args.bigWig`` every wig file is converted
    with the external ``wigToBigWig`` tool and then deleted.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()

    reads_file = GenomicSignal()

    def normalize(signal):
        # Same normalisation chain for every track: boyle_norm, then
        # hon_norm_atac parameterised by the 98th percentile and the std.
        signal = reads_file.boyle_norm(signal)
        perc = scoreatpercentile(signal, 98)
        std = np.std(signal)
        return reads_file.hon_norm_atac(signal, perc, std)

    def wig_block(region, values):
        # wig "fixedStep" positions are 1-based, hence `initial + 1`.
        return ("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1)
                + " step=1\n" + "\n".join(values) + "\n")

    bam = Samfile(args.input_files[0], "rb")
    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    # (wig file, bigWig file) pairs produced by this run, in write order.
    tracks = []

    try:
        hmm_data = HmmData()
        if args.bias_table:
            bias_table_list = args.bias_table.split(",")
            bias_table = BiasTable().load_table(table_file_name_F=bias_table_list[0],
                                                table_file_name_R=bias_table_list[1])
        else:
            table_F = hmm_data.get_default_bias_table_F_ATAC()
            table_R = hmm_data.get_default_bias_table_R_ATAC()
            bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                                table_file_name_R=table_R)

        if args.strand_specific:
            fname_forward = os.path.join(args.output_location,
                                         "{}_forward.wig".format(args.output_prefix))
            fname_reverse = os.path.join(args.output_location,
                                         "{}_reverse.wig".format(args.output_prefix))
            tracks.append((fname_forward,
                           os.path.join(args.output_location,
                                        "{}_forward.bw".format(args.output_prefix))))
            tracks.append((fname_reverse,
                           os.path.join(args.output_location,
                                        "{}_reverse.bw".format(args.output_prefix))))

            with open(fname_forward, "a") as f_forward, \
                    open(fname_reverse, "a") as f_reverse:
                for region in regions:
                    signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length(
                        ref=region.chrom, start=region.initial, end=region.final,
                        bam=bam, fasta=fasta, bias_table=bias_table,
                        forward_shift=args.forward_shift,
                        reverse_shift=args.reverse_shift,
                        min_length=None, max_length=None, strand=True)

                    if args.norm:
                        signal_f = normalize(signal_f)
                        signal_r = normalize(signal_r)

                    f_forward.write(wig_block(
                        region, [str(e) for e in np.nan_to_num(signal_f)]))
                    # The reverse strand is written with negated values.
                    f_reverse.write(wig_block(
                        region, [str(-e) for e in np.nan_to_num(signal_r)]))
        else:
            output_fname = os.path.join(args.output_location,
                                        "{}.wig".format(args.output_prefix))
            tracks.append((output_fname,
                           os.path.join(args.output_location,
                                        "{}.bw".format(args.output_prefix))))

            with open(output_fname, "a") as output_f:
                for region in regions:
                    signal = reads_file.get_bc_signal_by_fragment_length(
                        ref=region.chrom, start=region.initial, end=region.final,
                        bam=bam, fasta=fasta, bias_table=bias_table,
                        forward_shift=args.forward_shift,
                        reverse_shift=args.reverse_shift,
                        min_length=None, max_length=None, strand=False)

                    if args.norm:
                        signal = normalize(signal)

                    output_f.write(wig_block(
                        region, [str(e) for e in np.nan_to_num(signal)]))
    finally:
        # Release the BAM / FASTA handles even if signal generation fails.
        fasta.close()
        bam.close()

    if args.bigWig:
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        for wig_filename, bw_filename in tracks:
            os.system(" ".join(["wigToBigWig", wig_filename, chrom_sizes_file,
                                bw_filename, "-verbose=0"]))
            os.remove(wig_filename)
def footprint(bam: str, bed: str, assembly: str = "hg38", w: int = 500, dnase: bool = False, bias_type="SH"):
    """Collect strand-oriented, bias-corrected signal profiles for BED regions.

    Args:
        bam: path of the BAM file with the reads.
        bed: path of a whitespace separated region file. The first three
            columns are passed to ``expandRegion``; column 4 (if present) is
            passed as the fourth argument and column 5 (if present) is used
            as the strand, defaulting to ``'.'``. Blank lines are skipped.
            NOTE(review): standard BED keeps the strand in column 6 - confirm
            the expected input layout.
        assembly: organism name handed to ``GenomeData``.
        w: passed through to ``expandRegion``.
        dnase: use the DNase signal function and bias tables instead of the
            ATAC ones.
        bias_type: ``'SH'`` or ``'DH'``; only consulted when ``dnase`` is
            true.

    Returns:
        A list with one ``regionDict(region, forward, reverse)`` entry per
        region whose signal could be generated. For regions on the ``'-'``
        strand both profiles are reversed and swapped. Regions that failed
        are left out and only counted in a warning printed to stderr.

    Raises:
        ValueError: if ``dnase`` is true and ``bias_type`` is neither
            ``'SH'`` nor ``'DH'``.
    """
    # load HMM and bias parameters
    g = GenomeData(organism=assembly)
    hmm_data = HmmData()
    if dnase:
        # NOTE(review): hmm_file is assigned but never used in this function.
        hmm_file = hmm_data.get_default_hmm_dnase_bc()
        if bias_type == 'SH':
            table_F = hmm_data.get_default_bias_table_F_SH()
            table_R = hmm_data.get_default_bias_table_R_SH()
        elif bias_type == 'DH':
            table_F = hmm_data.get_default_bias_table_F_DH()
            table_R = hmm_data.get_default_bias_table_R_DH()
        else:
            # Previously this left `bias_table` unbound, which made every
            # single region fail silently further down.
            raise ValueError(
                "bias_type must be 'SH' or 'DH' for DNase data, got %r" % (bias_type,))
    else:
        hmm_file = hmm_data.get_default_hmm_atac_paired()
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
    bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R)

    # load reads from BAM
    reads_file = GenomicSignal(bam)
    reads_file.load_sg_coefs(SG_WINDOW_SIZE)

    # The signal functions below receive neither of these handles, so they
    # are opened only to fail fast on unreadable inputs and closed right away
    # instead of being leaked.
    Samfile(bam, "rb").close()
    Fastafile(g.get_genome()).close()

    # load and expand regions
    regions = []
    with open(bed, 'r') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                continue
            regions.append(expandRegion(
                *fields[:3],
                fields[3] if len(fields) >= 4 else None,
                w,
                fields[4] if len(fields) >= 5 else '.'))

    # load signal
    forward = []
    reverse = []
    failed = 0
    get_reads = reads_file.get_signal_atac if not dnase else reads_file.get_signal
    for i, x in enumerate(regions):
        try:
            chromosome, start, end, _, strand = x
            atac_norm_f, atac_slope_f, atac_norm_r, atac_slope_r = get_reads(
                chromosome, start, end, 0, 0,
                FORWARD_SHIFT if not dnase else 0,
                REVERSE_SHIFT if not dnase else 0,
                1000 if dnase else 150, 98, 98, bias_table, g.get_genome())
            atac_norm_f = [float(value) for value in atac_norm_f]
            atac_norm_r = [float(value) for value in atac_norm_r]
            if strand == '-':
                atac_norm_f.reverse()
                atac_norm_r.reverse()
        except Exception:
            # Best effort: a failing region is recorded as missing and
            # reported in the summary warning. KeyboardInterrupt and
            # SystemExit are deliberately not swallowed.
            forward.append(None)
            reverse.append(None)
            failed += 1
            continue

        # For '-' strand regions the two profiles swap roles.
        forward.append(atac_norm_f if strand != '-' else atac_norm_r)
        reverse.append(atac_norm_r if strand != '-' else atac_norm_f)
        if i % 500 == 0:
            print("INFO: aggregating region %d of %d" % (i, len(regions)), file=sys.stderr)

    if failed > 0:
        print(
            "WARNING: failed to generate bias-corrected signal profiles for %d regions" % failed,
            file=sys.stderr)

    return [
        regionDict(region, fwd, rev)
        for region, fwd, rev in zip(regions, forward, reverse)
        if fwd is not None and rev is not None
    ]
def diff_analysis_run(args):
    """Run the differential analysis between two conditions.

    Attributes read from ``args``: ``output_location``, ``mpbs_file1``,
    ``mpbs_file2``, ``reads_file1``, ``reads_file2``, ``nc`` (number of
    worker processes), ``bc`` (use bias-corrected instead of raw signal),
    ``organism``, ``window_size``, ``forward_shift``, ``reverse_shift``,
    ``factor1``, ``factor2``, ``output_profiles``, ``condition1``,
    ``condition2`` and ``standardize``. ``args.factor1`` / ``args.factor2``
    are overwritten with computed values when either of them is ``None``.

    Line plots are written below ``<output_location>/Lineplots``.

    Raises:
        Exception: whatever the signal workers raise. The error is logged
            and re-raised, because nothing below can run without the
            signals.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(args.mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(args.mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()
    mpbs_name_list = list(set(mpbs.get_names()))

    signal_dict_by_tf_1 = dict()
    signal_dict_by_tf_2 = dict()
    motif_len_dict = dict()
    motif_num_dict = dict()
    pwm_dict_by_tf = dict()

    # The context manager guarantees the worker processes are released even
    # when one of the steps below raises.
    with Pool(processes=args.nc) as pool:
        common_arguments = (args.mpbs_file1, args.mpbs_file2,
                            args.reads_file1, args.reads_file2,
                            args.organism, args.window_size,
                            args.forward_shift, args.reverse_shift)

        if args.bc:
            # differential analysis using bias corrected signal
            hmm_data = HmmData()
            table_F = hmm_data.get_default_bias_table_F_ATAC()
            table_R = hmm_data.get_default_bias_table_R_ATAC()
            bias_table1 = BiasTable().load_table(table_file_name_F=table_F,
                                                 table_file_name_R=table_R)
            bias_table2 = BiasTable().load_table(table_file_name_F=table_F,
                                                 table_file_name_R=table_R)

            mpbs_list = [(mpbs_name,) + common_arguments + (bias_table1, bias_table2)
                         for mpbs_name in mpbs_name_list]
            try:
                res = pool.map(get_bc_signal, mpbs_list)
            except Exception:
                logging.exception("get bias corrected signal failed")
                # Without `res` the code below would only fail with a
                # confusing NameError, so surface the real error instead.
                raise
        else:
            # differential analysis using raw signal
            mpbs_list = [(mpbs_name,) + common_arguments
                         for mpbs_name in mpbs_name_list]
            try:
                res = pool.map(get_raw_signal, mpbs_list)
            except Exception:
                logging.exception("get raw signal failed")
                raise

        # Each worker result is indexed positionally:
        # 0/1 signal of condition 1/2, 2 motif length, 3 pwm, 4 motif number.
        for idx, mpbs_name in enumerate(mpbs_name_list):
            signal_dict_by_tf_1[mpbs_name] = res[idx][0]
            signal_dict_by_tf_2[mpbs_name] = res[idx][1]
            motif_len_dict[mpbs_name] = res[idx][2]
            pwm_dict_by_tf[mpbs_name] = res[idx][3]
            motif_num_dict[mpbs_name] = res[idx][4]

        if args.factor1 is None or args.factor2 is None:
            args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1,
                                                         signal_dict_by_tf_2)
            # NOTE(review): indentation was lost in the reviewed source; the
            # factors are assumed to be written only when they were computed
            # here - confirm against the repository.
            output_factor(args, args.factor1, args.factor2)

        if args.output_profiles:
            output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location,
                            args.condition1)
            output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location,
                            args.condition2)

        plots_list = [
            (mpbs_name, motif_num_dict[mpbs_name],
             signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
             args.factor1, args.factor2, args.condition1, args.condition2,
             pwm_dict_by_tf[mpbs_name], output_location, args.window_size,
             args.standardize)
            for mpbs_name in mpbs_name_list
        ]
        pool.map(line_plot, plots_list)

    ps_tc_results_by_tf = dict()
    for mpbs_name in mpbs_name_list:
        ps_tc_results_by_tf[mpbs_name] = get_ps_tc_results(
            signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
            args.factor1, args.factor2,
            motif_num_dict[mpbs_name], motif_len_dict[mpbs_name])

    stat_results_by_tf = get_stat_results(ps_tc_results_by_tf)
    scatter_plot(args, stat_results_by_tf)
    output_stat_results(args, stat_results_by_tf, motif_num_dict)
def get_bc_tracks(args):
    """Write bias-corrected signal tracks for a set of regions as wig/bigWig.

    ``args.input_files`` must hold exactly two entries: the reads (BAM) file
    and the regions file. The regions are merged and, for each of them, the
    bias-corrected signal is appended to ``<output_prefix>.wig`` or, when
    ``args.strand_specific`` is set, to ``<output_prefix>_forward.wig`` and
    ``<output_prefix>_reverse.wig`` (reverse values are negated) inside
    ``args.output_location``.

    Other attributes read from ``args``: ``organism``, ``bias_table``
    (optional ``"forward,reverse"`` pair of table files; the default ATAC
    tables are used otherwise), ``forward_shift``, ``reverse_shift``,
    ``norm`` and ``bigWig``. With ``args.bigWig`` every wig file is converted
    with the external ``wigToBigWig`` tool and then deleted.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()

    reads_file = GenomicSignal()

    def normalize(signal):
        # Same normalisation chain for every track: boyle_norm, then
        # hon_norm_atac parameterised by the 98th percentile and the std.
        signal = reads_file.boyle_norm(signal)
        perc = scoreatpercentile(signal, 98)
        std = np.std(signal)
        return reads_file.hon_norm_atac(signal, perc, std)

    def wig_block(region, values):
        # wig "fixedStep" positions are 1-based, hence `initial + 1`.
        return ("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1)
                + " step=1\n" + "\n".join(values) + "\n")

    def output_path(template):
        # Path of an output file named after the configured prefix.
        return os.path.join(args.output_location, template.format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    # (wig file, bigWig file) pairs produced by this run, in write order.
    tracks = []

    try:
        hmm_data = HmmData()
        if args.bias_table:
            bias_table_list = args.bias_table.split(",")
            bias_table = BiasTable().load_table(
                table_file_name_F=bias_table_list[0],
                table_file_name_R=bias_table_list[1])
        else:
            table_F = hmm_data.get_default_bias_table_F_ATAC()
            table_R = hmm_data.get_default_bias_table_R_ATAC()
            bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                                table_file_name_R=table_R)

        if args.strand_specific:
            fname_forward = output_path("{}_forward.wig")
            fname_reverse = output_path("{}_reverse.wig")
            tracks.append((fname_forward, output_path("{}_forward.bw")))
            tracks.append((fname_reverse, output_path("{}_reverse.bw")))

            with open(fname_forward, "a") as f_forward, \
                    open(fname_reverse, "a") as f_reverse:
                for region in regions:
                    signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length(
                        ref=region.chrom,
                        start=region.initial,
                        end=region.final,
                        bam=bam,
                        fasta=fasta,
                        bias_table=bias_table,
                        forward_shift=args.forward_shift,
                        reverse_shift=args.reverse_shift,
                        min_length=None,
                        max_length=None,
                        strand=True)

                    if args.norm:
                        signal_f = normalize(signal_f)
                        signal_r = normalize(signal_r)

                    f_forward.write(wig_block(
                        region, [str(e) for e in np.nan_to_num(signal_f)]))
                    # The reverse strand is written with negated values.
                    f_reverse.write(wig_block(
                        region, [str(-e) for e in np.nan_to_num(signal_r)]))
        else:
            output_fname = output_path("{}.wig")
            tracks.append((output_fname, output_path("{}.bw")))

            with open(output_fname, "a") as output_f:
                for region in regions:
                    signal = reads_file.get_bc_signal_by_fragment_length(
                        ref=region.chrom,
                        start=region.initial,
                        end=region.final,
                        bam=bam,
                        fasta=fasta,
                        bias_table=bias_table,
                        forward_shift=args.forward_shift,
                        reverse_shift=args.reverse_shift,
                        min_length=None,
                        max_length=None,
                        strand=False)

                    if args.norm:
                        signal = normalize(signal)

                    output_f.write(wig_block(
                        region, [str(e) for e in np.nan_to_num(signal)]))
    finally:
        # Release the BAM / FASTA handles even if signal generation fails.
        fasta.close()
        bam.close()

    if args.bigWig:
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        for wig_filename, bw_filename in tracks:
            os.system(" ".join([
                "wigToBigWig", wig_filename, chrom_sizes_file, bw_filename,
                "-verbose=0"
            ]))
            os.remove(wig_filename)