def estimate_bias_vom(args): regions = GenomicRegionSet("regions") regions.read(args.regions_file) create_signal(args, regions) hmm_data = HmmData() learn_dependency_model = hmm_data.get_dependency_model() slim_dimont_predictor = hmm_data.get_slim_dimont_predictor() test_fa = hmm_data.get_default_test_fa() shutil.copy(test_fa, args.output_location) os.chdir(args.output_location) print(os.getcwd()) output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb))) output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb))) output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb))) output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb))) infix = "{}_f_obs".format(str(args.k_nb)) create_model(args, output_fname_f_obs, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_f_exp".format(str(args.k_nb)) create_model(args, output_fname_f_exp, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_r_obs".format(str(args.k_nb)) create_model(args, output_fname_r_obs, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_r_exp".format(str(args.k_nb)) create_model(args, output_fname_r_exp, infix, learn_dependency_model, slim_dimont_predictor) os.remove(os.path.join(args.output_location, "test.fa")) compute_bias(args)
def estimate_bias_vom(args): regions = GenomicRegionSet("regions") regions.read(args.regions_file) create_signal(args, regions) hmm_data = HmmData() learn_dependency_model = hmm_data.get_dependency_model() slim_dimont_predictor = hmm_data.get_slim_dimont_predictor() test_fa = hmm_data.get_default_test_fa() shutil.copy(test_fa, args.output_location) os.chdir(args.output_location) print((os.getcwd())) output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb))) output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb))) output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb))) output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb))) infix = "{}_f_obs".format(str(args.k_nb)) create_model(args, output_fname_f_obs, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_f_exp".format(str(args.k_nb)) create_model(args, output_fname_f_exp, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_r_obs".format(str(args.k_nb)) create_model(args, output_fname_r_obs, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_r_exp".format(str(args.k_nb)) create_model(args, output_fname_r_exp, infix, learn_dependency_model, slim_dimont_predictor) os.remove(os.path.join(args.output_location, "test.fa")) compute_bias(args)
def evidence_run(args): # Expanding summits chip_summit_regions = GenomicRegionSet("TFBS Summit Regions") chip_summit_regions.read(args.chip_file) for region in iter(chip_summit_regions): summit = int(region.data.split()[-1]) + region.initial region.initial = max(summit - (args.peak_ext / 2), 0) region.final = summit + (args.peak_ext / 2) # Calculating intersections mpbs_regions = GenomicRegionSet("MPBS Regions") mpbs_regions.read(args.mpbs_file) chip_summit_regions.sort() mpbs_regions.sort() tfbs_regions = GenomicRegionSet("TFBS Regions") for mpbs_region in mpbs_regions: if chip_summit_regions.include(mpbs_region): mpbs_region.name = mpbs_region.name.split(":")[0] + ":Y" else: mpbs_region.name = mpbs_region.name.split(":")[0] + ":N" tfbs_regions.add(mpbs_region) tfbs_regions.sort() tfbs_fname = os.path.join(args.output_location, "{}.bed".format(args.output_prefix)) tfbs_regions.write(tfbs_fname)
def get_bc_signal(arguments): (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() bam1 = Samfile(reads_file1, "rb") bam2 = Samfile(reads_file2, "rb") genome_data = GenomeData(organism) fasta = Fastafile(genome_data.get_genome()) signal_1 = np.zeros(window_size) signal_2 = np.zeros(window_size) motif_len = None pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), ("G", [0.0] * window_size), ("T", [0.0] * window_size), ("N", [0.0] * window_size)]) mpbs_regions = mpbs.by_names([mpbs_name]) num_motif = len(mpbs_regions) # Fetch bias corrected signal for region in mpbs_regions: if motif_len is None: motif_len = region.final - region.initial mid = (region.final + region.initial) / 2 p1 = mid - window_size / 2 p2 = mid + window_size / 2 if p1 <= 0: continue # Fetch raw signal signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1, bias_table=bias_table1, genome_file_name=genome_data.get_genome(), forward_shift=forward_shift, reverse_shift=reverse_shift) signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2, bias_table=bias_table2, genome_file_name=genome_data.get_genome(), forward_shift=forward_shift, reverse_shift=reverse_shift) if len(signal1) != len(signal_1) or len(signal2) != len(signal_2): continue # smooth the signal signal_1 = np.add(signal_1, np.array(signal1)) signal_2 = np.add(signal_2, np.array(signal2)) update_pwm(pwm, fasta, region, p1, p2) return signal_1, signal_2, motif_len, pwm, num_motif
def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(output_fname)
class BiasTable: """ Represent a bias table. Authors: Eduardo G. Gusmao. """ def __init__(self, reads_file=None, regions_file=None, genome_file=None, k_nb=None, forward_shift=None, reverse_shift=None, bias_type=None, output_location=None): """ Initializes BiasTable. """ self.regions = GenomicRegionSet("Bias Regions") self.reads_file = reads_file if regions_file is not None: self.regions.read(regions_file) self.genome_file = genome_file self.k_nb = k_nb self.forward_shift = forward_shift self.reverse_shift = reverse_shift self.bias_type = bias_type self.output_location = output_location def load_table(self, table_file_name_F, table_file_name_R): """ Creates a bias table from a tab separated file with a k-mer and bias estimate in each line. Keyword arguments: table_file_name -- Table file name. Return: bias_table_F, bias_table_R -- Bias tables. """ bias_table_F = dict() table_file_F = open(table_file_name_F, "r") for line in table_file_F: ll = line.strip().split("\t") bias_table_F[ll[0]] = float(ll[1]) table_file_F.close() bias_table_R = dict() table_file_R = open(table_file_name_R, "r") for line in table_file_R: ll = line.strip().split("\t") bias_table_R[ll[0]] = float(ll[1]) table_file_R.close() return [bias_table_F, bias_table_R]
def test_fisher_table(self): regions = GenomicRegionSet("regions") regions.read(os.path.join(os.path.dirname(__file__), "test.bed")) mpbs = GenomicRegionSet("mpbs") mpbs.read(os.path.join(os.path.dirname(__file__), "test_mpbs.bed")) i, ni, gs, ms = fisher_table("GGT1", regions, mpbs) self.assertEqual(i, 0) self.assertEqual(ni, 36) i, ni, gs, ms = fisher_table("HIC2", regions, mpbs) self.assertEqual(i, 8) self.assertEqual(ni, 28) i, ni, gs, ms = fisher_table("RAC2", regions, mpbs, gene_set=True, mpbs_set=True) self.assertEqual(len(gs), 0) self.assertEqual(len(ms), 0)
def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(output_fname)
def test_subtract_exact(self): reference = GenomicRegionSet("reference") reference.read(os.path.join(os.path.dirname(__file__), "test_result.bed")) background = GenomicRegionSet("background") background.read(os.path.join(os.path.dirname(__file__), "test_background.bed")) target = GenomicRegionSet("target") target.read(os.path.join(os.path.dirname(__file__), "test_target.bed")) background_tmp = background.subtract(target, exact=True) reference.sort() self.assertEqual(len(background_tmp.sequences), len(reference.sequences)) for region, region_ref in zip(background_tmp.sequences, reference.sequences): self.assertEqual(region.__cmp__(region_ref), 0)
def test_subtract_exact(self): reference = GenomicRegionSet("reference") reference.read( os.path.join(os.path.dirname(__file__), "test_result.bed")) background = GenomicRegionSet("background") background.read( os.path.join(os.path.dirname(__file__), "test_background.bed")) target = GenomicRegionSet("target") target.read(os.path.join(os.path.dirname(__file__), "test_target.bed")) background_tmp = background.subtract(target, exact=True) reference.sort() self.assertEqual(len(background_tmp.sequences), len(reference.sequences)) for region, region_ref in zip(background_tmp.sequences, reference.sequences): self.assertEqual(region, region_ref)
def test_get_fisher_dict(self): motif_names = ["TFIP11", "ACO2", "HIC2", "HAT5"] regions = GenomicRegionSet("regions") regions.read(os.path.join(os.path.dirname(__file__), "test.bed")) mpbs = GenomicRegionSet("mpbs") mpbs.read(os.path.join(os.path.dirname(__file__), "test_mpbs.bed")) regions2 = GenomicRegionSet("regions2") regions2.read(os.path.join(os.path.dirname(__file__), "test2.bed")) mpbs2 = GenomicRegionSet("mpbs2") mpbs2.read(os.path.join(os.path.dirname(__file__), "test2_mpbs.bed")) regions3 = GenomicRegionSet("regions3") regions3.read(os.path.join(os.path.dirname(__file__), "test3.bed")) mpbs3 = GenomicRegionSet("mpbs3") mpbs3.read(os.path.join(os.path.dirname(__file__), "test3_mpbs.bed")) result = get_fisher_dict(motif_names, regions, mpbs) intersecting = result[0] not_intersecting = result[1] for mn in ["TFIP11", "ACO2", "HAT5"]: self.assertEqual(intersecting[mn], 0) self.assertEqual(intersecting["HIC2"], 8) for mn in motif_names: self.assertEqual(intersecting[mn]+not_intersecting[mn], 36) result = get_fisher_dict(motif_names, regions, mpbs, gene_set=True, mpbs_set=True) gs = result[2] ms = result[3] for mn in ["TFIP11", "ACO2", "HAT5"]: self.assertEqual(len(gs[mn]), 0) self.assertEqual(len(gs["HIC2"]), 8) self.assertEqual(len(ms["HIC2"]), 8)
def estimate_bias_kmer(args): # Parameters maxDuplicates = 100 pseudocount = 1.0 # Initializing bam and fasta bamFile = Samfile(args.reads_file, "rb") genome_data = GenomeData(args.organism) fastaFile = Fastafile(genome_data.get_genome()) regions = GenomicRegionSet("regions") regions.read(args.regions_file) # Initializing dictionaries obsDictF = dict() obsDictR = dict() expDictF = dict() expDictR = dict() ct_reads_r = 0 ct_reads_f = 0 ct_kmers = 0 # Iterating on HS regions for region in regions: # Initialization prevPos = -1 trueCounter = 0 # Evaluating observed frequencies #################################### # Fetching reads for r in bamFile.fetch(region.chrom, region.initial, region.final): # Calculating positions if not r.is_reverse: cut_site = r.pos + args.forward_shift - 1 p1 = cut_site - int(floor(args.k_nb / 2)) else: cut_site = r.aend + args.reverse_shift + 1 p1 = cut_site - int(floor(args.k_nb / 2)) p2 = p1 + args.k_nb # Verifying PCR artifacts if p1 == prevPos: trueCounter += 1 else: prevPos = p1 trueCounter = 0 if trueCounter > maxDuplicates: continue # Fetching k-mer try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper() except Exception: continue if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr) # Counting k-mer in dictionary if not r.is_reverse: ct_reads_f += 1 try: obsDictF[currStr] += 1 except Exception: obsDictF[currStr] = 1 else: ct_reads_r += 1 try: obsDictR[currStr] += 1 except Exception: obsDictR[currStr] = 1 # Evaluating expected frequencies #################################### # Fetching whole sequence try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper() except Exception: continue currRevComp = AuxiliaryFunctions.revcomp(currStr) # Iterating on each sequence position for i in range(0, len(currStr) - args.k_nb): ct_kmers += 1 # Counting k-mer in dictionary s = currStr[i:i + args.k_nb] try: expDictF[s] += 1 except Exception: expDictF[s] = 1 # Counting k-mer in dictionary for reverse complement s = currRevComp[i:i + args.k_nb] try: expDictR[s] += 1 except Exception: expDictR[s] = 1 # Closing files bamFile.close() fastaFile.close() # Creating bias dictionary alphabet = ["A", "C", "G", "T"] kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)] bias_table_F = dict([(e, 0.0) for e in kmerComb]) bias_table_R = dict([(e, 0.0) for e in kmerComb]) for kmer in kmerComb: try: obsF = obsDictF[kmer] + pseudocount except Exception: obsF = pseudocount try: expF = expDictF[kmer] + pseudocount except Exception: expF = pseudocount if ct_reads_f == 0: bias_table_F[kmer] = 1 else: bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6) try: obsR = obsDictR[kmer] + pseudocount except Exception: obsR = pseudocount try: expR = expDictR[kmer] + pseudocount except Exception: expR = pseudocount if ct_reads_r == 0: bias_table_R[kmer] = 1 else: bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6) write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
type=str, help="Define the organism") parser.add_argument( '-rmcoding', metavar=' ', type=float, help="Define the cutoff to remove the entries with coding potential") parser.add_argument('-mafdir', metavar=' ', type=str, help="Define the directory to MAF files") # python /projects/reg-gen/tools/phylocsf_check.py -i args = parser.parse_args() bed = GenomicRegionSet("input") bed.read(args.i) num = len(bed) organisms = { "hg18": "Human", "panTro2": "Chimp", "rheMac2": "Rhesus", "tarSyr1": "Tarsier", "micMur1": "Mouse_lemur", "otoGar1": "Bushbaby", "tupBel1": "Shrew", "mm9": "Mouse", "rn4": "Rat", "dipOrd1": "Kangaroo_Rat", "cavPor2": "Guinea_Pig", "speTri1": "Squirrel",
def get_bc_signal(arguments): (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() bam1 = Samfile(reads_file1, "rb") bam2 = Samfile(reads_file2, "rb") genome_data = GenomeData(organism) fasta = Fastafile(genome_data.get_genome()) signal_1 = np.zeros(window_size) signal_2 = np.zeros(window_size) motif_len = None pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), ("G", [0.0] * window_size), ("T", [0.0] * window_size), ("N", [0.0] * window_size)]) mpbs_regions = mpbs.by_names([mpbs_name]) num_motif = len(mpbs_regions) # Fetch bias corrected signal for region in mpbs_regions: if motif_len is None: motif_len = region.final - region.initial mid = (region.final + region.initial) / 2 p1 = mid - window_size / 2 p2 = mid + window_size / 2 if p1 <= 0: continue # Fetch raw signal signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1, bias_table=bias_table1, genome_file_name=genome_data.get_genome(), forward_shift=forward_shift, reverse_shift=reverse_shift) signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2, bias_table=bias_table2, genome_file_name=genome_data.get_genome(), forward_shift=forward_shift, reverse_shift=reverse_shift) if len(signal1) != len(signal_1) or len(signal2) != len(signal_2): continue signal_1 = np.add(signal_1, np.array(signal1)) signal_2 = np.add(signal_2, np.array(signal2)) update_pwm(pwm, fasta, region, p1, p2) return signal_1, signal_2, motif_len, pwm, num_motif
def estimate_bias_pwm(args): # Parameters max_duplicates = 100 # Initializing bam and fasta bamFile = Samfile(args.reads_file, "rb") genome_data = GenomeData(args.organism) fastaFile = Fastafile(genome_data.get_genome()) regions = GenomicRegionSet("regions") regions.read(args.regions_file) obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb), ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)]) exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb), ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)]) obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb), ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)]) exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb), ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)]) # Iterating on HS regions for region in regions: # Initialization prev_pos = -1 true_counter = 0 # Evaluating observed frequencies # Fetching reads for r in bamFile.fetch(region.chrom, region.initial, region.final): # Calculating positions if not r.is_reverse: cut_site = r.pos + args.forward_shift - 1 p1 = cut_site - int(floor(args.k_nb / 2)) else: cut_site = r.aend + args.reverse_shift + 1 p1 = cut_site - int(floor(args.k_nb / 2)) p2 = p1 + args.k_nb # Verifying PCR artifacts if p1 == prev_pos: true_counter += 1 else: prev_pos = p1 true_counter = 0 if true_counter > max_duplicates: continue # Fetching k-mer try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper() except Exception: continue if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr) # Counting k-mer in dictionary if not r.is_reverse: for i in range(0, len(currStr)): obs_f_pwm_dict[currStr[i]][i] += 1 else: for i in range(0, len(currStr)): obs_r_pwm_dict[currStr[i]][i] += 1 # Evaluating expected frequencies # Fetching whole sequence try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper() except Exception: continue # Iterating on each sequence position s = None for i in range(0, len(currStr) - args.k_nb): # Counting k-mer in dictionary s = currStr[i:i + args.k_nb] for i in range(0, len(s)): exp_f_pwm_dict[s[i]][i] += 1 # Counting k-mer in dictionary for reverse complement s = AuxiliaryFunctions.revcomp(s) for i in range(0, len(s)): exp_r_pwm_dict[s[i]][i] += 1 # Closing files bamFile.close() fastaFile.close() # Output pwms os.system("mkdir -p " + os.path.join(args.output_location, "pfm")) pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict] pwm_file_list = [] pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb))) pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb))) pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb))) pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb))) pwm_file_list.append(pwm_obs_f) pwm_file_list.append(pwm_obs_r) pwm_file_list.append(pwm_exp_f) pwm_file_list.append(pwm_exp_r) for i in range(len(pwm_dict_list)): with open(pwm_file_list[i], "w") as pwm_file: for e in ["A", "C", "G", "T"]: pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n") motif_obs_f = motifs.read(open(pwm_obs_f), "pfm") motif_obs_r = motifs.read(open(pwm_obs_r), "pfm") motif_exp_f = motifs.read(open(pwm_exp_f), "pfm") motif_exp_r = motifs.read(open(pwm_exp_r), "pfm") # Output logos os.system("mkdir -p " + os.path.join(args.output_location, "logo")) logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb))) logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb))) logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb))) logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb))) motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic", yaxis_scale=0.2, yaxis_tic_interval=0.1) motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic", yaxis_scale=0.2, yaxis_tic_interval=0.1) motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic", yaxis_scale=0.02, yaxis_tic_interval=0.01) motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic", yaxis_scale=0.02, yaxis_tic_interval=0.01) # Creating bias dictionary alphabet = ["A", "C", "G", "T"] k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)] bias_table_F = dict([(e, 0.0) for e in k_mer_comb]) bias_table_R = dict([(e, 0.0) for e in k_mer_comb]) for k_mer in k_mer_comb: obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb) exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb) bias_table_F[k_mer] = round(obs_f / exp_f, 6) obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb) exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb) bias_table_R[k_mer] = round(obs_r / exp_r, 6) write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def chip_evaluate(args): # Evaluate Statistics fpr = dict() tpr = dict() roc_auc_1 = dict() roc_auc_10 = dict() roc_auc_50 = dict() roc_auc_100 = dict() recall = dict() precision = dict() prc_auc_1 = dict() prc_auc_10 = dict() prc_auc_50 = dict() prc_auc_100 = dict() footprint_file = args.footprint_file.split(",") footprint_name = args.footprint_name.split(",") footprint_type = args.footprint_type.split(",") max_score = 0 if "SEG" in footprint_type: mpbs_regions = GenomicRegionSet("TFBS") mpbs_regions.read(args.tfbs_file) # Verifying the maximum score of the MPBS file for region in iter(mpbs_regions): score = int(region.data.split("\t")[0]) if score > max_score: max_score = score max_score += 1 max_points = [] for i in range(len(footprint_file)): footprints_regions = GenomicRegionSet("Footprints Prediction") footprints_regions.read(footprint_file[i]) footprints_regions.sort() if footprint_type[i] == "SEG": # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints. increased_score_mpbs_regions = GenomicRegionSet("Increased Regions") intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL) for region in iter(intersect_regions): region.data = str(int(region.data.split("\t")[0]) + max_score) increased_score_mpbs_regions.add(region) # Keep the score of remained MPBS entry unchanged without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True) for region in iter(without_intersect_regions): increased_score_mpbs_regions.add(region) increased_score_mpbs_regions.sort_score() fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \ roc_curve(increased_score_mpbs_regions) recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \ precision_recall_curve(increased_score_mpbs_regions) max_points.append(len(intersect_regions)) elif footprint_type[i] == "SC": footprints_regions.sort_score() fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \ roc_curve(footprints_regions) recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \ precision_recall_curve(footprints_regions) max_points.append(len(footprints_regions)) # Output the statistics results into text stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix)) stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1", "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"] with open(stats_fname, "w") as stats_file: stats_file.write("\t".join(stats_header) + "\n") for i in range(len(footprint_name)): stats_file.write(footprint_name[i] + "\t" + str(roc_auc_100[i]) + "\t" + str(roc_auc_50[i]) + "\t" + str(roc_auc_10[i]) + "\t" + str(roc_auc_1[i]) + "\t" + str(prc_auc_100[i]) + "\t" + str(prc_auc_50[i]) + "\t" + str(prc_auc_10[i]) + "\t" + str(prc_auc_1[i]) + "\n") # Output the curves if args.print_roc_curve: label_x = "False Positive Rate" label_y = "True Positive Rate" curve_name = "ROC" plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100, label_x, label_y, args.output_prefix, curve_name, max_points=max_points) if args.print_pr_curve: label_x = "Recall" label_y = "Precision" curve_name = "PRC" plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100, label_x, label_y, args.output_prefix, curve_name, max_points=max_points) output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
import sys import pandas as pd from pysam import Samfile from rgt.GenomicRegionSet import GenomicRegionSet from rgt.Util import GenomeData tf_file = sys.argv[1] bam_file = sys.argv[2] output_file = sys.argv[3] gr_tfs = GenomicRegionSet(name="TFs") gr_tfs.read(filename=tf_file) gr_genes = gr_tfs.gene_association(organism="hg38") # Fetching chromosome sizes genome_data = GenomeData("hg38") chrom_sizes_file_name = genome_data.get_chromosome_sizes() chrom_sizes_file = open(chrom_sizes_file_name, "r") chrom_sizes_dict = dict() for chrom_sizes_entry_line in chrom_sizes_file: chrom_sizes_entry_vec = chrom_sizes_entry_line.strip().split("\t") chrom_sizes_dict[chrom_sizes_entry_vec[0]] = int(chrom_sizes_entry_vec[1]) chrom_sizes_file.close() bam = Samfile(bam_file, "rb") tf_list = list() gene_list = list() tc_list = list() for i, r in enumerate(gr_tfs):
def get_bc_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() bam = Samfile(args.input_files[0], "rb") genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) hmm_data = HmmData() if args.bias_table: bias_table_list = args.bias_table.split(",") bias_table = BiasTable().load_table(table_file_name_F=bias_table_list[0], table_file_name_R=bias_table_list[1]) else: table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) if args.strand_specific: fname_forward = os.path.join(args.output_location, "{}_forward.wig".format(args.output_prefix)) fname_reverse = os.path.join(args.output_location, "{}_reverse.wig".format(args.output_prefix)) f_forward = open(fname_forward, "a") f_reverse = open(fname_reverse, "a") for region in regions: signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=True) if args.norm: signal_f = reads_file.boyle_norm(signal_f) perc = scoreatpercentile(signal_f, 98) std = np.std(signal_f) signal_f = reads_file.hon_norm_atac(signal_f, perc, std) signal_r = reads_file.boyle_norm(signal_r) perc = scoreatpercentile(signal_r, 98) std = np.std(signal_r) signal_r = reads_file.hon_norm_atac(signal_r, perc, std) f_forward.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n") f_reverse.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n") f_forward.close() f_reverse.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}_forward.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(fname_forward) bw_filename = os.path.join(args.output_location, "{}_reverse.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(fname_reverse) else: output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) with open(output_fname, "a") as output_f: for region in regions: signal = reads_file.get_bc_signal_by_fragment_length(ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=False) if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(output_fname)
def diff_analysis_run(args): # Initializing Error Handler err = ErrorHandler() output_location = os.path.join(args.output_location, "Lineplots") try: if not os.path.isdir(output_location): os.makedirs(output_location) except Exception: err.throw_error("MM_OUT_FOLDER_CREATION") # Check if the index file exists base_name1 = "{}.bai".format(args.reads_file1) if not os.path.exists(base_name1): pysam.index(args.reads_file1) base_name2 = "{}.bai".format(args.reads_file2) if not os.path.exists(base_name2): pysam.index(args.reads_file2) mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(args.mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(args.mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() mpbs.remove_duplicates() mpbs_name_list = list(set(mpbs.get_names())) signal_dict_by_tf_1 = dict() signal_dict_by_tf_2 = dict() motif_len_dict = dict() motif_num_dict = dict() pwm_dict_by_tf = dict() pool = Pool(processes=args.nc) # differential analysis using bias corrected signal if args.bc: hmm_data = HmmData() table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table1 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) bias_table2 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) mpbs_list = list() for mpbs_name in mpbs_name_list: mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2, args.organism, args.window_size, args.forward_shift, args.reverse_shift, bias_table1, bias_table2)) try: res = pool.map(get_bc_signal, mpbs_list) except Exception: logging.exception("get bias corrected signal failed") # differential analysis using raw signal else: mpbs_list = list() for mpbs_name in mpbs_name_list: mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2, args.organism, args.window_size, args.forward_shift, args.reverse_shift)) try: res = pool.map(get_raw_signal, mpbs_list) except Exception: logging.exception("get raw signal failed") for idx, mpbs_name in enumerate(mpbs_name_list): signal_dict_by_tf_1[mpbs_name] = res[idx][0] signal_dict_by_tf_2[mpbs_name] = res[idx][1] motif_len_dict[mpbs_name] = res[idx][2] pwm_dict_by_tf[mpbs_name] = res[idx][3] motif_num_dict[mpbs_name] = res[idx][4] if args.factor1 is None or args.factor2 is None: args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2) output_factor(args, args.factor1, args.factor2) if args.output_profiles: output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location, args.condition1) output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location, args.condition2) ps_tc_results_by_tf = dict() plots_list = list() for mpbs_name in mpbs_name_list: plots_list.append((mpbs_name, motif_num_dict[mpbs_name], signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, args.condition1, args.condition2, pwm_dict_by_tf[mpbs_name], output_location, args.window_size, args.standardize)) pool.map(line_plot, plots_list) for mpbs_name in mpbs_name_list: res = get_ps_tc_results(signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, motif_num_dict[mpbs_name], motif_len_dict[mpbs_name]) # # # only use the factors whose protection scores are greater than 0 # if res[0] > 0 and res[1] < 0: ps_tc_results_by_tf[mpbs_name] = res # # stat_results_by_tf = get_stat_results(ps_tc_results_by_tf) ps_tc_results_by_tf = scatter_plot(args, ps_tc_results_by_tf) output_stat_results(args, ps_tc_results_by_tf, motif_num_dict)
def get_bc_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() bam = Samfile(args.input_files[0], "rb") genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) hmm_data = HmmData() if args.bias_table: bias_table_list = args.bias_table.split(",") bias_table = BiasTable().load_table( table_file_name_F=bias_table_list[0], table_file_name_R=bias_table_list[1]) else: table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) if args.strand_specific: fname_forward = os.path.join( args.output_location, "{}_forward.wig".format(args.output_prefix)) fname_reverse = os.path.join( args.output_location, "{}_reverse.wig".format(args.output_prefix)) f_forward = open(fname_forward, "a") f_reverse = open(fname_reverse, "a") for region in regions: signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=True) if args.norm: signal_f = reads_file.boyle_norm(signal_f) perc = scoreatpercentile(signal_f, 98) std = np.std(signal_f) signal_f = reads_file.hon_norm_atac(signal_f, perc, std) signal_r = reads_file.boyle_norm(signal_r) perc = scoreatpercentile(signal_r, 98) std = np.std(signal_r) signal_r = reads_file.hon_norm_atac(signal_r, perc, std) f_forward.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n") f_reverse.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n") f_forward.close() f_reverse.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join( args.output_location, "{}_forward.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(fname_forward) bw_filename = os.path.join( args.output_location, "{}_reverse.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(fname_reverse) else: output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) with open(output_fname, "a") as output_f: for region in regions: signal = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=False) if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(output_fname)
def chip_evaluate(args): # Evaluate Statistics fpr = dict() tpr = dict() roc_auc_1 = dict() roc_auc_10 = dict() roc_auc_50 = dict() roc_auc_100 = dict() recall = dict() precision = dict() prc_auc_1 = dict() prc_auc_10 = dict() prc_auc_50 = dict() prc_auc_100 = dict() footprint_file = args.footprint_file.split(",") footprint_name = args.footprint_name.split(",") footprint_type = args.footprint_type.split(",") max_score = 0 if "SEG" in footprint_type: mpbs_regions = GenomicRegionSet("TFBS") mpbs_regions.read(args.tfbs_file) # Verifying the maximum score of the MPBS file for region in iter(mpbs_regions): score = int(region.data.split("\t")[0]) if score > max_score: max_score = score max_score += 1 for i in range(len(footprint_file)): footprints_regions = GenomicRegionSet("Footprints Prediction") footprints_regions.read(footprint_file[i]) footprints_regions.sort() if footprint_type[i] == "SEG": # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints. increased_score_mpbs_regions = GenomicRegionSet("Increased Regions") intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL) for region in iter(intersect_regions): region.data = str(int(region.data.split("\t")[0]) + max_score) increased_score_mpbs_regions.add(region) # Keep the score of remained MPBS entry unchanged without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True) for region in iter(without_intersect_regions): increased_score_mpbs_regions.add(region) increased_score_mpbs_regions.sort_score() fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \ roc_curve(increased_score_mpbs_regions) recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \ precision_recall_curve(increased_score_mpbs_regions) elif footprint_type[i] == "SC": footprints_regions.sort_score() fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \ roc_curve(footprints_regions) recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \ precision_recall_curve(footprints_regions) # Output the statistics results into text stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix)) stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1", "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"] with open(stats_fname, "w") as stats_file: stats_file.write("\t".join(stats_header) + "\n") for i in range(len(footprint_name)): stats_file.write(footprint_name[i] + "\t" + str(roc_auc_100[i]) + "\t" + str(roc_auc_50[i]) + "\t" + str(roc_auc_10[i]) + "\t" + str(roc_auc_1[i]) + "\t" + str(prc_auc_100[i]) + "\t" + str(prc_auc_50[i]) + "\t" + str(prc_auc_10[i]) + "\t" + str(prc_auc_1[i]) + "\n") # Output the curves if args.print_roc_curve: label_x = "False Positive Rate" label_y = "True Positive Rate" curve_name = "ROC" plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100, label_x, label_y, args.output_prefix, curve_name) if args.print_pr_curve: label_x = "Recall" label_y = "Precision" curve_name = "PRC" plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100, label_x, label_y, args.output_prefix, curve_name) output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
def get_raw_signal(arguments): (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, window_size, forward_shift, reverse_shift) = arguments mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() bam1 = Samfile(reads_file1, "rb") bam2 = Samfile(reads_file2, "rb") genome_data = GenomeData(organism) fasta = Fastafile(genome_data.get_genome()) signal_1 = np.zeros(window_size) signal_2 = np.zeros(window_size) motif_len = None pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), ("G", [0.0] * window_size), ("T", [0.0] * window_size), ("N", [0.0] * window_size)]) mpbs_regions = mpbs.by_names([mpbs_name]) num_motif = len(mpbs_regions) for region in mpbs_regions: if motif_len is None: motif_len = region.final - region.initial mid = (region.final + region.initial) / 2 p1 = mid - window_size / 2 p2 = mid + window_size / 2 if p1 <= 0: continue # Fetch raw signal for read in bam1.fetch(region.chrom, p1, p2): if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: signal_1[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: signal_1[cut_site - p1] += 1.0 for read in bam2.fetch(region.chrom, p1, p2): if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: signal_2[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: signal_2[cut_site - p1] += 1.0 update_pwm(pwm, fasta, region, p1, p2) return signal_1, signal_2, motif_len, pwm, num_motif
def get_raw_signal(arguments): (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, window_size, forward_shift, reverse_shift) = arguments mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() bam1 = Samfile(reads_file1, "rb") bam2 = Samfile(reads_file2, "rb") genome_data = GenomeData(organism) fasta = Fastafile(genome_data.get_genome()) signal_1 = np.zeros(window_size) signal_2 = np.zeros(window_size) motif_len = None pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), ("G", [0.0] * window_size), ("T", [0.0] * window_size), ("N", [0.0] * window_size)]) mpbs_regions = mpbs.by_names([mpbs_name]) num_motif = len(mpbs_regions) for region in mpbs_regions: if motif_len is None: motif_len = region.final - region.initial mid = (region.final + region.initial) / 2 p1 = mid - window_size / 2 p2 = mid + window_size / 2 if p1 <= 0: continue # Fetch raw signal for read in bam1.fetch(region.chrom, p1, p2): # check if the read is unmapped, according to issue #112 if read.is_unmapped: continue if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: signal_1[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: signal_1[cut_site - p1] += 1.0 for read in bam2.fetch(region.chrom, p1, p2): # check if the read is unmapped, according to issue #112 if read.is_unmapped: continue if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: signal_2[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: signal_2[cut_site - p1] += 1.0 update_pwm(pwm, fasta, region, p1, p2) return signal_1, signal_2, motif_len, pwm, num_motif
def diff_analysis_run(args): # Initializing Error Handler err = ErrorHandler() output_location = os.path.join(args.output_location, "Lineplots") try: if not os.path.isdir(output_location): os.makedirs(output_location) except Exception: err.throw_error("MM_OUT_FOLDER_CREATION") mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(args.mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(args.mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() mpbs_name_list = list(set(mpbs.get_names())) signal_dict_by_tf_1 = dict() signal_dict_by_tf_2 = dict() motif_len_dict = dict() motif_num_dict = dict() pwm_dict_by_tf = dict() pool = Pool(processes=args.nc) # differential analysis using bias corrected signal if args.bc: hmm_data = HmmData() table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table1 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) bias_table2 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) mpbs_list = list() for mpbs_name in mpbs_name_list: mpbs_list.append( (mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2, args.organism, args.window_size, args.forward_shift, args.reverse_shift, bias_table1, bias_table2)) try: res = pool.map(get_bc_signal, mpbs_list) except Exception: logging.exception("get bias corrected signal failed") # differential analysis using raw signal else: mpbs_list = list() for mpbs_name in mpbs_name_list: mpbs_list.append( (mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2, args.organism, args.window_size, args.forward_shift, args.reverse_shift)) try: res = pool.map(get_raw_signal, mpbs_list) except Exception: logging.exception("get raw signal failed") for idx, mpbs_name in enumerate(mpbs_name_list): signal_dict_by_tf_1[mpbs_name] = res[idx][0] signal_dict_by_tf_2[mpbs_name] = res[idx][1] motif_len_dict[mpbs_name] = res[idx][2] pwm_dict_by_tf[mpbs_name] = res[idx][3] motif_num_dict[mpbs_name] = res[idx][4] if args.factor1 is None or args.factor2 is None: args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2) output_factor(args, args.factor1, args.factor2) if args.output_profiles: output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location, args.condition1) output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location, args.condition2) ps_tc_results_by_tf = dict() plots_list = list() for mpbs_name in mpbs_name_list: plots_list.append( (mpbs_name, motif_num_dict[mpbs_name], signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, args.condition1, args.condition2, pwm_dict_by_tf[mpbs_name], output_location, args.window_size, args.standardize)) pool.map(line_plot, plots_list) for mpbs_name in mpbs_name_list: res = get_ps_tc_results(signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, motif_num_dict[mpbs_name], motif_len_dict[mpbs_name]) # # # only use the factors whose protection scores are greater than 0 # if res[0] > 0 and res[1] < 0: ps_tc_results_by_tf[mpbs_name] = res # stat_results_by_tf = get_stat_results(ps_tc_results_by_tf) scatter_plot(args, stat_results_by_tf) output_stat_results(args, stat_results_by_tf, motif_num_dict)
def diff_analysis_run(args): # Initializing Error Handler err = ErrorHandler() output_location = os.path.join(args.output_location, "Lineplots") try: if not os.path.isdir(output_location): os.makedirs(output_location) except Exception: err.throw_error("MM_OUT_FOLDER_CREATION") # check if they have same length mpbs_files = args.mpbs_files.strip().split(",") reads_files = args.reads_files.strip().split(",") conditions = args.conditions.strip().split(",") if args.colors is not None: colors = args.colors.strip().split(",") else: colors = [ "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33", "#a65628", "#f781bf", "#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3", "#8dd3c7", "#ffffb3", "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5", "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", "#e6ab02", "#a6761d", "#666666", "#7fc97f", "#beaed4", "#fdc086", "#ffff99", "#386cb0", "#f0027f", "#bf5b17", "#666666" ] assert len(mpbs_files) == len(reads_files) == len(conditions), \ "Number of motif, read and condition names are not same: {}, {}, {}".format(len(mpbs_files), len(reads_files), len(conditions)) # Check if the index file exists for reads_file in reads_files: base_name = "{}.bai".format(reads_file) if not os.path.exists(base_name): pysam.index(reads_file) mpbs = GenomicRegionSet("Motif Predicted Binding Sites of All Conditions") for i, mpbs_file in enumerate(mpbs_files): mpbs.read(mpbs_file) mpbs.sort() mpbs.remove_duplicates() mpbs_name_list = list(set(mpbs.get_names())) signals = np.zeros(shape=(len(conditions), len(mpbs_name_list), args.window_size), dtype=np.float32) motif_len = list() motif_num = list() motif_pwm = list() print((" {} cpus are detected and {} of them will be used...\n".format( cpu_count(), args.nc))) genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) print("generating signal for each motif and condition...\n") # differential analysis using bias corrected signal if args.bc: hmm_data = HmmData() table_forward = hmm_data.get_default_bias_table_F_ATAC() table_reverse = hmm_data.get_default_bias_table_R_ATAC() bias_table = BiasTable().load_table(table_file_name_F=table_forward, table_file_name_R=table_reverse) # do not use multi-processing if args.nc == 1: for i, condition in enumerate(conditions): for j, mpbs_name in enumerate(mpbs_name_list): mpbs_regions = mpbs.by_names([mpbs_name]) arguments = (mpbs_regions, reads_files[i], args.organism, args.window_size, args.forward_shift, args.reverse_shift, bias_table) try: signals[i, j, :] = get_bc_signal(arguments) except Exception: logging.exception("get bias corrected signal failed") # get motif length, number and pwm matrix motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial) motif_num.append(len(mpbs_regions)) motif_pwm.append( get_pwm(fasta, mpbs_regions, args.window_size)) # use multi-processing else: for i, condition in enumerate(conditions): print(( "generating signal for condition {} \n".format(condition))) with Pool(processes=args.nc) as pool: arguments_list = list() for mpbs_name in mpbs_name_list: mpbs_regions = mpbs.by_names([mpbs_name]) arguments = (mpbs_regions, reads_files[i], args.organism, args.window_size, args.forward_shift, args.reverse_shift, bias_table) arguments_list.append(arguments) # get motif length, number and pwm matrix motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial) motif_num.append(len(mpbs_regions)) motif_pwm.append( get_pwm(fasta, mpbs_regions, args.window_size)) res = pool.map(get_bc_signal, arguments_list) signals[i] = np.array(res) # differential analysis using raw signal else: # do not use multi-processing if args.nc == 1: for i, condition in enumerate(conditions): for j, mpbs_name in enumerate(mpbs_name_list): mpbs_regions = mpbs.by_names([mpbs_name]) arguments = (mpbs_regions, reads_files[i], args.organism, args.window_size, args.forward_shift, args.reverse_shift) signals[i, j, :] = get_raw_signal(arguments) # get motif length, number and pwm matrix motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial) motif_num.append(len(mpbs_regions)) motif_pwm.append( get_pwm(fasta, mpbs_regions, args.window_size)) # use multi-processing else: for i, condition in enumerate(conditions): print(( "generating signal for condition {} \n".format(condition))) with Pool(processes=args.nc) as pool: arguments_list = list() for mpbs_name in mpbs_name_list: mpbs_regions = mpbs.by_names([mpbs_name]) arguments = (mpbs_regions, reads_files[i], args.organism, args.window_size, args.forward_shift, args.reverse_shift) arguments_list.append(arguments) # get motif length, number and pwm matrix motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial) motif_num.append(len(mpbs_regions)) motif_pwm.append( get_pwm(fasta, mpbs_regions, args.window_size)) res = pool.map(get_raw_signal, arguments_list) signals[i] = np.array(res) print("signal generation is done!\n") # compute normalization facotr for each condition factors = compute_factors(signals) output_factor(args, factors, conditions) # normalize signals by factor and number of motifs for i in range(len(conditions)): for j in range(len(mpbs_name_list)): signals[i, j, :] = signals[i, j, :] / (factors[i] * motif_num[j]) if args.output_profiles: output_profiles(mpbs_name_list, signals, conditions, args.output_location) print("generating line plot for each motif...\n") if args.nc == 1: for i, mpbs_name in enumerate(mpbs_name_list): output_line_plot( (mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location, args.window_size, colors)) else: with Pool(processes=args.nc) as pool: arguments_list = list() for i, mpbs_name in enumerate(mpbs_name_list): arguments_list.append( (mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location, args.window_size, colors)) pool.map(output_line_plot, arguments_list) ps_tc_results = list() for i, mpbs_name in enumerate(mpbs_name_list): ps_tc_results.append( get_ps_tc_results(signals[:, i, :], motif_len[i], args.window_size)) # find the significant motifs and generate a scatter plot if two conditions are given if len(conditions) == 2: ps_tc_results = scatter_plot(args, ps_tc_results, mpbs_name_list, conditions) output_stat_results(ps_tc_results, conditions, mpbs_name_list, motif_num, args)