def get_tag_count(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                  initial_clip=1000):
    """
    Computes the total clipped signal produced by the reads of self.bam between start and end.

    Keyword arguments:
    ref -- Chromosome name.
    start -- Initial genomic coordinate of signal.
    end -- Final genomic coordinate of signal.
    downstream_ext -- Number of bps to extend towards the downstream region
                      (right for forward strand and left for reverse strand).
    upstream_ext -- Number of bps to extend towards the upstream region
                    (left for forward strand and right for reverse strand).
    forward_shift -- Number of bps to shift the reads aligned to the forward strand. Can be a positive number
                     for a shift towards the downstream region (towards the inside of the aligned read) and a
                     negative number for a shift towards the upstream region.
    reverse_shift -- Number of bps to shift the reads aligned to the reverse strand. Can be a positive number
                     for a shift towards the upstream region and a negative number for a shift towards the
                     downstream region (towards the inside of the aligned read).
    initial_clip -- Every entry of the pileup vector is capped at this value before summing, to avoid outliers.

    Return:
    tag_count -- Sum of the clipped signal, or 0 if summing raises an exception.
    """

    # Build the pileup. When ps_version is "0.7.5" the region object is handed to fetch as a callback;
    # for any other version the fetched alignments are fed to it one by one.
    region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
    if ps_version != "0.7.5":
        for aligned_read in self.bam.fetch(reference=ref, start=start, end=end):
            region(aligned_read)
    else:
        self.bam.fetch(reference=ref, start=start, end=end, callback=region)

    # Cap every entry at initial_clip
    clipped_signal = array([min(depth, initial_clip) for depth in region.vector])

    # Tag count (falls back to 0 when the signal cannot be summed)
    try:
        return sum(clipped_signal)
    except Exception:
        return 0
# Iterating on MPBSs intFile = open(intFileName, "r") spr = 0.0 counter = 0.0 for line in intFile: # Fetching signal ll = line.strip().split("\t") mLen = int(ll[2]) - int(ll[1]) mid = (int(ll[1]) + int(ll[2])) / 2 p1 = max(mid - halfWindow, 0) p2 = mid + halfWindow # Fetch raw signal pileup_region = PileupRegion(p1, p2, 1) if (ps_version == "0.7.5"): bam.fetch(reference=ll[0], start=p1, end=p2, callback=pileup_region) else: iter = bam.fetch(reference=ll[0], start=p1, end=p2) for alignment in iter: pileup_region.__call__(alignment) raw_signal = array( [min(e, initial_clip) for e in pileup_region.vector]) # Std-based clipping mean = raw_signal.mean() std = raw_signal.std()
# Iterating on MPBSs intFile = open(intFileName,"r") spr = 0.0 counter = 0.0 for line in intFile: # Fetching signal ll = line.strip().split("\t") mLen = int(ll[2]) - int(ll[1]) mid = (int(ll[1])+int(ll[2]))/2 p1 = max(mid - halfWindow,0) p2 = mid + halfWindow # Fetch raw signal pileup_region = PileupRegion(p1,p2,1) if(ps_version == "0.7.5"): bam.fetch(reference=ll[0], start=p1, end=p2, callback = pileup_region) else: iter = bam.fetch(reference=ll[0], start=p1, end=p2) for alignment in iter: pileup_region.__call__(alignment) raw_signal = array([min(e,initial_clip) for e in pileup_region.vector]) # Std-based clipping mean = raw_signal.mean() std = raw_signal.std() clip_signal = [min(e, mean + (10 * std)) for e in raw_signal] # Bias Correction correctedSignal = bias_correction(bam, clip_signal, biasTableF, biasTableR, genomeFileName, ll[0], p1, p2)
def get_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
               initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
               print_raw_signal=False):
    """
    Builds the normalized signal and its slope for a region of self.bam.

    The raw pileup is clipped twice (at initial_clip, then at mean + 10 * std), bias-corrected,
    Boyle-normalized, Hon-normalized and finally passed to the slope function.

    Keyword arguments:
    ref -- Chromosome name.
    start -- Initial genomic coordinate of signal.
    end -- Final genomic coordinate of signal.
    downstream_ext -- Number of bps to extend towards the downstream region
                      (right for forward strand and left for reverse strand).
    upstream_ext -- Number of bps to extend towards the upstream region
                    (left for forward strand and right for reverse strand).
    forward_shift -- Number of bps to shift the reads aligned to the forward strand. Can be a positive number
                     for a shift towards the downstream region (towards the inside of the aligned read) and a
                     negative number for a shift towards the upstream region.
    reverse_shift -- Number of bps to shift the reads aligned to the reverse strand. Can be a positive number
                     for a shift towards the upstream region and a negative number for a shift towards the
                     downstream region (towards the inside of the aligned read).
    initial_clip -- Signal will be initially clipped at this level to avoid outliers.
    per_norm -- Percentile of the Boyle-normalized signal handed to the Hon normalization.
    per_slope -- Accepted for interface compatibility; not read by this method.
    bias_table -- Bias table to perform bias correction.
    genome_file_name -- Genome to perform bias correction.
    print_raw_signal -- Accepted for interface compatibility; not read by this method.

    Return:
    hon_signal -- Normalized signal.
    slope_signal -- Slope signal.
    """

    # Raw pileup. When ps_version is "0.7.5" the region object is handed to fetch as a callback;
    # for any other version the fetched alignments are fed to it one by one.
    region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
    if ps_version != "0.7.5":
        for aligned_read in self.bam.fetch(reference=ref, start=start, end=end):
            region(aligned_read)
    else:
        self.bam.fetch(reference=ref, start=start, end=end, callback=region)
    first_clip = array([min(depth, initial_clip) for depth in region.vector])

    # Second clipping level: mean plus ten standard deviations of the already clipped signal
    raw_mean = first_clip.mean()
    raw_std = first_clip.std()
    ceiling = raw_mean + (10 * raw_std)
    second_clip = [min(depth, ceiling) for depth in first_clip]

    # Cleavage bias correction
    corrected = self.bias_correction_dnase(second_clip, bias_table, genome_file_name, ref, start, end,
                                           forward_shift, reverse_shift)

    # Within-dataset (Boyle) normalization
    boyle_signal = array(self.boyle_norm(corrected))

    # Between-dataset (Hon) normalization, driven by a percentile and the standard deviation
    percentile_value = scoreatpercentile(boyle_signal, per_norm)
    boyle_std = boyle_signal.std()
    hon_signal = self.hon_norm_dnase(boyle_signal, percentile_value, boyle_std)

    # Slope of the normalized signal, then hand both sequences back
    return hon_signal, self.slope(hon_signal, self.sg_coefs)
def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                 initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                 raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):
    """
    Appends the raw, bias-corrected and/or normalized signal of a region to wig files.

    Every track is appended as one "fixedStep" block with step 1. Only the tracks whose file name is
    given are computed and written.

    Keyword arguments:
    ref -- Chromosome name.
    start -- Initial genomic coordinate of signal.
    end -- Final genomic coordinate of signal.
    downstream_ext -- Downstream extension handed to PileupRegion (raw track only).
    upstream_ext -- Upstream extension handed to PileupRegion (raw track only).
    forward_shift -- Number of bps to shift the reads aligned to the forward strand.
    reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
    initial_clip -- The raw signal is capped at this value before being written.
    per_norm -- Accepted for interface compatibility; not read (the percentile is fixed at 98).
    per_slope -- Accepted for interface compatibility; not read by this method.
    bias_table -- Pair (forward k-mer bias dict, reverse k-mer bias dict). Required when bc_signal_file
                  or norm_signal_file is given.
    genome_file_name -- Genome fasta file. Required when bc_signal_file or norm_signal_file is given.
    raw_signal_file -- File the raw signal is appended to (skipped if empty/None).
    bc_signal_file -- File the bias-corrected signal is appended to (skipped if empty/None).
    norm_signal_file -- File the normalized bias-corrected signal is appended to (skipped if empty/None).
    strand_specific -- If True, per-strand tracks are also appended to "<prefix>_Forward.bc.wig" /
                       "<prefix>_Reverse.bc.wig" (bias-corrected) and "<prefix>_Forward.norm.wig" /
                       "<prefix>_Reverse.norm.wig" (normalized). The prefix is the text before the first
                       "." of bc_signal_file, or of norm_signal_file when bc_signal_file is not given.

    Return:
    None.
    """

    def _append_track(file_name, values):
        # Appends one fixedStep block for this region to file_name.
        # start + 1 is written -- presumably converting a 0-based coordinate to the wig convention.
        header = "fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n"
        with open(file_name, "a") as wig_file:
            wig_file.write(header + "\n".join([str(e) for e in nan_to_num(values)]) + "\n")

    def _normalize(values):
        # Boyle normalization followed by the ATAC Hon normalization (98th percentile, std).
        boyle_values = self.boyle_norm(values)
        perc = scoreatpercentile(boyle_values, 98)
        std = np.std(boyle_values)
        return self.hon_norm_atac(boyle_values, perc, std)

    if raw_signal_file:
        # When ps_version is "0.7.5" the region object is handed to fetch as a callback;
        # for any other version the fetched alignments are fed to it one by one.
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                pileup_region(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])
        _append_track(raw_signal_file, raw_signal)

    if not (bc_signal_file or norm_signal_file):
        return

    # Parameters
    window = 50
    half_window = window // 2  # floor division keeps coordinates integral on Python 2 and 3
    default_kmer_value = 1.0

    # Initialization
    fasta = Fastafile(genome_file_name)
    f_bias_dict = bias_table[0]
    r_bias_dict = bias_table[1]
    # The k-mer length is read from an arbitrary key of the forward table
    k_nb = len(next(iter(f_bias_dict)))
    half_k = k_nb // 2
    p1_w = start - half_window
    p2_w = end + half_window
    p1_wk = p1_w - half_k
    p2_wk = p2_w + half_k

    curr_str = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
    curr_rev_comp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create the bias signal; k-mers missing from a table get the default value
    signal_bias_f = []
    signal_bias_r = []
    seq_len = len(curr_str)
    for i in range(half_k, seq_len - half_k + 1):
        fseq = curr_str[i - half_k:i + half_k]
        rseq = curr_rev_comp[seq_len - half_k - i:seq_len + half_k - i]
        try:
            signal_bias_f.append(f_bias_dict[fseq])
        except KeyError:
            signal_bias_f.append(default_kmer_value)
        try:
            signal_bias_r.append(r_bias_dict[rseq])
        except KeyError:
            signal_bias_r.append(default_kmer_value)

    # Raw counts: one count per read at its shifted position, kept separately per strand
    signal_raw_f = [0.0] * (p2_w - p1_w)
    signal_raw_r = [0.0] * (p2_w - p1_w)
    for read in self.bam.fetch(ref, p1_w, p2_w):
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                signal_raw_f[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                signal_raw_r[cut_site - p1_w] += 1.0

    # Smoothed counts: running sums over a sliding window, updated incrementally
    Nf = []
    Nr = []
    f_sum = sum(signal_raw_f[:window])
    r_sum = sum(signal_raw_r[:window])
    f_last = signal_raw_f[0]
    r_last = signal_raw_r[0]
    for i in range(half_window, len(signal_raw_f) - half_window):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += signal_raw_f[i + half_window]
        f_last = signal_raw_f[i - half_window + 1]
        r_sum -= r_last
        r_sum += signal_raw_r[i + half_window]
        r_last = signal_raw_r[i - half_window + 1]

    # Bias-corrected signal: smoothed count scaled by the share of the position's bias in its window
    f_sum = sum(signal_bias_f[:window])
    r_sum = sum(signal_bias_r[:window])
    f_last = signal_bias_f[0]
    r_last = signal_bias_r[0]
    signal_bc = []
    signal_bc_f = []
    signal_bc_r = []
    for i in range(half_window, len(signal_bias_f) - half_window):
        nhatf = Nf[i - half_window] * (signal_bias_f[i] / f_sum)
        nhatr = Nr[i - half_window] * (signal_bias_r[i] / r_sum)
        signal_bc.append(nhatf + nhatr)
        signal_bc_f.append(nhatf)
        signal_bc_r.append(nhatr)
        f_sum -= f_last
        f_sum += signal_bias_f[i + half_window]
        f_last = signal_bias_f[i - half_window + 1]
        r_sum -= r_last
        r_sum += signal_bias_r[i + half_window]
        r_last = signal_bias_r[i - half_window + 1]

    # Strand-specific file names share one prefix. bc_signal_file is preferred so existing outputs keep
    # their names; norm_signal_file is the fallback so a norm-only call no longer fails on None.
    prefix = (bc_signal_file or norm_signal_file).split(".")[0]

    if bc_signal_file:
        _append_track(bc_signal_file, signal_bc)
        if strand_specific:
            _append_track(prefix + "_Forward" + ".bc.wig", signal_bc_f)
            _append_track(prefix + "_Reverse" + ".bc.wig", signal_bc_r)

    if norm_signal_file:
        _append_track(norm_signal_file, _normalize(signal_bc))
        if strand_specific:
            # Normalize both strands before writing either file
            signal_norm_f = _normalize(signal_bc_f)
            signal_norm_r = _normalize(signal_bc_r)
            _append_track(prefix + "_Forward" + ".norm.wig", signal_norm_f)
            _append_track(prefix + "_Reverse" + ".norm.wig", signal_norm_r)