def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name, chrName, start, end):
    """Compute a cleavage-bias-corrected signal for one genomic window.

    For each position, observed forward/reverse 5'-end read counts are
    compared against counts expected from the k-mer bias tables and a
    smoothed local coverage; a per-position log-ratio score is returned.

    NOTE(review): the `signal` parameter is never used in this function.
    NOTE(review): Python 2 only — relies on subscriptable `dict.keys()`
    and integer `/` division.

    Args:
        bam: open pysam alignment file to fetch reads from.
        signal: unused (see note above).
        fBiasDict, rBiasDict: k-mer -> bias tables for forward/reverse
            strands; all keys are assumed to share the same length k.
        genome_file_name: FASTA file to fetch genomic sequence from.
        chrName, start, end: genomic window to correct.

    Returns:
        List of per-position corrected scores for the window.
    """
    # Parameters
    window = 50              # smoothing window size (bp)
    defaultKmerValue = 1.0   # neutral bias for k-mers missing from the tables

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    k_nb = len(fBiasDict.keys()[0])   # k-mer length, taken from any table key
    p1 = start; p2 = end
    # widen by half a smoothing window, then by half a k-mer of sequence context
    p1_w = p1 - (window/2); p2_w = p2 + (window/2)
    p1_wk = p1_w - (k_nb/2); p2_wk = p2_w + (k_nb/2)

    # Raw counts: strand-specific 5' cut sites inside the widened window
    nf = [0.0] * (p2_w-p1_w); nr = [0.0] * (p2_w-p1_w)
    for r in bam.fetch(chrName, p1_w, p2_w):
        if((not r.is_reverse) and (r.pos > p1_w)): nf[r.pos-p1_w] += 1.0
        if((r.is_reverse) and ((r.aend-1) < p2_w)): nr[r.aend-1-p1_w] += 1.0

    # Smoothed counts: running window sums over nf/nr
    Nf = []; Nr = []; fSum = sum(nf[:window]); rSum = sum(nr[:window]); fLast = nf[0]; rLast = nr[0]
    for i in range((window/2),len(nf)-(window/2)):
        Nf.append(fSum)
        Nr.append(rSum)
        # slide the window one position to the right
        fSum -= fLast; fSum += nf[i+(window/2)]; fLast = nf[i-(window/2)+1]
        rSum -= rLast; rSum += nr[i+(window/2)]; rLast = nr[i-(window/2)+1]

    # Fetching sequence: forward strand, and the (offset) reverse complement
    currStr = str(fastaFile.fetch(chrName, p1_wk-1, p2_wk-2)).upper()
    currRevComp = revcomp(str(fastaFile.fetch(chrName,p1_wk+2, p2_wk+1)).upper())

    # Iterating on sequence to create signal: per-position k-mer bias values
    af = []; ar = []
    for i in range((k_nb/2),len(currStr)-(k_nb/2)+1):
        fseq = currStr[i-(k_nb/2):i+(k_nb/2)]
        rseq = currRevComp[len(currStr)-(k_nb/2)-i:len(currStr)+(k_nb/2)-i]
        # k-mers absent from a table (e.g. containing N) get the neutral default
        try: af.append(fBiasDict[fseq])
        except Exception: af.append(defaultKmerValue)
        try: ar.append(rBiasDict[rseq])
        except Exception: ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file
    fSum = sum(af[:window]); rSum = sum(ar[:window]); fLast = af[0]; rLast = ar[0]
    bias_corrected_signal = []
    for i in range((window/2),len(af)-(window/2)):
        # expected counts: smoothed coverage apportioned by relative k-mer bias
        nhatf = Nf[i-(window/2)]*(af[i]/fSum)
        nhatr = Nr[i-(window/2)]*(ar[i]/rSum)
        # log-ratio of observed vs expected counts (+1 pseudocount)
        zf = log(nf[i]+1)-log(nhatf+1)
        zr = log(nr[i]+1)-log(nhatr+1)
        bias_corrected_signal.append(zf+zr)
        fSum -= fLast; fSum += af[i+(window/2)]; fLast = af[i-(window/2)+1]
        rSum -= rLast; rSum += ar[i+(window/2)]; rLast = ar[i-(window/2)+1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal
def test_should_return_capitalised_sequence_from_ref_file(self):
    """A lower-case reference slice comes back capitalised after .upper()."""
    ref = self.__build_fasta_file({
        'chr20': "tagcattattattattattattatta",
    })
    reader = Fastafile(ref.filename)
    observed = reader.fetch('chr20', 10, 20).upper()
    self.assertEqual(observed, "ATTATTATTA")
def test_should_be_able_to_fetch_section_of_genome(self):
    """fetch(chrom, start, end) returns exactly the requested 10 bp slice."""
    built = self.__build_fasta_file({
        'chr20': "TAGCATTATTATTATTATTATTATTA",
    })
    genome = Fastafile(built.filename)
    section = genome.fetch('chr20', 10, 20)
    self.assertEqual(section.upper(), "ATTATTATTA")
class SeqDataset(Dataset):
    """
    Dataset yielding upper-cased genomic sequences for BED intervals.

    Args:
        intervals_file: bed3 file containing intervals
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in the csv format
    """

    def __init__(self, intervals_file, fasta_file, use_linecache=True):
        # intervals; BedToolLinecache vs BedTool chooses the line-access backend
        if use_linecache:
            self.bt = BedToolLinecache(intervals_file)
        else:
            self.bt = BedTool(intervals_file)
        self.fasta_file = fasta_file
        # FASTA handle is opened lazily in __getitem__ (NOTE(review): presumably
        # so each DataLoader worker opens its own handle — confirm)
        self.fasta = None

    def __len__(self):
        # one item per BED interval
        return len(self.bt)

    def __getitem__(self, idx):
        # lazy open on first access
        if self.fasta is None:
            self.fasta = Fastafile(self.fasta_file)
        interval = self.bt[idx]

        # Intervals can't be bigger than 1000bp
        if (interval.stop - interval.start) > 1000:
            raise Exception("Input sequences should be at maximum 1000bp.")

        # Fetch the fasta line
        seq = self.fasta.fetch(str(interval.chrom), interval.start, interval.stop).upper()

        # Reverse complement input string if requested by the interval strand
        if interval.strand == "-":
            seq = rc_str(seq)

        """
        # generate an id
        id = str(interval.chrom) + ":" + str(interval.start) + "-" + str(interval.stop)
        if interval.name not in ["", ".", "*"]:
            id = interval.name
        """

        # dict with the raw sequence plus range metadata for downstream use
        return {
            "inputs": seq,
            "metadata": {
                "ranges": GenomicRanges.from_interval(interval)
            }
        }
def main():
    """Print chrom/start/end/strand and strand-corrected sequence for each BED interval.

    Python 2 only (print statement, string.maketrans on str).
    """
    # setup a reverse_complement translation
    rev_table=string.maketrans('ACGTacgt', 'TGCAtgca')
    def revcomp(seq, rev_table):
        # translate each base to its complement; reversal is not performed here
        return seq.translate(rev_table)

    # open your fasta file
    fasta = Fastafile("bedtools/tests/data/chr21.fa")
    # open your bed file
    bed = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    # for each bed, grab the DNA in that interval
    for b in bed:
        # grab the seq, rev. comp if necessary
        seq = fasta.fetch(b.chrom, b.start, b.end)
        if b.strand == "-":
            seq = revcomp(seq, rev_table)
        # print the interval and the seq
        print b.chrom, b.start, b.end, b.strand, seq
def removeHomopolymers(self, variants, outFile, distance):
    """Drop variants that lie within homopolymer stretches of the reference.

    A variant is removed from `variants.variantDict` when the reference
    sequence within `distance` bp around its position contains a run of
    `distance` copies of the variant's reference allele.

    Args:
        variants: object whose `variantDict` maps (chr, position, ref, alt)
            keys to variant records; filtered in place.
        outFile: output prefix used to name the temporary BED file.
        distance: homopolymer run length to screen for.
    """
    startTime = Helper.getTime()
    Helper.info(
        " [%s] remove Missmatches from homopolymers " % (startTime.strftime("%c")),
        self.rnaEdit.logFile, self.rnaEdit.textField)

    # NOTE(review): this temp file is created but never written to in this
    # method; creation is kept for backward compatibility, but the handle is
    # now closed instead of being leaked.
    tempBedFile = open(outFile + "_tmp.bed", "w+")
    tempBedFile.close()

    # Always use the configured reference genome (a previously dead local
    # with a hard-coded absolute path has been removed).
    fastaFile = Fastafile(self.rnaEdit.params.refGenome)

    mmNumberTotal = len(variants.variantDict)

    numberPassed = 0
    # iterate over a snapshot of the keys so entries can be deleted safely
    for key in list(variants.variantDict.keys()):
        chr, position, ref, alt = key
        # clamp the window start at the beginning of the contig
        startPos = position - distance if position >= distance else 0
        endpos = position + distance
        sequence = fastaFile.fetch(chr, startPos, endpos)
        pattern = ref * distance
        """
        !!!Test if this gives better results
        !!!ONLY DELETE IF MM IS AT THE END OF A HOMOPOLYMER NUKLEOTIDES
        if sequence.startswith(pattern):
            del mmDict[site]
        elif sequence.endswith(pattern):
            del mmDict[site]
        """
        if pattern in sequence:
            try:
                del variants.variantDict[key]
            except KeyError:
                pass
        else:
            numberPassed += 1

    # output statistics
    Helper.info(
        "\t\t %d out of %d passed the Homopolymer-Filter" % (numberPassed, mmNumberTotal),
        self.rnaEdit.logFile, self.rnaEdit.textField)
    Helper.printTimeDiff(startTime, self.rnaEdit.logFile, self.rnaEdit.textField)
def main():
    """Print chrom/start/end/strand and strand-corrected sequence for each BED interval.

    Python 2 only (print statement, string.maketrans on str).
    """
    # setup a reverse_complement translation
    rev_table = string.maketrans('ACGTacgt', 'TGCAtgca')
    def revcomp(seq, rev_table):
        # translate each base to its complement; reversal is not performed here
        return seq.translate(rev_table)

    # open your fasta file
    fasta = Fastafile("bedtools/tests/data/chr21.fa")
    # open your bed file
    bed = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    # for each bed, grab the DNA in that interval
    for b in bed:
        # grab the seq, rev. comp if necessary
        seq = fasta.fetch(b.chrom, b.start, b.end)
        if b.strand == "-":
            seq = revcomp(seq, rev_table)
        # print the interval and the seq
        print b.chrom, b.start, b.end, b.strand, seq
# Evaluating Overall TC regionTagCountVec = ["0", "0", "0"] try: for i in range(0, len(tcHalfWindowVec)): tcHalfWindow = tcHalfWindowVec[i] regionTagCountVec[i] = tag_count(chrName, p1, p2, dnaseBam, tcHalfWindow) except Exception: print "Exception TC raised in " + line writeOutput(ll, regionTagCountVec, resVec, outFile) continue # Fetching sequence try: sequence = str(genomeFile.fetch(chrName, p1, p2)) except Exception: print "Exception SEQUENCE raised in " + line writeOutput(ll, regionTagCountVec, resVec, outFile) continue # Performing motif matching for i in range(0, len(motifList)): m = motifList[i] for res in search(sequence, [m.pssm_list], [m.min], absolute_threshold=True, both_strands=True): for (position, score) in res: if (score > resVec[i][0]): resVec[i][0] = score
def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name, chrName, start, end):
    """Return a per-position, bias-corrected cut-count signal for the window.

    Observed forward/reverse 5'-end counts are compared to expected counts
    derived from the k-mer bias tables and a smoothed local coverage; a
    log-ratio score is produced per position.

    NOTE(review): the `signal` parameter is unused in this function.
    NOTE(review): Python 2 semantics are assumed (integer `/` division,
    subscriptable `dict.keys()`).
    """
    # Parameters
    window = 50              # smoothing window size (bp)
    defaultKmerValue = 1.0   # neutral bias for k-mers missing from the tables

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    k_nb = len(fBiasDict.keys()[0])   # k-mer length, taken from any table key
    p1 = start
    p2 = end
    # widen by half a smoothing window, then by half a k-mer of sequence context
    p1_w = p1 - (window / 2)
    p2_w = p2 + (window / 2)
    p1_wk = p1_w - (k_nb / 2)
    p2_wk = p2_w + (k_nb / 2)

    # Raw counts: strand-specific 5' cut sites inside the widened window
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for r in bam.fetch(chrName, p1_w, p2_w):
        if ((not r.is_reverse) and (r.pos > p1_w)):
            nf[r.pos - p1_w] += 1.0
        if ((r.is_reverse) and ((r.aend - 1) < p2_w)):
            nr[r.aend - 1 - p1_w] += 1.0

    # Smoothed counts: running sums of nf/nr over the smoothing window
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range((window / 2), len(nf) - (window / 2)):
        Nf.append(fSum)
        Nr.append(rSum)
        # slide the window one position to the right
        fSum -= fLast
        fSum += nf[i + (window / 2)]
        fLast = nf[i - (window / 2) + 1]
        rSum -= rLast
        rSum += nr[i + (window / 2)]
        rLast = nr[i - (window / 2) + 1]

    # Fetching sequence: forward strand, and the (offset) reverse complement
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = revcomp(
        str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Iterating on sequence to create signal: per-position k-mer bias values
    af = []
    ar = []
    for i in range((k_nb / 2), len(currStr) - (k_nb / 2) + 1):
        fseq = currStr[i - (k_nb / 2):i + (k_nb / 2)]
        rseq = currRevComp[len(currStr) - (k_nb / 2) - i:len(currStr) + (k_nb / 2) - i]
        # k-mers absent from a table (e.g. containing N) get the neutral default
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    for i in range((window / 2), len(af) - (window / 2)):
        # expected counts: smoothed coverage apportioned by relative k-mer bias
        nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
        nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
        # log-ratio of observed vs expected counts (+1 pseudocount)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + (window / 2)]
        fLast = af[i - (window / 2) + 1]
        rSum -= rLast
        rSum += ar[i + (window / 2)]
        rLast = ar[i - (window / 2) + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    """Return an expected-count (bias-corrected) signal for [start, end).

    Uses forward/reverse k-mer bias tables (bias_table[0], bias_table[1])
    plus smoothed local coverage to compute expected cut counts per
    position. Near chromosome starts (any widened coordinate <= 0) it
    falls back to returning raw shifted cut counts.

    NOTE(review): Python 2 semantics assumed (integer `/` on ints,
    subscriptable `dict.keys()`).
    """
    # Parameters
    window = 50              # smoothing window size (bp)
    defaultKmerValue = 1.0   # neutral bias for k-mers missing from the tables

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(fBiasDict.keys()[0])   # k-mer length, from any table key
    p1 = start
    p2 = end
    p1_w = p1 - (window / 2)
    p2_w = p2 + (window / 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Return raw counts (cannot widen the window past the contig start)
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
        return bc_signal

    # Raw counts: strand-specific shifted cut sites in the widened window
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running window sums over nf/nr
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range((window / 2), len(nf) - (window / 2)):
        Nf.append(f_sum)
        Nr.append(r_sum)
        # slide the window one position to the right
        f_sum -= f_last
        f_sum += nf[i + (window / 2)]
        f_last = nf[i - (window / 2) + 1]
        r_sum -= r_last
        r_sum += nr[i + (window / 2)]
        r_last = nr[i - (window / 2) + 1]

    # Fetching sequence: forward strand and the (offset) reverse complement
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create signal: per-position k-mer bias values
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        # k-mers absent from a table (e.g. containing N) get the neutral default
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range((window / 2), len(af) - (window / 2)):
        # expected counts: smoothed coverage apportioned by relative k-mer bias
        nhatf = Nf[i - (window / 2)] * (af[i] / f_sum)
        nhatr = Nr[i - (window / 2)] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + (window / 2)]
        f_last = af[i - (window / 2) + 1]
        r_sum -= r_last
        r_sum += ar[i + (window / 2)]
        r_last = ar[i - (window / 2) + 1]

    # Termination
    fastaFile.close()
    return bc_signal
# Build inner/outer primer-design windows around the 3'-most coordinate of
# the feature (variables `c`, `chrom`, `strand`, `ref`, `rc`, `makeprimers`
# come from the enclosing scope, which is not visible here).
if strand == '+':
    # anchor at the larger of the two coordinates in record `c`
    pos = max(int(c[1]), int(c[2]))
    iname = 'c' + chrom + 'p' + str(pos) + 'IN'
    oname = 'c' + chrom + 'p' + str(pos) + 'OUT'
    # outer window: +/- 600 bp; inner window: +/- 450 bp, clamped at 0
    outerstart = pos - 600
    outerend = pos + 600
    innerstart = pos - 450
    innerend = pos + 450
    if outerstart < 0:
        outerstart = 0
    if innerstart < 0:
        innerstart = 0
    outerseq = ref.fetch(chrom, outerstart, outerend)
    innerseq = ref.fetch(chrom, innerstart, innerend)
    # NOTE(review): this branch is unreachable — we are already inside
    # `strand == '+'`, so `strand == 'plus'` can never be true here. The
    # reverse-complement step is therefore dead code; possibly it was meant
    # for the minus-strand case. Confirm intent before changing.
    if strand == 'plus':
        outerseq = rc(outerseq)
        innerseq = rc(innerseq)
    # target product sizes: 1000-1200 bp (outer), 700-900 bp (inner)
    outerprimers = makeprimers(oname, outerseq, 1000, 1200)
    innerprimers = makeprimers(iname, innerseq, 700, 900)
    leftouterprimer = ''
    rightouterprimer = ''
    leftinnerprimer = ''
    rightinnerprimer = ''
    primertext = ''
class ReferenceGenome(object):
    """
    Class to read sequence data from a reference genome

    Attributes:
        genome_fasta_file (string): Path to reference genome file
        fasta_file (pysam.Fastafile): Fastafile object for reference genome
    """

    def __init__(self, genome_fasta_file, logger):
        """
        Create new ReferenceGenome

        Args:
            genome_fasta_file (string): Path to whole genome FASTA file
            logger (logging.Logger): Logger for reporting warnings/errors

        Raises:
            ValueError: If the FASTA index (.fai) file is missing
            IOError: If the genome file cannot be opened
        """
        self._logger = logger
        self.genome_fasta_file = genome_fasta_file
        self._validate_reference_file()
        try:
            self._fasta_file = Fastafile(self.genome_fasta_file)
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed and re-raised as IOError
            raise IOError("Could not read genome file: " +
                          self.genome_fasta_file)

    def _validate_reference_file(self):
        """
        Check whether the reference file has an accompanying FASTA index

        Raises:
            ValueError: If the .fai index file does not exist
        """
        if not os.path.isfile(self.genome_fasta_file + ".fai"):
            raise ValueError(
                "Supplied genome FASTA file does not have FASTA index")

    def get_contig_lengths(self):
        """
        Get the names and lengths of all contigs in the reference

        Returns:
            Pairs of (contig_name, contig_length) for every contig, as
            produced by zip() over the Fastafile references and lengths
        """
        return zip(self._fasta_file.references, self._fasta_file.lengths)

    def get_reference_bases(self, chrom, start, end):
        """
        Get the reference bases from start to end

        Args:
            chrom (string): Chromosome to query
            start (int): Start position to query
            end (int): End position (not inclusive)

        Returns:
            string: The genome sequence

        Raises:
            ValueError: If start >= end or the chromosome is missing
        """
        if start >= end:
            raise ValueError("Start/stop coordinates incorrect for: " +
                             str(chrom) + ":" + str(start) + "-" + str(end))
        if chrom not in self._fasta_file:
            raise ValueError(
                "FASTA reference is missing entry for chromosome " +
                str(chrom))
        return self._fasta_file.fetch(str(chrom), start, end)
def estimate_bias_pwm(args):
    """Estimate strand-specific k-mer cleavage-bias tables via PWMs.

    Counts observed k-mers around read cut sites and expected k-mers from
    the underlying genomic sequence of each region, writes position
    frequency matrices (pfm) and weblogo PDFs, then derives per-k-mer
    observed/expected ratio tables and writes them via write_table().
    """
    # Parameters
    max_duplicates = 100   # cap on reads stacked at the same cut position

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # base -> per-position count vectors (length k_nb) for observed/expected,
    # forward/reverse strands
    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions: k-mer window centered on the shifted cut site
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts: skip once more than max_duplicates
            # reads share the same start position
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates:
                continue

            # Fetching k-mer (skip coordinates outside the contig)
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        # NOTE(review): the inner loops reuse the name `i`, shadowing the
        # outer index; the outer loop still advances correctly (Python
        # rebinds from the iterator) but this is easy to misread.
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for i in range(0, len(s)):
                exp_f_pwm_dict[s[i]][i] += 1
            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for i in range(0, len(s)):
                exp_r_pwm_dict[s[i]][i] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms (as integer pfm files, one row per base A/C/G/T)
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))
    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
        with open(pwm_file_list[i], "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n")

    # Re-read the pfm files as Biopython motifs
    motif_obs_f = motifs.read(open(pwm_obs_f), "pfm")
    motif_obs_r = motifs.read(open(pwm_obs_r), "pfm")
    motif_exp_f = motifs.read(open(pwm_exp_f), "pfm")
    motif_exp_r = motifs.read(open(pwm_exp_r), "pfm")

    # Output logos
    os.system("mkdir -p " + os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))
    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary: observed/expected PPM score ratio per k-mer
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def main(args):
    """
    Performs motif matching.

    Loads input regions (experimental matrix, plain files, or promoter
    sets derived from gene lists), optionally generates random regions,
    builds the motif scanner, and writes one *_mpbs.bed (optionally
    BigBed) file of motif-predicted binding sites per region set.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)
    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)
            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict
            print(">>> experimental matrix loaded")
        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            name, _ = os.path.splitext(os.path.basename(input_filename))
            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))
            genomic_regions_dict[name] = regions
            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                   protein_coding=True, known_only=True)

        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)

        # TODO: what do we do with unmapped genes? maybe just print them out
        target_regions = annotation.get_promoters(gene_set=target_genes,
                                                  promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)

        genomic_regions_dict[target_regions.name] = target_regions
        print(">>> target promoter file created:", len(target_regions), "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                       protein_coding=True, known_only=True)

        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)

        if target_genes:
            background_genes.subtract(target_genes)

        background_regions = annotation.get_promoters(gene_set=background_genes,
                                                      promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)

        genomic_regions_dict[background_regions.name] = background_regions
        print(">>> background promoter file created:", len(background_regions), "regions")

    if not genomic_regions_dict:
        err.throw_error("DEFAULT_ERROR",
                        add_msg="You must either specify an experimental matrix, or at least a "
                                "valid input file, or one of the 'promoter test' options.")

    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for k in genomic_regions_dict.keys():

        curr_genomic_region = genomic_regions_dict[k]

        # If the object is a GenomicRegionSet
        if isinstance(curr_genomic_region, GenomicRegionSet):

            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:

        # Create random coordinates and name it random_regions
        # (based on the largest loaded region set)
        rand_region = max_region.random_regions(args.organism, multiply_factor=args.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)

                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning("DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)

        # filter for dbs only if --motif_dbs is not set
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):", ",".join([str(db) for db in ms.motif_data.repositories_list]))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)
    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested
    if args.norm_threshold:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    scanner = scan.Scanner(7)
    pssm_list = []
    thresholds = []
    # two entries per motif: forward PSSM and its reverse complement
    for motif in motif_list:
        if unique_threshold:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:

        start = time.time()
        print(">> matching [", grs.name, "], ", len(grs), " regions... ", sep="", end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:

            # Reading sequence associated to genomic_region
            sequence = str(genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence, genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                grs_tmp.sort()
                seqs = grs_tmp.sequences
                seqs_new = []
                cur_pos = 0
                end_pos = len(seqs) - 1
                while cur_pos < end_pos:
                    gr = seqs[cur_pos]

                    new_pos = cur_pos + 1
                    while new_pos < end_pos:
                        gr2 = seqs[new_pos]

                        # if this sequence is unrelated, we move on
                        if gr.name != gr2.name or gr.chrom != gr2.chrom or gr.initial != gr2.initial or gr.final != gr2.final or gr.orientation == gr2.orientation:
                            break

                        # keep the higher-scoring of the two duplicates
                        if float(gr.data) < float(gr2.data):
                            gr = gr2

                        new_pos = new_pos + 1

                    # adding the currently-selected genomic region
                    seqs_new.append(gr)

                    # at the next loop, we start from the next right-handed sequences
                    cur_pos = new_pos

                # edge case: the last element was not considered
                # (when it is, cur_pos == end_pos+1)
                if cur_pos == end_pos:
                    seqs_new.append(seqs[cur_pos])

                grs_tmp.sequences = seqs_new

            grs_tmp.write(output_bed_file, mode="a")

        # free the region set's sequences once written
        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")
def estimate_table_pwm(self, regions, dnase_file_name, genome_file_name, k_nb, forward_shift, reverse_shift):
    """ Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name (must have a .bam extension).
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer size used for the bias estimation.
    forward_shift, reverse_shift -- cut-site shifts per strand.

    Return:
    bias_table_F, bias_table_R -- Bias tables (k-mer -> observed/expected
    PWM score ratio), or None if the input is not a BAM file.
    """
    # Initializing bam and fasta
    if (dnase_file_name.split(".")[-1].upper() != "BAM"):
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)

    # observed/expected k-mer sequences per strand, fed to motifs.create()
    obsSeqsF = []
    obsSeqsR = []
    expSeqsF = []
    expSeqsR = []

    # Iterating on HS regions
    for region in regions:

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions: k-mer window centered on the shifted cut site
            # if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
            # else: p1 = r.aend - (k_nb/2) + 1 - shift
            if (not r.is_reverse):
                cut_site = r.pos + forward_shift - 1
                p1 = cut_site - int(floor(k_nb / 2))
            else:
                cut_site = r.aend + reverse_shift + 1
                p1 = cut_site - int(floor(k_nb / 2))
            p2 = p1 + k_nb

            # Fetching k-mer (skip coordinates outside the contig)
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if (r.is_reverse):
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary (k-mers containing N are dropped)
            if 'N' not in currStr:
                if (not r.is_reverse):
                    obsSeqsF.append(Seq(currStr))
                else:
                    obsSeqsR.append(Seq(currStr))

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(
                fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - k_nb):
            s = currStr[i:i + k_nb]
            # NOTE(review): this guard tests the WHOLE region sequence, not
            # the k-mer `s` — a single N anywhere in the region skips every
            # expected k-mer of that region. Possibly intended to be
            # `'N' not in s`; confirm before changing.
            if 'N' not in currStr:
                # Counting k-mer in dictionary
                expSeqsF.append(Seq(s))
                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                expSeqsR.append(Seq(s))

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Build Biopython motifs and take their position weight matrices
    obsMotifsF = motifs.create(obsSeqsF)
    obsMotifsR = motifs.create(obsSeqsR)
    expMotifsF = motifs.create(expSeqsF)
    expMotifsR = motifs.create(expSeqsR)

    obsPwmF = obsMotifsF.pwm
    obsPwmR = obsMotifsR.pwm
    expPwmF = expMotifsF.pwm
    expPwmR = expMotifsR.pwm

    # Output logos
    logo_obs_f = os.path.join(
        self.output_loc, "Bias", "logo",
        "obs_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
    logo_obs_r = os.path.join(
        self.output_loc, "Bias", "logo",
        "obs_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
    logo_exp_f = os.path.join(
        self.output_loc, "Bias", "logo",
        "exp_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
    logo_exp_r = os.path.join(
        self.output_loc, "Bias", "logo",
        "exp_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
    obsMotifsF.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                       yaxis_scale=0.2, yaxis_tic_interval=0.1)
    obsMotifsR.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                       yaxis_scale=0.2, yaxis_tic_interval=0.1)
    expMotifsF.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                       yaxis_scale=0.02, yaxis_tic_interval=0.01)
    expMotifsR.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                       yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Output pwms
    pwm_data_list = [obsPwmF, obsPwmR, expPwmF, expPwmR]
    pwm_file_list = []
    pwm_obs_f = os.path.join(
        self.output_loc, "Bias", "pwm",
        "obs_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
    pwm_obs_r = os.path.join(
        self.output_loc, "Bias", "pwm",
        "obs_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))
    pwm_exp_f = os.path.join(
        self.output_loc, "Bias", "pwm",
        "exp_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
    pwm_exp_r = os.path.join(
        self.output_loc, "Bias", "pwm",
        "exp_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))
    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)
    for i in range(len(pwm_data_list)):
        with open(pwm_file_list[i], "w") as f:
            f.write(str(pwm_data_list[i]))

    # Creating bias dictionary: observed/expected PWM score ratio per k-mer
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obsF = self.get_pwm_score(k_mer, obsPwmF, k_nb)
        expF = self.get_pwm_score(k_mer, expPwmF, k_nb)
        bias_table_F[k_mer] = round(obsF / expF, 6)
        obsR = self.get_pwm_score(k_mer, obsPwmR, k_nb)
        expR = self.get_pwm_score(k_mer, expPwmR, k_nb)
        bias_table_R[k_mer] = round(obsR / expR, 6)

    # Return
    return [bias_table_F, bias_table_R]
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, forward_shift, reverse_shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

    For each region, counts the k-mer observed around every read's shifted cut
    site ("observed") and every k-mer of the region sequence on both strands
    ("expected"), then builds per-strand observed/expected frequency-ratio tables.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name.
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer length used for the bias estimate.
    forward_shift -- Offset applied to forward-strand 5' read ends.
    reverse_shift -- Offset applied to reverse-strand 3' read ends.

    Return:
    bias_table_F, bias_table_R -- Bias tables (k-mer -> rounded ratio).
    """
    # Parameters
    maxDuplicates = 100  # reads sharing a start position beyond this are treated as PCR artifacts
    pseudocount = 1.0    # Laplace smoothing so unseen k-mers do not zero out the ratio

    # Initializing bam and fasta
    if (dnase_file_name.split(".")[-1].upper() != "BAM"):
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()
    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions: k-mer window centered on the shifted cut site
            if (not r.is_reverse):
                cut_site = r.pos + forward_shift - 1
                p1 = cut_site - int(floor(k_nb / 2))
            else:
                cut_site = r.aend + reverse_shift + 1
                p1 = cut_site - int(floor(k_nb / 2))
            p2 = p1 + k_nb

            # Verifying PCR artifacts
            if (p1 == prevPos):
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if (trueCounter > maxDuplicates):
                continue

            # Fetching k-mer (skip reads whose window cannot be fetched)
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if (r.is_reverse):
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if (not r.is_reverse):
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1
            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary: per-k-mer (observed freq) / (expected freq)
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        try:
            obsF = obsDictF[kmer] + pseudocount
        except Exception:
            obsF = pseudocount
        try:
            expF = expDictF[kmer] + pseudocount
        except Exception:
            expF = pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1  # no forward reads at all: neutral bias
        else:
            bias_table_F[kmer] = round(
                float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        try:
            obsR = obsDictR[kmer] + pseudocount
        except Exception:
            obsR = pseudocount
        try:
            expR = expDictR[kmer] + pseudocount
        except Exception:
            expR = pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1  # no reverse reads at all: neutral bias
        else:
            bias_table_R[kmer] = round(
                float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    # Return
    return [bias_table_F, bias_table_R]
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name.
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer length used for the bias estimate.
    shift -- Offset applied to the 5' read end before k-mer extraction.

    Return:
    bias_table_F, bias_table_R -- Bias tables (k-mer -> rounded
    observed/expected frequency ratio).
    """
    # Parameters
    maxDuplicates = 100  # reads sharing a start position beyond this are treated as PCR artifacts
    pseudocount = 1.0    # Laplace smoothing so unseen k-mers do not zero out the ratio

    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()
    ct_reads_f = 0
    ct_reads_r = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions. Integer division (//) keeps coordinates
            # integral under Python 3 as well (k_nb/2 would yield a float).
            if not r.is_reverse:
                p1 = r.pos - (k_nb // 2) - 1 + shift
            else:
                p1 = r.aend - (k_nb // 2) + 1 - shift
            p2 = p1 + k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates:
                continue

            # Fetching k-mer (skip reads whose window cannot be fetched)
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary.
            # BUGFIX: the per-strand read counters were swapped relative to
            # the dictionaries they normalize (forward reads incremented
            # ct_reads_r and vice versa), unlike the sibling implementations.
            if not r.is_reverse:
                ct_reads_f += 1
                obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
            else:
                ct_reads_r += 1
                obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + k_nb]
            expDictF[s] = expDictF.get(s, 0) + 1
            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + k_nb]
            expDictR[s] = expDictR.get(s, 0) + 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary: per-k-mer (observed freq) / (expected freq)
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        # ROBUSTNESS: guard against ZeroDivisionError when a strand saw no
        # reads (same fallback as the sibling estimate_table).
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(
                float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(
                float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    # Return
    return [bias_table_F, bias_table_R]
for i in range((window / 2), len(nf) - (window / 2)): Nf.append(fSum) Nr.append(rSum) fSum -= fLast fSum += nf[i + (window / 2)] fLast = nf[i - (window / 2) + 1] rSum -= rLast rSum += nr[i + (window / 2)] rLast = nr[i - (window / 2) + 1] #for i in range(p1, p2): # print i+1, Nf[i-p1], Nr[i-p1] # Fetching sequence # -1 and +2 corrections because He is wrong currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper() currRevComp = str( Seq(str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper()).reverse_complement()) #print currStr #print currRevComp # Iterating on sequence to create signal af = [] ar = [] for i in range((k_nb / 2), len(currStr) - (k_nb / 2) + 1): fseq = currStr[i - (k_nb / 2):i + (k_nb / 2)] rseq = currRevComp[len(currStr) - (k_nb / 2) - i:len(currStr) + (k_nb / 2) - i] #print fseq, rseq
with open(sys.argv[2], 'r') as ref: for line in ref: (bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames) = line.strip().split() exonStarts = map(int, exonStarts.split(',')[:-1]) exonEnds = map(int, exonEnds.split(',')[:-1]) assert len(exonEnds) == len(exonStarts) == int(exonCount) seq = '' for start, end in zip(exonStarts, exonEnds): if chrom in fa.references: seq += fa.fetch(chrom, start, end) if strand == '-': seq = rc(seq) if seq: genes[name2].append(seq) for name in genes: for i, tx in enumerate(genes[name]): print ">%s.%d\n%s" % (name, i, tx) else: sys.exit(usage())
def estimate_bias_kmer(args):
    """Estimate per-strand k-mer cleavage-bias tables from a BAM over HS regions.

    Counts the k-mer observed around each read's shifted cut site ("observed")
    and every k-mer of each region's sequence on both strands ("expected"),
    then writes the observed/expected ratio tables via write_table().

    Keyword arguments:
    args -- parsed CLI namespace; uses reads_file, organism, regions_file,
            k_nb, forward_shift, reverse_shift, output_location, output_prefix.
    """
    # Parameters
    maxDuplicates = 100  # reads sharing a start position beyond this are treated as PCR artifacts
    pseudocount = 1.0    # Laplace smoothing so unseen k-mers do not zero out the ratio

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()
    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions: k-mer window centered on the shifted cut site
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates:
                continue

            # Fetching k-mer (skip reads whose window cannot be fetched)
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1
            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary: per-k-mer (observed freq) / (expected freq)
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        try:
            obsF = obsDictF[kmer] + pseudocount
        except Exception:
            obsF = pseudocount
        try:
            expF = expDictF[kmer] + pseudocount
        except Exception:
            expF = pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1  # no forward reads at all: neutral bias
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        try:
            obsR = obsDictR[kmer] + pseudocount
        except Exception:
            obsR = pseudocount
        try:
            expR = expDictR[kmer] + pseudocount
        except Exception:
            expR = pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1  # no reverse reads at all: neutral bias
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def create_signal(args, regions):
    """Count observed and expected k-mer frequencies and dump non-zero counts.

    For every region, counts the k-mer around each read's shifted cut site
    ("observed") and every k-mer of the region sequence on both strands
    ("expected"), skipping any k-mer containing 'N'.  Writes the non-zero
    counts of each dictionary, tab-separated, to
    "<k_nb>_{f,r}_{obs,exp}.fa" files in args.output_location.

    Keyword arguments:
    args -- parsed CLI namespace; uses reads_file, organism, k_nb,
            forward_shift, reverse_shift, output_location.
    regions -- iterable of genomic regions (chrom/initial/final).
    """
    def revcomp(s):
        # Reverse-complement over the ACGTN alphabet.
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        return "".join([rev_dict[e] for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    for region in regions:
        # Fetching observed reads
        reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
        for read in reads:
            # k-mer window centered on the shifted 5' cut site
            if not read.is_reverse:
                p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
            else:
                p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
            p2 = p1 + args.k_nb
            try:
                dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if 'N' not in dna_sequence_obs:
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    r_obs_dict[dna_sequence_obs] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

        # Fetching whole sequence (expected background)
        try:
            dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        dna_sequence_exp_rev = revcomp(dna_sequence_exp)
        for i in range(0, len(dna_sequence_exp) - args.k_nb):
            s = dna_sequence_exp[i:i + args.k_nb]
            if "N" not in s:
                f_exp_dict[s] += 1
            s = dna_sequence_exp_rev[i:i + args.k_nb]
            if "N" not in s:
                r_exp_dict[s] += 1

    # FIX: close the alignment/fasta handles (they were leaked before).
    bam_file.close()
    fasta_file.close()

    # Writing the non-zero counts of each dictionary. All four dicts share
    # the same key set (kmer_comb), so iterating each dict's own keys is
    # equivalent to the original single-dict iteration, and the files are
    # now closed deterministically via context managers.
    outputs = [
        ("f_obs", f_obs_dict),
        ("f_exp", f_exp_dict),
        ("r_obs", r_obs_dict),
        ("r_exp", r_exp_dict),
    ]
    for suffix, counts in outputs:
        fname = os.path.join(args.output_location, "{}_{}.fa".format(str(args.k_nb), suffix))
        with open(fname, "w") as output_file:
            for kmer in counts.keys():
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")
# Smoothed counts Nf = []; Nr = []; fSum = sum(nf[:window]); rSum = sum(nr[:window]); fLast = nf[0]; rLast = nr[0] for i in range((window/2),len(nf)-(window/2)): Nf.append(fSum) Nr.append(rSum) fSum -= fLast; fSum += nf[i+(window/2)]; fLast = nf[i-(window/2)+1] rSum -= rLast; rSum += nr[i+(window/2)]; rLast = nr[i-(window/2)+1] #for i in range(p1, p2): # print i+1, Nf[i-p1], Nr[i-p1] # Fetching sequence currStr = str(fastaFile.fetch(chrName, p1_wk+4, p2_wk+3)).upper() currRevComp = "".join([revDict[e] for e in currStr]) #print currStr #print currRevComp # Iterating on sequence to create signal af = []; ar = [] for i in range((k_nb/2),len(currStr)-(k_nb/2)+1): fseq = currStr[i-(k_nb/2):i+(k_nb/2)] rseq = currRevComp[i-(k_nb/2):i+(k_nb/2)][::-1] #rseq = currRevComp[len(currStr)-(k_nb/2)-i:len(currStr)+(k_nb/2)-i] #print fseq, rseq try: af.append(fBiasDict[fseq]) except Exception: af.append(defaultKmerValue) try: ar.append(rBiasDict[rseq])
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local', logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow.

    Pileups each group's merged BAM against *assembly* per chromosome,
    merges the per-chromosome VCFs into SNP tables (all SNPs + exonic SNPs),
    writes an alignment of the calls, and produces UCSC tracks.
    Returns 0 on completion.

    NOTE(review): mincov is forwarded to all_snps(); path_to_ref is accepted
    but not referenced in this function body -- confirm it is still needed.
    """
    ref_genome = assembly.fasta_by_chrom  # {chrom: fasta file path}
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom, {}) for chrom in ref_genome.keys())  # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        # Rewrite sequence names in the header to the assembly's accession ids
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile(headerfile, "wh", header=header)
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex, runs)
            index_bam(ex, _b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom, ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            # Store (filename, future) until the non-blocking call finishes
            vcfs[chrom][gid] = (vcf, pileup.nonblocking(ex, bams[gid], ref, header=headerfile, via=via, stdout=vcf))
        logfile.write(" ...Group %s running.\n" % job.groups[gid]['name']); logfile.flush()

    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom, ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]  # keep only the filename
        logfile.write(" ...Group %s done.\n" % job.groups[gid]['name']); logfile.flush()

    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom, v in vcfs.iteritems():
        for gid, vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom))
    tarfh.close()
    ex.add(tarname, description=set_file_descr("vcfs_files.tar.gz", step="pileup", type="tar", view='admin'))

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    # Write the headers; all_snps()/exon_snps() append the data rows below.
    with open(outall, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                 + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s, '') for s in [assembly.name]+sample_names)
    for chrom, v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall, assembly,
                           sample_names, mincov, float(minsnp), logfile, debugfile)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome, logfile, debugfile)
        for snprow in allsnps:
            # Append the first character of each sample's call to its MSA row
            for n, k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt", step="SNPs", type="txt")
    ex.add(outall, description=description)
    description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt")
    ex.add(outexons, description=description)

    # Write a PHYLIP-style alignment of the per-sample SNP calls
    msafile = unique_filename_in()
    with open(msafile, "w") as msa:
        msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0])))
        for name, seq in msa_table.iteritems():
            msa.write("%s\t%s\n" % (name, seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt")
    ex.add(msafile, description=description)

    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex, outall, sample_names, assembly)

    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        # Build (coverage, heterozygosity, quality) score vectors for one chunk.
        # NOTE(review): quality is divided by coverage before the coverage>0
        # test -- assumes every pileup column has n >= 1; confirm with pysam.
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([], [], [])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)  # 4 = "not A/C/G/T"
            symbols = [0, 0, 0, 0, 0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                quality += ord(pileupread.alignment.qual[pileupread.qpos])-33  # Phred+33
            quality = float(quality)/coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0: vectors[0].append((position, position+1, coverage))
            if info > 0: vectors[1].append((position, position+1, info))
            if quality > 0: vectors[2].append((position, position+1, quality))
            # yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs', False):
        _descr = {'groupId': 0, 'step': "tracks", 'type': "bigWig", 'ucsc': '1'}
        for gid, bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile, format="bam")
            covname = unique_filename_in()+".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in()+".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in()+".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                # process fasta and bam by 10Mb chunks
                for chunk in range(0, cinfo["length"], 10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7)
                    out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name']+"_coverage.bw", **_descr)
            ex.add(covname, description=description)
            description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw", **_descr)
            ex.add(hetname, description=description)
            description = set_file_descr(job.groups[gid]['name']+"_quality.bw", **_descr)
            ex.add(qualname, description=description)
    return 0
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    """Compute a k-mer bias-corrected cut-site signal over [start, end).

    Counts shifted 5' cut sites per strand in a window-extended region,
    smooths them with a running sum of length `window`, and reweights each
    position by its k-mer bias (observed count scaled by the k-mer's share
    of the local bias mass).  Falls back to raw cut counts when the extended
    window would leave the contig on the left.

    Keyword arguments:
    chrom, start, end -- Target interval (0-based, end-exclusive).
    bam -- Open pysam alignment file.
    bias_table -- [forward_dict, reverse_dict] mapping k-mer -> bias value.
    genome_file_name -- FASTA to fetch sequence from.
    forward_shift, reverse_shift -- Cut-site offsets per strand.

    Return:
    bc_signal -- List of floats, one per position of [start, end).
    """
    # Parameters
    window = 50           # smoothing window (bp)
    defaultKmerValue = 1.0  # neutral bias for k-mers absent from the table

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(list(fBiasDict.keys())[0])  # k-mer length inferred from the table
    p1 = start
    p2 = end
    # Window-extended interval, then k-mer-extended for sequence fetching
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))

    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0 or p2_wk <= 0:
        # Extended window falls off the contig start: return raw counts instead
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
        return bc_signal

    # Raw counts per strand over the window-extended interval
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running sum of `window` positions around each base
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range(int(window / 2), len(nf) - int(window / 2)):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + int(window / 2)]
        f_last = nf[i - int(window / 2) + 1]
        r_sum -= r_last
        r_sum += nr[i + int(window / 2)]
        r_last = nr[i - int(window / 2) + 1]

    # Fetching sequence (forward strand, and shifted-by-one reverse complement)
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create the per-position bias signal
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file:
    # expected count = smoothed count * (bias / local running bias mass)
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(int(window / 2), len(af) - int(window / 2)):
        nhatf = Nf[i - int(window / 2)] * (af[i] / f_sum)
        nhatr = Nr[i - int(window / 2)] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + int(window / 2)]
        f_last = af[i - int(window / 2) + 1]
        r_sum -= r_last
        r_sum += ar[i + int(window / 2)]
        r_last = ar[i - int(window / 2) + 1]

    # Termination
    fastaFile.close()
    return bc_signal
def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                 initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                 raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):
    """Append raw / bias-corrected / normalized signal tracks as fixedStep wig.

    Writes whichever of raw_signal_file, bc_signal_file and norm_signal_file
    is given (appending a fixedStep block per call); with strand_specific,
    additional "<prefix>_Forward"/"<prefix>_Reverse" wig files are written.

    NOTE(review): per_norm and per_slope are accepted but the percentile is
    hardcoded to 98 below -- confirm whether they should be wired through.
    NOTE(review): Python-2-only constructs: fBiasDict.keys()[0] and true-division
    results used as list indices (window / 2); this method breaks on Python 3.
    NOTE(review): in the norm_signal_file branch, `prefix` is derived from
    bc_signal_file -- crashes if only norm_signal_file is given with
    strand_specific=True; confirm intended precedence.
    """
    if raw_signal_file:
        # Accumulate the raw pileup via callback (old pysam) or iteration.
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            iter = self.bam.fetch(reference=ref, start=start, end=end)
            for alignment in iter:
                pileup_region.__call__(alignment)
        # Clip extreme per-position counts at initial_clip
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])
        f = open(raw_signal_file, "a")
        f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
            [str(e) for e in nan_to_num(raw_signal)]) + "\n")
        f.close()

    if bc_signal_file or norm_signal_file:
        # Parameters
        window = 50             # smoothing window (bp)
        defaultKmerValue = 1.0  # neutral bias for k-mers absent from the table

        # Initialization
        fasta = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])  # k-mer length inferred from the table
        p1 = start
        p2 = end
        # Window-extended interval, then k-mer-extended for sequence fetching
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(k_nb / 2.)
        p2_wk = p2_w + int(k_nb / 2.)
        currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

        # Iterating on sequence to create the bias signal
        signal_bias_f = []
        signal_bias_r = []
        for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1):
            fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)]
            rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i]
            try:
                signal_bias_f.append(fBiasDict[fseq])
            except Exception:
                signal_bias_f.append(defaultKmerValue)
            try:
                signal_bias_r.append(rBiasDict[rseq])
            except Exception:
                signal_bias_r.append(defaultKmerValue)

        # Raw counts: shifted 5' cut sites per strand
        signal_raw_f = [0.0] * (p2_w - p1_w)
        signal_raw_r = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(ref, p1_w, p2_w):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    signal_raw_f[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    signal_raw_r[cut_site - p1_w] += 1.0

        # Smoothed counts: running sum of `window` positions
        Nf = []
        Nr = []
        fSum = sum(signal_raw_f[:window])
        rSum = sum(signal_raw_r[:window])
        fLast = signal_raw_f[0]
        rLast = signal_raw_r[0]
        for i in range((window / 2), len(signal_raw_f) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += signal_raw_f[i + (window / 2)]
            fLast = signal_raw_f[i - (window / 2) + 1]
            rSum -= rLast
            rSum += signal_raw_r[i + (window / 2)]
            rLast = signal_raw_r[i - (window / 2) + 1]

        # Calculating bias and writing to wig file:
        # expected count = smoothed count * (bias / local running bias mass)
        fSum = sum(signal_bias_f[:window])
        rSum = sum(signal_bias_r[:window])
        fLast = signal_bias_f[0]
        rLast = signal_bias_r[0]
        signal_bc = []
        signal_bc_f = []
        signal_bc_r = []
        for i in range((window / 2), len(signal_bias_f) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (signal_bias_f[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (signal_bias_r[i] / rSum)
            signal_bc.append(nhatf + nhatr)
            signal_bc_f.append(nhatf)
            signal_bc_r.append(nhatr)
            fSum -= fLast
            fSum += signal_bias_f[i + (window / 2)]
            fLast = signal_bias_f[i - (window / 2) + 1]
            rSum -= rLast
            rSum += signal_bias_r[i + (window / 2)]
            rLast = signal_bias_r[i - (window / 2) + 1]

        if bc_signal_file:
            f = open(bc_signal_file, "a")
            f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(signal_bc)]) + "\n")
            f.close()
            if strand_specific:
                prefix = bc_signal_file.split(".")[0]
                bc_signal_file_f = prefix + "_Forward" + ".bc.wig"
                bc_signal_file_r = prefix + "_Reverse" + ".bc.wig"
                f = open(bc_signal_file_f, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(signal_bc_f)]) + "\n")
                f.close()
                f = open(bc_signal_file_r, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(signal_bc_r)]) + "\n")
                f.close()

        if norm_signal_file:
            # Boyle within-region normalization followed by hon ATAC normalization
            norm_signal_bc = self.boyle_norm(signal_bc)
            perc = scoreatpercentile(norm_signal_bc, 98)
            std = np.std(norm_signal_bc)
            norm_signal_bc = self.hon_norm_atac(norm_signal_bc, perc, std)
            f = open(norm_signal_file, "a")
            f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(norm_signal_bc)]) + "\n")
            f.close()
            if strand_specific:
                # NOTE(review): uses bc_signal_file here, not norm_signal_file
                prefix = bc_signal_file.split(".")[0]
                norm_signal_file_f = prefix + "_Forward" + ".norm.wig"
                norm_signal_file_r = prefix + "_Reverse" + ".norm.wig"
                signal_norm_f = self.boyle_norm(signal_bc_f)
                perc = scoreatpercentile(signal_norm_f, 98)
                std = np.std(signal_norm_f)
                signal_norm_f = self.hon_norm_atac(signal_norm_f, perc, std)
                signal_norm_r = self.boyle_norm(signal_bc_r)
                perc = scoreatpercentile(signal_norm_r, 98)
                std = np.std(signal_norm_r)
                signal_norm_r = self.hon_norm_atac(signal_norm_r, perc, std)
                f = open(norm_signal_file_f, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(signal_norm_f)]) + "\n")
                f.close()
                f = open(norm_signal_file_r, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(signal_norm_r)]) + "\n")
                f.close()
def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                    forward_shift, reverse_shift, strands_specific):
    """
    Performs bias correction.

    Counts shifted cut sites per strand, smooths them over `window` bp,
    computes a k-mer expected count per position, and returns a log-ratio
    signal log(observed+1) - log(expected+1).

    Keyword arguments:
    signal -- Input signal (returned unchanged when no bias table is given
              or the extended window falls off the contig start).
    bias_table -- [forward_dict, reverse_dict] mapping k-mer -> bias value.

    Return:
    bias_corrected_signal -- Bias-corrected sequence (combined), or the
    min-shifted per-strand pair when strands_specific is true.

    NOTE(review): bias_fixed_signal (combined, min-shifted) is computed but
    never returned -- confirm whether the combined return should use it.
    NOTE(review): cut sites are filtered to [start, end) although nf/nr span
    the window-extended [p1_w, p2_w); the flanking window positions therefore
    stay zero -- confirm this is intended.
    NOTE(review): Python-2-only constructs (fBiasDict.keys()[0], `/` results
    used as list indices) -- this method breaks on Python 3.
    """
    if (not bias_table):
        return signal
    # Parameters
    window = 50             # smoothing window (bp)
    defaultKmerValue = 1.0  # neutral bias for k-mers absent from the table
    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(fBiasDict.keys()[0])  # k-mer length inferred from the table
    p1 = start
    p2 = end
    # Window-extended interval, then k-mer-extended for sequence fetching
    p1_w = p1 - (window / 2)
    p2_w = p2 + (window / 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0):
        return signal
    # Raw counts: shifted 5' cut sites per strand
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in self.bam.fetch(chrName, p1_w, p2_w):
        if (not read.is_reverse):
            cut_site = read.pos + forward_shift
            if cut_site >= start and cut_site < end:
                nf[cut_site - p1_w] += 1.0
            # for i in range(max(read.pos + forward_shift, start), min(read.pos + forward_shift + 1, end - 1)):
            #     nf[i - start] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if cut_site >= start and cut_site < end:
                nr[cut_site - p1_w] += 1.0
            # for i in range(max(read.aend + reverse_shift - 1, start), min(read.aend + reverse_shift, end - 1)):
            #     nr[i - start] += 1.0
    # if ((not read.is_reverse) and (read.pos > p1_w)): nf[read.pos - p1_w] += 1.0
    # if ((read.is_reverse) and ((read.aend - 1) < p2_w)): nr[read.aend - 1 - p1_w] += 1.0
    # Smoothed counts: running sum of `window` positions around each base
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range((window / 2), len(nf) - (window / 2)):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + (window / 2)]
        fLast = nf[i - (window / 2) + 1]
        rSum -= rLast
        rSum += nr[i + (window / 2)]
        rLast = nr[i - (window / 2) + 1]
    # Fetching sequence (offsets kept from the original implementation)
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())
    #currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
    #currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
    #                                                             p2_wk)).upper())
    # Iterating on sequence to create the per-position bias signal
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)
    # Calculating bias and writing to wig file:
    # expected = smoothed count * (bias / local running bias mass);
    # signal = log-ratio of observed vs expected (+1 pseudocounts)
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    bias_corrected_signal_forward = []
    bias_corrected_signal_reverse = []
    for i in range((window / 2), len(af) - (window / 2)):
        nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
        nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal_forward.append(zf)
        bias_corrected_signal_reverse.append(zr)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + (window / 2)]
        fLast = af[i - (window / 2) + 1]
        rSum -= rLast
        rSum += ar[i + (window / 2)]
        rLast = ar[i - (window / 2) + 1]
    # Fixing the negative number in bias corrected signal:
    # shift each vector up by the magnitude of its minimum
    min_value = abs(min(bias_corrected_signal_forward))
    bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]
    min_value = abs(min(bias_corrected_signal_reverse))
    bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]
    min_value = abs(min(bias_corrected_signal))
    bias_fixed_signal = [e + min_value for e in bias_corrected_signal]
    # Termination
    fastaFile.close()
    if not strands_specific:
        return bias_corrected_signal
    else:
        return bias_fixed_signal_forward, bias_fixed_signal_reverse
def line(self):
    """Plot average bias, raw and bias-corrected ATAC-seq signal around motif sites.

    Iterates over the motif-predicted binding sites (MPBS) in
    self.motif_file; for every region whose name ends in ":Y" it
    accumulates, over a self.window_size bp window centered on the site:
    the raw and bias-corrected signals (combined or per strand depending
    on self.strands_specific), the per-strand k-mer bias signals, a PWM of
    the underlying sequence, and optionally (self.protection_score) signal
    sums over the site and its flanks. It then writes a .pwm file plus a
    weblogo, a .txt dump of the mean signals, and a composite line plot
    (.eps, converted to PDF via epstopdf) into self.output_loc, all named
    after self.motif_name.
    """
    # Signal handler with smoothing (Savitzky-Golay) coefficients
    # preloaded — presumably slope_window_size is the SG window; TODO confirm.
    signal = GenomicSignal(self.bam_file)
    signal.load_sg_coefs(slope_window_size=9)
    # Load the forward/reverse k-mer bias tables ("F,R" comma-separated paths).
    bias_table = BiasTable()
    bias_table_list = self.bias_table.split(",")
    table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                  table_file_name_R=bias_table_list[1])
    genome_data = GenomeData(self.organism)
    fasta = Fastafile(genome_data.get_genome())
    # Position frequency counts per nucleotide across the window.
    pwm_dict = dict([("A", [0.0] * self.window_size),
                     ("C", [0.0] * self.window_size),
                     ("G", [0.0] * self.window_size),
                     ("T", [0.0] * self.window_size),
                     ("N", [0.0] * self.window_size)])
    # Accumulators for the mean signals (divided by num_sites at the end).
    mean_raw_signal = np.zeros(self.window_size)
    mean_bc_signal = np.zeros(self.window_size)
    mean_raw_signal_f = np.zeros(self.window_size)
    mean_bc_signal_f = np.zeros(self.window_size)
    mean_raw_signal_r = np.zeros(self.window_size)
    mean_bc_signal_r = np.zeros(self.window_size)
    mean_bias_signal_f = np.zeros(self.window_size)
    mean_bias_signal_r = np.zeros(self.window_size)
    num_sites = 0
    mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
    mpbs_regions.read_bed(self.motif_file)
    # Running sums for the protection score (center / left / right windows).
    total_nc_signal = 0
    total_nl_signal = 0
    total_nr_signal = 0
    for region in mpbs_regions:
        # Only regions flagged with a trailing ":Y" in the name are used.
        if str(region.name).split(":")[-1] == "Y":
            num_sites += 1
            # Extend by 50 bp: window of self.window_size centered on the MPBS.
            mid = (region.initial + region.final) / 2
            p1 = mid - (self.window_size / 2)
            p2 = mid + (self.window_size / 2)
            if not self.strands_specific:
                # Fetch raw signal
                raw_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                  downstream_ext=self.atac_downstream_ext,
                                                  upstream_ext=self.atac_upstream_ext,
                                                  forward_shift=self.atac_forward_shift,
                                                  reverse_shift=self.atac_reverse_shift,
                                                  genome_file_name=genome_data.get_genome())
                mean_raw_signal = np.add(mean_raw_signal, raw_signal)
                # Fetch bias correction signal
                bc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table,
                                                 downstream_ext=self.atac_downstream_ext,
                                                 upstream_ext=self.atac_upstream_ext,
                                                 forward_shift=self.atac_forward_shift,
                                                 reverse_shift=self.atac_reverse_shift,
                                                 genome_file_name=genome_data.get_genome())
                mean_bc_signal = np.add(mean_bc_signal, bc_signal)
            else:
                # Per-strand raw and bias-corrected signals.
                raw_signal_f, _, raw_signal_r, _ = signal.get_signal_per_strand(ref=region.chrom, start=p1, end=p2,
                                                                                downstream_ext=self.atac_downstream_ext,
                                                                                upstream_ext=self.atac_upstream_ext,
                                                                                forward_shift=self.atac_forward_shift,
                                                                                reverse_shift=self.atac_reverse_shift,
                                                                                genome_file_name=genome_data.get_genome())
                mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
                mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)
                bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(ref=region.chrom, start=p1, end=p2,
                                                                              bias_table=table,
                                                                              downstream_ext=self.atac_downstream_ext,
                                                                              upstream_ext=self.atac_upstream_ext,
                                                                              forward_shift=self.atac_forward_shift,
                                                                              reverse_shift=self.atac_reverse_shift,
                                                                              genome_file_name=genome_data.get_genome())
                mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
                mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)
            # Update pwm
            # aux_plus compensates the off-by-one of odd-length regions when
            # fetching the reverse-complement strand.
            aux_plus = 1
            dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
            if (region.final - region.initial) % 2 == 0:
                aux_plus = 0
            dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom,
                                                                     p1 + aux_plus, p2 + aux_plus)).upper())
            if region.orientation == "+":
                for i in range(0, len(dna_seq)):
                    pwm_dict[dna_seq[i]][i] += 1
            elif region.orientation == "-":
                for i in range(0, len(dna_seq_rev)):
                    pwm_dict[dna_seq_rev[i]][i] += 1
            # Create bias signal
            bias_table_f = table[0]
            bias_table_r = table[1]
            # NOTE(review): keys()[0] is Python-2-only (dict views are not
            # subscriptable on Python 3) — the dnase variant uses
            # list(...keys())[0]; confirm target interpreter.
            self.k_nb = len(bias_table_f.keys()[0])
            bias_signal_f = []
            bias_signal_r = []
            # Extend by half a k-mer so every window position has full context.
            p1_wk = p1 - int(self.k_nb / 2)
            p2_wk = p2 + int(self.k_nb / 2)
            dna_seq = str(fasta.fetch(region.chrom, p1_wk, p2_wk - 1)).upper()
            dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom,
                                                                     p1_wk, p2_wk + 1)).upper())
            for i in range(int(self.k_nb / 2), len(dna_seq) - int(self.k_nb / 2) + 1):
                fseq = dna_seq[i - int(self.k_nb / 2):i + int(self.k_nb / 2)]
                rseq = dna_seq_rev[len(dna_seq) - int(self.k_nb / 2) - i:
                                   len(dna_seq) + int(self.k_nb / 2) - i]
                # Unknown k-mers (e.g. containing N) fall back to a neutral 1.
                try:
                    bias_signal_f.append(bias_table_f[fseq])
                except Exception:
                    bias_signal_f.append(1)
                try:
                    bias_signal_r.append(bias_table_r[rseq])
                except Exception:
                    bias_signal_r.append(1)
            mean_bias_signal_f = np.add(mean_bias_signal_f, np.array(bias_signal_f))
            mean_bias_signal_r = np.add(mean_bias_signal_r, np.array(bias_signal_r))
            if self.protection_score:
                # signal in the center of the MPBS
                p1 = region.initial
                p2 = region.final
                nc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table,
                                                 downstream_ext=self.atac_downstream_ext,
                                                 upstream_ext=self.atac_upstream_ext,
                                                 forward_shift=self.atac_forward_shift,
                                                 reverse_shift=self.atac_reverse_shift,
                                                 genome_file_name=genome_data.get_genome())
                total_nc_signal += sum(nc_signal)
                # Right flank: same length as the MPBS, immediately downstream.
                p1 = region.final
                p2 = 2 * region.final - region.initial
                nr_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table,
                                                 downstream_ext=self.atac_downstream_ext,
                                                 upstream_ext=self.atac_upstream_ext,
                                                 forward_shift=self.atac_forward_shift,
                                                 reverse_shift=self.atac_reverse_shift,
                                                 genome_file_name=genome_data.get_genome())
                total_nr_signal += sum(nr_signal)
                # Left flank.
                # NOTE(review): p2 = region.final makes this window span the
                # left flank PLUS the MPBS itself; a symmetric left flank
                # would end at region.initial — confirm intended window.
                p1 = 2 * region.initial - region.final
                p2 = region.final
                nl_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table,
                                                 downstream_ext=self.atac_downstream_ext,
                                                 upstream_ext=self.atac_upstream_ext,
                                                 forward_shift=self.atac_forward_shift,
                                                 reverse_shift=self.atac_reverse_shift,
                                                 genome_file_name=genome_data.get_genome())
                total_nl_signal += sum(nl_signal)
    # Averages over all used sites (num_sites must be > 0 here).
    mean_raw_signal = mean_raw_signal / num_sites
    mean_bc_signal = mean_bc_signal / num_sites
    mean_raw_signal_f = mean_raw_signal_f / num_sites
    mean_raw_signal_r = mean_raw_signal_r / num_sites
    mean_bc_signal_f = mean_bc_signal_f / num_sites
    mean_bc_signal_r = mean_bc_signal_r / num_sites
    mean_bias_signal_f = mean_bias_signal_f / num_sites
    mean_bias_signal_r = mean_bias_signal_r / num_sites
    # Flank depletion relative to the center (all totals are 0 when
    # self.protection_score is off; the value is then unused).
    protection_score = (total_nl_signal + total_nr_signal - 2 * total_nc_signal) / (2 * num_sites)
    # Output PWM and create logo
    pwm_fname = os.path.join(self.output_loc, "{}.pwm".format(self.motif_name))
    pwm_file = open(pwm_fname, "w")
    for e in ["A", "C", "G", "T"]:
        pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")
    pwm_file.close()
    logo_fname = os.path.join(self.output_loc, "{}.logo.eps".format(self.motif_name))
    pwm = motifs.read(open(pwm_fname), "pfm")
    pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line="100",
                color_scheme="color_classic", unit_name="", show_errorbars=False,
                logo_title="", show_xaxis=False, xaxis_label="", show_yaxis=False,
                yaxis_label="", show_fineprint=False, show_ends=False)
    # Output the raw, bias corrected signal and protection score
    output_fname = os.path.join(self.output_loc, "{}.txt".format(self.motif_name))
    output_file = open(output_fname, "w")
    if not self.strands_specific:
        output_file.write("raw signal: \n" + np.array_str(mean_raw_signal) + "\n")
        output_file.write("bias corrected signal: \n" + np.array_str(mean_bc_signal) + "\n")
    else:
        output_file.write("raw forward signal: \n" + np.array_str(mean_raw_signal_f) + "\n")
        output_file.write("bias corrected forward signal: \n" + np.array_str(mean_bc_signal_f) + "\n")
        output_file.write("raw reverse signal: \n" + np.array_str(mean_raw_signal_r) + "\n")
        output_file.write("bias reverse corrected signal: \n" + np.array_str(mean_bc_signal_r) + "\n")
    output_file.write("forward bias signal: \n" + np.array_str(mean_bias_signal_f) + "\n")
    output_file.write("reverse bias signal: \n" + np.array_str(mean_bias_signal_r) + "\n")
    if self.protection_score:
        output_file.write("protection score: \n" + str(protection_score) + "\n")
    output_file.close()
    # Plot: bias signal on top, then uncorrected (and, per-strand mode,
    # corrected on a third axis).
    if self.strands_specific:
        fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
    else:
        fig, (ax1, ax2) = plt.subplots(2)
    x = np.linspace(-50, 49, num=self.window_size)
    ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
    ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')
    ax1.xaxis.set_ticks_position('bottom')
    ax1.yaxis.set_ticks_position('left')
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_position(('outward', 15))
    ax1.spines['bottom'].set_position(('outward', 5))
    ax1.tick_params(direction='out')
    ax1.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
    ax1.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
    min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
    max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
    ax1.set_yticks([min_bias_signal, max_bias_signal])
    ax1.set_yticklabels([str(round(min_bias_signal, 2)), str(round(max_bias_signal, 2))], rotation=90)
    ax1.text(-48, max_bias_signal, '# Sites = {}'.format(str(num_sites)), fontweight='bold')
    ax1.set_title(self.motif_name, fontweight='bold')
    ax1.set_xlim(-50, 49)
    ax1.set_ylim([min_bias_signal, max_bias_signal])
    ax1.legend(loc="upper right", frameon=False)
    ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')
    if not self.strands_specific:
        # Normalize signals to [0, 1] for plotting.
        mean_raw_signal = self.standardize(mean_raw_signal)
        mean_bc_signal = self.standardize(mean_bc_signal)
        ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
        ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
    else:
        mean_raw_signal_f = self.standardize(mean_raw_signal_f)
        mean_raw_signal_r = self.standardize(mean_raw_signal_r)
        mean_bc_signal_f = self.standardize(mean_bc_signal_f)
        mean_bc_signal_r = self.standardize(mean_bc_signal_r)
        ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
        ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
        ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
        ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')
    ax2.xaxis.set_ticks_position('bottom')
    ax2.yaxis.set_ticks_position('left')
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_position(('outward', 15))
    ax2.tick_params(direction='out')
    ax2.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
    ax2.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
    ax2.set_yticks([0, 1])
    ax2.set_yticklabels([str(0), str(1)], rotation=90)
    ax2.set_xlim(-50, 49)
    ax2.set_ylim([0, 1])
    if not self.strands_specific:
        ax2.spines['bottom'].set_position(('outward', 40))
        ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
        ax2.set_ylabel("Average ATAC-seq \nSignal", rotation=90, fontweight='bold')
        ax2.legend(loc="center", frameon=False, bbox_to_anchor=(0.85, 0.06))
    else:
        ax2.spines['bottom'].set_position(('outward', 5))
        ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal", rotation=90, fontweight='bold')
        ax2.legend(loc="lower right", frameon=False)
        # Third axis (corrected per-strand signal) only exists in this branch.
        ax3.xaxis.set_ticks_position('bottom')
        ax3.yaxis.set_ticks_position('left')
        ax3.spines['top'].set_visible(False)
        ax3.spines['right'].set_visible(False)
        ax3.spines['left'].set_position(('outward', 15))
        ax3.tick_params(direction='out')
        ax3.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax3.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
        ax3.set_yticks([0, 1])
        ax3.set_yticklabels([str(0), str(1)], rotation=90)
        ax3.set_xlim(-50, 49)
        ax3.set_ylim([0, 1])
        ax3.legend(loc="lower right", frameon=False)
        ax3.spines['bottom'].set_position(('outward', 40))
        ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
        ax3.set_ylabel("Average ATAC-seq \n Corrected Signal", rotation=90, fontweight='bold')
        ax3.text(-48, 0.05, '# K-mer = {}\n# Forward Shift = {}'.format(str(self.k_nb),
                                                                        str(self.atac_forward_shift)),
                 fontweight='bold')
    figure_name = os.path.join(self.output_loc, "{}.line.eps".format(self.motif_name))
    fig.subplots_adjust(bottom=.2, hspace=.5)
    fig.tight_layout()
    fig.savefig(figure_name, format="eps", dpi=300)
    # Creating canvas and printing eps / pdf with merged results
    output_fname = os.path.join(self.output_loc, "{}.eps".format(self.motif_name))
    c = pyx.canvas.canvas()
    c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
    if self.strands_specific:
        c.insert(pyx.epsfile.epsfile(2.76, 1.58, logo_fname, width=27.2, height=2.45))
    else:
        c.insert(pyx.epsfile.epsfile(2.5, 1.54, logo_fname, width=16, height=1.75))
    c.writeEPSfile(output_fname)
    # Convert every EPS artifact to PDF (requires epstopdf on PATH).
    os.system("epstopdf " + figure_name)
    os.system("epstopdf " + logo_fname)
    os.system("epstopdf " + output_fname)
def main(args):
    """Performs motif matching.

    Loads input regions (experimental matrix, BED files, and/or promoter
    sets derived from gene lists), optionally generates random background
    regions, builds the motif set and MOODS scanner, matches every motif
    against every region set, and writes one "<name>_mpbs.bed" file per
    region set into the output location (optionally converted to bigBed).

    Keyword arguments:
    args -- Parsed command-line namespace (organism, fpr, pseudocounts,
            input files/matrix, promoter options, filter options, etc.).
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)
    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)
            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict
            print(">>> experimental matrix loaded")
        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            name, _ = os.path.splitext(os.path.basename(input_filename))
            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))
            genomic_regions_dict[name] = regions
            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                   protein_coding=True, known_only=True)
        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)
        # TODO: what do we do with unmapped genes? maybe just print them out
        target_regions = annotation.get_promoters(gene_set=target_genes,
                                                  promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)
        genomic_regions_dict[target_regions.name] = target_regions
        print(">>> target promoter file created:", len(target_regions), "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                       protein_coding=True, known_only=True)
        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)
        if target_genes:
            background_genes.subtract(target_genes)
        background_regions = annotation.get_promoters(gene_set=background_genes,
                                                      promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)
        genomic_regions_dict[background_regions.name] = background_regions
        print(">>> background promoter file created:", len(background_regions), "regions")

    if not genomic_regions_dict:
        err.throw_error("DEFAULT_ERROR",
                        add_msg="You must either specify an experimental matrix, or at least a "
                                "valid input file, or one of the 'promoter test' options.")

    # Track the largest region set: random regions are generated from it.
    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for k in genomic_regions_dict.keys():
        curr_genomic_region = genomic_regions_dict[k]

        # If the object is a GenomicRegionSet
        if isinstance(curr_genomic_region, GenomicRegionSet):
            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:
        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(args.organism,
                                                multiply_factor=args.rand_proportion,
                                                chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()
            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)
                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning("DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)
        # filter for dbs only if --motif_dbs is not set
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):", ",".join([str(db) for db in ms.motif_data.repositories_list]))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)
    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested
    if args.norm_threshold:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    scanner = scan.Scanner(7)

    # Two entries per motif: forward PSSM and its reverse complement, each
    # with its own threshold (0.0 when a single normalized threshold is used).
    pssm_list = []
    thresholds = []
    for motif in motif_list:
        if unique_threshold:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:
        start = time.time()
        print(">> matching [", grs.name, "], ", len(grs), " regions... ", sep="", end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:
            # Reading sequence associated to genomic_region
            sequence = str(genome_file.fetch(genomic_region.chrom,
                                             genomic_region.initial,
                                             genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence, genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                grs_tmp.sort()
                seqs = grs_tmp.sequences
                seqs_new = []
                cur_pos = 0
                end_pos = len(seqs) - 1
                while cur_pos < end_pos:
                    gr = seqs[cur_pos]

                    new_pos = cur_pos + 1
                    while new_pos < end_pos:
                        gr2 = seqs[new_pos]
                        # if this sequence is unrelated, we move on
                        if gr.name != gr2.name or gr.chrom != gr2.chrom or gr.initial != gr2.initial or gr.final != gr2.final or gr.orientation == gr2.orientation:
                            break
                        # keep the duplicate with the highest score
                        if float(gr.data) < float(gr2.data):
                            gr = gr2
                        new_pos = new_pos + 1

                    # adding the currently-selected genomic region
                    seqs_new.append(gr)

                    # at the next loop, we start from the next right-handed sequences
                    cur_pos = new_pos

                # edge case: the last element was not considered
                # (when it is, cur_pos == end_pos+1)
                if cur_pos == end_pos:
                    seqs_new.append(seqs[cur_pos])

                grs_tmp.sequences = seqs_new

            grs_tmp.write(output_bed_file, mode="a")

        # free the per-set region list once its matches are written
        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")
def bias_correction_atac(self, bias_table, genome_file_name, chrName, start, end,
                         forward_shift, reverse_shift):
    """Computes the bias-predicted (expected) per-strand cleavage signals.

    Unlike bias_correction, this returns the expected counts themselves
    (smoothed observed count times relative k-mer bias), not a log-ratio.

    Keyword arguments:
    bias_table -- Pair (forward k-mer bias dict, reverse k-mer bias dict).
    genome_file_name -- Path to the indexed genome FASTA.
    chrName -- Chromosome name.
    start, end -- Genomic interval.
    forward_shift, reverse_shift -- Per-strand cut-site shifts.

    Return:
    (forward, reverse) lists of expected cut counts over the interval, or
    the raw per-strand cut counts over [start, end) when the extended
    window would fall before the chromosome start.
    """
    # Parameters
    window = 50  # smoothing window (bp)
    defaultKmerValue = 1.0  # neutral bias for k-mers absent from the table

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # list(...) keeps this working on Python 3, where dict views are not
    # subscriptable (same idiom as bias_correction_dnase).
    k_nb = len(list(fBiasDict.keys())[0])
    # Integer half-sizes valid on both Python 2 and 3.
    half_win = window // 2
    half_k_lo = k_nb // 2           # floor(k_nb / 2)
    half_k_hi = k_nb - half_k_lo    # ceil(k_nb / 2)
    p1 = start
    p2 = end
    p1_w = p1 - half_win
    p2_w = p2 + half_win
    p1_wk = p1_w - half_k_lo
    p2_wk = p2_w + half_k_hi

    # Guard against running off the chromosome start.
    # BUGFIX: the original tested p2_wk <= 0, which is weaker than
    # p1_wk <= 0 (p2_wk > p1_wk always) and could let a negative left
    # coordinate through; test the left-most coordinate as the sibling
    # bias_correction functions do.
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
        # Cannot build the context window: return raw per-strand counts.
        nf = [0.0] * (p2 - p1)
        nr = [0.0] * (p2 - p1)
        for read in self.bam.fetch(chrName, p1, p2):
            if read.is_unmapped:
                # fetch() may yield unmapped mates; skip them (issue #112).
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    nf[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    nr[cut_site - p1] += 1.0
        return nf, nr

    # Raw counts of 5' cut sites per strand over the extended window.
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in self.bam.fetch(chrName, p1_w, p2_w):
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: incrementally maintained running window sums.
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(half_win, len(nf) - half_win):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + half_win]
        fLast = nf[i - half_win + 1]
        rSum -= rLast
        rSum += nr[i + half_win]
        rLast = nr[i - half_win + 1]

    # Fetching sequence (forward strand and reverse complement, with the
    # extra k-mer context on each side).
    currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrName, p1_wk + 1, p2_wk)).upper())

    # Per-position k-mer bias on each strand; unknown k-mers (e.g. with N)
    # get the neutral default value.
    af = []
    ar = []
    for i in range(half_k_hi, len(currStr) - half_k_lo + 1):
        fseq = currStr[i - half_k_lo:i + half_k_hi]
        rseq = currRevComp[len(currStr) - half_k_hi - i:
                           len(currStr) + half_k_lo - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Expected counts: smoothed observed count times relative bias within
    # the running window.
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal_forward = []
    bias_corrected_signal_reverse = []
    for i in range(half_win, len(af) - half_win):
        nhatf = Nf[i - half_win] * (af[i] / fSum)
        nhatr = Nr[i - half_win] * (ar[i] / rSum)
        bias_corrected_signal_forward.append(nhatf)
        bias_corrected_signal_reverse.append(nhatr)
        fSum -= fLast
        fSum += af[i + half_win]
        fLast = af[i - half_win + 1]
        rSum -= rLast
        rSum += ar[i + half_win]
        rLast = ar[i - half_win + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal_forward, bias_corrected_signal_reverse
def bias_correction(self, signal, bias_table, genome_file_name, chrName, start,
                    end, forward_shift, reverse_shift, strands_specific):
    """Performs cleavage-bias correction of an ATAC-seq signal.

    Keyword arguments:
    signal -- Input (uncorrected) signal; returned unchanged when correction
              is not possible (no bias table, or interval too close to the
              chromosome start).
    bias_table -- Pair (forward k-mer bias dict, reverse k-mer bias dict).
    genome_file_name -- Path to the indexed genome FASTA.
    chrName -- Chromosome name.
    start, end -- Genomic interval to correct.
    forward_shift, reverse_shift -- Per-strand cut-site shifts.
    strands_specific -- If True, return per-strand corrected signals.

    Return:
    bias_corrected_signal -- Combined corrected signal (list), or the pair
    (forward, reverse) of zero-shifted per-strand corrected signals when
    strands_specific is True.
    """
    if not bias_table:
        return signal

    # Parameters
    window = 50  # smoothing window (bp) used to estimate expected counts
    defaultKmerValue = 1.0  # neutral bias for k-mers absent from the table

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # list(...) keeps this working on Python 3, where dict views are not
    # subscriptable (same idiom as bias_correction_dnase).
    k_nb = len(list(fBiasDict.keys())[0])
    # Integer half-sizes valid on both Python 2 and 3
    # (// is floor division; the original used floor()/ceil() on floats).
    half_win = window // 2
    half_k_lo = k_nb // 2           # floor(k_nb / 2)
    half_k_hi = k_nb - half_k_lo    # ceil(k_nb / 2)
    p1 = start
    p2 = end
    # Interval extended by half a window on each side...
    p1_w = p1 - half_win
    p2_w = p2 + half_win
    # ...and further by half a k-mer so every position has full k-mer context.
    p1_wk = p1_w - half_k_lo
    p2_wk = p2_w + half_k_hi
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
        # Too close to the chromosome start to build the context window.
        return signal

    # Raw counts of 5' cut sites per strand over the extended window.
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in self.bam.fetch(chrName, p1_w, p2_w):
        if read.is_unmapped:
            # fetch() may yield unmapped mates; skip them
            # (consistent with bias_correction_dnase, issue #112).
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if start <= cut_site < end:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if start <= cut_site < end:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running sum of `window` values centered on each
    # position, maintained incrementally.
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(half_win, len(nf) - half_win):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + half_win]
        fLast = nf[i - half_win + 1]
        rSum -= rLast
        rSum += nr[i + half_win]
        rLast = nr[i - half_win + 1]

    # Fetching sequence.
    # NOTE(review): the -1/-2 and +2/+1 offsets are preserved from the
    # original implementation; they differ from the symmetric form used by
    # bias_correction_atac — confirm intended coordinates before changing.
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Per-position k-mer bias on each strand; unknown k-mers (e.g. containing
    # N) get the neutral default value.
    af = []
    ar = []
    for i in range(half_k_hi, len(currStr) - half_k_lo + 1):
        fseq = currStr[i - half_k_lo:i + half_k_hi]
        rseq = currRevComp[len(currStr) - half_k_hi - i:
                           len(currStr) + half_k_lo - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Bias correction: expected count = smoothed observed count x relative
    # bias within the window; corrected value is the log-ratio of observed
    # over expected (with a pseudo-count of 1).
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    bias_corrected_signal_forward = []
    bias_corrected_signal_reverse = []
    for i in range(half_win, len(af) - half_win):
        nhatf = Nf[i - half_win] * (af[i] / fSum)
        nhatr = Nr[i - half_win] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal_forward.append(zf)
        bias_corrected_signal_reverse.append(zr)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + half_win]
        fLast = af[i - half_win + 1]
        rSum -= rLast
        rSum += ar[i + half_win]
        rLast = ar[i - half_win + 1]

    # Termination
    fastaFile.close()

    if not strands_specific:
        return bias_corrected_signal
    # Shift each per-strand signal so its (assumed negative) minimum maps to
    # zero, as in the original implementation. The combined fixed signal was
    # computed but never returned in the original; that dead code is removed.
    min_value = abs(min(bias_corrected_signal_forward))
    bias_fixed_signal_forward = [e + min_value
                                 for e in bias_corrected_signal_forward]
    min_value = abs(min(bias_corrected_signal_reverse))
    bias_fixed_signal_reverse = [e + min_value
                                 for e in bias_corrected_signal_reverse]
    return bias_fixed_signal_forward, bias_fixed_signal_reverse
def bias_correction_dnase(signal_class, signal, chrName, start, end, forward_shift, reverse_shift):
    """Performs cleavage-bias correction of a DNase-seq signal.

    Loads the bundled single-hit bias tables, counts shifted 5' cut sites
    per strand over a window extended around [start, end), smooths the
    counts with a 50 bp running window, weighs them by the per-position
    k-mer bias, and returns the per-position log-ratio of observed over
    expected counts (forward + reverse combined).

    Keyword arguments:
    signal_class -- Object carrying the open BAM (signal_class.bam) and the
                    genome FASTA path (signal_class.fastaFile).
    signal -- Input (uncorrected) signal; returned unchanged when correction
              is not possible.
    chrName -- Chromosome name.
    start, end -- Genomic interval to correct.
    forward_shift, reverse_shift -- Per-strand cut-site shifts.

    Return:
    bias_corrected_signal -- List of corrected values for the interval.
    """
    # Bias tables shipped with the package, relative to this module.
    table_file_name_F = os.path.join(os.path.dirname(__file__), '../data/single_hit_bias_table_F.txt')
    table_file_name_R = os.path.join(os.path.dirname(__file__), '../data/single_hit_bias_table_R.txt')
    bias_table = load_table(table_file_name_F, table_file_name_R)
    if not bias_table:
        return signal
    # Parameters
    window = 50  # smoothing window (bp)
    defaultKmerValue = 1.0  # neutral bias for k-mers absent from the table
    genome_file_name = signal_class.fastaFile
    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # list(...) keeps this Python-3 safe (dict views are not subscriptable).
    k_nb = len(list(fBiasDict.keys())[0])
    p1 = start
    p2 = end
    # Interval extended by half a window, then half a k-mer on each side.
    p1_w = int(p1 - (window / 2))
    p2_w = int(p2 + (window / 2))
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
        # Too close to the chromosome start to build the context window.
        return signal
    # Raw counts of 5' cut sites per strand over the extended window.
    nf = [0.0] * int(p2_w - p1_w)
    nr = [0.0] * int(p2_w - p1_w)
    for read in signal_class.bam.fetch(chrName, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0
    # Smoothed counts: incrementally maintained running window sums.
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(int(window / 2), int(len(nf) - (window / 2))):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + int(window / 2)]
        fLast = nf[i - int(window / 2) + 1]
        rSum -= rLast
        rSum += nr[i + int(window / 2)]
        rLast = nr[i - int(window / 2) + 1]
    # Fetching sequence (forward strand and reverse complement).
    currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
    currRevComp = revcomp(str(fastaFile.fetch(chrName, p1_wk + 1, p2_wk)).upper())
    # Iterating on sequence to create signal: per-position k-mer bias.
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[
            len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(
                floor(k_nb / 2.)) - i]
        # Unknown k-mers (e.g. containing N) get the neutral default value.
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)
    # Calculating bias and writing to wig file: the corrected value is the
    # log-ratio of observed over expected counts (pseudo-count of 1), where
    # the expected count is the smoothed count times the relative bias.
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    for i in range(int(window / 2), int(len(af) - (window / 2))):
        nhatf = Nf[i - int(window / 2)] * (af[i] / fSum)
        nhatr = Nr[i - int(window / 2)] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + int(window / 2)]
        fLast = af[i - int(window / 2) + 1]
        rSum -= rLast
        rSum += ar[i + int(window / 2)]
        rLast = ar[i - int(window / 2) + 1]
    # Termination
    fastaFile.close()
    return bias_corrected_signal
def main_matching():
    """
    Performs motif matching.

    Parses command-line options, reads an experimental matrix of genomic
    region sets, optionally generates random background regions, builds PWMs
    from the configured motif repositories, and writes one BED (optionally
    BigBed) file of motif-predicted binding sites per input region set.

    Authors: Eduardo G. Gusmao.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    main_error_handler = ErrorHandler()

    # Parameters
    usage_message = "%prog --matching [options] <experiment_matrix>"

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage = usage_message)

    # Parameters Options
    parser.add_option("--organism", dest = "organism", type = "string", metavar="STRING", default = "hg19",
                      help = ("Organism considered on the analysis. Check our full documentation for all available "
                              "options. All default files such as genomes will be based on the chosen organism "
                              "and the data.config file."))
    parser.add_option("--fpr", dest = "fpr", type = "float", metavar="FLOAT", default = 0.0001,
                      help = ("False positive rate cutoff for motif matching."))
    parser.add_option("--precision", dest = "precision", type = "int", metavar="INT", default = 10000,
                      help = ("Score distribution precision for determining false positive rate cutoff."))
    parser.add_option("--pseudocounts", dest = "pseudocounts", type = "float", metavar="FLOAT", default = 0.1,
                      help = ("Pseudocounts to be added to raw counts of each PFM."))
    parser.add_option("--rand-proportion", dest = "rand_proportion", type = "float", metavar="FLOAT", default = 10.0,
                      help = ("If random coordinates need to be created (for further motif enrichment),"
                              "then it will be created a number of coordinates that equals this"
                              "parameter x the number of input regions (in case of multiple regions, the"
                              "larger is considered). If zero (0) is passed, then no random coordinates are created."))
    parser.add_option("--norm-threshold", dest = "norm_threshold", action = "store_true", default = False,
                      help = ("If this option is used, the thresholds for all PWMs will be normalized by their length."
                              "In this scheme, the threshold cutoff is evaluated in the regular way by the given fpr."
                              "Then, all thresholds are divided by the lenght of the motif. The final threshold consists"
                              "of the average between all normalized motif thresholds. This single threshold will be"
                              "applied to all motifs."))

    # Output Options
    parser.add_option("--output-location", dest = "output_location", type = "string", metavar="PATH", default = os.getcwd(),
                      help = ("Path where the output files will be written."))
    parser.add_option("--bigbed", dest = "bigbed", action = "store_true", default = False,
                      help = ("If this option is used, all bed files will be written as bigbed."))
    parser.add_option("--normalize-bitscore", dest = "normalize_bitscore", action = "store_false", default = True,
                      help = ("In order to print bigbed files the scores need to be normalized between 0 and 1000."
                              "This option should be used if real bitscores should be printed in the resulting bed file."
                              "In this case, a bigbed file will not be created."))

    # Processing Options
    options, arguments = parser.parse_args()

    # Additional Parameters
    matching_folder_name = "Match"
    random_region_name = "random_regions"

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    matching_output_location = os.path.join(options.output_location,matching_folder_name)
    try:
        if(not os.path.isdir(matching_output_location)):
            os.makedirs(matching_output_location)
    except Exception:
        main_error_handler.throw_error("MM_OUT_FOLDER_CREATION")

    # Default genomic data
    genome_data = GenomeData(options.organism)

    # Default motif data
    motif_data = MotifData()

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading arguments
    try:
        input_matrix = arguments[0]
        if(len(arguments) > 1):
            main_error_handler.throw_warning("MM_MANY_ARG")
    except Exception:
        main_error_handler.throw_error("MM_NO_ARGUMENT")

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception:
        main_error_handler.throw_error("MM_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Regions
    ###################################################################################################

    # Initialization
    max_region_len = 0
    max_region = None
    input_regions = []
    try:
        exp_matrix_objects_dict = exp_matrix.objectsDict
    except Exception:
        main_error_handler.throw_error("MM_WRONG_EXPMAT")

    # Iterating on experimental matrix objects
    for k in exp_matrix_objects_dict.keys():
        curr_genomic_region = exp_matrix_objects_dict[k]
        # If the object is a GenomicRegionSet
        if(isinstance(curr_genomic_region,GenomicRegionSet)):
            # Sorting input region
            curr_genomic_region.sort()
            # Append label and GenomicRegionSet
            input_regions.append(curr_genomic_region)
            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if(curr_len > max_region_len):
                max_region_len = curr_len
                max_region = exp_matrix_objects_dict[k]

    ###################################################################################################
    # Creating random region
    ###################################################################################################

    # Create random coordinates
    # NOTE(review): if the matrix contains no GenomicRegionSet, max_region is
    # still None here and the call below raises AttributeError — confirm
    # whether upstream validation rules this out.
    rand_region = None
    if(options.rand_proportion > 0):
        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(options.organism, multiply_factor=options.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name
        # Put random regions in the end of the input regions
        input_regions.append(rand_region)
        # Writing random regions
        output_file_name = os.path.join(matching_output_location, random_region_name)
        rand_bed_file_name = output_file_name+".bed"
        rand_region.write_bed(rand_bed_file_name)
        # Verifying condition to write bb
        if(options.bigbed):
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()
            # Converting to big bed
            rand_bb_file_name = output_file_name+".bb"
            try:
                os.system(" ".join(["bedToBigBed", rand_bed_file_name, chrom_sizes_file, rand_bb_file_name, "-verbose=0"]))
                os.remove(rand_bed_file_name)
            except Exception:
                pass # WARNING
    else:
        # NOTE(review): this contradicts the --rand-proportion help text, which
        # says zero means "no random coordinates"; here zero is a hard error.
        main_error_handler.throw_error("MM_WRONG_RANDPROP")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    # Initialization
    motif_list = []

    # Creating thresholds object
    thresholds = Thresholds(motif_data)

    # Fetching list with all motif file names
    motif_file_names = []
    for motif_repository in motif_data.get_pwm_list():
        for motif_file_name in glob(os.path.join(motif_repository,"*.pwm")):
            motif_file_names.append(motif_file_name)

    # Iterating on grouped file name list
    for motif_file_name in motif_file_names:
        # Append motif motif_list
        motif_list.append(Motif(motif_file_name, options.pseudocounts, options.precision, options.fpr, thresholds))

    # Performing normalized threshold strategy if requested
    if(options.norm_threshold):
        threshold_list = [motif.threshold/motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list)/len(threshold_list)
    else:
        unique_threshold = None

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    # Iterating on list of genomic regions
    for genomic_region_set in input_regions:

        # Initializing output bed file
        output_file_name = os.path.join(matching_output_location, genomic_region_set.name+"_mpbs")
        bed_file_name = output_file_name+".bed"
        output_file = open(bed_file_name,"w")

        # Iterating on genomic regions
        for genomic_region in genomic_region_set.sequences:
            # Reading sequence associated to genomic_region
            sequence = str(genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))
            # Splitting the sequence in smaller sequences to remove the "N" regions
            sequence_list = filter(None,sequence.split("N"))
            # Perform motif matching for each motif in each sequence
            for seq in sequence_list:
                for motif in motif_list:
                    match_single(motif, seq, genomic_region, output_file, unique_threshold, options.normalize_bitscore)

        # Closing file
        output_file.close()

        # Verifying condition to write bb
        if(options.bigbed and options.normalize_bitscore):
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()
            # Converting to big bed
            sort_file_name = output_file_name+"_sort.bed"
            bb_file_name = output_file_name+".bb"
            os.system("sort -k1,1 -k2,2n "+bed_file_name+" > "+sort_file_name)
            os.system(" ".join(["bedToBigBed", sort_file_name, chrom_sizes_file, bb_file_name, "-verbose=0"]))
            os.remove(bed_file_name); os.remove(sort_file_name)
def line(self):
    """Aggregate and plot raw, bias, and bias-corrected ATAC-seq signal around MPBSs.

    Iterates the motif-predicted binding sites whose name ends in ":Y",
    accumulates mean raw/bias-corrected signal (whole or per-strand), a PWM of
    the underlying sequence, per-strand k-mer bias signals, and (optionally) a
    protection score; then writes a PWM file, a weblogo, a text summary, and a
    line-plot figure (EPS/PDF) into `self.output_loc`.

    Fixes vs. the previous revision:
      * `len(list(bias_table_f.keys())[0])` — dict views are not subscriptable
        on Python 3 (matches the idiom used elsewhere in this file).
      * integer (floor) division for `mid`/`p1`/`p2` so genomic coordinates
        stay ints on Python 3 (Python 2 `/` on ints floored implicitly).
      * the PWM file handle passed to `motifs.read` is now closed.
    """
    signal = GenomicSignal(self.bam_file)
    signal.load_sg_coefs(slope_window_size=9)
    bias_table = BiasTable()
    bias_table_list = self.bias_table.split(",")
    table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                  table_file_name_R=bias_table_list[1])

    genome_data = GenomeData(self.organism)
    fasta = Fastafile(genome_data.get_genome())
    pwm_dict = dict([("A", [0.0] * self.window_size), ("C", [0.0] * self.window_size),
                     ("G", [0.0] * self.window_size), ("T", [0.0] * self.window_size),
                     ("N", [0.0] * self.window_size)])

    # Accumulators for the per-site means (divided by num_sites at the end).
    mean_raw_signal = np.zeros(self.window_size)
    mean_bc_signal = np.zeros(self.window_size)
    mean_raw_signal_f = np.zeros(self.window_size)
    mean_bc_signal_f = np.zeros(self.window_size)
    mean_raw_signal_r = np.zeros(self.window_size)
    mean_bc_signal_r = np.zeros(self.window_size)
    mean_bias_signal_f = np.zeros(self.window_size)
    mean_bias_signal_r = np.zeros(self.window_size)
    num_sites = 0

    mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
    mpbs_regions.read_bed(self.motif_file)

    total_nc_signal = 0
    total_nl_signal = 0
    total_nr_signal = 0

    for region in mpbs_regions:
        # Only regions flagged with a trailing ":Y" in their name are used.
        if str(region.name).split(":")[-1] == "Y":
            num_sites += 1
            # Window of self.window_size bp centred on the MPBS midpoint.
            # Integer division keeps coordinates ints under Python 3.
            mid = (region.initial + region.final) // 2
            p1 = mid - self.window_size // 2
            p2 = mid + self.window_size // 2

            if not self.strands_specific:
                # Fetch raw signal
                raw_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                  downstream_ext=self.atac_downstream_ext,
                                                  upstream_ext=self.atac_upstream_ext,
                                                  forward_shift=self.atac_forward_shift,
                                                  reverse_shift=self.atac_reverse_shift,
                                                  genome_file_name=genome_data.get_genome())
                mean_raw_signal = np.add(mean_raw_signal, raw_signal)

                # Fetch bias correction signal
                bc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table,
                                                 downstream_ext=self.atac_downstream_ext,
                                                 upstream_ext=self.atac_upstream_ext,
                                                 forward_shift=self.atac_forward_shift,
                                                 reverse_shift=self.atac_reverse_shift,
                                                 genome_file_name=genome_data.get_genome())
                mean_bc_signal = np.add(mean_bc_signal, bc_signal)
            else:
                raw_signal_f, _, raw_signal_r, _ = signal.get_signal_per_strand(
                    ref=region.chrom, start=p1, end=p2,
                    downstream_ext=self.atac_downstream_ext,
                    upstream_ext=self.atac_upstream_ext,
                    forward_shift=self.atac_forward_shift,
                    reverse_shift=self.atac_reverse_shift,
                    genome_file_name=genome_data.get_genome())
                mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
                mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)

                bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(
                    ref=region.chrom, start=p1, end=p2, bias_table=table,
                    downstream_ext=self.atac_downstream_ext,
                    upstream_ext=self.atac_upstream_ext,
                    forward_shift=self.atac_forward_shift,
                    reverse_shift=self.atac_reverse_shift,
                    genome_file_name=genome_data.get_genome())
                mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
                mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)

            # Update pwm
            aux_plus = 1
            dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
            if (region.final - region.initial) % 2 == 0:
                aux_plus = 0
            dna_seq_rev = AuxiliaryFunctions.revcomp(
                str(fasta.fetch(region.chrom, p1 + aux_plus, p2 + aux_plus)).upper())
            if region.orientation == "+":
                for i in range(0, len(dna_seq)):
                    pwm_dict[dna_seq[i]][i] += 1
            elif region.orientation == "-":
                for i in range(0, len(dna_seq_rev)):
                    pwm_dict[dna_seq_rev[i]][i] += 1

            # Create bias signal
            bias_table_f = table[0]
            bias_table_r = table[1]
            # FIX: dict.keys() is a non-subscriptable view on Python 3;
            # wrap in list() (same idiom as bias_correction_dnase above).
            self.k_nb = len(list(bias_table_f.keys())[0])
            bias_signal_f = []
            bias_signal_r = []
            p1_wk = p1 - int(self.k_nb / 2)
            p2_wk = p2 + int(self.k_nb / 2)
            dna_seq = str(fasta.fetch(region.chrom, p1_wk, p2_wk - 1)).upper()
            dna_seq_rev = AuxiliaryFunctions.revcomp(
                str(fasta.fetch(region.chrom, p1_wk, p2_wk + 1)).upper())
            for i in range(int(self.k_nb / 2), len(dna_seq) - int(self.k_nb / 2) + 1):
                fseq = dna_seq[i - int(self.k_nb / 2):i + int(self.k_nb / 2)]
                rseq = dna_seq_rev[len(dna_seq) - int(self.k_nb / 2) - i:
                                   len(dna_seq) + int(self.k_nb / 2) - i]
                # Missing k-mers default to a neutral bias of 1.
                try:
                    bias_signal_f.append(bias_table_f[fseq])
                except Exception:
                    bias_signal_f.append(1)
                try:
                    bias_signal_r.append(bias_table_r[rseq])
                except Exception:
                    bias_signal_r.append(1)

            mean_bias_signal_f = np.add(mean_bias_signal_f, np.array(bias_signal_f))
            mean_bias_signal_r = np.add(mean_bias_signal_r, np.array(bias_signal_r))

            if self.protection_score:
                # signal in the center of the MPBS
                p1 = region.initial
                p2 = region.final
                nc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table,
                                                 downstream_ext=self.atac_downstream_ext,
                                                 upstream_ext=self.atac_upstream_ext,
                                                 forward_shift=self.atac_forward_shift,
                                                 reverse_shift=self.atac_reverse_shift,
                                                 genome_file_name=genome_data.get_genome())
                total_nc_signal += sum(nc_signal)
                # signal in the right flank (same width as the MPBS)
                p1 = region.final
                p2 = 2 * region.final - region.initial
                nr_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table,
                                                 downstream_ext=self.atac_downstream_ext,
                                                 upstream_ext=self.atac_upstream_ext,
                                                 forward_shift=self.atac_forward_shift,
                                                 reverse_shift=self.atac_reverse_shift,
                                                 genome_file_name=genome_data.get_genome())
                total_nr_signal += sum(nr_signal)
                # signal in the left flank (same width as the MPBS)
                p1 = 2 * region.initial - region.final
                p2 = region.final
                nl_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table,
                                                 downstream_ext=self.atac_downstream_ext,
                                                 upstream_ext=self.atac_upstream_ext,
                                                 forward_shift=self.atac_forward_shift,
                                                 reverse_shift=self.atac_reverse_shift,
                                                 genome_file_name=genome_data.get_genome())
                total_nl_signal += sum(nl_signal)

    # Per-site means. NOTE(review): num_sites == 0 (no ":Y" regions) still
    # yields nan/inf arrays or ZeroDivisionError, as in the original.
    mean_raw_signal = mean_raw_signal / num_sites
    mean_bc_signal = mean_bc_signal / num_sites
    mean_raw_signal_f = mean_raw_signal_f / num_sites
    mean_raw_signal_r = mean_raw_signal_r / num_sites
    mean_bc_signal_f = mean_bc_signal_f / num_sites
    mean_bc_signal_r = mean_bc_signal_r / num_sites
    mean_bias_signal_f = mean_bias_signal_f / num_sites
    mean_bias_signal_r = mean_bias_signal_r / num_sites

    protection_score = (total_nl_signal + total_nr_signal - 2 * total_nc_signal) / (2 * num_sites)

    # Output PWM and create logo
    pwm_fname = os.path.join(self.output_loc, "{}.pwm".format(self.motif_name))
    pwm_file = open(pwm_fname, "w")
    for e in ["A", "C", "G", "T"]:
        pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")
    pwm_file.close()

    logo_fname = os.path.join(self.output_loc, "{}.logo.eps".format(self.motif_name))
    # FIX: close the handle instead of leaking open(pwm_fname).
    with open(pwm_fname) as pwm_handle:
        pwm = motifs.read(pwm_handle, "pfm")
    pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line="100",
                color_scheme="color_classic", unit_name="", show_errorbars=False,
                logo_title="", show_xaxis=False, xaxis_label="", show_yaxis=False,
                yaxis_label="", show_fineprint=False, show_ends=False)

    # Output the raw, bias corrected signal and protection score
    output_fname = os.path.join(self.output_loc, "{}.txt".format(self.motif_name))
    output_file = open(output_fname, "w")
    if not self.strands_specific:
        output_file.write("raw signal: \n" + np.array_str(mean_raw_signal) + "\n")
        output_file.write("bias corrected signal: \n" + np.array_str(mean_bc_signal) + "\n")
    else:
        output_file.write("raw forward signal: \n" + np.array_str(mean_raw_signal_f) + "\n")
        output_file.write("bias corrected forward signal: \n" + np.array_str(mean_bc_signal_f) + "\n")
        output_file.write("raw reverse signal: \n" + np.array_str(mean_raw_signal_r) + "\n")
        output_file.write("bias reverse corrected signal: \n" + np.array_str(mean_bc_signal_r) + "\n")
    output_file.write("forward bias signal: \n" + np.array_str(mean_bias_signal_f) + "\n")
    output_file.write("reverse bias signal: \n" + np.array_str(mean_bias_signal_r) + "\n")
    if self.protection_score:
        output_file.write("protection score: \n" + str(protection_score) + "\n")
    output_file.close()

    # Figure: ax1 = bias signals, ax2 = (corrected) signal, ax3 only for
    # strand-specific mode.
    if self.strands_specific:
        fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
    else:
        fig, (ax1, ax2) = plt.subplots(2)
    x = np.linspace(-50, 49, num=self.window_size)

    ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
    ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')
    ax1.xaxis.set_ticks_position('bottom')
    ax1.yaxis.set_ticks_position('left')
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_position(('outward', 15))
    ax1.spines['bottom'].set_position(('outward', 5))
    ax1.tick_params(direction='out')
    ax1.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
    ax1.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
    min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
    max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
    ax1.set_yticks([min_bias_signal, max_bias_signal])
    ax1.set_yticklabels([str(round(min_bias_signal, 2)), str(round(max_bias_signal, 2))], rotation=90)
    ax1.text(-48, max_bias_signal, '# Sites = {}'.format(str(num_sites)), fontweight='bold')
    ax1.set_title(self.motif_name, fontweight='bold')
    ax1.set_xlim(-50, 49)
    ax1.set_ylim([min_bias_signal, max_bias_signal])
    ax1.legend(loc="upper right", frameon=False)
    ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')

    if not self.strands_specific:
        mean_raw_signal = self.standardize(mean_raw_signal)
        mean_bc_signal = self.standardize(mean_bc_signal)
        ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
        ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
    else:
        mean_raw_signal_f = self.standardize(mean_raw_signal_f)
        mean_raw_signal_r = self.standardize(mean_raw_signal_r)
        mean_bc_signal_f = self.standardize(mean_bc_signal_f)
        mean_bc_signal_r = self.standardize(mean_bc_signal_r)
        ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
        ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
        ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
        ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')

    ax2.xaxis.set_ticks_position('bottom')
    ax2.yaxis.set_ticks_position('left')
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_position(('outward', 15))
    ax2.tick_params(direction='out')
    ax2.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
    ax2.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
    ax2.set_yticks([0, 1])
    ax2.set_yticklabels([str(0), str(1)], rotation=90)
    ax2.set_xlim(-50, 49)
    ax2.set_ylim([0, 1])

    if not self.strands_specific:
        ax2.spines['bottom'].set_position(('outward', 40))
        ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
        ax2.set_ylabel("Average ATAC-seq \nSignal", rotation=90, fontweight='bold')
        ax2.legend(loc="center", frameon=False, bbox_to_anchor=(0.85, 0.06))
    else:
        ax2.spines['bottom'].set_position(('outward', 5))
        ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal", rotation=90, fontweight='bold')
        ax2.legend(loc="lower right", frameon=False)

        ax3.xaxis.set_ticks_position('bottom')
        ax3.yaxis.set_ticks_position('left')
        ax3.spines['top'].set_visible(False)
        ax3.spines['right'].set_visible(False)
        ax3.spines['left'].set_position(('outward', 15))
        ax3.tick_params(direction='out')
        ax3.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax3.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
        ax3.set_yticks([0, 1])
        ax3.set_yticklabels([str(0), str(1)], rotation=90)
        ax3.set_xlim(-50, 49)
        ax3.set_ylim([0, 1])
        ax3.legend(loc="lower right", frameon=False)
        ax3.spines['bottom'].set_position(('outward', 40))
        ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
        ax3.set_ylabel("Average ATAC-seq \n Corrected Signal", rotation=90, fontweight='bold')
        ax3.text(-48, 0.05,
                 '# K-mer = {}\n# Forward Shift = {}'.format(str(self.k_nb), str(self.atac_forward_shift)),
                 fontweight='bold')

    figure_name = os.path.join(self.output_loc, "{}.line.eps".format(self.motif_name))
    fig.subplots_adjust(bottom=.2, hspace=.5)
    fig.tight_layout()
    fig.savefig(figure_name, format="eps", dpi=300)

    # Creating canvas and printing eps / pdf with merged results
    output_fname = os.path.join(self.output_loc, "{}.eps".format(self.motif_name))
    c = pyx.canvas.canvas()
    c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
    if self.strands_specific:
        c.insert(pyx.epsfile.epsfile(2.76, 1.58, logo_fname, width=27.2, height=2.45))
    else:
        c.insert(pyx.epsfile.epsfile(2.5, 1.54, logo_fname, width=16, height=1.75))
    c.writeEPSfile(output_fname)
    os.system("epstopdf " + figure_name)
    os.system("epstopdf " + logo_fname)
    os.system("epstopdf " + output_fname)
def create_signal(args, regions):
    """Tabulate observed and expected k-mer counts around read cut sites.

    For each region, counts the k-mers centred on the (shifted) 5' cut site of
    every read per strand ("observed"), and the k-mers tiling the region's
    reference sequence per strand ("expected"). Writes four tab-separated
    files "<k>_{f,r}_{obs,exp}.fa" (k-mer, count; zero-count k-mers omitted)
    into args.output_location.

    Fixes vs. the previous revision:
      * each output file now iterates its OWN count dict (the old code keyed
        every file off r_obs_dict — same key set, but a latent bug).
      * output files are opened via context managers so they are closed even
        if a write fails.
    """
    def revcomp(s):
        # Reverse complement; 'N' maps to itself.
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        return "".join([rev_dict[e] for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    # One counter per (strand, observed/expected) combination.
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    for region in regions:
        # Fetching observed reads: k-mer centred on each read's shifted cut site.
        reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
        for read in reads:
            if not read.is_reverse:
                p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
            else:
                p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
            p2 = p1 + args.k_nb
            try:
                dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                # Out-of-bounds fetch near chromosome edges: skip the read.
                continue
            if 'N' not in dna_sequence_obs:
                if read.is_reverse:
                    r_obs_dict[revcomp(dna_sequence_obs)] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

        # Fetching whole sequence: expected counts from tiling the region.
        try:
            dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        dna_sequence_exp_rev = revcomp(dna_sequence_exp)
        for i in range(0, len(dna_sequence_exp) - args.k_nb):
            s = dna_sequence_exp[i:i + args.k_nb]
            if "N" not in s:
                f_exp_dict[s] += 1
            s = dna_sequence_exp_rev[i:i + args.k_nb]
            if "N" not in s:
                r_exp_dict[s] += 1

    # Write the four count tables; iteration over kmer_comb preserves the
    # original output order (dicts were built from kmer_comb).
    outputs = (("f_obs", f_obs_dict), ("f_exp", f_exp_dict),
               ("r_obs", r_obs_dict), ("r_exp", r_exp_dict))
    for suffix, counts in outputs:
        fname = os.path.join(args.output_location, "{}_{}.fa".format(str(args.k_nb), suffix))
        with open(fname, "w") as output_file:
            for kmer in kmer_comb:
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")
score, name2, cdsStartStat, cdsEndStat, exonFrames) = line.strip().split() exonStarts = map(int, exonStarts.split(',')[:-1]) exonEnds = map(int, exonEnds.split(',')[:-1]) assert len(exonEnds) == len(exonStarts) == int(exonCount) seq = '' for start, end in zip(exonStarts, exonEnds): if chrom in fa.references: seq += fa.fetch(chrom, start, end) if strand == '-': seq = rc(seq) #seq = seq + 'A'*100 # polyadenylate if seq: genes[name2].append(seq) for name in genes: for i, tx in enumerate(genes[name]): print ">%s.%d\n%s" % (name, i, tx) else:
chrName = ll[0]; p1 = int(ll[1]); p2 = int(ll[2]) # Starting result structures regionTagCount = 0 resVec = [globalMin,0,0,0,0,0] # BIT-SCORE, MOTIF_P1, MOTIF_P2, FP_OVERLAP, FP_P1, FP_P2 counter = 0 # Evaluating Overall TC try: regionTagCount = tag_count(chrName, p1, p2, dnaseBam, tcHalfWindow) except Exception: print "Exception TC raised in "+line writeOutput(ll,regionTagCount,resVec,outFile) continue # Fetching sequence try: sequence = str(genomeFile.fetch(chrName, p1, p2)) except Exception: print "Exception SEQUENCE raised in "+line writeOutput(ll,regionTagCount,resVec,outFile) continue # Fetching footprints try: footprints = fpBam.fetch(reference=chrName, start=p1, end=p2) except Exception: print "Exception FOOTPRINTS raised in "+line writeOutput(ll,regionTagCount,resVec,outFile) continue # Best mpbs maxPos = -99999 maxValue = globalMin
if (not r.is_reverse): p1 = r.pos - (k_nb / 2) - 1 # The -1 is because He is wrong else: p1 = r.aend - (k_nb / 2) + 1 # The +1 is because He is wrong p2 = p1 + k_nb # Verifying PCR artifacts if (p1 == prevPos): trueCounter += 1 else: prevPos = p1 trueCounter = 0 if (trueCounter > maxDuplicates): continue # Fetching k-mer try: currStr = str(fastaFile.fetch(chrom, p1, p2)).upper() except Exception: continue if (r.is_reverse): currStr = str(Seq(currStr).reverse_complement()) # Counting k-mer in dictionary try: obsDict[currStr] += 1 except Exception: obsDict[currStr] = 1 if (not r.is_reverse): try: obsDictF[currStr] += 1 except Exception: obsDictF[currStr] = 1 else:
vectorTable3.append(chrom) start = int(coordVec[1]) vectorTable1.append(str(start)) vectorTable2.append(str(start)) vectorTable3.append(str(start)) end = int(coordVec[2]) vectorTable1.append(str(end)) vectorTable2.append(str(end)) vectorTable3.append(str(end)) regionLength = end - start vectorTable1.append(str(regionLength)) vectorTable2.append(str(regionLength)) vectorTable3.append(str(regionLength)) # %CG content sequence = str(genomeFile.fetch(chrom, start, end)).upper() cgFreq = 0 for character in sequence: if (character == "C" or character == "G"): cgFreq += 1 vectorTable1.append(str(round(float(cgFreq) / regionLength, 2))) # CTCF status higherScore = -999. higherStrand = "NA" for k in ctcfIndexList: motifFile = motifFileList[k] motifFetch = motifFile.fetch(chrom, start, end) for read in motifFetch: rr = read.qname.split(":") motifScore = float(rr[1]) if (motifScore > higherScore):
class VariantAnnotator(object):
    """Annotate genomic variants against a gene database and reference FASTA.

    Builds interval trees of cytobands and transcripts at construction time,
    then classifies variants as intergenic / up-/downstream / UTR / intronic /
    exonic, including codon-level synonymous vs. non-synonymous calls for CDS
    hits. (Python 2 code: uses print statements in debug blocks.)
    """

    def __init__(self, gene_db, reference_fasta):
        # Reference sequence and gene-database connection.
        self.reference = Fastafile(reference_fasta)
        self.con = open_genedb(gene_db)
        # LRU cache of decoded genes, bounded in decode_gene().
        self.gene_cache = OrderedDict()

        # Cytoband lookup: chrom -> IntervalTree of bands. Also alias
        # 'chrN' -> 'N' so both naming styles resolve.
        self.band_map = band_map = defaultdict(IntervalTree)
        for band in get_cytobands(self.con):
            band_map[band.chrom].insert(band.start,band.end,band)
            if band.chrom.startswith('chr') and band.chrom[3:] not in band_map:
                band_map[band.chrom[3:]] = band_map[band.chrom]

        # Transcript lookup: chrom -> IntervalTree keyed on tx start/end.
        trans = get_transcripts(self.con)
        trans = progress_loop(trans, label='Loading transcripts: ', units='transcripts')

        self.feature_map = feature_map = defaultdict(IntervalTree)
        for gene in trans:
            feature_map[gene.chrom].insert(gene.txStart,gene.txEnd,gene)
            if 0: # DEBUG
                parts = self.decode_gene(gene)
                for part in parts:
                    if part.type not in ('intron','UTR5','UTR3','UTR') and '_' not in part.chrom:
                        print '\t'.join(map(str,[part.chrom,part.start,part.end,gene.symbol]))

        sys.stderr.write('Loading complete.\n')

    def decode_gene(self,gene):
        """Return an IntervalTree of this gene's parts, via a 300-entry LRU cache."""
        gene_cache = self.gene_cache
        key = gene.id
        try:
            # Cache hit: pop so re-insertion below moves it to the LRU tail.
            parts = gene_cache.pop(key)
        except KeyError:
            # Cache miss: decode and index the gene parts by coordinate.
            partlist = list(decode_gene(gene))
            parts = IntervalTree()
            for part in partlist:
                parts.insert(part.start,part.end,part)
            # Evict the least-recently-used entry when the cache is full.
            if len(gene_cache)>=300:
                gene_cache.popitem(0)

        # Add result to end of LRU
        gene_cache[key] = parts

        return parts

    def annotate(self, chrom, ref_start, ref_end, variant, nsonly=False):
        """Classify the variant at chrom:[ref_start, ref_end) and return GeneEvidence records.

        `variant` is the alternate allele ('-' characters are stripped, so a
        pure '-' means a deletion). `nsonly` is currently unused (the filter
        below is commented out).
        """
        variant = variant.replace('-','')

        ref_nuc = self.reference.fetch(chrom,ref_start,ref_end).upper()
        var_nuc = variant.upper()

        evidence = []
        # Classify against every transcript overlapping the variant.
        for feature in self.feature_map[chrom].find(ref_start, ref_end):
            evidence.extend( self.classify_feature(feature.value, ref_start, ref_end, ref_nuc, var_nuc) )

        #ns = any('NON-SYNONYMOUS' in e[3] for e in evidence)
        #if nsonly and not ns:
        #  return []

        # If not in a gene, check to see if there are any genes nearby
        if not evidence:
            five_prime = set()
            three_prime = set()

            # XOR with the strand flips the up-/downstream call for '-' genes.
            for feature in self.feature_map[chrom].find(ref_start-2000, ref_end+2000):
                gene = feature.value
                if (0<ref_end-gene.txStart<=2000) ^ (gene.strand=='-'):
                    five_prime.add(gene)
                else:
                    three_prime.add(gene)

            for gene in five_prime:
                evidence.append( ['UPSTREAM_GENE',gene,'',False,'','',ref_nuc,var_nuc,'',''] )
            for gene in three_prime:
                evidence.append( ['DOWNSTREAM_GENE',gene,'',False,'','',ref_nuc,var_nuc,'',''] )

        if not evidence:
            evidence.append( ['intergenic','','',False,'','',ref_nuc,var_nuc,'',''] )

        evidence = group_evidence(evidence)
        cytoband = cytoband_name(self.band_map[chrom].find_values(ref_start,ref_end))
        context  = [ chrom,cytoband,ref_start,ref_end ]

        if 0: # evidence:
            print
            for e in evidence:
                values = context+e
                for f,v in zip(GeneEvidence.__slots__,values):
                    print '%15s = %s' % (f,v)
                print

        evidence = [ GeneEvidence._make(context+e) for e in evidence ]

        return evidence

    def classify_feature(self, gene, ref_start, ref_end, ref_nuc, var_nuc):
        """Classify the variant relative to one gene's parts (CDS/UTR/intron)."""
        gene_parts = self.decode_gene(gene)

        # Bucket overlapping gene parts by type (e.g. 'CDS', 'UTR5', 'intron').
        intersect = defaultdict(list)
        for part in gene_parts.find_values(ref_start, ref_end):
            intersect[part.type].append(part)

        evidence = []

        parts = set(intersect)
        mut_type = set()

        # Flag near-splice-site variants: within 5 bp of a CDS/UTR boundary.
        for splice in gene_parts.find_values(ref_start-5,ref_end+5):
            if splice.type=='CDS' or 'UTR' in splice.type:
                if (0<splice.start-ref_end<=5) or (0<ref_start-splice.end<=5):
                    mut_type.add('POSSIBLE_INTRONIC_SPLICE_VARIANT')

        parts = ','.join(sorted(parts))
        mut_type = ','.join(sorted(mut_type))

        if len(intersect)==1 and len(intersect['CDS'])==1:
            # Variant is contained in exactly one CDS exon: do codon-level analysis.
            e = self.classify_exonic_variant(gene, gene_parts, intersect['CDS'][0],
                                             ref_start, ref_end, ref_nuc, var_nuc)
            evidence.append(e)
        elif len(intersect['CDS']):
            # Touches CDS plus other parts: assume protein-changing.
            evidence.append([parts,gene,'',True,'NON-SYNONYMOUS',mut_type,ref_nuc,var_nuc,'',''])
        elif mut_type:
            evidence.append([parts,gene,'',True,'PREDICTED-DISRUPT-TRANSCRIPT',mut_type,ref_nuc,var_nuc,'',''])
        elif len(intersect['UTR5'])+len(intersect['UTR3']):
            evidence.append([parts,gene,'',False,'UNKNOWN-UTR',mut_type,ref_nuc,var_nuc,'',''])
        elif len(intersect['intron']):
            evidence.append([parts,gene,'',False,'UNKNOWN-INTRONIC',mut_type,ref_nuc,var_nuc,'',''])
        else:
            evidence.append([parts,gene,'',False,'UNKNOWN-INTERGENIC',mut_type,ref_nuc,var_nuc,'',''])

        return evidence

    def classify_exonic_variant(self, gene, gene_parts, cds, ref_start, ref_end, ref_nuc, var_nuc):
        """Codon-level classification of a variant fully inside one CDS exon."""
        result = ['CDS',gene,'mRNA=%s:protein=%s:exon=%d:strand=%s' % \
                          (gene.mRNA,gene.protein,cds.exon_num,gene.strand)]

        # Variant coordinates relative to the start of this CDS exon.
        exon_start = ref_start - cds.start
        exon_end = ref_end - cds.start

        # FIXME: Report ref and var nuc relative to gene strand
        var_nuc = var_nuc.upper()

        #print gene.chrom,ref_start,ref_end,ref_nuc,var_nuc
        #assert len(ref_nuc)==(ref_end-ref_start)

        if ref_nuc==var_nuc:
            result += [False,'SYNONYMOUS','REFERENCE',ref_nuc,var_nuc,'','']
            return result

        ref_frame = len(ref_nuc)%3
        var_frame = len(var_nuc)%3
        frameshift = (len(ref_nuc)-len(var_nuc))%3

        if 0:
            print '  REF_FRAME: %d' % ref_frame
            print '  VAR_FRAME: %d' % var_frame

        mut_type = []

        if len(ref_nuc)==len(var_nuc):
            mut_type.append('SUBSTITUTION')
        elif len(ref_nuc)>len(var_nuc):
            mut_type.append('DELETION')
        else:
            mut_type.append('INSERTION')

        if exon_start<5:
            mut_type.append('POSSIBLE-SPLICE5')
        if cds.end-exon_end<5:
            mut_type.append('POSSIBLE-SPLICE3')

        if ref_frame!=var_frame:
            # Frameshift: no point translating; report and return early.
            mut_type.append('FRAMESHIFT')
            mut_type = ','.join(sorted(mut_type))
            result += [True,'NON-SYNONYMOUS',mut_type,ref_nuc,var_nuc,'','']
            return result

        # FIXME: Request 100 bases beyond end of transcription
        # Build the reference CDS sequence exon by exon, tracking the offset
        # of the variant within the concatenated CDS.
        ref_var_start = 0
        ref_cds_seq = []
        for part in gene_parts:
            if part.type=='CDS':
                seq = Seq(self.reference.fetch(part.chrom,part.start,part.end))
                #assert len(seq)==(end-start)
                ref_cds_seq.append(seq)
                if part.cds_index<cds.cds_index:
                    ref_var_start += len(seq)
                elif part.cds_index==cds.cds_index:
                    ref_var_start += exon_start

        #assert ref_nuc==str(ref_cds_seq[cds.cds_index][exon_start:exon_end]).upper()

        if 0:
            print '    CDS  : %d-%d' % (cds.start,cds.end)
            print '    VAR  : %d-%d' % (ref_start,ref_end)
            print '    LOCAL: %d-%d (size=%d)' % (exon_start,exon_end,len(ref_cds_seq[cds.cds_index]))

        # Splice the variant allele into a copy of the CDS.
        var_cds_seq = ref_cds_seq[:]

        v = list(var_cds_seq[cds.cds_index])
        v[exon_start:exon_end] = list(var_nuc)
        var_cds_seq[cds.cds_index] = ''.join(v)

        ref_cds = Seq(''.join(str(s) for s in ref_cds_seq))
        var_cds = Seq(''.join(str(s) for s in var_cds_seq))

        if gene.strand=='-':
            # Work on the coding strand for '-' genes.
            ref_var_start = len(ref_cds)-ref_var_start-1
            ref_cds = ref_cds.reverse_complement()
            var_cds = var_cds.reverse_complement()
            ref_cds_nuc = str(Seq(ref_nuc).reverse_complement())
            var_cds_nuc = str(Seq(var_nuc).reverse_complement())
        else:
            ref_cds_nuc = ref_nuc
            var_cds_nuc = var_nuc

        try:
            ref_cds_aa = ref_cds.translate()
            var_cds_aa = var_cds.translate()
        except TranslationError:
            mut_type.append('INVALID_TRANSLATION')
            mut_type = ','.join(sorted(mut_type))
            result += [True,'PRESUMED_NON-SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,'','']
            return result

        ref_aa,var_aa,aa_position = reduce_match(str(ref_cds_aa),str(var_cds_aa))

        if not ref_aa and not var_aa:
            # Protein unchanged: synonymous. Report the affected codon(s).
            mut_type = ','.join(sorted(mut_type))

            codon_start = ref_var_start-ref_var_start%3
            codon_end = ref_var_start+len(ref_nuc)
            if codon_end%3:
                codon_end += 3-codon_end%3

            aa_position = codon_start//3
            ref_frame = ref_cds[codon_start:codon_end]
            ref_aa = ref_frame.translate()
            #assert len(ref_aa)

            result[-1] += ':aa=%d' % (aa_position+1)
            result += [False,'SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,str(ref_aa),str(ref_aa)]
            return result

        # Classify non-synonymous change by comparing AA sequences
        # Make sure ref protein doesn't appear to have spurious stops
        r = ref_cds_aa.rstrip('*')
        v = var_cds_aa.rstrip('*')
        ref_stop = r.find('*')
        var_stop = v.find('*')

        if ref_stop==-1:
            if var_stop!=-1 and not v.startswith(r):
                mut_type.append('PREMATURE_STOP')
            elif ref_cds_aa[-1]=='*' and var_cds_aa[-1]!='*':
                mut_type.append('LOSS_OF_STOP')

        if 0:
            print '  REF_NUC:',ref_cds_nuc
            print '  VAR_NUC:',var_cds_nuc
            print '   REF_AA:',ref_aa
            print '   VAR_AA:',var_aa
            #print '  NUC_DIFF:',levenshtein_sequence(str(ref_cds),str(var_cds))
            #print '   AA_DIFF: ',levenshtein_sequence(str(ref_aa), str(var_aa) )
            ref_size = ref_end-ref_start
            cds_size = len(ref_cds)
            print '  CDS_SIZE=%d (%.1f codons)' % (cds_size,cds_size/3.0)
            print '  CDS SEQ=%s' % ref_cds

        # Sanity check: a non-empty CDS should start with the ATG start codon.
        assert not ref_cds or str(ref_cds[:3])=='ATG'

        mut_type = ','.join(sorted(mut_type))
        result[-1] += ':aa=%d' % (aa_position+1)
        result += [True,'NON-SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,str(ref_aa),str(var_aa)]
        return result
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow: SNP calling and annotation.

    Pipeline: one vcf per (chromosome, group) via samtools mpileup; merge and
    annotate SNPs; write an alignment of SNP positions across samples; build
    UCSC tracks, plus optional coverage/heterozygosity/quality bigWigs.

    :param ex: bein execution object; result files are attached to it with `ex.add`.
    :param job: job description; `job.files`/`job.groups` map group ids to runs/names.
    :param assembly: genome assembly providing `fasta_by_chrom`, `chrmeta`, `name`.
    :param minsnp: threshold passed (as float) to `all_snps` — minimum SNP support.
    :param mincov: minimum coverage threshold passed to `all_snps`.
    :param path_to_ref: unused here; kept for backward-compatible signature.
    :param via: submission method for non-blocking calls (e.g. 'local', 'lsf').
    :param logfile, debugfile: streams receiving progress/debug messages.
    :returns: 0 on completion.
    """
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom, {}) for chrom in ref_genome.keys())  # {chr: {gid: vcf}}
    bams = {}
    # Rewrite the BAM header so sequence names use the assembly's accession ids,
    # keeping chromosome naming consistent through mpileup.
    bam = Samfile(job.files.values()[0].values()[0]['bam'])
    header = bam.header
    bam.close()  # opened only to read the header; close to avoid a handle leak
    headerfile = unique_filename_in()
    for h in header["SQ"]:
        if h["SN"] in assembly.chrmeta:
            h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
    head = Samfile(headerfile, "wh", header=header)
    head.close()
    # Launch the jobs
    for gid in job.files.keys():
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        if len(runs) > 1:
            _b = merge_bam(ex, runs)
            index_bam(ex, _b)
            bams[gid] = _b
        else:
            index_bam(ex, runs[0])
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom, ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write(" ...Group %s running.\n" % job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in job.files.keys():
        for chrom, ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write(" ...Group %s done.\n" % job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom, v in vcfs.iteritems():
        for gid, vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom))
    tarfh.close()
    ex.add(tarname, description=set_file_descr("vcf_files.tar.gz", step="pileup",
                                               type="tar", view='admin'))

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa']
                                 + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s, '') for s in [assembly.name] + sample_names)
    for chrom, v in vcfs.iteritems():
        logfile.write(" > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write(" - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall, assembly,
                           headerfile, sample_names, mincov, float(minsnp),
                           logfile, debugfile, via)
        # Annotate SNPs and check synonymy
        logfile.write(" - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome,
                  logfile, debugfile)
        # Accumulate one base per sample per SNP row for the alignment:
        # '-' means "same as reference" (column 3); anything outside ACGT becomes 'N'.
        for snprow in allsnps:
            for n, k in enumerate([assembly.name] + sample_names):
                base = snprow[3 + n][0]
                if base == "-":
                    base = snprow[3][0]
                if base not in 'ACGTacgt':
                    base = "N"
                msa_table[k] += base
    description = set_file_descr("allSNP.txt", step="SNPs", type="txt")
    ex.add(outall, description=description)
    description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt")
    ex.add(outexons, description=description)
    # Write the SNP alignment; header line is "<nseq> <seqlen>" (PHYLIP-like).
    msafile = unique_filename_in()
    with open(msafile, "w") as msa:
        msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0])))
        for name, seq in msa_table.iteritems():
            msa.write("%s\t%s\n" % (name, seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt")
    ex.add(msafile, description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex, outall, sample_names, assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        """Scan a pileup iterator restricted to [startpos, endpos) and return
        three per-position score vectors (start, end, score):
        (coverage, heterozygosity, mean base quality)."""
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([], [], [])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position - startpos]
            ref = atoi.get(ref_symbol, 4)  # 4 = anything that is not A/C/G/T
            symbols = [0, 0, 0, 0, 0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                # Reads whose query position falls outside the read sequence
                # contribute no base here; drop them from the coverage count.
                if pileupread.qpos >= len(pileupread.alignment.seq):
                    coverage -= 1
                else:
                    symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                    quality += ord(pileupread.alignment.qual[pileupread.qpos]) - 33  # Phred+33
            # Fix: guard the mean — coverage can reach 0 when every read at this
            # column was dropped above, which previously raised ZeroDivisionError.
            quality = float(quality) / coverage if coverage else 0.0
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0: vectors[0].append((position, position + 1, coverage))
            if info > 0: vectors[1].append((position, position + 1, info))
            if quality > 0: vectors[2].append((position, position + 1, quality))
        return vectors

    if job.options.get('make_bigwigs', False):
        _descr = {'groupId': 0, 'step': "tracks", 'type': "bigWig", 'ucsc': '1'}
        for gid, bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile, format="bam")
            covname = unique_filename_in() + ".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in() + ".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in() + ".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                # process fasta and bam by 10Mb chunks
                for chunk in range(0, cinfo["length"], 10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk + 10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk + 10**7),
                                           fastaseq, chunk, chunk + 10**7)
                    out_cov.write(vecs[0], fields=['start', 'end', 'score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start', 'end', 'score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start', 'end', 'score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name'] + "_coverage.bw", **_descr)
            ex.add(covname, description=description)
            description = set_file_descr(job.groups[gid]['name'] + "_heterozygosity.bw", **_descr)
            ex.add(hetname, description=description)
            description = set_file_descr(job.groups[gid]['name'] + "_quality.bw", **_descr)
            ex.add(qualname, description=description)
    return 0