def encodeSeq(self, bucketIndex, seq, pos):
    """Encode the bases of ``seq`` on both sides of its bucket index.

    The left part, ``seq[:pos]``, is walked as its reverse complement
    starting from the reverse complement of ``bucketIndex`` and written to
    ``self.encodeSeqPathL``.  The right part, ``seq[pos + self.indexLen:]``,
    is walked starting from ``bucketIndex`` and written to
    ``self.encodeSeqPathR``.

    For every base the successors of the current k-mer ``K`` (as returned by
    ``self.Successors_in_graph``) select what is emitted:

    * exactly one successor: one bit is appended to ``self.numFlag``
      (``'0b0'`` if the base equals the successor's last base, ``'0b1'``
      otherwise); on a mismatch a symbol is also written with ``self.freq3``;
    * no successor: the base is written with ``self.freq4``;
    * several successors: ``self.getFreqs(K)`` is called and the base is
      written with ``self.freqs``.

    The four ``*NodeNum`` counters are incremented according to the case.

    NOTE(review): this block was recovered from a whitespace-collapsed
    source.  The nesting of the two ``K = ...`` updates below is a
    reconstruction and must mirror ``decompressGraph`` -- confirm against
    the upstream file.

    Args:
        bucketIndex: index k-mer of the bucket (converted with ``str``).
        seq: the read to encode.
        pos: offset of the index k-mer inside ``seq``.

    Returns:
        None.
    """
    #encode the left part
    K = dna.reverse_complement(str(bucketIndex))
    for base in dna.reverse_complement(seq[:pos]):
        pred = self.Successors_in_graph(K)
        if len(pred) == 1:
            if pred[0][-1] != base:
                # The base differs from the single predicted successor.
                self.newNodeNum += 1
                self.numFlag.append('0b1')
                symbol = self.dna2num[base]
                # Shift the code down by one when the base compares greater
                # than the predicted base; the decoder undoes this with
                # ``symbol + 1``.  Presumably this skips the predicted
                # base's slot in a 3-symbol alphabet -- TODO confirm.
                if base > pred[0][-1]:
                    symbol = self.dna2num[base] - 1
                self.encodeSeqPathL.write(
                    self.freq3, symbol)  #save the reverse complement sequence
            else:
                # The base is the predicted one: only the flag bit is stored.
                self.simpleNodeNum += 1
                self.numFlag.append('0b0')
            # NOTE(review): reconstructed as applying to both branches above,
            # i.e. the walk follows the existing successor even on a mismatch.
            K = pred[0]
        else:
            if len(pred) == 0:
                # No successor known for K.
                self.tipNodeNum += 1
                self.encodeSeqPathL.write(self.freq4, self.dna2num[base])
            else:
                # Several successors: getFreqs(K) is expected to refresh
                # self.freqs before it is used -- TODO confirm.
                self.bifurNodeNum += 1
                self.getFreqs(K)
                self.encodeSeqPathL.write(self.freqs, self.dna2num[base])
            K = self.Suffix(K) + base
    #encode the right part
    K = str(bucketIndex)
    for base in seq[pos + self.indexLen:]:
        succ = self.Successors_in_graph(K)
        if len(succ) == 1:
            if succ[0][-1] != base:
                self.newNodeNum += 1
                self.numFlag.append('0b1')
                symbol = self.dna2num[base]
                # Same code shift as in the left part.
                if base > succ[0][-1]:
                    symbol = self.dna2num[base] - 1
                self.encodeSeqPathR.write(self.freq3, symbol)
            else:
                self.simpleNodeNum += 1
                self.numFlag.append('0b0')
            # NOTE(review): same reconstructed nesting as in the left part.
            K = succ[0]
        else:
            if len(succ) == 0:
                self.tipNodeNum += 1
                self.encodeSeqPathR.write(self.freq4, self.dna2num[base])
            else:
                self.bifurNodeNum += 1
                self.getFreqs(K)
                self.encodeSeqPathR.write(self.freqs, self.dna2num[base])
            K = self.Suffix(K) + base
    return
def addtoGraph(self, seq, pos):
    """Add the k-mers of ``seq`` to the counts kept in ``self.bucketDict``.

    Each key of ``self.bucketDict`` is a k-mer without its extending base and
    each value is a list of four counters (created as ``[1, 1, 1, 1]``)
    indexed through ``self.dna2num``.

    Args:
        seq: the read whose k-mers are added.
        pos: position forwarded unchanged to ``getKmerR`` / ``getKmerL``.

    Returns:
        None.
    """
    # k-mers from getKmerR: the context is everything but the last base and
    # the counter is selected by that last base.
    for right_kmer in self.getKmerR(seq, pos):
        context = str(right_kmer[:-1])
        self.bucketDict.setdefault(context, [1, 1, 1, 1])
        slot = self.dna2num[str(right_kmer[-1])]
        self.bucketDict[context][slot] += 1

    # k-mers from getKmerL: the context is the reverse complement of
    # everything but the first base; the counter index is mirrored (3 - code).
    for left_kmer in self.getKmerL(seq, pos):
        context = dna.reverse_complement(left_kmer[1:])
        self.bucketDict.setdefault(context, [1, 1, 1, 1])
        slot = 3 - self.dna2num[left_kmer[0]]
        self.bucketDict[context][slot] += 1
    return
def decompressGraph(self, bucketIndex, indexPos): sequence = bucketIndex #decode the left path K = dna.reverse_complement(bucketIndex) base = "" for i in range(indexPos): pred = self.Successors_in_graph(K) if len(pred) == 1: if self.numFlag.read(bool, 1)[0]: # New node: self.newNodeNum += 1 symbol = self.decodeSeqPathL.read(self.freq3) base = self.num2dna[symbol] if base >= pred[0][-1]: base = self.num2dna[symbol + 1] else: self.simpleNodeNum += 1 base = pred[0][-1] K = pred[0] else: if len(pred) == 0: self.tipNodeNum += 1 symbol = self.decodeSeqPathL.read(self.freq4) base = self.num2dna[symbol] else: self.bifurNodeNum += 1 self.getFreqs(K) symbol = self.decodeSeqPathL.read(self.freqs) base = self.num2dna[symbol] K = self.Suffix(K) + base sequence = self.recdna[base] + sequence #decode the right path K = bucketIndex for i in range(self.seqLen - indexPos - self.indexLen): succ = self.Successors_in_graph(K) if len(succ) == 1: if self.numFlag.read(bool, 1)[0]: # New node self.newNodeNum += 1 symbol = self.decodeSeqPathR.read(self.freq3) base = self.num2dna[symbol] if base >= succ[0][-1]: base = self.num2dna[symbol + 1] else: self.simpleNodeNum += 1 base = succ[0][-1] K = succ[0] else: if len(succ) == 0: self.tipNodeNum += 1 symbol = self.decodeSeqPathR.read(self.freq4) base = self.num2dna[symbol] else: self.bifurNodeNum += 1 self.getFreqs(K) symbol = self.decodeSeqPathR.read(self.freqs) base = self.num2dna[symbol] K = self.Suffix(K) + base sequence += base return sequence
def extract_windows_score(N, H, bg, location, w, params):
    """Score the position windows in which word ``w`` is over-represented.

    Args:
        N: mapping word length -> list of scanned-position counts, indexed
            by position relative to ``location[0]``.
        H: mapping word -> list of occurrence positions.
        bg: background model providing ``mu(w, location)`` and
            ``freq(w, (a, b))``.
        location: ``(start, end)`` of the analysed region.
        w: the word to analyse; must be a key of ``H``.
        params: dict with ``'ratio'``, ``'occ'`` (min, max occurrences) and
            ``'width'`` (min, max window width).

    Returns:
        A list of rows ``[w, label, obsFreq, expFreq, obsOcc, expOcc, pv,
        ev, -log10(ev), a, b, width, len(R), n, 0, 0]``; empty when the
        total number of occurrences is outside ``params['occ']``.
    """
    R = []
    ratio = params['ratio']
    origin = location[0]

    # Convert occurrence positions to coordinates relative to the region.
    l = [p - origin for p in H[w]]
    min_occ, max_occ = params['occ'][0], params['occ'][1]
    if len(l) < min_occ or len(l) > max_occ:
        return []

    scanned = N[len(w)]

    # Keep only the positions at which a word of this length can start.
    # (``mu[:-len(w) + 1]`` would be ``mu[:0]`` -- i.e. empty -- for a
    # one-letter word, so the end index is computed explicitly.)
    mu = bg.mu(w, location)
    mu = mu[:max(0, len(mu) - len(w) + 1)]
    mu = [m * scanned[i] for i, m in enumerate(mu)]
    alpha = [x * ratio for x in mu]

    for a, b, obsOcc in score(l, alpha, mu):
        a, b = a + origin, b + origin
        width = b - a + 1
        if width < params['width'][0] or width > params['width'][1]:
            continue
        # Test the number of occurrences inside the window.
        if obsOcc < min_occ or obsOcc > max_occ:
            continue

        n = sum(scanned[a - origin:b - origin + 1])
        try:
            obsFreq = obsOcc / float(n)
        except ZeroDivisionError:
            # No scanned position in this window: no frequency can be
            # computed, so the window is skipped instead of reusing a stale
            # (or undefined) obsFreq.
            cli.warning('n error')
            continue

        expFreq = bg.freq(w, (a, b))
        expOcc = expFreq * n
        pv = Stats.dist.ppois(obsOcc, expOcc)
        ev = 1.0  # placeholder, replaced once the number of windows is known
        label = '%s|%s' % (w, reverse_complement(w))
        R.append([
            w, label, obsFreq, expFreq, obsOcc, expOcc, pv, ev, -log10(ev),
            a, b, width, 0, n, 0, 0
        ])

    # Correct the p-values for the number of words and of retained windows.
    for r in R:
        r[7] = r[6] * len(H) * len(R)
        r[8] = -log10(r[7])
        r[12] = len(R)
    return R
def oligo2MM(filename):
    """Load a Markov model from an oligo-analysis formatted file.

    Lines starting with ``#`` or ``;`` are comments; a comment containing
    ``grouped by pairs of reverse complements`` switches on reverse
    complement handling.  Every other non-blank line is split on whitespace:
    column 0 is the oligo, column 2 its frequency and column 3 its count.
    The model order is the length of the first oligo minus one.

    Args:
        filename: path of the file; read through ``gzip`` when it ends with
            ``.gz``.

    Returns:
        The ``MM`` instance, with ``S``, ``T`` and ``priori`` filled and
        ``freq()`` already called.

    Raises:
        ValueError: if the file contains no oligo line.
    """
    opener = gzip.open if filename.endswith('.gz') else open
    rc = False
    priori = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    mm = None

    # Text mode is requested explicitly: gzip.open() defaults to binary,
    # which would make the str prefix tests below fail on Python 3.
    with opener(filename, 'rt') as f:
        for line in f:
            if line.startswith(('#', ';')):
                if 'grouped by pairs of reverse complements' in line:
                    rc = True
                continue
            elements = line.strip().split()
            if not elements:
                continue  # tolerate blank lines
            # The count column is not used but is still parsed, so a
            # malformed line is rejected exactly as before.
            w, freq, count = elements[0], float(elements[2]), int(elements[3])
            w = w.upper()

            # The first oligo fixes the Markov order.
            if mm is None:
                mm = MM(len(w) - 1, pseudo=0.0)

            if rc:
                wrc = reverse_complement(w)
                # NOTE(review): nesting reconstructed from a collapsed
                # source -- the pair's frequency is split in two and the
                # reverse complement is only added for non-palindromic
                # oligos.
                if w != wrc:
                    freq = freq / 2.0
                    _oligo2MM_add(mm, priori, wrc, freq)
            _oligo2MM_add(mm, priori, w, freq)

    if mm is None:
        raise ValueError('no oligo found in %s' % filename)

    S = float(sum(priori.values()))
    mm.priori = [priori[b] / S for b in ALPHABET]
    mm.freq()
    return mm


def _oligo2MM_add(mm, priori, word, freq):
    """Add one oligo with frequency ``freq`` to ``mm.S``, ``mm.T`` and ``priori``."""
    prefix = word[:-1]
    mm.S[prefix] = mm.S.get(prefix, 0) + freq
    mm.T[prefix] = mm.T.get(prefix, {})
    mm.T[prefix][word[-1]] = freq
    for letter in word:
        priori[letter] += freq
def count_words_hash(sequences, l, searchLocation, strand='+-', overlap=False):
    """Count every word of length ``l`` found in ``sequences``.

    Args:
        sequences: objects exposing ``sequence`` and ``location``.
        l: oligonucleotide length.
        searchLocation: ``(start, end)`` tuple, for example ``(-200, -1)``.
        strand: ``'+'`` to count words as read, ``'+-'`` to count the
            smaller of each word and its reverse complement.
        overlap: when false, an occurrence starting less than ``l``
            positions after the previous occurrence of the same word in the
            same sequence is ignored.

    Returns:
        A dict with ``N`` (scanned bases per position, keyed by word size),
        ``H`` (word -> list of occurrence positions), ``scannedPositions``
        and ``scannedWords``.
    """
    location = find_location(sequences)
    first, last = searchLocation[0], searchLocation[1]

    H = {}
    N = {l: [0] * (last - first + 1)}
    per_position = N[l]
    scannedPositions = 0
    scannedWords = 0

    info = cli.Info(len(sequences), 1, 1)
    for s in sequences:
        info('Counting words in [%+05d:%+05d]' % (first, last))
        seq = s.sequence
        HS = {}  # occurrences inside the current sequence only
        a = max(first, s.location[0])
        b = min(last, s.location[1] - l + 1)

        for I in range(a, b + 1):
            scannedPositions += 1
            i = I - s.location[0]
            window = seq[i:i + l]
            if window.find('N') != -1:
                continue

            if strand == '+':
                w = window
            elif strand == '+-':
                w = min(window, reverse_complement(window))

            per_position[I - first] += 1

            if not overlap:
                # a - l is the sentinel meaning "no previous occurrence".
                previous = HS.get(w, [a - l])[-1]
                if previous + l > I:
                    continue

            H.setdefault(w, []).append(I)
            HS.setdefault(w, []).append(I)
            scannedWords += 1

    return {
        'N': N,
        'H': H,
        'scannedPositions': scannedPositions,
        'scannedWords': scannedWords,
    }
def reSortSeqence(self):
    """File the current single read under its bucket index.

    The index, orientation flag and index position come from
    ``self.getMostCommenIndex()``.  The bucket's counter in
    ``self.bucketTable`` is incremented, ``self.read`` receives the
    (possibly reverse-complemented) sequence plus the ``'reverse'`` and
    ``'indexPos'`` fields, and a deep copy of ``self.read`` is appended to
    ``self.sequenceTable[index]``.

    Returns:
        None.
    """
    index, reverseFlag, indexPos = self.getMostCommenIndex()

    already_seen = self.bucketTable.get(index, 0)
    self.bucketTable[index] = already_seen + 1

    raw = str(self.record['sequence'])
    self.read['sequence'] = dna.reverse_complement(raw) if reverseFlag else raw
    self.read['reverse'] = reverseFlag
    self.read['indexPos'] = indexPos

    # Deep copy: self.read is reused for the next read.
    bucket = self.sequenceTable.setdefault(index, [])
    bucket.append(copy.deepcopy(self.read))
    return
def sanitize_codon_list(codon_list, forbidden_seqs=()):
    """
    Make silent mutations to the given codon list to remove any undesirable
    sequences that are present within it.

    Undesirable sequences are the given forbidden sequences (names found in
    ``restriction_sites`` are replaced by the corresponding site), their
    reverse complements, and homopolymers one base longer than
    ``gen9.homopolymer_max_lengths`` allows.

    Args:
        codon_list: list of 3-base codons; modified in place.
        forbidden_seqs: optional iterable of restriction site names or
            literal sequences.

    Returns:
        The number of corrections made to the codon list.

    Raises:
        ValueError: if a codon does not have exactly 3 bases.
    """
    # Unit test missing for:
    #   Homopolymer fixing

    for codon in codon_list:
        if len(codon) != 3:
            raise ValueError("Codons must have exactly 3 bases: '{}'".format(codon))

    # Compile a collection of all the sequences we don't want to appear in
    # the gene.  set.union() returns a new set and leaves the receiver
    # untouched, so the collection is built with in-place updates; each
    # right-hand side is fully evaluated before the set is modified.
    bad_seqs = {restriction_sites.get(seq, seq) for seq in forbidden_seqs}
    bad_seqs |= {dna.reverse_complement(seq) for seq in bad_seqs}
    bad_seqs |= {
        base * (gen9.homopolymer_max_lengths[base] + 1)
        for base in dna.dna_bases
    }

    # Sorted so that the order of the corrections does not depend on set
    # iteration order.
    bad_seqs = [dna.dna_to_re(bs) for bs in sorted(bad_seqs)]

    # Remove every bad sequence from the gene by making silent mutations to
    # the codon list.
    num_corrections = 0

    for bad_seq in bad_seqs:
        while remove_bad_sequence(codon_list, bad_seq, bad_seqs):
            num_corrections += 1

    return num_corrections
def group_rc(c):
    """Merge the entries of ``c`` by reverse-complement pair.

    Each pair is stored under the smaller of the word and its reverse
    complement.  When the reverse complement is a different word that is
    also present in ``c``, the two values are added; otherwise the word's
    value is doubled.

    TODO : grouprc and overlapping on both strands

    Args:
        c: mapping word -> value.

    Returns:
        A new dict keyed by the representative of each pair.
    """
    groupedc = {}
    for word in c:
        partner = dna.reverse_complement(word)
        representative = min(word, partner)
        if representative not in groupedc:
            groupedc[representative] = c[word]
            partner_counted = word != partner and partner in c
            if partner_counted:
                groupedc[representative] += c[partner]
            else:
                groupedc[representative] *= 2
    return groupedc
def getMostCommenIndex(self):
    """Choose the bucket index for the current read.

    Every bucket k-mer of the read, then of its reverse complement, is
    scored with ``self.getCommenOverlapKmer``; the first k-mer reaching the
    highest score above zero is chosen.  When no k-mer scores above zero the
    result of ``self.getReadMinKmers()`` is used instead.  The chosen index
    is registered through ``self.addTomutiDict`` together with the read in
    the chosen orientation.

    Returns:
        ``(index, reverseFlag, indexPos)``: the chosen k-mer, whether it was
        found on the reverse complement, and its rank among the bucket
        k-mers of that orientation.
    """
    read = self.record['sequence']
    rcread = dna.reverse_complement(str(read))

    freq_kmer = 0
    mostIndex = 0
    indexPos = 0
    reverseFlag = False

    # Forward orientation first, so that it wins ties.
    for is_reverse, oriented in ((False, read), (True, rcread)):
        for position, kmer in enumerate(self.getBucketKmers(oriented)):
            overlap = self.getCommenOverlapKmer(kmer, oriented)
            if overlap > freq_kmer:
                freq_kmer = overlap
                mostIndex = kmer
                reverseFlag = is_reverse
                indexPos = position

    if freq_kmer > 0:
        chosen = mostIndex
    else:
        # No common index: fall back to the minimal k-mer of the read.
        chosen, reverseFlag, indexPos = self.getReadMinKmers()

    #add new kmer to mutiDict
    self.addTomutiDict(chosen, rcread if reverseFlag else read)
    return chosen, reverseFlag, indexPos
def __getitem__(self, key):
    """Return the probability of word ``key``.

    For a dyad model the probability of a word is the product of
    ``self.P`` for its first and last ``self.monad`` letters; otherwise it
    is ``self.P(key)``.  On both strands (``self.strand == '+-'``) the
    probability of the reverse complement is added, unless the word is its
    own reverse complement.
    """

    def single_strand(word):
        if self.dyad:
            return self.P(word[:self.monad]) * self.P(word[-self.monad:])
        return self.P(word)

    if self.strand == '+-':
        wrc = reverse_complement(key)
        if key != wrc:
            return single_strand(key) + single_strand(wrc)
    return single_strand(key)
def reassigned(self):
    """Re-bucket every read that is alone in its bucket.

    For each index whose count in ``self.bucketTable`` is 1, the stored read
    is restored into ``self.record`` / ``self.read`` (reverse-complemented
    back if it was stored reversed), the index is removed from
    ``self.mutiDict``, ``self.bucketTable`` and ``self.sequenceTable``, and
    ``self.reSortSeqence()`` is called to file the read again.

    Returns:
        None.
    """
    # Iterate over a snapshot of the keys: the loop deletes entries and
    # reSortSeqence() may insert new ones, which is not allowed while
    # iterating the live dict view.
    for countIndex in list(self.bucketTable.keys()):
        if self.bucketTable[countIndex] != 1:
            continue

        entry = self.sequenceTable[countIndex][0]
        if entry['reverse']:
            # Recover the raw sequence.
            self.record['sequence'] = dna.reverse_complement(
                str(entry['sequence']))
        else:
            self.record['sequence'] = entry['sequence']

        # Carry the per-read information over for the reassignment.
        self.read['N'] = entry['N']
        self.read['order'] = entry['order']
        self.read['len'] = entry['len']
        self.seqLen = len(self.record['sequence'])

        del self.mutiDict[countIndex]
        del self.bucketTable[countIndex]
        del self.sequenceTable[countIndex]
        self.reSortSeqence()
    return
def getReadMinKmers(self):
    """Find the smallest bucket k-mer of the current read, on either strand.

    The k-mers of the read are examined first, then those of its reverse
    complement; a k-mer replaces the current minimum only when it is
    strictly smaller, starting from ``self.maxKmer``.

    Returns:
        ``(minKmer, reverseFlag, indexPos)``: the smallest k-mer found (or
        ``self.maxKmer`` if none is smaller), whether it comes from the
        reverse complement, and its rank among the k-mers of that
        orientation.
    """
    read = self.record['sequence']
    rcread = dna.reverse_complement(str(read))

    minKmer = self.maxKmer
    reverseFlag = False
    indexPos = 0

    for is_reverse, oriented in ((False, read), (True, rcread)):
        for position, kmer in enumerate(self.getBucketKmers(oriented)):
            if kmer < minKmer:
                minKmer, reverseFlag, indexPos = kmer, is_reverse, position

    return minKmer, reverseFlag, indexPos
def get_positions_two_strands(c, overlap=False):
    """Collect the sorted positions of each word on both strands.

    Args:
        c: mapping word -> mapping key -> list of entries whose first item
            is a position.
        overlap: accepted but not used.

    Returns:
        A dict mapping the smaller of each word and its reverse complement
        to the sorted list of positions of the word and, when it differs,
        of its reverse complement.
    """
    positions = {}
    for wf in c:
        wrc = dna.reverse_complement(wf)
        w = min(wf, wrc)
        if w in positions:
            continue

        # The word itself, then its reverse complement when it is distinct.
        strands = [c[wf]]
        if wf != wrc:
            strands.append(c.get(wrc, {}))

        positions[w] = sorted(
            entry[0]
            for occurrences in strands
            for key in occurrences
            for entry in occurrences[key]
        )
    return positions
def __next(self, a, b, obsOcc):
    """Score the window ``[a, b]`` for ``self.w`` and append a row to ``self.R``.

    Nothing is appended when the window width is outside
    ``[self.MIN_WIDTH, self.MAX_WIDTH]``, when ``obsOcc`` is outside
    ``[self.MIN_OCC, self.MAX_OCC]``, or when no position was scanned in
    the window.

    The appended row is ``[w, label, obsFreq, expFreq, obsOcc, expOcc, pv,
    ev, -log10(ev), a, b, width, 0, n, 0, 0]``.

    Args:
        a: first position of the window.
        b: last position of the window.
        obsOcc: number of occurrences observed in the window.

    Returns:
        None.
    """
    width = b - a + 1
    if width < self.MIN_WIDTH or width > self.MAX_WIDTH:
        return
    # The occurrence count is bounded by the occurrence limits, not by the
    # width limit.
    # NOTE(review): assumes self.MAX_OCC is defined next to self.MIN_OCC.
    if obsOcc < self.MIN_OCC or obsOcc > self.MAX_OCC:
        return

    start = a - self.location[0]
    stop = b - self.location[0] + 1
    n = sum(self.N[len(self.w)][start:stop])
    try:
        obsFreq = obsOcc / float(n)
    except ZeroDivisionError:
        # No scanned position in the window: there is nothing to score.
        cli.warning('n error')
        return

    expFreq = self.bg.freq(self.w, (a, b))
    expOcc = expFreq * n

    if self.params['under']:
        pv = pbinom_left(obsOcc, n, expFreq)
    else:
        pv = pbinom(obsOcc, n, expFreq)

    ev = 1.0
    label = '%s|%s' % (self.w, reverse_complement(self.w))
    w = self.w

    # Display a run of N spacer letters as n{count}.
    spaces = self.w.count('N')
    if spaces >= 1:
        label = label.replace('N' * spaces, 'n{%d}' % spaces)
        w = self.w.replace('N' * spaces, 'n{%d}' % spaces)

    self.R.append([
        w, label, obsFreq, expFreq, obsOcc, expOcc, pv, ev, -log10(ev), a, b,
        width, 0, n, 0, 0
    ])
def count_dyads_hash(sequences, l, spacing, searchLocation, strand='+-', overlap=False):
    """Count every dyad (two words of length ``l`` separated by a spacer).

    A dyad is recorded as the first word, ``spacing`` letters ``N``, then
    the second word.

    Args:
        sequences: objects exposing ``sequence`` and ``location``.
        l: oligonucleotide (monad) length.
        spacing: number of positions between the two monads.
        searchLocation: ``(start, end)`` tuple, for example ``(-200, -1)``.
        strand: ``'+'`` to count dyads as read, ``'+-'`` to count the
            smaller of each dyad and its reverse complement.
        overlap: when false, an occurrence starting less than the full dyad
            length after the previous occurrence of the same dyad in the
            same sequence is ignored.

    Returns:
        A dict with ``N`` (scanned bases per position, keyed by the full
        dyad length), ``H`` (dyad -> list of occurrence positions),
        ``scannedPositions`` and ``scannedWords``.
    """
    lmonad = l
    l = 2 * lmonad + spacing  # full span of a dyad
    location = find_location(sequences)
    first, last = searchLocation[0], searchLocation[1]

    H = {}
    N = {l: [0] * (last - first + 1)}
    per_position = N[l]
    scannedPositions = 0
    scannedWords = 0
    spacer = 'N' * spacing

    info = cli.Info(len(sequences), 1, 1)
    for s in sequences:
        info('Counting words in [%+05d:%+05d]' % (first, last))
        seq = s.sequence
        HS = {}  # occurrences inside the current sequence only
        a = max(first, s.location[0])
        b = min(last, s.location[1] - l + 1)

        for I in range(a, b + 1):
            scannedPositions += 1
            i = I - s.location[0]
            if seq[i:i + l].find('N') != -1:
                continue

            if strand == '+':
                w = seq[i:i + lmonad] + spacer + seq[i + lmonad + spacing:i + 2 * lmonad + spacing]
            elif strand == '+-':
                wf = seq[i:i + lmonad] + spacer + seq[i + lmonad + spacing:i + 2 * lmonad + spacing]
                w = min(wf, reverse_complement(wf))

            per_position[I - first] += 1

            if not overlap:
                # a - l is the sentinel meaning "no previous occurrence".
                previous = HS.get(w, [a - l])[-1]
                if previous + l > I:
                    continue

            H.setdefault(w, []).append(I)
            HS.setdefault(w, []).append(I)
            scannedWords += 1

    return {
        'N': N,
        'H': H,
        'scannedPositions': scannedPositions,
        'scannedWords': scannedWords,
    }
def mergePairRead(self, record1, record2):
    """Store a read pair in ``self.record['sequence']`` as one sequence.

    The result is the sequence of ``record1`` followed by the reverse
    complement of the sequence of ``record2``.

    Returns:
        None.
    """
    first_mate = str(record1['sequence'])
    second_mate = dna.reverse_complement(str(record2['sequence']))
    self.record['sequence'] = first_mate + second_mate
    return
def print_count(c):
    """Print each word of ``c`` with its reverse complement and its count.

    Lines are printed in sorted ``(word, count)`` order, formatted as
    ``word|reverse_complement count``.
    """
    for word, wcount in sorted(c.items()):
        partner = dna.reverse_complement(word)
        print('%s|%s %4d' % (word, partner, wcount))
def outPutSeqence(self):
    """Write the current sequence to ``self.outFile``, one per line.

    One boolean is read from ``self.readrc``; when it is set,
    ``self.sequence`` is first replaced by its reverse complement.
    ``self.replaceN()`` is then called before the sequence is written.

    Returns:
        None.
    """
    stored_reversed = self.readrc.read(bool, 1)[0]
    if stored_reversed:
        self.sequence = dna.reverse_complement(self.sequence)
    self.replaceN()
    line = self.sequence + "\n"
    self.outFile.write(line)
    return
def ajustSeqDir(self):
    """Restore the orientation of ``self.sequence``.

    One boolean is read from ``self.readrc``; when it is set,
    ``self.sequence`` is replaced by its reverse complement.

    Returns:
        None.
    """
    stored_reversed = self.readrc.read(bool, 1)[0]
    if stored_reversed:
        self.sequence = dna.reverse_complement(self.sequence)
    return