def de_brujin_adjacency_list(dnas):
    """Build the set of de Bruijn edges for a collection of reads.

    For every read and for its reverse complement, one edge is produced that
    links the read minus its last character to the read minus its first
    character.

    Args:
        dnas: iterable of DNA strings.

    Returns:
        A set of ``(prefix, suffix)`` tuples.
    """
    edges = set()
    for strand in dnas:
        for oriented in (strand, reverse_complement(strand)):
            edges.add((oriented[:-1], oriented[1:]))
    return edges
def find_orfs(seq):
    """Find all potential open reading frames in a sequence.

    The three reading frames of ``seq`` and the three of its reverse
    complement are transcribed and scanned codon by codon. Amino acids are
    collected from an ``AUG`` codon up to the next stop; a codon missing from
    the codon table is treated as a stop. The collected ORFs are then passed
    through ``extract_subseqs`` and that result is returned.

    Args:
        seq: DNA sequence to scan.

    Returns:
        Whatever ``extract_subseqs`` returns for the set of ORFs found.
    """
    # The reverse complement does not depend on the frame offset: compute once.
    revc_seq = reverse_complement(seq)
    frames = []
    for offset in range(3):
        frames.append(transcribe(seq[offset:]))
        frames.append(transcribe(revc_seq[offset:]))

    codons = load_codons()
    orfs = set()
    for frame in frames:
        recording = False
        current_orf = ''
        # Stop before a trailing partial codon (fewer than 3 bases left).
        for i in range(0, len(frame) - 2, 3):
            codon = frame[i:i + 3]
            aa = codons.get(codon, 'Stop')
            if codon == 'AUG':
                recording = True
            if aa == 'Stop':
                recording = False
                orfs.add(current_orf)
                current_orf = ''
            elif recording:
                current_orf += aa

    # A stop codon met while not recording adds ''. Use discard(), not
    # remove(): when no such stop occurred, '' is absent and remove() would
    # raise KeyError.
    orfs.discard('')
    return extract_subseqs(orfs)
def reverse_palindrome(dna):
    """Yield every reverse palindrome of length 4 to 12 found in ``dna``.

    A window is reported when it equals its own reverse complement.

    Args:
        dna: DNA string to scan.

    Yields:
        ``(position, length)`` tuples, where ``position`` is the 1-based start
        of the window and ``length`` is its actual length.
    """
    size = len(dna)
    for start in range(size):
        for length in range(4, 13):
            # Slicing silently truncates at the end of the string. Without
            # this guard a short palindromic tail would be reported again
            # under every larger (and wrong) ``length``.
            if start + length > size:
                break
            piece = dna[start:start + length]
            if piece == reverse_complement(piece):
                yield (start + 1, length)
def reverse_palindromes(dna):
    """Return all reverse-palindromic windows of length 4 to 12 in ``dna``.

    Windows are examined shortest length first and, within a length, from left
    to right; the result list keeps that order.

    Args:
        dna: DNA string to scan.

    Returns:
        List of ``(position, length)`` tuples with a 1-based ``position``.
    """
    windows = (
        (offset, width, dna[offset:offset + width])
        for width in range(4, 13)
        for offset in range(len(dna) - width + 1)
    )
    return [
        (offset + 1, width)
        for offset, width, window in windows
        if window == revc.reverse_complement(window)
    ]
def is_reverse_palindrome(seq):
    """Tell whether ``seq`` equals its own reverse complement.

    Bases are compared pairwise from the outside in: the base on the left must
    equal the complement of the matching base on the right. An empty sequence
    counts as a reverse palindrome; a sequence with an unpaired middle base
    (odd length) does not.

    Args:
        seq: sequence of bases.

    Returns:
        True if every outer pair matches and no base is left unpaired.
    """
    left, right = 0, len(seq) - 1
    while left < right:
        if seq[left] != revc.reverse_complement(seq[right]):
            return False
        left += 1
        right -= 1
    # left == right means a single middle base is left over, which fails.
    return left > right
def setup_assemble_genome(S):
    """Assemble a genome from reads, shortening the reads until it succeeds.

    The starting pool holds every read in ``S`` plus its reverse complement.
    ``assemble_genome`` is tried on the pool; while it returns a falsy value,
    each read is replaced by its two one-character-shorter pieces (the read
    without its last character and the read without its first) and assembly is
    retried.

    NOTE(review): there is no exit other than a truthy ``assemble_genome``
    result, so the loop keeps running if none is ever produced.

    Args:
        S: re-iterable collection of reads (it is iterated twice).

    Returns:
        The first truthy value returned by ``assemble_genome``.
    """
    pool = set(S).union(reverse_complement(read) for read in S)
    genome = assemble_genome(pool)
    while not genome:
        pool = {read[:-1] for read in pool} | {read[1:] for read in pool}
        genome = assemble_genome(pool)
    return genome
def correct_errors(seqs):
    """List single-mismatch corrections for reads that occur only once.

    Reads occurring at least twice are taken as correct. A read occurring once
    is skipped when it is the reverse complement of a correct read; otherwise
    it is paired with every correct read (or reverse complement of one) at
    Hamming distance exactly 1.

    NOTE(review): a read seen once whose reverse complement is also seen only
    once is still treated as erroneous here, because only reads counted at
    least twice are considered correct. Confirm this is the intended rule.

    Args:
        seqs: iterable of reads.

    Returns:
        List of ``(erroneous_read, corrected_read)`` tuples.
    """
    counts = Counter(seqs)
    correct = [read for read in counts if counts[read] >= 2]
    reverse = {reverse_complement(read) for read in correct}
    # Loop-invariant: build the pool of trusted reads once rather than once
    # per singleton read.
    trusted = set(correct) | reverse

    corrections = []
    for read in counts:
        if counts[read] >= 2 or read in reverse:
            continue
        for fix in trusted:
            if hamming(read, fix) == 1:
                corrections.append((read, fix))
    return corrections
def find_reverse_palindromes(dna: str, minlen: int = 4, maxlen: int = 12) -> Dict[int, int]:
    """Map 1-based start positions to reverse-palindrome lengths in ``dna``.

    Every window whose length lies in ``minlen..maxlen`` and that fits inside
    ``dna`` is tested with ``is_reverse_palindrome``. Each hit is also printed
    to stdout as ``start length window reverse_complement`` with a 0-based
    start. Lengths are tried in increasing order and stored under the same
    key, so a start position keeps only the longest hit.

    Args:
        dna: DNA string to scan.
        minlen: smallest window length to test.
        maxlen: largest window length to test.

    Returns:
        Dict from 1-based start position to palindrome length.
    """
    total = len(dna)
    found = {}
    for start in range(total - minlen + 1):
        # Cap the window length so it never runs past the end of dna.
        longest = min(maxlen, total - start)
        for width in range(minlen, longest + 1):
            window = dna[start:start + width]
            if not is_reverse_palindrome(window):
                continue
            print(start, width, window, reverse_complement(window))
            found[start + 1] = width
    return found
def find_candidate_proteins(dna: str) -> List[str]:
    """Collect the distinct proteins that start at any ``AUG`` on either strand.

    ``dna`` and its reverse complement are both converted to RNA. Every
    position holding an ``AUG`` codon is handed to ``_create_protein``, and
    ``None`` results are dropped.

    Args:
        dna: DNA string to scan.

    Returns:
        Sorted list of the distinct non-``None`` proteins.
    """
    rna = _convert_to_rna(dna)
    revcomp = _convert_to_rna(reverse_complement(dna))

    candidate_proteins = set()
    # Check each strand independently. The earlier if/elif chain could never
    # reach its "both strands" branch, so when both strands had AUG at the
    # same index the reverse-strand protein was silently skipped.
    for strand in (rna, revcomp):
        for i in range(len(strand) - 2):
            if strand[i:i + 3] == 'AUG':
                candidate_proteins.add(_create_protein(strand, startpos=i))

    return sorted(p for p in candidate_proteins if p is not None)
def open_reading_frames(dna_):
    """Return the distinct candidate proteins over all six reading frames.

    ``dna_`` and its reverse complement are converted to RNA and translated at
    offsets 0, 1 and 2 with ``'*'`` marking stops. For every ``'M'`` in a
    translation, the stretch from that ``'M'`` up to (not including) the next
    ``'*'`` is a candidate; an ``'M'`` with no later ``'*'`` gives none.

    Args:
        dna_: DNA string to scan.

    Returns:
        Set of candidate protein strings.
    """
    complement = revc.reverse_complement(dna_)
    strands = (rna.convert_to_rna(dna_), rna.convert_to_rna(complement))
    translations = [
        prot.translate(strand[shift:], stop='*')
        for shift in range(3)
        for strand in strands
    ]

    found = set()
    for peptide in translations:
        for start, residue in enumerate(peptide):
            if residue != 'M':
                continue
            # Look for the first stop marker after this start.
            for end in range(start + 1, len(peptide)):
                if peptide[end] == '*':
                    found.add(peptide[start:end])
                    break
    return found
def error_corrections(reads):
    """Map each unconfirmed read to a confirmed read one mismatch away.

    A read is confirmed once it, or its reverse complement, has been seen a
    second time; both orientations are then recorded as confirmed. Every read
    still unconfirmed at the end is matched with the first confirmed read
    found at Hamming distance exactly 1, if there is one.

    Args:
        reads: iterable of reads.

    Returns:
        Dict from unconfirmed read to its correction.
    """
    # pending: reads seen once so far; confirmed: reads seen twice, stored in
    # both orientations.
    pending, confirmed = set(), set()
    for read in reads:
        partner = reverse_complement(read)
        if read in confirmed:
            continue
        if read in pending or partner in pending:
            pending.difference_update((read, partner))
            confirmed.update((read, partner))
            continue
        pending.add(read)

    corrections = {}
    for lonely in pending:
        match = next(
            (good for good in confirmed if hamming_distance(lonely, good) == 1),
            None,
        )
        if match is not None:
            corrections[lonely] = match
    return corrections
#!/usr/bin/env python from __future__ import print_function import os from revc import reverse_complement if __name__ == "__main__": with open(os.path.join('data', 'rosalind_dbru.txt')) as dataset: dna_set = set(s[:-1] for s in dataset) reverse_complements = {reverse_complement(s) for s in dna_set} dna_strings = dna_set | reverse_complements item = dna_strings.pop() k = len(item) - 1 dna_strings.add(item) adj_list = {(r[0:k], r[1:k + 1]) for r in dna_strings} for adj in sorted(adj_list): print('(' + adj[0] + ', ' + adj[1] + ')')
from prot import prepare_codon_table
from revp import read_fasta
from revc import reverse_complement
from rna import rna_transcription
from subs import substring_find

# Prints every distinct protein found between an AUG codon and the next 'Stop'
# codon on either strand of the sequence read from data/rosalind_orf.txt.
# NOTE(review): `os` is used below but is not imported in this listing;
# confirm it is imported earlier in the file.
if __name__ == "__main__":
    with open(os.path.join('data', 'rosalind_orf.txt')) as dataset:
        seqs = read_fasta(dataset)
    codon_table = prepare_codon_table(os.path.join('data', 'codon_table'))

    output = []
    dna = seqs.popitem()[1]
    forward = rna_transcription(dna)
    backward = rna_transcription(reverse_complement(dna))

    for strand in (forward, backward):
        for shift in range(3):
            for begin in substring_find(strand[shift:], 'AUG'):
                peptide = []
                for pos in range(begin, len(strand), 3):
                    triplet = strand[pos:pos + 3]
                    if len(triplet) != 3:
                        # Trailing partial codon at the end of the strand.
                        continue
                    residue = codon_table[triplet]
                    if residue == 'Stop' and peptide:
                        # Close the protein being built and start afresh.
                        output.append(''.join(peptide))
                        peptide = []
                    elif triplet == 'AUG' or peptide:
                        peptide.append(residue)

    print("\n".join(set(output)))
#!/usr/bin/env python from __future__ import print_function import os from revc import reverse_complement if __name__ == "__main__": with open(os.path.join('data', 'rosalind_pcov.txt')) as dataset: reads = {s.rstrip() for s in dataset} reverse_complements = {reverse_complement(s) for s in reads} dna_strings = reads | reverse_complements item = dna_strings.pop() k = len(item) - 1 dna_strings.add(item) adj_list = {(head, tail) for (head, tail) in {(r[0:k], r[1:k + 1]) for r in dna_strings} if any(head in dna or tail in dna for dna in dna_strings)} superstring = [] c = adj_list.pop() while adj_list: superstring.append(c[1][k - 1:]) next_edge = {n for n in adj_list if n[0] == c[1]} if next_edge: c = (adj_list & next_edge).pop()
from prot import prepare_codon_table
from revp import read_fasta
from revc import reverse_complement
from rna import rna_transcription
from subs import substring_find

# Reads one sequence from data/rosalind_orf.txt, transcribes it and its
# reverse complement, and prints each distinct protein that runs from an AUG
# codon to the following "Stop" codon.
# NOTE(review): `os` is used below but is not imported in this listing;
# confirm it is imported earlier in the file.
if __name__ == "__main__":
    fasta_path = os.path.join("data", "rosalind_orf.txt")
    with open(fasta_path) as dataset:
        seqs = read_fasta(dataset)
    codon_table = prepare_codon_table(os.path.join("data", "codon_table"))

    output = []
    dna = seqs.popitem()[1]
    strands = [rna_transcription(dna)]
    strands.append(rna_transcription(reverse_complement(dna)))

    for seq in strands:
        seq_len = len(seq)
        for offset in range(3):
            for start_pos in substring_find(seq[offset:], "AUG"):
                current = []
                for i in range(start_pos, seq_len, 3):
                    codon = seq[i:i + 3]
                    if len(codon) != 3:
                        # Ignore the incomplete codon at the end of the strand.
                        continue
                    amino = codon_table[codon]
                    building = bool(current)
                    if amino == "Stop" and building:
                        # A stop ends the protein collected so far.
                        output.append("".join(current))
                        current = []
                    elif codon == "AUG" or building:
                        current.append(amino)

    print("\n".join(set(output)))
def is_reverse_palindrome(dna: str) -> bool:
    """Return True when ``dna`` is identical to its reverse complement."""
    return dna == reverse_complement(dna)