def cons(stream):
    """Tally nucleotides per position and derive a consensus string.

    Each chunk produced by ``fasta(stream)`` contributes the symbols of its
    ``content`` to four per-position counters, one each for A, C, G and T.
    The counters are sized from the length of the chunk being processed
    while they are still empty. Symbols other than A/C/G/T are ignored.

    Returns:
        A dict with keys "A", "C", "G" and "T" (lists of per-position
        counts) and "Consensus" (a string holding the most frequent symbol
        at each position). Ties resolve in the order A, C, G, T because
        ``max`` keeps the first maximal item it sees.

    Raises:
        IndexError: if a later chunk has a counted symbol at a position
            beyond the length the counters were sized with.
    """
    alphabet = "ACGT"
    tallies = {symbol: [] for symbol in alphabet}

    for record in fasta(stream):
        sequence = record.content
        if not tallies["A"]:
            # Counters are still empty: size them from this record.
            tallies = {symbol: [0] * len(sequence) for symbol in alphabet}
        for position, symbol in enumerate(sequence):
            if symbol in tallies:
                tallies[symbol][position] += 1

    # Walk the four counters in lockstep; each column is the (a, c, g, t)
    # counts for a single position.
    columns = zip(*(tallies[symbol] for symbol in alphabet))
    consensus = "".join(
        max(zip(alphabet, column), key=lambda pair: pair[1])[0]
        for column in columns
    )

    return {
        "A": tallies["A"],
        "C": tallies["C"],
        "G": tallies["G"],
        "T": tallies["T"],
        "Consensus": consensus,
    }
#!/bin/env python3
"""
Given: Two DNA strings of equal length.

Return: The transition/transversion ratio.

transitions: A<>G, C<>T
transversions: A<>C, A<>T, G<>C, G<>T
"""
import sys

import util

# Unordered base pairs that count as transitions. Spelling the pairs out
# (rather than summing character ordinals) means unrelated characters whose
# code points happen to add up to the same value are never miscounted.
TRANSITION_PAIRS = {frozenset("AG"), frozenset("CT")}

# Exactly two records are expected on stdin; the unpacking raises ValueError
# otherwise. (util.fasta presumably maps record id -> sequence; confirm.)
s1, s2 = util.fasta(sys.stdin.readlines()).values()

# Explicit check instead of `assert`, which is stripped under `python -O`.
if len(s1) != len(s2):
    sys.exit("error: the two DNA strings must have equal length")

transitions = 0
transversions = 0

for c1, c2 in zip(s1, s2):
    if c1 == c2:
        # Identical symbols are neither a transition nor a transversion.
        continue
    if frozenset((c1, c2)) in TRANSITION_PAIRS:
        transitions += 1
    else:
        transversions += 1

# The ratio is undefined without at least one transversion.
if transversions == 0:
    sys.exit("error: no transversions found; the ratio is undefined")

print(transitions / transversions)
#!/bin/env python3
"""
Given: At most 10 DNA strings in FASTA format.

Return: The ID of the string having the highest GC-content, followed by the
GC-content of that string.
"""
import sys
import util

records = util.fasta(sys.stdin.readlines())

# Start below any value util.gc is expected to produce so that the first
# record always becomes the current best.
# NOTE(review): presumably util.gc returns a fraction in [0, 1] -- confirm.
max_gc = -1
for record_id, sequence in records.items():
    ratio = util.gc(sequence)
    # Strict comparison: on a tie the earliest record is kept.
    if ratio > max_gc:
        max_gc = ratio
        max_id = record_id

print(max_id)
# Reported as a percentage with six decimal places.
print("{:.6f}".format(max_gc * 100))
def gcPerFasta(stream):
    """Yield ``(gc_fraction, chunk)`` for each chunk from ``fasta(stream)``.

    For every chunk, ``dna`` is called with a one-element list holding the
    chunk's ``content``. The fraction is the sum of the resulting ``G`` and
    ``C`` attributes divided by the result of its ``total()`` method.

    This is a generator: nothing is read from ``stream`` until iteration
    starts.
    NOTE(review): a chunk for which ``total()`` is 0 would presumably raise
    ZeroDivisionError here -- confirm against ``dna``.
    """
    for record in fasta(stream):
        counts = dna([record.content])
        gc_count = counts.G + counts.C
        yield (gc_count / counts.total(), record)
#!/bin/env python3
"""
Given: A collection of 'k' DNA strings each in FASTA format.

Return: A longest common substring of the collection. If multiple solutions
exist, you may return any single solution.

Notes:
Start with the longest candidates, and work towards shorter ones. Once a
candidate is a solution, the search is done. Need a way of knowing if a given
substring exists in a given string.
"""
import sys

from util import fasta


def _longest_common_substring(strings):
    """Return a longest substring present in every string of *strings*.

    Candidates are taken from the shortest string, longest first, so the
    first candidate found in all strings is a longest common substring.
    Returns "" when *strings* is empty or the strings share no substring.
    """
    if not strings:
        return ""

    # A common substring must occur in the shortest string, so that string
    # is the only source of candidates that needs to be enumerated.
    shortest = min(strings, key=len)

    for length in range(len(shortest), 0, -1):
        # Skip candidates already rejected at this length.
        tried = set()
        for start in range(len(shortest) - length + 1):
            candidate = shortest[start:start + length]
            if candidate in tried:
                continue
            tried.add(candidate)
            # `in` on str is the substring test the notes ask for.
            if all(candidate in other for other in strings):
                return candidate

    return ""


strings = list(fasta(sys.stdin.readlines()).values())
print(_longest_common_substring(strings))
#!/bin/env python3
"""
Given: A DNA string in FASTA format.

Return: The position and length of every reverse palindrome in the string
having length between 4 and 12.
"""
import sys
import util

MIN_LEN = 4
MAX_LEN = 12

records = util.fasta(sys.stdin.readlines())
# Only the first record of the input is examined.
dna_string = list(records.values())[0]
dna_len = len(dna_string)

# Every start index that still leaves room for a window of MIN_LEN symbols.
for start in range(dna_len - MIN_LEN + 1):
    # A window may not extend past the end of the string.
    longest = min(MAX_LEN, dna_len - start)
    for length in range(MIN_LEN, longest + 1):
        # Index of the window's last symbol.
        # NOTE(review): presumably util.reverse_palindrome treats this bound
        # as inclusive -- confirm against util.
        last = start + length - 1
        if util.reverse_palindrome(dna_string, start, last):
            # Positions are reported 1-based.
            print("{} {}".format(start + 1, length))
#!/bin/env python3
"""
Given: A DNA string 's' and a collection of substrings of 's' acting as
introns. All strings are given in FASTA format.

Return: A protein string resulting from transcribing and translating the
exons of 's'.
"""
import sys
import util

sequences = list(util.fasta(sys.stdin.readlines()).values())

# The first record is the DNA string; every remaining record is an intron.
dna_string = sequences[0]

# Introns are removed one after another in input order; each replace() call
# drops every occurrence of that intron from the current string.
for intron in sequences[1:]:
    dna_string = dna_string.replace(intron, '')

# DNA -> RNA -> codons -> protein, using the project helpers in util.
protein = util.protein(util.codons(util.rna(dna_string)))

print(''.join(protein))