Пример #1
0
def de_brujin_adjacency_list(dnas):
    """Collect the (prefix, suffix) edges of the given reads on both strands.

    For every read and for its reverse complement, the edge runs from the
    read without its last character to the read without its first character.

    Args:
        dnas: Iterable of sliceable reads accepted by ``reverse_complement``.

    Returns:
        A set of ``(prefix, suffix)`` tuples; duplicate edges collapse.
    """
    edges = set()
    for read in dnas:
        # Handle the read first, then its reverse complement.
        for strand in (read, reverse_complement(read)):
            edges.add((strand[:-1], strand[1:]))
    return edges
Пример #2
0
def find_orfs(seq):
    """Find all potential open reading frames in a sequence.

    The sequence and its reverse complement are each transcribed at offsets
    0, 1 and 2, giving six frames. Every frame is read three characters at a
    time: an ``AUG`` codon switches recording on, amino acids are collected
    while recording, and a stop closes the current translation. Codons that
    are missing from the codon table are treated as stops.

    A translation that is still open when the frame ends is not reported.

    Args:
        seq: Sequence accepted by ``transcribe`` and ``reverse_complement``.

    Returns:
        The result of ``extract_subseqs`` applied to the set of non-empty
        translations that were closed by a stop.
    """
    revc_seq = reverse_complement(seq)
    frames = []
    for offset in range(3):
        frames.append(transcribe(seq[offset:]))
        frames.append(transcribe(revc_seq[offset:]))

    orfs = set()
    codons = load_codons()
    for frame in frames:
        recording = False
        current = []

        # Visit complete codons only; a trailing partial codon is ignored.
        for pos in range(0, len(frame) - 2, 3):
            codon = frame[pos:pos + 3]
            aa = codons.get(codon, 'Stop')

            if codon == 'AUG':
                recording = True

            if aa == 'Stop':
                # Empty translations are never stored, so there is nothing
                # to strip afterwards (the old unconditional remove('')
                # raised KeyError when no empty entry had been recorded).
                if current:
                    orfs.add(''.join(current))
                recording = False
                current = []
            elif recording:
                current.append(aa)

    return extract_subseqs(orfs)
Пример #3
0
def reverse_palindrome(dna):
    """Yield every reverse-palindromic substring of length 4 to 12.

    A substring qualifies when it equals ``reverse_complement`` of itself.

    Args:
        dna: Sliceable sequence accepted by ``reverse_complement``.

    Yields:
        ``(position, length)`` tuples, where ``position`` is the 1-based
        start of the substring and ``length`` its actual length.
    """
    size = len(dna)
    for start in range(size):
        # Never ask for more characters than remain: a truncated slice would
        # otherwise be reported again under a length it does not have.
        longest = min(12, size - start)
        for length in range(4, longest + 1):
            piece = dna[start:start + length]
            if piece == reverse_complement(piece):
                yield (start + 1, length)
Пример #4
0
def reverse_palindromes(dna):
    """List the substrings of length 4 to 12 that equal their reverse complement.

    Args:
        dna: Sliceable sequence accepted by ``revc.reverse_complement``.

    Returns:
        A list of ``(position, length)`` tuples with 1-based positions,
        ordered by length first and by position within each length.
    """
    found = []
    total = len(dna)
    for width in range(4, 13):
        # Last 0-based start at which a window of this width still fits.
        last_start = total - width
        for offset in range(last_start + 1):
            window = dna[offset:offset + width]
            if window != revc.reverse_complement(window):
                continue
            found.append((offset + 1, width))
    return found
Пример #5
0
def is_reverse_palindrome(seq):
    """Report whether ``seq`` matches its own reverse complement.

    Items are compared pairwise from the outside in: the item on the left
    must equal ``revc.reverse_complement`` of the item on the right. The
    comparison is iterative, so long sequences neither hit the recursion
    limit nor get copied slice by slice.

    Args:
        seq: Indexable sequence whose items ``revc.reverse_complement``
            accepts.

    Returns:
        ``True`` for an empty sequence or when every outer pair matches;
        ``False`` on the first mismatch or when a lone middle item remains
        (odd length).
    """
    left, right = 0, len(seq) - 1
    while left < right:
        if seq[left] != revc.reverse_complement(seq[right]):
            return False
        left += 1
        right -= 1
    # left == right means an unpaired middle item is left over, which the
    # original recursive definition rejects.
    return left > right
Пример #6
0
def setup_assemble_genome(S):
    """Assemble a genome from reads, shortening the reads until it works.

    The working set starts as the reads in ``S`` plus their reverse
    complements. While ``assemble_genome`` returns a falsy result, every
    read is replaced by its two one-shorter pieces (the read without its
    last character and the read without its first character) and assembly
    is tried again.

    Args:
        S: Iterable of hashable, sliceable reads accepted by
            ``reverse_complement``.

    Returns:
        The first truthy result of ``assemble_genome``.

    Raises:
        ValueError: If assembly still fails once there is nothing left to
            trim. Previously this situation looped forever, because the set
            of reads stops changing at that point.
    """
    reads = set(S) | {reverse_complement(read) for read in S}
    while True:
        genome = assemble_genome(reads)
        if genome:
            return genome
        if not any(reads):
            # No reads, or only empty ones: trimming would give the same set.
            raise ValueError("reads exhausted without assembling a genome")
        reads = ({read[:-1] for read in reads}
                 | {read[1:] for read in reads})
Пример #7
0
def correct_errors(seqs):
    """Pair each erroneous read with the correct reads one mismatch away.

    A read is considered correct when it occurs at least twice in ``seqs``,
    counting an occurrence of its reverse complement as an occurrence of the
    read itself. Every other read is considered erroneous and is compared,
    via ``hamming``, against all correct reads and their reverse complements.

    Args:
        seqs: Iterable of hashable reads accepted by ``reverse_complement``
            and ``hamming``.

    Returns:
        A list of ``(read, fix)`` tuples, one for every candidate ``fix`` at
        ``hamming`` distance 1 from an erroneous ``read``. Erroneous reads
        are visited in order of first appearance.
    """
    counts = Counter(seqs)
    partner_of = {read: reverse_complement(read) for read in counts}

    def support(read):
        partner = partner_of[read]
        # A read that is its own reverse complement must not count twice.
        if partner == read:
            return counts[read]
        return counts[read] + counts[partner]

    correct = {read for read in counts if support(read) >= 2}
    # Built once: this is every string an erroneous read may be corrected to.
    candidates = correct | {partner_of[read] for read in correct}

    corrections = []
    for read in counts:
        if read in candidates:
            continue
        for fix in candidates:
            if hamming(read, fix) == 1:
                corrections.append((read, fix))

    return corrections
Пример #8
0
def correct_errors(seqs):
    """Return ``(read, fix)`` pairs for reads that look like sequencing errors.

    Occurrences are tallied per strand-independent read: each observed read
    adds its count both to itself and to its reverse complement. Strings with
    a tally of at least two are trusted; any observed read that is not
    trusted is matched against every trusted string using ``hamming``.

    Args:
        seqs: Iterable of hashable reads accepted by ``reverse_complement``
            and ``hamming``.

    Returns:
        A list of ``(read, fix)`` tuples where ``fix`` is a trusted string at
        ``hamming`` distance 1 from the untrusted ``read``. Untrusted reads
        are processed in order of first appearance.
    """
    observed = Counter(seqs)

    # Credit every observation to both strands so that a read seen once
    # alongside its reverse complement is recognised as seen twice.
    tally = Counter()
    for read, times in observed.items():
        tally[read] += times
        flipped = reverse_complement(read)
        if flipped != read:
            tally[flipped] += times

    trusted = {read for read, times in tally.items() if times >= 2}
    untrusted = [read for read in observed if read not in trusted]

    return [(read, fix)
            for read in untrusted
            for fix in trusted
            if hamming(read, fix) == 1]
Пример #9
0
def find_reverse_palindromes(dna: str,
                             minlen: int = 4,
                             maxlen: int = 12) -> Dict[int, int]:
    """Map 1-based start positions to reverse-palindrome lengths.

    Every substring whose length lies between ``minlen`` and ``maxlen``
    (inclusive) is tested with ``is_reverse_palindrome``. Each hit is printed
    as ``start length substring reverse_complement`` (0-based start) and
    stored under its 1-based start position.

    Because the result is keyed by position, a position with hits of several
    lengths keeps only the last (longest) one.

    Args:
        dna: Sequence to scan.
        minlen: Shortest substring length to test.
        maxlen: Longest substring length to test.

    Returns:
        Dictionary of ``{1-based position: length}``.
    """
    total = len(dna)
    found = {}

    for start in range(total - minlen + 1):
        # Longest window that still fits between ``start`` and the end.
        longest = min(maxlen, total - start)
        for width in range(minlen, longest + 1):
            window = dna[start:start + width]
            if not is_reverse_palindrome(window):
                continue
            print(start, width, window, reverse_complement(window))
            found[start + 1] = width
    return found
Пример #10
0
def find_candidate_proteins(dna: str) -> List[str]:
    """Collect the proteins that start at any ``AUG`` on either strand.

    The DNA and its reverse complement are each converted with
    ``_convert_to_rna``; ``_create_protein`` is then called for every index
    at which ``AUG`` occurs. The two strands are scanned independently, so an
    ``AUG`` at the same index on both strands yields a candidate from each
    (previously the reverse-strand candidate was skipped in that case).

    Args:
        dna: DNA string accepted by ``reverse_complement`` and
            ``_convert_to_rna``.

    Returns:
        Sorted list of the distinct results of ``_create_protein``, with
        ``None`` results left out.
    """
    rna = _convert_to_rna(dna)
    revcomp = _convert_to_rna(reverse_complement(dna))

    candidate_proteins = set()
    for strand in (rna, revcomp):
        for i in range(len(strand) - 2):
            if strand[i:i + 3] == 'AUG':
                candidate_proteins.add(_create_protein(strand, startpos=i))

    # None entries are dropped before sorting.
    candidate_proteins.discard(None)
    return sorted(candidate_proteins)
Пример #11
0
def open_reading_frames(dna_):
    """Return the distinct ``M``-to-stop stretches over six reading frames.

    The DNA and its reverse complement are converted with
    ``rna.convert_to_rna`` and translated by ``prot.translate`` at offsets
    0, 1 and 2, with ``'*'`` as the stop marker. For every ``'M'`` in a
    translation, the stretch from that ``'M'`` up to (not including) the next
    ``'*'`` is a candidate; an ``'M'`` with no later ``'*'`` gives none.

    Args:
        dna_: DNA sequence accepted by ``revc.reverse_complement`` and
            ``rna.convert_to_rna``.

    Returns:
        A set of candidate stretches.
    """
    complement_dna = revc.reverse_complement(dna_)
    strands = (rna.convert_to_rna(dna_), rna.convert_to_rna(complement_dna))
    translations = [prot.translate(strand[shift:], stop='*')
                    for shift in range(3)
                    for strand in strands]

    found = set()
    for protein in translations:
        for begin, residue in enumerate(protein):
            if residue != 'M':
                continue
            # Distance from the residue after 'M' to the first stop marker.
            tail = protein[begin + 1:]
            stop_at = next(
                (k for k, symbol in enumerate(tail) if symbol == '*'), None)
            if stop_at is not None:
                found.add(protein[begin:begin + stop_at + 1])
    return found
Пример #12
0
def error_corrections(reads):
    """Map each read seen only once to a confirmed read one mismatch away.

    A read is confirmed once it, or its reverse complement, has shown up a
    second time; both the read and its reverse complement are then stored as
    confirmed. Reads that stay unconfirmed are compared against the confirmed
    ones with ``hamming_distance``, and the first confirmed read at distance 1
    becomes the correction.

    Args:
        reads: Iterable of hashable reads accepted by ``reverse_complement``
            and ``hamming_distance``.

    Returns:
        Dictionary ``{unconfirmed read: confirmed read}``; unconfirmed reads
        without a match at distance 1 are absent.
    """
    # ``singles`` holds reads exactly as seen; ``confirmed`` holds both strands.
    singles = set()
    confirmed = set()

    for read in reads:
        partner = reverse_complement(read)
        if read in confirmed:
            continue
        if read not in singles and partner not in singles:
            singles.add(read)
            continue
        # Second sighting (on either strand): promote both strands.
        for strand in (read, partner):
            singles.discard(strand)
        for strand in (read, partner):
            confirmed.add(strand)

    corrections = {}
    for suspect in singles:
        match = next(
            (good for good in confirmed
             if hamming_distance(suspect, good) == 1),
            None)
        if match is not None:
            corrections[suspect] = match

    return corrections
Пример #13
0
#!/usr/bin/env python

from __future__ import print_function
import os

from revc import reverse_complement

if __name__ == "__main__":
    # Read one DNA string per line. Whitespace is stripped instead of
    # chopping the last character, so a final line without a newline keeps
    # its last base; blank lines are skipped rather than kept as empty reads.
    with open(os.path.join('data', 'rosalind_dbru.txt')) as dataset:
        dna_set = {line.strip() for line in dataset if line.strip()}

    # The graph is built over the reads and their reverse complements.
    dna_strings = dna_set | {reverse_complement(s) for s in dna_set}

    # Each string contributes one edge: all but its last character ->
    # all but its first character.
    adj_list = {(r[:-1], r[1:]) for r in dna_strings}

    for adj in sorted(adj_list):
        print('(' + adj[0] + ', ' + adj[1] + ')')
Пример #14
0
from prot import prepare_codon_table
from revp import read_fasta
from revc import reverse_complement
from rna import rna_transcription
from subs import substring_find

if __name__ == "__main__":
    # Load the FASTA records and the codon table from the data directory.
    with open(os.path.join('data', 'rosalind_orf.txt')) as dataset:
        seqs = read_fasta(dataset)

    codon_table = prepare_codon_table(os.path.join('data', 'codon_table'))

    # One record is taken from the parsed FASTA and transcribed on both
    # strands.
    dna = seqs.popitem()[1]
    strands = (rna_transcription(dna),
               rna_transcription(reverse_complement(dna)))

    proteins = []
    for strand in strands:
        for shift in (0, 1, 2):
            # NOTE(review): the positions come from the shifted string but
            # are used to index the unshifted one -- confirm against what
            # substring_find returns before changing this.
            for begin in substring_find(strand[shift:], 'AUG'):
                chain = []
                for i in range(begin, len(strand), 3):
                    codon = strand[i:i + 3]
                    if len(codon) != 3:
                        continue  # trailing partial codon
                    residue = codon_table[codon]
                    if residue == 'Stop' and chain:
                        # A stop closes the protein collected so far.
                        proteins.append(''.join(chain))
                        chain = []
                    elif codon == 'AUG' or chain:
                        # Start on AUG, then keep extending an open chain.
                        chain.append(residue)

    # Duplicates are dropped; the output order is that of the set.
    print("\n".join(set(proteins)))
Пример #15
0
#!/usr/bin/env python

from __future__ import print_function
import os

from revc import reverse_complement


if __name__ == "__main__":
    # Load the reads, one per line, with trailing whitespace removed.
    with open(os.path.join('data', 'rosalind_pcov.txt')) as dataset:
        reads = {s.rstrip() for s in dataset}

    # Work on the reads together with their reverse complements.
    reverse_complements = {reverse_complement(s) for s in reads}
    dna_strings = reads | reverse_complements

    # Derive k from an arbitrary string (popped, measured, put back). This is
    # only meaningful if all strings share one length -- assumed, not checked.
    item = dna_strings.pop()
    k = len(item) - 1
    dna_strings.add(item)

    # One (head, tail) edge per string: its first k and its last k characters
    # (for strings of length k + 1). An edge is kept when its head or tail
    # occurs inside some string.
    # NOTE(review): each head is a prefix of the string it was cut from, so
    # this filter appears to keep every edge -- confirm the intent.
    adj_list = {(head, tail)
                for (head, tail) in {(r[0:k], r[1:k + 1]) for r in dna_strings}
                if any(head in dna or tail in dna for dna in dna_strings)}

    # Walk the edges starting from an arbitrary one, collecting from each
    # visited edge the part of its tail from index k - 1 onwards.
    superstring = []
    c = adj_list.pop()
    while adj_list:
        superstring.append(c[1][k - 1:])

        # Edges whose head equals the current edge's tail.
        next_edge = {n for n in adj_list if n[0] == c[1]}
        if next_edge:
            # pop() acts on the temporary intersection, so adj_list itself is
            # not changed here.
            # NOTE(review): nothing visible removes edges from adj_list inside
            # the loop or prints the result -- this snippet looks cut off.
            c = (adj_list & next_edge).pop()
Пример #16
0
from prot import prepare_codon_table
from revp import read_fasta
from revc import reverse_complement
from rna import rna_transcription
from subs import substring_find


if __name__ == "__main__":
    with open(os.path.join("data", "rosalind_orf.txt")) as dataset:
        seqs = read_fasta(dataset)

    codon_table = prepare_codon_table(os.path.join("data", "codon_table"))

    def proteins_from(strand, start_pos):
        """Yield the proteins met while stepping through ``strand`` by threes.

        Walking starts at ``start_pos``. A chain opens on ``AUG``, grows with
        every following codon, and is yielded when a codon translating to
        ``"Stop"`` is reached; the walk then continues looking for the next
        ``AUG``. A chain still open at the end of the strand is discarded.
        """
        chain = []
        for i in range(start_pos, len(strand), 3):
            codon = strand[i : i + 3]
            if len(codon) != 3:
                continue  # trailing partial codon
            residue = codon_table[codon]
            if residue == "Stop" and chain:
                yield "".join(chain)
                chain = []
            elif codon == "AUG" or chain:
                chain.append(residue)

    # One record is taken from the parsed FASTA; both strands are transcribed.
    dna = seqs.popitem()[1]
    rna = (rna_transcription(dna), rna_transcription(reverse_complement(dna)))

    output = []
    for seq in rna:
        for offset in (0, 1, 2):
            # NOTE(review): start positions are found in seq[offset:] but
            # applied to seq itself -- verify against substring_find's
            # indexing convention before touching this.
            for start_pos in substring_find(seq[offset:], "AUG"):
                output.extend(proteins_from(seq, start_pos))

    # Print each distinct protein once, in set order.
    print("\n".join(set(output)))
Пример #17
0
def is_reverse_palindrome(dna: str) -> bool:
    """Return whether ``dna`` is equal to ``reverse_complement(dna)``."""
    return dna == reverse_complement(dna)