示例#1
0
def main():
    """Print the answers to Questions 1-4 together with their timings.

    Questions 1 and 2 print the result of ``edit_distance_alt`` for a fixed
    pattern against the sequence loaded from ``chr1.GRCh38.excerpt.fasta``.
    Questions 3 and 4 load reads from ``ERR266411_1.for_asm.fastq``, call
    ``overlap_graph(seqs, 30)`` and print both values it returns.  After each
    step the elapsed time is printed as ``>> <seconds>s``.
    """
    # time.clock() was removed in Python 3.8.  Use perf_counter() where it
    # exists and only fall back to clock() on interpreters that predate it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    chr1 = utils.readGenome('chr1.GRCh38.excerpt.fasta')
    #Question 1
    start = timer()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(edit_distance_alt('GCTGATCGATCGTACG', chr1))
    end = timer()
    print(">> %.2gs" % (end - start))
    #Question 2
    start = timer()
    print(edit_distance_alt('GATTTACCAGATTGAG', chr1))
    end = timer()
    print(">> %.2gs" % (end - start))
    #Questions 3 and 4
    # The timed section deliberately includes reading the FASTQ file, as in
    # the original measurement.
    start = timer()
    seqs, _ = utils.readFastq('ERR266411_1.for_asm.fastq')
    edges, suffixes = overlap_graph(seqs, 30)
    print(edges)
    print(suffixes)
    end = timer()
    print(">> %.2gs" % (end - start))
示例#2
0
def main():
    """Print the answers to Questions 1-4, each followed by its run time.

    The genome is read from ``chr1.GRCh38.excerpt.fasta`` and the reads from
    ``ERR266411_1.for_asm.fastq``.  Each timing line has the form
    ``>> <seconds>s``.
    """
    # time.clock() no longer exists on Python 3.8+, so pick perf_counter()
    # when available and keep clock() only as a legacy fallback.
    timer = getattr(time, "perf_counter", None) or time.clock

    def report_elapsed(started):
        # Same format string as the original timing lines.
        print(">> %.2gs" % (timer() - started))

    chr1 = utils.readGenome("chr1.GRCh38.excerpt.fasta")

    # Question 1
    started = timer()
    print(edit_distance_alt("GCTGATCGATCGTACG", chr1))
    report_elapsed(started)

    # Question 2
    started = timer()
    print(edit_distance_alt("GATTTACCAGATTGAG", chr1))
    report_elapsed(started)

    # Questions 3 and 4 (the timing covers reading the FASTQ file as well)
    started = timer()
    seqs, _ = utils.readFastq("ERR266411_1.for_asm.fastq")
    edges, suffixes = overlap_graph(seqs, 30)
    print(edges)
    print(suffixes)
    report_elapsed(started)
示例#3
0
from kmer_index import SubseqIndex
from utils import readGenome

# Name of the FASTA file to load.
f = "chr1.GRCh38.excerpt.fasta"
# Module-level value returned by readGenome() for that file.
t = readGenome(f)


def approx_match(p, t, k, tol, ival):

    index = SubseqIndex(t, k, ival)
    n = int(len(p) / k)

    all_matches = set()
    total_hits = 0
    span = index.span

    for i in range(len(p) - span + 1):
        print(f'i = {i}')
        hits = index.query(p[i:])
        total_hits += len(hits)

        for h in hits:
            if h < i or h - i - len(p) > len(t):
                continue

            mismatches = 0

            for j in range(i):
                if p[j] != t[h - i + j]:
                    mismatches += 1
                    if mismatches > tol:
def _longest_orf(sequences, frame):
    """Return the longest length among the items utils.find_orf yields.

    ``utils.find_orf(seq, frame)`` is called once per sequence; 0 is returned
    when it yields nothing for any of them.
    """
    longest = 0
    for seq in sequences:
        for orf in utils.find_orf(seq, frame):
            if len(orf) > longest:
                longest = len(orf)
    return longest


def main():
    """Print the quiz answers computed from the records in ``dna4.fasta``.

    Printed in order: the record count, the longest and shortest sequence
    lengths, the longest ORF length for frame arguments 0 and 1, the longest
    ORF length of one hard-coded sequence over frame arguments 0-2, the count
    of the most common length-6 repeat, the ten most common length-11
    repeats, and the counts of four specific length-7 repeats.
    """
    sequences, total_records = utils.readGenome('dna4.fasta')

    # question 1: total records in the file
    print(total_records)  # answer: 22

    # question: longest sequence in the file?
    lengths = [len(i) for i in sequences]
    print(max(lengths))  # answer: 4815
    print(min(lengths))  # answer 40

    # question: what is the length of the longest ORF appearing in reading frame 1 of any of the sequences?
    print(_longest_orf(sequences, 0))  # answer 1767

    # question: what is the length of the longest ORF appearing in any sequence and in any forward reading frame?
    # NOTE(review): only frame argument 1 is searched here although the
    # question says "any forward reading frame" - confirm this is intended.
    print(_longest_orf(sequences, 1))  # answer 1770

    # question: what is the length of the longest forward ORF that appears in the sequence with the identifier gi|142022655|gb|EQ086233.1|349?
    test_orf = """
        GATCGCCGCCTGGGTTGTCGAGACACCTGCGCGTGCGCGTCGAACGAAACACCTTGACCCACCGTATGCC
        CGGCACCGCGCGCGTCCCGGCCGACCTCGCGACACCGAGCGGCACCGCTTCGAAGCATTCTAGCCGGCTC
        GCGCTTCGCGAACCACCTTTTCGGACGAAAATCCGCACGTTGAATCACTTTCCTGCTTCGTATTTCACGC
        AAACTGCGTACAATCCTGAGACAACAGTACGTCAACTTCAGGAGAGCAACGATGCCCCCTCGCAAGGATC
        GCGATACGCCCCATCGCTATCGCAGCGGCGAGGCCGCGCGCCTGGCGCGCATGCCGGCAGCCACGCTGAG
        AATCTGGGAACGGCGCTATGGCGTGGTTGCGCCGCCCAAAACGCCGTCCGGACAACGGCTGTACTCGGAC
        GACGACGTGCAGCGCATTCGATTGCTGAAAACGCTCGTCAATCAGGGCCACGCGATCGGGTCGATCGCCA
        GCCTGAGCCGCGAGGAACTCGAGGCGTTGTCGTTGACGAATACGCGTGACCCGGCGTTTCACGAGGCAAG
        TGTGAGCCTCGCGGTCGTCGGCGCGCTTTCGATTCCGGAAGCCGCGATCGAGCGAATGGGAATCCGGATC
        GCCGCGCGAATCGACTCGCTCGACGACACGAGCGCGCATGCGGGTACGTCGGTCGATGCCCTCATCGCGA
        CGACCACGTCGCTCCATGAGGATGTCGTTTCGCAGCTCGCTGCCCAGGCGCAACAGCTCAACGCGCACGC
        CGTGGCCGTCGTATACGGGTTCGGCACGGCAGAAGCGGTCGAGCTGGCGCGTCTGTCGGGGTTCGAGCTG
        TTCCGGTCGACGGAAGGCCAGACCAACCCGATATCGATCATTTCGAAACTGGCGCAAGCCGTCGTCAAGT
        CGCGCCAATCGAATGACGCGGATCGCGGGCTCTGGCTGCGCACGCGGCGACGCTTCGACGAGGCGACGCT
        CGCGTCGCTCAGCGGCCTGTCCACCACCGTCAAATGCGAGTGTCCGCGTCACCTCTCCGAATTGATCATG
        CAGCTCAGTGCGTTCGAGCGATACAGCGACGAATGCGTGTCGCGATCGCCGGCCGATGCGCTGCTGCACC
        GCCACCTTGGAGACGCAGCGAACCGGGCAGCCGAATTGCTCGAGACGGCGCTTGCCGTCATTCTCCGCGA
        AGAGGGATTGGGCGGGACGACGCCGGAACTGAAGGCGCTGTAGCGCGGCACGCGCCGCCGGCTGTTCCGA
        CCTGCCGACGACGGCAGGTGGCGATGCTCTTTCGCGTGCAATGCAGGGCTTGCGTCGATCACTGAGCCGA
        AACGGAAGAACGAGCCGCTGCGGCAGGCGATGCCGGCGGCCTGCCCGTGGTTCCGGCATTCGACGCATGC
        GCGACTCGATCCACGAACGCGGAGAGATCGTCGAGCACTGACGCCATCCCTCTCAACGGCGCGCCCAGAA
        CACCGACGATCGACGCATGGCCCACGTTCCGGTAACGCATGACCACGACGGTGTCGCCCTTCTCTTGCAA
        TGCTCGCGCGAAGCGGGTCGTGTTGCCGGGCTCGACCACGGTGTCGTTCTCCGCGGTGGCCAGCCACATC
        GGCGGCTCCGTACCCTGAATGAACCGGATGGGCTGGCTCGCGGCCCGCACTTCCTGCGGGAATATCCTTT
        CAAGCGTGGTATCGCGCAGCGGCAGGAAATCATAAGCCCCGGCCAGGCCAATCACGCCGGCGATATCGCT
        CTTCCGCATCGCCTGTGCCGCCAGATAGCGGCCGTCGGTCGCAAGCAATGCGGCAATCTGCGCGCCCGCG
        GAATGCCCCATCAGAAACAGGCGATGTGGATCGCCGCCGAACGCAACCGCGTGCTCGCGTGCCCACGCGA
        CCGCCTGCGCCGCATCGTCGACGAAACCGGGAAAGGTGGTCGCCGGATACGTCCTGTAGTCGGGTAAGAC
        GGCAACAAAGCCCCTCGACGCGAGCGCCTCTCCCACGAACAGATAGTCCTTGCGCTCGCCGGACTGCCAG
        CTTCCGCCGTAAAGGAACACGACCACAGGGGCGCCCGCACTCGCATCGGCCGGCCAGTGATGCAAGACGC
        GCGTGGGCAAATAGACGTCGAGCACCTGGCGTTCGCCGGATCCGTACGGGATACCTGCGAACAGACTGAA
        CGTGTAGCTCGGCGTCAGCGCATTCAGGAGCCGCACCGGGCTGCACGCGGAGAGGAGACCGGCCGCGAGC
        AGCACCGACAGGACGACAAGCCCGGCTTTCATGTTCATGGAGATCCCCATTCCTGACGATTCCGGCCGCA
        TCCGCCGCCTGGTACGAGGTTTACGGCGCTTGCGCGCAAGCGGATGCACGCATCGCATGGCAACCGCGCC
        CCTTGACGGCATCCAGATCTTTCCTGCGCAAGTGCATCCGTCCGCAACGGAGAGTCGTATGTGAATGGAT
        AGGTGAATCAACGCGGAATGCCGACCATCGCTCGCTGCAAAGCAATCGTCCGGTGGCGAGTCCGCTCGTC
        GACGATAGTGAGAGCCGTCTGCCATGAGCGTTCTACCTGCCACTTACCCCGAGATGCAACGTCGACGTGG
        CGGCACGGCGACCATGCCGTTACCGATGATCCCGCGCGAACGATCATGAGGAGCGCGCCGAATCAACTGA
        CGTCGAGCACGCAAAAGTCCGGCGCCGCTCGCGTGTACGTCTATCTCGCGACGACCCAGACAGGATGGCT
        GGTATGCGTGATGACTGCCGCAGCGCATCACGCCGCGTGGGGCGTCACCTATGCGCTGATCGCGACAGCG
        GGCCATCTTCTCTTCGCGCGTCGGCCCGCATCCGAGGCGCGGATCGTCATCACGGTCACGGTGTCCGGAT
        GGTTATGGGACAGCGCCGTTGCACATTCCGGCCTGCTCGTGTACCCGAACGGCGTTTTTCTCAAAGGTAC
        AGCGCCGTACTGGCTCGCGGGGCTGTGGGCGCTGTTCGCGATTCAACTCAACACCTTGCTGCTCTGGCTT
        CGGGCGCGACCGCTCGTCTCGGCGCTCGTCGGCGCATTCGCAGGCCCCGCATCCTTTCGCGCAGGTGCGG
        CGCTGGGGGCCGTTCATTTCAAAGACTCGGCTGCAGCGCTCGTCGTTCTCGCAACCGGCTGGGCGTTCAT
        CTTGCCGGCCGCGCTTGCGATTGCAAGCCATTGGGATGGCGTAACGCCCCCTTCTCCTCCGCCAATCGGC
        GCAGGCGACATGAATGACGCCCGCGCCGGATAGAGCCGGACGCGTCGTAAGCCAGCGTTATCTCCGATCC
        CGTTCAAATTGCCAACGTACCTTCTCAGGCACCACACATGACACGCACCGAATTGCCGTATGAATCCCGC
        CCCGTTATCGTATGGTTTCGGGATGACCAACGACTCAGCGACAATCCCGCACTCTCTCATGCGGTCAGTA
        CCGGCCATCCTGTTGTTTGCGTCTACGTCTACGACCCTGCCCCGAAGCTCGGGCGCGCCATGGGGGGCGC
        GCAGAAGTGGTGGCTGCACGAGTCGTTGAAAAAACTCGACGACTCGCTTTCCGCTCTCGGCGGCTCGCTG
        CTCGTGCTTCGCGGTAACGAACACGAAGCCATCAGGAGCCTCGCCGTCGAGACCCGGGCGGCAATGGTTT
        TCTGGAATCGCCGCTACTCGAAAGCGCAAACGGAAATGGATGCATCGATCAAGAAAGACCTGATCGGGCG
        CGGCATCGACGTGTCGACATTCAATGGCCATCTTTTGCGCGAACCCTGGACAGTGGCCACGCGCGAAGGC
        TTGCCGTTCCAGGTATTCAGCGCGTACTGGAGAGCCGCTCGCCGCGATAATTTTTTCCCGCCGTGCCCAC
        TGTCGGCGCCCGCCCGGGTCACGTTCTTTCCCGTCTCCAGAAACGTCAGCGCACACGTCTGTACGCTTCC
        CGCGCTTGCACTGCAGCCCTCGACGCCGGACTGGGCGGAGGGCCTGCGTGCAACCTGGCGATGCGGCGAG
        GAAGCGGCCGGGCATCAACTCGAGGCCTTCATTGAACACTCGTTTTCCGACTATGCCGGCGCTCGAGATT
        TTCCGGCCACTCGAGCGACGAGCCGGCTCTCTCCGTATCTTCGCTTCGGAAATATCTCGGCCCGGCAGGT
        GTGGTACGCGACGTTATCAGCGGTAGACGCGATGCGAAGCAGGCGAGTTGTTCGCATTGACGATGCCAAA
        AATGAGTCGTTGAACAAGTTCTTCAGTGAACTCGGATGGAGAGAATTCTCGTATTACCTTCTTTACCACT
        GCGAACCCCTTCATCAGGTCAATTTCCGGCGTCAGTTTGACGCCATGCCGTGGCGTACCGACGCCAAGGC
        GCTTCGCGCGTGGCAAAGGGGGAAAACAGGATACCCGCTGGTCGACGCCGGCATGCGCGAGCTTTGGCAC
        ACGGGCTGGATGCACAACCGCGTGCGCATGGTGACAGCGTCATTTCTCACCAAGCACTTGCTGATCGACT
        GGCGCGAGGGCGAAGCATGGTTCTGGGATACGCTGGTTGACGCG
    """
    # The pasted record is split into 70-base chunks separated by whitespace.
    # Remove all whitespace so the bases are contiguous before the ORF search;
    # otherwise every separator would shift the codon positions after it.
    test_orf = "".join(test_orf.split())
    # Original author's note on the expected value: 972 or 975.
    print(max(_longest_orf([test_orf], frame) for frame in (0, 1, 2)))

    # q: find the most frequently occurring repeat of length 6 in all sequences. How many times does it occur in all?
    all_repeats = []
    for seq in sequences:
        all_repeats.extend(utils.get_all_repeats(seq, 6))
    print(all_repeats.count(utils.most_common(all_repeats)))  # answer 208

    # q: find all repeats of length 11 in the input file. Let's use Max to specify the number of copies of the most frequent repeat of length 11. How many different 11-base sequences occur Max times?
    all_repeats = []
    for seq in sequences:
        all_repeats.extend(utils.get_all_repeats(seq, 11))
    print(Counter(all_repeats).most_common(10))  # answer: 5

    # Count how often each of four candidate length-7 repeats is reported
    # across all sequences; the counts are printed in this fixed order.
    targets = ('CGGCGCG', 'CGGCACG', 'GCGGCAC', 'TCGGCGG')
    tally = dict.fromkeys(targets, 0)
    for seq in sequences:
        for repeat in utils.get_all_repeats(seq, 7):
            if repeat in tally:
                tally[repeat] += 1
    # Every count, including the last, is followed by " | " as before.
    print("".join(str(tally[target]) + " | " for target in targets))
示例#5
0
from naive_with_rc import *
from naive_2mm import *
from utils import readGenome, readFastq
from process_quality import *

# Script: read a genome, ask for a pattern on stdin, and report how many
# occurrences naive_2mm() returns together with the smallest offset.
filename = r"D:\Dropbox\NTU Modules\Algorithms for DNA Sequencing\Programming Assignments\Week 1\lambda_virus.fa"
t = readGenome(filename)
# print(t)
# input() already returns str on Python 3, so no extra str() wrapper is needed.
p = input('Enter string to be searched: ')
print(f'p: {p}')

# p_occ, q_occ = naive_with_rc(p, t)
# print(f'Total {len(p_occ)} + {len(q_occ)} = {len(p_occ)+len(q_occ)} occurences.')
# print(f'{min(p_occ)}, {min(q_occ)} offset.')

occ = naive_2mm(p, t)
if len(occ) > 0:
    print(f'# of occ {len(occ)}, offset {min(occ)}')
else:
    # min() raises ValueError on an empty sequence, so report the no-match
    # case explicitly instead of crashing.
    print('# of occ 0, offset n/a')

# fastqfile = r"ERR037900_1.first1000.fastq"

# s, q = readFastq(fastqfile)
# fault = process_quality(q)
# print(fault)