示例#1
0
    def test_minoverlap(self):
        ''' Check how the ``minoverlap`` setting changes the number of hits.

        The same four search patterns are run against ``self.fname`` under
        three configurations; each run asserts the number of reported hits.
        '''
        patterns = (
            # end of sequence followed by junk
            "TCGATGCGATCTGTCAAGTCGGTGGCGGTA...",
            # end of sequence followed by junk, with 1 error
            "TCGATGCGATCTG.CAAGTCGGTGGCGGTA...",
            # junk followed by beginning of sequence
            "...NTGAACGTATCGCCTCGAGGGACTT",
            # junk followed by beginning of sequence, with 1 error
            "...NTGAACGTATCG.CTCGAGGGACTT",
        )

        # No errors allowed, minoverlap=30: exactly one hit, reported for
        # pattern 0 with a negative seq_pos.
        engine.config(maxerrors=0, minreadlength=25, minoverlap=30, Amin='!')
        result = engine.findseqs(self.fname, patterns)
        found = result['hits']
        assert len(found) == 1
        assert found[0].seq_nr == 0
        assert found[0].seq_pos < 0

        # Only maxerrors/minoverlap are passed from here on; presumably the
        # other settings persist between config() calls -- TODO confirm.
        engine.config(maxerrors=0, minoverlap=25)
        found = engine.findseqs(self.fname, patterns)['hits']
        assert len(found) == 2
        # NOTE(review): positional access; presumably index 0 is seq_nr and
        # index 2 is seq_pos -- confirm against the hit type.
        assert all(entry[0] != 3 or entry[2] > 0 for entry in found)

        # Additionally tolerating one error is expected to match all four.
        engine.config(maxerrors=1, minoverlap=25)
        found = engine.findseqs(self.fname, patterns)['hits']
        assert len(found) == 4
示例#2
0
    def test_Amin(self):
        ''' Check the effect of the ``Amin`` setting on hits and read lengths. '''
        needles = ("GGAG", "CCGAC")

        engine.config(Amin='H', minreadlength=4, maxerrors=0)
        outcome = engine.findseqs(self.fname, needles)
        assert len(outcome['hits']) == 1

        # presumably a histogram indexed by read length -- TODO confirm
        histogram = outcome['stats']['readlengths']
        assert histogram[5] == 3
        assert histogram[4] == 5

        # Switching Amin from 'H' to 'G' is expected to yield a second hit.
        engine.config(Amin='G')
        outcome = engine.findseqs(self.fname, needles)
        assert len(outcome['hits']) == 2
示例#3
0
    def test_hits(self):
        ''' Search generated reads for the sequence they were built from.

        ``cover_seq`` is called ``n`` times for one random 51 bp sequence; the
        assertions expect ``n`` reads of length ``readlength`` and ``n`` hits.
        A second search uses a copy of the sequence in which every
        ``minoverlap``-th base is substituted and expects no hits at all.
        '''
        fq = FastqGenerator(self.tfn.name, force=True)
        seq = fq.randseq(51)

        minoverlap = 25
        readlength = 100
        pmax = .05
        n = 100
        for _ in range(n):
            fq.cover_seq(seq,
                    minoverlap=minoverlap,
                    readlength=readlength,
                    pmax=pmax)
        fq.flush()
        #print "\033[94mfilesize=%.2f MB\033[m" % (fq.size() / 1024. / 1024.)

        # Re-open the generated file through the reader class.
        fq = Fastq(self.tfn.name, quiet=True)

        engine.config(
                nthreads=3,
                Amin=fq.Q2A(fq.p2Q(pmax)),
                maxerrors=0,
                # any threshold between minoverlap and readlength is used
                minreadlength=random.randint(minoverlap, readlength),
                minoverlap=minoverlap
            )
        ret = engine.findseqs(fq.fname, [seq])

        assert ret['stats']['readlengths'][readlength] == n
        assert len(ret['hits']) == n

        if 0:  # flip to 1 for debug output
            # The comprehension uses ``count`` rather than ``n``: under
            # Python 2 list-comprehension variables leak into the enclosing
            # scope and would clobber ``n`` used by the assertions below.
            print('hits=%d'%len(ret['hits']))
            print('readlenghts='+', '.join(['%dx %dbp'%(count, idx)
                    for idx,count in enumerate(ret['stats']['readlengths']) if count]))

        # Substitute the bases at positions 0, minoverlap, 2*minoverlap, ...
        # so that no unmodified stretch is minoverlap bases long.
        substitute = {'A':'C','C':'G','G':'T','T':'A'}
        seqx = ''.join([base if pos % minoverlap != 0 else substitute[base]
                    for pos, base in enumerate(seq)])
        ret = engine.findseqs(fq.fname, [seqx])

        if 0:  # flip to 1 for debug output
            print('0123456789'*6)
            print(('*'+' '*(minoverlap-1))*6)
            print(seq)
            print(seqx)
            print(str(ret['hits']))

        assert ret['stats']['readlengths'][readlength] == n
        assert len(ret['hits']) == 0
示例#4
0
 def test_forward_fastq(self):
     ''' Write files of ``n`` identical records and expect ``n`` hits.

     Each record has an 80 x 'A' sequence line and an 80 x '#' quality
     line. Every combination of record count, bare or repeated identifier
     on the '+' line, and LF or CRLF line ending is written and searched.
     '''
     engine.config(Amin='#', nthreads=2, minoverlap=80)
     for n in [3, 5, 7, 133]:
         for plus in ['+', '+IDENTIFIER']:
             for cr in ['\n', '\r\n']:
                 record = '@IDENTIFIER' + cr + 'A' * 80 + cr + \
                         plus + cr + '#' * 80 + cr
                 # Close the file explicitly so the data is flushed before
                 # it is read back below.
                 with open(self.tfn.name, 'wb') as out:
                     out.write(record * n)
                 # Result unused; presumably constructing the reader
                 # validates the file -- TODO confirm.
                 Fastq(self.tfn.name, quiet=True)
                 ret = engine.findseqs(self.tfn.name, ['A'*80])
                 assert len(ret['hits']) == n
示例#5
0
    def test_paired(self, gz=False):
        ''' Compare a search on ``self.fname`` with one on the file pair.

        The same patterns are searched in ``self.fname`` and in the tuple
        ``(self.fname_1, self.fname_2)``; both results must compare equal.
        When ``gz`` is true, ``'.gz'`` is appended to all three file names.
        '''
        engine.config(maxerrors=0, minoverlap=1000, minreadlength=3, Amin='!')
        patterns = (
            "CCC", # "CCCC" should be counted 2x ...
            "TTTT",
            "TATATATA",
            "TGTAG", # located at the beginning
            "ATATT", # located at the end
            "GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
            "...NACTTCCTCTCTACTGGTGTCGGCGGTGAAAGAGCTTACGTACTCTTCGAT...",
        )

        if gz:
            single = self.fname + '.gz'
            pair = (self.fname_1 + '.gz', self.fname_2 + '.gz')
        else:
            single = self.fname
            pair = (self.fname_1, self.fname_2)

        from_single = engine.findseqs(single, patterns)
        from_pair = engine.findseqs(pair, patterns)

        assert from_single == from_pair
示例#6
0
    def test_fastq(self):

        file(self.tfn.name, 'w').write('''_IDENTIFIER
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+
#############################################
''')
        try:
            engine.findseqs(self.tfn.name, [])
            assert False, "malformed @IDENTIFIER must raise FastqFileFormatException"
        except FastqFileFormatException:
            pass

        file(self.tfn.name, 'w').write('''@IDENTIFIER
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
-
#############################################
''')
        try:
            engine.findseqs(self.tfn.name, [])
            assert False, "malformed 3rd line must raise FastqFileFormatException"
        except FastqFileFormatException:
            pass
示例#7
0
    def test_maxerror(self):
        ''' Check that the hit count follows the ``maxerrors`` setting.

        The three patterns differ from the reference sequence in their
        first 1, 2 and 3 bases; with ``maxerrors=k`` exactly ``k`` hits are
        expected for k = 0..3.
        '''
        engine.config(minreadlength=25, minoverlap=25, Amin='!')
        variants = (
            # reference:
            #GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT
            "CAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
            #^   -> 1 error
            "CTGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
            #^^  -> 2 errors
            "CTCCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
            #^^^ -> 3 errors
        )

        for allowed in (0, 1, 2, 3):
            engine.config(maxerrors=allowed)
            found = engine.findseqs(self.fname, variants)['hits']
            assert len(found) == allowed
示例#8
0
    def test_findseqs(self, gz=False):
        ''' find specified sequences in handwritten .fastq file

        Every hit is checked by reading ``hit.length`` bytes from the file
        at the reported position and comparing them with the matching part
        of the pattern. The number of hits per pattern is then compared
        with the expected counts. When ``gz`` is true the ``.gz`` variant
        of the file is searched and read back through ``gzip``.
        '''
        engine.config(maxerrors=0, minoverlap=1000, minreadlength=3, Amin='!')
        seqs = (
            "CCC", # "CCCC" should be counted 2x ...
            "TTTT",
            "TATATATA",
            "TGTAG", # at beginning
            "ATATT", # at end
            "GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
            "...NACTTCCTCTCTACTGGTGTCGGCGGTGAAAGAGCTTACGTACTCTTCGAT...",
        )
        fname = self.fname
        if gz:
            fname += '.gz'
        hits = engine.findseqs(fname, seqs)['hits']

        if gz:
            f = gzip.GzipFile(fname, 'rb')
        else:
            f = open(fname, 'rb')

        # number of hits seen per pattern index
        counts = [0] * len(seqs)

        # try/finally rather than ``with``: GzipFile only supports the
        # with statement from Python 2.7 on.
        try:
            for hit in hits:
                counts[hit.seq_nr] += 1

                seq = seqs[hit.seq_nr]
                if hit.seq_pos < 0:
                    # seq_pos is negative, so this seeks past file_pos
                    f.seek(hit.file_pos - hit.seq_pos)
                else:
                    f.seek(hit.file_pos)
                    # NOTE(review): hit[3] is presumably the same field as
                    # hit.length -- confirm against the hit type.
                    seq = seq[hit.seq_pos:hit.seq_pos+hit[3]]
                bps = f.read(hit.length)

                assert bps == seq
        finally:
            f.close()

        assert counts == [19,1,0,1,1,1,1]