def test_vectordb(self):
        """It removes the vector from a vector database.

        Two reads are cleaned, each built as 5 random bases + a fixed vector
        sequence + 250 random bases. The first run passes the test blast
        database explicitly with ``-d``; the second passes ``-d`` without a
        value (presumably clean_reads falls back to a default database --
        confirm against the script). In both runs the script must exit with
        0 and the cleaned read may be at most 4 bases shorter than the
        trailing 250-base segment.
        """
        vector_db = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
        # (vector sequence, extra command line arguments selecting the db)
        cases = (
            ('CACTATCTCCGACGACGGCGATTTCACCGTTGACCTGATTTCCAGTTGCTACGTCAAGTTC',
             ['-d', vector_db]),
            ('GGTGCCTCCGGCGGGCCACTCAATGCTTGAGTATACTCACTAGACTTTGCTTCGCAAAG',
             ['-d']),
        )
        for vector_str, db_args in cases:
            seq1 = create_random_seqwithquality(5, qual_range=35)
            vector = SeqWithQuality(Seq(vector_str), name='vect',
                                    qual=[30] * len(vector_str))
            seq2 = create_random_seqwithquality(250, qual_range=35)
            seqs = [seq1 + vector + seq2]
            inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
            outseq_fhand = NamedTemporaryFile()
            cmd = [CLEAN_READS, '-i', inseq_fhand.name,
                   '-o', outseq_fhand.name,
                   '-p', '454', '-f', 'fastq'] + db_args
            retcode = _call_python(cmd)[-1]
            assert retcode == 0
            # Read the result under ``with`` so the handle is always closed.
            with open(outseq_fhand.name) as result_fhand:
                out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                             format='fastq'))
            assert (len(seq2.seq) - len(out_seqs[0].seq)) < 5
    def test_illumina(self):
        """It tests the Illumina cleaning.

        The input read is the concatenation of a 50-base segment created
        with ``qual_range=35`` and a 10-base segment created with
        ``qual_range=15``. Three runs of clean_reads are checked:

        1. fastq input with default options: the next-to-last quality of the
           cleaned read must be 35.
        2. the same input with ``-x`` (quality trimming disabled): the
           sequence must come back unchanged.
        3. fastq-illumina input and output: same check as the first run.
        """
        seq1 = create_random_seqwithquality(50, qual_range=35)
        seq2 = create_random_seqwithquality(10, qual_range=15)
        seqs = [seq1 + seq2]
        inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
        outseq_fhand = NamedTemporaryFile()
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'illumina', '-f', 'fastq']
        retcode = _call_python(cmd)[-1]
        assert retcode == 0
        # The output is reopened by name; ``with`` guarantees it is closed.
        with open(outseq_fhand.name) as result_fhand:
            out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                         format='fastq'))
        assert out_seqs[0].qual[-2] == 35

        # disable quality trimming
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'illumina', '-f', 'fastq', '-x']
        retcode = _call_python(cmd)[-1]
        assert retcode == 0
        with open(outseq_fhand.name) as result_fhand:
            out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                         format='fastq'))
        assert seqs[0].seq == out_seqs[0].seq

        # illumina format
        inseq_fhand = create_temp_seq_file(seqs, format='fastq-illumina')[0]
        outseq_fhand = NamedTemporaryFile()
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'illumina', '-f', 'fastq-illumina']
        retcode = _call_python(cmd)[-1]
        assert retcode == 0
        with open(outseq_fhand.name) as result_fhand:
            out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                         format='fastq-illumina'))
        assert out_seqs[0].qual[-2] == 35
# Example no. 3
# 0
    def test_adaptors(self):
        """It removes adaptors.

        Three scenarios are exercised:

        1. An explicit adaptors fasta file is given with ``-a``: for a read
           built as 5 random bases + adaptor + 50 random bases, the cleaned
           sequence must equal the trailing 50-base segment.
        2. ``-a`` without a value on the 454 platform: the run succeeds and
           stdout reports an ``--adaptors_file`` entry for '454'.
        3. ``-a`` without a value on the illumina platform: the run fails
           with return code 14 and stderr explains that there is no default
           adaptors file.
        """
        seq1 = create_random_seqwithquality(5, qual_range=35)
        adaptor = create_random_seqwithquality(15, qual_range=35)
        seq2 = create_random_seqwithquality(50, qual_range=35)
        seqs = [seq1 + adaptor + seq2]
        inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
        outseq_fhand = NamedTemporaryFile()
        adaptor_fhand = create_temp_seq_file([adaptor], format='fasta')[0]
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'illumina', '-f', 'fastq', '-a', adaptor_fhand.name]
        retcode = _call_python(cmd)[-1]
        assert retcode == 0
        # Read the result under ``with`` so the handle is always closed.
        with open(outseq_fhand.name) as result_fhand:
            out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                         format='fastq'))
        assert seq2.seq == out_seqs[0].seq

        # A fresh read is used for the runs that give no adaptors file.
        seq1 = create_random_seqwithquality(5, qual_range=35)
        adaptor = create_random_seqwithquality(15, qual_range=35)
        seq2 = create_random_seqwithquality(50, qual_range=35)
        seqs = [seq1 + adaptor + seq2]
        inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
        outseq_fhand = NamedTemporaryFile()
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', '454', '-f', 'fastq', '-a']
        stdout, _, retcode = _call_python(cmd)
        assert retcode == 0
        assert "--adaptors_file: {'454': '" in stdout

        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'illumina', '-f', 'fastq', '-a']
        _, stderr, retcode = _call_python(cmd)
        assert 'clean_reads does not have default adaptors file' in stderr
        assert retcode == 14
    def test_sanger(self):
        """It tests the basic sanger cleaning.

        First the command line validation is checked through stderr: a
        missing platform, an unknown platform, and ``-x`` combined with
        ``--lucy_splice``. Then a fasta + qual read (500 bases created with
        ``qual_range=55`` followed by 50 bases created with
        ``qual_range=15``) is cleaned with and without quality trimming.
        Finally a fasta-only read ending in a run of Ns is cleaned and must
        no longer end in 'nnnnn'.
        """
        seq1 = create_random_seqwithquality(500, qual_range=55)
        seq2 = create_random_seqwithquality(50, qual_range=15)
        seqs = [seq1 + seq2]
        inseq_fhand, inqual_fhand = create_temp_seq_file(seqs, format='qual')
        outseq_fhand = NamedTemporaryFile()
        outqual_fhand = NamedTemporaryFile()

        # platform is required
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name]
        stderr = _call_python(cmd)[1]
        assert 'required' in stderr

        # a correct platform is required
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'hola']
        stderr = _call_python(cmd)[1]
        assert 'choice' in stderr

        # disable quality trimming and lucy_splice are incompatible
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'sanger', '-x', '--lucy_splice', 'splice.fasta']
        stderr = _call_python(cmd)[1]
        assert 'incompatible' in stderr

        # we can clean a sanger sequence with quality
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-q', inqual_fhand.name,
               '-o', outseq_fhand.name, '-u', outqual_fhand.name,
               '-p', 'sanger']
        retcode = _call_python(cmd)[2]
        assert retcode == 0
        # Results are reopened by name; ``with`` closes both handles.
        with open(outseq_fhand.name) as seq_result:
            with open(outqual_fhand.name) as qual_result:
                out_seqs = list(seqs_in_file(seq_fhand=seq_result,
                                             qual_fhand=qual_result))
        assert out_seqs[0].qual[-1] == 55

        # disable quality trimming
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-q', inqual_fhand.name,
               '-o', outseq_fhand.name, '-u', outqual_fhand.name,
               '-p', 'sanger', '-x']
        retcode = _call_python(cmd)[2]
        assert retcode == 0
        with open(outseq_fhand.name) as seq_result:
            with open(outqual_fhand.name) as qual_result:
                out_seqs = list(seqs_in_file(seq_fhand=seq_result,
                                             qual_fhand=qual_result))
        assert seqs[0].seq == out_seqs[0].seq

        # we can clean a sanger sequence without quality
        seq1 = create_random_seqwithquality(500, qual_range=55)
        seqs = [SeqWithQuality(seq1.seq + Seq('NNNNNNNNNNNNNN'), name='Ns')]
        inseq_fhand = create_temp_seq_file(seqs, format='fasta')[0]
        outseq_fhand = NamedTemporaryFile()
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'sanger']
        retcode = _call_python(cmd)[2]
        assert retcode == 0
        with open(outseq_fhand.name) as seq_result:
            out_seqs = list(seqs_in_file(seq_fhand=seq_result))
        assert not str(out_seqs[0].seq).lower().endswith('nnnnn')
    def test_seq_pipeline_parallel_run_with_fasta_qual(self):
        """The pipeline runs in parallel with fasta and qual.

        Three 500-base reads are written as fasta + qual and run through the
        'sanger_with_qual' pipeline with ``processes=4``. The vectors step
        is configured with the test blast database and the adaptors step
        with a temporary file holding ADAPTORS. The fasta output must hold
        exactly three '>' characters, i.e. three records.
        """
        pipeline = 'sanger_with_qual'

        fhand_adaptors = NamedTemporaryFile()
        fhand_adaptors.write(ADAPTORS)
        # flush so the content is on disk when the file is read by name
        fhand_adaptors.flush()
        arabidopsis_genes = 'arabidopsis_genes+'
        univec = os.path.join(TEST_DATA_DIR, 'blast', arabidopsis_genes)
        configuration = {'remove_vectors': {'vectors': univec},
                         'remove_adaptors': {'adaptors': fhand_adaptors.name}}

        seq1 = create_random_seqwithquality(500, qual_range=50)
        seq2 = create_random_seqwithquality(500, qual_range=51)
        seq3 = create_random_seqwithquality(500, qual_range=52)
        seqs = [seq1, seq2, seq3]
        inseq_fhand, inqual_fhand = create_temp_seq_file(seqs, format='qual')

        outseq_fhand = NamedTemporaryFile()
        outqual_fhand = NamedTemporaryFile()
        writer = SequenceWriter(outseq_fhand, qual_fhand=outqual_fhand,
                                file_format='fasta')
        writers = {'seq': writer}

        # The input handles are closed as soon as the pipeline has finished.
        with open(inseq_fhand.name) as in_seq:
            with open(inqual_fhand.name) as in_qual:
                in_fhands = {'in_seq': in_seq, 'in_qual': in_qual}
                seq_pipeline_runner(pipeline, configuration, in_fhands,
                                    processes=4, writers=writers)

        with open(outseq_fhand.name, 'r') as out_fhand:
            result_seq = out_fhand.read()
        assert result_seq.count('>') == 3
    def test_tempdir(self):
        """It tests that the --tmpdir option works fine.

        Using the current directory as tmpdir must succeed. Using '/usr' or
        '/usr/remove_this_dir' must fail with return codes 1 and 14
        respectively, and stderr must mention "Permission denied" for the
        given directory. If a 'clean_reads.error' file exists in the current
        directory at the end, it is removed.
        """
        good_part = create_random_seqwithquality(500, qual_range=55)
        bad_part = create_random_seqwithquality(50, qual_range=15)
        reads = [good_part + bad_part]
        inseq_fhand = create_temp_seq_file(reads, format='qual')[0]
        outseq_fhand = NamedTemporaryFile()

        # every run shares these arguments; only the tmpdir changes
        base_cmd = [CLEAN_READS, '-i', inseq_fhand.name,
                    '-o', outseq_fhand.name, '-p', 'sanger', '--tmpdir']

        retcode = _call_python(base_cmd + ['.'])[-1]
        assert retcode == 0

        # (directory that can not be used, expected return code)
        failing_dirs = (('/usr', 1), ('/usr/remove_this_dir', 14))
        for dir_without_perm, expected_code in failing_dirs:
            stderr, retcode = _call_python(base_cmd + [dir_without_perm])[1:]
            assert retcode == expected_code
            expected_msg = "Permission denied: '%s" % dir_without_perm
            assert expected_msg in stderr

        # remove the error log if one is present in the working directory
        errorlog_path = 'clean_reads.error'
        if os.path.exists(errorlog_path):
            os.remove(errorlog_path)
 def test_vector(self):
     """It removes the vector.

     A 3000-base random vector is written to a fasta file and given to
     clean_reads with ``-v``. The read is built as 5 random bases + bases
     30 to 59 of the vector + 250 random bases. The run must exit with 0
     and the cleaned read may be at most 4 bases shorter than the trailing
     250-base segment.
     """
     seq1 = create_random_seqwithquality(5, qual_range=35)
     vector = create_random_seqwithquality(3000, qual_range=35)
     seq2 = create_random_seqwithquality(250, qual_range=35)
     seqs = [seq1 + vector[30:60] + seq2]
     inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
     outseq_fhand = NamedTemporaryFile()
     vector_fhand = create_temp_seq_file([vector], format='fasta')[0]
     cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
            '-p', '454', '-f', 'fastq', '-v', vector_fhand.name]
     retcode = _call_python(cmd)[-1]
     assert retcode == 0
     # Read the result under ``with`` so the handle is always closed.
     with open(outseq_fhand.name) as result_fhand:
         out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                      format='fastq'))
     assert (len(seq2.seq) - len(out_seqs[0].seq)) < 5
 def test_edge_trim(self):
     """It trims the sequence edges.

     A 250-base read is cleaned with ``-e 10,10`` and the cleaned read must
     be exactly 20 bases shorter than the input.
     """
     seq2 = create_random_seqwithquality(250, qual_range=35)
     seqs = [seq2]
     inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
     outseq_fhand = NamedTemporaryFile()
     cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
            '-p', '454', '-f', 'fastq', '-e', '10,10']
     retcode = _call_python(cmd)[-1]
     assert retcode == 0
     # Read the result under ``with`` so the handle is always closed.
     with open(outseq_fhand.name) as result_fhand:
         out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                      format='fastq'))
     assert len(seq2.seq) - len(out_seqs[0].seq) == 20
 def test_words(self):
     """It trims re words.

     The read is 'ACTG' followed by 250 random bases. clean_reads is given
     two patterns with ``-r`` ("^ACTG" and a run of Ts) and the cleaned
     sequence must equal the 250 random bases.
     """
     vector = 'ACTG'
     vector = SeqWithQuality(Seq(vector), name='vect', qual=[30]*len(vector))
     seq2 = create_random_seqwithquality(250, qual_range=35)
     seqs = [vector + seq2]
     inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
     outseq_fhand = NamedTemporaryFile()
     # The inner double quotes are part of the argument given to the script.
     cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
            '-p', '454', '-f', 'fastq', '-r', '"^ACTG","TTTTTTTTTTTTTT"']
     retcode = _call_python(cmd)[-1]
     assert retcode == 0
     # Read the result under ``with`` so the handle is always closed.
     with open(outseq_fhand.name) as result_fhand:
         out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                      format='fastq'))
     assert seq2.seq == out_seqs[0].seq
 def test_min_length(self):
     """Filtering by length.

     Reads of 250, 50 and 250 bases are cleaned with ``-m 51``. Only two
     reads must remain in the output, both 250 bases long.
     """
     seq1 = create_random_seqwithquality(250, qual_range=35)
     seq2 = create_random_seqwithquality(50, qual_range=35)
     seq3 = create_random_seqwithquality(250, qual_range=35)
     seqs = [seq1, seq2, seq3]
     inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
     outseq_fhand = NamedTemporaryFile()
     cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
            '-p', '454', '-f', 'fastq', '-m', '51']
     retcode = _call_python(cmd)[-1]
     assert retcode == 0
     # Read the result under ``with`` so the handle is always closed.
     with open(outseq_fhand.name) as result_fhand:
         out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                      format='fastq'))
     assert len(out_seqs) == 2
     assert len(out_seqs[0]) == 250
     assert len(out_seqs[1]) == 250
 def test_trim_as_mask(self):
     """It masks the regions to trim.

     A 250-base read is cleaned with ``-e 10,10 --mask_no_trim``. The
     output must keep the input length, with the first 10 and last 10
     bases in lower case and everything in between in upper case.
     """
     seq2 = create_random_seqwithquality(250, qual_range=35)
     seqs = [seq2]
     inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
     outseq_fhand = NamedTemporaryFile()
     cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
            '-p', '454', '-f', 'fastq', '-e', '10,10', '--mask_no_trim']
     retcode = _call_python(cmd)[-1]
     assert retcode == 0
     # Read the result under ``with`` so the handle is always closed.
     with open(outseq_fhand.name) as result_fhand:
         out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                      format='fastq'))
     assert len(seq2.seq) == len(out_seqs[0].seq)
     seq = str(out_seqs[0].seq)
     # The three slices partition the whole read. The left edge is [:10],
     # not [0:9], so that index 9 is also covered by an assertion.
     assert seq[:10].islower()
     assert seq[10:len(seq) - 10].isupper()
     assert seq[-10:].islower()
 def test_fastq(self):
     """Cleaning fastq seqs in parallel.

     Three reads, each a 500-base segment created with ``qual_range=55``
     followed by a 50-base segment created with ``qual_range=15``, are
     cleaned on the sanger platform with ``-t 4``. The last quality of the
     first cleaned read must be 55.
     """
     seq1 = create_random_seqwithquality(500, qual_range=55)
     seq2 = create_random_seqwithquality(50, qual_range=15)
     seq3 = create_random_seqwithquality(500, qual_range=55)
     seq4 = create_random_seqwithquality(50, qual_range=15)
     seq5 = create_random_seqwithquality(500, qual_range=55)
     seq6 = create_random_seqwithquality(50, qual_range=15)
     seqs = [seq1 + seq2, seq3 + seq4, seq5 + seq6]
     inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
     outseq_fhand = NamedTemporaryFile()
     # we can clean several fastq sequences with the sanger platform
     cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
            '-p', 'sanger', '-t', '4', '-f', 'fastq']
     retcode = _call_python(cmd)[-1]
     assert retcode == 0
     # Read the result under ``with`` so the handle is always closed.
     with open(outseq_fhand.name) as result_fhand:
         out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                      format='fastq'))
     assert out_seqs[0].qual[-1] == 55
 def test_filter(self):
     """Filtering by blast similarity.

     Two random 150-base reads and one fixed read named 'ara' are cleaned
     with ``--filter_dbs`` pointing (twice, comma separated) to the test
     blast database 'arabidopsis_genes+'. Only two reads must remain in
     the output.
     """
     seq1 = create_random_seqwithquality(150, qual_range=35)
     seq2 = 'CACTATCTCCGACGACGGCGATTTCACCGTTGACCTGATTTCCAGTTGCTACGTCAAGTTCTC'
     seq2 += 'TACGGCAAGAATATCGCCGGAAAACTCAGTTACGGATCTGTTAAAGACGTCCGTGGAATCCA'
     seq2 += 'AGCTAAAGAAGCTTTCCTTTGGCTACCAATCACCGCCATGGAATCGGATCCAAGCTCTGCCA'
     seq2 = SeqWithQuality(Seq(seq2), name='ara', qual=[30]*len(seq2))
     seq3 = create_random_seqwithquality(150, qual_range=35)
     seqs = [seq1, seq2, seq3]
     inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
     outseq_fhand = NamedTemporaryFile()
     ara_db = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
     # The same database is listed twice to exercise a multi-db value.
     cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
            '-p', '454', '-f', 'fastq',
            '--filter_dbs', ','.join((ara_db, ara_db))]
     retcode = _call_python(cmd)[-1]
     assert retcode == 0
     # Read the result under ``with`` so the handle is always closed.
     with open(outseq_fhand.name) as result_fhand:
         out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                      format='fastq'))
     assert len(out_seqs) == 2