Пример #1
0
    def test_calculate_stats():
        """Check the text reports built by calculate_sequence_stats.

        The first scenario feeds the five pairend*.sfastq test files and
        checks the length, quality, nucleotide frequency and boxplot
        reports. The other two read the arabidopsis_genes fasta file and
        check that the kmer report only appears when a kmer_size is given.
        """
        in_fhands = []
        try:
            # The files are opened inside the try so the ones already open
            # get closed even if a later open fails.
            for val in range(1, 6):
                fname = 'pairend{0}.sfastq'.format(val)
                in_fhands.append(open(join(TEST_DATA_DIR, fname)))
            seqs = read_seqrecords(in_fhands, file_format='fastq')
            (lengths_srt, qual_str, freq_str,
             qual_boxplot, kmers) = calculate_sequence_stats(seqs)
        finally:
            for fhand in in_fhands:
                fhand.close()
        assert 'maximum: 4' in lengths_srt
        assert 'Q30: 100.0' in qual_str
        assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in qual_boxplot
        assert '[30 , 31[ (96): **********' in qual_str
        assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00, N: 0.00) |' in freq_str
        assert kmers == ''

        genes_fpath = join(TEST_DATA_DIR, 'arabidopsis_genes')

        # Without a kmer size no kmer report is expected
        with open(genes_fpath) as fhand:
            seqs = read_seqrecords([fhand], file_format='fasta')
            kmers = calculate_sequence_stats(seqs)[-1]
        assert 'Kmer distribution' not in kmers

        # With a kmer size the report lists the kmer counts
        with open(genes_fpath) as fhand:
            seqs = read_seqrecords([fhand], file_format='fasta')
            kmers = calculate_sequence_stats(seqs, kmer_size=3)[-1]
        assert 'Kmer distribution' in kmers
        assert 'TCT: 167' in kmers
    def test_deinterleave(self):
        """It de-interleaves an iterator of alternating fwd and rev reads.

        The two fastq files are interleaved and then split again; each
        output must match its original file, ignoring leading and trailing
        whitespace.
        """
        fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')

        out_fhand1 = StringIO()
        out_fhand2 = StringIO()
        out_format = 'fastq'
        # The inputs stay open while the reads are consumed and are closed
        # as soon as the de-interleaved output has been written.
        with open(fwd_fpath) as fwd_fhand, open(rev_fpath) as rev_fhand:
            fwd_seqs = read_seqrecords([fwd_fhand], 'fastq')
            rev_seqs = read_seqrecords([rev_fhand], 'fastq')
            seqs = interleave_pairs(fwd_seqs, rev_seqs)
            deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format)

        with open(fwd_fpath) as fhand:
            expected_fwd = fhand.read()
        with open(rev_fpath) as fhand:
            expected_rev = fhand.read()

        assert out_fhand1.getvalue().strip() == expected_fwd.strip()
        assert out_fhand2.getvalue().strip() == expected_rev.strip()
    def test_mate_pair_checker():
        """It tests the mate pair matching function.

        Three pairs of fastq files are run through match_pairs and, for
        each one, the reads written to the paired output and to the orphan
        output are checked.
        """

        def run_match_pairs(fwd_fname, rev_fname):
            'It returns the paired and the orphan outputs of match_pairs'
            fwd_fpath = os.path.join(TEST_DATA_DIR, fwd_fname)
            rev_fpath = os.path.join(TEST_DATA_DIR, rev_fname)
            out_fhand = StringIO()
            orphan_out_fhand = StringIO()
            out_format = 'fastq'
            # The input files are closed once match_pairs has written
            # its results.
            with open(fwd_fpath) as fwd_fhand, open(rev_fpath) as rev_fhand:
                fwd_seqs = read_seqrecords([fwd_fhand], 'fastq')
                rev_seqs = read_seqrecords([rev_fhand], 'fastq')
                seqs = flat_zip_longest(fwd_seqs, rev_seqs)
                match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
            return out_fhand.getvalue(), orphan_out_fhand.getvalue()

        # with equal seqs but the last ones
        output, orp = run_match_pairs('pairend1.sfastq', 'pairend2.sfastq')
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the first seqs different
        output, orp = run_match_pairs('pairend1.sfastq', 'pairend3.sfastq')
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with pairend4 as the forward file
        output, orp = run_match_pairs('pairend4.sfastq', 'pairend2.sfastq')
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    def test_interleave(self):
        """It interleaves two iterators with paired reads.

        Interleaving pairend1 with pairend2 must raise InterleaveError
        unless the checks are skipped; pairend1 with pairend1b must
        interleave without error. Both successful cases yield 8 reads.
        """
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        # The reads are loaded into lists, so the files can be closed
        # before the reads are used.
        with open(file1) as fhand1, open(file2) as fhand2:
            fwd_seqs = list(read_seqrecords([fhand1], 'fastq'))
            rev_seqs = list(read_seqrecords([fhand2], 'fastq'))

        # Only the call expected to raise is kept inside the try
        try:
            list(interleave_pairs(fwd_seqs, rev_seqs))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # with the checks skipped the same reads are interleaved
        seqs = list(interleave_pairs(fwd_seqs, rev_seqs, skip_checks=True))
        assert len(seqs) == 8

        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
        with open(file1) as fhand1, open(file2) as fhand2:
            fwd_seqs = read_seqrecords([fhand1], 'fastq')
            rev_seqs = read_seqrecords([fhand2], 'fastq')
            # The result is built while the input files are still open
            seqs = list(interleave_pairs(fwd_seqs, rev_seqs))
        assert len(seqs) == 8
Пример #5
0
 def test_orf_annotator(self):
     """It tests the ESTscan based orf annotator.

     The records of orf_test.fasta are annotated using the Arabidopsis
     matrix. The first record must get a plus strand orf, the second a
     minus strand one, both spanning 0-541, and the third no feature.
     """
     fpath = os.path.join(TEST_DATA_DIR, 'orf_test.fasta')
     estscan_matrix = os.path.join(TEST_DATA_DIR,
                                   'Arabidopsis_thaliana.smat')
     # The records are loaded into a list, so the file can be closed
     # before the annotator runs.
     with open(fpath) as fhand:
         seq_records = list(read_seqrecords([fhand]))
     orf_annotator = EstscanOrfAnnotator(estscan_matrix)
     seq_records = orf_annotator(seq_records)
     orf1 = seq_records[0].features[0]
     orf2 = seq_records[1].features[0]
     assert orf1.strand == 1
     assert orf1.location.start.position == 0
     assert orf1.location.end.position == 541
     assert orf2.strand == -1
     assert orf2.location.start.position == 0
     assert orf2.location.end.position == 541
     assert not seq_records[2].features
Пример #6
0
def _read_estcan_result(fhand, result, file_type):
    """Add the ORFs found in a dna or pep ESTscan result file to result.

    The file is read as fasta. For every record the description is split
    on ';'; the second and third space separated tokens of its first item
    are taken as the ORF start and end, and the ORF is on the minus strand
    when one of the items is exactly 'minus strand'.

    result is modified in place and ends up shaped like:
        {seq_id: {(start, end, strand): {file_type: sequence}}}
    Nothing is returned.
    """
    for record in read_seqrecords([fhand], file_format='fasta'):
        fields = [field.strip() for field in record.description.split(';')]
        if 'minus strand' in fields:
            strand = -1
        else:
            strand = 1
        start, end = fields[0].split(' ', 3)[1:3]
        # One dict of ORFs per sequence, created the first time it is seen
        orfs_in_seq = result.setdefault(record.id, {})
        orf_key = (int(start), int(end), strand)
        # The same ORF can already hold the sequence of another file type
        orf = orfs_in_seq.setdefault(orf_key, {})
        orf[file_type] = record.seq
 def test_read_fasta(self):
     """It checks that a fasta record with only an id has no description."""
     fasta_fhand = StringIO('>seq1\nACTG\n')
     records = list(read_seqrecords([fasta_fhand]))
     first_record = records[0]
     assert not first_record.description