def test_calculate_stats():
    """It tests the calculate stat function.

    The statistics are first calculated for the five pairend*.sfastq test
    files read together and then for the arabidopsis_genes fasta file, with
    and without a kmer size.
    """
    in_fhands = []
    try:
        for val in range(1, 6):
            fpath = join(TEST_DATA_DIR, 'pairend{0}.sfastq'.format(val))
            in_fhands.append(open(fpath))
        seqs = read_seqrecords(in_fhands, file_format='fastq')
        (lengths_srt, qual_str, freq_str, qual_boxplot,
         kmers) = calculate_sequence_stats(seqs)
    finally:
        # The reports are already built at this point, so the inputs can be
        # released even if an assertion fails later on.
        for fhand in in_fhands:
            fhand.close()

    assert 'maximum: 4' in lengths_srt
    assert 'Q30: 100.0' in qual_str
    assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in qual_boxplot
    assert '[30 , 31[ (96): **********' in qual_str
    assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00, N: 0.00) |' in freq_str
    assert kmers == ''

    def kmer_report(**stats_kwargs):
        'It returns the last item of the arabidopsis_genes stats'
        with open(join(TEST_DATA_DIR, 'arabidopsis_genes')) as fhand:
            seqs = read_seqrecords([fhand], file_format='fasta')
            return calculate_sequence_stats(seqs, **stats_kwargs)[-1]

    # no kmer size given
    assert 'Kmer distribution' not in kmer_report()

    # with a kmer size
    kmers = kmer_report(kmer_size=3)
    assert 'Kmer distribution' in kmers
    assert 'TCT: 167' in kmers
def test_deinterleave(self):
    """It de-interleaves an iterator of alternating fwd and rev reads.

    Two fastq files are interleaved and de-interleaved again; each output
    should match the file it came from, ignoring surrounding whitespace.
    """
    fpath1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
    fpath2 = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
    out_fhand1 = StringIO()
    out_fhand2 = StringIO()
    out_format = 'fastq'

    # The inputs are kept open until deinterleave_pairs has returned
    # because the readers may well be lazy.
    with open(fpath1) as fwd_fhand, open(fpath2) as rev_fhand:
        fwd_seqs = read_seqrecords([fwd_fhand], 'fastq')
        rev_seqs = read_seqrecords([rev_fhand], 'fastq')
        seqs = interleave_pairs(fwd_seqs, rev_seqs)
        deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format)

    with open(fpath1) as fhand:
        expected1 = fhand.read()
    with open(fpath2) as fhand:
        expected2 = fhand.read()

    assert out_fhand1.getvalue().strip() == expected1.strip()
    assert out_fhand2.getvalue().strip() == expected2.strip()
def test_mate_pair_checker():
    """It tests the mate pair function.

    match_pairs is run on three combinations of forward and reverse fastq
    files and both the paired and the orphan outputs are checked.
    """

    def match_files(fwd_fname, rev_fname):
        'It runs match_pairs on two test files, returns (paired, orphans)'
        fwd_fpath = os.path.join(TEST_DATA_DIR, fwd_fname)
        rev_fpath = os.path.join(TEST_DATA_DIR, rev_fname)
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        # The inputs are closed as soon as match_pairs has written its
        # results into the in-memory outputs.
        with open(fwd_fpath) as fwd_fhand, open(rev_fpath) as rev_fhand:
            fwd_seqs = read_seqrecords([fwd_fhand], 'fastq')
            rev_seqs = read_seqrecords([rev_fhand], 'fastq')
            seqs = flat_zip_longest(fwd_seqs, rev_seqs)
            match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
        return out_fhand.getvalue(), orphan_out_fhand.getvalue()

    # with equal seqs except for the last ones
    output, orp = match_files('pairend1.sfastq', 'pairend2.sfastq')
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with the first seqs different
    output, orp = match_files('pairend1.sfastq', 'pairend3.sfastq')
    assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
    assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # third combination: pairend4 against pairend2
    output, orp = match_files('pairend4.sfastq', 'pairend2.sfastq')
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
def test_interleave(self):
    """It interleaves two iterators with paired reads.

    pairend1 and pairend2 should be rejected with an InterleaveError unless
    the checks are skipped, whereas pairend1 and pairend1b should interleave
    without complaints.
    """
    file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
    file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
    with open(file1) as fwd_fhand, open(file2) as rev_fhand:
        fwd_seqs = list(read_seqrecords([fwd_fhand], 'fastq'))
        rev_seqs = list(read_seqrecords([rev_fhand], 'fastq'))

        # Only the call that should raise lives in the try body, the
        # failure is reported from the else clause.
        try:
            list(interleave_pairs(fwd_seqs, rev_seqs))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # we skip the checks
        seqs = list(interleave_pairs(fwd_seqs, rev_seqs, skip_checks=True))
    assert len(seqs) == 8

    file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
    file2 = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
    # These readers are consumed by interleave_pairs, so the files stay
    # open until the result has been turned into a list.
    with open(file1) as fwd_fhand, open(file2) as rev_fhand:
        fwd_seqs = read_seqrecords([fwd_fhand], 'fastq')
        rev_seqs = read_seqrecords([rev_fhand], 'fastq')
        seqs = list(interleave_pairs(fwd_seqs, rev_seqs))
    assert len(seqs) == 8
def test_orf_annotator(self):
    """It tests orf annotator.

    The sequences of orf_test.fasta are annotated with an
    EstscanOrfAnnotator built from the Arabidopsis matrix. The first two
    sequences should get an ORF as their first feature (one per strand) and
    the third one should get no features.
    """
    fpath = os.path.join(TEST_DATA_DIR, 'orf_test.fasta')
    estscan_matrix = os.path.join(TEST_DATA_DIR, 'Arabidopsis_thaliana.smat')
    orf_annotator = EstscanOrfAnnotator(estscan_matrix)

    # The input file is closed once the annotation is done.
    with open(fpath) as fhand:
        seq_records = list(read_seqrecords([fhand]))
        seq_records = orf_annotator(seq_records)

    orf1 = seq_records[0].features[0]
    orf2 = seq_records[1].features[0]
    assert orf1.strand == 1
    assert orf1.location.start.position == 0
    assert orf1.location.end.position == 541
    assert orf2.strand == -1
    assert orf2.location.start.position == 0
    assert orf2.location.end.position == 541
    assert not seq_records[2].features
def _read_estcan_result(fhand, result, file_type):
    """It reads a dna or pep ESTscan result file.

    Every sequence read from fhand is stored in result, which is filled in
    place as a nested mapping:

        result[seq id][(start, end, strand)][file_type] = sequence

    Nothing is returned.
    """
    for record in read_seqrecords([fhand], file_format='fasta'):
        fields = [field.strip() for field in record.description.split(';')]

        # The strand is the reverse one only when the description says so.
        if 'minus strand' in fields:
            strand = -1
        else:
            strand = 1

        # start and end are the second and third space-separated tokens of
        # the first description field.
        tokens = fields[0].split(' ', 3)
        start, end = tokens[1:3]

        # setdefault creates the missing levels; this assumes that result
        # and its values are plain dicts.
        orfs_in_seq = result.setdefault(record.id, {})
        location = (int(start), int(end), strand)
        orfs_in_seq.setdefault(location, {})[file_type] = record.seq
def test_read_fasta(self):
    """It tests the reading of a fasta file.

    The first record read from a one-sequence fasta, whose title line holds
    just the name, is expected to have an empty description.
    """
    fasta_fhand = StringIO('>seq1\nACTG\n')
    records = list(read_seqrecords([fasta_fhand]))
    first_record = records[0]
    assert not first_record.description