예제 #1
0
    def test_calculate_stats():
        'It checks the text reports built by calculate_sequence_stats'
        fastq_fhands = [
            open(join(TEST_DATA_DIR, 'pairend{0}.sfastq'.format(val)))
            for val in range(1, 6)
        ]
        seqs = read_seqs(fastq_fhands, prefered_seq_classes=[SEQRECORD])
        stats = calculate_sequence_stats(seqs, nxs=[50])
        length_report = stats['length']
        assert 'maximum: 4' in length_report
        assert 'N50' in length_report
        assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in stats['qual_boxplot']
        quality_report = stats['quality']
        assert '[30 , 31[ (96): **********' in quality_report
        assert 'Q30: 100.0' in quality_report
        assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00' in stats['nucl_freq']
        # no kmer size was given, the kmer report has to be empty
        assert stats['kmer'] == ''

        gene_fhands = [open(join(TEST_DATA_DIR, 'arabidopsis_genes'))]
        # a list, because the same reads feed several stats calculations
        seqs = list(read_seqs(gene_fhands, prefered_seq_classes=[SEQRECORD]))
        kmer_report = calculate_sequence_stats(seqs)['kmer']
        assert 'Kmer distribution' not in kmer_report

        kmer_report = calculate_sequence_stats(seqs, kmer_size=3)['kmer']
        assert 'Kmer distribution' in kmer_report
        assert 'TCT: 167' in kmer_report

        # dust: the report is only filled in when it is asked for
        dust_report = calculate_sequence_stats(seqs)['dustscore']
        assert not dust_report
        dust_stats = calculate_sequence_stats(seqs, do_dust_stats=True)
        dust_report = dust_stats['dustscore']
        assert 'average: 1.83\nvariance: 0.14\nnum. seqs.: 6\n' in dust_report
        assert '% above 7 (low complexity): 0.00' in dust_report
예제 #2
0
    def test_sample_seq(self):
        'It tests the sample_seqs binary with files, stdin and a bad size'
        sample_seq = os.path.join(BIN_DIR, 'sample_seqs')
        assert 'usage' in check_output([sample_seq, '-h'])

        fasta_fhand = NamedTemporaryFile()
        fasta_fhand.write('>seq\nACTA\n>seq2\nACTA\n>seq3\nACTA\n')
        fasta_fhand.flush()

        # random samples of one and two reads out of the three in the file
        for sample_size in (1, 2):
            result = check_output([sample_seq, '-n', str(sample_size),
                                   fasta_fhand.name])
            counts = count_seqs(read_seqs([StringIO(result)]))
            assert counts['num_seqs'] == sample_size

        # a sample bigger than the file is expected to make the binary fail.
        # Without the else clause this section passed silently whenever the
        # binary exited cleanly.
        stderr = NamedTemporaryFile()
        try:
            check_output([sample_seq, '-n', '10', fasta_fhand.name],
                         stderr=stderr)
        except CalledProcessError:
            with open(stderr.name) as stderr_fhand:
                assert 'larger' in stderr_fhand.read()
        else:
            self.fail('CalledProcessError expected')

        # random sample with stdin
        with open(fasta_fhand.name) as stdin_fhand:
            result = check_output([sample_seq, '-n', '2'], stdin=stdin_fhand)
        assert count_seqs(read_seqs([StringIO(result)]))['num_seqs'] == 2
예제 #3
0
    def test_seqitems_io(self):
        'It reads and writes fasta with the different seq classes'
        fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

        # SeqItem
        seqs = list(read_seqs([StringIO(fasta)],
                              prefered_seq_classes=[SEQITEM]))
        assert seqs[0].kind == SEQITEM
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
        assert seqs[0].object.name == 's1'

        # SeqRecord
        seqs = list(read_seqs([StringIO(fasta)],
                              prefered_seq_classes=[SEQRECORD]))
        assert seqs[0].kind == SEQRECORD
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand, 'fasta')
        assert out_fhand.getvalue() == fasta

        # seqitem not possible with different input and output formats
        try:
            list(read_seqs([StringIO(fasta)], out_format='fastq',
                           prefered_seq_classes=[SEQITEM]))
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # with the same input and output format the seqitems are accepted
        seqs = list(read_seqs([StringIO(fasta)], out_format='fasta',
                              prefered_seq_classes=[SEQITEM]))
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
예제 #4
0
    def test_sample_seq(self):
        'It tests the sample_seqs binary, including a too large sample'
        sample_seq = os.path.join(BIN_DIR, 'sample_seqs')
        assert 'usage' in check_output([sample_seq, '-h'])

        fasta_fhand = NamedTemporaryFile()
        fasta_fhand.write('>seq\nACTA\n>seq2\nACTA\n>seq3\nACTA\n')
        fasta_fhand.flush()

        # random sample of one read
        result = check_output([sample_seq, '-n', '1', fasta_fhand.name])
        assert count_seqs(read_seqs([StringIO(result)]))['num_seqs'] == 1

        # random sample of two reads
        result = check_output([sample_seq, '-n', '2', fasta_fhand.name])
        assert count_seqs(read_seqs([StringIO(result)]))['num_seqs'] == 2

        # asking for more reads than the file holds is expected to fail.
        # The test used to pass without checking anything when the binary
        # did not fail, so the missing error is now reported.
        stderr = NamedTemporaryFile()
        try:
            check_output([sample_seq, '-n', '10', fasta_fhand.name],
                         stderr=stderr)
        except CalledProcessError:
            with open(stderr.name) as stderr_fhand:
                error_msg = stderr_fhand.read()
            assert 'larger' in error_msg
        else:
            self.fail('CalledProcessError expected')

        # random sample with stdin
        with open(fasta_fhand.name) as stdin_fhand:
            result = check_output([sample_seq, '-n', '2'], stdin=stdin_fhand)
        assert count_seqs(read_seqs([StringIO(result)]))['num_seqs'] == 2
예제 #5
0
    def test_calculate_stats():
        'It tests the reports returned by the calculate stat function'
        fnames = ['pairend{0}.sfastq'.format(val) for val in range(1, 6)]
        in_fhands = [open(join(TEST_DATA_DIR, fname)) for fname in fnames]
        reads = read_seqs(in_fhands, prefered_seq_classes=[SEQRECORD])
        report = calculate_sequence_stats(reads, nxs=[50])
        for expected in ('maximum: 4', 'N50'):
            assert expected in report['length']
        assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in report['qual_boxplot']
        for expected in ('[30 , 31[ (96): **********', 'Q30: 100.0'):
            assert expected in report['quality']
        assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00' in report['nucl_freq']
        # the kmer report stays empty when no kmer size is given
        assert report['kmer'] == ''

        genes_fpath = join(TEST_DATA_DIR, 'arabidopsis_genes')
        # kept in a list because the reads are reused by several calls
        reads = list(read_seqs([open(genes_fpath)],
                               prefered_seq_classes=[SEQRECORD]))
        kmers = calculate_sequence_stats(reads)['kmer']
        assert 'Kmer distribution' not in kmers

        kmers = calculate_sequence_stats(reads, kmer_size=3)['kmer']
        for expected in ('Kmer distribution', 'TCT: 167'):
            assert expected in kmers

        # dust
        assert not calculate_sequence_stats(reads)['dustscore']
        dust = calculate_sequence_stats(reads, do_dust_stats=True)['dustscore']
        assert 'average: 1.83\nvariance: 0.14\nnum. seqs.: 6\n' in dust
        assert '% above 7 (low complexity): 0.00' in dust
예제 #6
0
    def test_giuseppe_reads():
        'It splits some real reads, by hand and with process_seq_packets'
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')

        # reads 1, the splitter is called packet by packet
        reads_fpath = os.path.join(TEST_DATA_DIR, '454_reads.fastq')
        splitter = MatePairSplitter(
            linkers=list(read_seqs([open(linker_fpath)])))
        split_seqs = [seq
                      for packet in read_seq_packets([open(reads_fpath)], 2)
                      for seq in splitter(packet)]
        names = [get_name(seq) for seq in split_seqs]
        assert 'G109AZL01BJHT8\\1' in names
        assert 'G109AZL01BJHT8\\2' in names
        assert len(split_seqs) == 20

        # reads 1, test with process_seq_packet
        splitter = MatePairSplitter(
            linkers=list(read_seqs([open(linker_fpath)])))
        packets = read_seq_packets([open(reads_fpath)], 2)
        packets = process_seq_packets(packets, [splitter])[0]
        split_seqs = []
        for packet in list(packets):
            split_seqs.extend(packet)
        names = [get_name(seq) for seq in split_seqs]
        assert 'G109AZL01BJHT8\\1' in names
        assert 'G109AZL01BJHT8\\2' in names
        assert len(split_seqs) == 20

        # reads 2, the splitter is called packet by packet
        reads_fpath = os.path.join(TEST_DATA_DIR, '454_reads2.fastq')
        splitter = MatePairSplitter(
            linkers=list(read_seqs([open(linker_fpath)])))
        split_seqs = [seq
                      for packet in read_seq_packets([open(reads_fpath)], 2)
                      for seq in splitter(packet)]
        names = [get_name(seq) for seq in split_seqs]
        assert 'G109AZL01D8U3X\\1' in names
        assert 'G109AZL01D8U3X\\2' in names
        assert len(split_seqs) == 20

        # reads 2, test with process_seq_packet
        splitter = MatePairSplitter(
            linkers=list(read_seqs([open(linker_fpath)])))
        packets = read_seq_packets([open(reads_fpath)], 2)
        packets = process_seq_packets(packets, [splitter])[0]
        split_seqs = []
        for packet in list(packets):
            split_seqs.extend(packet)
        names = [get_name(seq) for seq in split_seqs]
        assert 'G109AZL01D8U3X\\1' in names
        assert 'G109AZL01D8U3X\\2' in names
        assert len(split_seqs) == 20
예제 #7
0
    def test_giuseppe_reads():
        'It splits some real reads'
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')

        def split_by_hand(seq_fpath):
            'It feeds the packets to a new splitter one by one'
            linkers = list(read_seqs([open(linker_fpath)]))
            splitter = MatePairSplitter(linkers=linkers)
            return [seq
                    for packet in read_seq_packets([open(seq_fpath)], 2)
                    for seq in splitter(packet)]

        def split_with_process_seq_packets(seq_fpath):
            'It runs a new splitter through process_seq_packets'
            linkers = list(read_seqs([open(linker_fpath)]))
            splitter = MatePairSplitter(linkers=linkers)
            packets = read_seq_packets([open(seq_fpath)], 2)
            packets = process_seq_packets(packets, [splitter])[0]
            return [seq for packet in list(packets) for seq in packet]

        # file with the reads and two of the names expected after the split
        cases = (
            ('454_reads.fastq',
             'G109AZL01BJHT8\\1', 'G109AZL01BJHT8\\2'),
            ('454_reads2.fastq',
             'G109AZL01D8U3X\\1', 'G109AZL01D8U3X\\2'),
        )
        for fname, first_name, second_name in cases:
            seq_fpath = os.path.join(TEST_DATA_DIR, fname)
            for split in (split_by_hand, split_with_process_seq_packets):
                new_seqs = split(seq_fpath)
                seq_names = [get_name(seq) for seq in new_seqs]
                assert first_name in seq_names
                assert second_name in seq_names
                assert len(new_seqs) == 20
예제 #8
0
def _read_pairs(in_fhands, paired_reads):
    '''It reads the sequences as SeqItems and returns them grouped.

    With paired_reads the grouping is done by group_pairs_by_name, otherwise
    by group_pairs with one sequence per group.
    '''
    seqs = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
    if not paired_reads:
        return group_pairs(seqs, n_seqs_in_pair=1)
    return group_pairs_by_name(seqs)
예제 #9
0
def sort_fastx_files(in_fhands, key, index_fpath=None, directory=None,
                     max_items_in_memory=None, tempdir=None):
    '''It returns the reads found in the given files sorted.

    key selects the sorting criterion:
        - 'seq': the reads are sorted with get_str_seq as the key.
        - 'name': the reads are sorted with get_name as the key.
        - 'coordinate': the work is handed to sort_by_position_in_ref, that
          takes index_fpath and directory. max_items_in_memory is not used
          in this case.
    Any other key raises a ValueError.
    '''
    if key == 'coordinate':
        return sort_by_position_in_ref(in_fhands, index_fpath=index_fpath,
                                       directory=directory,
                                       tempdir=tempdir)

    if key == 'seq':
        sort_key = get_str_seq
    elif key == 'name':
        sort_key = get_name
    else:
        raise ValueError('Non-supported sorting key')

    return sorted_items(read_seqs(in_fhands), key=sort_key, tempdir=tempdir,
                        max_items_in_memory=max_items_in_memory)
예제 #10
0
def _read_pairs(in_fhands, paired_reads):
    '''It reads the sequences as SeqItems and returns them in groups.

    With paired_reads the groups are built by group_seqs_in_pairs, otherwise
    every sequence is yielded alone inside a one-item list.
    '''
    seqs = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
    if paired_reads:
        return group_seqs_in_pairs(seqs)
    # lazy, the sequences are only read when the result is iterated
    return ([seq] for seq in seqs)
예제 #11
0
def _read_pairs(in_fhands, paired_reads):
    '''It returns the SeqItems read from the files grouped.

    group_pairs_by_name does the grouping for the paired reads, and
    group_pairs, with n_seqs_in_pair=1, for the rest.
    '''
    seqs = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
    return (group_pairs_by_name(seqs) if paired_reads
            else group_pairs(seqs, n_seqs_in_pair=1))
예제 #12
0
 def test_count_seqs():
     'It counts the reads and the bases of the pair end test files'
     fastq_fhands = [
         open(join(TEST_DATA_DIR, 'pairend{0}.sfastq'.format(val)))
         for val in range(1, 6)
     ]
     seqs = read_seqs(fastq_fhands, prefered_seq_classes=[SEQRECORD])
     assert count_seqs(seqs) == {'total_length': 96, 'num_seqs': 24}
예제 #13
0
    def test_deinterleave(self):
        'It de-interleaves an iterator of alternating fwd and rev reads'
        fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')

        # both files are interleaved first to build the input to split
        interleaved = interleave_pairs(read_seqs([open(fwd_fpath)], 'fastq'),
                                       read_seqs([open(rev_fpath)], 'fastq'))
        fwd_out = StringIO()
        rev_out = StringIO()
        deinterleave_pairs(interleaved, fwd_out, rev_out, 'fastq')

        # every output has to match the file its reads were taken from
        fwd_result = fwd_out.getvalue()
        rev_result = rev_out.getvalue()
        assert fwd_result.strip() == open(fwd_fpath).read().strip()
        assert rev_result.strip() == open(rev_fpath).read().strip()
예제 #14
0
    def test_filter_chimeras(self):
        'It sorts the pairs into the non-chimeric, chimeric and unknown files'
        queries = [
            # Typical non chimeric pair
            '>seq1 1:Y:18:ATCACG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTA'
            'CATTGAACTT\n',
            '>seq1 2:Y:18:ATCACG\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAG'
            'GGTTGTAACG\n',
            # Typical chimeric pair
            '>seq2 1:Y:18:ATCACG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGT'
            'CTGCGATCCCTG'
            'GGTAGACTGAGGCCTTCTCGAACTACAAATCATCACCAGACCATGTCCGA\n',
            '>seq2 2:Y:18:ATCACG\nTTAAGGCACGTACGGTACCTAAATCGGCCTGATGGTATT'
            'GATGCTGAACTT'
            'ATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAACGTTAT\n',
            # Unknown, 3' end does not map, impossible to know if it is
            # chimeric
            '>seq7 1:Y:18:ATCACG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCT'
            'ACATTGAACTT'
            'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
            '>seq7 2:Y:18:ATCACG\nATCATTGCATAAGTAACACTCAACCAACAGTGCTACAG'
            'GGTTGTAACGCC'
            'CCTCGAAGGTACCTTTGCCAGACTGGGCTACAGGACACCCAGTCTCCCGGGAGTCT\n',
        ]
        in_fhand = NamedTemporaryFile()
        in_fhand.write(''.join(queries))
        in_fhand.flush()
        ref_fhand = NamedTemporaryFile()
        ref_fhand.write(GENOME)
        ref_fhand.flush()

        out_fhand = NamedTemporaryFile()
        chimeras_fhand = NamedTemporaryFile()
        unknown_fhand = NamedTemporaryFile()
        filter_chimeras(ref_fhand.name, out_fhand, chimeras_fhand, [in_fhand],
                        unknown_fhand)

        # NOTE(review): the result files are read through the very same
        # handles they were written with and no rewind is visible here. If
        # read_seqs does not rewind them the loops below check nothing -
        # confirm.
        expected = ((out_fhand, ['seq1.f', 'seq1.r']),
                    (chimeras_fhand, ['seq2.f', 'seq2.r']),
                    (unknown_fhand, ['seq7.f', 'seq7.r']))
        for result_fhand, names in expected:
            for seq in read_seqs([result_fhand]):
                assert get_name(seq) in names
예제 #15
0
 def test_count_seqs():
     'It checks the number of seqs and the total length counted'
     fnames = ['pairend{0}.sfastq'.format(val) for val in range(1, 6)]
     in_fhands = [open(join(TEST_DATA_DIR, fname)) for fname in fnames]
     expected = {'total_length': 96, 'num_seqs': 24}
     reads = read_seqs(in_fhands, prefered_seq_classes=[SEQRECORD])
     assert count_seqs(reads) == expected
예제 #16
0
    def test_deinterleave(self):
        'It splits the interleaved reads back into fwd and rev outputs'
        fpaths = [os.path.join(TEST_DATA_DIR, fname)
                  for fname in ('pairend1.sfastq', 'pairend1b.sfastq')]
        fwd_seqs, rev_seqs = [read_seqs([open(fpath)], 'fastq')
                              for fpath in fpaths]

        out_fhands = [StringIO(), StringIO()]
        deinterleave_pairs(interleave_pairs(fwd_seqs, rev_seqs),
                           out_fhands[0], out_fhands[1], 'fastq')

        # the outputs have to reproduce the files that were interleaved
        results = [out_fhand.getvalue() for out_fhand in out_fhands]
        for result, fpath in zip(results, fpaths):
            assert result.strip() == open(fpath).read().strip()
예제 #17
0
    def test_trim_chimeras_bin(self):
        'It runs the trim_mp_chimeras binary, also with several threads'
        trim_chimeras_bin = os.path.join(BIN_DIR, 'trim_mp_chimeras')
        assert 'usage' in check_output([trim_chimeras_bin, '-h'])
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

        # the first read carries a poly-A tail that the expected seqs lack
        fastq = (
            '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
            'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
            '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
            '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
            '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
            '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        )
        in_fhand = NamedTemporaryFile()
        in_fhand.write(fastq)
        in_fhand.flush()

        out_fhand = NamedTemporaryFile()
        expected_seqs = [
            'GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
            'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG'
        ]
        base_cmd = [trim_chimeras_bin, in_fhand.name, '-r', index_fpath,
                    '-o', out_fhand.name]
        # first the plain command, then the one with several threads
        for extra_args in ([], ['-p', '2']):
            check_output(base_cmd + extra_args, stdin=in_fhand)
            counts = 0
            trimmed_seqs = read_seqs([open(out_fhand.name)])
            for counts, seq in enumerate(trimmed_seqs, 1):
                assert get_str_seq(seq) in expected_seqs
            # an empty output would pass the loop without checking anything
            assert counts != 0
예제 #18
0
    def test_seqitems_io(self):
        'It checks the IO of the streams of the different seq classes'
        fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

        # SeqItem
        items = list(read_seqs([StringIO(fasta)], 'fasta',
                               prefered_seq_classes=[SEQITEM]))
        assert items[0].kind == SEQITEM
        written = StringIO()
        write_seqs(items, written)
        assert written.getvalue() == fasta
        assert items[0].object.name == 's1'

        # SeqRecord
        records = list(read_seqs([StringIO(fasta)], 'fasta',
                                 prefered_seq_classes=[SEQRECORD]))
        assert records[0].kind == SEQRECORD
        written = StringIO()
        write_seqs(records, written, 'fasta')
        assert written.getvalue() == fasta

        # seqitem not possible with different input and output formats
        try:
            list(read_seqs([StringIO(fasta)], 'fasta', out_format='fastq',
                           prefered_seq_classes=[SEQITEM]))
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # it is possible when both formats are the same
        items = list(read_seqs([StringIO(fasta)], 'fasta',
                               out_format='fasta',
                               prefered_seq_classes=[SEQITEM]))
        written = StringIO()
        write_seqs(items, written)
        assert written.getvalue() == fasta
예제 #19
0
    def test_sample_seq(self):
        'It tests the sample_seqs binary with a file and with stdin'
        sample_seq = os.path.join(BIN_DIR, 'sample_seqs')
        assert 'usage' in check_output([sample_seq, '-h'])

        fasta_fhand = NamedTemporaryFile()
        fasta_fhand.write('>seq\nACTA\n>seq2\nACTA\n>seq3\nACTA\n')
        fasta_fhand.flush()

        # random samples of one and two reads taken from the file
        for sample_size in (1, 2):
            result = check_output([sample_seq, '-n', str(sample_size),
                                   fasta_fhand.name])
            counts = count_seqs(read_seqs([StringIO(result)]))
            assert counts['num_seqs'] == sample_size

        # random sample with stdin
        result = check_output([sample_seq, '-n', '2'],
                              stdin=open(fasta_fhand.name))
        counts = count_seqs(read_seqs([StringIO(result)]))
        assert counts['num_seqs'] == 2
예제 #20
0
    def test_sample_seq(self):
        'It samples reads with the sample_seqs binary'
        binary = os.path.join(BIN_DIR, 'sample_seqs')
        help_msg = check_output([binary, '-h'])
        assert 'usage' in help_msg

        # three reads to sample from
        fasta_fhand = NamedTemporaryFile()
        fasta_fhand.write('>seq\nACTA\n>seq2\nACTA\n>seq3\nACTA\n')
        fasta_fhand.flush()

        # random sample, one read
        stdout = check_output([binary, '-n', '1', fasta_fhand.name])
        sampled = read_seqs([StringIO(stdout)])
        assert count_seqs(sampled)['num_seqs'] == 1

        # random sample, two reads
        stdout = check_output([binary, '-n', '2', fasta_fhand.name])
        sampled = read_seqs([StringIO(stdout)])
        assert count_seqs(sampled)['num_seqs'] == 2

        # random sample with stdin
        stdout = check_output([binary, '-n', '2'],
                              stdin=open(fasta_fhand.name))
        sampled = read_seqs([StringIO(stdout)])
        assert count_seqs(sampled)['num_seqs'] == 2
예제 #21
0
    def test_trim_chimeras_bin(self):
        'It tests the trim_mp_chimeras binary'
        trim_chimeras_bin = os.path.join(BIN_DIR, 'trim_mp_chimeras')
        assert 'usage' in check_output([trim_chimeras_bin, '-h'])
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        query_chunks = [
            '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
            'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
            '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$',
            '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n',
            '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n',
            '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n',
        ]
        in_fhand = NamedTemporaryFile()
        in_fhand.write(''.join(query_chunks))
        in_fhand.flush()

        out_fhand = NamedTemporaryFile()
        expected_seqs = ['GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
                         'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG']

        # With the default options
        cmd = [trim_chimeras_bin, in_fhand.name]
        cmd += ['-r', index_fpath]
        cmd += ['-o', out_fhand.name]
        check_output(cmd, stdin=in_fhand)
        counts = 0
        for counts, seq in enumerate(read_seqs([open(out_fhand.name)]), 1):
            assert get_str_seq(seq) in expected_seqs
        # the output should not be empty
        assert counts != 0

        # With several threads
        cmd = [trim_chimeras_bin, in_fhand.name]
        cmd += ['-r', index_fpath]
        cmd += ['-o', out_fhand.name]
        cmd += ['-p', '2']
        check_output(cmd, stdin=in_fhand)
        counts = 0
        for counts, seq in enumerate(read_seqs([open(out_fhand.name)]), 1):
            assert get_str_seq(seq) in expected_seqs
        assert counts != 0
예제 #22
0
    def test_interleave(self):
        'It interleaves two iterators with paired reads'
        fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        other_fpath = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = list(read_seqs([open(fwd_fpath)], 'fastq'))
        rev_seqs = list(read_seqs([open(other_fpath)], 'fastq'))

        # the reads of these two files do not pass the pairing checks
        try:
            list(interleave_pairs(fwd_seqs, rev_seqs))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # we skip the tests
        interleaved = list(interleave_pairs(fwd_seqs, rev_seqs,
                                            skip_checks=True))
        assert len(interleaved) == 8

        # these two files are interleaved with the checks on
        rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
        fwd_seqs = read_seqs([open(fwd_fpath)], 'fastq')
        rev_seqs = read_seqs([open(rev_fpath)], 'fastq')
        interleaved = list(interleave_pairs(fwd_seqs, rev_seqs))
        assert len(interleaved) == 8
예제 #23
0
 def test_calculate_stats_seqitems():
     'It tests the calculate stat function with seqitems'
     fastq_fhands = [
         open(join(TEST_DATA_DIR, 'pairend{0}.sfastq'.format(val)))
         for val in range(1, 6)
     ]
     seqs = read_seqs(fastq_fhands, prefered_seq_classes=[SEQITEM])
     stats = calculate_sequence_stats(seqs, nxs=[50])
     length_report = stats['length']
     assert 'maximum: 4' in length_report
     assert 'N50' in length_report
     assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in stats['qual_boxplot']
     quality_report = stats['quality']
     assert '[30 , 31[ (96): **********' in quality_report
     assert 'Q30: 100.0' in quality_report
     assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00' in stats['nucl_freq']
     # no kmer size was given, the kmer report has to be empty
     assert stats['kmer'] == ''
예제 #24
0
    def test_interleave(self):
        'It checks the interleaving of the reads of two files'
        def fastq_seqs(fname):
            'It returns an iterator with the reads of a test fastq file'
            return read_seqs([open(os.path.join(TEST_DATA_DIR, fname))],
                             'fastq')

        fwd_seqs = list(fastq_seqs('pairend1.sfastq'))
        rev_seqs = list(fastq_seqs('pairend2.sfastq'))

        # with these two files the interleaving is expected to be refused
        try:
            list(interleave_pairs(fwd_seqs, rev_seqs))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # we skip the tests
        n_seqs = len(list(interleave_pairs(fwd_seqs, rev_seqs,
                                           skip_checks=True)))
        assert n_seqs == 8

        # these two files go through with the checks on
        fwd_seqs = fastq_seqs('pairend1.sfastq')
        rev_seqs = fastq_seqs('pairend1b.sfastq')
        n_seqs = len(list(interleave_pairs(fwd_seqs, rev_seqs)))
        assert n_seqs == 8
예제 #25
0
 def test_calculate_stats_seqitems():
     'It checks the stat reports calculated from seqitems'
     fnames = ['pairend{0}.sfastq'.format(val) for val in range(1, 6)]
     in_fhands = [open(join(TEST_DATA_DIR, fname)) for fname in fnames]
     reads = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
     report = calculate_sequence_stats(reads, nxs=[50])
     for expected in ('maximum: 4', 'N50'):
         assert expected in report['length']
     assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in report['qual_boxplot']
     for expected in ('[30 , 31[ (96): **********', 'Q30: 100.0'):
         assert expected in report['quality']
     assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00' in report['nucl_freq']
     # the kmer report stays empty when no kmer size is given
     assert report['kmer'] == ''
예제 #26
0
 def test_orf_annotator(self):
     'It tests the ORFs annotated by the EstscanOrfAnnotator'
     seqs_fpath = os.path.join(TEST_DATA_DIR, 'orf_test.fasta')
     matrix_fpath = os.path.join(TEST_DATA_DIR, 'Arabidopsis_thaliana.smat')
     seqs = list(read_seqs([open(seqs_fpath)],
                           prefered_seq_classes=[SEQRECORD]))
     annotate_orfs = EstscanOrfAnnotator(matrix_fpath)
     annotated = annotate_orfs(seqs)

     # first feature of the first two seqs: same span, opposite strands
     fwd_orf, rev_orf = [annotated[index].object.features[0]
                         for index in (0, 1)]
     for orf, strand in ((fwd_orf, 1), (rev_orf, -1)):
         assert orf.strand == strand
         assert orf.location.start.position == 0
         assert orf.location.end.position == 541
     # the third seq is expected to end up with no features
     assert not annotated[2].object.features
예제 #27
0
 def test_orf_annotator(self):
     'It checks the strand and the location of the annotated ORFs'
     data_fpaths = [os.path.join(TEST_DATA_DIR, fname)
                    for fname in ('orf_test.fasta',
                                  'Arabidopsis_thaliana.smat')]
     fasta_fpath, estscan_matrix = data_fpaths
     records = read_seqs([open(fasta_fpath)],
                         prefered_seq_classes=[SEQRECORD])
     records = EstscanOrfAnnotator(estscan_matrix)(list(records))

     first_orf = records[0].object.features[0]
     second_orf = records[1].object.features[0]
     # (strand, start, end) expected for each of the two ORFs
     first_found = (first_orf.strand,
                    first_orf.location.start.position,
                    first_orf.location.end.position)
     assert first_found == (1, 0, 541)
     second_found = (second_orf.strand,
                     second_orf.location.start.position,
                     second_orf.location.end.position)
     assert second_found == (-1, 0, 541)
     # nothing should be annotated in the third seq
     assert not records[2].object.features
예제 #28
0
def _read_estcan_result(fhand, result, file_type):
    '''It adds the ORFs found in a dna or pep ESTscan fasta file to result.

    result is updated in place. It is indexed by the sequence name and then
    by a (start, end, strand) tuple; the innermost dict stores the sequence
    string under the given file_type.
    '''
    for seq in read_seqs([fhand], file_format='fasta'):
        # the description is a list of fields separated by semicolons
        items = [item.strip() for item in get_description(seq).split(';')]
        if 'minus strand' in items:
            strand = -1
        else:
            strand = 1
        # start and end are the second and third words of the first field
        start, end = items[0].split(' ', 3)[1:3]
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')
        try:
            seq_orfs = result[seqid]
        except KeyError:
            seq_orfs = result[seqid] = {}
        orf_key = (int(start), int(end), strand)
        orf = seq_orfs.setdefault(orf_key, {})
        orf[file_type] = get_str_seq(seq)
예제 #29
0
def _read_estcan_result(fhand, result, file_type):
    '''It reads a dna or pep ESTscan result file into the result dict.

    Nothing is returned, the sequences are stored in
    result[name][(start, end, strand)][file_type].
    '''
    for seq in read_seqs([fhand], file_format='fasta'):
        fields = get_description(seq).split(';')
        fields = [field.strip() for field in fields]
        strand = 1
        if 'minus strand' in fields:
            strand = -1
        # the limits of the ORF are taken from the first field
        limits = fields[0].split(' ', 3)[1:3]
        start, end = limits
        # estscan changes the name, we have to fix it
        name = get_name(seq).strip(';')
        try:
            orfs = result[name]
        except KeyError:
            orfs = {}
            result[name] = orfs
        location = (int(start), int(end), strand)
        if location not in orfs:
            orfs[location] = {}
        orfs[location][file_type] = get_str_seq(seq)
예제 #30
0
    def _get_chrom_lengths(self):
        '''It returns an OrderedDict with a length for every chromosome.

        When a reference file handle is available the lengths are the
        lengths of the sequences read from it. Otherwise the VCF is read and
        the highest position found for each chromosome is used instead.
        The chromosomes keep the order in which they were first found.
        '''
        chrom_lens = OrderedDict()
        if self._ref_fhand is not None:
            for read in read_seqs([self._ref_fhand]):
                chrom_lens[get_name(read)] = get_length(read)
            return chrom_lens

        # The VCF is opened again by path and read as a gzip file
        vcf_fhand = gzip.open(self._reader.fhand.name)
        try:
            for line in vcf_fhand:
                line = line.strip()
                # headers are skipped, and so are the blank lines, that
                # used to break the parsing with an IndexError
                if not line or line.startswith('#'):
                    continue
                items = line.split()
                chrom = items[0]
                loc = int(items[1])
                if chrom not in chrom_lens or loc > chrom_lens[chrom]:
                    chrom_lens[chrom] = loc
        finally:
            # the handle was never closed before
            vcf_fhand.close()
        return chrom_lens
예제 #31
0
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands,
                    unknown_fhand, unpaired=False, paired_result=True,
                    settings=get_setting('CHIMERAS_SETTINGS'),
                    min_seed_len=None, directory=None):
    file_format = get_format(in_fhands[0])
    if unpaired:
        unpaired_fpaths = [fhand.name for fhand in in_fhands]
        paired_fpaths = None
    else:
        f_fhand = NamedTemporaryFile()
        r_fhand = NamedTemporaryFile()
        seqs = read_seqs(in_fhands)
        deinterleave_pairs(seqs, f_fhand, r_fhand, file_format)
        paired_fpaths = [f_fhand.name, r_fhand.name]
        unpaired_fpaths = None
    bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths,
                                   directory, file_format, min_seed_len)

    total = 0
    chimeric = 0
    unknown = 0
    for pair, kind in classify_mapped_reads(bamfile, settings=settings,
                                           paired_result=paired_result,
                                           file_format=file_format):
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind is CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
            chimeric += 1
        elif kind is UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
            unknown += 1
        total += 1
    mapped = total - chimeric - unknown
    print 'Total pairs analyzed: ', total
    print 'Chimeric pairs filtered: ', chimeric, '\t', chimeric / float(total)
    print 'Unknown pairs found: ', unknown, '\t', unknown / float(total)
    print 'Non-chimeric pairs: ', mapped, '\t', mapped / float(total)
예제 #32
0
    def test_pair_matcher(self):
        'It checks the pairs and the orphans written by match_pairs'

        def fastq_path(fname):
            'It returns the path of a file found in the test data dir'
            return os.path.join(TEST_DATA_DIR, fname)

        def run_matcher(reads):
            'It runs match_pairs and returns the paired and orphan outputs'
            paired_fhand = StringIO()
            orphan_fhand = StringIO()
            match_pairs(reads, paired_fhand, orphan_fhand, 'fastq')
            return paired_fhand.getvalue(), orphan_fhand.getvalue()

        def join_reads(fwd_fname, rev_fname):
            'It joins with flat_zip_longest the reads of two fastq files'
            fwd_reads = read_seqs([open(fastq_path(fwd_fname))], 'fastq')
            rev_reads = read_seqs([open(fastq_path(rev_fname))], 'fastq')
            return flat_zip_longest(fwd_reads, rev_reads)

        # with equal seqs but the last ones
        fwd_reads = read_seqs([open(fastq_path('pairend1.sfastq'))],
                              file_format='fastq')
        rev_reads = read_seqs([open(fastq_path('pairend2.sfastq'))],
                              file_format='fastq')
        paired, orphan = run_matcher(flat_zip_longest(fwd_reads, rev_reads))
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # with the first seqs different
        reads = join_reads('pairend1.sfastq', 'pairend3.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orphan
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        reads = join_reads('pairend4.sfastq', 'pairend2.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # with reads with no direction
        reads = join_reads('pairend7.sfastq', 'pairend2.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        # NOTE(review): this very same check was written twice, maybe one of
        # them was meant for the 1:Y read -- confirm with pairend7.sfastq
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orphan
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orphan
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orphan

        # File is not sorted
        unsorted_fhand = StringIO('@s1.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
                                  '@s2.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
                                  '@s1.r\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n')
        unsorted_reads = read_seqs([unsorted_fhand], 'fastq')
        try:
            run_matcher(unsorted_reads)
        except MalformedFile:
            pass
        else:
            self.fail('MalformedFile error expected')
예제 #33
0
    def test_mate_pair_checker():
        'It checks the pairs and the orphans written by match_pairs'

        def fastq_path(fname):
            'It returns the path of a file found in the test data dir'
            return os.path.join(TEST_DATA_DIR, fname)

        def run_matcher(reads):
            'It runs match_pairs and returns the paired and orphan outputs'
            paired_fhand = StringIO()
            orphan_fhand = StringIO()
            match_pairs(reads, paired_fhand, orphan_fhand, 'fastq')
            return paired_fhand.getvalue(), orphan_fhand.getvalue()

        def join_reads(fwd_fname, rev_fname):
            'It joins with flat_zip_longest the reads of two fastq files'
            fwd_reads = read_seqs([open(fastq_path(fwd_fname))], 'fastq')
            rev_reads = read_seqs([open(fastq_path(rev_fname))], 'fastq')
            return flat_zip_longest(fwd_reads, rev_reads)

        # with equal seqs but the last ones
        fwd_reads = read_seqs([open(fastq_path('pairend1.sfastq'))],
                              file_format='fastq')
        rev_reads = read_seqs([open(fastq_path('pairend2.sfastq'))],
                              file_format='fastq')
        paired, orphan = run_matcher(flat_zip_longest(fwd_reads, rev_reads))
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # with the first seqs different
        reads = join_reads('pairend1.sfastq', 'pairend3.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orphan
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        reads = join_reads('pairend4.sfastq', 'pairend2.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # with reads with no direction
        reads = join_reads('pairend7.sfastq', 'pairend2.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        # NOTE(review): this very same check was written twice, maybe one of
        # them was meant for the 1:Y read -- confirm with pairend7.sfastq
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orphan
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orphan
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orphan
예제 #34
0
    def test_mate_pair_unorderer_checker():
        'It checks match_pairs with ordered=False on concatenated files'

        def match_unordered(fname1, fname2):
            '''It matches the reads of two test files written one after another.

            Both files are copied into one temporary file, its reads are
            given to match_pairs with ordered=False and the paired and the
            orphan outputs are returned.
            '''
            fpaths = [os.path.join(TEST_DATA_DIR, fname)
                      for fname in (fname1, fname2)]
            fhand = NamedTemporaryFile()
            for fpath in fpaths:
                fhand.write(open(fpath).read())
            fhand.flush()
            reads = read_seqs([fhand])

            paired_fhand = StringIO()
            orphan_fhand = StringIO()
            match_pairs(reads, paired_fhand, orphan_fhand, 'fastq',
                        ordered=False)
            return paired_fhand.getvalue(), orphan_fhand.getvalue()

        # with equal seqs but the last ones
        paired, orphan = match_unordered('pairend1.sfastq', 'pairend2.sfastq')
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # with the first seqs different
        paired, orphan = match_unordered('pairend1.sfastq', 'pairend3.sfastq')
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orphan
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        paired, orphan = match_unordered('pairend4.sfastq', 'pairend2.sfastq')
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # unordered file
        paired, orphan = match_unordered('pairend1.sfastq',
                                         'pairend2_unordered.sfastq')
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # with reads with no direction
        paired, orphan = match_unordered('pairend7.sfastq', 'pairend2.sfastq')
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        # NOTE(review): this very same check was written twice, maybe one of
        # them was meant for the 1:Y read -- confirm with pairend7.sfastq
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orphan
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orphan
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orphan
예제 #35
0
def _get_seq_lengths(fhand):
    'It returns a dict with the length of every sequence indexed by its name'
    lengths = {}
    for seq in read_seqs([fhand]):
        lengths[get_name(seq)] = get_length(seq)
    return lengths
예제 #36
0
    def test_pair_matcher(self):
        "It checks the pairs and the orphans written by match_pairs"

        def fastq_path(fname):
            "It returns the path of a file found in the test data dir"
            return os.path.join(TEST_DATA_DIR, fname)

        def run_matcher(reads):
            "It runs match_pairs and returns the paired and orphan outputs"
            paired_fhand = StringIO()
            orphan_fhand = StringIO()
            match_pairs(reads, paired_fhand, orphan_fhand, "fastq")
            return paired_fhand.getvalue(), orphan_fhand.getvalue()

        def join_reads(fwd_fname, rev_fname):
            "It joins with flat_zip_longest the reads of two fastq files"
            fwd_reads = read_seqs([open(fastq_path(fwd_fname))], "fastq")
            rev_reads = read_seqs([open(fastq_path(rev_fname))], "fastq")
            return flat_zip_longest(fwd_reads, rev_reads)

        # with equal seqs but the last ones
        fwd_reads = read_seqs(
            [open(fastq_path("pairend1.sfastq"))], file_format="fastq"
        )
        rev_reads = read_seqs(
            [open(fastq_path("pairend2.sfastq"))], file_format="fastq"
        )
        paired, orphan = run_matcher(flat_zip_longest(fwd_reads, rev_reads))
        assert "@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in paired
        assert "@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in paired
        assert "@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orphan

        # with the first seqs different
        reads = join_reads("pairend1.sfastq", "pairend3.sfastq")
        paired, orphan = run_matcher(reads)
        assert "@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in paired
        assert "@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in paired
        assert "@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in orphan
        assert "@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orphan
        assert "@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orphan

        reads = join_reads("pairend4.sfastq", "pairend2.sfastq")
        paired, orphan = run_matcher(reads)
        assert "@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in paired
        assert "@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in paired
        assert "@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orphan
        assert "@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orphan

        # with reads with no direction
        reads = join_reads("pairend7.sfastq", "pairend2.sfastq")
        paired, orphan = run_matcher(reads)
        assert "@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in paired
        assert "@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in paired
        # NOTE(review): this very same check was written twice, maybe one of
        # them was meant for the 1:Y read -- confirm with pairend7.sfastq
        assert "@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in paired
        assert "@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1" in orphan
        assert "@seq7:136:FC706VJ:2:2104:15343:197393.hhhh" in orphan
        assert "@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC" in orphan

        # File is not sorted
        unsorted_fhand = StringIO(
            "@s1.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n"
            "@s2.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n"
            "@s1.r\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n"
        )
        unsorted_reads = read_seqs([unsorted_fhand], "fastq")
        try:
            run_matcher(unsorted_reads)
        except MalformedFile:
            pass
        else:
            self.fail("MalformedFile error expected")
예제 #37
0
    def test_pair_matcher(self):
        'It checks the pairs and the orphans written by match_pairs'

        def join_reads(fwd_fname, rev_fname):
            'It joins with flat_zip_longest the reads of two test data files'
            fwd_fpath = os.path.join(TEST_DATA_DIR, fwd_fname)
            rev_fpath = os.path.join(TEST_DATA_DIR, rev_fname)
            fwd_reads = read_seqs([open(fwd_fpath)])
            rev_reads = read_seqs([open(rev_fpath)])
            return flat_zip_longest(fwd_reads, rev_reads)

        def run_matcher(reads):
            'It runs match_pairs and returns the paired and orphan outputs'
            paired_fhand = StringIO()
            orphan_fhand = StringIO()
            match_pairs(reads, paired_fhand, orphan_fhand, 'fastq')
            return paired_fhand.getvalue(), orphan_fhand.getvalue()

        # with equal seqs but the last ones
        reads = join_reads('pairend1.sfastq', 'pairend2.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # with the first seqs different
        reads = join_reads('pairend1.sfastq', 'pairend3.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orphan
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        reads = join_reads('pairend4.sfastq', 'pairend2.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orphan

        # with reads with no direction
        reads = join_reads('pairend7.sfastq', 'pairend2.sfastq')
        paired, orphan = run_matcher(reads)
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in paired
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        # NOTE(review): this very same check was written twice, maybe one of
        # them was meant for the 1:Y read -- confirm with pairend7.sfastq
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in paired
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orphan
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orphan
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orphan

        # File is not sorted
        unsorted_fhand = StringIO('@s1.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
                                  '@s2.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
                                  '@s1.r\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n')
        # the format is set by hand in the file handle before reading it
        set_format(unsorted_fhand, 'fastq')
        unsorted_reads = read_seqs([unsorted_fhand])
        paired_fhand = StringIO()
        orphan_fhand = StringIO()
        try:
            match_pairs(unsorted_reads, paired_fhand, orphan_fhand, 'fastq',
                        check_order_buffer_size=10)
        except ItemsNotSortedError:
            pass
        else:
            self.fail('ItemsNotSortedError error expected')
    def test_bin_transcrip_orientator(self):
        'It checks the output of the orientate_transcripts binary'

        def load_records(fpath):
            'It lists the seqs of a file read asking for the SEQRECORD class'
            return list(read_seqs([open(fpath)],
                                  prefered_seq_classes=[SEQRECORD]))

        def seq_str(seq):
            'It returns the sequence as a str'
            return str(seq.object.seq)

        def revcomp_str(seq):
            'It returns the reverse complement of the sequence as a str'
            return str(seq.object.seq.reverse_complement())

        def check_all_annotators(original, oriented):
            'Checks shared by the runs that use polyA, estscan and blast'
            assert seq_str(original[1]) == revcomp_str(oriented[1])
            assert 'polyA' in oriented[1].object.description
            assert seq_str(original[3]) == seq_str(oriented[3])
            assert seq_str(original[4]) == revcomp_str(oriented[4])
            assert 'estscan_orf' in oriented[4].object.description
            assert seq_str(original[5]) == seq_str(oriented[5])
            assert seq_str(original[6]) == revcomp_str(oriented[6])
            assert 'blast arabidopsis_genes' in oriented[6].object.description

        binary = os.path.join(BIN_DIR, 'orientate_transcripts')
        assert 'usage' in check_output([binary, '-h'])

        in_fpath = os.path.join(TEST_DATA_DIR, 'seqs_to_orientate.fasta')
        estscan_matrix = os.path.join(TEST_DATA_DIR,
                                      'Arabidopsis_thaliana.smat')
        blastdb1 = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')
        blastdb2 = os.path.join(TEST_DATA_DIR, 'blastdbs', 'calabaza')

        # arguments shared by the runs that use estscan and both blast dbs,
        # note that only one of the two blast e-values is included
        shared_args = ['-u', estscan_matrix, '-d', blastdb1, '-d', blastdb2,
                       '-g', 'blastn', '-g', 'blastn', '-v', '0.0001']
        polya_args = ['--polya_min_len', '4']

        # all the annotators
        out_fhand = NamedTemporaryFile()
        check_output([binary] + shared_args +
                     ['-v', '0.0001', in_fpath, '-o', out_fhand.name] +
                     polya_args)
        oriented = load_records(out_fhand.name)
        original = load_records(in_fpath)
        assert get_str_seq(original[0]) == get_str_seq(oriented[0])
        check_all_annotators(original, oriented)

        # one of the blast e-values is missing, the binary should fail
        stderr = NamedTemporaryFile()
        try:
            check_output([binary] + shared_args + [in_fpath], stderr=stderr)
        except CalledProcessError:
            stde = open(stderr.name).read()
            assert 'Blast parameters are not well defined' in stde
        else:
            self.fail()

        # without parameters
        out_fhand = NamedTemporaryFile()
        check_output([binary, in_fpath, '-o', out_fhand.name] + polya_args)
        oriented = load_records(out_fhand.name)
        original = load_records(in_fpath)
        assert seq_str(original[0]) == seq_str(oriented[0])
        assert seq_str(original[1]) == revcomp_str(oriented[1])
        for index in (3, 4, 5, 6):
            assert seq_str(original[index]) == seq_str(oriented[index])

        # only with orf annotator, the previous output file is reused
        check_output([binary, in_fpath, '-o', out_fhand.name, '-u',
                      estscan_matrix] + polya_args)
        oriented = load_records(out_fhand.name)
        original = load_records(in_fpath)
        assert seq_str(original[0]) == seq_str(oriented[0])
        assert seq_str(original[1]) == revcomp_str(oriented[1])
        assert seq_str(original[3]) == seq_str(oriented[3])
        assert seq_str(original[4]) == revcomp_str(oriented[4])
        for index in (5, 6):
            assert seq_str(original[index]) == seq_str(oriented[index])

        # multiprocessor
        out_fhand = NamedTemporaryFile()
        check_output([binary] + shared_args +
                     ['-v', '0.0001', in_fpath, '-o', out_fhand.name,
                      '-p', '2'] + polya_args)
        oriented = load_records(out_fhand.name)
        original = load_records(in_fpath)
        assert seq_str(original[0]) == seq_str(oriented[0])
        check_all_annotators(original, oriented)
예제 #39
0
    def test_mate_pair_unorderer_checker():
        'It test the mate pair function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # unordered file
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2_unordered.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)
        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp