def test_giuseppe_reads(): 'It splits some real reads' seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads.fastq') linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta') linkers = list(read_seqs([open(linker_fpath)])) splitter = MatePairSplitter(linkers=linkers) new_seqs = [] for packet in read_seq_packets([open(seq_fpath)], 2): new_seqs.extend(splitter(packet)) seq_names = [get_name(seq) for seq in new_seqs] assert 'G109AZL01BJHT8\\1' in seq_names assert 'G109AZL01BJHT8\\2' in seq_names assert len(new_seqs) == 20 # test with process_seq_packet seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads.fastq') linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta') linkers = list(read_seqs([open(linker_fpath)])) splitter = MatePairSplitter(linkers=linkers) seq_packets = read_seq_packets([open(seq_fpath)], 2) seq_packets = process_seq_packets(seq_packets, [splitter])[0] new_seqs = [seq for l in list(seq_packets) for seq in l] seq_names = [get_name(seq) for seq in new_seqs] assert 'G109AZL01BJHT8\\1' in seq_names assert 'G109AZL01BJHT8\\2' in seq_names assert len(new_seqs) == 20 # reads 2 seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads2.fastq') linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta') linkers = list(read_seqs([open(linker_fpath)])) splitter = MatePairSplitter(linkers=linkers) new_seqs = [] for packet in read_seq_packets([open(seq_fpath)], 2): new_seqs.extend(splitter(packet)) seq_names = [get_name(seq) for seq in new_seqs] assert 'G109AZL01D8U3X\\1' in seq_names assert 'G109AZL01D8U3X\\2' in seq_names assert len(new_seqs) == 20 # test with process_seq_packet seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads2.fastq') linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta') linkers = list(read_seqs([open(linker_fpath)])) splitter = MatePairSplitter(linkers=linkers) seq_packets = read_seq_packets([open(seq_fpath)], 2) seq_packets = process_seq_packets(seq_packets, [splitter])[0] new_seqs = [seq for l in list(seq_packets) for seq in l] seq_names = [get_name(seq) for seq in new_seqs] assert 'G109AZL01D8U3X\\1' in seq_names assert 'G109AZL01D8U3X\\2' in seq_names assert len(new_seqs) == 20
def test_many_reads(self): 'It splits lots of reads to check that blast finds everything' linker = TITANIUM_LINKER def create_seq(index): 'It creates a random seq with a linker' seq1 = ''.join(choice('ACTG') for i in range(100)) seq2 = ''.join(choice('ACTG') for i in range(100)) seq = seq1 + linker + seq2 seq = SeqRecord(id='seq_' + str(index), seq=Seq(seq)) seq = SeqWrapper(SEQRECORD, seq, None) return seq # We want to test that blast reports all reads packet_size = get_setting('PACKET_SIZE') default_blast_max_target_size = 500 assert packet_size > default_blast_max_target_size seqs = [create_seq(i) for i in range(1000)] splitter = MatePairSplitter() for index, seq in enumerate(splitter(seqs)): seq_index = index // 2 pair_index = (index % 2) + 1 expected_id = 'seq_' + str(seq_index) + '\\' + str(pair_index) assert get_name(seq) == expected_id
def test_split_mate(self): 'It tests the function that splits seqs using segments' # pylint: disable=W0212 seq = 'aaatttccctt' seq = SeqWrapper(SEQRECORD, SeqRecord(Seq(seq), id='seq'), None) # fake class to test splitter = MatePairSplitter([seq]) # segment beginning seqs = splitter._split_by_mate_linker(seq, ([(0, 3)], False)) assert get_str_seq(seqs[0]) == 'ttccctt' assert get_name(seqs[0]) == 'seq' # segment at end seqs = splitter._split_by_mate_linker(seq, ([(7, 11)], False)) assert get_str_seq(seqs[0]) == 'aaatttc' assert get_name(seqs[0]) == 'seq' # segmnent in the middle seqs = splitter._split_by_mate_linker(seq, ([(4, 7)], True)) assert get_str_seq(seqs[0]) == 'aaat' assert get_str_seq(seqs[1]) == 'ctt' assert get_name(seqs[0]) == 'seq_pl.part1' assert get_name(seqs[1]) == 'seq_pl.part2' seqs = splitter._split_by_mate_linker(seq, ([(4, 7)], False)) assert get_name(seqs[0]) == r'seq\1' assert get_name(seqs[1]) == r'seq\2' seqs = splitter._split_by_mate_linker(seq, ([(4, 6), (8, 9)], False)) assert get_str_seq(seqs[0]) == 'aaat' assert get_str_seq(seqs[1]) == 'c' assert get_str_seq(seqs[2]) == 't' assert get_name(seqs[0]) == 'seq_mlc.part1' # all sequence is linker seqs = splitter._split_by_mate_linker(seq, ([(0, 10)], False)) assert not get_str_seq(seqs[0]) # there's no segments seqs = splitter._split_by_mate_linker(seq, ([], False)) assert get_name(seq) == get_name(seqs[0]) assert get_str_seq(seq) == get_str_seq(seqs[0])
def test_split_mate(self): 'It tests the function that splits seqs using segments' # pylint: disable=W0212 seq = 'aaatttccctt' seqrecord = SeqRecord(Seq(seq), id='seq') #fake class to test splitter = MatePairSplitter([seqrecord]) # segment beginning seqs = splitter._split_by_mate_linker(seqrecord, ([(0, 3)], False)) assert str(seqs[0].seq) == 'ttccctt' assert seqs[0].id == 'seq' # segment at end seqs = splitter._split_by_mate_linker(seqrecord, ([(7, 10)], False)) assert str(seqs[0].seq) == 'aaatttc' assert seqs[0].id == 'seq' # segmnet in the middle seqs = splitter._split_by_mate_linker(seqrecord, ([(4, 7)], True)) assert str(seqs[0].seq) == 'aaat' assert str(seqs[1].seq) == 'ctt' assert seqs[0].id == 'seq_pl.part1' assert seqs[1].id == 'seq_pl.part2' seqs = splitter._split_by_mate_linker(seqrecord, ([(4, 7)], False)) assert seqs[0].id == 'seq\1' assert seqs[1].id == 'seq\2' seqs = splitter._split_by_mate_linker(seqrecord, ([(4, 6), (8, 9)], False)) assert str(seqs[0].seq) == 'aaat' assert str(seqs[1].seq) == 'c' assert str(seqs[2].seq) == 't' assert seqs[0].id == 'seq_mlc.part1' # all sequence is linker seqs = splitter._split_by_mate_linker(seqrecord, ([(0, 10)], False)) assert not str(seqs[0].seq) # there's no segments seqs = splitter._split_by_mate_linker(seqrecord, ([], False)) assert seqrecord.id == seqs[0].id assert str(seqrecord.seq) == str(seqs[0].seq)
def test_split_mates(self): 'It tests the detection of oligos in sequence files' mate_fhand = NamedTemporaryFile(suffix='.fasta') linker = TITANIUM_LINKER # a complete linker seq5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC' seq3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT' mate_fhand.write('>seq1\n' + seq5 + linker + seq3 + '\n') # no linker mate_fhand.write('>seq2\n' + seq5 + '\n') # a partial linker mate_fhand.write('>seq3\n' + seq5 + linker[2:25] + seq3 + '\n') # the linker is 5 prima mate_fhand.write('>seq4\n' + linker[10:] + seq3 + '\n') # two linkers mate_fhand.write('>seq5\n' + linker + seq3 + FLX_LINKER + seq5 + '\n') # reverse linker rev_linker = get_setting('TITANIUM_LINKER_REV') mate_fhand.write('>seq6\n' + seq5 + rev_linker + seq3 + '\n') mate_fhand.flush() splitter = MatePairSplitter() new_seqs = [] for packet in read_seq_packets([mate_fhand], 2): new_seqs.append(splitter(packet)) out_fhand = StringIO() write_seq_packets(out_fhand, new_seqs, file_format='fasta') result = out_fhand.getvalue() xpect = r'>seq1\1' xpect += '\n' xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n' xpect += r'>seq1\2' xpect += '\n' xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n' xpect += '>seq2\n' xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n' xpect += '>seq3_pl.part1\n' xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTG\n' xpect += '>seq3_pl.part2\n' xpect += 'GTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n' xpect += '>seq4\n' xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n' xpect += '>seq5_mlc.part1\n' xpect += 'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACGATCGATCATGTTGTAT' xpect += 'TG' xpect += 'TGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n' xpect += '>seq5_mlc.part2\n' xpect += 'ACCTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC' xpect += '\n' xpect += r'>seq6\1' xpect += '\n' xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n' xpect += r'>seq6\2' xpect += '\n' xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n' assert xpect == result # with short linker in 3 prima mate_fhand = NamedTemporaryFile(suffix='.fasta') seq = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAA" seq += "CTCACATACACTGCTGTACCGTAC" mate_fhand.write(seq) mate_fhand.flush() splitter = MatePairSplitter() new_seqs = [] for packet in read_seq_packets([mate_fhand], 1): new_seqs.append(splitter(packet)) out_fhand = StringIO() write_seq_packets(out_fhand, new_seqs, file_format='fasta') result = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAACTCACATACA\n" assert result == out_fhand.getvalue()