def default_seqs():
    """Return the default sequence input shared by tests needing sequence data.

    The data is loaded with ``read_seqs`` from ``data_dir`` using
    ``filename_end='trimmed.fq'`` and ``cutoff=5.8``. Tests in the TestSeqIO
    class are exceptions and do not rely on this default input.
    """
    return read_seqs(data_dir, filename_end='trimmed.fq', cutoff=5.8)
def test_data_random_sampling(self):
    """With p_discard=0.5, each condition must keep between 400 and 600 seqs."""
    sampled = read_seqs(data_dir, filename_end='trimmed.fq', p_discard=0.5)
    # Starting from 1000 seqs, landing at or below 400 or at or above 600
    # purely by chance is unlikely (at least six standard deviations from
    # the mean), so these bounds should not make the test flaky.
    for condition in sampled:
        assert 400 < len(sampled[condition]) < 600
def test_exact_length(self):
    """With seq_len_req=10, every seq in every condition has length 10."""
    seqs_len10 = read_seqs(data_dir, filename_end='trimmed.fq',
                           seq_len_req=10)
    assert all(
        len(seq) == 10
        for condition in seqs_len10
        for seq in seqs_len10[condition]
    )
def test_get_seqs_for_one_specific_condition(self):
    """A single cond_text string yields exactly one, correctly named, condition."""
    expected_name = 'test_C0_rep1_reads'
    selected = read_seqs(data_dir, filename_end='trimmed.fq',
                         cond_text='C0_rep1')
    # Exactly one condition is returned, and it is the requested one.
    assert len(selected) == 1
    assert expected_name in selected
def test_get_seqs_for_multiple_specific_conditions(self):
    """A list of cond_text values yields exactly the matching conditions."""
    expected_names = ('test_C0_rep1_reads', 'test_C1_rep1_reads')
    selected = read_seqs(data_dir, filename_end='trimmed.fq',
                         cond_text=['C0_rep1', 'C1_rep1'])
    # Two conditions were requested, so two must come back ...
    assert len(selected) == 2
    # ... and each of them must be present under its full name.
    for expected_name in expected_names:
        assert expected_name in selected
def test_degen(self):
    """Check the first seq of each condition when reading with degen=5."""
    seqs_degen = read_seqs(data_dir, filename_end='trimmed.fq', degen=5)
    # Expected first seq per condition, in the order the conditions are
    # iterated; converted to lists of characters for the comparison below.
    expected_firsts = [
        list(seq)
        for seq in (
            'CAC',
            'AGAATGAG',
            'AAAATAGCTGGAGGATC',
            'ACGGGGGATGCAGAGGGGTTGTCC',
            'AGTGAAAGGATAGGAAGGTCA',
            'GAGTCGAGACGAGAAGGATA',
        )
    ]
    for idx, condition in enumerate(seqs_degen):
        expected = expected_firsts[idx]
        assert seqs_degen[condition][0] == expected
def test_cutoff_int(self):
    """Check the first seq of each condition when reading with cutoff=5.0."""
    seqs_cutoff_int = read_seqs(data_dir, filename_end='trimmed.fq',
                                cutoff=5.0)
    # One expected first seq per condition, matched by iteration order and
    # compared as a list of characters.
    expected_firsts = list(map(list, (
        'CTA',
        'GTAGGAGA',
        'GAGGTAAAATAGCTGGA',
        'AAGTGACGGGGGATGCAGAGGGGT',
        'GATCGAGTGAAAGGATAGGAA',
        'CGGGTGAGTCGAGACGAGAA',
    )))
    for position, condition in enumerate(seqs_cutoff_int):
        wanted = expected_firsts[position]
        assert seqs_cutoff_int[condition][0] == wanted
def test_degen_cutoff(self):
    """Check the first seq of each condition with degen=5 and cutoff=5.8.

    Each entry of ``first_seqs`` is a tuple of acceptable first seqs for the
    condition at the same position in iteration order; the test passes for a
    condition if its first seq equals any one of the alternatives.
    """
    seqs_degen_cutoff = read_seqs(data_dir, filename_end='trimmed.fq',
                                  degen=5, cutoff=5.8)
    first_seqs = [
        ('G', 'GGAAACGGGAAAGCTAAATCAAGAGA',
         'GGAAACGGGAAAGCTAAATCAAGAG'),  # first seq too short
        ('AGA', 'AG'),
        ('AAAATAGCTGGA', 'AAAATAGCTGG'),
        ('ACGGGGGATGCAGAGGGGT', 'ACGGGGGATGCAGAGGGG'),
        ('AGTGAAAGGATAGGAA', 'AGTGAAAGGATAGGA'),
        ('GAGTCGAGACGAGAA', 'GAGTCGAGACGAGA'),
    ]
    for i, condition in enumerate(seqs_degen_cutoff):
        # Build a real list rather than a generator expression: a generator
        # is consumed by the ``in`` test (so it cannot be safely reused) and
        # appears only as an opaque ``<generator object>`` in the assertion
        # failure output, hiding the accepted alternatives.
        first_seq_possibilities = [list(seq) for seq in first_seqs[i]]
        assert seqs_degen_cutoff[condition][0] in first_seq_possibilities
def test_p_discard_greater_than_1_raises_ValueError(self):
    """read_seqs must raise ValueError when p_discard is greater than 1."""
    too_large_p_discard = 1.5
    with pytest.raises(ValueError):
        read_seqs(data_dir, filename_end='trimmed.fq',
                  p_discard=too_large_p_discard)
def test_neg_p_discard_raises_ValueError(self):
    """read_seqs must raise ValueError when p_discard is negative."""
    negative_p_discard = -0.5
    with pytest.raises(ValueError):
        read_seqs(data_dir, filename_end='trimmed.fq',
                  p_discard=negative_p_discard)
def test_neg_cutoff_raises_ValueError(self):
    """read_seqs must raise ValueError when cutoff is negative."""
    negative_cutoff = -1
    with pytest.raises(ValueError):
        read_seqs(data_dir, filename_end='trimmed.fq',
                  cutoff=negative_cutoff)
def seqs(self):
    """Return the seqs read from ``data_dir`` with ``filename_end='trimmed.fq'``.

    No other options are passed, so ``read_seqs`` applies its own defaults.
    """
    loaded = read_seqs(data_dir, filename_end='trimmed.fq')
    return loaded