def test_ne_true(self):
    """``!=`` must be True against another Sequence, an int and a str.

    The other Sequence shares the fixture label but is built from the
    literal sequence string below.
    """
    other_sequence = "---aggatgcgagatgcgtgtt-----"
    left = Sequence(self.label, self.sequence)
    right = Sequence(self.label, other_sequence)

    # Same three operands as before: a Sequence, an int and a plain string.
    for candidate in (right, 2, other_sequence):
        self.assertTrue(left != candidate)
def test_eq_true(self):
    """``==`` must be True for Sequences built from the same sequence.

    Checked twice: once with the fixture label on both sides, once with
    the label "152_4447;size=1812;" on the right-hand side.
    """
    reference = Sequence(self.label, self.sequence)

    same_label = Sequence(self.label, self.sequence)
    self.assertTrue(reference == same_label)

    other_label = Sequence("152_4447;size=1812;", self.sequence)
    self.assertTrue(reference == other_label)
def test_eq_false(self):
    """``==`` must be False against another Sequence, an int and a str.

    The other Sequence shares the fixture label but is built from the
    literal sequence string below.
    """
    other_sequence = "---aggatgcgagatgcgtgtt-----"
    left = Sequence(self.label, self.sequence)
    right = Sequence(self.label, other_sequence)

    # Same three operands as before: a Sequence, an int and a plain string.
    for candidate in (right, 2, other_sequence):
        self.assertFalse(left == candidate)
def test_ne_false(self):
    """``!=`` must be False for Sequences built from the same sequence.

    Checked twice: once with the fixture label on both sides, once with
    the label "152_4447;size=1812;" on the right-hand side.
    """
    reference = Sequence(self.label, self.sequence)

    same_label = Sequence(self.label, self.sequence)
    self.assertFalse(reference != same_label)

    other_label = Sequence("152_4447;size=1812;", self.sequence)
    self.assertFalse(reference != other_label)
def test_deblur_with_non_default_error_profile(self):
    """deblur must return the same single Sequence for two ``error_dist`` values.

    The first profile is passed as a plain list, the second as a numpy
    array; both runs read TEST_SEQS_2 from scratch.
    """
    list_profile = [
        1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005, 0.0000025,
        0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000005, 0.0000005,
        0.0000005, 0.0000005,
    ]
    array_profile = np.array([
        1, 0.06, 0.02, 0.02, 0.01, 0.005, 0.005, 0.005, 0.001, 0.001,
        0.001, 0.0005,
    ])

    for profile in (list_profile, array_profile):
        fasta = StringIO(TEST_SEQS_2)
        observed = deblur(sequence_generator(fasta), error_dist=profile)
        expected = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag"),
        ]
        self.assertEqual(observed, expected)
def test_get_sequences(self):
    """get_sequences(self.seqs) must equal the ten Sequences listed below, in order."""
    expected_records = (
        ("151_4447;size=1812;", "---aggatgcgagatgcgtggt-----"),
        ("151_3288;size=1337;", "---ggatgcgagatgcgtggtg-----"),
        ("151_6640;size=1068;", "---cggaggcgagatgcgtggt-----"),
        ("151_5155;size=998;", "---gaggatgcgagatgcgtgg-----"),
        ("151_527;size=964;", "---acggaggatgatgcgcggt-----"),
        ("151_14716;size=390;", "---gagtgcgagatgcgtggtg-----"),
        ("151_5777;size=305;", "---ggagtgcaagattccaggt-----"),
        ("151_64278;size=200;", "---tactagcaagattcctggt-----"),
        ("151_9240;size=170;", "---tagggcaagactccatggt-----"),
        ("151_41690;size=157;", "---agg-gcgagattcctagtgg----"),
    )
    expected = [Sequence(label, seq) for label, seq in expected_records]

    observed = get_sequences(self.seqs)

    self.assertEqual(observed, expected)
def test_deblur_indel(self):
    """Test if also removes indel sequences
    """
    fasta = StringIO(TEST_SEQS_2)

    # Open a gap column at position 10 of every read so the alignment
    # has room for the indel.
    gapped_reads = []
    for header, read in sequence_generator(fasta):
        gapped_reads.append((header, read[:10] + '-' + read[10:]))

    # Now add one sequence with an A insertion. It reuses the header and
    # the (ungapped) read left over from the last loop iteration.
    inserted = read[:10] + 'A' + read[10:-1] + '-'
    gapped_reads.append((header, inserted))

    observed = deblur(gapped_reads)

    # Strip the '-' characters (same as in launch_workflow).
    for result in observed:
        result.sequence = result.sequence.replace('-', '')

    # The expected output.
    expected = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag"),
    ]

    # Exactly one sequence must come out...
    self.assertEqual(len(observed), 1)
    # ...and it must be the correct one.
    self.assertEqual(observed[0].sequence, expected[0].sequence)
def test_init(self):
    """A Sequence built from the fixture label/sequence exposes the expected attributes."""
    seq = Sequence(self.label, self.sequence)

    expected_attributes = (
        ("label", self.label),
        ("sequence", self.exp_seq),
        ("length", 27),
        ("unaligned_length", 19),
        ("frequency", 1812),
    )
    for attribute, value in expected_attributes:
        self.assertEqual(getattr(seq, attribute), value)

    # Array comparison goes through numpy.testing.
    npt.assert_equal(seq.np_sequence, self.exp_np_seq)
def test_init_mixed_case(self):
    """A mixed-case input sequence yields the same attributes as the fixture expects."""
    mixed_case = "---AggATgcGAgatGCgtgGT-----"
    seq = Sequence(self.label, mixed_case)

    expected_attributes = (
        ("label", self.label),
        ("sequence", self.exp_seq),
        ("length", 27),
        ("unaligned_length", 19),
        ("frequency", 1812),
    )
    for attribute, value in expected_attributes:
        self.assertEqual(getattr(seq, attribute), value)

    # Array comparison goes through numpy.testing.
    npt.assert_equal(seq.np_sequence, self.exp_np_seq)
def test_init_uppercase(self):
    """An all-uppercase input sequence yields the same attributes as the fixture expects."""
    upper_case = "---AGGATGCGAGATGCGTGGT-----"
    seq = Sequence(self.label, upper_case)

    expected_attributes = (
        ("label", self.label),
        ("sequence", self.exp_seq),
        ("length", 27),
        ("unaligned_length", 19),
        ("frequency", 1812),
    )
    for attribute, value in expected_attributes:
        self.assertEqual(getattr(seq, attribute), value)

    # Array comparison goes through numpy.testing.
    npt.assert_equal(seq.np_sequence, self.exp_np_seq)
def test_deblur(self):
    """deblur over TEST_SEQS_2 (read with parse_fasta) must return the single Sequence below."""
    fasta = StringIO(TEST_SEQS_2)
    observed = deblur(parse_fasta(fasta))

    expected_read = (
        "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
        "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
        "gcaagcttgagtctcgtagaggggggcagaattccag"
    )
    expected = [Sequence("E.Coli-999;size=720;", expected_read)]

    self.assertEqual(observed, expected)
def test_deblur_toy_example(self):
    """deblur over TEST_SEQS_1 must return the single Sequence below."""
    fasta = StringIO(TEST_SEQS_1)
    observed = deblur(sequence_generator(fasta))

    expected_read = (
        "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
        "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
        "gcaagcttgagtctcgtagaggggggcagaattccag"
    )
    expected = [Sequence("E.Coli;size=1000;", expected_read)]

    self.assertEqual(observed, expected)
def test_to_fasta(self):
    """to_fasta output is checked for the initial and two float frequencies.

    After the initial check, ``frequency`` is set to 1811.1 and then to
    1811.5; the expected ``size=`` values are 1811 and 1812 respectively.
    """
    record = Sequence(self.label, self.sequence)

    # Frequency as set by the constructor.
    self.assertEqual(
        record.to_fasta(),
        ">151_4447;size=1812;\n---AGGATGCGAGATGCGTGGT-----\n")

    # Non-integer frequencies assigned after construction.
    float_cases = (
        (1811.1, ">151_4447;size=1811;\n---AGGATGCGAGATGCGTGGT-----\n"),
        (1811.5, ">151_4447;size=1812;\n---AGGATGCGAGATGCGTGGT-----\n"),
    )
    for frequency, expected in float_cases:
        record.frequency = frequency
        self.assertEqual(record.to_fasta(), expected)
def test_init_non_actg_chars(self):
    """A sequence containing characters other than a/c/g/t and '-' is still accepted.

    The expected numeric encoding below uses 5 at the positions of the
    'F' and 'O' characters and 4 at the '-' positions.
    """
    raw_sequence = "---FoOatgcgagatgcgtfOo-----"
    seq = Sequence(self.label, raw_sequence)

    expected_encoding = np.array([
        4, 4, 4, 5, 5, 5, 0, 3, 2, 1, 2, 0, 2, 0, 3, 2, 1, 2, 3, 5, 5, 5,
        4, 4, 4, 4, 4,
    ])
    expected_attributes = (
        ("label", self.label),
        ("sequence", "---FOOATGCGAGATGCGTFOO-----"),
        ("length", 27),
        ("unaligned_length", 19),
        ("frequency", 1812),
    )
    for attribute, value in expected_attributes:
        self.assertEqual(getattr(seq, attribute), value)

    # Array comparison goes through numpy.testing.
    npt.assert_equal(seq.np_sequence, expected_encoding)
def test_deblur_indel(self):
    """Test if also removes indel sequences
    """
    fasta = StringIO(TEST_SEQS_2)

    # Open a gap column at position 10 of every read so the alignment
    # has room for the indel.
    reads = []
    for header, read in sequence_generator(fasta):
        reads.append((header, read[:10] + '-' + read[10:]))

    # Build a read with an A in the gap column of the first gapped read.
    # It is added twice, differing only in the header's size:
    #  * size=30 is at the expected indel frequency
    #    (30 < 0.02 * (720 / 0.47), where 0.47 is the mod_factor), so it
    #    should be removed;
    #  * size=31 is above the indel upper bound
    #    (31 > 0.02 * (720 / 0.47)), so it should not be removed.
    first_read = reads[0][1]
    with_insertion = first_read[:10] + 'A' + first_read[11:-1] + '-'
    for indel_header in ('>indel1-read;size=30;', '>indel2-read;size=31;'):
        reads.append((indel_header, with_insertion))

    observed = deblur(reads)

    # Strip the '-' characters (same as in launch_workflow).
    for result in observed:
        result.sequence = result.sequence.replace('-', '')

    # The expected output.
    expected = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag"),
    ]

    # Two sequences must come out: the original and indel2 (too many
    # reads for the expected indel probability).
    self.assertEqual(len(observed), 2)
    # The first one must be the correct sequence...
    self.assertEqual(observed[0].sequence, expected[0].sequence)
    # ...and the second one the high-frequency indel read.
    self.assertEqual(observed[1].label, '>indel2-read;size=31;')
def get_sequences(input_seqs):
    """Returns a list of Sequences

    Parameters
    ----------
    input_seqs : iterable of (str, str)
        The list of input sequences in (label, sequence) format

    Returns
    -------
    list of Sequence or None
        The sequences sorted by decreasing ``frequency``.
        None if no sequence could be built from `input_seqs` (it was empty,
        or building the Sequence objects raised); a warning is logged in
        that case.

    Raises
    ------
    ValueError
        If all the sequences do not have the same length either aligned or
        unaligned.
    """
    logger = logging.getLogger(__name__)

    try:
        seqs = [Sequence(label, seq) for label, seq in input_seqs]
    except Exception:
        # Best-effort by design: any failure while reading/building the
        # sequences is treated like an empty input. Keep a trace of the
        # cause so it is not lost entirely.
        logger.debug('Could not build the input sequences', exc_info=True)
        seqs = []

    if not seqs:
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning('No sequences found in fasta file!')
        return None

    # Check that all the sequence lengths (aligned and unaligned are the same)
    aligned_lengths = {s.length for s in seqs}
    unaligned_lengths = {s.unaligned_length for s in seqs}

    if len(aligned_lengths) != 1 or len(unaligned_lengths) != 1:
        raise ValueError(
            "Not all sequence have the same length. Aligned lengths: %s, "
            "sequence lengths: %s"
            % (", ".join(map(str, aligned_lengths)),
               ", ".join(map(str, unaligned_lengths))))

    # Most abundant sequences first.
    return sorted(seqs, key=attrgetter('frequency'), reverse=True)