class SequenceCollectionTests(TestCase):
    """Unit tests for ``SequenceCollection``.

    Fixtures built in ``setUp``:

    * ``s1``    -- two DNA sequences (``d1``, ``d2``)
    * ``s2``    -- three RNA sequences (``r1``, ``r2``, ``r3``); ``r3``
      contains ``-`` characters, which ``test_degap`` expects ``degap`` to
      strip
    * ``s3``    -- the sequences of ``s1`` followed by those of ``s2``
    * ``s4``    -- two DNA sequences of equal length (``d1``, ``d3``), also
      used to build an ``Alignment`` in the equality tests
    * ``empty`` -- a collection built from an empty list
    """

    def setUp(self):
        self.d1 = DNA('GATTACA', metadata={'id': "d1"})
        self.d2 = DNA('TTG', metadata={'id': "d2"})
        self.d3 = DNA('GTATACA', metadata={'id': "d3"})
        self.r1 = RNA('GAUUACA', metadata={'id': "r1"})
        self.r2 = RNA('UUG', metadata={'id': "r2"})
        self.r3 = RNA('U-----UGCC--', metadata={'id': "r3"})

        self.seqs1 = [self.d1, self.d2]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.s1 = SequenceCollection(self.seqs1)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])

    def test_init(self):
        # smoke test: construction must not raise for any fixture input
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        # sequences with overlapping ids
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_fail_no_id(self):
        # a sequence without an 'id' entry in its metadata is rejected
        seq = Sequence('ACGTACGT')
        with six.assertRaisesRegex(self, SequenceCollectionError,
                                   "'id' must be included in the sequence "
                                   "metadata"):
            SequenceCollection([seq])

    def test_contains(self):
        # membership is tested by sequence id
        self.assertIn('d1', self.s1)
        self.assertIn('r2', self.s2)
        self.assertNotIn('r2', self.s1)

    def test_eq(self):
        # the ``==`` operator is spelled out on purpose: it is the thing
        # under test
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertFalse(self.s4 == FakeSequenceCollection([self.d1, self.d3]))
        self.assertFalse(self.s4 == Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        # integer keys raise IndexError, string keys raise KeyError
        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted once every sequence was yielded
        self.assertRaises(StopIteration, next, s1_iter)

    def test_len(self):
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        # the ``!=`` operator is spelled out on purpose: it is the thing
        # under test
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertTrue(self.s4 != FakeSequenceCollection([self.d1, self.d3]))
        self.assertTrue(self.s4 != Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted once every sequence was yielded
        self.assertRaises(StopIteration, next, s1_iter)

    def test_kmer_frequencies(self):
        # absolute counts, non-overlapping 3-mers
        expected1 = Counter({'GAT': 1, 'TAC': 1})
        expected2 = Counter({'TTG': 1})
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=False),
            [expected1, expected2])

        # relative frequencies, 1-mers
        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.kmer_frequencies(k=1, relative=True),
                         [expected1, expected2])

        # relative frequencies, non-overlapping 3-mers
        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=True),
            [expected1, expected2])

        self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Sequence.kmer_frequencies for more details.
        sc = SequenceCollection([RNA('C' * 10, metadata={'id': 's1'}),
                                 RNA('G' * 10, metadata={'id': 's2'})])
        self.assertEqual(sc.kmer_frequencies(1, relative=True),
                         [defaultdict(float, {'C': 1.0}),
                          defaultdict(float, {'G': 1.0})])

    def test_str(self):
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", metadata={'id': "d1"}),
                                 DNA("ACGG", metadata={'id': "d2"})])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])

        # hamming is applied to the sequences' ``values`` attribute rather
        # than to the sequence objects themselves
        def h(seq1, seq2):
            return hamming(seq1.values, seq2.values)
        actual = s1.distances(h)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(seq1, seq2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        # result is indexed as (count, mean length, std of length), matching
        # the numbers asserted in test_repr
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        expected = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "r1"}),
            RNA('UUG', metadata={'id': "r2"}),
            RNA('UUGCC', metadata={'id': "r3"})])
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def test_update_ids_default_behavior(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "1"}),
            RNA('UUG', metadata={'id': "2"}),
            RNA('U-----UGCC--', metadata={'id': "3"})
        ])
        exp_id_map = {'1': 'r1', '2': 'r2', '3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids()
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids()
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_prefix(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "abc1"}),
            RNA('UUG', metadata={'id': "abc2"}),
            RNA('U-----UGCC--', metadata={'id': "abc3"})
        ])
        exp_id_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_func_parameter(self):
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "r1-42"}),
            RNA('UUG', metadata={'id': "r2-42"}),
            RNA('U-----UGCC--', metadata={'id': "r3-42"})
        ])
        exp_id_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(func=append_42)
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(func=append_42)
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_ids_parameter(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "abc"}),
            RNA('UUG', metadata={'id': "def"}),
            RNA('U-----UGCC--', metadata={'id': "ghi"})
        ])
        exp_id_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(ids=[])
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_sequence_attributes_propagated(self):
        # 1 seq
        exp_sc = Alignment([
            DNA('ACGT', metadata={'id': "abc", 'description': 'desc'},
                positional_metadata={'quality': range(4)})
        ])
        exp_id_map = {'abc': 'seq1'}

        obj = Alignment([
            DNA('ACGT', metadata={'id': "seq1", 'description': 'desc'},
                positional_metadata={'quality': range(4)})
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc',))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # 2 seqs
        exp_sc = Alignment([
            DNA('ACGT', metadata={'id': "abc", 'description': 'desc1'},
                positional_metadata={'quality': range(4)}),
            DNA('TGCA', metadata={'id': "def", 'description': 'desc2'},
                positional_metadata={'quality': range(4)[::-1]})
        ])
        exp_id_map = {'abc': 'seq1', 'def': 'seq2'}

        obj = Alignment([
            DNA('ACGT', metadata={'id': "seq1", 'description': 'desc1'},
                positional_metadata={'quality': (0, 1, 2, 3)}),
            DNA('TGCA', metadata={'id': "seq2", 'description': 'desc2'},
                positional_metadata={'quality': (3, 2, 1, 0)})
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc', 'def'))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

    def test_update_ids_invalid_parameter_combos(self):
        # ids, func and prefix are rejected when combined pairwise
        with six.assertRaisesRegex(self, SequenceCollectionError,
                                   'ids and func'):
            self.s1.update_ids(func=lambda e: e, ids=['foo', 'bar'])

        with six.assertRaisesRegex(self, SequenceCollectionError, 'prefix'):
            self.s1.update_ids(ids=['foo', 'bar'], prefix='abc')

        with six.assertRaisesRegex(self, SequenceCollectionError, 'prefix'):
            self.s1.update_ids(func=lambda e: e, prefix='abc')

    def test_update_ids_invalid_ids(self):
        # incorrect number of new ids
        with six.assertRaisesRegex(self, SequenceCollectionError, '3 != 2'):
            self.s1.update_ids(ids=['foo', 'bar', 'baz'])
        with six.assertRaisesRegex(self, SequenceCollectionError, '4 != 2'):
            self.s1.update_ids(func=lambda e: ['foo', 'bar', 'baz', 'abc'])

        # duplicates
        with six.assertRaisesRegex(self, SequenceCollectionError, 'foo'):
            self.s2.update_ids(ids=['foo', 'bar', 'foo'])
        with six.assertRaisesRegex(self, SequenceCollectionError, 'bar'):
            self.s2.update_ids(func=lambda e: ['foo', 'bar', 'bar'])

    def test_is_empty(self):
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_iteritems(self):
        # iteritems yields (id, sequence) pairs in collection order
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.metadata['id'], s) for s in self.s1])

    def test_sequence_count(self):
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])
class SequenceCollectionTests(TestCase):
    """Unit tests for ``SequenceCollection``.

    Fixtures built in ``setUp``:

    * ``s1``         -- two DNA sequences (``d1``, ``d2``)
    * ``s1_lower``   -- the same two sequences in lower case
    * ``s2``         -- three RNA sequences (``r1``, ``r2``, ``r3``); ``r3``
      contains ``-`` characters, which ``test_degap`` expects ``degap`` to
      strip
    * ``s3``         -- the sequences of ``s1`` followed by those of ``s2``
    * ``s4``         -- two DNA sequences of equal length (``d1``, ``d3``),
      also used to build an ``Alignment`` in the equality tests
    * ``empty``      -- a collection built from an empty list
    * ``invalid_s1`` -- a collection holding ``i1`` (``'GATXACA'``), which
      ``test_is_valid`` expects to be reported as invalid
    """

    def setUp(self):
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d3 = DNASequence('GTATACA', id="d3")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.d3_lower = DNASequence('gtataca', id="d3")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")

        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        # (id, sequence string) tuples mirroring seqs1/seqs2/seqs3
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        # smoke test: construction must not raise for any fixture input
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        # sequences with overlapping ids
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        # valid DNA and valid RNA input both construct without raising
        # (test_is_valid asserts that s1 and s2 are valid)
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs2, validate=True)
        # invalid_s1 holds i1 ('GATXACA'), which is_valid() rejects, so
        # construction with validation must fail
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_contains(self):
        # membership is tested by sequence id
        self.assertIn('d1', self.s1)
        self.assertIn('r2', self.s2)
        self.assertNotIn('r2', self.s1)

    def test_eq(self):
        # the ``==`` operator is spelled out on purpose: it is the thing
        # under test
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertFalse(self.s4 == FakeSequenceCollection([self.d1, self.d3]))
        self.assertFalse(self.s4 == Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        # integer keys raise IndexError, string keys raise KeyError
        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted once every sequence was yielded
        self.assertRaises(StopIteration, next, s1_iter)

    def test_len(self):
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        # the ``!=`` operator is spelled out on purpose: it is the thing
        # under test
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertTrue(self.s4 != FakeSequenceCollection([self.d1, self.d3]))
        self.assertTrue(self.s4 != Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted once every sequence was yielded
        self.assertRaises(StopIteration, next, s1_iter)

    def test_k_word_frequencies(self):
        # 1-mers
        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        # non-overlapping 3-mers
        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for BiologicalSequence.k_word_frequencies for more details.
        sc = SequenceCollection([RNA('C' * 10, id='s1'),
                                 RNA('G' * 10, id='s2')])
        self.assertEqual(sc.k_word_frequencies(1),
                         [defaultdict(float, {'C': 1.0}),
                          defaultdict(float, {'G': 1.0})])

    def test_str(self):
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(seq1, seq2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        # result is indexed as (count, mean length, std of length), matching
        # the numbers asserted in test_repr
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        expected = SequenceCollection([
            RNASequence('GAUUACA', id="r1"),
            RNASequence('UUG', id="r2"),
            RNASequence('UUGCC', id="r3")])
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def _assert_sequence_collections_equal(self, observed, expected):
        """Compare SequenceCollections strictly."""
        # TODO remove this custom equality testing code when SequenceCollection
        # has an equals method (part of #656). We need this method to include
        # IDs in the comparison (not part of SequenceCollection.__eq__).
        self.assertEqual(observed, expected)
        for obs_seq, exp_seq in zip(observed, expected):
            self.assertTrue(obs_seq.equals(exp_seq))

    def test_update_ids_default_behavior(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="1"),
            RNA('UUG', id="2"),
            RNA('U-----UGCC--', id="3")
        ])
        exp_id_map = {'1': 'r1', '2': 'r2', '3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids()
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids()
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_prefix(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="abc1"),
            RNA('UUG', id="abc2"),
            RNA('U-----UGCC--', id="abc3")
        ])
        exp_id_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_fn_parameter(self):
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="r1-42"),
            RNA('UUG', id="r2-42"),
            RNA('U-----UGCC--', id="r3-42")
        ])
        exp_id_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_ids_parameter(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="abc"),
            RNA('UUG', id="def"),
            RNA('U-----UGCC--', id="ghi")
        ])
        exp_id_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(ids=[])
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_sequence_attributes_propagated(self):
        # 1 seq
        exp_sc = Alignment([
            DNA('ACGT', id="abc", description='desc', quality=range(4))
        ])
        exp_id_map = {'abc': 'seq1'}

        obj = Alignment([
            DNA('ACGT', id="seq1", description='desc', quality=range(4))
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc',))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # 2 seqs
        exp_sc = Alignment([
            DNA('ACGT', id="abc", description='desc1', quality=range(4)),
            DNA('TGCA', id="def", description='desc2', quality=range(4)[::-1])
        ])
        exp_id_map = {'abc': 'seq1', 'def': 'seq2'}

        obj = Alignment([
            DNA('ACGT', id="seq1", description='desc1', quality=(0, 1, 2, 3)),
            DNA('TGCA', id="seq2", description='desc2', quality=(3, 2, 1, 0))
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc', 'def'))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

    def test_update_ids_invalid_parameter_combos(self):
        # ids, fn and prefix are rejected when combined pairwise
        with self.assertRaisesRegexp(SequenceCollectionError, 'ids and fn'):
            self.s1.update_ids(fn=lambda e: e, ids=['foo', 'bar'])

        with self.assertRaisesRegexp(SequenceCollectionError, 'prefix'):
            self.s1.update_ids(ids=['foo', 'bar'], prefix='abc')

        with self.assertRaisesRegexp(SequenceCollectionError, 'prefix'):
            self.s1.update_ids(fn=lambda e: e, prefix='abc')

    def test_update_ids_invalid_ids(self):
        # incorrect number of new ids
        with self.assertRaisesRegexp(SequenceCollectionError, '3 != 2'):
            self.s1.update_ids(ids=['foo', 'bar', 'baz'])
        with self.assertRaisesRegexp(SequenceCollectionError, '4 != 2'):
            self.s1.update_ids(fn=lambda e: ['foo', 'bar', 'baz', 'abc'])

        # duplicates
        with self.assertRaisesRegexp(SequenceCollectionError, 'foo'):
            self.s2.update_ids(ids=['foo', 'bar', 'foo'])
        with self.assertRaisesRegexp(SequenceCollectionError, 'bar'):
            self.s2.update_ids(fn=lambda e: ['foo', 'bar', 'bar'])

    def test_is_empty(self):
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        # iteritems yields (id, sequence) pairs in collection order
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.id, s) for s in self.s1])

    def test_lower(self):
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_upper(self):
        self.assertEqual(self.s1_lower.upper(), self.s1)
class SequenceCollectionTests(TestCase):
    """Tests of the SequenceCollection class.

    Fixtures built in ``setUp``:

    * ``s1``         -- two DNA sequences (``d1``, ``d2``)
    * ``s1_lower``   -- the same two sequences in lower case
    * ``s2``         -- three RNA sequences (``r1``, ``r2``, ``r3``); ``r3``
      contains ``-`` characters
    * ``s3``         -- the sequences of ``s1`` followed by those of ``s2``
    * ``empty``      -- a collection built from an empty list
    * ``invalid_s1`` -- a collection holding ``i1`` (``'GATXACA'``), which
      ``test_is_valid`` expects to be reported as invalid
    """

    def setUp(self):
        """Initialize values to be used in tests"""
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")

        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        # (id, sequence string) tuples mirroring seqs1/seqs2/seqs3, in the
        # form passed to from_fasta_records
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Initialization functions as expected with varied input types"""
        # smoke test: construction must not raise for any fixture input
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        """initialization with sequences with overlapping ids fails"""
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        """initialization with validation functions as expected"""
        # valid DNA and valid RNA input both construct without raising
        # (test_is_valid asserts that s1 and s2 are valid)
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs2, validate=True)
        # invalid_s1 holds i1 ('GATXACA'), which is_valid() rejects, so
        # construction with validation must fail
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_from_fasta_records(self):
        """Initialization from list of tuples functions as expected"""
        # smoke test: only checks that construction does not raise
        SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
        SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
        SequenceCollection.from_fasta_records(self.seqs3_t, NucleotideSequence)

    def test_contains(self):
        """in operator functions as expected"""
        self.assertIn('d1', self.s1)
        self.assertIn('r2', self.s2)
        self.assertNotIn('r2', self.s1)

    def test_eq(self):
        """equality operator functions as expected"""
        # the ``==`` operator is spelled out on purpose: it is the thing
        # under test
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertFalse(self.s1 == FakeSequenceCollection([self.d1, self.d2]))
        self.assertFalse(self.s1 == Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        """getitem functions as expected"""
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        # integer keys raise IndexError, string keys raise KeyError
        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        """iter functions as expected"""
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted once every sequence was yielded
        self.assertRaises(StopIteration, next, s1_iter)

    def test_len(self):
        """len functions as expected"""
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        """inequality operator functions as expected"""
        # the ``!=`` operator is spelled out on purpose: it is the thing
        # under test
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        """repr functions as expected"""
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        """reversed functions as expected"""
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted once every sequence was yielded
        self.assertRaises(StopIteration, next, s1_iter)

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected"""
        # the expected values are floats, so use a float default factory
        # (dict equality ignores the factory; this is for readability only)
        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        # non-overlapping 3-mers
        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str functions as expected"""
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distances(self):
        """distances functions as expected"""
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(seq1, seq2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        """distribution_stats functions as expected"""
        # result is indexed as (count, mean length, std of length), matching
        # the numbers asserted in test_repr
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        """degap functions as expected"""
        # build the expected collection by stripping '.' and '-' from the
        # raw RNA records
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        """get_seq functions as expected"""
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        """ids functions as expected"""
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def test_int_map(self):
        """int_map functions as expected"""
        # int_map returns (new id -> sequence, new id -> original id)
        expected1 = {"1": self.d1, "2": self.d2}
        expected2 = {"1": "d1", "2": "d2"}
        self.assertEqual(self.s1.int_map(), (expected1, expected2))

        expected1 = {"h-1": self.d1, "h-2": self.d2}
        expected2 = {"h-1": "d1", "h-2": "d2"}
        self.assertEqual(self.s1.int_map(prefix='h-'), (expected1, expected2))

    def test_is_empty(self):
        """is_empty functions as expected"""
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid functions as expected"""
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems functions as expected"""
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.id, s) for s in self.s1])

    def test_lower(self):
        """lower functions as expected"""
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        """sequence_count functions as expected"""
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        """sequence_lengths functions as expected"""
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_to_fasta(self):
        """to_fasta functions as expected"""
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(self.s1.to_fasta(), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(self.s2.to_fasta(), exp2)

    def test_toFasta(self):
        """toFasta produces the same output as to_fasta"""
        # NOTE(review): warnings are silenced here, presumably because
        # toFasta emits a deprecation warning -- confirm against the
        # implementation
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            exp = ">d1\nGATTACA\n>d2\nTTG\n"
            self.assertEqual(self.s1.toFasta(), exp)

    def test_upper(self):
        """upper functions as expected"""
        self.assertEqual(self.s1_lower.upper(), self.s1)
class SequenceCollectionTests(TestCase):
    """Tests of the SequenceCollection class."""

    def setUp(self):
        """Initialize values to be used in tests."""
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")
        # Sequence behind ``invalid_s1``, the collection that
        # ``test_is_valid`` expects to be reported as invalid.
        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        # (id, sequence) tuples mirroring seqs1/seqs2/seqs3; consumed by
        # from_fasta_records and test_degap.
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Initialization functions as expected with varied input types."""
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        """initialization with sequences with overlapping ids fails."""
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        """initialization with validation functions as expected."""
        SequenceCollection(self.seqs1, validate=True)
        # The sequences held by invalid_s1 do not pass validation (see
        # test_is_valid), so building from them with validate=True must
        # be rejected.
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_from_fasta_records(self):
        """Initialization from list of tuples functions as expected."""
        SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
        SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
        SequenceCollection.from_fasta_records(self.seqs3_t,
                                              NucleotideSequence)

    def test_contains(self):
        """in operator functions as expected."""
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        """equality operator functions as expected."""
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertFalse(self.s1 == FakeSequenceCollection([self.d1, self.d2]))
        self.assertFalse(self.s1 == Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        """getitem functions as expected."""
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        # integer and string keys fail with different exception types
        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        """iter functions as expected."""
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted once every sequence was visited
        self.assertRaises(StopIteration, next, s1_iter)

    def test_len(self):
        """len functions as expected."""
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        """inequality operator functions as expected."""
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        """repr functions as expected."""
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        """reversed functions as expected."""
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted once every sequence was visited
        self.assertRaises(StopIteration, next, s1_iter)

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected."""
        # k=1: one frequency mapping per sequence in s1
        expected1 = defaultdict(int)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(int)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        # k=3 without overlap
        expected1 = defaultdict(int)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(int)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str functions as expected."""
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distances(self):
        """distances functions as expected."""
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        """distribution_stats functions as expected."""
        # The three positions checked below line up with the n, mean and
        # std values asserted in test_repr.
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        """degap functions as expected."""
        # expected result: the s2 records with '.' and '-' stripped
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        """get_seq functions as expected."""
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        """ids functions as expected."""
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def _assert_sequence_collections_equal(self, observed, expected):
        """Compare SequenceCollections strictly."""
        # TODO remove this custom equality testing code when SequenceCollection
        # has an equals method (part of #656). We need this method to include
        # IDs in the comparison (not part of SequenceCollection.__eq__).
        self.assertEqual(observed, expected)
        for obs_seq, exp_seq in zip(observed, expected):
            self.assertTrue(obs_seq.equals(exp_seq))

    def test_update_ids_default_behavior(self):
        """update_ids without arguments assigns ids "1", "2", ..."""
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="1"),
            RNA('UUG', id="2"),
            RNA('U-----UGCC--', id="3")
        ])
        exp_id_map = {'1': 'r1', '2': 'r2', '3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids()
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids()
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_prefix(self):
        """update_ids(prefix=...) prepends the prefix to the new ids."""
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="abc1"),
            RNA('UUG', id="abc2"),
            RNA('U-----UGCC--', id="abc3")
        ])
        exp_id_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_fn_parameter(self):
        """update_ids(fn=...) derives the new ids from the current ones."""
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="r1-42"),
            RNA('UUG', id="r2-42"),
            RNA('U-----UGCC--', id="r3-42")
        ])
        exp_id_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_ids_parameter(self):
        """update_ids(ids=...) applies the given ids in order."""
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="abc"),
            RNA('UUG', id="def"),
            RNA('U-----UGCC--', id="ghi")
        ])
        exp_id_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(ids=[])
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_sequence_attributes_propagated(self):
        """update_ids keeps description and quality of each sequence."""
        # 1 seq
        exp_sc = Alignment([
            DNA('ACGT', id="abc", description='desc', quality=range(4))
        ])
        exp_id_map = {'abc': 'seq1'}

        obj = Alignment([
            DNA('ACGT', id="seq1", description='desc', quality=range(4))
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc',))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # 2 seqs
        exp_sc = Alignment([
            DNA('ACGT', id="abc", description='desc1', quality=range(4)),
            DNA('TGCA', id="def", description='desc2', quality=range(4)[::-1])
        ])
        exp_id_map = {'abc': 'seq1', 'def': 'seq2'}

        obj = Alignment([
            DNA('ACGT', id="seq1", description='desc1', quality=(0, 1, 2, 3)),
            DNA('TGCA', id="seq2", description='desc2', quality=(3, 2, 1, 0))
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc', 'def'))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

    def test_update_ids_invalid_parameter_combos(self):
        """update_ids rejects mutually exclusive parameter combinations."""
        # six.assertRaisesRegex is used instead of the assertRaisesRegexp
        # alias, which newer Python versions no longer provide.
        with six.assertRaisesRegex(self, SequenceCollectionError,
                                   'ids and fn'):
            self.s1.update_ids(fn=lambda e: e, ids=['foo', 'bar'])

        with six.assertRaisesRegex(self, SequenceCollectionError, 'prefix'):
            self.s1.update_ids(ids=['foo', 'bar'], prefix='abc')

        with six.assertRaisesRegex(self, SequenceCollectionError, 'prefix'):
            self.s1.update_ids(fn=lambda e: e, prefix='abc')

    def test_update_ids_invalid_ids(self):
        """update_ids rejects a wrong number of ids and duplicate ids."""
        # incorrect number of new ids
        with six.assertRaisesRegex(self, SequenceCollectionError, '3 != 2'):
            self.s1.update_ids(ids=['foo', 'bar', 'baz'])
        with six.assertRaisesRegex(self, SequenceCollectionError, '4 != 2'):
            self.s1.update_ids(fn=lambda e: ['foo', 'bar', 'baz', 'abc'])

        # duplicates
        with six.assertRaisesRegex(self, SequenceCollectionError, 'foo'):
            self.s2.update_ids(ids=['foo', 'bar', 'foo'])
        with six.assertRaisesRegex(self, SequenceCollectionError, 'bar'):
            self.s2.update_ids(fn=lambda e: ['foo', 'bar', 'bar'])

    def test_int_map(self):
        """int_map emits a UserWarning and returns both mappings."""
        expected1 = {"1": self.d1, "2": self.d2}
        expected2 = {"1": "d1", "2": "d2"}
        obs = npt.assert_warns(UserWarning, self.s1.int_map)
        self.assertEqual(obs, (expected1, expected2))

        expected1 = {"h-1": self.d1, "h-2": self.d2}
        expected2 = {"h-1": "d1", "h-2": "d2"}
        obs = npt.assert_warns(UserWarning, self.s1.int_map, prefix='h-')
        self.assertEqual(obs, (expected1, expected2))

    def test_is_empty(self):
        """is_empty functions as expected."""
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid functions as expected."""
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems functions as expected."""
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.id, s) for s in self.s1])

    def test_lower(self):
        """lower functions as expected."""
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        """sequence_count functions as expected."""
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        """sequence_lengths functions as expected."""
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_to_fasta(self):
        """to_fasta functions as expected."""
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(self.s1.to_fasta(), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(self.s2.to_fasta(), exp2)

    def test_toFasta(self):
        """toFasta emits a UserWarning and still returns FASTA text."""
        exp = ">d1\nGATTACA\n>d2\nTTG\n"
        obs = npt.assert_warns(UserWarning, self.s1.toFasta)

        self.assertEqual(obs, exp)

    def test_upper(self):
        """upper functions as expected."""
        self.assertEqual(self.s1_lower.upper(), self.s1)
class SequenceCollectionTests(TestCase):
    """Exercise SequenceCollection with small DNA and RNA fixtures."""

    def setUp(self):
        """Create the sequences and collections shared by every test."""
        self.d1, self.d2, self.d3 = [
            DNA(chars, metadata={'id': seq_id})
            for seq_id, chars in (('d1', 'GATTACA'),
                                  ('d2', 'TTG'),
                                  ('d3', 'GTATACA'))
        ]
        self.r1, self.r2, self.r3 = [
            RNA(chars, metadata={'id': seq_id})
            for seq_id, chars in (('r1', 'GAUUACA'),
                                  ('r2', 'UUG'),
                                  ('r3', 'U-----UGCC--'))
        ]

        self.seqs1 = [self.d1, self.d2]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.s1 = SequenceCollection(self.seqs1)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])

    def _renamed_s2(self, new_ids):
        """Build a collection with s2's residues carrying ``new_ids``."""
        residues = ('GAUUACA', 'UUG', 'U-----UGCC--')
        return SequenceCollection([
            RNA(chars, metadata={'id': new_id})
            for chars, new_id in zip(residues, new_ids)
        ])

    def _check_update_ids(self, new_ids, kwargs, empty_kwargs=None):
        """Check update_ids on s2 and on the empty collection.

        ``kwargs`` is forwarded to ``self.s2.update_ids`` and the result is
        compared against s2 renamed to ``new_ids``. ``empty_kwargs``
        (defaulting to ``kwargs``) is forwarded to
        ``self.empty.update_ids``, which must give back an empty collection
        and an empty id map.
        """
        if empty_kwargs is None:
            empty_kwargs = kwargs

        # 3 seqs
        wanted_sc = self._renamed_s2(new_ids)
        wanted_map = dict(zip(new_ids, ('r1', 'r2', 'r3')))
        got_sc, got_map = self.s2.update_ids(**kwargs)
        self.assertEqual(got_sc, wanted_sc)
        self.assertEqual(got_map, wanted_map)

        # empty
        got_sc, got_map = self.empty.update_ids(**empty_kwargs)
        self.assertEqual(got_sc, self.empty)
        self.assertEqual(got_map, {})

    def test_init(self):
        """Construction accepts DNA, RNA, mixed and empty input."""
        for seqs in (self.seqs1, self.seqs2, self.seqs3, []):
            SequenceCollection(seqs)

    def test_init_fail(self):
        """Construction fails for sequences with overlapping ids."""
        duplicated = [self.d1, self.d1]
        with self.assertRaises(SequenceCollectionError):
            SequenceCollection(duplicated)

    def test_init_fail_no_id(self):
        """Construction fails when a sequence has no 'id' metadata."""
        anonymous = Sequence('ACGTACGT')
        message = "'id' must be included in the sequence metadata"
        with six.assertRaisesRegex(self, SequenceCollectionError, message):
            SequenceCollection([anonymous])

    def test_contains(self):
        """The ``in`` operator looks sequences up by id."""
        cases = (('d1', self.s1, True),
                 ('r2', self.s2, True),
                 ('r2', self.s1, False))
        for seq_id, collection, present in cases:
            self.assertEqual(seq_id in collection, present)

    def test_eq(self):
        """``==`` honours content, size and concrete type."""
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        rebuilt = SequenceCollection([self.d1, self.d2])
        self.assertTrue(self.s1 == rebuilt)
        self.assertTrue(rebuilt == self.s1)

        # SequenceCollections with different number of sequences are not equal
        shorter = SequenceCollection([self.d1])
        self.assertFalse(self.s1 == shorter)

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        for other_type in (FakeSequenceCollection, Alignment):
            self.assertFalse(self.s4 == other_type([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        mixed = SequenceCollection([self.d1, self.r1])
        self.assertFalse(self.s1 == mixed)

    def test_getitem(self):
        """Integer indexing returns sequences in insertion order."""
        cases = ((self.s1, 0, self.d1),
                 (self.s1, 1, self.d2),
                 (self.s2, 0, self.r1),
                 (self.s2, 1, self.r2))
        for collection, index, wanted in cases:
            self.assertEqual(collection[index], wanted)

        # integer and string keys fail with different exception types
        with self.assertRaises(IndexError):
            self.empty[0]
        with self.assertRaises(KeyError):
            self.empty['0']

    def test_iter(self):
        """Iteration yields every sequence once, then stops."""
        iterator = iter(self.s1)
        seen = 0
        for seen, (got, wanted) in enumerate(zip(iterator, self.seqs1), 1):
            self.assertEqual(got, wanted)
        self.assertEqual(seen, len(self.seqs1))

        with self.assertRaises(StopIteration):
            next(iterator)

    def test_len(self):
        """``len`` reports the number of sequences."""
        sizes = ((self.s1, 2), (self.s2, 3), (self.s3, 5), (self.empty, 0))
        for collection, wanted in sizes:
            self.assertEqual(len(collection), wanted)

    def test_ne(self):
        """``!=`` is the negation of the equality cases."""
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        shorter = SequenceCollection([self.d1])
        self.assertTrue(self.s1 != shorter)

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        for other_type in (FakeSequenceCollection, Alignment):
            self.assertTrue(self.s4 != other_type([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        mixed = SequenceCollection([self.d1, self.r1])
        self.assertTrue(self.s1 != mixed)

    def test_repr(self):
        """``repr`` summarises count and length statistics."""
        cases = (
            (self.s1, "<SequenceCollection: n=2; "
                      "mean +/- std length=5.00 +/- 2.00>"),
            (self.s2, "<SequenceCollection: n=3; "
                      "mean +/- std length=7.33 +/- 3.68>"),
            (self.s3, "<SequenceCollection: n=5; "
                      "mean +/- std length=6.40 +/- 3.32>"),
            (self.empty, "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>"),
        )
        for collection, wanted in cases:
            self.assertEqual(repr(collection), wanted)

    def test_reversed(self):
        """``reversed`` yields the sequences back to front, then stops."""
        iterator = reversed(self.s1)
        backwards = list(reversed(self.seqs1))
        seen = 0
        for seen, (got, wanted) in enumerate(zip(iterator, backwards), 1):
            self.assertEqual(got, wanted)
        self.assertEqual(seen, len(self.seqs1))

        with self.assertRaises(StopIteration):
            next(iterator)

    def test_kmer_frequencies(self):
        """kmer_frequencies returns one mapping per sequence."""
        # absolute counts, non-overlapping 3-mers
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=False),
            [Counter({'GAT': 1, 'TAC': 1}), Counter({'TTG': 1})])

        # relative frequencies, 1-mers
        freqs_d1 = defaultdict(
            float, {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.})
        freqs_d2 = defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.})
        self.assertEqual(self.s1.kmer_frequencies(k=1, relative=True),
                         [freqs_d1, freqs_d2])

        # relative frequencies, non-overlapping 3-mers
        freqs_d1 = defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.})
        freqs_d2 = defaultdict(float, {'TTG': 1 / 1.})
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=True),
            [freqs_d1, freqs_d2])

        self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Sequence.kmer_frequencies for more details.
        homopolymers = SequenceCollection([
            RNA('C' * 10, metadata={'id': 's1'}),
            RNA('G' * 10, metadata={'id': 's2'})
        ])
        self.assertEqual(
            homopolymers.kmer_frequencies(1, relative=True),
            [defaultdict(float, {'C': 1.0}),
             defaultdict(float, {'G': 1.0})])

    def test_str(self):
        """``str`` renders the collection as FASTA text."""
        cases = (
            (self.s1, ">d1\nGATTACA\n>d2\nTTG\n"),
            (self.s2, ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"),
            (self.empty, ""),
        )
        for collection, wanted in cases:
            self.assertEqual(str(collection), wanted)

    def test_distances(self):
        """distances applies the given pairwise function to all pairs."""
        pair = SequenceCollection([
            DNA("ACGT", metadata={'id': "d1"}),
            DNA("ACGG", metadata={'id': "d2"})
        ])
        labels = ['d1', 'd2']

        def hamming_on_values(left, right):
            return hamming(left.values, right.values)

        wanted = DistanceMatrix([[0, 0.25], [0.25, 0]], labels)
        self.assertEqual(pair.distances(hamming_on_values), wanted)

        # alt distance function provided
        def dumb_distance(left, right):
            return 42.

        wanted = DistanceMatrix([[0, 42.], [42., 0]], labels)
        self.assertEqual(pair.distances(dumb_distance), wanted)

    def test_distribution_stats(self):
        """distribution_stats: three values matching the repr figures."""
        cases = ((self.s1, 2, 5.0, 2.0),
                 (self.s2, 3, 7.333, 3.682),
                 (self.s3, 5, 6.400, 3.323))
        for collection, count, mean, std in cases:
            stats = collection.distribution_stats()
            self.assertEqual(stats[0], count)
            self.assertAlmostEqual(stats[1], mean, 3)
            self.assertAlmostEqual(stats[2], std, 3)

        # the empty collection is compared exactly, not approximately
        empty_stats = self.empty.distribution_stats()
        self.assertEqual(empty_stats[0], 0)
        self.assertEqual(empty_stats[1], 0.0)
        self.assertEqual(empty_stats[2], 0.0)

    def test_degap(self):
        """degap removes the gap characters from every sequence."""
        wanted = self._renamed_s2(('r1', 'r2', 'r3'))
        wanted = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "r1"}),
            RNA('UUG', metadata={'id': "r2"}),
            RNA('UUGCC', metadata={'id': "r3"})
        ])
        self.assertEqual(self.s2.degap(), wanted)

    def test_get_seq(self):
        """get_seq returns the sequence stored under an id."""
        for seq_id, wanted in (('d1', self.d1), ('d2', self.d2)):
            self.assertEqual(self.s1.get_seq(seq_id), wanted)

    def test_ids(self):
        """ids lists the sequence ids in collection order."""
        cases = ((self.s1, ['d1', 'd2']),
                 (self.s2, ['r1', 'r2', 'r3']),
                 (self.s3, ['d1', 'd2', 'r1', 'r2', 'r3']),
                 (self.empty, []))
        for collection, wanted in cases:
            self.assertEqual(collection.ids(), wanted)

    def test_update_ids_default_behavior(self):
        """update_ids without arguments assigns ids "1", "2", "3"."""
        self._check_update_ids(('1', '2', '3'), {})

    def test_update_ids_prefix(self):
        """update_ids(prefix=...) prepends the prefix to the new ids."""
        self._check_update_ids(('abc1', 'abc2', 'abc3'), {'prefix': 'abc'})

    def test_update_ids_func_parameter(self):
        """update_ids(func=...) derives the new ids from the current ones."""
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        self._check_update_ids(('r1-42', 'r2-42', 'r3-42'),
                               {'func': append_42})

    def test_update_ids_ids_parameter(self):
        """update_ids(ids=...) applies the given ids in order."""
        self._check_update_ids(('abc', 'def', 'ghi'),
                               {'ids': ('abc', 'def', 'ghi')},
                               empty_kwargs={'ids': []})

    def test_update_ids_sequence_attributes_propagated(self):
        """update_ids keeps description and quality of each sequence."""
        def dna(chars, seq_id, description, quality):
            return DNA(chars,
                       metadata={'id': seq_id, 'description': description},
                       positional_metadata={'quality': quality})

        # 1 seq
        wanted_sc = Alignment([dna('ACGT', "abc", 'desc', range(4))])
        wanted_map = {'abc': 'seq1'}
        source = Alignment([dna('ACGT', "seq1", 'desc', range(4))])

        got_sc, got_map = source.update_ids(ids=('abc', ))
        self.assertEqual(got_sc, wanted_sc)
        self.assertEqual(got_map, wanted_map)

        # 2 seqs
        wanted_sc = Alignment([
            dna('ACGT', "abc", 'desc1', range(4)),
            dna('TGCA', "def", 'desc2', range(4)[::-1])
        ])
        wanted_map = {'abc': 'seq1', 'def': 'seq2'}
        source = Alignment([
            dna('ACGT', "seq1", 'desc1', (0, 1, 2, 3)),
            dna('TGCA', "seq2", 'desc2', (3, 2, 1, 0))
        ])

        got_sc, got_map = source.update_ids(ids=('abc', 'def'))
        self.assertEqual(got_sc, wanted_sc)
        self.assertEqual(got_map, wanted_map)

    def test_update_ids_invalid_parameter_combos(self):
        """update_ids rejects mutually exclusive parameter combinations."""
        cases = (
            ('ids and func', {'func': lambda e: e, 'ids': ['foo', 'bar']}),
            ('prefix', {'ids': ['foo', 'bar'], 'prefix': 'abc'}),
            ('prefix', {'func': lambda e: e, 'prefix': 'abc'}),
        )
        for pattern, kwargs in cases:
            with six.assertRaisesRegex(self, SequenceCollectionError,
                                       pattern):
                self.s1.update_ids(**kwargs)

    def test_update_ids_invalid_ids(self):
        """update_ids rejects a wrong number of ids and duplicate ids."""
        cases = (
            # incorrect number of new ids
            (self.s1, '3 != 2', {'ids': ['foo', 'bar', 'baz']}),
            (self.s1, '4 != 2',
             {'func': lambda e: ['foo', 'bar', 'baz', 'abc']}),
            # duplicates
            (self.s2, 'foo', {'ids': ['foo', 'bar', 'foo']}),
            (self.s2, 'bar', {'func': lambda e: ['foo', 'bar', 'bar']}),
        )
        for collection, pattern, kwargs in cases:
            with six.assertRaisesRegex(self, SequenceCollectionError,
                                       pattern):
                collection.update_ids(**kwargs)

    def test_is_empty(self):
        """is_empty is true only for the collection without sequences."""
        for populated in (self.s1, self.s2, self.s3):
            self.assertFalse(populated.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_iteritems(self):
        """iteritems yields (id, sequence) pairs in collection order."""
        wanted = [(seq.metadata['id'], seq) for seq in self.s1]
        self.assertEqual(list(self.s1.iteritems()), wanted)

    def test_sequence_count(self):
        """sequence_count reports the number of sequences."""
        counts = ((self.s1, 2), (self.s2, 3), (self.s3, 5), (self.empty, 0))
        for collection, wanted in counts:
            self.assertEqual(collection.sequence_count(), wanted)

    def test_sequence_lengths(self):
        """sequence_lengths lists each sequence's length in order."""
        cases = ((self.s1, [7, 3]),
                 (self.s2, [7, 3, 12]),
                 (self.s3, [7, 3, 7, 3, 12]),
                 (self.empty, []))
        for collection, wanted in cases:
            self.assertEqual(collection.sequence_lengths(), wanted)
class SequenceCollectionTests(TestCase):
    """Tests of the SequenceCollection class"""

    def setUp(self):
        """Initialize values to be used in tests"""
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")

        # 'X' makes this the fixture used wherever validation must fail
        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        # the same data as (id, sequence string) tuples
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        # built without validate=True, so construction itself succeeds
        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Initialization functions as expected with varied input types"""
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        """initialization with sequences with overlapping ids fails"""
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        """initialization with validation functions as expected"""
        # valid DNA and valid RNA input must both pass validation (the
        # second call used to repeat the seqs1 case and added no coverage)
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs2, validate=True)
        # the sequence in invalid_s1 contains 'X', so validation must fail
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_from_fasta_records(self):
        """Initialization from list of tuples functions as expected"""
        SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
        SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
        SequenceCollection.from_fasta_records(self.seqs3_t,
                                              NucleotideSequence)

    def test_contains(self):
        """in operator functions as expected"""
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        """equality operator functions as expected"""
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertFalse(self.s1 == FakeSequenceCollection([self.d1, self.d2]))
        self.assertFalse(self.s1 == Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        """getitem functions as expected"""
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        # integer keys are positions, string keys are sequence ids
        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        """iter functions as expected"""
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the iterator must be exhausted after yielding every sequence
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_len(self):
        """len functions as expected"""
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        """inequality operator functions as expected"""
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        """repr functions as expected"""
        self.assertEqual(
            repr(self.s1),
            "<SequenceCollection: n=2; "
            "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(
            repr(self.s2),
            "<SequenceCollection: n=3; "
            "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(
            repr(self.s3),
            "<SequenceCollection: n=5; "
            "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(
            repr(self.empty),
            "<SequenceCollection: n=0; "
            "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        """reversed functions as expected"""
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # the reversed iterator must be exhausted as well
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected"""
        # k=1: per-sequence character frequencies of 'GATTACA' and 'TTG'
        expected1 = defaultdict(int)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(int)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        # k=3, non-overlapping: 'GATTACA' -> GAT, TAC; 'TTG' -> TTG
        expected1 = defaultdict(int)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(int)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str functions as expected"""
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp_empty = ""
        self.assertEqual(str(self.empty), exp_empty)

    def test_distances(self):
        """distances functions as expected"""
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        # one mismatch out of four positions
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        """distribution_stats functions as expected"""
        # each result is indexed as (count, mean length, std of length),
        # matching the numbers checked in test_repr
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        """degap functions as expected"""
        # build the expectation by stripping '.' and '-' from the raw strings
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        """get_seq functions as expected"""
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        """ids functions as expected"""
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def test_int_map(self):
        """int_map functions as expected"""
        # first dict: new key -> sequence; second dict: new key -> old id
        expected1 = {"1": self.d1, "2": self.d2}
        expected2 = {"1": "d1", "2": "d2"}
        self.assertEqual(self.s1.int_map(), (expected1, expected2))

        expected1 = {"h-1": self.d1, "h-2": self.d2}
        expected2 = {"h-1": "d1", "h-2": "d2"}
        self.assertEqual(self.s1.int_map(prefix='h-'), (expected1, expected2))

    def test_is_empty(self):
        """is_empty functions as expected"""
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid functions as expected"""
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems functions as expected"""
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.id, s) for s in self.s1])

    def test_lower(self):
        """lower functions as expected"""
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        """sequence_count functions as expected"""
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        """sequence_lengths functions as expected"""
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_to_fasta(self):
        """to_fasta functions as expected"""
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(self.s1.to_fasta(), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(self.s2.to_fasta(), exp2)

    def test_toFasta(self):
        """toFasta gives the same output as to_fasta for s1"""
        # NOTE(review): warnings are silenced here, presumably because
        # toFasta emits a deprecation warning -- confirm against the library
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            exp = ">d1\nGATTACA\n>d2\nTTG\n"
            self.assertEqual(self.s1.toFasta(), exp)

    def test_upper(self):
        """upper functions as expected"""
        self.assertEqual(self.s1_lower.upper(), self.s1)