class AlignmentTests(TestCase):
    """Tests for ``Alignment`` built from ``DNA`` and ``RNA`` sequences.

    ``setUp`` provides the shared fixtures:

    * ``a1`` -- three gapped DNA sequences of length 13 (``d1``-``d3``)
    * ``a2`` -- two RNA sequences of length 5 (``r1``, ``r2``)
    * ``a3``/``a4`` -- the RNA sequences plus a score and start/end positions
    * ``empty`` -- an alignment constructed from no sequences
    * ``no_positions`` -- an alignment of two zero-length RNA sequences
    """

    def setUp(self):
        self.d1 = DNA('..ACC-GTTGG..', metadata={'id': "d1"})
        self.d2 = DNA('TTACCGGT-GGCC', metadata={'id': "d2"})
        self.d3 = DNA('.-ACC-GTTGC--', metadata={'id': "d3"})

        self.r1 = RNA('UUAU-', metadata={'id': "r1"})
        self.r2 = RNA('ACGUU', metadata={'id': "r2"})

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])

        # no sequences
        self.empty = Alignment([])

        # sequences, but no positions
        self.no_positions = Alignment([RNA('', metadata={'id': 'a'}),
                                       RNA('', metadata={'id': 'b'})])

    def test_degap(self):
        # Both '.' and '-' characters are expected to be removed.
        expected = SequenceCollection([
            DNA('ACCGTTGG', metadata={'id': "d1"}),
            DNA('TTACCGGTGGCC', metadata={'id': "d2"}),
            DNA('ACCGTTGC', metadata={'id': "d3"})])
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = SequenceCollection([
            RNA('UUAU', metadata={'id': "r1"}),
            RNA('ACGUU', metadata={'id': "r2"})])
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        # default distance function
        expected = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42., 42.],
                    [42., 0, 42.],
                    [42., 42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_score(self):
        self.assertEqual(self.a3.score(), 42.0)
        self.assertEqual(self.a4.score(), -42.0)

    def test_start_end_positions(self):
        self.assertEqual(self.a3.start_end_positions(), [(0, 3), (5, 9)])
        self.assertEqual(self.a4.start_end_positions(), [(1, 4), (6, 10)])

    def test_subalignment(self):
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNA('.AC', metadata={'id': "d1"})
        d2 = DNA('TAC', metadata={'id': "d2"})
        d3 = DNA('.AC', metadata={'id': "d3"})
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNA('.C-GTTGG..', metadata={'id': "d1"})
        d2 = DNA('TCGGT-GGCC', metadata={'id': "d2"})
        d3 = DNA('-C-GTTGC--', metadata={'id': "d3"})
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNA('.AC', metadata={'id': "d1"})
        d3 = DNA('.AC', metadata={'id': "d3"})
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNA('TCGGT-GGCC', metadata={'id': "d2"})
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_subalignment_filter_out_everything(self):
        exp = Alignment([])

        # no sequences
        obs = self.a1.subalignment(seqs_to_keep=None,
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, exp)

        # no positions
        obs = self.a1.subalignment(positions_to_keep=None,
                                   invert_positions_to_keep=True)
        self.assertEqual(obs, exp)

    def test_init_not_equal_lengths(self):
        # The extra sequence has 12 characters; the others have 13.
        invalid_seqs = [self.d1, self.d2, self.d3,
                        DNA('.-ACC-GTGC--', metadata={'id': "i2"})]
        self.assertRaises(AlignmentError, Alignment, invalid_seqs)

    def test_init_equal_lengths(self):
        # Smoke test: construction is only required not to raise.
        seqs = [self.d1, self.d2, self.d3]
        Alignment(seqs)

    def test_iter_positions(self):
        # default constructor
        actual = list(self.a2.iter_positions())
        expected = [
            [RNA('U', metadata={'id': 'r1'}),
             RNA('A', metadata={'id': 'r2'})],
            [RNA('U', metadata={'id': 'r1'}),
             RNA('C', metadata={'id': 'r2'})],
            [RNA('A', metadata={'id': 'r1'}),
             RNA('G', metadata={'id': 'r2'})],
            [RNA('U', metadata={'id': 'r1'}),
             RNA('U', metadata={'id': 'r2'})],
            [RNA('-', metadata={'id': 'r1'}),
             RNA('U', metadata={'id': 'r2'})]
        ]
        self.assertEqual(actual, expected)

        # explicit constructor
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'),
                    list('UC'),
                    list('AG'),
                    list('UU'),
                    list('-U')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        # empty cases
        self.assertEqual(
            self.empty.majority_consensus(), Sequence(''))
        self.assertEqual(
            self.no_positions.majority_consensus(), RNA(''))

        # alignment where all sequences are the same
        aln = Alignment([DNA('AG', metadata={'id': 'a'}),
                         DNA('AG', metadata={'id': 'b'})])
        self.assertEqual(aln.majority_consensus(), DNA('AG'))

        # no ties
        d1 = DNA('TTT', metadata={'id': "d1"})
        d2 = DNA('TT-', metadata={'id': "d2"})
        d3 = DNA('TC-', metadata={'id': "d3"})
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNA('TT-'))

        # ties: either candidate is an acceptable consensus
        d1 = DNA('T', metadata={'id': "d1"})
        d2 = DNA('A', metadata={'id': "d2"})
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(), [DNA('T'), DNA('A')])

    def test_omit_gap_positions(self):
        # thresholds at which a2 is expected to come back unchanged
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        # thresholds at which the last position is expected to be dropped
        r1 = RNA('UUAU', metadata={'id': "r1"})
        r2 = RNA('ACGU', metadata={'id': "r2"})
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        aln = Alignment([DNA('-.', metadata={'id': str(i)})
                         for i in range(33)])
        self.assertEqual(
            aln.omit_gap_positions(1 - np.finfo(float).eps),
            Alignment([DNA('', metadata={'id': str(i)})
                       for i in range(33)]))

    def test_omit_gap_sequences(self):
        # thresholds at which a2 is expected to come back unchanged
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        # just below r1's gap fraction (1 of 5 positions)
        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        aln = Alignment([DNA('.' * 33, metadata={'id': 'abc'}),
                         DNA('-' * 33, metadata={'id': 'def'})])
        self.assertEqual(aln.omit_gap_sequences(1 - np.finfo(float).eps),
                         Alignment([]))

    def test_position_counters(self):
        self.assertEqual(self.empty.position_counters(), [])

        self.assertEqual(self.no_positions.position_counters(), [])

        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

    def test_position_frequencies(self):
        self.assertEqual(self.empty.position_frequencies(), [])

        self.assertEqual(self.no_positions.position_frequencies(), [])

        expected = [defaultdict(float, {'U': 0.5, 'A': 0.5}),
                    defaultdict(float, {'U': 0.5, 'C': 0.5}),
                    defaultdict(float, {'A': 0.5, 'G': 0.5}),
                    defaultdict(float, {'U': 1.0}),
                    defaultdict(float, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

    def test_position_frequencies_floating_point_precision(self):
        # Test that a position with no variation yields a frequency of exactly
        # 1.0. Note that it is important to use self.assertEqual here instead
        # of self.assertAlmostEqual because we want to test for exactly 1.0. A
        # previous implementation of Alignment.position_frequencies added
        # (1 / sequence_count) for each occurrence of a character in a position
        # to compute the frequencies (see
        # https://github.com/biocore/scikit-bio/issues/801). In certain cases,
        # this yielded a frequency slightly less than 1.0 due to roundoff
        # error. The test case here uses an alignment of 10 sequences with no
        # variation at a position. This test case exposes the roundoff error
        # present in the previous implementation because 1/10 added 10 times
        # yields a number slightly less than 1.0. This occurs because 1/10
        # cannot be represented exactly as a floating point number.
        aln = Alignment([DNA('A', metadata={'id': str(i)})
                         for i in range(10)])
        self.assertEqual(aln.position_frequencies(),
                         [defaultdict(float, {'A': 1.0})])

    def test_position_entropies(self):
        # tested by calculating values as described in this post:
        #  http://stackoverflow.com/a/15476958/3424666
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_kmer_frequencies(self):
        # Float literals keep the expected values correct even when `/` is
        # integer division.
        expected = [defaultdict(float, {'U': 3. / 5, 'A': 1. / 5,
                                        '-': 1. / 5}),
                    defaultdict(float, {'A': 1. / 5, 'C': 1. / 5,
                                        'G': 1. / 5, 'U': 2. / 5})]
        actual = self.a2.kmer_frequencies(k=1, relative=True)
        for a, e in zip(actual, expected):
            # Keys must match exactly; values are compared to 5 decimals.
            self.assertEqual(sorted(a), sorted(e))
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_validate_lengths(self):
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        # a single sequence
        self.assertTrue(Alignment([
            DNA('TTT', metadata={'id': "d1"})])._validate_lengths())
class AlignmentTests(TestCase):
    """Tests for ``Alignment`` built from ``DNASequence``/``RNASequence``.

    ``setUp`` provides the shared fixtures:

    * ``a1`` -- three gapped DNA sequences of length 13 (``d1``-``d3``)
    * ``a2`` -- two RNA sequences of length 5 (``r1``, ``r2``)
    * ``a3``/``a4`` -- the RNA sequences plus a score and start/end positions
    * ``empty`` -- an alignment constructed from no sequences
    * ``seqs1_t``/``seqs2_t`` -- the same data as ``(id, sequence)`` tuples
    """

    def setUp(self):
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])
        self.empty = Alignment([])

    @staticmethod
    def _degap_records(records):
        # Strip both gap characters ('.' and '-') from (id, seq) tuples.
        return [(id_, seq.replace('.', '').replace('-', ''))
                for id_, seq in records]

    def test_degap(self):
        """degap functions as expected
        """
        expected = SequenceCollection.from_fasta_records(
            self._degap_records(self.seqs1_t), DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = SequenceCollection.from_fasta_records(
            self._degap_records(self.seqs2_t), RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        """distances functions as expected
        """
        expected = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42., 42.],
                    [42., 0, 42.],
                    [42., 42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_score(self):
        self.assertEqual(self.a3.score(), 42.0)
        self.assertEqual(self.a4.score(), -42.0)

    def test_start_end_positions(self):
        self.assertEqual(self.a3.start_end_positions(), [(0, 3), (5, 9)])
        self.assertEqual(self.a4.start_end_positions(), [(1, 4), (6, 10)])

    def test_subalignment(self):
        """subalignment functions as expected
        """
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d2 = DNASequence('TAC', id="d2")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', id="d1")
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        d3 = DNASequence('-C-GTTGC--', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_subalignment_filter_out_everything(self):
        exp = Alignment([])

        # no sequences
        obs = self.a1.subalignment(seqs_to_keep=None,
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, exp)

        # no positions
        obs = self.a1.subalignment(positions_to_keep=None,
                                   invert_positions_to_keep=True)
        self.assertEqual(obs, exp)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTXGC--', id="i1")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs1, validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTGC--', id="i2")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs2, validate=True)

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid characters
        d1 = DNASequence('..ACC-GTXGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

    def test_iter_positions(self):
        """iter_positions functions as expected
        """
        # default constructor
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(j) for j in i] for i in
                    ['UA', 'UC', 'AG', 'UU', '-U']]
        self.assertEqual(actual, expected)

        # explicit constructor
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'),
                    list('UC'),
                    list('AG'),
                    list('UU'),
                    list('-U')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        # no ties
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertTrue(a1.majority_consensus().equals(DNASequence('TT-')))

        # ties: either candidate is an acceptable consensus
        d1 = DNASequence('T', id="d1")
        d2 = DNASequence('A', id="d2")
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(),
                      [DNASequence('T'), DNASequence('A')])

        # empty alignment
        self.assertEqual(self.empty.majority_consensus(), '')

    def test_majority_consensus_constructor(self):
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])

        # Passing `constructor` is expected to emit a UserWarning.
        obs = npt.assert_warns(UserWarning, a1.majority_consensus,
                               constructor=str)
        self.assertEqual(obs, 'TT-')

    def test_omit_gap_positions(self):
        """omitting gap positions functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        # thresholds at which the last position is expected to be dropped
        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

    def test_omit_gap_sequences(self):
        """omitting gap sequences functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        # just below r1's gap fraction (1 of 5 positions)
        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

    def test_position_counters(self):
        """position_counters functions as expected
        """
        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

        self.assertEqual(self.empty.position_counters(), [])

    def test_position_frequencies(self):
        """computing position frequencies functions as expected
        """
        expected = [defaultdict(int, {'U': 0.5, 'A': 0.5}),
                    defaultdict(int, {'U': 0.5, 'C': 0.5}),
                    defaultdict(int, {'A': 0.5, 'G': 0.5}),
                    defaultdict(int, {'U': 1.0}),
                    defaultdict(int, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

        self.assertEqual(self.empty.position_frequencies(), [])

    def test_position_entropies(self):
        """computing positional uncertainties functions as expected

        tested by calculating values as described in this post:
         http://stackoverflow.com/a/15476958/3424666
        """
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        # Float literals keep the expected values correct even when `/` is
        # integer division.
        expected = [defaultdict(int, {'U': 3. / 5, 'A': 1. / 5,
                                      '-': 1. / 5}),
                    defaultdict(int, {'A': 1. / 5, 'C': 1. / 5,
                                      'G': 1. / 5, 'U': 2. / 5})]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            # Keys must match exactly; values are compared to 5 decimals.
            self.assertEqual(sorted(a), sorted(e))
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        """sequence_length functions as expected
        """
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        # The call is expected to emit a UserWarning.
        phylip_str, id_map = npt.assert_warns(UserWarning, a.to_phylip,
                                              map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1',
                                  'd3': 'd3',
                                  'd2': 'd2'})
        expected = "\n".join(["3 13",
                              "d1 ..ACC-GTTGG..",
                              "d2 TTACCGGT-GGCC",
                              "d3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        # The call is expected to emit a UserWarning.
        phylip_str, id_map = npt.assert_warns(UserWarning, a.to_phylip,
                                              map_labels=True,
                                              label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1',
                                  's3': 'd3',
                                  's2': 'd2'})
        expected = "\n".join(["3 13",
                              "s1 ..ACC-GTTGG..",
                              "s2 TTACCGGT-GGCC",
                              "s3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_unequal_sequence_lengths(self):
        d1 = DNASequence('A-CT', id="d1")
        d2 = DNASequence('TTA', id="d2")
        d3 = DNASequence('.-AC', id="d3")
        a = Alignment([d1, d2, d3])

        with self.assertRaises(SequenceCollectionError):
            npt.assert_warns(UserWarning, a.to_phylip)

    def test_to_phylip_no_sequences(self):
        with self.assertRaises(SequenceCollectionError):
            npt.assert_warns(UserWarning, Alignment([]).to_phylip)

    def test_to_phylip_no_positions(self):
        d1 = DNASequence('', id="d1")
        d2 = DNASequence('', id="d2")
        a = Alignment([d1, d2])

        with self.assertRaises(SequenceCollectionError):
            npt.assert_warns(UserWarning, a.to_phylip)

    def test_validate_lengths(self):
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        # a single sequence
        self.assertTrue(Alignment([
            DNASequence('TTT', id="d1")])._validate_lengths())
        # sequences of different lengths
        self.assertFalse(Alignment([
            DNASequence('TTT', id="d1"),
            DNASequence('TT', id="d2")])._validate_lengths())
class AlignmentTests(TestCase):
    """Tests for ``Alignment`` built from ``DNASequence``/``RNASequence``.

    ``setUp`` provides the shared fixtures:

    * ``a1`` -- three gapped DNA sequences of length 13 (``d1``-``d3``)
    * ``a2`` -- two RNA sequences of length 5 (``r1``, ``r2``)
    * ``a3``/``a4`` -- the RNA sequences plus a score and start/end positions
    * ``empty`` -- an alignment constructed from no sequences
    * ``seqs1_t``/``seqs2_t`` -- the same data as ``(id, sequence)`` tuples
    """

    def setUp(self):
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])
        self.empty = Alignment([])

    @staticmethod
    def _degap_records(records):
        # Strip both gap characters ('.' and '-') from (id, seq) tuples.
        return [(id_, seq.replace('.', '').replace('-', ''))
                for id_, seq in records]

    def test_degap(self):
        """degap functions as expected
        """
        expected = SequenceCollection.from_fasta_records(
            self._degap_records(self.seqs1_t), DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = SequenceCollection.from_fasta_records(
            self._degap_records(self.seqs2_t), RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        """distances functions as expected
        """
        expected = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42., 42.],
                    [42., 0, 42.],
                    [42., 42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_score(self):
        self.assertEqual(self.a3.score(), 42.0)
        self.assertEqual(self.a4.score(), -42.0)

    def test_start_end_positions(self):
        self.assertEqual(self.a3.start_end_positions(), [(0, 3), (5, 9)])
        self.assertEqual(self.a4.start_end_positions(), [(1, 4), (6, 10)])

    def test_subalignment(self):
        """subalignment functions as expected
        """
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d2 = DNASequence('TAC', id="d2")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', id="d1")
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        d3 = DNASequence('-C-GTTGC--', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_subalignment_filter_out_everything(self):
        exp = Alignment([])

        # no sequences
        obs = self.a1.subalignment(seqs_to_keep=None,
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, exp)

        # no positions
        obs = self.a1.subalignment(positions_to_keep=None,
                                   invert_positions_to_keep=True)
        self.assertEqual(obs, exp)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTXGC--', id="i1")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs1, validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTGC--', id="i2")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs2, validate=True)

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid characters
        d1 = DNASequence('..ACC-GTXGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

    def test_iter_positions(self):
        """iter_positions functions as expected
        """
        # default constructor
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(j) for j in i] for i in
                    ['UA', 'UC', 'AG', 'UU', '-U']]
        self.assertEqual(actual, expected)

        # explicit constructor
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'),
                    list('UC'),
                    list('AG'),
                    list('UU'),
                    list('-U')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        """majority_consensus functions as expected
        """
        # no ties
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        # ties: either candidate is an acceptable consensus
        d1 = DNASequence('T', id="d1")
        d2 = DNASequence('A', id="d2")
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(),
                      [DNASequence('T'), DNASequence('A')])

        # empty alignment
        self.assertEqual(self.empty.majority_consensus(), '')

    def test_omit_gap_positions(self):
        """omitting gap positions functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        # thresholds at which the last position is expected to be dropped
        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

    def test_omit_gap_sequences(self):
        """omitting gap sequences functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        # just below r1's gap fraction (1 of 5 positions)
        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

    def test_position_counters(self):
        """position_counters functions as expected
        """
        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

        self.assertEqual(self.empty.position_counters(), [])

    def test_position_frequencies(self):
        """computing position frequencies functions as expected
        """
        expected = [defaultdict(int, {'U': 0.5, 'A': 0.5}),
                    defaultdict(int, {'U': 0.5, 'C': 0.5}),
                    defaultdict(int, {'A': 0.5, 'G': 0.5}),
                    defaultdict(int, {'U': 1.0}),
                    defaultdict(int, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

        self.assertEqual(self.empty.position_frequencies(), [])

    def test_position_entropies(self):
        """computing positional uncertainties functions as expected

        tested by calculating values as described in this post:
         http://stackoverflow.com/a/15476958/3424666
        """
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        # Float literals keep the expected values correct even when `/` is
        # integer division.
        expected = [defaultdict(int, {'U': 3. / 5, 'A': 1. / 5,
                                      '-': 1. / 5}),
                    defaultdict(int, {'A': 1. / 5, 'C': 1. / 5,
                                      'G': 1. / 5, 'U': 2. / 5})]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            # Keys must match exactly; values are compared to 5 decimals.
            self.assertEqual(sorted(a), sorted(e))
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        """sequence_length functions as expected
        """
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        """to_phylip functions as expected
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1',
                                  'd3': 'd3',
                                  'd2': 'd2'})
        expected = "\n".join(["3 13",
                              "d1 ..ACC-GTTGG..",
                              "d2 TTACCGGT-GGCC",
                              "d3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        """to_phylip functions as expected with label mapping
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1',
                                  's3': 'd3',
                                  's2': 'd2'})
        expected = "\n".join(["3 13",
                              "s1 ..ACC-GTTGG..",
                              "s2 TTACCGGT-GGCC",
                              "s3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_unequal_sequence_lengths(self):
        d1 = DNASequence('A-CT', id="d1")
        d2 = DNASequence('TTA', id="d2")
        d3 = DNASequence('.-AC', id="d3")
        a = Alignment([d1, d2, d3])

        with self.assertRaises(SequenceCollectionError):
            a.to_phylip()

    def test_to_phylip_no_sequences(self):
        with self.assertRaises(SequenceCollectionError):
            Alignment([]).to_phylip()

    def test_to_phylip_no_positions(self):
        d1 = DNASequence('', id="d1")
        d2 = DNASequence('', id="d2")
        a = Alignment([d1, d2])

        with self.assertRaises(SequenceCollectionError):
            a.to_phylip()

    def test_validate_lengths(self):
        """_validate_lengths functions as expected
        """
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        # a single sequence
        self.assertTrue(Alignment([
            DNASequence('TTT', id="d1")])._validate_lengths())
        # sequences of different lengths
        self.assertFalse(Alignment([
            DNASequence('TTT', id="d1"),
            DNASequence('TT', id="d2")])._validate_lengths())
class AlignmentTests(TestCase):
    """Unit tests for ``Alignment``.

    ``setUp`` builds the shared fixtures: a three-sequence DNA alignment
    (``a1``), a two-sequence RNA alignment (``a2``), the same RNA sequences
    with a score and start/end positions attached (``a3``, ``a4``), an
    alignment with no sequences (``empty``) and an alignment whose sequences
    have no positions (``no_positions``).
    """

    def setUp(self):
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        # (id, sequence) tuples mirroring seqs1 / seqs2; test_degap derives
        # its expected collections from these.
        self.seqs1_t = [('d1', '..ACC-GTTGG..'),
                        ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])

        # no sequences
        self.empty = Alignment([])

        # sequences, but no positions
        self.no_positions = Alignment([RNA('', id='a'), RNA('', id='b')])

    def test_degap(self):
        # Expected records are the fixture tuples with both gap characters
        # ('.' and '-') stripped from each sequence string.
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected,
                                                         DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected,
                                                         RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        # default distance function
        expected = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42., 42.],
                    [42., 0, 42.],
                    [42., 42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_score(self):
        self.assertEqual(self.a3.score(), 42.0)
        self.assertEqual(self.a4.score(), -42.0)

    def test_start_end_positions(self):
        self.assertEqual(self.a3.start_end_positions(), [(0, 3), (5, 9)])
        self.assertEqual(self.a4.start_end_positions(), [(1, 4), (6, 10)])

    def test_subalignment(self):
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d2 = DNASequence('TAC', id="d2")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', id="d1")
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        d3 = DNASequence('-C-GTTGC--', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_subalignment_filter_out_everything(self):
        exp = Alignment([])

        # no sequences
        obs = self.a1.subalignment(seqs_to_keep=None,
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, exp)

        # no positions
        obs = self.a1.subalignment(positions_to_keep=None,
                                   invert_positions_to_keep=True)
        self.assertEqual(obs, exp)

    def test_init_not_equal_lengths(self):
        # The extra sequence is 12 characters long; the fixtures are 13.
        invalid_seqs = [self.d1, self.d2, self.d3,
                        DNASequence('.-ACC-GTGC--', id="i2")]
        self.assertRaises(AlignmentError, Alignment, invalid_seqs)

    def test_init_equal_lengths(self):
        # Passes as long as construction does not raise.
        seqs = [self.d1, self.d2, self.d3]
        Alignment(seqs)

    def test_init_validate(self):
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTXGC--', id="i1")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs1, validate=True)

    def test_iter_positions(self):
        columns = ['UA', 'UC', 'AG', 'UU', '-U']

        # default constructor
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(char) for char in column]
                    for column in columns]
        self.assertEqual(actual, expected)

        # constructor=str yields plain one-character strings
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list(column) for column in columns]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertTrue(a1.majority_consensus().equals(DNASequence('TT-')))

        # One 'T' and one 'A' at the only position: the test accepts either
        # character as the consensus.
        d1 = DNASequence('T', id="d1")
        d2 = DNASequence('A', id="d2")
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(),
                      [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')

    def test_majority_consensus_constructor(self):
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])

        # Passing ``constructor`` is expected to emit a DeprecationWarning.
        obs = npt.assert_warns(DeprecationWarning, a1.majority_consensus,
                               constructor=str)
        self.assertEqual(obs, 'TT-')

    def test_omit_gap_positions(self):
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        # Thresholds 0.49 and 0.0 are both expected to drop the last
        # position (it is '-' in r1), leaving the first four positions.
        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        aln = Alignment([DNA('-.', id=str(i)) for i in range(33)])
        self.assertEqual(
            aln.omit_gap_positions(1 - np.finfo(float).eps),
            Alignment([DNA('', id=str(i)) for i in range(33)]))

    def test_omit_gap_sequences(self):
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        # r1 has one gap in five positions, so 0.19 is expected to drop it.
        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        aln = Alignment([DNA('.' * 33, id='abc'), DNA('-' * 33, id='def')])
        self.assertEqual(aln.omit_gap_sequences(1 - np.finfo(float).eps),
                         Alignment([]))

    def test_position_counters(self):
        self.assertEqual(self.empty.position_counters(), [])
        self.assertEqual(self.no_positions.position_counters(), [])

        expected = [
            Counter({'U': 1, 'A': 1}),
            Counter({'U': 1, 'C': 1}),
            Counter({'A': 1, 'G': 1}),
            Counter({'U': 2}),
            Counter({'-': 1, 'U': 1}),
        ]
        self.assertEqual(self.a2.position_counters(), expected)

    def test_position_frequencies(self):
        self.assertEqual(self.empty.position_frequencies(), [])
        self.assertEqual(self.no_positions.position_frequencies(), [])

        expected = [
            defaultdict(float, {'U': 0.5, 'A': 0.5}),
            defaultdict(float, {'U': 0.5, 'C': 0.5}),
            defaultdict(float, {'A': 0.5, 'G': 0.5}),
            defaultdict(float, {'U': 1.0}),
            defaultdict(float, {'-': 0.5, 'U': 0.5}),
        ]
        self.assertEqual(self.a2.position_frequencies(), expected)

    def test_position_frequencies_floating_point_precision(self):
        # Test that a position with no variation yields a frequency of exactly
        # 1.0. Note that it is important to use self.assertEqual here instead
        # of self.assertAlmostEqual because we want to test for exactly 1.0. A
        # previous implementation of Alignment.position_frequencies added
        # (1 / sequence_count) for each occurrence of a character in a position
        # to compute the frequencies (see
        # https://github.com/biocore/scikit-bio/issues/801). In certain cases,
        # this yielded a frequency slightly less than 1.0 due to roundoff
        # error. The test case here uses an alignment of 10 sequences with no
        # variation at a position. This test case exposes the roundoff error
        # present in the previous implementation because 1/10 added 10 times
        # yields a number slightly less than 1.0. This occurs because 1/10
        # cannot be represented exactly as a floating point number.
        aln = Alignment([DNA('A', id=str(i)) for i in range(10)])
        self.assertEqual(aln.position_frequencies(),
                         [defaultdict(float, {'A': 1.0})])

    def test_position_entropies(self):
        # tested by calculating values as described in this post:
        # http://stackoverflow.com/a/15476958/3424666
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(
            self.empty.position_entropies(base=2), [])

    def test_k_word_frequencies(self):
        expected = [
            defaultdict(float, {'U': 3 / 5, 'A': 1 / 5, '-': 1 / 5}),
            defaultdict(float, {'A': 1 / 5, 'C': 1 / 5, 'G': 1 / 5,
                                'U': 2 / 5}),
        ]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            # Keys must match exactly. No precision argument here: the third
            # positional argument of assertEqual is the failure message.
            self.assertEqual(sorted(a), sorted(e))
            # Values are floats, so compare to 5 decimal places.
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        # to_phylip is expected to emit a DeprecationWarning; assert_warns
        # hands back the wrapped call's return value.
        phylip_str, id_map = npt.assert_warns(DeprecationWarning,
                                              a.to_phylip,
                                              map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1', 'd3': 'd3', 'd2': 'd2'})
        expected = "\n".join([
            "3 13",
            "d1 ..ACC-GTTGG..",
            "d2 TTACCGGT-GGCC",
            "d3 .-ACC-GTTGC--"
        ])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = npt.assert_warns(DeprecationWarning,
                                              a.to_phylip,
                                              map_labels=True,
                                              label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1', 's3': 'd3', 's2': 'd2'})
        expected = "\n".join([
            "3 13",
            "s1 ..ACC-GTTGG..",
            "s2 TTACCGGT-GGCC",
            "s3 .-ACC-GTTGC--"
        ])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_no_sequences(self):
        with self.assertRaises(SequenceCollectionError):
            npt.assert_warns(DeprecationWarning, Alignment([]).to_phylip)

    def test_to_phylip_no_positions(self):
        d1 = DNASequence('', id="d1")
        d2 = DNASequence('', id="d2")
        a = Alignment([d1, d2])

        with self.assertRaises(SequenceCollectionError):
            npt.assert_warns(DeprecationWarning, a.to_phylip)

    def test_validate_lengths(self):
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        # single-sequence alignment
        self.assertTrue(
            Alignment([DNASequence('TTT', id="d1")])._validate_lengths())