def test_consensus_smaller_than_min_match_len(self):
    """
    A consensus shorter than min_match_length is handled as a single interval.

    Usually, a match smaller than min_match_length counts as non-match,
    but if the whole string is smaller than min_match_length, it counts as
    a match. Once the short string contains a "*", the whole string is
    reported as one non-match interval instead.
    """
    # Whole string (5 characters) is shorter than min_match_length (7)
    # and has no "*": expect one match interval spanning everything.
    fully_matching = IntervalPartitioner(
        "TTATT", min_match_length=7, alignment=MSA([])
    )
    match, non_match, _ = fully_matching.get_intervals()
    self.assertEqual(match, make_typed_intervals([[0, 4]], Match))
    self.assertEqual(non_match, [])

    # Same length, but with a "*" at position 1: expect one non-match
    # interval spanning everything.
    with_star = IntervalPartitioner(
        "T*ATT", min_match_length=7, alignment=MSA([])
    )
    match, non_match, _ = with_star.get_intervals()
    self.assertEqual(match, [])
    self.assertEqual(non_match, make_typed_intervals([[0, 4]], NonMatch))
def test_GivenUnorderedIds_SubalignmentStillInSequenceOrder(self):
    """
    Sequences given rearranged are still output in input order.

    The ids are requested as "s3" then "s1"; the expected sub-alignment
    lists record 0 before record 2 of the shared alignment.
    """
    requested_ids = ["s3", "s1"]
    result = PrgBuilder.get_sub_alignment_by_list_id(requested_ids, self.alignment)

    first_record = self.alignment[0]
    third_record = self.alignment[2]
    expected = MSA([first_record, third_record])

    self.assertTrue(msas_equal(expected, result))
def test_ambiguous_sequences_in_short_interval_separate_clusters(self):
    """
    Two 4-character records containing "R" and "W" respectively, clustered
    with a third argument of 5, end up in separate single-id clusters.
    """
    records = [
        SeqRecord(Seq("ARAT"), id="s1"),
        SeqRecord(Seq("WAAT"), id="s2"),
    ]
    alignment = MSA(records)

    result = kmeans_cluster_seqs_in_interval([0, 3], alignment, 5)

    self.assertEqual([["s1"], ["s2"]], result)
def test_get_subalignment_with_interval(self):
    """
    Requesting ids "s2" and "s3" together with the interval [0, 2] yields
    records holding only the first three columns of each sequence.
    """
    requested_ids = ["s2", "s3"]
    interval = [0, 2]
    result = PrgBuilder.get_sub_alignment_by_list_id(
        requested_ids, self.alignment, interval
    )

    expected_records = [
        SeqRecord(Seq("C--"), id="s2"),
        SeqRecord(Seq("AAT"), id="s3"),
    ]
    expected = MSA(expected_records)

    self.assertTrue(msas_equal(expected, result))
def test_TwoIdenticalSequencesClusteredTogether(self):
    """
    Records "s1" and "s2" carry the same sequence and must share a cluster;
    the differing record "s3" gets a cluster of its own.
    """
    records = [
        SeqRecord(Seq("AAAT"), id="s1"),
        SeqRecord(Seq("AAAT"), id="s2"),
        SeqRecord(Seq("C-CC"), id="s3"),
    ]
    alignment = MSA(records)

    result = kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)

    self.assertEqual([["s1", "s2"], ["s3"]], result)
@classmethod
def setUpClass(cls):
    """
    Build the four-record alignment shared by the tests of this class.

    unittest invokes ``setUpClass()`` on the class itself with no explicit
    argument, so the hook has to be a classmethod; as a plain function the
    call fails with a TypeError for the missing ``cls`` argument.
    """
    cls.alignment = MSA(
        [
            SeqRecord(Seq("AAAT"), id="s1"),
            SeqRecord(Seq("C--C"), id="s2"),
            SeqRecord(Seq("AATT"), id="s3"),
            SeqRecord(Seq("GNGG"), id="s4"),
        ]
    )
def test_end_in_non_match(self):
    """
    A consensus that both starts and ends with "*" characters yields
    non-match intervals at both ends, around two 3-character matches.
    """
    consensus = "**ATT**AAA*C"
    partitioner = IntervalPartitioner(
        consensus, min_match_length=3, alignment=MSA([])
    )

    match, non_match, _ = partitioner.get_intervals()

    self.assertEqual(match, make_typed_intervals([[2, 4], [7, 9]], Match))
    # The trailing "*C" (positions 10-11) is reported as non-match as a whole.
    self.assertEqual(
        non_match,
        make_typed_intervals([[0, 1], [5, 6], [10, 11]], NonMatch),
    )
def test_match_non_match_match(self):
    """
    A consensus of match / "*" run / match is split into two match
    intervals and one non-match interval, and the third value returned by
    get_intervals lists all of them in positional order.
    """
    partitioner = IntervalPartitioner(
        "ATT**AAAC", min_match_length=3, alignment=MSA([])
    )
    match, non_match, all_intervals = partitioner.get_intervals()

    expected_matches = make_typed_intervals([[0, 2], [5, 8]], Match)
    expected_non_matches = make_typed_intervals([[3, 4]], NonMatch)
    self.assertEqual(match, expected_matches)
    self.assertEqual(non_match, expected_non_matches)

    # Check interval sorting works
    first_match, second_match = expected_matches[0], expected_matches[1]
    expected_in_order = [first_match, expected_non_matches[0], second_match]
    self.assertEqual(all_intervals, expected_in_order)
def test_one_long_one_short_sequence_separate_and_ordered_clusters(self):
    """
    A long record and a mostly-gapped record land in separate clusters,
    and the clusters follow the order of the records in the alignment.
    """
    alignment = MSA(
        [
            SeqRecord(Seq("AATTAATTATATAATAAC"), id="s1"),
            SeqRecord(Seq("A--------------AAT"), id="s2"),
        ]
    )
    # NOTE(review): the interval end used here is the sequence length,
    # whereas other tests pass the last index (e.g. [0, 3] for a
    # 4-character record) — confirm which convention is intended.
    seq_length = len(alignment[0])

    forward_clusters = kmeans_cluster_seqs_in_interval(
        [0, seq_length], alignment, 5
    )
    self.assertEqual(forward_clusters, [["s1"], ["s2"]])

    # Reversing the record order must reverse the cluster order too.
    reversed_alignment = alignment[::-1]
    reversed_clusters = kmeans_cluster_seqs_in_interval(
        [0, seq_length], reversed_alignment, 5
    )
    self.assertEqual(reversed_clusters, [["s2"], ["s1"]])
def test_GivenAllSequencesBelowKmerSize_NoKMeansAndIdenticalSequencesClustered(
    self, mockKMeans
):
    """
    With every record holding fewer than 6 non-gap characters and a third
    argument of 6, the KMeans mock must never be called, and records "s1"
    and "s4" (both "AAAT" once gaps are dropped) share a cluster.

    NOTE(review): mockKMeans is presumably injected by a mock.patch
    decorator that is not visible in this view — confirm.
    """
    records = [
        SeqRecord(Seq("AA---AT"), id="s1"),
        SeqRecord(Seq("AA---TT"), id="s2"),
        SeqRecord(Seq("CA--CAT"), id="s3"),
        SeqRecord(Seq("A-A--AT"), id="s4"),
    ]
    alignment = MSA(records)
    interval = [0, len(alignment[0])]

    result = kmeans_cluster_seqs_in_interval(interval, alignment, 6)

    mockKMeans.assert_not_called()
    self.assertEqual([["s1", "s4"], ["s2"], ["s3"]], result)
def test_GivenOrderedIds_SubalignmentInSequenceOrder(self):
    """
    Requesting ids "s1" and "s3" in alignment order yields a sub-alignment
    made of records 0 and 2 of the shared alignment, in that order.
    """
    requested_ids = ["s1", "s3"]
    result = PrgBuilder.get_sub_alignment_by_list_id(requested_ids, self.alignment)

    first_record = self.alignment[0]
    third_record = self.alignment[2]
    expected = MSA([first_record, third_record])

    self.assertTrue(msas_equal(expected, result))
def test_short_match_counted_as_non_match(self):
    """
    A 2-character match ("AT") below min_match_length=3 is folded into the
    following "*" run, giving a single non-match interval over the string.
    """
    partitioner = IntervalPartitioner(
        "AT***", min_match_length=3, alignment=MSA([])
    )

    match, non_match, _ = partitioner.get_intervals()

    self.assertEqual(match, [])
    self.assertEqual(non_match, make_typed_intervals([[0, 4]], NonMatch))
def test_all_match(self):
    """
    A consensus without any "*" gives one match interval covering the
    whole string and no non-match intervals.
    """
    partitioner = IntervalPartitioner(
        "ATATAAA", min_match_length=3, alignment=MSA([])
    )

    match, non_match, _ = partitioner.get_intervals()

    self.assertEqual(match, make_typed_intervals([[0, 6]], Match))
    self.assertEqual(non_match, [])
def test_one_seq_returns_single_id(self):
    """
    Clustering an alignment holding a single record gives a result whose
    ``no_clustering`` attribute is truthy.
    """
    lone_record = SeqRecord(Seq("AAAT"), id="s1")
    alignment = MSA([lone_record])

    result = kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)

    # NOTE(review): sibling tests compare the result of this function to
    # a list of id lists, while this one reads a `no_clustering`
    # attribute — confirm the return type matches in both cases.
    self.assertTrue(result.no_clustering)