예제 #1
0
 def test_avoid_empty_alleles_previous_non_match_merged(self):
     """Edge case of a collapsed match interval, part 2.

     Everything up to and including the gap columns is expected to be
     reported as a single non-match interval ([0, 6]), which leaves the
     trailing 'TTT' ([7, 9]) as the only match interval.
     """
     alignment = make_alignment(["CCTTAGGTTT", "AATTA--TTT"])
     partitioner = IntervalPartitioner(
         "**TTA**TTT", min_match_length=3, alignment=alignment
     )
     found_match, found_non_match, _ = partitioner.get_intervals()

     expected_match = make_typed_intervals([[7, 9]], Match)
     self.assertEqual(found_match, expected_match)
     expected_non_match = make_typed_intervals([[0, 6]], NonMatch)
     self.assertEqual(found_non_match, expected_non_match)
예제 #2
0
 def test_GivenThreeSequencesAboveKmerSize_KMeansClusteringCalled(self, mockfit):
     """Clustering three 4-base sequences with kmer size 2 must call `mockfit` once.

     `mockfit` is presumably injected by a mock-patching decorator that sits
     outside this view.
     """
     alignment = make_alignment(["AAAT", "TTTT", "ATAT"])
     try:
         # The return value is irrelevant: only the call on the mock matters.
         kmeans_cluster_seqs_in_interval([0, 3], alignment, 2)
     except ValueError:
         # NOTE(review): presumably the mocked fit leaves the clustering code
         # without usable results, which surfaces as a ValueError -- confirm.
         pass
     mockfit.assert_called_once()
예제 #3
0
 def test_GivenRepeatedUngappedSequencesBelowKmerSize_EndUpInSameCluster(self):
     """Rows spelling the same sequence once gaps are dropped must share a cluster.

     'A-A-T' and 'AA--T' both read 'AAT' without gaps, which is shorter than
     the kmer size of 4; the expected clustering groups 's0' with 's2' and
     leaves 's1' on its own.
     """
     gapped_seqs = ["A-A-T", "CCCCC", "AA--T"]
     msa = make_alignment(gapped_seqs)
     expected_clusters = [["s0", "s2"], ["s1"]]
     clusters = kmeans_cluster_seqs_in_interval([0, 4], msa, 4)
     self.assertEqual(clusters, expected_clusters)
예제 #4
0
 def test_N_special_treatment(self):
     """Check how 'N' is handled when the consensus is built.

     i) Position 2: 'N' and 'A' differ, yet the expected consensus is 'A'.
     ii) Position 0: 'N' and 'N' are the same, yet the position is expected
     to be non-consensus ('*').
     """
     msa = make_alignment(["NTN", "NTA"])
     consensus = PrgBuilder.get_consensus(msa)
     self.assertEqual(consensus, "*TA")
예제 #5
0
 def test_GivenManyVeryDifferentSequences_EachSeqInOwnCluster(self):
     """Clustering every 4-mer over `standard_bases` must give MAX_CLUSTERS clusters."""
     # Every 4-mer over `standard_bases` (256 distinct DNA 4-mers per the original note).
     # Clustering is expected to keep looking for clusters, then stop at MAX_CLUSTERS.
     every_4mer = ["".join(bases) for bases in product(standard_bases, repeat=4)]
     msa = make_alignment(every_4mer)
     clustering = kmeans_cluster_seqs_in_interval([0, 4], msa, 4)
     num_clusters = len(clustering.clustered_ids)
     self.assertEqual(num_clusters, MAX_CLUSTERS)
예제 #6
0
 def test_SequencesUnevenLengthIfGapsRemoved_ClusteringRuns(self):
     """Clustering must run on sequences whose ungapped lengths differ.

     Without gaps the sequences hold 2, 5 and 4 bases. If the check for the
     'one-ref' property in clusters was on ungapped sequences, the hamming
     distance computation would fail because of these different lengths.
     """
     gapped_seqs = ["A---T", "AAAAT", "AAA-T"]
     msa = make_alignment(gapped_seqs)
     expected_clusters = [["s0"], ["s1", "s2"]]
     clusters = kmeans_cluster_seqs_in_interval([0, 4], msa, 1)
     self.assertEqual(clusters, expected_clusters)
예제 #7
0
 def test_SequencesUnevenLengthIfGapsRemoved_ClusteringRuns(self):
     """Clustering must run on sequences whose ungapped lengths differ.

     Without gaps the sequences hold 2, 5 and 4 bases, and a hamming
     distance computation would fail on such different lengths.
     NOTE(review): the 'one-ref' check on clusters therefore presumably has
     to work on the gapped (equal length) sequences -- confirm.
     """
     gapped_seqs = ["A---T", "AAAAT", "AAA-T"]
     msa = make_alignment(gapped_seqs)
     expected_clusters = [["s0"], ["s1", "s2"]]
     clustering = kmeans_cluster_seqs_in_interval([0, 4], msa, 1)
     self.assertEqual(clustering.clustered_ids, expected_clusters)
예제 #8
0
 def test_nonmatch_interval_switching_indels(self):
     """A non-match interval must switch to a match when only gap placement differs.

     Both rows spell 'AA' once gaps are removed: the sequences are the same,
     despite the different alignment.
     """
     msa = make_alignment(["A---A", "A-A--"])
     matches, non_matches = make_intervals([], [[0, 5]])
     # Return value unused: the assertions below inspect the two lists passed in.
     IntervalPartitioner.enforce_multisequence_nonmatch_intervals(
         matches, non_matches, msa
     )
     expected_matches = make_typed_intervals([[0, 5]], Match)
     self.assertEqual(matches, expected_matches)
     self.assertEqual(non_matches, [])
예제 #9
0
 def test_nonmatch_interval_switching_Ns(self):
     """A non-match interval must switch to a match because 'N's make sequences get removed."""
     msa = make_alignment(["ANAAA", "ATAAT"])
     matches, non_matches = make_intervals([], [[0, 5]])
     # Return value unused: the assertions below inspect the two lists passed in.
     IntervalPartitioner.enforce_multisequence_nonmatch_intervals(
         matches, non_matches, msa
     )
     expected_matches = make_typed_intervals([[0, 5]], Match)
     self.assertEqual(matches, expected_matches)
     self.assertEqual(non_matches, [])
예제 #10
0
 def test_GivenAllSequencesOneSnpApart_ReturnsNoClustering(self):
     """Sequences differing at a single position must not be clustered, for kmer sizes 1 to 6."""
     snp_seqs = ["CATATAAAATA", "CATATAACATA", "CATATAAGATA", "CATATAATATA"]
     last_pos = len(snp_seqs[0]) - 1
     msa = make_alignment(snp_seqs)
     for k in range(1, 7):
         clustering = kmeans_cluster_seqs_in_interval([0, last_pos], msa, k)
         self.assertTrue(clustering.no_clustering)
         self.assertEqual(clustering.sequences, snp_seqs)
예제 #11
0
 def test_GivenTwoSequenceGroups_ReturnsTwoClusters(self):
     """Two dissimilar pairs of sequences must form two clusters, for kmer sizes 1 to 6."""
     grouped_seqs = ["CATATAAAATA", "CATATAATATA", "GGGGCGGGCCC", "GGGGCGGGCGC"]
     expected_clusters = [["s0", "s1"], ["s2", "s3"]]
     last_pos = len(grouped_seqs[0]) - 1
     msa = make_alignment(grouped_seqs)
     for k in range(1, 7):
         clustering = kmeans_cluster_seqs_in_interval([0, last_pos], msa, k)
         self.assertEqual(expected_clusters, clustering.clustered_ids)
예제 #12
0
 def test_avoid_empty_alleles_short_match(self):
     """
     Padding of the gap-only non-match interval is expected here as well,
     but the leading match interval then becomes too short and collapses
     into a non-match interval.
     """
     alignment = make_alignment(["TTAGGTTT", "TTA--TTT"])
     partitioner = IntervalPartitioner(
         "TTA**TTT", min_match_length=3, alignment=alignment
     )
     found_match, found_non_match, _ = partitioner.get_intervals()

     expected_match = make_typed_intervals([[5, 7]], Match)
     self.assertEqual(found_match, expected_match)
     expected_non_match = make_typed_intervals([[0, 4]], NonMatch)
     self.assertEqual(found_non_match, expected_non_match)
예제 #13
0
 def test_avoid_empty_alleles_long_match(self):
     """
     Were the non-match interval limited to the gap columns [4, 5], the prg
     would contain an empty allele. The interval is therefore expected to be
     padded with the preceding match sequence, giving [3, 5].
     """
     alignment = make_alignment(["TTAAGGTTT", "TTAA--TTT"])
     partitioner = IntervalPartitioner(
         "TTAA**TTT", min_match_length=3, alignment=alignment
     )
     found_match, found_non_match, _ = partitioner.get_intervals()

     expected_match = make_typed_intervals([[0, 2], [6, 8]], Match)
     self.assertEqual(found_match, expected_match)
     expected_non_match = make_typed_intervals([[3, 5]], NonMatch)
     self.assertEqual(found_non_match, expected_non_match)
예제 #14
0
 def test_GivenAllSequencesOneSnpApart_ReturnsNoClustering(self):
     """Sequences differing at a single position must each get their own cluster.

     Checked for kmer sizes 1 to 6.
     """
     snp_seqs = ["CATATAAAATA", "CATATAACATA", "CATATAAGATA", "CATATAATATA"]
     last_pos = len(snp_seqs[0]) - 1
     msa = make_alignment(snp_seqs)
     # One single-id cluster per record, in alignment order.
     singletons = [[record.id] for record in msa]
     for k in range(1, 7):
         clusters = kmeans_cluster_seqs_in_interval([0, last_pos], msa, k)
         self.assertEqual(singletons, clusters)
예제 #15
0
 def test_ambiguous_alignment_skip_clustering(self):
     """
     `ambiguous_seq` below and the row "A--TTTTA" from `setUp` are two
     different gapped alignments of the same ungapped sequence, "ATTTTA".
     With such ambiguous alignments (more than one gapped alignment for one
     ungapped sequence), clustering is expected to be skipped, as it can
     create ambiguous graphs (whereby different paths spell the same sequence).
     """
     ambiguous_seq = "ATTTT--A"
     all_seqs = self.aligned_seqs + [ambiguous_seq]
     self.tested_params["alignment"] = make_alignment(all_seqs)
     should_skip = PrgBuilder.skip_clustering(**self.tested_params)
     self.assertTrue(should_skip)
예제 #16
0
 def setUp(self):
     """
     Prepare a set of parameters for which clustering is to be performed.
     Each test then modifies one of them in turn.
     """
     self.aligned_seqs = ["ATTTTTTA", "A--TTTTA", "ATTTCTTA"]
     self.tested_params = dict(
         interval=Interval(IntervalType.Match, 0, 7),
         max_nesting=2,
         nesting_level=1,
         min_match_length=2,
         alignment=make_alignment(self.aligned_seqs),
     )
예제 #17
0
    def test_GivenAllSequencesBelowKmerSize_NoClustering(self, mockKMeans):
        """Sequences all shorter than the kmer size (6) once ungapped must not be clustered.

        `mockKMeans` is presumably injected by a mock-patching decorator that
        sits outside this view; it must never be called.
        """
        gapped_seqs = ["AA---AT", "AA---TT", "CA--CAT", "A-A--AT"]
        msa = make_alignment(gapped_seqs)
        msa_length = len(msa[0])

        clustering = kmeans_cluster_seqs_in_interval([0, msa_length], msa, 6)

        mockKMeans.assert_not_called()
        self.assertTrue(clustering.no_clustering)
        # The first and last rows both spell 'AAAT', which is expected only once.
        self.assertEqual(clustering.sequences, ["AAAT", "AATT", "CACAT"])
예제 #18
0
    def test_first_id_in_first_cluster(self):
        """The cluster holding the alignment's first record must be listed first.

        Checked on the alignment as given, then on the same alignment with its
        rows reversed.
        """
        sequences = [
            "AATTAATTATATAATAAC",
            "AATTAAGTATATAATAAC",
            "TTAATTAATTAATTAATT",
        ]
        msa = make_alignment(sequences, ["s1", "s2", "s3"])
        msa_length = len(msa[0])

        forward = kmeans_cluster_seqs_in_interval([0, msa_length], msa, 5)
        self.assertEqual(forward.clustered_ids, [["s1", "s2"], ["s3"]])

        backward = kmeans_cluster_seqs_in_interval([0, len(msa[0])], msa[::-1], 5)
        self.assertEqual(backward.clustered_ids, [["s3"], ["s2", "s1"]])
예제 #19
0
 def test_GivenThreeSequenceGroups_ReturnsThreeClusters(self):
     """Three dissimilar pairs of sequences must each come out as one cluster.

     Checked for kmer sizes 1 to 6. Only membership of each expected cluster
     in the result is asserted, not the order of the clusters.
     """
     sequences = [
         "CCCCCCAACCT",
         "CCCCCCAATCT",
         "GGGGCGGGCCC",
         "GGGGCGGGCGC",
         "TTTAATTTTAA",
         "TTTAAGTTTAA",
     ]
     expected_clustering = [["s0", "s1"], ["s2", "s3"], ["s4", "s5"]]
     seq_size = len(sequences[0])
     alignment = make_alignment(sequences)
     for kmer_size in range(1, 7):
         result = kmeans_cluster_seqs_in_interval(
             [0, seq_size - 1], alignment, kmer_size
         )
         for cluster in expected_clustering:
             # assertIn reports both the missing cluster and the actual
             # result on failure, unlike assertTrue(cluster in result).
             self.assertIn(cluster, result)
예제 #20
0
 def test_first_sequence_placed_in_first_cluster(self):
     """
     Run kmeans clustering on randomly generated multiple sequence alignments
     and check that the first sequence ('s0') is the first id of the first
     cluster.
     """
     seq_len = 20
     num_seqs = 5
     bases = list(standard_bases)
     # The clustering function behaves differently for kmer sizes below and
     # above seq_len, so both are exercised.
     for used_len in [seq_len - 5, seq_len + 5]:
         # Label the subtest with the kmer size actually in use, so that a
         # failure report identifies which of the two sizes failed.
         with self.subTest(kmer_size=used_len):
             for _ in range(5):  # Run on a number of random alignments
                 sequences = [
                     "".join(random.choices(bases, k=seq_len))
                     for _ in range(num_seqs)
                 ]
                 alignment = make_alignment(sequences)
                 result = kmeans_cluster_seqs_in_interval(
                     [0, seq_len - 1], alignment, used_len
                 )
                 # assertEqual shows the offending id on failure
                 self.assertEqual(result[0][0], "s0")
예제 #21
0
 def test_GivenAllSequencesSmallEditDist_ReturnsNoClustering(self):
     """Cf graph 157.pdf in issue #15.

     Every record is expected to end up alone in its own cluster, for kmer
     sizes 4 to 7.
     """
     close_seqs = [
         "gctccgccggtcccgccggtcc",
         "gctccgccgggcccgccggtcc",
         "tctccgccggtcccgccggtcc",
         "gctcagccggtcccgccggtcc",
         "gctccgccggtcccaccggtcc",
         "gctccgccggtaccgccggtcc",
         "gctccgctggtcccgccggtcc",
         "gctccgccggtcccgctggtcc",
         "gctccgccggtcccgccggtct",
         "gctccgccggtcccgcctgtcc",
         "gctccgccggtcctgccggtcc",
     ]
     last_pos = len(close_seqs[0]) - 1
     msa = make_alignment(close_seqs)
     # One single-id cluster per record, in alignment order.
     singletons = [[record.id] for record in msa]
     for k in range(4, 8):
         clusters = kmeans_cluster_seqs_in_interval([0, last_pos], msa, k)
         self.assertEqual(singletons, clusters)
예제 #22
0
    def test_GivenSequencesWithSameKmerCounts_ClusteringInterrupted(self):
        """
        Sequences below are not 'one-ref-like', yet kmer counts are identical.
        This is because the sequences contain repeats and gaps, making them
        not identical from the point of view of edit distance.
        Number of clusters will try to be increased, but kmeans will only find one,
        as there is a single data point in kmer space.
        This test checks the code deals with this by aborting further clustering.
        """
        sequences = [
            "TTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAATTTTTTTAAAAAAA-------",
            "-------TTTTTTTAAAAAAATTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAA",
            "TTTTTTTAAAAAAATTTTTTTAAAAAAATTTTTTT-------GGGGGGG-------AAAAAAA",
        ]
        ungapped_sequences = list(map(ungap, sequences))
        distinct_kmers = count_distinct_kmers(ungapped_sequences, kmer_size=7)
        count_matrix = count_kmer_occurrences(ungapped_sequences, distinct_kmers)
        distinct_count_patterns = set(map(str, count_matrix))
        # Premises of the test. TestCase assertions are used rather than bare
        # `assert` statements, which are stripped when Python runs with -O and
        # would then let the test pass without its premises being verified.
        self.assertEqual(len(distinct_count_patterns), 1)
        self.assertFalse(sequences_are_one_reference_like(sequences))

        alignment = make_alignment(sequences)
        result = kmeans_cluster_seqs_in_interval([0, len(sequences[0])], alignment, 7)
        self.assertTrue(result.no_clustering)
예제 #23
0
 def test_sub_alignment_with_empty_sequence(self):
     """The second row holds only gaps over the tested interval, so an empty sequence must be reported."""
     alignment = make_alignment(["TTAGGTTT", "TTA--TTT", "GGA-TTTT"])
     found_empty = has_empty_sequence(alignment, [3, 4])
     self.assertTrue(found_empty)
예제 #24
0
 def setUpClass(cls):
     """Build the four-record alignment (ids 's1' to 's4') stored as `cls.alignment`."""
     sequences = ["AAAT", "C--C", "AATT", "GNGG"]
     record_ids = ["s1", "s2", "s3", "s4"]
     cls.alignment = make_alignment(sequences, record_ids)
예제 #25
0
 def test_get_subalignment_with_interval(self):
     """Selecting ids 's2' and 's3' over interval [0, 2] must give the matching sub-alignment."""
     sub_msa = PrgBuilder.get_sub_alignment_by_list_id(
         ["s2", "s3"], self.alignment, [0, 2]
     )
     expected_msa = make_alignment(["C--", "AAT"], ["s2", "s3"])
     same = msas_equal(expected_msa, sub_msa)
     self.assertTrue(same)
예제 #26
0
 def test_all_gap_nonmatch(self):
     """Columns made of gaps only are expected to be non-match ('*') in the consensus."""
     msa = make_alignment(["A--A", "A--A"])
     consensus = PrgBuilder.get_consensus(msa)
     self.assertEqual(consensus, "A**A")
예제 #27
0
 def test_count_alignment_seqs(self):
     """The sequences obtained from the alignment of `self.input_seqs` must number three."""
     alignment = make_alignment(self.input_seqs)
     num_seqs = count(get_alignment_seqs(alignment))
     self.assertEqual(num_seqs, 3)
예제 #28
0
 def test_IUPACAmbiguous_nonmatch(self):
     """Columns with ambiguity codes are expected to be non-match, even when identical ('R'/'R')."""
     msa = make_alignment(["RYA", "RTA"])
     consensus = PrgBuilder.get_consensus(msa)
     self.assertEqual(consensus, "**A")
예제 #29
0
 def test_mixed_match_nonmatch(self):
     """Differing columns are expected as '*' and identical columns as their base."""
     msa = make_alignment(["AAGTA", "CATTA"])
     consensus = PrgBuilder.get_consensus(msa)
     self.assertEqual(consensus, "*A*TA")
예제 #30
0
 def test_first_sequence_in_is_first_sequence_out(self):
     """Expanded sequences are expected in alignment order, with gaps removed."""
     msa = make_alignment(["TTTT", "AAAA", "CC-C"])
     expected_seqs = ["TTTT", "AAAA", "CCC"]
     expanded = get_expanded_sequences(msa)
     self.assertEqual(expected_seqs, expanded)