Exemplo n.º 1
0
    def test_subset_seqs_Alignment(self):
        """Compare Alignment.take_seqs with a directly built sub-alignment.

        A second check asserts that requesting the same names in a different
        order produces a different string representation.
        """
        seq_a = RnaSequence("UCG", name="rna1")
        seq_b = RnaSequence("YCG", name="rna2")
        seq_c = RnaSequence("CAR", name="rna3")

        expected = Alignment([seq_b, seq_c], moltype=RNA)
        full = Alignment([seq_a, seq_b, seq_c], moltype=RNA)
        observed = full.take_seqs(["rna2", "rna3"])

        # both the object comparison and the printed form must agree
        self.assertEqual(observed, expected)
        self.assertEqual(str(observed), str(expected))

        # the fixture alignment is queried twice with the names swapped
        swapped, unswapped = (
            self.aln.take_seqs(names)
            for names in (["rna3", "rna2"], ["rna2", "rna3"])
        )
        self.assertNotEqual(str(swapped), str(unswapped))
Exemplo n.º 2
0
 def setUp(self):
     """Create the sequences, FASTA strings and alignment used by the Fasta tests."""
     self.strings = ["AAAA", "CCCC", "gggg", "uuuu"]
     self.labels = ["1st", "2nd", "3rd", "4th"]
     self.infos = ["Dog", "Cat", "Mouse", "Rat"]

     # Two independent lists of Sequence objects built from the same strings:
     # one list is tagged through .label, the other through .name.
     self.sequences_with_labels = [Sequence(raw) for raw in self.strings]
     self.sequences_with_names = [Sequence(raw) for raw in self.strings]
     for idx, tag in enumerate(self.labels):
         self.sequences_with_labels[idx].label = tag
         self.sequences_with_names[idx].name = tag

     # expected FASTA output for the different labelling / line-width cases
     self.fasta_no_label = ">0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu\n"
     self.fasta_with_label = ">1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU\n"
     self.fasta_with_label_lw2 = ">1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU\n"
     self.fasta_with_label_species = ">1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU\n"

     self.alignment_dict = {"1st": "AAAA", "2nd": "CCCC", "3rd": "GGGG", "4th": "UUUU"}
     self.alignment_object = Alignment(self.alignment_dict)
     # attach a species Info record to every sequence of the alignment
     for tag, species in zip(self.labels, self.infos):
         self.alignment_object.named_seqs[tag].info = Info(species=species)
     self.alignment_object.RowOrder = ["1st", "2nd", "3rd", "4th"]
Exemplo n.º 3
0
def alignment_traceback(seqs, aligned_positions, word_length):
    """Build an Alignment from traced-back aligned positions.

    seqs: iterable of (name, seq) pairs, consumed in step with the
        starts, ends and maps returned by map_traceback
    aligned_positions: passed unchanged to map_traceback
    word_length: factor applied to every map and to the start/end
        offsets used to slice each sequence

    Returns an Alignment constructed with moltype=None.
    """
    starts, ends, maps = map_traceback(aligned_positions)
    named_aligned = [
        (name,
         Aligned(amap * word_length,
                 seq[start * word_length:end * word_length]))
        for start, end, amap, (name, seq) in zip(starts, ends, maps, seqs)
    ]
    return Alignment(moltype=None, data=named_aligned)
Exemplo n.º 4
0
    def setUp(self):
        """Build the named and unnamed RNA fixtures shared by all tests.

        The same three strings are stored as RnaSequence objects inside an
        Alignment and as ArraySequence objects inside an ArrayAlignment.
        """
        degen_gapped = RNA.alphabets.degen_gapped

        # sequences carrying explicit names
        self.rna1 = RnaSequence("UCAGGG", name="rna1")
        self.rna2 = RnaSequence("YCU-RG", name="rna2")
        self.rna3 = RnaSequence("CAA-NR", name="rna3")
        self.model1 = ArraySequence("UCAGGG", name="rna1", alphabet=degen_gapped)
        self.model2 = ArraySequence("YCU-RG", name="rna2", alphabet=degen_gapped)
        self.model3 = ArraySequence("CAA-NR", name="rna3", alphabet=degen_gapped)

        named_rna = [self.rna1, self.rna2, self.rna3]
        self.aln = Alignment(named_rna, moltype=RNA)
        named_models = [self.model1, self.model2, self.model3]
        self.da = ArrayAlignment(named_models,
                                 moltype=RNA,
                                 alphabet=degen_gapped)

        # the same sequences again, this time without names
        self.nn_rna1 = RnaSequence("UCAGGG")
        self.nn_rna2 = RnaSequence("YCU-RG")
        self.nn_rna3 = RnaSequence("CAA-NR")

        self.nn_model1 = ArraySequence("UCAGGG", alphabet=degen_gapped)
        self.nn_model2 = ArraySequence("YCU-RG", alphabet=degen_gapped)
        self.nn_model3 = ArraySequence("CAA-NR", alphabet=degen_gapped)

        unnamed_rna = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
        self.nn_aln = Alignment(unnamed_rna, moltype=RNA)
        unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
        self.nn_da = ArrayAlignment(unnamed_models,
                                    moltype=RNA,
                                    alphabet=degen_gapped)
Exemplo n.º 5
0
    def get_alignment(self,
                      feature_types=None,
                      where_feature=None,
                      omit_redundant=True):
        """Return the members' aligned sequences as a DNA Alignment.

        Arguments:
            - feature_types: annotations to be applied to the returned
              sequences
            - where_feature: forwarded unchanged to each member's
              get_annotated_aligned() when feature_types is given
            - omit_redundant: exclude redundant gap positions

        Returns None when no member yields an aligned sequence."""
        seqs = []

        for member in self.members:
            if feature_types:
                seq = member.get_annotated_aligned(feature_types,
                                                   where_feature)
            else:
                seq = member.aligned_seq
            if seq is None:
                # this member has nothing to contribute
                continue
            name = seq.name

            if self._rc:  # names should reflect change to strand
                loc = member.location.copy()
                loc.strand *= -1
                name = str(loc)

            seq.name = seq.data.name = name
            seqs.append((name, seq))

        # seqs is always a list, so emptiness (not None) signals "no data"
        if not seqs:
            return None

        aln = Alignment(data=seqs, moltype=DNA)

        if self._rc:
            aln = aln.rc()

        if omit_redundant:
            aln = aln.omit_gap_pos()

        return aln
Exemplo n.º 6
0
    def test_subset_positions_Alignment(self):
        """take_positions([0, 1, 5]) should equal an Alignment of those columns."""
        expected = Alignment(
            [
                RnaSequence("UCG", name="rna1"),
                RnaSequence("YCG", name="rna2"),
                RnaSequence("CAR", name="rna3"),
            ],
            moltype=RNA,
        )

        observed = self.aln.take_positions([0, 1, 5])
        self.assertEqual(observed, expected)
        self.assertNotEqual(observed, self.aln)
        # The printed forms must agree as well.
        # NOTE(review): an earlier comment said this comparison failed because
        # sequence order was not maintained -- confirm whether that still holds.
        self.assertEqual(str(observed), str(expected))
Exemplo n.º 7
0
    def setUp(self):
        """Setup for Clustal tests."""
        self.unaligned_dict = {
            "1st": "AAA",
            "2nd": "CCCC",
            "3rd": "GGGG",
            "4th": "UUUU",
        }
        self.alignment_dict = {
            "1st": "AAAA",
            "2nd": "CCCC",
            "3rd": "GGGG",
            "4th": "UUUU",
        }
        # create alignment change order.
        self.alignment_object = Alignment(self.alignment_dict)
        self.alignment_order = ["2nd", "4th", "3rd", "1st"]
        self.alignment_object.RowOrder = self.alignment_order

        self.clustal_with_label = """CLUSTAL

1st    AAAA
2nd    CCCC
3rd    GGGG
4th    UUUU
"""
        self.clustal_with_label_lw2 = """CLUSTAL

1st    AA
2nd    CC
3rd    GG
4th    UU

1st    AA
2nd    CC
3rd    GG
4th    UU
"""

        self.clustal_with_label_reordered = """CLUSTAL

2nd    CCCC
4th    UUUU
3rd    GGGG
1st    AAAA
"""

        self.clustal_with_label_lw2_reordered = """CLUSTAL
Exemplo n.º 8
0
    def test_user_function_multiple(self):
        """Two user_function wrappers used side by side give independent results."""
        from cogent3 import make_aligned_seqs
        from cogent3.core.alignment import Alignment

        wrapped_foo = user_function(self.foo, "aligned", "aligned")
        wrapped_bar = user_function(self.bar, "aligned", "pairwise_distances")

        first_aln = make_aligned_seqs(
            data=[("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")]
        )
        second_aln = Alignment(
            data={"s1": "ACGTACGTA", "s2": "GTGTACGTA"},
            moltype="dna",
        )

        # each wrapper is applied to its own alignment
        from_foo = wrapped_foo(first_aln)
        from_bar = wrapped_bar(second_aln)
        self.assertEqual(from_foo.to_dict(), {"a": "GCAA", "b": "GCTT"})
        self.assertEqual(from_bar, {("s1", "s2"): 2.0, ("s2", "s1"): 2.0})
Exemplo n.º 9
0
def pairwise_to_multiple(pwise, ref_seq, moltype, info=None):
    """
    Merge pairwise alignments against one reference into a multiple alignment.

    Parameters
    ----------
    pwise
        Series of pairwise alignments to ref_seq as
        [(non-refseq name, aligned pair), ...]; it is iterated twice
    ref_seq
        The sequence common to all pairwise alignments; must have a
        ``name`` attribute
    moltype
        molecular type for the returned alignment
    info
        info object handed to the Alignment constructor

    Returns
    -------
    The assembled Alignment after ``to_type(array_align=True, ...)``

    Raises
    ------
    TypeError
        if ref_seq has no ``name`` attribute
    """
    if not hasattr(ref_seq, "name"):
        raise TypeError(
            f"ref_seq must be a cogent3 sequence, not {type(ref_seq)}")

    # collect every copy of the reference found in the pairwise alignments
    ref_copies = []
    for _, pair in pwise:
        for candidate in pair.seqs:
            if candidate.name == ref_seq.name:
                ref_copies.append(candidate)
    ref_gaps = _gap_union(ref_copies)

    ref_map = gap_coords_to_map(ref_gaps, len(ref_seq))
    aligned = [Aligned(ref_map, ref_seq)]
    for other_name, pair in pwise:
        pair_ref = pair.named_seqs[ref_seq.name]
        pair_ref_gaps = dict(pair_ref.map.get_gap_coordinates())
        other = pair.named_seqs[other_name]
        other_gaps = dict(other.map.get_gap_coordinates())
        extra_gaps = _combined_refseq_gaps(pair_ref_gaps, ref_gaps)
        inject = _gaps_for_injection(other_gaps, extra_gaps, len(other.data))
        if inject:
            # rebuild the non-reference sequence on a map holding the injected gaps
            other = Aligned(gap_coords_to_map(inject, len(other.data)),
                            other.data)

        aligned.append(other)

    # default to ArrayAlign
    combined = Alignment(aligned, moltype=moltype, info=info)
    return combined.to_type(array_align=True, moltype=moltype)
Exemplo n.º 10
0
def get_align_for_phylip(data, id_map=None):
    """
    Convenience function to return an alignment object from phylip data.

    data: sequence of lines in phylip format (an open file, list, etc)
    id_map: optional id mapping from external ids to phylip labels; it is
        handed to MinimalPhylipParser unchanged

    returns Alignment object
    """
    records = list(MinimalPhylipParser(data, id_map))
    return Alignment(records)
Exemplo n.º 11
0
class AllTests(TestCase):
    """Side-by-side behaviour checks for Alignment and ArrayAlignment.

    setUp stores the same three RNA strings as an Alignment of RnaSequence
    objects and as an ArrayAlignment of ArraySequence objects, once with
    explicit names and once without.
    """

    def setUp(self):
        """setUp method for all tests"""
        # named sequences
        self.rna1 = RnaSequence("UCAGGG", name="rna1")
        self.rna2 = RnaSequence("YCU-RG", name="rna2")
        self.rna3 = RnaSequence("CAA-NR", name="rna3")
        self.model1 = ArraySequence("UCAGGG",
                                    name="rna1",
                                    alphabet=RNA.alphabets.degen_gapped)
        self.model2 = ArraySequence("YCU-RG",
                                    name="rna2",
                                    alphabet=RNA.alphabets.degen_gapped)
        self.model3 = ArraySequence("CAA-NR",
                                    name="rna3",
                                    alphabet=RNA.alphabets.degen_gapped)

        self.aln = Alignment([self.rna1, self.rna2, self.rna3], moltype=RNA)
        self.da = ArrayAlignment(
            [self.model1, self.model2, self.model3],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )

        # seqs no name
        self.nn_rna1 = RnaSequence("UCAGGG")
        self.nn_rna2 = RnaSequence("YCU-RG")
        self.nn_rna3 = RnaSequence("CAA-NR")

        self.nn_model1 = ArraySequence("UCAGGG",
                                       alphabet=RNA.alphabets.degen_gapped)
        self.nn_model2 = ArraySequence("YCU-RG",
                                       alphabet=RNA.alphabets.degen_gapped)
        self.nn_model3 = ArraySequence("CAA-NR",
                                       alphabet=RNA.alphabets.degen_gapped)

        self.nn_aln = Alignment([self.nn_rna1, self.nn_rna2, self.nn_rna3],
                                moltype=RNA)
        self.nn_da = ArrayAlignment(
            [self.nn_model1, self.nn_model2, self.nn_model3],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        # Note: the newline trailing each sequence is intentional, because
        # we want each FASTA-format record to be separated.
        exp_lines_general = [
            ">rna1", "UCAGGG", ">rna2", "YCU-RG", ">rna3", "CAA-NR"
        ]
        self.assertEqual(str(self.aln), "\n".join(exp_lines_general) + "\n")
        self.assertEqual(str(self.da), "\n".join(exp_lines_general) + "\n")

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln
        """
        # the expected output labels the unnamed sequences seq_0, seq_1, seq_2;
        # the trailing newline is carried by the last element
        exp_lines_gen = [
            ">seq_0", "UCAGGG", ">seq_1", "YCU-RG", ">seq_2", "CAA-NR\n"
        ]
        self.assertEqual(str(self.nn_aln), "\n".join(exp_lines_gen))
        self.assertEqual(str(self.nn_da), "\n".join(exp_lines_gen))

    def test_ArrayAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""

        m1 = ArraySequence("UCAG",
                           alphabet=RNA.alphabets.degen_gapped,
                           name="rna1")
        m2 = ArraySequence("CCCR",
                           alphabet=RNA.alphabets.degen_gapped,
                           name="rna2")
        da = ArrayAlignment([m1, m2])
        exp_lines = [">rna1", "UCAG", ">rna2", "CCCR"]
        self.assertEqual(str(da), "\n".join(exp_lines) + "\n")

    def test_names(self):
        """names should be the same on Alignment and ArrayAlignment"""
        # Should both alignments handle names the same way?
        self.assertEqual(self.aln.names, ["rna1", "rna2", "rna3"])
        self.assertEqual(self.da.names, ["rna1", "rna2", "rna3"])
        # On unnamed sequences the behavior is now the same.
        self.assertEqual(self.nn_aln.names, ["seq_0", "seq_1", "seq_2"])
        self.assertEqual(self.nn_da.names, ["seq_0", "seq_1", "seq_2"])

    def test_seqFreqs(self):
        """counts_per_seq should work the same on Alignment and ArrayAlignment"""
        # 'UCAGGG'
        # 'YCU-RG'
        # 'CAA-NR'

        # outer key: index of the sequence above; inner key: character
        expected_counts = {
            0: {
                "U": 1,
                "C": 1,
                "A": 1,
                "G": 3
            },
            1: {
                "Y": 1,
                "C": 1,
                "U": 1,
                "-": 1,
                "R": 1,
                "G": 1
            },
            2: {
                "C": 1,
                "A": 2,
                "-": 1,
                "N": 1,
                "R": 1
            },
        }
        got1 = self.da.counts_per_seq(allow_gap=True, include_ambiguity=True)
        got2 = self.aln.counts_per_seq(allow_gap=True, include_ambiguity=True)
        for seq_index, counts in expected_counts.items():
            for char, expected in counts.items():
                self.assertEqual(got1[seq_index, char], expected)
                self.assertEqual(got2[seq_index, char], expected)

    def test_subset_positions_ArrayAlignment(self):
        """take_positions and get_sub_alignment(pos=...) should agree on ArrayAlignment"""
        # because dict order volatile, need to grab the
        # the index for ambig characters from the object
        # The full data comes from these seqs
        # 'UCAGGG'
        # 'YCU-RG'
        # 'CAA-NR'
        get_index = RNA.alphabets.degen_gapped.index
        G = get_index("-")
        N = get_index("N")
        R = get_index("R")
        Y = get_index("Y")
        full_data = array([[0, 1, 2, 3, 3, 3], [Y, 1, 0, G, R, 3],
                           [1, 2, 2, G, N, R]])

        model1 = ArraySequence("UCG",
                               name="rna1",
                               alphabet=RNA.alphabets.degen_gapped)
        model2 = ArraySequence("YCG",
                               name="rna2",
                               alphabet=RNA.alphabets.degen_gapped)
        model3 = ArraySequence("CAR",
                               name="rna3",
                               alphabet=RNA.alphabets.degen_gapped)
        sub_da = ArrayAlignment([model1, model2, model3],
                                moltype=RNA,
                                alphabet=RNA.alphabets.degen_gapped)

        # columns 0, 1 and 5 of full_data
        sub_data = array([[0, 1, 3], [Y, 1, 3], [1, 2, R]])

        # First check some data
        self.assertEqual(self.da.array_seqs, full_data)
        self.assertEqual(self.da.array_positions, transpose(full_data))
        self.assertEqual(sub_da.array_seqs, sub_data)
        self.assertEqual(sub_da.array_positions, transpose(sub_data))

        obs_sub_da_TP = self.da.take_positions([0, 1, 5])
        obs_sub_da_SA = self.da.get_sub_alignment(pos=[0, 1, 5])

        # When using the get_sub_alignment method the data is right
        self.assertEqual(obs_sub_da_SA, sub_da)
        self.assertNotEqual(obs_sub_da_SA, self.da)
        self.assertEqual(obs_sub_da_SA.array_seqs, sub_data)
        self.assertEqual(obs_sub_da_SA.array_positions, transpose(sub_data))

        # For the take_positions method: Why does this work
        self.assertEqual(obs_sub_da_TP, sub_da)
        self.assertNotEqual(obs_sub_da_TP, self.da)
        # If the data doesn't match?
        self.assertEqual(obs_sub_da_TP.array_seqs, sub_data)
        self.assertEqual(obs_sub_da_TP.array_positions, transpose(sub_data))
        # Shouldn't the __eq__ method check the data at least?

    def test_subset_positions_Alignment(self):
        """take_positions on Alignment should match an Alignment of those columns"""
        rna1 = RnaSequence("UCG", name="rna1")
        rna2 = RnaSequence("YCG", name="rna2")
        rna3 = RnaSequence("CAR", name="rna3")

        sub_aln = Alignment([rna1, rna2, rna3], moltype=RNA)

        obs_sub_aln = self.aln.take_positions([0, 1, 5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # string representations should be the same; sequence order is
        # checked on its own in test_take_positions_sequence_order.
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_take_positions_sequence_order(self):
        """Alignment take_positions should maintain seq order"""
        # ArrayAlignment, via get_sub_alignment
        self.assertEqual(self.da.names, ["rna1", "rna2", "rna3"])
        sub_da = self.da.get_sub_alignment(pos=[0, 1, 5])
        self.assertEqual(sub_da.names, ["rna1", "rna2", "rna3"])
        # Alignment, via take_positions
        self.assertEqual(self.aln.names, ["rna1", "rna2", "rna3"])
        sub_aln = self.aln.take_positions([0, 1, 5])
        self.assertEqual(sub_aln.names, ["rna1", "rna2", "rna3"])

    def test_subset_seqs_Alignment(self):
        """take_seqs on Alignment should match a directly built sub-alignment"""
        rna1 = RnaSequence("UCG", name="rna1")
        rna2 = RnaSequence("YCG", name="rna2")
        rna3 = RnaSequence("CAR", name="rna3")

        sub_aln = Alignment([rna2, rna3], moltype=RNA)
        aln = Alignment([rna1, rna2, rna3], moltype=RNA)
        obs_sub_aln = aln.take_seqs(["rna2", "rna3"])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should be in specified order?
        obs_sub_aln_1 = self.aln.take_seqs(["rna3", "rna2"])
        obs_sub_aln_2 = self.aln.take_seqs(["rna2", "rna3"])
        self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))

    def test_subset_seqs_ArrayAlignment(self):
        """take_seqs by name should match get_sub_alignment by seq index"""
        # take_seqs by name should have the same effect as
        # get_sub_alignment by seq idx?
        obs_sub_da_TS = self.da.take_seqs(["rna1"])
        obs_sub_da_SA = self.da.get_sub_alignment(seqs=[0])

        # These two are now the same. Fixed mapping of key to char array.
        self.assertEqual(obs_sub_da_TS, obs_sub_da_SA)
        self.assertEqual(str(obs_sub_da_TS), str(obs_sub_da_SA))

    def test_aln_equality(self):
        """ArrayAlignment == should depend on content, not order or alphabet argument"""
        # When does something compare equal?
        self.assertEqual(self.da == self.da, True)
        # one sequence less
        other_da1 = ArrayAlignment([self.model1, self.model2],
                                   moltype=RNA,
                                   alphabet=RNA.alphabets.degen_gapped)
        self.assertEqual(self.da == other_da1, False)
        # seqs in different order -- doesn't matter
        other_da2 = ArrayAlignment(
            [self.model1, self.model3, self.model2],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )
        self.assertEqual(self.da == other_da2, True)
        # seqs in different encoding -- doesn't matter, only looks at data
        other_da3 = ArrayAlignment([self.model1, self.model2, self.model3])
        # Should this compare False even though the data is exactly the same?
        # The moltype is different...
        self.assertEqual(self.da == other_da3, True)
        # assertTrue (not a bare assert) so the check survives ``python -O``
        self.assertTrue((self.da.array_seqs == other_da3.array_seqs).all())

    def test_seq_equality(self):
        """ArraySequences built from identical arguments should compare equal"""
        model1 = ArraySequence("UCG",
                               name="rna1",
                               alphabet=RNA.alphabets.degen_gapped)
        model2 = ArraySequence("UCG",
                               name="rna1",
                               alphabet=RNA.alphabets.degen_gapped)
        # Shouldn't the above two sequences be equal?
        self.assertEqual(model1, model2)
        # string comparison is True
        self.assertEqual(str(model1), str(model2))

    def test_seq_ungapping(self):
        """degap should strip gaps from both RnaSequence and ArraySequence"""
        rna1 = RnaSequence("U-C-A-G-", name="rna1")
        model1 = ArraySequence("U-C-A-G-",
                               name="rna1",
                               alphabet=RNA.alphabets.degen_gapped)

        self.assertEqual(rna1, "U-C-A-G-")
        self.assertEqual(rna1.degap(), "UCAG")

        # check it produces the right string from the beginning
        self.assertEqual(str(model1), "U-C-A-G-")
        self.assertEqual(model1._data, [0, 4, 1, 4, 2, 4, 3, 4])
        # ArraySequence should maybe have the same degap method as normal seq
        self.assertEqual(str(model1.degap()), "UCAG")

    def test_the_rest_of_ModelSequence(self):
        """The class ArraySequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        # note: mostly these are tested in derived classes, for convenience.