Exemplo n.º 1
0
    def test_make_aligned_seqs(self):
        """test Alignment/ArrayAlignment constructor utility function

        Checks the default return type and the ``moltype``, ``info``,
        ``source`` and ``array_align`` arguments of ``make_aligned_seqs``.
        """
        data = {"a": "AGGTT", "b": "AGAA-"}
        got = make_aligned_seqs(data)
        self.assertIsInstance(got, ArrayAlignment)
        self.assertEqual(got.to_dict(), data)
        self.assertEqual(got.info["source"], "unknown")
        # moltype arg works
        got = make_aligned_seqs(data, moltype="dna")
        self.assertEqual(got.moltype.label, "dna")
        # info works
        got = make_aligned_seqs(data, info=dict(a=2))
        self.assertEqual(got.info["a"], 2)
        # a non-dict info must be rejected by the aligned constructor itself
        with self.assertRaises(AssertionError):
            _ = make_aligned_seqs(data, info=2)

        # source works
        got = make_aligned_seqs(data, source="somewhere")
        self.assertEqual(got.info["source"], "somewhere")

        # array_align works
        got = make_aligned_seqs(data, array_align=False)
        self.assertIsInstance(got, Alignment)
        self.assertEqual(got.to_dict(), data)
        self.assertEqual(got.info["source"], "unknown")
Exemplo n.º 2
0
 def test_roundtrip_seqcoll(self):
     """A SequenceCollection survives a to_json / deserialise round trip."""
     original = make_unaligned_seqs(data={"A": "TTGT", "B": "GGCT"}, moltype="dna")
     restored = deserialise_object(original.to_json())
     # compare the reverse complements, then check the restored type
     restored_rc = restored.rc().to_dict()
     original_rc = original.rc().to_dict()
     self.assertEqual(restored_rc, original_rc)
     self.assertIsInstance(restored, alignment.SequenceCollection)
Exemplo n.º 3
0
    def test_replace_seqs(self):
        """Gaps of a protein alignment are carried over to the codon sequences."""
        protein_data = {
            "FlyingFox": "C-TNAH",
            "DogFaced": "CGTNT-",
            "FreeTaile": "-GTDTH",
            "LittleBro": "C-TD-H",
            "TombBat": "C--STH",
        }
        codon_data = {
            "TombBat": "TGTAGTACTCAT",
            "FreeTaile": "GGCACAGATACTCAT",
            "FlyingFox": "TGTACAAATGCTCAT",
            "LittleBro": "TGTACAGATCAT",
            "DogFaced": "TGTGGCACAAATACT",
        }
        # each protein gap becomes a three-character codon gap
        expected = {
            "FlyingFox": "TGT---ACAAATGCTCAT",
            "DogFaced": "TGTGGCACAAATACT---",
            "FreeTaile": "---GGCACAGATACTCAT",
            "LittleBro": "TGT---ACAGAT---CAT",
            "TombBat": "TGT------AGTACTCAT",
        }

        protein_aln = make_aligned_seqs(moltype=PROTEIN, data=protein_data)
        codon_seqs = make_unaligned_seqs(moltype=DNA, data=codon_data)
        observed = protein_aln.replace_seqs(codon_seqs).to_dict()
        for name, gapped_codons in expected.items():
            self.assertEqual(observed[name], gapped_codons)
Exemplo n.º 4
0
 def test_progress_with_guide_tree(self):
     """Progressive alignment accepts a user supplied guide tree."""
     # the guide tree may be given as a newick string or as a tree object
     guides = (self.treestring, make_tree(treestring=self.treestring))
     for guide in guides:
         aligner = align_app.progressive_align(model="nucleotide", guide_tree=guide)
         result = aligner(self.seqs)
         self.assertEqual(len(result), 42)

     # a tree name containing an underscore still matches the sequence name
     underscore_tree = (
         "(Bandicoot:0.4,FlyingFox:0.05,(Rhesus_macaque:0.06,"
         "Human:0.0):0.04);"
     )
     aligner = align_app.progressive_align(
         model="nucleotide", guide_tree=underscore_tree
     )
     renamed = self.seqs.to_dict()
     renamed["Rhesus macaque"] = renamed.pop("Rhesus")
     result = aligner(make_unaligned_seqs(renamed))
     self.assertEqual(len(result), 42)

     # a guide tree without branch lengths raises ValueError on construction
     no_lengths = "(Bandicoot,FlyingFox,(Rhesus_macaque,Human));"
     with self.assertRaises(ValueError):
         align_app.progressive_align(model="nucleotide", guide_tree=no_lengths)
Exemplo n.º 5
0
 def test_dotplot_seqcoll(self):
     """dotplot sequence collection, gaps are removed"""
     seqs = make_unaligned_seqs(
         {"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}, moltype="dna"
     )
     dp = seqs.dotplot("seq1", "seq3")
     # identity check: an equality comparison against None can be overridden
     self.assertIsNotNone(dp._aligned_coords)
     self.assertEqual(len(dp.seq1), 4)
     # "CCG-" contributes only its three non-gap characters
     self.assertEqual(len(dp.seq2), 3)
Exemplo n.º 6
0
    def test_roundtrip_annotated_seqcoll(self):
        """Sequence annotations survive a SequenceCollection json round trip."""
        seqs = make_unaligned_seqs(data={"A": "TTGTA", "B": "GGCT"}, moltype="dna")
        feature = seqs.named_seqs["A"].add_feature("gene", "n1", [(2, 5)])

        serialised = seqs.to_json()
        expect = str(feature.get_slice())
        restored = deserialise_object(serialised)
        # the first annotation on the restored "A" must slice to the same text
        restored_feature = restored.named_seqs["A"].annotations[0]
        self.assertEqual(str(restored_feature.get_slice()), expect)
Exemplo n.º 7
0
    def test_make_unaligned_seqs(self):
        """Exercise the make_unaligned_seqs constructor utility function."""
        raw = {"a": "AGGTT", "b": "AG"}
        coll = make_unaligned_seqs(raw)
        self.assertIsInstance(coll, SequenceCollection)
        self.assertEqual(coll.to_dict(), raw)
        self.assertEqual(coll.info["source"], "unknown")

        # the moltype argument is honoured
        with_moltype = make_unaligned_seqs(raw, moltype="dna")
        self.assertEqual(with_moltype.moltype.label, "dna")

        # info is stored when it is a dict, and rejected otherwise
        with_info = make_unaligned_seqs(raw, info={"a": 2})
        self.assertEqual(with_info.info["a"], 2)
        with self.assertRaises(AssertionError):
            make_unaligned_seqs(raw, info=2)

        # source is recorded in info
        with_source = make_unaligned_seqs(raw, source="somewhere")
        self.assertEqual(with_source.info["source"], "somewhere")
Exemplo n.º 8
0
 def test_dotplot_title(self):
     """An empty string is accepted as the dotplot title."""
     raw = {"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}
     coll = make_unaligned_seqs(raw, moltype="dna")
     plot = coll.dotplot("seq1", "seq3", title="")
     self.assertEqual(plot.figure.layout.title, "")
Exemplo n.º 9
0
 def test_progressive_fails(self):
     """should return NotCompletedResult along with message"""
     # Bandicoot has an in-frame stop codon
     seqs = make_unaligned_seqs(
         data={"Human": "GCCTCA", "Rhesus": "GCCAGCTCA", "Bandicoot": "TGATCATTA"},
         moltype="dna",
     )
     aligner = align_app.progressive_align(model="codon")
     got = aligner(seqs)
     # assertTrue(type(got), NotCompleted) would always pass, since its
     # second argument is only the failure message
     self.assertIsInstance(got, NotCompleted)
Exemplo n.º 10
0
    def test_trim_stop_codons(self):
        """Stop codons are trimmed according to the chosen genetic code."""
        # (app kwargs, constructor, seq2 input, expected seq2 output)
        cases = [
            # defaults to the standard code
            ({}, make_unaligned_seqs, "AAATTTTAA", "AAATTT"),
            # standard code given explicitly
            ({"gc": 1}, make_unaligned_seqs, "AAATTTTAA", "AAATTT"),
            # for an alignment the trimmed codon is replaced by gaps
            ({"gc": 1}, make_aligned_seqs, "AAATTTTAA", "AAATTT---"),
            # different genetic code (mt code)
            ({"gc": 2}, make_unaligned_seqs, "AAATTTAGA", "AAATTT"),
        ]
        for app_kwargs, constructor, seq2, seq2_trimmed in cases:
            trimmer = sample.trim_stop_codons(**app_kwargs)
            seqs = constructor(
                data={"seq1": "AAATTTCCC", "seq2": seq2}, moltype="dna"
            )
            got = trimmer(seqs)
            expect = {"seq1": "AAATTTCCC", "seq2": seq2_trimmed}
            self.assertEqual(got.to_dict(), expect)
Exemplo n.º 11
0
 def test_translate_seqcoll(self):
     """A sequence collection is translated correctly."""
     dna = make_unaligned_seqs({"a": "ATGAGG", "b": "ATGTAA"})

     # by default the terminal stop is trimmed before translating
     protein = translate_seqs()(dna)
     self.assertEqual(protein.to_dict(), {"a": "MR", "b": "M"})
     self.assertEqual(protein.moltype.label, "protein")

     # keeping the terminal stop gives NotCompleted instead
     failed = translate_seqs(trim_terminal_stop=False)(dna)
     self.assertIsInstance(failed, NotCompleted)
Exemplo n.º 12
0
 def test_dotplot_missing(self):
     """dotplot raises ValueError when a sequence name is absent."""
     coll = make_unaligned_seqs(
         {"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}, moltype="dna"
     )
     # second name missing, first name missing, both missing
     bad_pairs = (("seq1", "seq5"), ("seq5", "seq1"), ("seq5", "seq6"))
     for name1, name2 in bad_pairs:
         with self.assertRaises(ValueError):
             coll.dotplot(name1, name2)
Exemplo n.º 13
0
    def test_select_translatable(self):
        """select_translatable returns the translatable sequences."""
        raw = {
            "a": "AATATAAATGCCAGCTCATTACAGCATGAGAACA" "GCAGTTTATTACTTCATAAAGTCATA",
            "rc": "TATGACTTTATGAAGTAATAAACTGCTGTTCTCA" "TGCTGTAATGAGCTGGCATTTATATT",
        }
        seqs = make_unaligned_seqs(data=raw, moltype=DNA)

        # without reverse complementing, "rc" is dropped
        forward_only = select_translatable(allow_rc=False)(seqs)
        expect = {name: seq for name, seq in raw.items() if name != "rc"}
        self.assertEqual(forward_only.to_dict(), expect)

        # with reverse complementing, "rc" comes back identical to "a"
        both_strands = select_translatable(allow_rc=True)(seqs)
        expect = dict(raw, rc=raw["a"])
        self.assertEqual(both_strands.to_dict(), expect)

        # if seqs not translatable returns NotCompletedResult
        # NOTE(review): this final check is vacuous as written -- the app is
        # constructed but never applied to `seqs`, and assertTrue treats
        # NotCompleted as the failure message. Confirm the intended outcome
        # for this data before tightening it.
        untranslatable = {"a": "TAATTGATTAA", "b": "GCAGTTTATTA"}
        seqs = make_unaligned_seqs(data=untranslatable, moltype=DNA)
        app = select_translatable(allow_rc=False)
        self.assertTrue(type(app), NotCompleted)
Exemplo n.º 14
0
    def test_minlength(self):
        """correctly identifies data with minimal length"""
        aln = make_aligned_seqs(
            data=[("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")]
        )

        # if using subtract_degen, fails if incorrect moltype
        ml = sample.min_length(9, subtract_degen=True)
        got = ml(aln)
        self.assertIsInstance(got, NotCompleted)
        self.assertEqual(got.type, "ERROR")

        # but works if subtract_degen is False
        ml = sample.min_length(9, subtract_degen=False)
        aln = ml(aln)
        self.assertEqual(len(aln), 12)
        # or if moltype provided
        ml = sample.min_length(9, subtract_degen=True, moltype="dna")
        aln = ml(aln)
        self.assertEqual(len(aln), 12)

        alns = [
            make_aligned_seqs(
                data=[("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")], moltype=DNA
            ),
            make_aligned_seqs(
                data=[("a", "GGAAGCGT"), ("b", "GCTTT-GT")], moltype=DNA
            ),
        ]
        ml = sample.min_length(9)
        # the `if result` filter drops falsy results from the app
        got = [result.to_dict() for result in map(ml, alns) if result]
        expected = [{"a": "GCAAGCGTTTAT", "b": "GCTTTTGTCAAT"}]
        self.assertEqual(got, expected)

        # returns NotCompletedResult if nothing satisfies
        got = ml(alns[1])
        self.assertIsInstance(got, sample.NotCompleted)

        alns = [
            make_unaligned_seqs(
                data=[("a", "GGAAGCGT"), ("b", "GCTTNGT")], moltype=DNA
            )
        ]
        ml = sample.min_length(6)
        got = [result.to_dict() for result in map(ml, alns) if result]
        expected = [{"a": "GGAAGCGT", "b": "GCTTNGT"}]
        self.assertEqual(got, expected)

        ml = sample.min_length(7)
        got = [result.to_dict() for result in map(ml, alns) if result]
        expected = []
        self.assertEqual(got, expected)
Exemplo n.º 15
0
 def setUp(self):
     """Build an alignment and a collection from the same four sequences."""
     raw = {
         "a": "GTACGTACGATC",
         "b": "GTACGTACGTAC",
         "c": "GTACGTACGTTC",
         "e": "GTACGTACTGGT",
     }
     # each constructor gets its own dict, as in separate literals
     self.al = make_aligned_seqs(data=dict(raw))
     self.collection = make_unaligned_seqs(data=dict(raw))
Exemplo n.º 16
0
    def test_DnaRna_interconversion(self):
        """DNA and RNA interconvert for both SequenceCollection and
        Alignment"""
        dna = {
            "seq1": "--ACGT--GT---",
            "seq2": "--ACGTA-GT---",
            "seq3": "--ACGTA-GT---",
        }
        # the RNA data is the same sequences with T replaced by U
        rna = {name: seq.replace("T", "U") for name, seq in dna.items()}

        # unaligned collections
        dna_coll = make_unaligned_seqs(data=dna, moltype=DNA)
        rna_coll = make_unaligned_seqs(data=rna, moltype=RNA)
        self.assertEqual(rna_coll.to_dna().to_dict(), dna)
        self.assertEqual(dna_coll.to_rna().to_dict(), rna)

        # alignments
        dna_aln = make_aligned_seqs(data=dna, moltype=DNA)
        rna_aln = make_aligned_seqs(data=rna, moltype=RNA)
        converted_to_rna = dna_aln.to_rna()
        converted_to_dna = rna_aln.to_dna()
        self.assertEqual(converted_to_rna.to_dict(), rna)
        self.assertEqual(converted_to_dna.to_dict(), dna)
Exemplo n.º 17
0
    def test_progressive_est_tree(self):
        """Exercise progressive alignment when no guide tree is supplied."""
        raw = {
            "A": "TGTGGCACAAATGCTCATGCCAGCTCTTTACAGCATGAGAACA",
            "B": "TGTGGCACAGATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTT",
            "C": "TGTGGCACAAGTACTCATGCCAGCTCAGTACAGCATGAGAACAGCAGTTT",
        }
        # the shorter sequence "A" is expected to be padded with trailing gaps
        expect = {
            "A": "TGTGGCACAAATGCTCATGCCAGCTCTTTACAGCATGAGAACA-------",
            "C": "TGTGGCACAAGTACTCATGCCAGCTCAGTACAGCATGAGAACAGCAGTTT",
            "B": "TGTGGCACAGATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTT",
        }
        seqs = make_unaligned_seqs(data=raw)
        # TreeAlign returns the alignment and a tree; only the former is checked
        aln, _ = cogent3.align.progressive.TreeAlign(
            HKY85(), seqs, show_progress=False, param_vals={"kappa": 4.0}
        )
        self.assertEqual(aln.to_dict(), expect)
Exemplo n.º 18
0
 def test_degap(self):
     """degap strips gaps from both alignments and collections."""
     gapped = {
         "seq1": "--ACGT--GT---",
         "seq2": "--ACGTA-GT---",
         "seq3": "--ACGTA-GT---",
     }
     expect = {"seq1": "ACGTGT", "seq2": "ACGTAGT", "seq3": "ACGTAGT"}

     # alignment
     from_aln = make_aligned_seqs(data=dict(gapped)).degap()
     self.assertEqual(from_aln.to_dict(), expect)

     # collection; the moltype must be retained after degapping
     collection = make_unaligned_seqs(data=dict(gapped), moltype=DNA)
     from_coll = collection.degap()
     self.assertEqual(from_coll.to_dict(), expect)
     self.assertEqual(from_coll.moltype, DNA)
Exemplo n.º 19
0
    def test_omit_duplicated(self):
        """Duplicated sequences are dropped, strictly or with degenerates masked."""
        raw = {
            "a": "ACGT",
            "b": "ACG-",  # identical excepting -
            "c": "ACGN",  # non-strict matches above
            "d": "ACGG",
            "e": "ACGG",
            "k": "ACGG",  # strict identical
            "f": "RAAA",
            "g": "YAAA",  # non-strict identical
            "h": "GGGG",  # unique!
        }
        seqs = make_unaligned_seqs(data=raw, moltype=DNA)

        # mask_degen = True : [{'a', 'c', 'b'}, {'k', 'd', 'e'},
        # {'g', 'f'}] are dupe sets. Only 'h' unique
        masked = sample.omit_duplicated(mask_degen=True, choose=None, moltype="dna")
        self.assertEqual(masked(seqs).to_dict(), {"h": "GGGG"})

        # mask_degen = False : per the expected result below, only the
        # strictly identical d, e, k are dropped; a, b, c, f, g, h remain
        strict = sample.omit_duplicated(mask_degen=False, choose=None, moltype="dna")
        kept = ("a", "b", "c", "f", "g", "h")
        expect = {name: raw[name] for name in kept}
        self.assertEqual(strict(seqs).to_dict(), expect)
Exemplo n.º 20
0
    def load(self, data):
        """returns sequences

        Parameters
        ----------
        data
            file path or cogent3 sequence collection / alignment

        Returns
        -------
        The loaded sequences. They are degapped unless "aligned" is among
        this object's output types.

        Notes
        -----
        When ``data`` is a file path, the path is recorded on the result as
        ``info.path``. An existing ``SequenceCollection`` instance is used
        as is.
        """
        if isinstance(data, str):
            # keep the path under its own name so it is still available
            # after the records have been parsed
            path = data
            with open_(path) as infile:
                records = dict(self._parser(infile))
            seqs = self.klass(data=records, moltype=self.moltype)
            seqs.info.path = path
        elif isinstance(data, SequenceCollection):
            # already a collection: nothing to construct
            seqs = data
        elif self.aligned:
            seqs = make_aligned_seqs(data, moltype=self.moltype)
        else:
            seqs = make_unaligned_seqs(data, moltype=self.moltype)

        if not (self._output_types & {"aligned"}):
            seqs = seqs.degap()

        return seqs
Exemplo n.º 21
0
class FastSlowDistTests(TestCase):
    """Tests for the ``dist_app.fast_slow_dist`` app."""

    # collections shared by the tests; built once, at class definition time,
    # from data defined outside this class
    seqs1 = make_unaligned_seqs(_seqs1, moltype=DNA)
    seqs2 = make_unaligned_seqs(_seqs2, moltype=DNA)
    seqs3 = make_unaligned_seqs(_seqs3, moltype=DNA)
    seqs4 = make_unaligned_seqs(_seqs4, moltype=DNA)
    seqs5 = make_unaligned_seqs(_seqs5, moltype=PROTEIN)

    def test_init(self):
        """tests if fast_slow_dist can be initialised correctly"""

        fast_slow_dist = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        self.assertIsInstance(fast_slow_dist.fast_calc, HammingPair)
        self.assertIsNone(fast_slow_dist._sm)

        fast_slow_dist = dist_app.fast_slow_dist(distance="TN93")
        self.assertIsInstance(fast_slow_dist.fast_calc, TN93Pair)
        self.assertEqual(fast_slow_dist._sm.name, "TN93")
        fast_slow_dist = dist_app.fast_slow_dist(distance="GTR")
        self.assertEqual(fast_slow_dist._sm.name, "GTR")

        fast_slow_dist = dist_app.fast_slow_dist(slow_calc="TN93")
        self.assertEqual(fast_slow_dist._sm.name, "TN93")
        self.assertIsNone(fast_slow_dist.fast_calc)

        # the following argument combinations must be rejected
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(
                distance="TN93", fast_calc="TN93", slow_calc="TN93"
            )

        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(fast_calc="GTR")

        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(slow_calc="hamming")

    def test_compatible_parameters(self):
        """tests if the input parameters are compatible with fast_slow_dist initialisation"""
        # these combinations must construct without raising
        dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        dist_app.fast_slow_dist(fast_calc="TN93")
        dist_app.fast_slow_dist(slow_calc="GTR")

        # fails for paralinear or hamming if no moltype
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(slow_calc="hamming")
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(slow_calc="paralinear")

        # fails for hamming as slow_calc
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(slow_calc="hamming", moltype="dna")
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(fast_calc="GTR")

    def test_composable_apps(self):
        """tests two composable apps"""
        composable_apps = _get_all_composable_apps()
        fast_slow_dist = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        for app in composable_apps:
            # Compose two composable applications, there should not be exceptions.
            got = app + fast_slow_dist
            self.assertIsInstance(got, dist_app.fast_slow_dist)
            self.assertEqual(got._type, "distance")
            self.assertIs(got.input, app)
            self.assertIs(got.output, None)
            self.assertIsInstance(got._input_types, frozenset)
            self.assertIsInstance(got._output_types, frozenset)
            self.assertIs(got._in, app)
            self.assertIs(got._out, None)
            # undo the composition so fast_slow_dist can be reused next time
            app.disconnect()
            fast_slow_dist.disconnect()

    def test_est_dist_pair_slow(self):
        """tests the distance between seq pairs in aln"""

        def slow_dists(aln, model):
            """Return the distances for aln as a dict, with model as slow_calc."""
            return dist_app.fast_slow_dist(slow_calc=model)(aln).to_dict()

        models = ("GTR", "TN93")
        human_mouse = ("Human", "Mouse")
        mouse_human = ("Mouse", "Human")

        # seqs3, default reference: symmetric and non-negative
        aln3 = align.align_to_ref()(self.seqs3)
        for model in models:
            got = slow_dists(aln3, model)
            assert_allclose(got[human_mouse], got[mouse_human])
            self.assertGreaterEqual(got[mouse_human], 0)

        # seqs3, Human as reference
        aln3 = align.align_to_ref(ref_seq="Human")(self.seqs3)
        got = slow_dists(aln3, "GTR")
        assert_allclose(got[human_mouse], got[mouse_human])
        got = slow_dists(aln3, "TN93")
        assert_allclose(got[human_mouse], got[mouse_human])
        self.assertGreaterEqual(got[mouse_human], 0)

        # seqs3, Mouse as reference
        aln3 = align.align_to_ref(ref_seq="Mouse")(self.seqs3)
        for model in models:
            got = slow_dists(aln3, model)
            self.assertGreaterEqual(got[mouse_human], 0)

        # seqs4, with the default reference and each named reference
        human_opossum = ("Human", "Opossum")
        for ref_kwargs in ({}, {"ref_seq": "Human"}, {"ref_seq": "Opossum"}):
            aln4 = align.align_to_ref(**ref_kwargs)(self.seqs4)
            for model in models:
                got = slow_dists(aln4, model)
                self.assertGreaterEqual(got[human_opossum], 0)

        # now as a process
        proc = align.align_to_ref() + dist_app.fast_slow_dist(
            fast_calc="hamming", moltype="dna"
        )
        got = proc(self.seqs1)
        self.assertEqual(got[("Human", "Rhesus")], 1)

        # progressive alignment of the protein collection; only checks that
        # the call completes without raising
        treestring = "(Human:0.2,Bandicoot:0.2)"
        aligner = align.progressive_align(model="WG01", guide_tree=treestring)
        _ = aligner(self.seqs5)

    def test_composes_with_write_tabular(self):
        """correctly links to tabular"""
        with TemporaryDirectory(dir=".") as dirname:
            writer = io.write_tabular(dirname)
            dist_calc = dist_app.fast_slow_dist(distance="hamming", moltype="protein")
            # composing must not raise
            _ = dist_calc + writer

    def test_functions_as_composable(self):
        """works as a composable app"""
        from pathlib import Path

        loader = io.load_aligned(moltype="dna", format="paml")
        dist = dist_app.fast_slow_dist("hamming", moltype="dna")
        with TemporaryDirectory(dir=".") as dirname:
            dirname = Path(dirname)
            writer = io.write_tabular(dirname)
            proc = loader + dist + writer
            _ = proc("data/brca1_5.250.paml")
            # the writer is expected to produce a .tsv named after the input
            output = dirname / "brca1_5.250.tsv"
            self.assertTrue(output.exists())
Exemplo n.º 22
0
 def test_dotplot_single(self):
     """A dotplot of a single-sequence collection should not fail."""
     single = make_unaligned_seqs({"seq1": "ACGG"}, moltype="dna")
     plot = single.dotplot()
     # with no names given, both axes are expected to hold the same sequence
     self.assertEqual(plot.seq1, plot.seq2)
Exemplo n.º 23
0
    def test_trim_stop_codons(self):
        """Terminal stop codons are trimmed from collections and alignments."""
        coll = make_unaligned_seqs(
            data={"seq1": "ACGTAA", "seq2": "ACGACG", "seq3": "ACGCGT"}, moltype=DNA
        )
        trimmed = coll.trim_stop_codons().to_dict()
        self.assertEqual(trimmed["seq1"], "ACG")  # note: not 'acg---'
        self.assertEqual(trimmed["seq2"], "ACGACG")

        # aligned, every sequence ends in a stop
        aln = make_aligned_seqs(
            data={"seq1": "ACGTAA", "seq2": "ACGTGA", "seq3": "ACGTAA"}, moltype=DNA
        )
        expect = {"seq1": "ACG", "seq2": "ACG", "seq3": "ACG"}  # note: not 'acg---'
        self.assertEqual(aln.trim_stop_codons().to_dict(), expect)

        # aligned, one sequence has no stop so the others are gap padded
        aln = make_aligned_seqs(
            data={"seq1": "ACGAAA", "seq2": "ACGTGA", "seq3": "ACGTAA"}, moltype=DNA
        )
        expect = {"seq1": "ACGAAA", "seq2": "ACG---", "seq3": "ACG---"}
        self.assertEqual(aln.trim_stop_codons().to_dict(), expect)

        # for case where a sequence length is not divisible by 3
        ragged = make_unaligned_seqs(
            data={"seq1": "ACGTAA", "seq2": "ACGAC"}, moltype=DNA
        )
        # fail
        with self.assertRaises(ValueError):
            ragged.trim_stop_codons()
        # unless explicitly over-ridden with allow_partial
        relaxed = ragged.trim_stop_codons(allow_partial=True)
        self.assertEqual(relaxed.to_dict(), {"seq1": "ACG", "seq2": "ACGAC"})

        # should work for alignments too
        aln = make_aligned_seqs(
            data={"seq1": "ACGTAA---", "seq2": "ACGAC----", "seq3": "ACGCAATTT"},
            moltype=DNA,
        )
        # fail
        with self.assertRaises(ValueError):
            aln.trim_stop_codons()
        # unless explicitly over-ridden with allow_partial
        expect = {"seq1": "ACG------", "seq2": "ACGAC----", "seq3": "ACGCAATTT"}
        self.assertEqual(aln.trim_stop_codons(allow_partial=True).to_dict(), expect)

        # mixed lengths
        aln = make_aligned_seqs(
            data={"seq1": "ACGTAA---", "seq2": "ACGAC----", "seq3": "ACGCAATGA"},
            moltype=DNA,
        )
        expect = {"seq1": "ACG---", "seq2": "ACGAC-", "seq3": "ACGCAA"}
        self.assertEqual(aln.trim_stop_codons(allow_partial=True).to_dict(), expect)

        # longest seq not divisible by 3
        aln = make_aligned_seqs(
            data={"seq1": "ACGTAA--", "seq2": "ACGAC---", "seq3": "ACGC-ATG"},
            moltype=DNA,
        )
        expect = {"seq1": "ACG-----", "seq2": "ACGAC---", "seq3": "ACGC-ATG"}
        self.assertEqual(aln.trim_stop_codons(allow_partial=True).to_dict(), expect)
Exemplo n.º 24
0
class RefalignmentTests(TestCase):
    seqs = make_unaligned_seqs(_seqs, moltype=DNA)

    def test_align_to_ref(self):
        """align_to_ref with Human as reference gives the expected alignment"""
        expected = {
            "Bandicoot": "---NACTCATTAATGCTTGAAACCAGCAGTTTATTGTCCAAC",
            "FlyingFox": "GCCAGCTCTTTACAGCATGAGAACAG---TTTATTATACACT",
            "Human": "GCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT",
            "Rhesus": "GCCAGCTCATTACAGCATGAGAAC---AGTTTGTTACTCACT",
        }
        app = align_app.align_to_ref(ref_seq="Human")
        result = app(self.seqs)
        self.assertEqual(result.to_dict(), expected)

    def test_align_to_ref_generic_moltype(self):
        """each non-DNA moltype is recorded and gets a generic scoring dict"""
        labels = ("text", "rna", "protein", "protein_with_stop", "bytes", "ab")
        for label in labels:
            app = align_app.align_to_ref(moltype=label)
            self.assertEqual(app._moltype.label, label)
            # the "S" keyword argument held by the app should equal the
            # generic scoring dict built for this moltype
            self.assertEqual(
                app._kwargs["S"],
                make_generic_scoring_dict(10, get_moltype(label)),
            )

    def test_align_to_ref_result_has_moltype(self):
        """the alignment produced by the app reports the requested moltype"""
        app = align_app.align_to_ref(moltype="dna")
        result = app(self.seqs)
        self.assertEqual(result.moltype.label, "dna")

    def test_merged_gaps_both_inputs(self):
        """merging two non-empty gap sets gives the expected combined gaps

        Renamed from ``test_merged_gaps``: another method in this class is
        defined later under that same name, which replaced this definition in
        the class namespace so that it was never collected or run.
        """
        a = {2: 3, 4: 9}
        b = {2: 6, 8: 5}
        # omitting one just returns the other (the very same object)
        self.assertIs(_merged_gaps(a, {}), a)
        self.assertIs(_merged_gaps({}, b), b)
        got = _merged_gaps(a, b)
        # compare as a mapping so the check does not depend on whether the
        # helper returns a dict or a sequence of (position, length) pairs
        self.assertEqual(dict(got), {2: 6, 4: 9, 8: 5})

    def test_aln_to_ref_known(self):
        """degapping a known alignment and re-aligning to Ref recovers it"""
        known = make_aligned_seqs(
            {
                "Ref": "CAG---GAGAACAGAAACCCAT--TACTCACT",
                "Qu1": "CAG---GAGAACAG---CCCGTGTTACTCACT",
                "Qu2": "CAGCATGAGAACAGAAACCCGT--TA---ACT",
                "Qu3": "CAGCATGAGAACAGAAACCCGT----CTCACT",
                "Qu4": "CAGCATGAGAACAGAAACCCGTGTTACTCACT",
                "Qu5": "CAG---GAGAACAG---CCCAT--TACTCACT",
                "Qu6": "CAG---GA-AACAG---CCCAT--TACTCACT",
                "Qu7": "CAG---GA--ACAGA--CCCGT--TA---ACT",
            },
            moltype="dna",
        )
        expected = known.to_dict()
        app = align_app.align_to_ref(ref_seq="Ref")
        realigned = app(known.degap())
        self.assertEqual(realigned.to_dict(), expected)

    def test_gap_union(self):
        """_gap_union of differently gapped copies gives the full gap set"""
        ungapped = DNA.make_seq("AACCCGTT")
        expected = {0: 3, 2: 1, 5: 3, 6: 3}
        _ = make_aligned(expected, ungapped)  # result is not used below
        per_seq_gaps = [
            {5: 1, 6: 3},
            {2: 1, 5: 3},
            {2: 1, 5: 1, 6: 2},
            {0: 3},
        ]
        aligned = [make_aligned(gaps, ungapped) for gaps in per_seq_gaps]
        union = _gap_union(aligned)
        self.assertEqual(union, dict(expected))

        # a plain string among the inputs raises TypeError
        with self.assertRaises(TypeError):
            _gap_union(aligned + ["GGGGGGGG"])

        # an input carrying a different name raises ValueError
        with self.assertRaises(ValueError):
            _gap_union(aligned + [make_aligned({}, ungapped, name="blah")])

    def test_gap_difference(self):
        """_gap_difference of each sequence's gaps against the union"""
        ungapped = DNA.make_seq("AACCCGTT")
        per_seq_gaps = [
            {5: 1, 6: 3},
            {2: 1, 5: 3},
            {2: 1, 5: 1, 6: 2},
            {0: 3},
        ]
        aligned = [make_aligned(gaps, ungapped) for gaps in per_seq_gaps]
        union = _gap_union(aligned)
        # one (plain, overlap) expectation per aligned sequence, same order
        expected = [
            ({0: 3, 2: 1}, {5: 2}),
            ({0: 3, 6: 3}, {}),
            ({0: 3}, {5: 2, 6: 1}),
            ({2: 1, 5: 3, 6: 3}, {}),
        ]
        for member, (exp_plain, exp_overlap) in zip(aligned, expected):
            member_gaps = dict(member.map.get_gap_coordinates())
            got_plain, got_overlap = _gap_difference(member_gaps, union)
            self.assertEqual(got_plain, exp_plain)
            self.assertEqual(got_overlap, exp_overlap)

    def test_merged_gaps(self):
        """merging with an empty gap set gives back the non-empty one"""
        first = {0: 2}
        second = {2: 2}
        empty = {}
        self.assertEqual(_merged_gaps(first, empty), first)
        self.assertEqual(_merged_gaps(empty, second), second)

    def test_combined_refseq_gaps(self):
        """_combined_refseq_gaps gives the expected gaps for each subset"""
        union = {0: 3, 2: 1, 5: 3, 6: 3}
        # for subset gaps, their alignment position is the
        # offset + their position + their gap length
        cases = [
            ({5: 1, 6: 3}, {6: 2, 0: 3, 2: 1}),
            ({2: 1, 5: 3}, {0: 3, 10: 3}),
            ({2: 1, 5: 1, 6: 2}, {0: 3, 5 + 1 + 1: 2, 6 + 2 + 2: 1}),
            ({0: 3}, {2 + 3: 1, 5 + 3: 3, 6 + 3: 3}),
        ]
        for subset, expected in cases:
            self.assertEqual(_combined_refseq_gaps(subset, union), expected)

        # if union gaps equals ref gaps
        self.assertEqual(_combined_refseq_gaps({2: 2}, {2: 2}), {})

    def test_gaps_for_injection(self):
        """_gaps_for_injection gives expected gaps for several ref gap sets"""
        other_gaps = {2: 1, 6: 2}
        length = 50
        # for gaps before any otherseq gaps, alignment coord is otherseq coord
        self.assertEqual(
            _gaps_for_injection(other_gaps, {0: 3}, length),
            {0: 3, 2: 1, 6: 2},
        )
        # for gaps after otherseq gaps seq coord is align coord minus gap
        # length totals
        self.assertEqual(
            _gaps_for_injection(other_gaps, {4: 3}, length),
            {2: 1, 3: 3, 6: 2},
        )
        self.assertEqual(
            _gaps_for_injection(other_gaps, {11: 3}, length),
            {2: 1, 6: 2, 8: 3},
        )
        # gaps beyond sequence length added to end of sequence
        self.assertEqual(
            _gaps_for_injection({2: 1, 6: 2}, {11: 3, 8: 3}, 7),
            {2: 1, 6: 2, 7: 6},
        )

    def test_pairwise_to_multiple(self):
        """pairwise_to_multiple output retains each pairwise alignment"""
        data = {
            "Ref": "CAG---GAGAACAGAAACCCAT--TACTCACT",
            "Qu1": "CAG---GAGAACAG---CCCGTGTTACTCACT",
            "Qu2": "CAGCATGAGAACAGAAACCCGT--TA---ACT",
            "Qu3": "CAGCATGAGAACAGAAACCCGT----CTCACT",
            "Qu7": "CAG---GA--ACAGA--CCCGT--TA---ACT",
            "Qu4": "CAGCATGAGAACAGAAACCCGTGTTACTCACT",
            "Qu5": "CAG---GAGAACAG---CCCAT--TACTCACT",
            "Qu6": "CAG---GA-AACAG---CCCAT--TACTCACT",
        }
        aln = make_aligned_seqs(data, moltype="dna").omit_gap_pos()
        expected = aln.to_dict()
        for ref_name in ["Qu3"]:
            ref, pairs = make_pairwise(expected, ref_name)
            multiple = pairwise_to_multiple(
                pairs, ref_seq=ref, moltype=ref.moltype
            )
            self.assertEqual(len(multiple), len(aln))
            original = dict(pairs)
            # split the multiple alignment back into pairwise alignments
            _, pairs = make_pairwise(multiple.to_dict(), ref_name)
            recovered = dict(pairs)
            # should be able to recover the original pairwise alignments
            for name, pair in recovered.items():
                self.assertEqual(
                    pair.to_dict(), original[name].to_dict(), msg=ref_name
                )

            # a plain string as the reference sequence raises TypeError
            with self.assertRaises(TypeError):
                pairwise_to_multiple(pairs, "ACGG", DNA)

    def test_pairwise_to_multiple_2(self):
        """correctly handle alignments with gaps beyond end of query

        Two fixtures are used, each holding pairwise alignments of Opossum
        against Platypus and against Wombat. For each, the result of
        pairwise_to_multiple must not be a NotCompleted instance.
        """

        # error this test was written against:
        # cogent3.core.alignment.DataError: Not all sequences are the same length:
        # max is 425, min is 419
        def make_pwise(data, ref_name):
            # turn {name: {seqname: aligned_string}} into a list of
            # [name, Alignment] pairs, plus the reference sequence
            result = []
            for n, seqs in data.items():
                result.append([
                    n,
                    make_aligned_seqs(data=seqs,
                                      moltype="dna",
                                      array_align=False)
                ])
            # the reference sequence is taken from the first pairwise alignment
            ref_seq = result[0][1].get_seq(ref_name)
            return result, ref_seq

        # first fixture: long sequences
        pwise = {
            "Platypus": {
                "Opossum":
                "-----------------GTGC------GAT-------------------------------CCAAAAACCTGTGTC--ACCGT--------GCC----CAGAGCCTCC----CTCAGGCCGCTCGGGGAG---TG-------GCCCCCCG--GC-GGAGGGCAGGGATGGGGAGT-AGGGGTGGCAGTC----GGAACTGGAAGAGCTT-TACAAACC---------GA--------------------GGCT-AGAGGGTC-TGCTTAC-------TTTTTACCTTGG------------GTTTG-CCAGGAGGTAG----------AGGATGA-----------------CTAC--ATCAAG----AGC------------TGGG-------------",
                "Platypus":
                "CAGGATGACTACATCAAGAGCTGGGAAGATAACCAGCAAGGAGATGAAGCTCTGGACACTACCAAAGACCCCTGCCAGAACGTGAAGTGCAGCCGACACAAGGTCTGCATCGCTCAGGGCTACCAGAGAGCCATGTGTATCAGCCGCAAGAAGCTGGAGCACAGGATCAAGCAGCCAGCCCTGAAACTCCATGGAAACAGAGAGAGCTTCTGCAAGCCTTGTCACATGACCCAGCTGGCCTCTGTCTGCGGCTCGGACGGACACACTTACAGCTCCGTGTGCAAACTGGAGCAGCAGGCCTGTCTGACCAGCAAGCAGCTGACAGTCAAGTGTGAAGGCCAGTGCCCGTGCCCCACCGATCATGTTCCAGCCTCCACCGCTGATGGAAAACAAGAGACCT",
            },
            "Wombat": {
                "Opossum":
                "GTGCGATCCAAAAACCTGTGTCACCGTGCCCAGAGCCTCCCTCAGGCCGCTCGG-GGAGTGGCCCCCCGGCGGAGGGCAGGGATGGGGAGTAGGGGTGGCAGTCGGAACTGGAAGAGCTTTACAAACCGAGGCTAGAGGGTCTGCTTACTTTTTACCTTGG------GTTT--GC-CAGGA---GGT----AGAGGATGACTACATCAAGAGCTGGG---------------------------",
                "Wombat":
                "--------CA----------TCACCGC-CCCTGCACC---------CGGCTCGGCGGAGGGGGATTCTAA-GGGGGTCAAGGATGGCGAG-ACCCCTGGCAATTTCA--TGGAGGA------CGAGCAATGGCT-----GTC-GTCCATCTCCCAGTATAGCGGCAAGATCAAGCACTGGAACCGCTTCCGAGACGATGACTACATCAAGAGCTGGGAGGACAGTCAGCAAGGAGATGAAGCGC",
            },
        }
        pwise, ref_seq = make_pwise(pwise, "Opossum")
        aln = pairwise_to_multiple(pwise, ref_seq, ref_seq.moltype)
        self.assertNotIsInstance(aln, NotCompleted)

        # second fixture: short sequences
        pwise = {
            "Platypus": {
                "Opossum":
                "-----------------GTGC------GAT-------------------------------CCAAAAACCTGTGTC",
                "Platypus":
                "CAGGATGACTACATCAAGAGCTGGGAAGATAACCAGCAAGGAGATGAAGCTCTGGACACTACCAAAGACCCCTGCC",
            },
            "Wombat": {
                "Opossum": "GTGCGATCCAAAAACCTGTGTC",
                "Wombat": "--------CA----------TC",
            },
        }
        pwise, ref_seq = make_pwise(pwise, "Opossum")
        aln = pairwise_to_multiple(pwise, ref_seq, ref_seq.moltype)
        self.assertNotIsInstance(aln, NotCompleted)
Exemplo n.º 25
0
class RefalignmentTests(TestCase):
    seqs = make_unaligned_seqs(_seqs, moltype=DNA)
    treestring = "(Bandicoot:0.4,FlyingFox:0.05,(Rhesus:0.06," "Human:0.0):0.04);"

    def test_align_to_ref(self):
        """aligning the class sequences to Human gives the known result"""
        result = align_app.align_to_ref(ref_seq="Human")(self.seqs)
        self.assertEqual(
            result.to_dict(),
            {
                "Bandicoot": "---NACTCATTAATGCTTGAAACCAGCAGTTTATTGTCCAAC",
                "FlyingFox": "GCCAGCTCTTTACAGCATGAGAACAG---TTTATTATACACT",
                "Human": "GCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT",
                "Rhesus": "GCCAGCTCATTACAGCATGAGAAC---AGTTTGTTACTCACT",
            },
        )

    def test_align_to_ref_generic_moltype(self):
        """moltype label and generic scoring dict are set for each moltype"""
        for name in ["text", "rna", "protein", "protein_with_stop", "bytes", "ab"]:
            app = align_app.align_to_ref(moltype=name)
            self.assertEqual(app._moltype.label, name)
            # the app's "S" keyword argument is compared with a freshly
            # built generic scoring dict for the same moltype
            self.assertEqual(
                app._kwargs["S"],
                make_generic_scoring_dict(10, get_moltype(name)),
            )

    def test_align_to_ref_result_has_moltype(self):
        """result of align_to_ref(moltype="dna") is labelled dna"""
        aligned = align_app.align_to_ref(moltype="dna")(self.seqs)
        self.assertEqual(aligned.moltype.label, "dna")

    def test_progressive_align_protein_moltype(self):
        """protein models complete when no guide tree is supplied"""
        from cogent3 import load_aligned_seqs

        wanted = ["Rat", "Cow", "Human", "Mouse", "Whale"]
        seqs = load_aligned_seqs("data/nexus_aa.nxs", moltype="protein")
        seqs = seqs.degap().take_seqs(wanted)
        # both the named model and the generic "protein" alias are exercised
        for model in ("WG01", "protein"):
            app = align_app.progressive_align(model=model)
            self.assertNotIsInstance(app(seqs), NotCompleted)

    def test_progressive_align_nuc(self):
        """progressive alignment with a nucleotide model and distance"""
        expected = {
            "Rhesus": "GCCAGCTCATTACAGCATGAGAACAG---TTTGTTACTCACT",
            "Human": "GCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT",
            "Bandicoot": "NACTCATTAATGCTTGAAACCAGCAG---TTTATTGTCCAAC",
            "FlyingFox": "GCCAGCTCTTTACAGCATGAGAACAG---TTTATTATACACT",
        }
        app = align_app.progressive_align(model="TN93", distance="TN93")
        result = app(self.seqs)
        self.assertEqual(result.to_dict(), expected)

        # using default
        app = align_app.progressive_align(model="TN93", distance="TN93")
        result = app(self.seqs)
        self.assertEqual(len(result), 42)
        self.assertEqual(result.moltype, app._moltype)
        # todo the following is not robust across operating systems
        # so commenting out for now, but needs to be checked
        # expect = {'Human': 'GCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT',
        #           'Rhesus': 'GCCAGCTCATTACAGCATGAGAA---CAGTTTGTTACTCACT',
        #           'Bandicoot': 'NACTCATTAATGCTTGAAACCAG---CAGTTTATTGTCCAAC',
        #           'FlyingFox': 'GCCAGCTCTTTACAGCATGAGAA---CAGTTTATTATACACT'}
        # got = aln.to_dict()
        # self.assertEqual(got, expect)

    def test_progressive_fails(self):
        """should return a NotCompleted result when alignment cannot proceed"""
        # Bandicoot has an in-frame stop codon
        seqs = make_unaligned_seqs(
            data={
                "Human": "GCCTCA",
                "Rhesus": "GCCAGCTCA",
                "Bandicoot": "TGATCATTA"
            },
            moltype="dna",
        )
        aligner = align_app.progressive_align(model="codon")
        got = aligner(seqs)
        # assertTrue(type(got), NotCompleted) treated NotCompleted as the
        # failure message and passed for any result; check the type itself
        self.assertIsInstance(got, NotCompleted)

    def test_progress_with_guide_tree(self):
        """progressive align accepts a guide tree as a string or tree object"""
        tree = make_tree(treestring=self.treestring)
        for guide in (self.treestring, tree):
            app = align_app.progressive_align(model="nucleotide",
                                              guide_tree=guide)
            self.assertEqual(len(app(self.seqs)), 42)

        # even if it has underscores in name
        underscored = ("(Bandicoot:0.4,FlyingFox:0.05,(Rhesus_macaque:0.06,"
                       "Human:0.0):0.04);")
        app = align_app.progressive_align(model="nucleotide",
                                          guide_tree=underscored)
        renamed = self.seqs.to_dict()
        renamed["Rhesus macaque"] = renamed.pop("Rhesus")
        result = app(make_unaligned_seqs(renamed))
        self.assertEqual(len(result), 42)

        # guide tree with no lengths raises value error
        with self.assertRaises(ValueError):
            _ = align_app.progressive_align(
                model="nucleotide",
                guide_tree="(Bandicoot,FlyingFox,(Rhesus_macaque,Human));",
            )

    def test_progressive_align_codon(self):
        """codon models give an alignment of the expected length"""
        # a named codon model first, then the generic "codon" alias
        for model in ("GY94", "codon"):
            app = align_app.progressive_align(model=model)
            result = app(self.seqs)
            self.assertEqual(len(result), 42)

    def test_pickle_progressive_align(self):
        """output of progressive_align survives a pickle round trip"""
        import pickle

        app = align_app.progressive_align(model="codon")
        result = app(self.seqs)
        restored = pickle.loads(pickle.dumps(result))
        self.assertTrue(restored)

    def test_with_genetic_code(self):
        """the gc argument controls which codons the model treats as sense"""
        # the 'TGA' codon is a sense codon in vertebrate mitochondrial
        mito = align_app.progressive_align(model="GY94", gc="2")
        self.assertIn("TGA", mito._model.get_motifs())
        # but a stop codon in the standard nuclear
        standard = align_app.progressive_align(model="codon")
        self.assertNotIn("TGA", standard._model.get_motifs())
        # combining a nucleotide model with a genetic code raises TypeError
        with self.assertRaises(TypeError):
            align_app.progressive_align(model="nucleotide", gc="2")

    def test_progressive_align_protein(self):
        """protein models with a guide tree give the expected length"""
        translated = self.seqs.get_translation()
        for model in ("WG01", "protein"):
            app = align_app.progressive_align(model=model,
                                              guide_tree=self.treestring)
            result = app(translated)
            self.assertEqual(len(result), 14)
Exemplo n.º 26
0
def get_one2one_orthologs(
    compara, ref_genes, outpath, not_strict, force_overwrite, test
):
    """writes one-to-one orthologs of protein coding genes to outpath

    For every gene in ref_genes the related "ortholog_one2one" genes are
    queried from compara. The CDS of each member (stop codon trimmed) is
    written to ``<outpath>/<gene>.fa.gz``, or printed when test is true.
    A gene is skipped when the query does not return exactly one group, or,
    unless not_strict is true, when the group's species differ from
    compara.species. An existing output file counts as written and is left
    alone unless force_overwrite is true. If anything was counted as written,
    a metadata table is saved to ``<outpath>/metadata.tsv``.
    """
    expected_species = Counter(compara.species)
    strict = not not_strict
    num_written = 0
    metadata_rows = []
    with click.progressbar(ref_genes, label="Finding 1to1 orthologs") as ids:
        for gene in ids:
            outfile_name = os.path.join(outpath, "%s.fa.gz" % gene)
            if os.path.exists(outfile_name) and not force_overwrite:
                # already produced by an earlier run
                num_written += 1
                continue

            related = list(
                compara.get_related_genes(
                    stableid=gene, relationship="ortholog_one2one"
                )
            )
            if len(related) != 1:
                continue

            group = related[0]
            if strict and (
                group is None
                or Counter(group.get_species_set()) != expected_species
            ):
                # skipping, not all species had a 1to1 ortholog for this gene
                continue

            named_cds = []
            for member in group.members:
                metadata_rows.append(
                    [gene, member.stableid, member.location, member.description]
                )
                common_name = Species.get_common_name(member.genome.species)
                cds = member.canonical_transcript.cds.trim_stop_codon(
                    allow_partial=True
                )
                cds.name = common_name
                named_cds.append([common_name, cds])

            seqs = make_unaligned_seqs(data=named_cds)
            if test:
                print()
                print(gene)
                print(seqs.to_fasta())
            else:
                with gzip.open(outfile_name, "wt") as outfile:
                    outfile.write(seqs.to_fasta() + "\n")
                LOGGER.output_file(outfile_name)

            num_written += 1

    if test:
        msg = "Would have written %d files to %s" % (num_written, outpath)
    else:
        msg = "Wrote %d files to %s" % (num_written, outpath)
    click.echo(msg)

    if num_written > 0:
        metadata = make_table(
            header=["refid", "stableid", "location", "description"],
            rows=metadata_rows,
        )
        metadata.write(os.path.join(outpath, "metadata.tsv"))
Exemplo n.º 27
0
class FastSlowDistTests(TestCase):
    seqs1 = make_unaligned_seqs(_seqs1, moltype=DNA)
    seqs2 = make_unaligned_seqs(_seqs2, moltype=DNA)
    seqs3 = make_unaligned_seqs(_seqs3, moltype=DNA)
    seqs4 = make_unaligned_seqs(_seqs4, moltype=DNA)
    seqs5 = make_unaligned_seqs(_seqs5, moltype=PROTEIN)

    def test_init(self):
        """fast_slow_dist sets up its calculators and rejects bad combinations"""
        app = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        self.assertIsInstance(app.fast_calc, HammingPair)
        self.assertIsNone(app._sm)

        app = dist_app.fast_slow_dist(distance="TN93")
        self.assertIsInstance(app.fast_calc, TN93Pair)
        self.assertEqual(app._sm.name, "TN93")
        app = dist_app.fast_slow_dist(distance="GTR")
        self.assertEqual(app._sm.name, "GTR")

        app = dist_app.fast_slow_dist(slow_calc="TN93")
        self.assertEqual(app._sm.name, "TN93")
        self.assertIsNone(app.fast_calc)

        # each of these argument combinations must raise ValueError
        invalid = [
            dict(distance="TN93", fast_calc="TN93", slow_calc="TN93"),
            dict(fast_calc="GTR"),
            dict(slow_calc="hamming"),
        ]
        for kwargs in invalid:
            with self.assertRaises(ValueError):
                dist_app.fast_slow_dist(**kwargs)

    def test_compatible_parameters(self):
        """fast_slow_dist constructs for valid arguments, raises otherwise"""
        # these must construct without raising
        valid = [
            dict(fast_calc="hamming", moltype="dna"),
            dict(fast_calc="TN93"),
            dict(slow_calc="GTR"),
            dict(fast_calc="TN93"),
        ]
        for kwargs in valid:
            dist_app.fast_slow_dist(**kwargs)

        invalid = [
            # fails for paralinear or hamming if no moltype
            dict(slow_calc="hamming"),
            dict(slow_calc="paralinear"),
            # fails for hamming as slow_calc
            dict(slow_calc="hamming", moltype="dna"),
            dict(fast_calc="GTR"),
        ]
        for kwargs in invalid:
            with self.assertRaises(ValueError):
                dist_app.fast_slow_dist(**kwargs)

    def test_composable_apps(self):
        """every composable app can be composed with fast_slow_dist"""
        upstream_apps = _get_all_composable_apps()
        dist = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        for upstream in upstream_apps:
            # Compose two composable applications, there should not be exceptions.
            composed = upstream + dist
            self.assertIsInstance(composed, dist_app.fast_slow_dist)
            self.assertEqual(composed._type, "distance")
            self.assertIs(composed.input, upstream)
            self.assertIs(composed.output, None)
            self.assertIsInstance(composed._input_types, frozenset)
            self.assertIsInstance(composed._output_types, frozenset)
            self.assertIs(composed._in, upstream)
            self.assertIs(composed._out, None)
            # undo the composition before the next iteration
            upstream.disconnect()
            dist.disconnect()

    def test_est_dist_pair_slow(self):
        """slow_calc distances between seq pairs of reference alignments"""

        def slow_dists(aln, model):
            # distances from the given slow model, as a {(name, name): d} dict
            return dist_app.fast_slow_dist(slow_calc=model)(aln).to_dict()

        models = ("GTR", "TN93")
        human_mouse = ("Human", "Mouse")
        mouse_human = ("Mouse", "Human")
        human_opossum = ("Human", "Opossum")

        # seqs3, default reference: symmetric and non-negative
        aln = align.align_to_ref()(self.seqs3)
        for model in models:
            got = slow_dists(aln, model)
            assert_allclose(got[human_mouse], got[mouse_human])
            self.assertTrue(0 <= got[mouse_human])

        # seqs3, Human as reference
        aln = align.align_to_ref(ref_seq="Human")(self.seqs3)
        got = slow_dists(aln, "GTR")
        assert_allclose(got[human_mouse], got[mouse_human])
        got = slow_dists(aln, "TN93")
        assert_allclose(got[human_mouse], got[mouse_human])
        self.assertTrue(0 <= got[mouse_human])

        # seqs3, Mouse as reference
        aln = align.align_to_ref(ref_seq="Mouse")(self.seqs3)
        for model in models:
            got = slow_dists(aln, model)
            self.assertTrue(0 <= got[mouse_human])

        # seqs4 with the default reference, then Human, then Opossum
        for ref_kwargs in ({}, {"ref_seq": "Human"}, {"ref_seq": "Opossum"}):
            aln = align.align_to_ref(**ref_kwargs)(self.seqs4)
            for model in models:
                got = slow_dists(aln, model)
                self.assertTrue(0 <= got[human_opossum])

        # protein sequences through progressive_align; result is not checked
        treestring = "(Human:0.2,Bandicoot:0.2)"
        aligner = align.progressive_align(model="WG01", guide_tree=treestring)
        _ = aligner(self.seqs5)
Exemplo n.º 28
0
    def test_take_n_seqs(self):
        """take_n_seqs selects the requested number of sequences"""
        full = make_unaligned_seqs(
            data={
                "a": "ACGT",
                "b": "ACG-",
                "c": "ACGN",
                "d": "ACGG",
                "e": "ACGG",
                "k": "ACGG",
                "f": "RAAA",
                "g": "YAAA",
                "h": "GGGG",
            })
        subset = full.take_seqs(["a", "c", "e", "g", "h"])

        # by order, fixed
        app = sample.take_n_seqs(3, fixed_choice=True)
        self.assertEqual(len(app(full).names), 3)
        # this should return NotCompleted because it applies the names present in 1 to the next one
        self.assertIsInstance(app(subset), NotCompleted)

        # this should fail because too few seqs
        app = sample.take_n_seqs(30)
        self.assertIsInstance(app(full), NotCompleted)

        # by order, not fixed
        app = sample.take_n_seqs(3, fixed_choice=False)
        from_full = app(full)
        from_subset = app(subset)
        self.assertNotEqual(set(from_full.names), set(from_subset.names))

        # random choice, fixed
        app = sample.take_n_seqs(3, random=True, fixed_choice=True)
        self.assertEqual(app._fixed_choice, True)

        from_subset = app(subset)
        from_full = app(full)
        self.assertEqual(from_subset.names, from_full.names)

        # random choice, not fixed
        app = sample.take_n_seqs(2, random=True, fixed_choice=False)
        self.assertEqual(app._fixed_choice, False)
        # testing this is hard, we simply expect the labels to differ on subsequent call
        # the probability of drawing a specific pair of names on one call is 1/(9 choose 2) = 1/36
        # at n = 11, the probability all the pairs will be identical is ~=0
        reference = app(full)
        # any() stops at the first differing draw, like a loop with break
        different = any(reference.names != app(full).names for _ in range(11))

        self.assertTrue(different,
                        msg="failed to generate different random sample")

        # try setting the seed
        app = sample.take_n_seqs(2, random=True, seed=123)
        self.assertNotIsInstance(app(full), NotCompleted)