def test_composable_apps(self):
    """fast_slow_dist composed with quick_tree yields a working tree app"""
    aln_path = os.path.join(data_path, "brca1_5.paml")
    brca1 = load_aligned_seqs(aln_path, moltype=DNA)
    calc = dist.fast_slow_dist(fast_calc="hamming", moltype="dna")
    tree_builder = tree_app.quick_tree(drop_invalid=False)
    composed = calc + tree_builder

    expected_repr = (
        "fast_slow_dist(type='distance', distance=None, moltype='dna',\n"
        "fast_calc='hamming', slow_calc=None) + quick_tree(type='tree',\n"
        "drop_invalid=False)"
    )
    self.assertEqual(str(composed), expected_repr)

    # the composed object is the downstream app, wired to the upstream one
    self.assertIsInstance(composed, tree_app.quick_tree)
    self.assertEqual(composed._type, "tree")
    self.assertIsInstance(composed.input, dist.fast_slow_dist)
    self.assertIsNone(composed.output)
    self.assertIsInstance(composed._input_types, frozenset)
    self.assertIsInstance(composed._output_types, frozenset)
    self.assertIsInstance(composed._in, dist.fast_slow_dist)
    self.assertIsNone(composed._out)

    brca1_tree = composed(brca1)
    self.assertIsInstance(brca1_tree, PhyloNode)
    self.assertIsNotNone(brca1_tree.children)
    self.assertEqual(set(brca1_tree.get_tip_names()), set(brca1.names))

    # tests when distances contain None
    divergent = {
        "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
    }
    divergent_aln = make_aligned_seqs(data=divergent, moltype=DNA)
    failed = composed(divergent_aln)
    self.assertIsInstance(failed, NotCompleted)
def _get_all_composables(tmp_dir_name):
    """Build a list holding one instance of each composable app.

    Parameters
    ----------
    tmp_dir_name
        path handed to the writer apps (write_db, write_json, write_seqs),
        each of which is constructed with create=True

    Returns
    -------
    list of app instances
    """
    hky85 = evo.model("HKY85")
    general_nuc = evo.model("GN")
    bootstrap_hyp = evo.hypothesis(hky85, general_nuc)
    bootstrap_reps = 100

    # the list deliberately holds fresh hypothesis / model instances rather
    # than the ones already consumed by the bootstrap app
    return [
        align.align_to_ref(),
        align.progressive_align(model="GY94"),
        dist.fast_slow_dist(moltype="dna", fast_calc="hamming"),
        evo.ancestral_states(),
        evo.bootstrap(hyp=bootstrap_hyp, num_reps=bootstrap_reps),
        evo.hypothesis(hky85, general_nuc),
        evo.model("GN"),
        evo.tabulate_stats(),
        sample.fixed_length(100),
        sample.min_length(100),
        io.write_db(tmp_dir_name, create=True),
        io.write_json(tmp_dir_name, create=True),
        io.write_seqs(tmp_dir_name, create=True),
        sample.omit_bad_seqs(),
        sample.omit_degenerates(),
        sample.omit_duplicated(),
        sample.take_codon_positions(1),
        sample.take_named_seqs(),
        sample.take_n_seqs(2),
        sample.trim_stop_codons(gc=1),
        translate.select_translatable(),
        tree.quick_tree(),
        tree.scale_branches(),
        tree.uniformize_tree(),
    ]
def test_composes_with_write_tabular(self):
    """fast_slow_dist can be composed with a write_tabular app"""
    with TemporaryDirectory(dir=".") as out_dir:
        tabular_writer = io.write_tabular(out_dir)
        calc = dist_app.fast_slow_dist(distance="hamming", moltype="protein")
        # only the composition itself is under test, it must not raise
        _ = calc + tabular_writer
def test_init(self):
    """fast_slow_dist sets fast_calc / _sm from its arguments, rejects bad combos"""
    app = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
    self.assertIsInstance(app.fast_calc, HammingPair)
    self.assertIsNone(app._sm)

    app = dist_app.fast_slow_dist(distance="TN93")
    self.assertIsInstance(app.fast_calc, TN93Pair)
    self.assertEqual(app._sm.name, "TN93")

    app = dist_app.fast_slow_dist(distance="GTR")
    self.assertEqual(app._sm.name, "GTR")

    app = dist_app.fast_slow_dist(slow_calc="TN93")
    self.assertEqual(app._sm.name, "TN93")
    self.assertIsNone(app.fast_calc)

    # argument combinations that must be rejected at construction
    rejected = (
        dict(distance="TN93", fast_calc="TN93", slow_calc="TN93"),
        dict(fast_calc="GTR"),
        dict(slow_calc="hamming"),
    )
    for bad_kwargs in rejected:
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(**bad_kwargs)
def test_quick_tree(self):
    """quick_tree returns a tree whose tips are the alignment's sequence names"""
    aln = load_aligned_seqs(os.path.join(data_path, "brca1_5.paml"), moltype=DNA)
    hamming = dist.fast_slow_dist(fast_calc="hamming", moltype="dna")
    pairwise = hamming(aln)
    builder = tree_app.quick_tree()
    # call the method directly rather than going through the app's __call__
    nj_tree = builder.quick_tree(pairwise)
    self.assertEqual(set(nj_tree.get_tip_names()), set(aln.names))
def test_functions_as_composable(self):
    """loader + fast_slow_dist + write_tabular runs and produces a tsv file"""
    from pathlib import Path

    load = io.load_aligned(moltype="dna", format="paml")
    calc = dist_app.fast_slow_dist("hamming", moltype="dna")
    with TemporaryDirectory(dir=".") as tmp:
        out_dir = Path(tmp)
        tabular_writer = io.write_tabular(out_dir)
        pipeline = load + calc + tabular_writer
        _ = pipeline("data/brca1_5.250.paml")
        # the writer is expected to name its output after the input file
        written = out_dir / "brca1_5.250.tsv"
        self.assertTrue(written.exists())
def test_composable_apps(self):
    """each app from _get_all_composable_apps composes upstream of fast_slow_dist"""
    upstream_apps = _get_all_composable_apps()
    calc = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
    for upstream in upstream_apps:
        # Compose two composable applications, there should not be exceptions.
        composed = upstream + calc
        self.assertIsInstance(composed, dist_app.fast_slow_dist)
        self.assertEqual(composed._type, "distance")
        self.assertIs(composed.input, upstream)
        self.assertIsNone(composed.output)
        self.assertIsInstance(composed._input_types, frozenset)
        self.assertIsInstance(composed._output_types, frozenset)
        self.assertIs(composed._in, upstream)
        self.assertIsNone(composed._out)
        # undo the link so calc can be reused on the next iteration
        upstream.disconnect()
        calc.disconnect()
def test_compatible_parameters(self):
    """fast_slow_dist accepts compatible arguments and raises ValueError otherwise"""
    accepted = (
        dict(fast_calc="hamming", moltype="dna"),
        dict(fast_calc="TN93"),
        dict(slow_calc="GTR"),
        dict(fast_calc="TN93"),
    )
    for ok_kwargs in accepted:
        dist_app.fast_slow_dist(**ok_kwargs)

    rejected = (
        # fails for paralinear or hamming if no moltype
        dict(slow_calc="hamming"),
        dict(slow_calc="paralinear"),
        # fails for hamming as slow_calc
        dict(slow_calc="hamming", moltype="dna"),
        dict(fast_calc="GTR"),
    )
    for bad_kwargs in rejected:
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(**bad_kwargs)
def __init__(
    self,
    model,
    gc=None,
    param_vals=None,
    guide_tree=None,
    unique_guides=False,
    indel_length=1e-1,
    indel_rate=1e-10,
    distance="percent",
):
    """
    Parameters
    ----------
    model
        substitution model instance or name. If 'codon'
        (uses MG94HKY), 'nucleotide' (uses HKY85), 'protein'
        (uses JTT92). The 'codon' and 'nucleotide' choices also
        provide preset values for param_vals.
    gc : int or string
        the genetic code for a codon alignment, defaults to the standard
        genetic code
    param_vals : dict
        param name, values for parameters in model. Used only when model
        is not 'codon' or 'nucleotide'; for those two choices the preset
        values are used and this argument is ignored.
    guide_tree
        newick string, tree instance (must have branch lengths), or a
        callable that will build a tree from unaligned collection. If not
        provided, estimated ONCE via constructing a crude alignment. In the
        case of callable, or not provided, the computed guide tree is stored
        in the returned alignment.info['guide_tree'].
    unique_guides : bool
        whether each alignment requires a new guide tree
    indel_rate : float
        probability of gap insertion
    indel_length : float
        probability of gap extension
    distance : string
        the distance measure for building a guide tree. Default is 'percent',
        the proportion of differences. This is applicable for any moltype,
        and sequences with very high percent identity. For more diverged
        sequences we recommend 'paralinear'.

    Raises
    ------
    ValueError
        if a guide tree is supplied whose first child has no branch length
    """
    super(progressive_align, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )

    # preset parameter values win for the 'codon' / 'nucleotide' shortcuts,
    # any other model falls back to the caller supplied param_vals
    self._param_vals = {
        "codon": dict(omega=0.4, kappa=3),
        "nucleotide": dict(kappa=3),
    }.get(model, param_vals)
    # translate a shortcut name into a concrete model name, otherwise pass
    # the value (name or instance) through unchanged
    sm = {"codon": "MG94HKY", "nucleotide": "HKY85", "protein": "JTT92"}.get(
        model, model
    )
    # NOTE(review): invoked before guide_tree is rebound below; presumably
    # it records the constructor arguments -- confirm before reordering
    self._formatted_params()
    kwargs = {} if gc is None else dict(gc=gc)
    sm = get_model(sm, **kwargs)
    moltype = sm.alphabet.moltype
    self._model = sm
    self._scalar = sm.word_length
    self._indel_length = indel_length
    self._indel_rate = indel_rate
    self._moltype = moltype
    self._unique_guides = unique_guides
    self._distance = distance
    if callable(guide_tree):
        self._make_tree = guide_tree
        guide_tree = None  # callback takes precedence
    else:
        # default tree builder: align to a reference, compute pairwise
        # distances, then build a tree from them
        al_to_ref = align_to_ref(moltype=self._moltype)
        dist_calc = dist.fast_slow_dist(
            distance=self._distance, moltype=self._moltype
        )
        est_tree = quick_tree()
        self._make_tree = al_to_ref + dist_calc + est_tree

    if guide_tree is not None:
        # isinstance also accepts str subclasses, unlike an exact type match
        if isinstance(guide_tree, str):
            guide_tree = make_tree(treestring=guide_tree, underscore_unmunge=True)
        if guide_tree.children[0].length is None:
            raise ValueError("Guide tree must have branch lengths")
        # make sure no zero lengths
        guide_tree = scale_branches()(guide_tree)

    self._guide_tree = guide_tree
    self._kwargs = dict(
        indel_length=self._indel_length,
        indel_rate=self._indel_rate,
        tree=self._guide_tree,
        param_vals=self._param_vals,
        show_progress=False,
    )

    self.func = self.multiple_align
def test_est_dist_pair_slow(self):
    """slow_calc distances on align_to_ref output are symmetric and non-negative"""
    human_mouse = ("Human", "Mouse")
    mouse_human = ("Mouse", "Human")
    human_opossum = ("Human", "Opossum")
    models = ("GTR", "TN93")

    # each row: align_to_ref kwargs, models whose distances are checked for
    # symmetry, models whose distance is checked for being non-negative
    seqs3_cases = (
        ({}, models, models),
        ({"ref_seq": "Human"}, models, ("TN93",)),
        ({"ref_seq": "Mouse"}, (), models),
    )
    for ref_kwargs, symmetric, non_negative in seqs3_cases:
        aligned = align.align_to_ref(**ref_kwargs)(self.seqs3)
        for name in models:
            dists = dist_app.fast_slow_dist(slow_calc=name)(aligned).to_dict()
            if name in symmetric:
                assert_allclose(dists[human_mouse], dists[mouse_human])
            if name in non_negative:
                self.assertTrue(0 <= dists[mouse_human])

    for ref_kwargs in ({}, {"ref_seq": "Human"}, {"ref_seq": "Opossum"}):
        aligned = align.align_to_ref(**ref_kwargs)(self.seqs4)
        for name in models:
            dists = dist_app.fast_slow_dist(slow_calc=name)(aligned).to_dict()
            self.assertTrue(0 <= dists[human_opossum])

    # progressive alignment with a newick guide tree must run without error
    newick = "(Human:0.2,Bandicoot:0.2)"
    guided = align.progressive_align(model="WG01", guide_tree=newick)
    _ = guided(self.seqs5)
def test_est_dist_pair_slow(self):
    """slow_calc distances are symmetric / non-negative, and the app composes"""
    human_mouse = ("Human", "Mouse")
    mouse_human = ("Mouse", "Human")
    human_opossum = ("Human", "Opossum")
    models = ("GTR", "TN93")

    def slow_dists(aln, model_name):
        # pairwise distances under model_name, as a dict keyed by name pairs
        calc = dist_app.fast_slow_dist(slow_calc=model_name)
        return calc(aln).to_dict()

    # seqs3, default reference: symmetry and sign for both models
    aln = align.align_to_ref()(self.seqs3)
    for model_name in models:
        got = slow_dists(aln, model_name)
        assert_allclose(got[human_mouse], got[mouse_human])
        self.assertTrue(got[mouse_human] >= 0)

    # seqs3, Human as reference: sign is only checked for TN93
    aln = align.align_to_ref(ref_seq="Human")(self.seqs3)
    got = slow_dists(aln, "GTR")
    assert_allclose(got[human_mouse], got[mouse_human])
    got = slow_dists(aln, "TN93")
    assert_allclose(got[human_mouse], got[mouse_human])
    self.assertTrue(got[mouse_human] >= 0)

    # seqs3, Mouse as reference: sign only
    aln = align.align_to_ref(ref_seq="Mouse")(self.seqs3)
    for model_name in models:
        got = slow_dists(aln, model_name)
        self.assertTrue(got[mouse_human] >= 0)

    # seqs4 under each choice of reference: sign only
    for ref_kwargs in ({}, {"ref_seq": "Human"}, {"ref_seq": "Opossum"}):
        aln = align.align_to_ref(**ref_kwargs)(self.seqs4)
        for model_name in models:
            got = slow_dists(aln, model_name)
            self.assertTrue(got[human_opossum] >= 0)

    # now as a process
    pipeline = align.align_to_ref() + dist_app.fast_slow_dist(
        fast_calc="hamming", moltype="dna"
    )
    result = pipeline(self.seqs1)
    self.assertEqual(result[("Human", "Rhesus")], 1)

    # progressive alignment with a newick guide tree must run without error
    newick = "(Human:0.2,Bandicoot:0.2)"
    guided = align.progressive_align(model="WG01", guide_tree=newick)
    _ = guided(self.seqs5)