class TestTree(unittest.TestCase):
    """Exercise the tree API against one small tree.

    Subclasses run the same tests on another tree by overriding ``setUp``.
    """

    def setUp(self):
        """Create the tree under test and the newick strings used as expectations."""
        self.name = 'small tree - '
        self.otu_names = sorted(['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'])
        self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
        self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
        self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'
        self.tree = LoadTree(treestring=self.newick)

    def test_sorttree(self):
        """testing (well, exercising at least) treesort"""
        sorted_tree = self.tree.sorted()
        # Without an expected newick this test only checks that sorted() runs.
        if not hasattr(self, 'newick_sorted'):
            return
        self.assertEqual(self.newick_sorted,
                         sorted_tree.getNewick(with_distances=0))

    def test_getsubtree(self):
        """testing getting a subtree"""
        pruned = self.tree.unrooted().getSubTree(self.otu_names)
        expected = LoadTree(treestring=self.newick_reduced).unrooted()
        # both trees must have the same number of children at the root ...
        self.assertEqual(len(pruned.Children), len(expected.Children))
        # ... and the same string form
        self.assertEqual(str(pruned), str(expected))

    def test_ascii(self):
        """asciiArt should run for the test tree and for every option combination."""
        self.tree.asciiArt()
        # unlabeled internal node
        tr = DndParser("(B:0.2,(C:0.3,D:0.4):0.6)F;")
        option_sets = ((True, False), (True, True), (False, False))
        for show_internal, compact in option_sets:
            tr.asciiArt(show_internal=show_internal, compact=compact)
def get_tree(filename):
    """Load a tree file and return its name and string form.

    Every edge is flagged as having a loaded name and has '.' replaced
    by '_' in that name.  The returned dict has the keys 'treename' (the
    file's base name up to its last '.') and 'treestring' (``str(tree)``).
    """
    tree = LoadTree(filename)
    base = os.path.basename(filename)
    treename = base.rsplit('.', 1)[0]
    for node in tree.getEdgeVector():
        node.NameLoaded = True
        node.Name = node.Name.replace('.', '_')
    return {'treename': treename, 'treestring': str(tree)}
class BigTreeSingleTests(TestTree):
    """using the big-tree for single-tree tests

    Only ``setUp`` differs from ``TestTree``; no ``newick_sorted`` is
    defined here.
    """

    def setUp(self):
        """Create the big tree and the reduced newick expected after sub-setting."""
        self.name = 'big tree - '
        self.otu_names = ['Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep',
                          'SpermWhal', 'Cat', 'Gorilla', 'Orangutan',
                          'bandicoot', 'Hedgehog', 'Sloth', 'HairyArma',
                          'Manatee', 'GoldenMol', 'Pangolin']
        self.otu_names.sort()
        self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
        self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'
        self.tree = LoadTree(treestring=self.newick)

    def test_getEdgeNames(self):
        """testing (well, exercising at least), getedgenames"""
        # Fell over on small tree because "stem descended from root
        # joiner was a tip"
        a, b = self.otu_names[:2]
        # only checks that the call completes; the result is not inspected
        self.tree.getEdgeNames(a, b, True, False)

    def test_getTipNames(self):
        """testing (well, exercising at least), getTipNames"""
        tips = self.tree.getTipNames()
        self.assertEqual(len(tips), 55)
def test_getsetParamValue(self):
    """test getting, setting of param values"""
    t = LoadTree(treestring='((((a:.2,b:.3)ab:.1,c:.3)abc:.4),d:.6)')
    # The third positional argument of assertEqual is the failure message,
    # so the trailing integers were never used as a tolerance.  They are
    # passed here as the number of decimal places to compare.
    self.assertAlmostEqual(t.getParamValue('length', 'ab'), 0.1, 2)
    t.setParamValue('zz', 'ab', 4.321)
    node = t.getNodeMatchingName('ab')
    self.assertAlmostEqual(4.321, node.params['zz'], 4)
def test_making_from_list(self):
    """A tree built from a list of tip names reports exactly those names."""
    tipnames_with_spaces = sorted(['a_b', 'a b', "T'lk"])
    t = LoadTree(tip_names=tipnames_with_spaces)
    result = sorted(t.getTipNames())
    # assertEqual (not a bare assert) so the check survives ``python -O``
    # and reports both lists on failure.
    self.assertEqual(result, tipnames_with_spaces)
def test_balanced(self):
    """balancing an unrooted tree"""
    unbalanced_newick = '((a,b),((c1,(c2,(c3,(c4,(c5,(c6,c7)))))),(d,e)),f)'
    balanced_newick = '(c1,(c2,(c3,(c4,(c5,(c6,c7))))),((d,e),((a,b),f)))'
    t = LoadTree(treestring=unbalanced_newick)
    b = LoadTree(treestring=balanced_newick)
    # compare via the string form of both trees
    self.assertEqual(str(t.balanced()), str(b))
def test_making_from_list(self):
    """A tree built from a list of tip names reports exactly those names."""
    tipnames_with_spaces = sorted(['a_b', 'a b', "T'lk"])
    t = LoadTree(tip_names=tipnames_with_spaces)
    result = sorted(t.getTipNames())
    # assertEqual (not a bare assert) so the check survives ``python -O``
    # and reports both lists on failure.
    self.assertEqual(result, tipnames_with_spaces)
def MatchNodes(self):
    """Fill ``self.FastMLToOriginalMatchedNodes_D``.

    Maps each node name of the FastML output tree to the key of the
    original node that has the same set of tips beneath it.  A FastML
    node with no tips beneath it is mapped to its own name.
    """
    # performs the correction on the output string if necessary
    self.correctForFastMLNameChanges()

    # One "termini string" per internal node: all termini under the node,
    # sorted and joined with '-'.  It is the lookup key for matching.
    termini_to_node_key = {}
    for node_key in self.UpperKey_L:
        termini_string = '-'.join(sorted(self.Nodes_D[node_key]['terminal']))
        termini_to_node_key[termini_string] = node_key

    # prepares a cogent tree object for the FastML output
    temp_file = getInputTempFile(self.FastMLOutputTreeString)
    fastml_tree = LoadTree(temp_file.name)

    matched = {}
    self.FastMLToOriginalMatchedNodes_D = matched
    for fastml_name in fastml_tree.getNodeNames():
        fastml_node = fastml_tree.getNodeMatchingName(fastml_name)
        tips_below = [tip.Name for tip in fastml_node.iterTips()]
        if not tips_below:
            # no termini under it: the node itself is a terminus and keeps
            # the same name in FastML and Cogent
            matched[fastml_name] = fastml_name
            continue
        # same termini string as the equivalent original node
        matched[fastml_name] = termini_to_node_key['-'.join(sorted(tips_below))]
def _test_tree(self, method, treestring):
    """Check that ``method`` rebuilds a tree with the same pairwise distances.

    ``method`` is called with the distances of the tree parsed from
    ``treestring``.  Every distance of the original tree must be almost
    equal to the matching distance of the tree it returns.
    """
    original = LoadTree(treestring=treestring)
    expected = original.getDistances()
    observed = method(expected).getDistances()
    for pair in expected:
        self.assertAlmostEqual(expected[pair], observed[pair])
class BigTreeSingleTests(TestTree):
    """using the big-tree for single-tree tests

    Only ``setUp`` differs from ``TestTree``; no ``newick_sorted`` is
    defined here.
    """

    def setUp(self):
        """Create the big tree and the reduced newick expected after sub-setting."""
        self.name = 'big tree - '
        self.otu_names = [
            'Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep', 'SpermWhal',
            'Cat', 'Gorilla', 'Orangutan', 'bandicoot', 'Hedgehog', 'Sloth',
            'HairyArma', 'Manatee', 'GoldenMol', 'Pangolin'
        ]
        self.otu_names.sort()
        self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
        self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'
        self.tree = LoadTree(treestring=self.newick)

    def test_getEdgeNames(self):
        """testing (well, exercising at least), getedgenames"""
        # Fell over on small tree because "stem descended from root
        # joiner was a tip"
        a, b = self.otu_names[:2]
        # only checks that the call completes; the result is not inspected
        self.tree.getEdgeNames(a, b, True, False)

    def test_getTipNames(self):
        """testing (well, exercising at least), getTipNames"""
        tips = self.tree.getTipNames()
        self.assertEqual(len(tips), 55)
def ml4(aln, true_tree):
    """Report whether the maximum-likelihood 4-taxon tree is the true tree.

    A JC69 likelihood function is optimised on ``aln`` for each of the
    three topologies over taxa a, b, c and d.  The topology with the
    highest log-likelihood is compared to ``true_tree`` with
    ``sameTopology`` and that result is returned.
    """
    # the three candidate topologies, without explicit branch lengths
    candidate_newicks = ('((a,b),(c,d))', '((a,c),(b,d))', '((a,d),(b,c))')
    all_trees = [LoadTree(treestring=newick) for newick in candidate_newicks]

    # optimise lf for all trees
    sm = JC69()
    log_likelihoods = []
    for candidate in all_trees:
        lf = sm.makeLikelihoodFunction(candidate)
        lf.setAlignment(aln)
        lf.optimise(local=True)
        log_likelihoods.append(lf.getLogLikelihood())

    # get the ml tree (first one on ties) and compare to true tree
    best_score = max(log_likelihoods)
    ml_tree = all_trees[log_likelihoods.index(best_score)]
    return ml_tree.sameTopology(true_tree)
def setUp(self):
    """Create the small tree and the newick strings used as expectations."""
    self.name = 'small tree - '
    self.otu_names = sorted(['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'])
    self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
    self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
    self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'
    self.tree = LoadTree(treestring=self.newick)
def test_sameShape(self):
    """test topology assessment"""
    t1 = LoadTree(treestring="(((s1,s5),s3),s2,s4);")
    t2 = LoadTree(treestring="((s1,s5),(s2,s4),s3);")
    t3 = LoadTree(treestring="((s1,s4),(s2,s5),s3);")
    # TestCase assertions instead of bare asserts, so the checks are not
    # stripped under ``python -O``.  The tree pair stays the failure message.
    self.assertTrue(t1.sameTopology(t2), (t1, t2))
    self.assertFalse(t1.sameTopology(t3), (t1, t3))
    self.assertFalse(t2.sameTopology(t3), (t2, t3))
def inflate_likelihood_function(data, model=None):
    """Rebuild a likelihood function from its flattened dictionary form.

    data: mapping read for the keys 'with_rate', 'mprobs' and 'params',
        for either 'tree' or 'tip_names', optionally 'dependencies',
        and for 'name' when ``model`` is not given.
    model: optional zero-argument callable returning the substitution
        model; when given, ``data['name']`` is not consulted.

    Returns the likelihood function with motif probabilities, parameter
    initial values and parameter dependencies applied.

    Raises NotImplementedError when ``data['name']`` is not a supported
    model, and AssertionError when a rate distribution is requested for
    a model that does not support it.
    """
    supported_subs_models = ('GeneralStationary', 'General',
                             'DiscreteSubstitutionModel', 'General_with_gaps')
    if model is not None:
        model = model()
    elif data['name'] == 'GTR':
        if data['with_rate']:
            model = GTR(optimise_motif_probs=True, with_rate=True,
                        distribution='gamma')
        else:
            model = GTR(optimise_motif_probs=True)
    elif data['name'] == 'General_with_gaps':
        assert not data['with_rate'], data['name'] + ' plus Gamma not supported'
        model = General(DNA.Alphabet, optimise_motif_probs=True,
                        model_gaps=True, recode_gaps=False,
                        name='General_with_gaps')
    elif data['name'] in supported_subs_models:
        assert not data['with_rate'], data['name'] + ' plus Gamma not supported'
        # The name is whitelisted above, so look the class up by name in
        # the module namespace instead of running eval() on it.
        model_class = globals()[data['name']]
        model = model_class(DNA.Alphabet, optimise_motif_probs=True,
                            model_gaps=False, recode_gaps=True,
                            name=data['name'])
    else:
        st = 'inflate_likelihood_function: unsupported model ' + data['name']
        raise NotImplementedError(st)

    if 'tree' in data:
        tree = LoadTree(treestring=data['tree'].encode('utf-8'))
    else:
        tip_names = [tip_name.encode('utf-8') for tip_name in data['tip_names']]
        tree = LoadTree(tip_names=tip_names)

    if data['with_rate']:
        lf = model.makeLikelihoodFunction(tree, bins=4)
    else:
        lf = model.makeLikelihoodFunction(tree)

    # NOTE(review): the dependencies section is assumed to sit inside the
    # updatesPostponed() block -- confirm against the original layout.
    with lf.updatesPostponed():
        lf.setMotifProbs(data['mprobs'])
        params = data['params']
        for param in params:
            # which scopes (edge and/or bin) this parameter varies over
            dimensions = lf.defn_for[param].valid_dimensions
            if len(dimensions) == 0:
                lf.setParamRule(param, init=params[param])
            elif 'edge' in dimensions and 'bin' in dimensions:
                for edge, bins in params[param].items():
                    for bin_name, init in bins.items():
                        lf.setParamRule(param, edge=edge, bin=bin_name,
                                        init=init)
            elif 'edge' in dimensions:
                for edge, init in params[param].items():
                    lf.setParamRule(param, edge=edge, init=init)
            elif 'bin' in dimensions:
                for bin_name, init in params[param].items():
                    lf.setParamRule(param, bin=bin_name, init=init)

        if 'dependencies' in data:
            for param, scopes in data['dependencies'].items():
                for scope in scopes:
                    lf.setParamRule(param, is_independent=False, **scope)
    return lf
def test_params_merge(self):
    """getSubTree should merge edge params and handle missing tip names."""
    t = LoadTree(treestring='((((a,b)ab,c)abc),d)')
    edge_values = (('a', 1, 20), ('b', 3, 2.0), ('ab', 4, 5.0))
    for label, length, beta in edge_values:
        t.getNodeMatchingName(label).params = {'length': length, 'beta': beta}
    t = t.getSubTree(['b', 'c', 'd'])
    # expected for 'b' after pruning 'a': summed length and the beta value
    # given by the expression below
    merged = {'length': 7, 'beta': float(2*3+4*5)/(3+4)}
    self.assertEqual(t.getNodeMatchingName('b').params, merged)
    # an unknown tip name is an error unless ignore_missing is set
    self.assertRaises(ValueError, t.getSubTree, ['b', 'c', 'xxx'])
    lenient = t.getSubTree(['b', 'c', 'xxx'], ignore_missing=True)
    self.assertEqual(str(lenient), '(b:7,c)root;')
def setUp(self):
    """Create the substitution model, then load the brca1_5 alignment and tree."""
    self.submodel = Nucleotide(do_scaling=True,
                               model_gaps=False,
                               equal_motif_probs=True,
                               predicates={'beta': 'transition'})
    alignment_path = os.path.join(data_path, 'brca1_5.paml')
    self.data = LoadSeqs(filename=alignment_path,
                         moltype=self.submodel.MolType)
    tree_path = os.path.join(data_path, 'brca1_5.tree')
    self.tree = LoadTree(filename=tree_path)
def setUp(self):
    """Create the big tree and the reduced newick expected after sub-setting."""
    self.name = 'big tree - '
    self.otu_names = sorted([
        'Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep', 'SpermWhal', 'Cat',
        'Gorilla', 'Orangutan', 'bandicoot', 'Hedgehog', 'Sloth',
        'HairyArma', 'Manatee', 'GoldenMol', 'Pangolin',
    ])
    self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
    self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'
    self.tree = LoadTree(treestring=self.newick)
def test_limited_wls(self):
    """testing (well, exercising at least), wls with constrained start"""
    init = LoadTree(treestring='((a,c),b,d)')
    init2 = LoadTree(treestring='((a,d),b,c)')
    init3 = LoadTree(treestring='((a,d),b,e)')

    # a single start tree, then a list of start trees
    for start in (init, [init, init2]):
        reconstructed = wls(self.dists, start=start)
        self.assertEqual(len(reconstructed.getTipNames()), 5)

    # this pair of start trees is expected to be rejected
    self.assertRaises(Exception, wls, self.dists, start=[init, init3])
    # if start tree has all seq names, should raise an error
    complete = LoadTree(treestring='((a,c),b,(d,e))')
    self.assertRaises(Exception, wls, self.dists, start=[complete])
def test_getEdgeNamesUseOutgroup(self):
    """getEdgeNames with an outgroup gives the same clade for both trees."""
    t1 = LoadTree(treestring="((A,B)ab,(F,(C,D)cd)cdf,E)root;")
    # a, e, ogroup f
    t2 = LoadTree(treestring="((E,(A,B)ab)abe,F,(C,D)cd)root;")
    expected = ['A', 'B', 'E', 'ab']
    for tree in (t1, t2):
        edges = tree.getEdgeNames('A', 'E', getstem=False, getclade=True,
                                  outgroup_name="F")
        # order of the returned names is not part of the check
        edges.sort()
        self.assertEqual(expected, edges)
def __init__(self, TreePath , NeedsToBeCogentModded):
    """Load the tree at ``TreePath`` and derive everything FastML needs.

    TreePath: path of the tree file to load.
    NeedsToBeCogentModded: when true, the file is first passed through
        ``fixUpFileForCogent`` and the resulting file is loaded instead.

    Never raises for ordinary errors.  On failure ``self.Parsed`` is set
    to False and the exception is kept in ``self.ParseError``.
    """
    self.Parsed = True #used to determine if the full analysis can be conducted
    self.ParseError = None #the exception that stopped parsing, if any
    try:
        self.TreePath = TreePath
        self.NeedsToBeCogentModded = NeedsToBeCogentModded
        self.CogentTree = None

        #if the internal nodes need to be renamed, then it is done according to the "FixUpFileForCogent" method
        if self.NeedsToBeCogentModded:
            cogentFixUp = fixUpFileForCogent(self.TreePath)
            self.CogentTreeFile = cogentFixUp[0]
            self.CogentInputTreeString = cogentFixUp[1]
            self.CogentTree = LoadTree(self.CogentTreeFile.name)
        else:
            self.CogentTree = LoadTree(self.TreePath)

        #prepares an input string for FastML
        self.FastMLInputTreeString = self.FixUpFileForFastML(self.CogentTree)

        #executes method to fully parse tree, then sets all returned variables as class variables
        CogentNodesLeavesBranches = completeNodesLeavesBranches(self.CogentTree)
        self.NodeKey_L = CogentNodesLeavesBranches['NodeKey_L']
        self.LeafKey_L = CogentNodesLeavesBranches['LeafKey_L']
        self.UpperKey_L = CogentNodesLeavesBranches['UpperKey_L']
        self.TopKey = CogentNodesLeavesBranches['TopKey']
        self.BranchKey_L = CogentNodesLeavesBranches['BranchKey_L']
        self.Nodes_D = CogentNodesLeavesBranches['Nodes_D']

        #executes quick run of FastML to get FastML's naming convention of internal nodes
        self.FastMLOutputTreeString = executeFastML(self.getTempFASTAFile() , self.FastMLInputTreeString , True)

        #prepares the FastMLToOriginalMatchedNodes_D
        self.MatchNodes()
    except Exception as e:
        #deliberately best-effort: callers check self.Parsed. The exception
        #is kept so the reason for the failure is not lost.
        self.Parsed = False
        self.ParseError = e
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) start_time = datetime.now() t = LoadTree(opts.input_tree) translation_dict = {} for i, tip in enumerate(t.iterTips()): translation_dict[tip.Name] = i single_rate = False #Generate commands telling BayesTraits which nodes to reconstruct bayestraits_commands = make_bayestraits_script(t, translation_dict, comments=False, single_rate=single_rate) #TODO: make this dynamic #Temporarily assuming there is a nexus file available nexus_fp = opts.input_tree.rsplit(".", 1)[0] + ".nexus" command_fp = "./bayestraits_commands.txt" path_to_bayestraits = "../" outfile = "./bayestrait_reconstruction.trait_table" command_file = open(command_fp, "w+") command_file.writelines(bayestraits_commands) command_file.close() command_file = open(command_fp, "U") bayestraits = BayesTraits() bayestraits_result = bayestraits(data=(nexus_fp, opts.input_trait_data, command_fp)) #print "StdOut:",result["StdOut"].read() print "StdErr:", bayestraits_result["StdErr"].read() print "Return code:", bayestraits_result["ExitStatus"] results = parse_reconstruction_output( bayestraits_result['StdOut'].readlines()) #print "Reconstructions:",results #Reconstruction results f = open(outfile, "w+") f.writelines(results) f.close() end_time = datetime.now() print "Start time:", start_time print "End time:", end_time print "Time to reconstruct:", end_time - start_time bayestraits_result.cleanUp()
def ml(doc, model='NG', gc=None, omega_indep=True, model_gaps=False, indel_indep=True, **kw):
    """Fit ``model`` to the alignment and tree carried in ``doc``.

    doc: mapping with the keys 'aln' and 'tree', both text.
    gc: passed to ``get_genetic_code``.
    Extra keyword arguments are accepted and ignored.

    Returns a dict with the flattened likelihood function ('lf'), the
    'time' value reported by ``_fit``, and the settings used.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)

    # NOTE(review): the trimming and the filtering are both assumed to
    # belong to this branch -- confirm against the original layout.
    if model != 'NG':
        # Trim terminal stop codons
        aln = aln.withoutTerminalStopCodons(code)

        def filt(x):
            # keep only motifs made of DNA characters (plus '-' when gaps
            # are modelled)
            if model_gaps:
                return set(''.join(x)) <= set(DNA).union({'-'})
            return set(''.join(x)) <= set(DNA)

        aln = aln.filtered(filt, motif_length=3)

    flat_lf, time = _fit(aln, tree, model, code, omega_indep, model_gaps,
                         indel_indep)
    summary = {
        'lf': flat_lf,
        'time': time,
        'model': model,
        'gc': code.Name,
        'omega_indep': omega_indep,
        'model_gaps': model_gaps,
        'indel_indep': indel_indep,
    }
    return summary
def test_trees(self):
    """Draw one tree with each dendrogram class, then once with edge colouring."""
    treestring = "((A:.1,B:.22)ab:.3,((C:.4,D:.5)cd:.55,E:.6)cde:.7,F:.2)"
    # lengthen every tip name: 'A' becomes 'A' followed by ten 'a's, etc.
    for tip in 'ABCDEF':
        long_name = tip + tip.lower() * 10
        treestring = treestring.replace(tip, long_name)
    t = LoadTree(treestring=treestring)

    # StraightDendrogram and ContemporaneousStraightDendrogram are not
    # exercised here.
    dendrogram_classes = (
        UnrootedDendrogram,
        SquareDendrogram,
        ContemporaneousDendrogram,
        ShelvedDendrogram,
    )
    for klass in dendrogram_classes:
        dendro = klass(t)
        connecting = dendro.getConnectingNode('Ccccccccccc', 'Eeeeeeeeeee')
        connecting.setCollapsed(color="green", label="C, D and E")
        do(klass.__name__, dendro, shade_param="length",
           show_params=["length"])

    def callback(edge):
        # red for edges whose name starts with "A", blue otherwise
        if edge.Name.startswith("A"):
            return "red"
        return "blue"

    do("Highlight edge A", UnrootedDendrogram(t), edge_color_callback=callback)
def rooted(doc, rooted_edges=None, gc=None, **kw):
    """Fit GNC to ``doc`` with parameters shared across ``rooted_edges``.

    An MG94GTR function is optimised first and its flattened form is
    used to populate the GNC function before that one is optimised.

    doc: mapping with the keys 'aln' and 'tree', both text.
    Extra keyword arguments are accepted and ignored.

    Returns a dict with the flattened GNC likelihood function ('lf'),
    the genetic code name ('gc') and 'rooted_edges'.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)
    aln = aln.withoutTerminalStopCodons(code)

    def only_dna(x):
        return set(''.join(x)) <= set(DNA)

    aln = aln.filtered(only_dna, motif_length=3)

    # bounds and scope shared by the rate parameter rules
    sp_kw = {'upper': 20., 'lower': 0.05, 'is_independent': False}

    # stage 1: MG94GTR
    sm = MG94GTR(optimise_motif_probs=True)
    init_lf = sm.makeLikelihoodFunction(tree)
    init_lf.setAlignment(aln)
    # NOTE(review): the 'length' rule is assumed to sit inside the
    # updatesPostponed() block -- confirm against the original layout.
    with init_lf.updatesPostponed():
        for param in init_lf.getParamNames():
            if '/' in param:
                init_lf.setParamRule(param, **sp_kw)
        init_lf.setParamRule('length', edges=rooted_edges,
                             is_independent=False)
    init_lf.optimise(local=True, show_progress=False, limit_action='raise')
    init_lf = nest.deflate_likelihood_function(init_lf, save_jsd=False)

    # stage 2: GNC, populated from the stage-1 result
    sm = GNC(optimise_motif_probs=True)
    lf = sm.makeLikelihoodFunction(tree)
    lf.setAlignment(aln)
    _populate_parameters(lf, init_lf, **sp_kw)
    for param in lf.getParamNames():
        if param == 'omega' or '>' in param:
            lf.setParamRule(param, edges=rooted_edges, is_independent=False)
    lf.optimise(local=True, show_progress=False, limit_action='raise')

    flat_lf = nest.deflate_likelihood_function(lf)
    flat_lf['hard_up'] = _is_hard_up(lf)
    return {'lf': flat_lf, 'gc': code.Name, 'rooted_edges': rooted_edges}
def MakeCachedObjects(model, tree, seq_length, opt_args):
    """Return an accessor for lazily fitted likelihood functions.

    An alignment of ``seq_length`` is simulated from ``model`` on
    ``tree``.  The returned ``call(self, obj_name)`` fits the requested
    object the first time it is asked for and then serves it from the
    cache.  Known names: 'aln', 'general', 'gen_stat', 'discrete' and
    'constructed_gen'.
    """
    lf = model.makeLikelihoodFunction(tree)
    lf.setMotifProbs({'A': 0.1, 'C': 0.2, 'G': 0.3, 'T': 0.4})
    aln = lf.simulateAlignment(seq_length)
    results = {'aln': aln}
    discrete_tree = LoadTree(tip_names=aln.Names)

    def fit_general(results=results):
        if 'general' not in results:
            gen_lf = _make_likelihood(General(DNA.Alphabet), tree, results)
            gen_lf.optimise(**opt_args)
            results['general'] = gen_lf

    def fit_gen_stat(results=results):
        if 'gen_stat' not in results:
            gen_stat_lf = _make_likelihood(GeneralStationary(DNA.Alphabet),
                                           tree, results)
            gen_stat_lf.optimise(**opt_args)
            results['gen_stat'] = gen_stat_lf

    def fit_constructed_gen(results=results):
        if 'constructed_gen' not in results:
            # eleven directed substitutions get their own predicate
            directed_changes = ('AC', 'AG', 'AT', 'CA', 'CG', 'CT',
                                'GC', 'GT', 'TA', 'TC', 'TG')
            preds = [MotifChange(a, b, forward_only=True)
                     for a, b in directed_changes]
            nuc_lf = _make_likelihood(Nucleotide(predicates=preds), tree,
                                      results)
            nuc_lf.optimise(**opt_args)
            results['constructed_gen'] = nuc_lf

    def fit_discrete(results=results):
        if 'discrete' not in results:
            dis_lf = _make_likelihood(DiscreteSubstitutionModel(DNA.Alphabet),
                                      discrete_tree, results,
                                      is_discrete=True)
            dis_lf.optimise(**opt_args)
            results['discrete'] = dis_lf

    funcs = {
        'general': fit_general,
        'gen_stat': fit_gen_stat,
        'discrete': fit_discrete,
        'constructed_gen': fit_constructed_gen,
    }

    def call(self, obj_name):
        # fit on first request only
        if obj_name not in results:
            funcs[obj_name]()
        return results[obj_name]

    return call
def setUp(self):
    """Create the five-sequence alignment, its tree and a nucleotide model."""
    #length all edges 1 except c=2. b&d transitions all other transversions
    sequences = {
        'a': 'tata',
        'b': 'tgtc',
        'c': 'gcga',
        'd': 'gaac',
        'e': 'gagc',
    }
    self.al = LoadSeqs(data=sequences)
    self.tree = LoadTree(treestring='((a,b),(c,d),e);')
    self.model = cogent.evolve.substitution_model.Nucleotide(
        do_scaling=True, equal_motif_probs=True, model_gaps=True)
def test_gapped_CNFGTR():
    """Fit gapped CNFGTR and check properties of the inflated result."""
    aln_path = os.path.join(get_data_dir(), 'ENSG00000100393.fasta.gz')
    aln = get_aln(aln_path, codon_position=-1, filter_gaps=False)
    tree = LoadTree(treestring='(Human,Mouse,Opossum);')
    doc = {'aln': str(aln), 'tree': str(tree)}
    cnfgtr_result = gapped.ml(doc, model='CNFGTR', model_gaps=True,
                              omega_indep=False, indel_indep=False)

    def model():
        return gapped.CNFGTR(optimise_motif_probs=True, model_gaps=True)

    cnfgtr = gapped.inflate_likelihood_function(cnfgtr_result['lf'], model)

    # root motif probs must be unchanged by the Human edge's P matrix
    pi = cnfgtr.getMotifProbsByNode()['root'].asarray()
    P = cnfgtr.getPsubForEdge('Human')
    assert_almost_equal(pi.dot(P), pi)

    # the value is not used; the call is kept as in the original test
    cnfgtr.getParamValue('omega')

    pi = cnfgtr.getMotifProbs()
    Q = cnfgtr.getRateMatrixForEdge('Human')

    def conditional(codon):
        # probability of `codon` among the four codons starting with 'CC'
        return pi[codon] / sum(pi['CC' + c] for c in 'ACGT')

    ref_cell = Q['CCT']['CCG'] / conditional('CCG')
    assert_almost_equal(Q['CCA']['CCC'] / conditional('CCC') / ref_cell,
                        cnfgtr.getParamValue('A/C'))
    assert_almost_equal(Q['---']['CCC'] / pi['CCC'] / ref_cell,
                        cnfgtr.getParamValue('indel'))

    # Q divided by pi must be symmetric
    R = Q.asarray() / pi.asarray()
    assert_almost_equal(R.T, R)
def __init__(self, TreePath, NeedsToBeCogentModded):
    """Load the tree at ``TreePath`` and derive everything FastML needs.

    TreePath: path of the tree file to load.
    NeedsToBeCogentModded: when true, the file is first passed through
        ``fixUpFileForCogent`` and the resulting file is loaded instead.

    Never raises for ordinary errors.  On failure ``self.Parsed`` is set
    to False and the exception is kept in ``self.ParseError``.
    """
    self.Parsed = True  #used to determine if the full analysis can be conducted
    self.ParseError = None  #the exception that stopped parsing, if any
    try:
        self.TreePath = TreePath
        self.NeedsToBeCogentModded = NeedsToBeCogentModded
        self.CogentTree = None

        #if the internal nodes need to be renamed, then it is done according to the "FixUpFileForCogent" method
        if self.NeedsToBeCogentModded:
            cogentFixUp = fixUpFileForCogent(self.TreePath)
            self.CogentTreeFile = cogentFixUp[0]
            self.CogentInputTreeString = cogentFixUp[1]
            self.CogentTree = LoadTree(self.CogentTreeFile.name)
        else:
            self.CogentTree = LoadTree(self.TreePath)

        #prepares an input string for FastML
        self.FastMLInputTreeString = self.FixUpFileForFastML(
            self.CogentTree)

        #executes method to fully parse tree, then sets all returned variables as class variables
        CogentNodesLeavesBranches = completeNodesLeavesBranches(
            self.CogentTree)
        self.NodeKey_L = CogentNodesLeavesBranches['NodeKey_L']
        self.LeafKey_L = CogentNodesLeavesBranches['LeafKey_L']
        self.UpperKey_L = CogentNodesLeavesBranches['UpperKey_L']
        self.TopKey = CogentNodesLeavesBranches['TopKey']
        self.BranchKey_L = CogentNodesLeavesBranches['BranchKey_L']
        self.Nodes_D = CogentNodesLeavesBranches['Nodes_D']

        #executes quick run of FastML to get FastML's naming convention of internal nodes
        self.FastMLOutputTreeString = executeFastML(
            self.getTempFASTAFile(), self.FastMLInputTreeString, True)

        #prepares the FastMLToOriginalMatchedNodes_D
        self.MatchNodes()
    except Exception as e:
        #deliberately best-effort: callers check self.Parsed. The exception
        #is kept so the reason for the failure is not lost.
        self.Parsed = False
        self.ParseError = e
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) start_time = datetime.now() t = LoadTree(opts.input_tree) translation_dict = {} for i,tip in enumerate(t.iterTips()): translation_dict[tip.Name] = i single_rate = False #Generate commands telling BayesTraits which nodes to reconstruct bayestraits_commands = make_bayestraits_script(t,translation_dict,comments=False,single_rate=single_rate) #TODO: make this dynamic #Temporarily assuming there is a nexus file available nexus_fp = opts.input_tree.rsplit(".",1)[0] +".nexus" command_fp = "./bayestraits_commands.txt" path_to_bayestraits = "../" outfile = "./bayestrait_reconstruction.trait_table" command_file = open(command_fp,"w+") command_file.writelines(bayestraits_commands) command_file.close() command_file = open(command_fp,"U") bayestraits=BayesTraits() bayestraits_result = bayestraits(data=(nexus_fp,opts.input_trait_data,command_fp)) #print "StdOut:",result["StdOut"].read() print "StdErr:",bayestraits_result["StdErr"].read() print "Return code:",bayestraits_result["ExitStatus"] results = parse_reconstruction_output(bayestraits_result['StdOut'].readlines()) #print "Reconstructions:",results #Reconstruction results f = open(outfile,"w+") f.writelines(results) f.close() end_time = datetime.now() print "Start time:", start_time print "End time:",end_time print "Time to reconstruct:", end_time - start_time bayestraits_result.cleanUp()
def build_tree(tree_string, bl1, bl2, r):
    """Build a PyCogent tree object from a newick template and branch lengths.

    ``tree_string`` is %-formatted with six values, in this order:
    bl1, bl2, r/2, bl1, bl2, r/2.
    """
    # we use r/2.0 because PyCogent defaults to adding a branch of
    # length 1 if you don't explicitly specify it
    # having 2 branches of r/2.0 keeps our internal branch at r
    half_r = r / 2.0
    branch_lengths = (bl1, bl2, half_r) * 2
    return LoadTree(treestring=tree_string % branch_lengths)
def setUp(self):
    """Create the small tree and the newick strings used as expectations."""
    self.name = 'small tree - '
    self.otu_names = sorted(['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'])
    self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
    self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
    self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'
    self.tree = LoadTree(treestring=self.newick)
def test_setConstantLengths(self):
    """Lengths made constant should keep the values given in the tree string."""
    t = LoadTree(treestring='((a:1,b:2):3,(c:4,d:5):6,e:7);')
    lf = self.model.makeLikelihoodFunction(t)
    lf.setParamRule('length', is_const=True)
    lf.setAlignment(self.al)
    for edge, expected_length in (('b', 2), ('d', 5)):
        self.assertEqual(lf.getParamValue('length', edge), expected_length)
def main():
    """Furcate the tree named on the command line and write a tree out.

    Reads the mapping file, splits the comma-separated categories into
    fields, calls ``furcate_tree`` and writes to the output path.
    """
    args = parser.parse_args()
    map_dict = parse_mapping_file_to_dict(args.map_fp)[0]
    fields = args.categories.split(',')
    tree = LoadTree(args.tree_fp)
    furcated_tree = furcate_tree(tree, map_dict, fields, length=args.length)
    # NOTE(review): the value returned by furcate_tree is never used; the
    # originally loaded `tree` is what gets written.  That is only right if
    # furcate_tree modifies `tree` in place -- confirm.
    tree.writeToFile(args.output_fp)
def test_getsubtree(self):
    """testing getting a subtree"""
    pruned = self.tree.unrooted().getSubTree(self.otu_names)
    expected = LoadTree(treestring=self.newick_reduced).unrooted()
    # both trees must have the same number of children at the root ...
    self.assertEqual(len(pruned.Children), len(expected.Children))
    # ... and the same string form
    self.assertEqual(str(pruned), str(expected))
def different_tree_simulate_alignment(tree_information_list, all_trees):
    """Simulate an alignment whose parts follow different trees.

    tree_information_list: one six-item entry per part.  Items 0-4 are
        passed straight to ``simulate_alignment_treefixed``.  Item 5 is
        the index into ``all_trees`` of the tree that part follows
        ((a,b),(c,d) --> 0, (a,c),(b,d) --> 1, (a,d),(b,c) --> 2).
        Example: [[p1,q1,r1,s1,t1,tree1],[p1,q1,r1,s1,t1,tree2]]
    all_trees: tree string templates.

    Returns ``(aln, true_tree)``.  ``aln`` is the parts joined in order.
    ``true_tree`` is built from the first entry whose item 3 is largest.
    """
    # one alignment per entry
    parts = [
        simulate_alignment_treefixed(all_trees, info[0], info[1], info[2],
                                     info[3], info[4], info[5])
        for info in tree_information_list
    ]

    # put all the alignments together
    aln = parts[0]
    for part in parts[1:]:
        aln = aln + part

    # the true tree is the one followed by the longest alignment; item 3
    # is the value compared, and the first entry wins ties
    longest = tree_information_list[0]
    for info in tree_information_list[1:]:
        if info[3] > longest[3]:
            longest = info

    # build the true tree: item 5 selects the template, items 0-2 fill it
    # (item 2 is halved and used twice)
    tree_string = all_trees[longest[5]]
    true_tree_bl = tree_string % (longest[0], longest[1], longest[2] / 2.0,
                                  longest[0], longest[1], longest[2] / 2.0)
    true_tree = LoadTree(treestring=true_tree_bl)
    return (aln, true_tree)
def test_simulateAlignment2(self):
    "Simulate alignment with dinucleotide model"
    sequences = {'a': 'ggaatt', 'c': 'cctaat'}
    al = LoadSeqs(data=sequences)
    t = LoadTree(treestring="(a,c);")
    sm = substitution_model.Dinucleotide(mprob_model='tuple')
    lf = sm.makeParamController(t)
    lf.setAlignment(al)
    # the simulated alignment should have length 6
    self.assertEqual(len(lf.simulateAlignment()), 6)
def test_distribution():
    """distribution should return empirical distribution for DNA sequence"""
    mouse_only = ('Mouse', )
    al = get_aln('General', 1031).takeSeqs(mouse_only)
    observed = jsd.distribution(al.getSeq('Mouse'))

    # reference: motif probs a GTR likelihood function takes from the data
    lf = GTR().makeLikelihoodFunction(LoadTree(tip_names=mouse_only))
    lf.setMotifProbsFromData(al)
    expected = lf.getMotifProbs()
    assert_array_almost_equal(array(expected), array(observed))
def use_root_seq(root_sequence):
    """Simulate with a fixed root sequence and check it is reported as the root.

    Uses ``self`` from the enclosing scope for the assertion.
    """
    sequences = {'a': 'ggaatt', 'c': 'cctaat'}
    al = LoadSeqs(data=sequences)
    t = LoadTree(treestring="(a,c);")
    sm = substitution_model.Dinucleotide(mprob_model='tuple')
    lf = sm.makeParamController(t)
    lf.setAlignment(al)
    # keep internal nodes so the root sequence is part of the result
    simalign = lf.simulateAlignment(exclude_internal=False,
                                    root_sequence=root_sequence)
    simulated_root = simalign.NamedSeqs['root']
    self.assertEqual(str(simulated_root), str(root_sequence))
def setUp(self):
    """Create the big tree and the reduced newick expected after sub-setting."""
    self.name = 'big tree - '
    self.otu_names = sorted([
        'Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep', 'SpermWhal', 'Cat',
        'Gorilla', 'Orangutan', 'bandicoot', 'Hedgehog', 'Sloth',
        'HairyArma', 'Manatee', 'GoldenMol', 'Pangolin',
    ])
    self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
    self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'
    self.tree = LoadTree(treestring=self.newick)
def setUp(self):
    """Create the substitution model, then load the brca1_5 alignment and tree."""
    self.submodel = Nucleotide(do_scaling=True,
                               model_gaps=False,
                               equal_motif_probs=True,
                               predicates={'beta': 'transition'})
    alignment_path = os.path.join(data_path, 'brca1_5.paml')
    self.data = LoadSeqs(filename=alignment_path,
                         moltype=self.submodel.MolType)
    tree_path = os.path.join(data_path, 'brca1_5.tree')
    self.tree = LoadTree(filename=tree_path)
def test_distribution():
    """distribution should return empirical distribution for DNA sequence"""
    fasta_path = os.path.join(get_data_dir(), 'General_1031.fasta.gz')
    with GzipFile(fasta_path) as ff:
        data = ff.read()
    mouse_only = ('Mouse', )
    al = Alignment(data=data).takeSeqs(mouse_only)
    observed = jsd.distribution(al.getSeq('Mouse'))

    # reference: motif probs a GTR likelihood function takes from the data
    lf = GTR().makeLikelihoodFunction(LoadTree(tip_names=mouse_only))
    lf.setMotifProbsFromData(al)
    expected = lf.getMotifProbs()
    assert_array_almost_equal(array(expected), array(observed))
class TestTree(unittest.TestCase):
    """Exercise the tree API against one small tree.

    Subclasses run the same tests on another tree by overriding ``setUp``.
    """

    def setUp(self):
        """Create the tree under test and the newick strings used as expectations."""
        self.name = 'small tree - '
        self.otu_names = sorted(['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'])
        self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
        self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
        self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'
        self.tree = LoadTree(treestring=self.newick)

    def test_sorttree(self):
        """testing (well, exercising at least) treesort"""
        sorted_tree = self.tree.sorted()
        # Without an expected newick this test only checks that sorted() runs.
        if not hasattr(self, 'newick_sorted'):
            return
        self.assertEqual(self.newick_sorted,
                         sorted_tree.getNewick(with_distances=0))

    def test_getsubtree(self):
        """testing getting a subtree"""
        pruned = self.tree.unrooted().getSubTree(self.otu_names)
        expected = LoadTree(treestring=self.newick_reduced).unrooted()
        # both trees must have the same number of children at the root ...
        self.assertEqual(len(pruned.Children), len(expected.Children))
        # ... and the same string form
        self.assertEqual(str(pruned), str(expected))

    def test_ascii(self):
        """asciiArt should run for the test tree and for every option combination."""
        self.tree.asciiArt()
        # unlabeled internal node
        tr = DndParser("(B:0.2,(C:0.3,D:0.4):0.6)F;")
        option_sets = ((True, False), (True, True), (False, False))
        for show_internal, compact in option_sets:
            tr.asciiArt(show_internal=show_internal, compact=compact)
def load_de_numericized_newick_tree(tree_in,before="'",after="'",root=False):
    """Return a newick string for ``tree_in`` with every tip name wrapped.

    Parameters:
        tree_in: path to a tree file, or a newick string. It is treated as a
            path when ``os.path.isfile(tree_in)`` is true and as a tree
            string otherwise.
        before: text prepended to each tip name (default: a single quote).
        after: text appended to each tip name (default: a single quote).
        root: when true, the tree is re-rooted with ``rootAtMidpoint()``
            before it is written out.

    Returns:
        The tree as a newick string including branch lengths
        (``getNewick(with_distances=True)``).
    """
    # Function-scope imports are kept so importing this module does not
    # require cogent; the previously imported PhyloNode was never used.
    from cogent import LoadTree
    import os.path

    if os.path.isfile(tree_in):
        tree = LoadTree(tree_in)
    else:
        tree = LoadTree(treestring=tree_in)

    # Map every tip name to its wrapped form, e.g. 12 -> '12'.
    rename_dict = dict(
        (tip, before + str(tip) + after) for tip in tree.getTipNames())
    tree.reassignNames(rename_dict)

    if root:
        tree = tree.rootAtMidpoint()
    return tree.getNewick(with_distances=True)
def test_run_pick_de_novo_otus_muscle(self):
    """run_pick_de_novo_otus w muscle generates expected results

    Runs the workflow serially with muscle as the aligner, then checks the
    OTU map, taxonomy assignments, alignment, tree, log file and OTU table
    written under ``self.test_out``.
    """
    self.params['assign_taxonomy'] = {
        'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
        'reference_seqs_fp': self.test_data['refseqs'][0]}
    self.params['align_seqs'] = {'alignment_method': 'muscle'}
    self.params['filter_alignment'] = {
        'suppress_lane_mask_filter': None,
        'entropy_threshold': '0.10'}

    run_pick_de_novo_otus(
        self.test_data['seqs'][0],
        self.test_out,
        call_commands_serially,
        self.params,
        self.qiime_config,
        parallel=False,
        status_update_callback=no_status_updates)

    input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                      '%s_otus.txt' % input_file_basename)
    alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                        '%s_rep_set_aligned.fasta' % input_file_basename)
    taxonomy_assignments_fp = join(
        self.test_out, 'uclust_assigned_taxonomy',
        '%s_rep_set_tax_assignments.txt' % input_file_basename)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    input_seqs = LoadSeqs(self.test_data['seqs'][0],
                          format='fasta',
                          aligned=False)

    # Number of OTUs falls within a range that was manually
    # confirmed
    with open(otu_map_fp) as otu_map_f:
        otu_map_lines = list(otu_map_f)
    num_otus = len(otu_map_lines)
    otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
    self.assertEqual(num_otus, 14)

    # all otus get taxonomy assignments
    with open(taxonomy_assignments_fp) as taxonomy_f:
        taxonomy_assignment_lines = list(taxonomy_f)
    self.assertEqual(len(taxonomy_assignment_lines), num_otus)

    # all OTUs align
    # (assertTrue(count, num_otus) would use num_otus as the failure
    # message and pass for any non-zero count, so compare explicitly)
    aln = LoadSeqs(alignment_fp)
    self.assertEqual(aln.getNumSeqs(), num_otus)

    # all OTUs in tree
    tree = LoadTree(tree_fp)
    self.assertEqual(len(tree.tips()), num_otus)

    # check that the two final output files have non-zero size
    self.assertTrue(getsize(tree_fp) > 0)
    self.assertTrue(getsize(otu_table_fp) > 0)

    # Check that the log file is created and has size > 0
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)

    # parse the otu table
    with open(otu_table_fp, 'U') as otu_table_f:
        otu_table = parse_biom_table(otu_table_f)
    expected_sample_ids = [
        'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1']
    # sample IDs are as expected
    self.assertEqualItems(otu_table.SampleIds, expected_sample_ids)
    # expected OTUs
    self.assertEqualItems(otu_table.ObservationIds, otu_map_otu_ids)
    # number of sequences in the full otu table equals the number of
    # input sequences
    number_seqs_in_otu_table = sum(
        [v.sum() for v in otu_table.iterSampleData()])
    self.assertEqual(number_seqs_in_otu_table, input_seqs.getNumSeqs())
class ExplorePrediction:
    """Draw the figures that summarise one derived node against one PDB hit.

    Constructing an instance does all the work: it parses ``Report.xml``,
    ``ModdedTree.nwk`` and ``ScoringMatrix.xml`` from ``Directory``, renders
    the tree/states and alignment figures to PNG through cairosvg, and writes
    a coloured PDB file, all under ``<Directory>Figures/<Derived>-<PDB>/``.
    """

    def __init__(self, Directory, DerivedoI, PDBoI):
        """
        Class attributes:
            Figures_L (List): list of all the figure types that will be created
            FigureSVG_D (Dict): key is the type of figure, value is the list of
                SVG lines that will draw the figure
            DerivedoInterest (String): Derived node of interest that the figure
                will be based on
            PDBoInterest (String): PDB structure that the derived node sequence
                aligned to and achieved a significant hit on
        """
        #initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"
        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI
        # Single-argument print(...) emits the same text under the print
        # statement and the print function.
        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        #output directory where files will be written
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)
        if not os.path.exists(self.OutputDirectory):
            # makedirs also creates the intermediate "Figures/" directory and
            # never passes the path through a shell.
            os.makedirs(self.OutputDirectory)

        #paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        #parses the report file for sequences and branch relationships
        with open(self.ReportPATH, "r") as ReportFile:
            ReportText = ReportFile.read()
        self.NodeToSeq_D = {
            re.search("<H>(.+?)</H>", Seq).group(1):
                re.search("<S>(.+?)</S>", Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
        }
        self.BranchToAlgorithm_D = {
            re.search("<Branch_name>(.+?)</Branch_name>", Branch).group(1):
                ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                     re.DOTALL)
        }
        self.RectCount = 0

        #dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        #loads and parses tree, gets evolutionary distances for proper branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys()
            if Key != self.FastMLTree.TopKey
        }
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        #finds the leaf whose label reaches furthest to the right
        FurthestPosition = 0.0
        FurthestClade = ""
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        #branch whose derived end is the node of interest (last match wins)
        self.BranchoInterest = ""
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                self.BranchoInterest = Key

        #gets all relevant information for the states portion of the figure
        Algorithm = self.BranchToAlgorithm_D[self.BranchoInterest]
        self.StateIndices_L = [
            int(X) - 1
            for X in Algorithm.getAllMutationXMLKeysAccordingToAccession(
                self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key: [self.NodeToSeq_D[Key][state]
                  for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()
        self.StateInc = 25.0
        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50

        #creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")
        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        # PNG is binary data, so the output file is opened in binary mode.
        with open(self.TreeAndStatesFOutPATH, "wb") as TreeStateFOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D["TreeAndStates"]),
                write_to=TreeStateFOut)

        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        #gets all relevant information for the alignment cartoon portion of the figure
        self.MatrixInfo = self.parseScoringMatrix()
        self.AlnInc = 11.0
        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (
                8 * len(LongestCladeName))
        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc
        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT,
                              self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")
        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        with open(self.AlignmentFOutPATH, "wb") as AlignmentFOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D["Alignment"]),
                write_to=AlignmentFOut)

        #relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600
        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")

    def getSVGHeader(self, FrameHEIGHT, FrameWIDTH):
        """gets the header for any SVG format file

        Note the argument order (height first) versus the order in the
        emitted attributes (width first).
        """
        return (
            '<?xml version="1.0" standalone="no"?> '
            '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
            '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> '
            '<svg xmlns:xlink="http://www.w3.org/1999/xlink" '
            "xmlns='http://www.w3.org/2000/svg' version='1.1' "
            "width='%s' height='%s'> "
        ) % (str(FrameWIDTH), str(FrameHEIGHT))

    def getStateToHex(self):
        """Dictionary where the key is the amino acid character and the value is the background colour"""
        return {
            "A": "80B3E6", "C": "E68080", "D": "CC4DCC", "E": "CC4DCC",
            "F": "80B3E6", "G": "E6994D", "H": "1AB3B3", "I": "80B3E6",
            "K": "E6331A", "L": "80B3E6", "M": "80B3E6", "N": "1ACC1A",
            "P": "CCCC00", "Q": "1ACC1A", "R": "E6331A", "S": "1ACC1A",
            "T": "1ACC1A", "V": "80B3E6", "W": "80B3E6", "Y": "1AB3B3",
            "-": "FFFFFF", "X": "FFFFFF",
        }

    def getEvoDistance(self, startingToNodeKey):
        """returns the total evolutionary distance from the origin to the node of interest

        Sums ``FastMLTree.BranchLength_D`` for the starting node and each
        ancestor reached through ``FastMLTree.BranchKey_L`` ("parent>>child"
        keys) until ``FastMLTree.TopKey`` is reached.

        Raises:
            ValueError: if a node other than the top node has no branch
                leading to it (previously this looped forever).
        """
        distance = 0.0
        ToNodeKey = startingToNodeKey
        while True:
            distance += self.FastMLTree.BranchLength_D[ToNodeKey]
            # Plain suffix comparison: node names may contain characters such
            # as "." that would act as wildcards inside a regular expression.
            Suffix = ">>" + ToNodeKey
            for BranchKey in self.FastMLTree.BranchKey_L:
                if BranchKey.endswith(Suffix):
                    ToNodeKey = BranchKey.split(">>")[0]
                    break
            else:
                if ToNodeKey != self.FastMLTree.TopKey:
                    raise ValueError(
                        "no branch leads to node %r" % (ToNodeKey,))
            if ToNodeKey == self.FastMLTree.TopKey:
                return distance

    def getLongestEvoDistance(self):
        """gets the longest evolutionary distance from the origin to any leaf"""
        longestDistance = 0.0
        for LeafKey in self.FastMLTree.LeafKey_L:
            distance = self.getEvoDistance(LeafKey)
            if distance > longestDistance:
                longestDistance = distance
        return longestDistance

    def modEvoDistance(self):
        """modifies evolutionary distance into a different format

        Every branch of the previous implementation copied the value
        unchanged, so this returns a shallow copy of ``EvoDistance_D``.
        """
        return dict(self.EvoDistance_D)

    def setTreeCoords(self):
        """sets tree node coordinates (horizontal and vertical) for the SVG image

        The vertical position of a name is the index of the last line of
        ``CogentTree.asciiArt()`` on which it appears; the horizontal position
        is its evolutionary distance scaled by ``LongestDistance``.
        """
        NamePattern = re.compile(r"[a-zA-Z0-9_\.@]+")
        MaxVert = 0
        VertCoord_D = {}
        for i, Line in enumerate(self.CogentTree.asciiArt().split("\n")):
            Names = NamePattern.findall(Line)
            if Names:
                for Name in Names:
                    VertCoord_D[Name] = i
                MaxVert = i
        return {
            Key: [
                (self.ModdedEvoDistance_D[Key] / self.LongestDistance) *
                self.TreeFigWIDTH + self.TreeFigXOffset,
                float(float(VertCoord_D[Key]) / float(MaxVert)) *
                self.TreeFigHEIGHT + self.TreeFigYOffset
            ]
            for Key in self.NodeToSeq_D.keys()
        }

    def addNodeNamesAtNodePoints(self):
        """adds node names at each leaf vertex"""
        for Key in self.FastMLTree.LeafKey_L:
            xy = self.TreeCoords_D[Key]
            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='20' font-family='Courier' style="fill: #000000;" >%s</text>'''
                % (str(xy[0]), str(xy[1]), Key))

    def _addBranchLine(self, branchKey, X1, Y1, X2, Y2):
        """appends one tree line, red for the branch of interest, black otherwise"""
        if branchKey == self.BranchoInterest:
            Template = '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(255,0,0);stroke-width:1 " />'''
        else:
            Template = '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(0,0,0);stroke-width:1 " />'''
        self.FigureSVG_D["TreeAndStates"].append(
            Template % (str(X1), str(Y1), str(X2), str(Y2)))

    def addVerticalLines(self):
        """adds the vertical lines of the tree image"""
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[0], branchKey.split(">>")[1]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]
            self._addBranchLine(branchKey, froXY[0], froXY[1], froXY[0],
                                toXY[1])

    def addHorizontalLines(self):
        """adds the horizontal lines of the tree image"""
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[0], branchKey.split(">>")[1]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]
            self._addBranchLine(branchKey, froXY[0], toXY[1], toXY[0],
                                toXY[1])

    def makeTreeFig(self):
        """does all methods necessary to make the tree image"""
        self.addNodeNamesAtNodePoints()
        self.addVerticalLines()
        self.addHorizontalLines()

    def addStateRows(self):
        """adds the rows for the mutated states in each sequence"""
        inc = self.StateInc
        vertInc = float(self.StateFigHEIGHT / float(len(self.LeafStates_D)))

        # The column labels sit above the topmost node of the tree.
        lowestY = float("inf")
        for Key in self.TreeCoords_D.keys():
            if self.TreeCoords_D[Key][1] < lowestY:
                lowestY = self.TreeCoords_D[Key][1]
        stateY = lowestY - (1.5 * vertInc)
        stateX = 0.0 + self.StateFigXOffset
        for i in self.StateIndices_L:
            # Indices are stored 0-based; the label shows the 1-based site.
            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='middle' font-size='16' font-family='Courier' transform="rotate(90, %s, %s)" style="fill: #000000;" >%s</text>'''
                % (str(stateX), str(stateY), str(stateX), str(stateY),
                   str(i + 1)))
            stateX += inc

        for Key in self.LeafStates_D.keys():
            X = 0.0 + self.StateFigXOffset
            Y = self.TreeCoords_D[Key][1]
            for State in self.LeafStates_D[Key]:
                RectX = X - (float(inc / 2.0))
                RectY = Y - (float(vertInc / 2.0)) - 5.0
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />'''
                    % (str(self.RectCount), str(RectX), str(RectY),
                       str(inc), str(vertInc), self.StateColour_D[State]))
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<text x='%s' y='%s' font-size='20' font-family='Courier' text-anchor='middle' style="fill: #000000;" >%s</text>'''
                    % (str(X), str(Y), State))
                X += inc

    def makeStatesFig(self):
        """executes the method to make the states figure"""
        self.addStateRows()

    def parseScoringMatrix(self):
        """parses the scoring matrix for alignment to the PDB sequence information

        Uses the first ``<PDB_alignment>`` whose ``<PDB_id>`` (text before
        "|") equals ``PDBoInterest`` upper-cased, and stores the lower-cased
        text after "|" in ``self.ChainoInterest``.

        Returns:
            Dict with 0-based ``Qstart``, ``Qend``, ``Sstart``, ``Send`` and
            the aligned subject sequence ``Sseq``.
        """
        with open(self.MatrixPATH, "r") as MatrixFile:
            MatrixText = MatrixFile.read()

        KeyAln = ""
        for Alignment in re.findall("<PDB_alignment>.+?</PDB_alignment>",
                                    MatrixText, re.DOTALL):
            IDParts = re.search("<PDB_id>(.+?)</PDB_id>",
                                Alignment).group(1).split("|")
            if self.PDBoInterest.upper() == IDParts[0]:
                KeyAln = Alignment
                self.ChainoInterest = IDParts[1].lower()
                break

        def TagText(Tag):
            # text of the first <Tag>...</Tag> element in the chosen alignment
            return re.search("<%s>(.+?)</%s>" % (Tag, Tag),
                             KeyAln).group(1)

        return {
            "Qstart": int(TagText("Alignment_start_query")) - 1,
            "Qend": int(TagText("Alignment_end_query")) - 1,
            "Sstart": int(TagText("Alignment_start_subject")) - 1,
            "Send": int(TagText("Alignment_end_subject")) - 1,
            "Sseq": TagText("Aligned_subject_sequence"),
        }

    def makeAlignmentFig(self):
        """makes the cartoon of all aligned sequences in the protein family"""
        Qstart = self.MatrixInfo["Qstart"]
        Sseq = self.MatrixInfo["Sseq"]
        # First row is the PDB subject sequence; each leaf contributes the
        # window of its sequence starting at the query start.
        AllSeqs_L = [Sseq] + [
            self.NodeToSeq_D[Key][Qstart:Qstart + len(Sseq)]
            for Key in self.FastMLTree.LeafKey_L
        ]
        AllHeaders_L = [self.PDBoInterest] + self.FastMLTree.LeafKey_L

        xinc = self.AlnInc
        yinc = self.AlnInc
        Y = self.AlignmentFigYOffset
        for Seq, Header in zip(AllSeqs_L, AllHeaders_L):
            X = 0.0 + self.AlignmentFigXOffset
            for State in Seq:
                RectX = X - (float(xinc / 2.0))
                RectY = Y - (float(yinc / 2.0)) - 5.0
                self.FigureSVG_D["Alignment"].append(
                    '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />'''
                    % (str(self.RectCount), str(RectX), str(RectY),
                       str(xinc), str(yinc), self.StateColour_D[State]))
                self.FigureSVG_D["Alignment"].append(
                    '''\t<text x='%s' y='%s' text-anchor='middle' font-size='10' font-family='Courier' style="fill: #000000;" >%s</text>'''
                    % (str(X), str(Y), State))
                X += xinc
            self.FigureSVG_D["Alignment"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='10' font-family='Courier' style="fill: #000000;" >%s</text>'''
                % (str(X + self.AlnInc), str(Y), Header))
            Y += yinc

    def getColoredStructureFile(self):
        """gets a PDB format file with the temperature factors coloured to reflect mutated sites

        Returns the temporary file object obtained from
        ``getOutputTempFile()`` after ``createPDBColoredFile`` has been asked
        to write to its path.
        """
        DesiredBranchKey = ""
        for BranchKey in self.FastMLTree.BranchKey_L:
            if BranchKey.split(">>")[1] == self.DerivedoInterest:
                DesiredBranchKey = BranchKey

        PDBAndPDBXMLContents = getAllPDBFileDicts([self.PDBoInterest])
        SA = self.BranchToAlgorithm_D[DesiredBranchKey]
        SA.PDBContents_D = PDBAndPDBXMLContents[0]
        SA.PDBXMLContents_D = PDBAndPDBXMLContents[1]
        FH = getOutputTempFile()
        SA.createPDBColoredFile(self.PDBoInterest, FH.name)
        return FH
def test_get_tree_get_splits(self):
    """getTree should provide a reciprocal map of getSplits"""
    tree_path = os.path.join(data_path, "murphy.tree")
    original = LoadTree(filename=tree_path)
    # Round trip: tree -> splits -> tree must keep the topology.
    rebuilt = getTree(getSplits(original))
    self.assertTrue(original.sameTopology(rebuilt))
def __init__(self, Directory, DerivedoI, PDBoI):
    """
    Parse the inputs found in ``Directory`` and write every figure.

    Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FigureSVG_D (Dict): key is the type of figure, value is the list of
            SVG lines that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure
            will be based on
        PDBoInterest (String): PDB structure that the derived node sequence
            aligned to and achieved a significant hit on
    """
    #initial setup of what figures will be created
    self.Figures_L = [
        "TreeAndStates", "Alignment", "Structurecartoon", "Structuresurface"
    ]
    self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

    self.Directory = Directory
    if not self.Directory.endswith("/"):
        self.Directory = self.Directory + "/"
    self.DerivedoInterest = DerivedoI
    self.PDBoInterest = PDBoI
    # Single-argument print(...) emits the same text under the print
    # statement and the print function.
    print(self.Directory)
    print(self.DerivedoInterest)
    print(self.PDBoInterest)

    #output directory where files will be written
    self.OutputDirectory = "%sFigures/%s-%s/" % (
        self.Directory, self.DerivedoInterest, self.PDBoInterest)
    if not os.path.exists(self.OutputDirectory):
        # makedirs also creates the intermediate "Figures/" directory and
        # never passes the path through a shell.
        os.makedirs(self.OutputDirectory)

    #paths to relevant input files
    self.ReportPATH = self.Directory + "Report.xml"
    self.TreePATH = self.Directory + "ModdedTree.nwk"
    self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

    #parses the report file for sequences and branch relationships
    with open(self.ReportPATH, "r") as ReportFile:
        ReportText = ReportFile.read()
    self.NodeToSeq_D = {
        re.search("<H>(.+?)</H>", Seq).group(1):
            re.search("<S>(.+?)</S>", Seq).group(1)
        for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
    }
    self.BranchToAlgorithm_D = {
        re.search("<Branch_name>(.+?)</Branch_name>", Branch).group(1):
            ScopeAlgorithm(Branch)
        for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                 re.DOTALL)
    }
    self.RectCount = 0

    #dimensions
    self.TreeFigWIDTH = 750
    self.TreeFigHEIGHT = 500
    self.TreeFigXOffset = 25
    self.TreeFigYOffset = 50

    #loads and parses tree, gets evolutionary distances for proper branch lengths
    self.CogentTree = LoadTree(self.TreePATH)
    self.FastMLTree = FastMLTree(self.TreePATH, False)
    self.FastMLTree.setBranchLengths()
    self.LongestDistance = self.getLongestEvoDistance()
    self.EvoDistance_D = {
        Key: self.getEvoDistance(Key)
        for Key in self.NodeToSeq_D.keys()
        if Key != self.FastMLTree.TopKey
    }
    self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
    self.ModdedEvoDistance_D = self.modEvoDistance()
    self.TreeCoords_D = self.setTreeCoords()

    #finds the leaf whose label reaches furthest to the right
    FurthestPosition = 0.0
    FurthestClade = ""
    for Key in self.FastMLTree.LeafKey_L:
        Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
        if Val > FurthestPosition:
            FurthestPosition = Val
            FurthestClade = Key

    #branch whose derived end is the node of interest (last match wins)
    self.BranchoInterest = ""
    for Key in self.FastMLTree.BranchKey_L:
        if Key.split(">>")[1] == self.DerivedoInterest:
            self.BranchoInterest = Key

    #gets all relevant information for the states portion of the figure
    Algorithm = self.BranchToAlgorithm_D[self.BranchoInterest]
    self.StateIndices_L = [
        int(X) - 1
        for X in Algorithm.getAllMutationXMLKeysAccordingToAccession(
            self.PDBoInterest)
    ]
    self.LeafStates_D = {
        Key: [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
        for Key in self.FastMLTree.LeafKey_L
    }
    self.StateColour_D = self.getStateToHex()
    self.StateInc = 25.0
    self.StateFigHEIGHT = 500
    self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
    self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
        12 * len(FurthestClade)) + 25
    self.StateFigYOffset = 50

    #creates the states and tree figure
    self.FigureSVG_D["TreeAndStates"].append(
        self.getSVGHeader(
            self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
            self.StateFigXOffset + self.StateFigWIDTH + self.TreeFigXOffset))
    self.makeTreeFig()
    self.makeStatesFig()
    self.FigureSVG_D["TreeAndStates"].append("</svg>")
    self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
    # PNG is binary data, so the output file is opened in binary mode.
    with open(self.TreeAndStatesFOutPATH, "wb") as TreeStateFOut:
        cairosvg.svg2png(
            bytestring="\n".join(self.FigureSVG_D["TreeAndStates"]),
            write_to=TreeStateFOut)

    LongestCladeName = ""
    for Key in self.FastMLTree.LeafKey_L:
        if len(Key) > len(LongestCladeName):
            LongestCladeName = Key

    #gets all relevant information for the alignment cartoon portion of the figure
    self.MatrixInfo = self.parseScoringMatrix()
    self.AlnInc = 11.0
    self.AlignmentFigWIDTH = self.AlnInc * len(
        self.MatrixInfo["Sseq"]) + self.AlnInc + (8 * len(LongestCladeName))
    self.AlignmentFigHEIGHT = self.AlnInc * (
        len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
    self.AlignmentFigXOffset = self.AlnInc
    self.AlignmentFigYOffset = self.AlnInc
    self.FigureSVG_D["Alignment"].append(
        self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
    self.makeAlignmentFig()
    self.FigureSVG_D["Alignment"].append("</svg>")
    self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
    with open(self.AlignmentFOutPATH, "wb") as AlignmentFOut:
        cairosvg.svg2png(
            bytestring="\n".join(self.FigureSVG_D["Alignment"]),
            write_to=AlignmentFOut)

    #relevant information for the structure file in PDB format
    self.ColouredStructureFile = self.getColoredStructureFile()
    self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
    with open(self.StructureFOutPATH, "w") as StructureFOut:
        StructureFOut.write(self.ColouredStructureFile.read())

    self.TotalFigWIDTH = 1000
    self.TotalFigHEIGHT = 600
    self.TotalElement_L = [
        self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
    ]
    self.TotalElement_L.append(
        '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
        % (self.TreeAndStatesFOutPATH))
    self.TotalElement_L.append(
        '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
        % (self.AlignmentFOutPATH))
    self.TotalElement_L.append("</svg>")
# NOTE(review): the expected-table string literals in this class appear
# whitespace-collapsed in this copy (rows separated by single spaces); they
# are reproduced as found -- confirm them against the real str(lf) output.
class LikelihoodFunctionTests(TestCase):
    """tests for a tree analysis class.

    Various tests to create a tree analysis class, set parameters, and test
    various functions. Each test builds its likelihood function from the
    model, alignment and tree prepared in setUp.
    """

    def setUp(self):
        """Create the nucleotide model, brca1_5 alignment and brca1_5 tree."""
        self.submodel = Nucleotide(
            do_scaling=True, model_gaps=False, equal_motif_probs=True,
            predicates = {'beta': 'transition'})
        self.data = LoadSeqs(
            filename = os.path.join(data_path, 'brca1_5.paml'),
            moltype = self.submodel.MolType)
        self.tree = LoadTree(
            filename = os.path.join(data_path, 'brca1_5.tree'))

    def _makeLikelihoodFunction(self, **kw):
        """Return a likelihood function on self.tree with independent beta.

        Extra keyword arguments are forwarded to makeLikelihoodFunction; the
        alignment from setUp is attached before returning.
        """
        lf = self.submodel.makeLikelihoodFunction(self.tree, **kw)
        lf.setParamRule('beta', is_independent=True)
        lf.setAlignment(self.data)
        return lf

    def _setLengthsAndBetas(self, likelihood_function):
        """Fix every branch length and beta to known constants."""
        # tip edges, addressed by species name
        for (species, length) in [
                ("DogFaced", 0.1),
                ("NineBande", 0.2),
                ("Human", 0.3),
                ("HowlerMon", 0.4),
                ("Mouse", 0.5)]:
            likelihood_function.setParamRule("length", value=length,
                    edge=species, is_constant=True)
        # internal edges, addressed through the node connecting two tips
        for (species1, species2, length) in [
                ("Human", "HowlerMon", 0.7),
                ("Human", "Mouse", 0.6)]:
            LCA = self.tree.getConnectingNode(species1, species2).Name
            likelihood_function.setParamRule("length", value=length,
                    edge=LCA, is_constant=True)

        likelihood_function.setParamRule("beta", value=4.0, is_constant=True)

    def test_information_criteria(self):
        """test get information criteria from a model."""
        lf = self._makeLikelihoodFunction()
        nfp = lf.getNumFreeParams()
        lnL = lf.getLogLikelihood()
        l = len(self.data)  # passed as the third argument to aic/bic below
        self.assertFloatEqual(lf.getAic(), aic(lnL, nfp))
        self.assertFloatEqual(lf.getAic(second_order=True),
            aic(lnL, nfp, l))

        self.assertFloatEqual(lf.getBic(), bic(lnL, nfp, l))

    def test_result_str(self):
        """str() of the likelihood function should match the expected table."""
        # actually more a test of self._setLengthsAndBetas()
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        self.assertEqual(str(likelihood_function), \
"""Likelihood Function Table\n\ ====== beta ------ 4.0000 ------ ============================= edge parent length ----------------------------- Human edge.0 
0.3000 HowlerMon edge.0 0.4000 edge.0 edge.1 0.7000 Mouse edge.1 0.5000 edge.1 root 0.6000 NineBande root 0.2000 DogFaced root 0.1000 ----------------------------- =============== motif mprobs --------------- T 0.2500 C 0.2500 A 0.2500 G 0.2500 ---------------""")

        # same table layout with two digits and narrower column spacing
        likelihood_function = self._makeLikelihoodFunction(digits=2,space=2)
        self.assertEqual(str(likelihood_function), \
"""Likelihood Function Table\n\ =============================== edge parent length beta ------------------------------- Human edge.0 1.00 1.00 HowlerMon edge.0 1.00 1.00 edge.0 edge.1 1.00 1.00 Mouse edge.1 1.00 1.00 edge.1 root 1.00 1.00 NineBande root 1.00 1.00 DogFaced root 1.00 1.00 ------------------------------- ============= motif mprobs ------------- T 0.25 C 0.25 A 0.25 G 0.25 -------------""")

    def test_calclikelihood(self):
        """Log-likelihood with fixed lengths and beta matches the stored value."""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        self.assertAlmostEquals(-250.686745262,
            likelihood_function.getLogLikelihood(),places=9)

    def test_g_statistic(self):
        """G statistic with fixed lengths and beta matches the stored value."""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        self.assertAlmostEquals(230.77670557,
            likelihood_function.getGStatistic(),places=6)

    def test_ancestralsequences(self):
        """Check one reconstructed ancestral probability; exercise binned case."""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        result = likelihood_function.reconstructAncestralSeqs()['edge.0']
        a_column_with_mostly_Ts = -1
        motif_G = 2
        self.assertAlmostEquals(2.28460181711e-05,
                result[a_column_with_mostly_Ts][motif_G], places=8)
        # the binned model is only exercised; its result is not checked
        lf = self.submodel.makeLikelihoodFunction(self.tree, bins=['low', 'high'])
        lf.setParamRule('beta', bin='low', value=0.1)
        lf.setParamRule('beta', bin='high', value=10.0)
        lf.setAlignment(self.data)
        result = lf.reconstructAncestralSeqs()

    def test_likely_ancestral(self):
        """exercising the most likely ancestral sequences"""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        result = 
likelihood_function.likelyAncestralSeqs()

    def test_simulateAlignment(self):
        "Simulate DNA alignment"
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        simulated_alignment = likelihood_function.simulateAlignment(20, exclude_internal = False)
        self.assertEqual(len(simulated_alignment), 20)
        # with exclude_internal=False the test expects 8 sequence names
        self.assertEqual(len(simulated_alignment.getSeqNames()), 8)

    def test_simulateHetergeneousAlignment(self):
        "Simulate substitution-heterogeneous DNA alignment"
        lf = self.submodel.makeLikelihoodFunction(self.tree, bins=['low', 'high'])
        lf.setParamRule('beta', bin='low', value=0.1)
        lf.setParamRule('beta', bin='high', value=10.0)
        # exercised only; no assertion on the simulated alignment
        simulated_alignment = lf.simulateAlignment(100)

    def test_simulatePatchyHetergeneousAlignment(self):
        "Simulate patchy substitution-heterogeneous DNA alignment"
        lf = self.submodel.makeLikelihoodFunction(self.tree, bins=['low', 'high'], sites_independent=False)
        lf.setParamRule('beta', bin='low', value=0.1)
        lf.setParamRule('beta', bin='high', value=10.0)
        # exercised only; no assertion on the simulated alignment
        simulated_alignment = lf.simulateAlignment(100)

    def test_simulateAlignment2(self):
        "Simulate alignment with dinucleotide model"
        al = LoadSeqs(data={'a':'ggaatt','c':'cctaat'})
        t = LoadTree(treestring="(a,c);")
        sm = substitution_model.Dinucleotide(mprob_model='tuple')
        lf = sm.makeParamController(t)
        lf.setAlignment(al)
        simalign = lf.simulateAlignment()
        self.assertEqual(len(simalign), 6)

    def test_simulateAlignment3(self):
        """Simulated alignment with gap-induced ambiguous positions
        preserved"""
        t = LoadTree(treestring='(a:0.4,b:0.3,(c:0.15,d:0.2)edge.0:0.1)root;')
        al = LoadSeqs(data={
            'a':'g--cactat?',
            'b':'---c-ctcct',
            'c':'-a-c-ctat-',
            'd':'-a-c-ctat-'})
        sm = Nucleotide(recode_gaps=True)
        lf = sm.makeParamController(t)
        #pc.setConstantLengths()
        lf.setAlignment(al)
        #print lf.simulateAlignment(sequence_length=10)
        simulated = lf.simulateAlignment()
        self.assertEqual(len(simulated.getSeqNames()), 4)
        import re
        # every nucleotide becomes 'x', so only the ambiguity pattern of
        # sequence 'a' is compared
        self.assertEqual(
            re.sub('[ATCG]', 'x', simulated.todict()['a']), 
'x??xxxxxx?')

    def test_simulateAlignment_root_sequence(self):
        """provide a root sequence for simulating an alignment"""
        def use_root_seq(root_sequence):
            # simulate with the given root and check it is returned unchanged
            al = LoadSeqs(data={'a':'ggaatt','c':'cctaat'})
            t = LoadTree(treestring="(a,c);")
            sm = substitution_model.Dinucleotide(mprob_model='tuple')
            lf = sm.makeParamController(t)
            lf.setAlignment(al)
            simalign = lf.simulateAlignment(exclude_internal=False,
                                            root_sequence=root_sequence)
            root = simalign.NamedSeqs['root']
            self.assertEqual(str(root), str(root_sequence))

        root_sequence = DNA.makeSequence('GTAATT')
        use_root_seq(root_sequence) # as a sequence instance
        use_root_seq('GTAATC') # as a string

    def test_pc_initial_parameters(self):
        """Default parameter values from original annotated tree"""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        tree = likelihood_function.getAnnotatedTree()
        lf = self.submodel.makeParamController(tree)
        lf.setAlignment(self.data)
        self.assertEqual(lf.getParamValue("length", "Human"), 0.3)
        self.assertEqual(lf.getParamValue("beta", "Human"), 4.0)

    def test_set_par_all(self):
        """Setting length and beta without an edge applies to every edge."""
        likelihood_function = self._makeLikelihoodFunction()
        likelihood_function.setParamRule("length", value=4.0, is_constant=True)
        likelihood_function.setParamRule("beta", value=6.0, is_constant=True)
        self.assertEqual(str(likelihood_function), \
"""Likelihood Function Table ====== beta ------ 6.0000 ------ ============================= edge parent length ----------------------------- Human edge.0 4.0000 HowlerMon edge.0 4.0000 edge.0 edge.1 4.0000 Mouse edge.1 4.0000 edge.1 root 4.0000 NineBande root 4.0000 DogFaced root 4.0000 ----------------------------- =============== motif mprobs --------------- T 0.2500 C 0.2500 A 0.2500 G 0.2500 ---------------""")

        #self.submodel.setScaleRule("ts",['beta'])
        #self.submodel.setScaleRule("tv",['beta'], exclude_pars = True)
        # with the scale rules above commented out, this repeats the same check
        self.assertEqual(str(likelihood_function),\
"""Likelihood Function Table ====== beta ------ 6.0000 ------ 
============================= edge parent length ----------------------------- Human edge.0 4.0000 HowlerMon edge.0 4.0000 edge.0 edge.1 4.0000 Mouse edge.1 4.0000 edge.1 root 4.0000 NineBande root 4.0000 DogFaced root 4.0000 ----------------------------- =============== motif mprobs --------------- T 0.2500 C 0.2500 A 0.2500 G 0.2500 ---------------""")

    def test_getMotifProbs(self):
        """Motif prob keys should equal the model's motifs."""
        likelihood_function = self._makeLikelihoodFunction()
        mprobs = likelihood_function.getMotifProbs()
        assert hasattr(mprobs, 'keys'), mprobs
        keys = mprobs.keys()
        keys.sort()  # in-place sort: relies on keys() returning a list
        obs = self.submodel.getMotifs()
        obs.sort()
        self.assertEqual(obs, keys)

    def test_getAnnotatedTree(self):
        """A constant length set on an edge appears on the annotated tree."""
        likelihood_function = self._makeLikelihoodFunction()
        likelihood_function.setParamRule("length", value=4.0, edge="Human", is_constant=True)
        result = likelihood_function.getAnnotatedTree()
        self.assertEqual(result.getNodeMatchingName('Human').params['length'], 4.0)
        self.assertEqual(result.getNodeMatchingName('Human').Length, 4.0)

    def test_getparamsasdict(self):
        """Check the named table and the per-edge parameter dict."""
        likelihood_function = self._makeLikelihoodFunction()
        likelihood_function.setName("TEST")
        self.assertEqual(str(likelihood_function),\
"""TEST ======================================= edge parent length beta --------------------------------------- Human edge.0 1.0000 1.0000 HowlerMon edge.0 1.0000 1.0000 edge.0 edge.1 1.0000 1.0000 Mouse edge.1 1.0000 1.0000 edge.1 root 1.0000 1.0000 NineBande root 1.0000 1.0000 DogFaced root 1.0000 1.0000 --------------------------------------- =============== motif mprobs --------------- T 0.2500 C 0.2500 A 0.2500 G 0.2500 ---------------""")
        self.assertEqual(likelihood_function.getParamValueDict(['edge']), {
 'beta': {'NineBande': 1.0, 'edge.1': 1.0,'DogFaced': 1.0, 'Human': 1.0,
      'edge.0': 1.0, 'Mouse': 1.0, 'HowlerMon': 1.0},
 'length': {'NineBande': 1.0,'edge.1': 1.0, 'DogFaced': 1.0, 'Human': 1.0,
        'edge.0': 1.0, 'Mouse': 1.0,'HowlerMon': 1.0}})

    def test_get_statistics_from_empirical_model(self):
        """should return valid dict from an empirical substitution model"""
        submod = JTT92()
        aln = self.data.getTranslation()
        lf = submod.makeLikelihoodFunction(self.tree)
        lf.setAlignment(aln)
        # exercised only; the returned dict is not inspected
        stats = lf.getParamValueDict(['edge'], params=['length'])

    def test_constant_to_free(self):
        """exercise setting a constant param rule, then freeing it"""
        # checks by just trying to make the calculator
        lf = self.submodel.makeLikelihoodFunction(self.tree)
        lf.setAlignment(self.data)
        lf.setParamRule('beta', is_constant=True, value=2.0,
                        edges=['NineBande', 'DogFaced'], is_clade=True)
        lf.setParamRule('beta', init=2.0, is_constant=False,
                        edges=['NineBande', 'DogFaced'], is_clade=True)

    def test_get_psub_rate_matrix(self):
        """lf should return consistent rate matrix and psub"""
        lf = self.submodel.makeLikelihoodFunction(self.tree)
        lf.setAlignment(self.data)
        Q = lf.getRateMatrixForEdge('NineBande')
        P = lf.getPsubForEdge('NineBande')
        # expm(Q.array) is called with 1.0 and compared with P
        self.assertFloatEqual(expm(Q.array)(1.0), P.array)

        # should fail for a discrete Markov model
        dm = substitution_model.DiscreteSubstitutionModel(DNA.Alphabet)
        lf = dm.makeLikelihoodFunction(self.tree)
        lf.setAlignment(self.data)
        self.assertRaises(Exception, lf.getRateMatrixForEdge, 'NineBande')

    def test_make_discrete_markov(self):
        """lf ignores tree lengths if a discrete Markov model"""
        t = LoadTree(treestring='(a:0.4,b:0.3,(c:0.15,d:0.2)edge.0:0.1)root;')
        dm = substitution_model.DiscreteSubstitutionModel(DNA.Alphabet)
        # construction succeeding is the whole check; nothing is asserted
        lf = dm.makeLikelihoodFunction(t)
class FastMLTree:
    """Parse a user-defined newick tree and reconcile its node names with FastML.

    Attributes:
        Parsed (bool): whether the user-defined tree was successfully parsed;
            if it was not, the rest of the analysis is not performed
        ParseError (Exception or None): the exception that made parsing fail,
            or None when parsing succeeded
        TreePath (str): path to the tree file, as passed to the constructor
        NeedsToBeCogentModded (bool): whether placeholder names for the
            internal nodes need to be created
        CogentTreeFile, CogentInputTreeString: the two values returned by
            fixUpFileForCogent; only set when NeedsToBeCogentModded is true
        CogentTree: tree object returned by LoadTree for the input tree
        FastMLInputTreeString (str): newick representation of the tree with
            internal node names removed
        FastMLOutputTreeString (str): newick representation of the tree with
            internal nodes named according to FastML's naming convention
        FastMLToOriginalMatchedNodes_D (dict): maps each node name of the
            FastML output tree to the matching node name of the input tree
        NodeKey_L (list): all node name keys
        LeafKey_L (list): all terminal node name keys
        UpperKey_L (list): all internal (non-terminal) node name keys
        TopKey (str): root node name key
        BranchKey_L (list): all paths (from ancestral to immediate derived)
            along the tree
        Nodes_D (dict): key is the node name, value is a sub-dict holding the
            'immediate' derived nodes and the 'terminal' nodes under the node
        BranchLength_D (dict): only set by setBranchLengths; maps a node name
            to the distance from its immediate ancestor
    """

    # "name:length" entries of a newick string; the group captures the name.
    _NAMED_ENTRY_RE = re.compile(r"([A-Za-z0-9_./]+):[.0-9]+")
    # Names FastML gives to internal nodes (N1, N2, ...).
    _FASTML_INTERNAL_RE = re.compile(r"^N[0-9]+$")
    # An internal-node label: the text between ")" and the next structural
    # newick character.
    _INTERNAL_LABEL_RE = re.compile(r"\)[^:;,()]+")

    def __init__(self, TreePath, NeedsToBeCogentModded):
        """Load the tree and prepare every lookup needed by the analysis.

        Any failure is recorded rather than raised: Parsed becomes False and
        the exception is kept in ParseError so callers can report it.
        """
        self.Parsed = True  # used to determine if the full analysis can be conducted
        self.ParseError = None
        try:
            self.TreePath = TreePath
            self.NeedsToBeCogentModded = NeedsToBeCogentModded
            self.CogentTree = None

            # if the internal nodes need to be renamed, it is done by
            # fixUpFileForCogent
            if self.NeedsToBeCogentModded:
                cogentFixUp = fixUpFileForCogent(self.TreePath)
                self.CogentTreeFile = cogentFixUp[0]
                self.CogentInputTreeString = cogentFixUp[1]
                self.CogentTree = LoadTree(self.CogentTreeFile.name)
            else:
                self.CogentTree = LoadTree(self.TreePath)

            # prepares an input string for FastML
            self.FastMLInputTreeString = self.FixUpFileForFastML(self.CogentTree)

            # fully parses the tree, then stores every returned value
            CogentNodesLeavesBranches = completeNodesLeavesBranches(self.CogentTree)
            self.NodeKey_L = CogentNodesLeavesBranches['NodeKey_L']
            self.LeafKey_L = CogentNodesLeavesBranches['LeafKey_L']
            self.UpperKey_L = CogentNodesLeavesBranches['UpperKey_L']
            self.TopKey = CogentNodesLeavesBranches['TopKey']
            self.BranchKey_L = CogentNodesLeavesBranches['BranchKey_L']
            self.Nodes_D = CogentNodesLeavesBranches['Nodes_D']

            # quick run of FastML to learn its naming of internal nodes
            # NOTE(review): meaning of the trailing True flag is defined by
            # executeFastML, which is not visible here
            self.FastMLOutputTreeString = executeFastML(
                self.getTempFASTAFile(), self.FastMLInputTreeString, True)

            # prepares FastMLToOriginalMatchedNodes_D
            self.MatchNodes()
        except Exception as e:
            # deliberate best effort: flag the failure instead of raising,
            # but keep the cause instead of discarding it
            self.Parsed = False
            self.ParseError = e

    def FixUpFileForFastML(self, CogentTree):
        """Return the tree's newick string with internal node names removed.

        FastML then adds its own names to the internal nodes. Single quotes
        are stripped from the string first. Only the label directly after a
        closing bracket is removed, so internal nodes without a label or
        without a branch length are left untouched.
        """
        TreeString = CogentTree.getNewick(with_distances=True).replace("'", "")
        return self._INTERNAL_LABEL_RE.sub(")", TreeString)

    def getTempFASTAFile(self):
        """Return minimal FASTA text giving every terminal node the sequence "GREAT"."""
        retString_L = []
        for LeafKey in self.LeafKey_L:
            retString_L.append(">" + LeafKey)
            retString_L.append("GREAT")
        return '\n'.join(retString_L)

    def correctForFastMLNameChanges(self):
        """Undo FastML's anomalous renaming of terminal nodes in the output string.

        Names in the input and output strings are compared position by
        position; where they differ, the output name is replaced by the
        input name in FastMLOutputTreeString.
        """
        # names in the FastML input and output strings (in the same order)
        FastMLInputNames = self._NAMED_ENTRY_RE.findall(self.FastMLInputTreeString)
        FastMLOutputNames = [
            Name for Name in self._NAMED_ENTRY_RE.findall(self.FastMLOutputTreeString)
            if self._FASTML_INTERNAL_RE.search(Name) is None]

        for i, InputName in enumerate(FastMLInputNames):
            # indexing (not zip) so that a shorter output list still raises
            OutputName = FastMLOutputNames[i]
            if InputName != OutputName:
                # the name is escaped so "." only matches a literal dot, and
                # the replacement is a function so a name starting with a
                # digit cannot be read as part of a group reference
                self.FastMLOutputTreeString = re.sub(
                    r"([,()])" + re.escape(OutputName) + ":",
                    lambda Match, NewName=InputName: Match.group(1) + NewName + ":",
                    self.FastMLOutputTreeString)

    def MatchNodes(self):
        """Match original (cogent) node names with how FastML names the nodes.

        Fills FastMLToOriginalMatchedNodes_D. Internal nodes are matched by
        the sorted set of terminal names beneath them; a node with no tips
        beneath it is mapped to itself.
        """
        # performs the correction on the output string if necessary
        self.correctForFastMLNameChanges()

        # one termini string per internal node: all termini under the node,
        # sorted and joined into a single string
        TerminiStringToNodeName_D = {}
        for NodeKey in self.UpperKey_L:
            TerminiStringToNodeName_D['-'.join(sorted(self.Nodes_D[NodeKey]['terminal']))] = NodeKey

        # prepares a cogent tree object for the FastML output
        FH = getInputTempFile(self.FastMLOutputTreeString)
        FastMLCogentTree = LoadTree(FH.name)

        self.FastMLToOriginalMatchedNodes_D = {}
        for FastMLCogentNodeKey in FastMLCogentTree.getNodeNames():
            FastMLCogentNode = FastMLCogentTree.getNodeMatchingName(FastMLCogentNodeKey)
            FastMLTermini_L = [tip.Name for tip in FastMLCogentNode.iterTips()]
            if len(FastMLTermini_L) > 0:
                # same termini string as the equivalent cogent node
                FastMLTerminiString = '-'.join(sorted(FastMLTermini_L))
                self.FastMLToOriginalMatchedNodes_D[FastMLCogentNodeKey] = \
                    TerminiStringToNodeName_D[FastMLTerminiString]
            else:
                # no termini under it: the node itself is a terminus and has
                # the same name in FastML and cogent
                self.FastMLToOriginalMatchedNodes_D[FastMLCogentNodeKey] = FastMLCogentNodeKey

    def setBranchLengths(self):
        """Fill BranchLength_D with the distance from each node to its immediate ancestor."""
        self.BranchLength_D = {}
        for NodeNameKey in self.NodeKey_L:
            HigherNode = self.CogentTree.getNodeMatchingName(NodeNameKey)
            for ImmediateNeighbourNodeNameKey in self.Nodes_D[NodeNameKey]['immediate']:
                LowerNode = self.CogentTree.getNodeMatchingName(ImmediateNeighbourNodeNameKey)
                self.BranchLength_D[ImmediateNeighbourNodeNameKey] = HigherNode.distance(LowerNode)
header = line.split()[0] fileout.write(''.join([">", header, "\n", seqs[header], "\n"])) rawseqs.append((header, seqs[header])) tips.append(header) fileout.close() print "Aligning seqs using muscle with -diags" seqs = LoadSeqs(data=rawseqs, moltype=RNA, aligned=False) aln = align_unaligned_seqs(seqs, RNA, {"-diags": True}) fileout = open(folderout + "/" + basenames + "-seqsaligned.fasta", 'w') fileout.write(str(aln)) fileout.close() print "Folding sequences" #get subtree of the clade being folded to pass to PPfold tr = LoadTree(argv[3]) sub_tree = tr.getSubTree(tips, keep_root=True) filesubtree = open(folderout + "/" + basenames + "-subtreeDistances.nwk", 'w') filesubtree.write(sub_tree.getNewick(with_distances=True)) filesubtree.close() filesubtree = open(folderout + "/" + basenames + "-subtree.nwk", 'w') filesubtree.write(sub_tree.getNewick(with_distances=False)) #call PPfold with aligned sequences and subtree args = ["java", "-jar", PPFOLDDIR + "PPfold.jar", folderout + "/" + basenames + "-seqsaligned.fasta", "--outputd", folderout] check_call(args) print "Converting sequences to vienna" check_call(["ct2b.pl", folderout + basenames + "-seqsaligned.ct", ">",folderout + basenames + "-vienna.txt"]) print "DONE"
def TreeAlign(model, seqs, tree=None, indel_rate=0.01, indel_length=0.01,
              ui=None, ests_from_pairwise=True, param_vals=None):
    """Returns a multiple alignment and tree.

    Uses the provided substitution model and a tree for determining the
    progressive order. If a tree is not provided a Neighbour Joining tree is
    constructed from pairwise distances estimated from pairwise aligning the
    sequences. If running in parallel, only the distance estimation is
    parallelised and only the master CPU returns the alignment and tree,
    other CPU's return None, None.

    Arguments:
        - model: a substitution model
        - seqs: a sequence collection, or a dict keyed by sequence name
        - tree: optional guide tree; its tip names must equal the sequence
          names
        - indel_rate, indel_length: parameters for the progressive pair-HMM
        - ui: object whose display() reports progress; nothing is reported
          when it is None (presumably supplied by a decorator not visible
          here -- TODO confirm)
        - ests_from_pairwise: if no tree provided and True, the median value
          of the substitution model parameters are used
        - param_vals: named key, value pairs for model parameters. These
          override ests_from_pairwise.
    """
    _exclude_params = ['mprobs', 'rate', 'bin_switch']
    # work on a copy so the caller's mapping is never modified
    param_vals = dict(param_vals) if param_vals else {}

    if isinstance(seqs, dict):
        seq_names = list(seqs.keys())
    else:
        seq_names = seqs.getSeqNames()

    two_seqs = len(seq_names) == 2

    if tree:
        tip_names = tree.getTipNames()
        tip_names.sort()
        seq_names.sort()
        assert tip_names == seq_names, \
            "names don't match between seqs and tree: tree=%s; seqs=%s" % \
            (tip_names, seq_names)
        ests_from_pairwise = False
    elif two_seqs:
        # seq_names already handles dict input; calling seqs.getSeqNames()
        # here would fail for a dict
        tree = LoadTree(tip_names=seq_names)
        ests_from_pairwise = False
    else:
        if ests_from_pairwise:
            est_params = [param for param in model.getParamList()
                          if param not in _exclude_params]
        else:
            est_params = None

        dcalc = EstimateDistances(seqs, model, do_pair_align=True,
                                  est_params=est_params)
        dcalc.run()
        dists = dcalc.getPairwiseDistances()
        tree = NJ.nj(dists)

    LF = model.makeLikelihoodFunction(tree.bifurcating(name_unnamed=True),
                                      aligned=False)
    if ests_from_pairwise and not param_vals:
        # we use the Median to avoid the influence of outlier pairs
        param_vals = {}
        for param in est_params:
            numbers = dcalc.getParamValues(param)
            print("Param Estimate Summary Stats: %s" % param)
            print(numbers.summarize())
            param_vals[param] = numbers.Median

    if ui is not None:
        ui.display("Doing %s alignment" % ["progressive", "pairwise"][two_seqs])
    with LF.updatesPostponed():
        for param, val in param_vals.items():
            LF.setParamRule(param, value=val, is_constant=True)
        LF.setParamRule('indel_rate', value=indel_rate, is_constant=True)
        LF.setParamRule('indel_length', value=indel_length, is_constant=True)
        LF.setSequences(seqs)
    edge = LF.getLogLikelihood().edge
    align = edge.getViterbiPath().getAlignment()
    info = Info()
    info["AlignParams"] = param_vals
    info["AlignParams"].update(dict(indel_length=indel_length,
                                    indel_rate=indel_rate))
    align.Info = info
    return align, tree
def test_reroot(self):
    """rootedWithTip('b') should give the expected newick string"""
    expected_newick = "(a,b,((c,d),e));"
    rerooted = LoadTree(treestring="((a,b),(c,d),e)").rootedWithTip('b')
    self.assertEqual(rerooted.getNewick(), expected_newick)
def test_run_pick_de_novo_otus_parallel(self):
    """run_pick_de_novo_otus generates expected results in parallel
    """
    self.params['assign_taxonomy'] = \
        {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
         'reference_seqs_fp': self.test_data['refseqs'][0]}
    self.params['align_seqs'] = \
        {'template_fp': self.test_data['refseqs_aligned'][0]}
    self.params['filter_alignment'] = \
        {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
    actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
        self.test_data['seqs'][0],
        self.test_out,
        call_commands_serially,
        self.params,
        self.qiime_config,
        parallel=True,
        status_update_callback=no_status_updates)

    input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                      '%s_otus.txt' % input_file_basename)
    alignment_fp = join(self.test_out,
                        'pynast_aligned_seqs', '%s_rep_set_aligned.fasta' %
                        input_file_basename)
    failures_fp = join(self.test_out,
                       'pynast_aligned_seqs', '%s_rep_set_failures.fasta' %
                       input_file_basename)
    taxonomy_assignments_fp = join(self.test_out,
                                   'uclust_assigned_taxonomy', '%s_rep_set_tax_assignments.txt' %
                                   input_file_basename)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    self.assertEqual(actual_tree_fp, tree_fp)
    self.assertEqual(actual_otu_table_fp, otu_table_fp)

    input_seqs = LoadSeqs(self.test_data['seqs'][0],
                          format='fasta',
                          aligned=False)

    # Number of OTUs falls within a range that was manually
    # confirmed
    with open(otu_map_fp) as otu_map_f:
        otu_map_lines = list(otu_map_f)
    num_otus = len(otu_map_lines)
    otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
    self.assertEqual(num_otus, 14)

    # all otus get taxonomy assignments
    with open(taxonomy_assignments_fp) as taxonomy_assignments_f:
        taxonomy_assignment_lines = list(taxonomy_assignments_f)
    self.assertEqual(len(taxonomy_assignment_lines), num_otus)

    # number of seqs which aligned + num of seqs which failed to
    # align sum to the number of OTUs
    aln = LoadSeqs(alignment_fp)
    failures = LoadSeqs(failures_fp, aligned=False)
    # assertEqual, not assertTrue: assertTrue would treat num_otus as the
    # failure message and pass for any non-zero sum
    self.assertEqual(aln.getNumSeqs() + failures.getNumSeqs(), num_otus)

    # number of tips in the tree equals the number of sequences that
    # aligned
    tree = LoadTree(tree_fp)
    self.assertEqual(len(tree.tips()), aln.getNumSeqs())

    # parse the otu table
    otu_table = parse_biom_table(open(otu_table_fp, 'U'))
    expected_sample_ids = ['f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1']
    # sample IDs are as expected
    self.assertEqualItems(otu_table.SampleIds, expected_sample_ids)
    # otu ids are as expected
    self.assertEqualItems(otu_table.ObservationIds, otu_map_otu_ids)
    # number of sequences in the full otu table equals the number of
    # input sequences
    number_seqs_in_otu_table = sum([v.sum() for v in otu_table.iterSampleData()])
    self.assertEqual(number_seqs_in_otu_table, input_seqs.getNumSeqs())

    # Check that the log file is created and has size > 0
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)
#!/usr/bin/env python
"""Load the tree in 'test.nw' and print it rerooted with tip "X"."""
from cogent import LoadTree

tr = LoadTree('test.nw')
# single-argument parenthesised print gives the same output on Python 2 and
# is also valid syntax on Python 3
print(tr.rootedWithTip("X"))
def processTree(fstr):
    """Load a tree from the string fstr; return the tree and its converted distances.

    The distances come from the tree's getDistances() and are passed through
    cogent_dist_to_qiime_dist before being returned.
    """
    parsed_tree = LoadTree(treestring=fstr)
    pairwise = parsed_tree.getDistances()
    return parsed_tree, cogent_dist_to_qiime_dist(pairwise)
def gettree(self):
    """Return the subtree of "murphy.tree" (under data_path) restricted to seqnames."""
    tree_path = os.path.join(data_path, "murphy.tree")
    return LoadTree(filename=tree_path).getSubTree(seqnames)
def setUp(self):
    """Store a five-tip reference tree and its pairwise distances on the test case."""
    reference = LoadTree(treestring='((a:3,b:4):2,(c:6,d:7):30,e:5)')
    self.tree = reference
    self.dists = reference.getDistances()
class TreeReconstructionTests(unittest.TestCase):
    """Checks that nj, gnj and wls rebuild a known tree from its distances."""

    def setUp(self):
        reference = LoadTree(treestring='((a:3,b:4):2,(c:6,d:7):30,e:5)')
        self.tree = reference
        self.dists = reference.getDistances()

    def assertTreeDistancesEqual(self, t1, t2):
        """Assert both trees report the same number of, and almost equal, distances."""
        first = t1.getDistances()
        second = t2.getDistances()
        self.assertEqual(len(first), len(second))
        for pair in second:
            self.assertAlmostEqual(first[pair], second[pair])

    def test_nj(self):
        """testing nj"""
        self.assertTreeDistancesEqual(self.tree, nj(self.dists))

    def test_gnj(self):
        """testing gnj"""
        for keep in (1, 10):
            results = gnj(self.dists, keep=keep)
            length, rebuilt = results[0]
            self.assertTreeDistancesEqual(self.tree, rebuilt)

        # Results (from the keep=10 run) should be a TreeCollection
        len(results)
        results.getConsensusTree()

        # From GNJ paper. Pearson, Robins, Zhang 1999.
        tied_dists = {
            ('a', 'b'): 3, ('a', 'c'): 3, ('a', 'd'): 4, ('a', 'e'): 3,
            ('b', 'c'): 3, ('b', 'd'): 3, ('b', 'e'): 4,
            ('c', 'd'): 3, ('c', 'e'): 3,
            ('d', 'e'): 3,
        }
        tied_results = gnj(tied_dists, keep=3)
        scores = [score for score, _tree in tied_results]
        self.assertEqual(scores[:2], [7.75, 7.75])
        self.assertNotEqual(scores[2], 7.75)

    def test_wls(self):
        """testing wls"""
        self.assertTreeDistancesEqual(self.tree, wls(self.dists))

    def test_truncated_wls(self):
        """testing wls with order option"""
        order = ['e', 'b', 'c', 'd']
        rebuilt = wls(self.dists, order=order)
        self.assertEqual(set(rebuilt.getTipNames()), set(order))

    def test_limited_wls(self):
        """testing (well, exercising at least), wls with constrained start"""
        init = LoadTree(treestring='((a,c),b,d)')
        tips = wls(self.dists, start=init).getTipNames()
        self.assertEqual(len(tips), 5)

        init2 = LoadTree(treestring='((a,d),b,c)')
        tips = wls(self.dists, start=[init, init2]).getTipNames()
        self.assertEqual(len(tips), 5)

        init3 = LoadTree(treestring='((a,d),b,e)')
        self.assertRaises(Exception, wls, self.dists, start=[init, init3])

        # if start tree has all seq names, should raise an error
        complete_start = LoadTree(treestring='((a,c),b,(d,e))')
        self.assertRaises(Exception, wls, self.dists, start=[complete_start])