def test_taxon_assignment_and_namespace(self):
    # For a set of fixed and randomly drawn seeds, sample from every model
    # yielded by ``self.iter_psm_models`` under three different termination
    # conditions, and verify that each returned tree is bound (by identity)
    # to the taxon namespace supplied by the caller.
    fixed_seeds = (559, 631, 230, 212, 907, 237)
    # Kept as a lazy generator so the global RNG is consumed only when the
    # chained iteration actually reaches these seeds.
    random_seeds = (random.randint(0, 1000) for _ in range(10))
    termination_conditions = (
        ("max_time", 20),
        ("num_extant_orthospecies", 10),
        ("num_extant_lineages", 20),
    )
    for seed in itertools.chain(fixed_seeds, random_seeds):
        rng = random.Random(seed)
        for psm in self.iter_psm_models(rng=rng):
            for condition_name, condition_value in termination_conditions:
                lineage_tns = dendropy.TaxonNamespace()
                species_tns = dendropy.TaxonNamespace()
                sample_kwargs = {
                    condition_name: condition_value,
                    "lineage_taxon_namespace": lineage_tns,
                    "species_taxon_namespace": species_tns,
                }
                lineage_tree, orthospecies_tree = psm.generate_sample(
                    **sample_kwargs)
                self.assertIs(lineage_tree.taxon_namespace, lineage_tns)
                self.assertIs(orthospecies_tree.taxon_namespace, species_tns)
                for tree in (lineage_tree, orthospecies_tree):
                    self.check(tree)
def test_multiplePrune(self):
    # Reuse a single TreePruner across several taxon sets and check that
    # each pruned copy ends up with exactly the requested labels while the
    # source tree's namespace is left untouched.
    pruner = TreePruner(re.compile("(.*)"), re.compile("(.*)"), True)

    def prune_copy_of_clade(labels):
        # Point the pruner at ``labels``, deep-copy the MRCA subtree of the
        # matching leaves into a tree with a fresh namespace, and prune it.
        pruner.set_taxon_set(list(labels))
        clade_taxa = [
            leaf.taxon
            for leaf in self.tree.leaf_node_iter()
            if leaf.taxon.label in labels
        ]
        clade_root = self.tree.mrca(taxa=clade_taxa)
        clade_tree = dendropy.Tree(
            seed_node=copy.deepcopy(clade_root),
            taxon_namespace=dendropy.TaxonNamespace())
        pruner.prune(clade_tree)
        return clade_tree

    def namespace_labels(tree):
        return [taxon.label for taxon in tree.taxon_namespace]

    # First pass over B/C: the result is discarded; it only exercises the
    # pruner before it is retargeted.
    prune_copy_of_clade(("B", "C"))

    tree_to_prune = prune_copy_of_clade(("A", "A2"))
    self.assertEqual(namespace_labels(tree_to_prune), ["A", "A2"])

    tree_to_prune2 = prune_copy_of_clade(("B", "C"))
    self.assertEqual(namespace_labels(tree_to_prune2), ["B", "C"])

    self.assertEqual(namespace_labels(self.tree), ["A", "B", "A2", "C"])
def setUp(self):
    """Build the fixture of taxon namespaces, tree lists and char matrices.

    Populates:

    - ``standalone_taxon_namespaces``: two namespaces not tied to any data
      object.
    - ``expected_tree_lists`` / ``expected_char_matrices``: ordered maps of
      each created data object to its taxon namespace.
    - ``expected_taxon_namespaces``: the standalone namespaces followed by
      the namespace of every "primary" tree list and character matrix.
    """
    self.standalone_taxon_namespaces = [
        dendropy.TaxonNamespace(["t1", "t2", "t3"]),
        dendropy.TaxonNamespace(["s1", "s2", "s3"]),
    ]
    self.expected_taxon_namespaces = list(self.standalone_taxon_namespaces)

    # Two primary tree lists, each with two further lists sharing its
    # namespace.
    self.expected_tree_lists = collections.OrderedDict()
    for _ in range(2):
        primary = curated_test_tree_list.get_tree_list(4)
        self.expected_tree_lists[primary] = primary.taxon_namespace
        self.expected_taxon_namespaces.append(primary.taxon_namespace)
        for _ in range(2):
            sharing = curated_test_tree_list.get_tree_list(
                4, taxon_namespace=primary.taxon_namespace)
            self.expected_tree_lists[sharing] = sharing.taxon_namespace

    # Two primary DNA matrices, each with two protein matrices sharing its
    # namespace.
    self.expected_char_matrices = collections.OrderedDict()
    dna_checker = standard_file_test_chars.DnaTestChecker
    protein_checker = standard_file_test_chars.ProteinTestChecker
    for _ in range(2):
        primary = dna_checker.get_char_matrix_from_class_data()
        self.expected_char_matrices[primary] = primary.taxon_namespace
        self.expected_taxon_namespaces.append(primary.taxon_namespace)
        for _ in range(2):
            sharing = protein_checker.get_char_matrix_from_class_data(
                taxon_namespace=primary.taxon_namespace)
            self.expected_char_matrices[sharing] = sharing.taxon_namespace
def setup(mainDir,conFiles,folders = True): os.chdir(mainDir) # For each nexus file for nex in glob.glob('*nex'): # Make folder for locus if folders == True: # Make a folder and move MSA file into folder dirPath,fName = makeFolders(nex, mainDir) else: # Get locus name fName=os.path.split(n)[0] # Create paths dirPath = os.path.join(mainDir,fName) print(dirPath) # Move into folder and grab constraint files os.chdir(dirPath) os.system("cp ../*.constraint .") # Open locus sequence alignment, get taxa names locusTaxa = dendropy.TaxonNamespace() alignment = dendropy.DnaCharacterMatrix.get(path=nex, schema="nexus", taxon_namespace=locusTaxa) locusList = locusTaxa.labels() for f in conFiles: editFile(nex,fName,f,locusList) # When done with locus os.chdir(mainDir)
def test1(self):
    # Score each reference coalescent tree against its species tree and
    # compare the log-likelihood with the value stored in the JSON fixture
    # (to 2 decimal places).
    fixture_path = pathmap.other_source_path(
        "multispecies_coalescent_test_data.json")
    with open(fixture_path) as src:
        test_regimes = json.load(src)

    for test_regime in test_regimes:
        species_tree = dendropy.Tree.get(
            data=test_regime["species_tree"],
            schema="newick",
            rooting="force-rooted",
        )
        # Freeze the namespace so the lookups below cannot add taxa.
        species_tree.taxon_namespace.is_mutable = False
        msc = multispeciescoalescent.MultispeciesCoalescent(
            species_tree=species_tree)
        coalescent_species_lineage_label_map = test_regime[
            "coalescent_species_lineage_label_map"]

        def coalescent_species_lineage_map_fn(lineage_taxon):
            # Map a coalescent-tree taxon to its species-tree taxon by label.
            species_label = coalescent_species_lineage_label_map[
                lineage_taxon.label]
            return species_tree.taxon_namespace.require_taxon(species_label)

        coalescent_taxa = dendropy.TaxonNamespace(
            sorted(coalescent_species_lineage_label_map))
        coalescent_taxa.is_mutable = False

        for sub_regime in test_regime["coalescent_trees"]:
            coalescent_tree = dendropy.Tree.get(
                data=sub_regime["coalescent_tree"],
                schema="newick",
                rooting="force-rooted",
                taxon_namespace=coalescent_taxa,
            )
            observed = msc.score_coalescent_tree(
                coalescent_tree=coalescent_tree,
                coalescent_species_lineage_map_fn=coalescent_species_lineage_map_fn,
            )
            expected = sub_regime["log_likelihood"]
            self.assertAlmostEqual(observed, expected, 2)
def get_backbone_tree(tree1, tree2):
    """Constrain a copy of tree1 to the leaf set it shares with tree2.

    The input trees are not modified; all pruning happens on a deep copy of
    ``tree1``.

    Parameters
    ----------
    tree1 : dendropy tree object
    tree2 : dendropy tree object

    Returns
    -------
    dendropy tree object
        Copy of ``tree1`` restricted to the shared leaves, migrated to a new
        taxon namespace built from those labels, with bipartitions encoded.
    """
    backbone = deepcopy(tree1)
    own_leaves = njmergepair.get_leaf_set(backbone)
    other_leaves = njmergepair.get_leaf_set(tree2)
    common_labels = list(own_leaves.intersection(other_leaves))

    backbone.retain_taxa_with_labels(common_labels)
    backbone.migrate_taxon_namespace(dendropy.TaxonNamespace(common_labels))
    backbone.encode_bipartitions()
    return backbone
def setUp(self):
    """Create the shared taxa, sequence and tree fixtures.

    ``namespace1`` holds five labels, while the metadata, the integer
    sequence array and the DNA matrix cover only four of them
    ("tree" is left out).
    """
    self.namespace1 = dendropy.TaxonNamespace(
        ["dog", "cat", "snake", "fish", "tree"])
    taxon_order = ["fish", "snake", "cat", "dog"]
    self.taxa1 = spectraltree.TaxaMetadata(
        self.namespace1, taxon_order, "DNA")
    self.dog = self.namespace1.get_taxon("dog")
    self.snake = self.namespace1.get_taxon("snake")

    # One row per taxon, in ``taxon_order``.
    self.array1 = np.array([
        [3, 1, 1, 0, 0],
        [3, 2, 1, 0, 0],
        [3, 2, 4, 0, 0],
        [2, 2, 0, 0, 0],
    ])
    self.tax2seq = {
        label: self.array1[row, :]
        for row, label in enumerate(("fish", "snake", "cat", "dog"))
    }

    sequences_by_label = {
        "fish": "TCCAA",
        "snake": "TGCAA",
        "cat": "TG-AA",
        "dog": "GGAAA",
    }
    self.dna_charmatrix = dendropy.DnaCharacterMatrix.from_dict(
        sequences_by_label, taxon_namespace=self.namespace1)

    self.tree = spectraltree.lopsided_tree(4, self.taxa1)
    self.dm = self.tree.phylogenetic_distance_matrix()
def testAttachTaxonNamespaceOnGet(self):
    # A namespace passed to ``DataSet.get_from_path`` becomes the attached
    # namespace; later reads reuse it until it is detached, after which a
    # new read adds a second namespace.
    supplied_tns = dendropy.TaxonNamespace()
    ds = dendropy.DataSet.get_from_path(
        pathmap.mixed_source_path('reference_single_taxonset_dataset.nex'),
        "nexus",
        taxon_namespace=supplied_tns)

    def assert_namespace_sizes(*expected_sizes):
        # Check the number of namespaces, then the size of each in order.
        self.assertEqual(len(ds.taxon_namespaces), len(expected_sizes))
        for position, expected in enumerate(expected_sizes):
            self.assertEqual(len(ds.taxon_namespaces[position]), expected)

    self.assertEqual(len(ds.taxon_namespaces), 1)
    self.assertIsNot(ds.attached_taxon_namespace, None)
    self.assertIs(ds.taxon_namespaces[0], ds.attached_taxon_namespace)
    self.assertIs(ds.attached_taxon_namespace, supplied_tns)
    self.assertEqual(len(ds.taxon_namespaces[0]), 33)

    ds.read(
        path=pathmap.tree_source_path('pythonidae.mle.nex'),
        schema="nexus")
    assert_namespace_sizes(33)

    ds.read(
        path=pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        schema="newick")
    assert_namespace_sizes(33)

    ds.detach_taxon_namespace()
    ds.read_from_path(
        pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        schema="fasta",
        data_type="protein")
    assert_namespace_sizes(33, 114)
def generate_contained_trees(
        containing_tree,
        contained_taxon_namespace=None,
        population_size=1,
        num_individuals_per_population=4,
        num_gene_trees=5,
        rng=None):
    """Simulate gene trees embedded in ``containing_tree``.

    Every taxon of the containing tree receives
    ``num_individuals_per_population`` gene taxa, labelled
    ``"<sp>_<i>^<sp>_<i>"`` with ``i`` starting at 1 and tagged with a
    ``population_label`` attribute.

    Parameters
    ----------
    containing_tree : dendropy tree
        Must have a non-empty taxon namespace.
    contained_taxon_namespace : dendropy.TaxonNamespace, optional
        Namespace for the gene taxa; a new one is created when omitted.
    population_size : number
        Forwarded as ``default_pop_size`` to ``embed_contained_kingman``.
    num_individuals_per_population : int
    num_gene_trees : int
    rng : optional
        Forwarded to ``embed_contained_kingman``.

    Returns
    -------
    dendropy.TreeList
        The ``num_gene_trees`` embedded trees.
    """
    if contained_taxon_namespace is None:
        contained_taxon_namespace = dendropy.TaxonNamespace()
    gene_to_species = {}
    assert len(containing_tree.taxon_namespace) > 0

    for species_taxon in containing_tree.taxon_namespace:
        for individual in range(1, num_individuals_per_population + 1):
            gene_label = "{sp}_{ind}^{sp}_{ind}".format(
                sp=species_taxon.label, ind=individual)
            gene_taxon = contained_taxon_namespace.require_taxon(
                label=gene_label)
            gene_taxon.population_label = species_taxon.label
            gene_to_species[gene_taxon] = species_taxon

    embedding = reconcile.ContainingTree(
        containing_tree=containing_tree,
        contained_taxon_namespace=contained_taxon_namespace,
        contained_to_containing_taxon_map=gene_to_species)

    gene_trees = dendropy.TreeList(taxon_namespace=contained_taxon_namespace)
    for _ in range(num_gene_trees):
        gene_trees.append(
            embedding.embed_contained_kingman(
                default_pop_size=population_size, rng=rng))
    return gene_trees
def check(self, title, src_prefix):
    """Assert that DendroPy-pruned and PAUP-pruned trees are identical.

    Loads ``<src_prefix>.dendropy-pruned.nex`` and
    ``<src_prefix>.paup-pruned.nex`` on a shared taxon namespace and
    requires a symmetric difference of 0 for every pair of trees at the
    same position.

    ``title`` is used only in the debug log message.
    """
    tns = dendropy.TaxonNamespace()
    pruned_by_dendropy = dendropy.DataSet.get_from_path(
        src=pathmap.tree_source_path(src_prefix + ".dendropy-pruned.nex"),
        schema='nexus',
        attached_taxon_namespace=tns)
    shared_taxa = pruned_by_dendropy.taxon_namespaces[0]
    pruned_by_paup = dendropy.DataSet.get_from_path(
        src=pathmap.tree_source_path(src_prefix + ".paup-pruned.nex"),
        schema='nexus',
        taxon_namespace=shared_taxa)

    for set_idx, src_trees in enumerate(pruned_by_dendropy.tree_lists):
        ref_trees = pruned_by_paup.tree_lists[set_idx]
        for tree_idx, src_tree in enumerate(src_trees):
            progress = (title, set_idx + 1,
                        len(pruned_by_dendropy.tree_lists), tree_idx + 1,
                        len(src_trees))
            _LOG.debug("%s Set %d/%d, Tree %d/%d" % progress)
            ref_tree = ref_trees[tree_idx]
            self.assertEqual(
                treecompare.symmetric_difference(src_tree, ref_tree), 0)
def test_basic_migration(self):
    # Migrate a character matrix to a new, case-sensitive namespace without
    # unifying taxa by label, then verify that the matrix uses the new
    # namespace, that no sequence was lost, and that every taxon now
    # belongs to the new namespace only.
    char_matrix = self.get_char_matrix()
    old_tns = char_matrix.taxon_namespace
    new_tns = dendropy.TaxonNamespace()
    new_tns.is_case_sensitive = True
    char_matrix.migrate_taxon_namespace(new_tns, unify_taxa_by_label=False)

    self.assertIsNot(char_matrix.taxon_namespace, old_tns)
    self.assertIs(char_matrix.taxon_namespace, new_tns)
    self.assertEqual(len(char_matrix), char_matrix.nseqs)
    self.assertEqual(len(char_matrix), len(char_matrix.original_seqs))
    assert len(char_matrix) == len(char_matrix._taxon_sequence_map)

    if len(char_matrix.taxon_namespace) != len(old_tns):
        # Diagnostic aid: show which labels are missing after migration
        # before the size assertion below fails.
        migrated_counts = collections.Counter(
            [t.label for t in char_matrix.taxon_namespace])
        original_counts = collections.Counter([t.label for t in old_tns])
        print(original_counts - migrated_counts)
    self.assertEqual(len(char_matrix.taxon_namespace), len(old_tns))

    original_labels = [t.label for t in old_tns]
    new_labels = [t.label for t in new_tns]
    self.assertCountEqual(new_labels, original_labels)

    for taxon in char_matrix:
        self.assertIn(taxon, char_matrix.taxon_namespace)
        self.assertNotIn(taxon, old_tns)
        seq = char_matrix[taxon]
        self.assertIs(seq, seq.original_seq)
        self.assertIn(seq, char_matrix.original_seqs)
        # Tick the sequence off so leftovers can be detected afterwards.
        char_matrix.original_seqs.remove(seq)
    self.assertEqual(char_matrix.original_seqs, [])
def test_reconstruct_taxon_namespace_unifying_case_sensitive_fail(self):
    # With a case-sensitive namespace swapped in directly, reconstructing
    # while unifying taxa by label must raise
    # TaxonNamespaceReconstructionError for a matrix with label collisions.
    char_matrix = self.get_char_matrix_with_case_insensitive_and_case_sensitive_label_collisions()
    case_sensitive_tns = dendropy.TaxonNamespace()
    case_sensitive_tns.is_case_sensitive = True
    char_matrix._taxon_namespace = case_sensitive_tns
    self.assertRaises(
        error.TaxonNamespaceReconstructionError,
        char_matrix.reconstruct_taxon_namespace,
        unify_taxa_by_label=True)
def calc_robinson_foulds_distance(
        trees_dir='out/phylogenetic_trees',
        output_path='out/robinson_foulds_distances_between_trees.json'):
    """Compute pairwise weighted Robinson-Foulds distances between trees.

    Every file in ``trees_dir`` is read as a NEXUS tree. Each unordered
    pair is compared once, the distance is rounded to 4 decimal places,
    printed, and stored under ``result[first][second]`` where ``first``
    sorts before ``second``. Every file name gets a (possibly empty) entry.
    The mapping is written to ``output_path`` as JSON.

    Parameters
    ----------
    trees_dir : str, optional
        Directory holding the tree files.
    output_path : str, optional
        Destination of the JSON report.
    """
    print("Robinson Foulds Distances between Trees")
    print("---------------------------------------")

    # List the directory once; sorting makes the output order deterministic
    # and lets the inner loop visit only names that sort after ``tree_i``.
    tree_files = sorted(os.listdir(trees_dir))
    robinson_foulds_distances = {name: {} for name in tree_files}

    for position, tree_i in enumerate(tree_files):
        for tree_j in tree_files[position + 1:]:
            # Both trees must share one namespace to be comparable.
            taxon_nmspce = dendropy.TaxonNamespace()
            treeA = dendropy.Tree.get_from_path(
                os.path.join(trees_dir, tree_i),
                'nexus',
                taxon_namespace=taxon_nmspce,
            )
            treeB = dendropy.Tree.get_from_path(
                os.path.join(trees_dir, tree_j),
                "nexus",
                taxon_namespace=taxon_nmspce,
            )
            treeA.encode_bipartitions()
            treeB.encode_bipartitions()
            distance = round(
                dendropy.calculate.treecompare.
                weighted_robinson_foulds_distance(treeA, treeB), 4)
            robinson_foulds_distances[tree_i][tree_j] = distance
            print(tree_i, "AND", tree_j, distance, "\n")

    with open(output_path, 'w') as report:
        json.dump(robinson_foulds_distances, report)
def generate_contained_trees(
        containing_tree,
        contained_taxon_namespace=None,
        population_size=1,
        total_number_of_individuals=200,
        num_gene_trees=5,
        rng=None):
    """Simulate gene trees embedded in a processed copy of ``containing_tree``.

    ``process_containing_tree_for_gene_samples`` is applied first; each leaf
    of its result then contributes ``num_individuals_sampled`` gene taxa
    labelled ``"<sp>_<i>^<sp>"`` (``i`` starting at 1) and tagged with a
    ``population_label`` attribute.

    Parameters
    ----------
    containing_tree : dendropy tree
        Must have a non-empty taxon namespace.
    contained_taxon_namespace : dendropy.TaxonNamespace, optional
        Namespace for the gene taxa; a new one is created when omitted.
    population_size : number
        Forwarded as ``default_pop_size`` to ``embed_contained_kingman``.
    total_number_of_individuals : int
        Forwarded to ``process_containing_tree_for_gene_samples``.
    num_gene_trees : int
    rng : optional
        Forwarded to the processing step and to the embedding.

    Returns
    -------
    tuple
        ``(containing_tree, gene_trees)``: the processed containing tree and
        a ``dendropy.TreeList`` of the embedded gene trees.
    """
    if contained_taxon_namespace is None:
        contained_taxon_namespace = dendropy.TaxonNamespace()
    gene_to_species = {}
    assert len(containing_tree.taxon_namespace) > 0

    containing_tree = process_containing_tree_for_gene_samples(
        containing_tree=containing_tree,
        total_number_of_individuals=total_number_of_individuals,
        rng=rng)

    for species_node in containing_tree.leaf_nodes():
        species_taxon = species_node.taxon
        for individual in range(1, species_node.num_individuals_sampled + 1):
            gene_label = "{sp}_{ind}^{sp}".format(
                sp=species_taxon.label, ind=individual)
            gene_taxon = contained_taxon_namespace.require_taxon(
                label=gene_label)
            gene_taxon.population_label = species_taxon.label
            gene_to_species[gene_taxon] = species_taxon

    embedding = reconcile.ContainingTree(
        containing_tree=containing_tree,
        contained_taxon_namespace=contained_taxon_namespace,
        contained_to_containing_taxon_map=gene_to_species)

    gene_trees = dendropy.TreeList(taxon_namespace=contained_taxon_namespace)
    for _ in range(num_gene_trees):
        gene_trees.append(
            embedding.embed_contained_kingman(
                default_pop_size=population_size, rng=rng))
    return containing_tree, gene_trees
def main(args=None):
    """Simulate a tree for a list of taxa and write it to stdout.

    Parameters
    ----------
    args : dict
        Option mapping keyed by ``'--birth_rate'``, ``'--death_rate'``,
        ``'--birth_rate_sd'``, ``'--death_rate_sd'``, ``'--star'``,
        ``'--outfmt'``, ``'<genome_list>'`` and ``'<comm_file>'``. The four
        rate entries are converted to ``float`` in place. Despite the
        ``None`` default, a mapping must be supplied: ``None`` fails on the
        first lookup.

    Raises
    ------
    AssertionError
        If ``--outfmt`` is neither ``newick`` nor ``nexus``
        (case-insensitive).
    ValueError
        If neither ``<genome_list>`` nor ``<comm_file>`` is given.
    """
    for param in ['birth_rate', 'death_rate', 'birth_rate_sd',
                  'death_rate_sd']:
        option = '--' + param
        args[option] = float(args[option])

    # Validate the output format before doing any expensive work, so a typo
    # fails immediately instead of after the simulation.
    outfmt = args['--outfmt'].lower()
    psbl_fmts = ['newick', 'nexus']
    assert outfmt in psbl_fmts, 'output file format not recognized.' +\
        ' Possible formats: {}'.format(', '.join(psbl_fmts))

    # loading taxon list
    if args['<genome_list>'] is not None:
        taxa = Utils.parseGenomeList(args['<genome_list>'],
                                     check_exists=False)
        taxa = [x[0] for x in taxa]
    elif args['<comm_file>'] is not None:
        comm = CommTable.from_csv(args['<comm_file>'], sep='\t')
        taxa = comm.get_unique_taxon_names()
    else:
        # Previously this fell through and crashed with an unbound ``taxa``.
        raise ValueError(
            'no taxon source given: supply <genome_list> or <comm_file>')

    # init dendropy taxon namespace
    taxa = dendropy.TaxonNamespace(taxa, label='taxa')

    # simulating tree
    if args['--star']:
        tree = star_tree(taxon_set=taxa)
    else:
        tree = birth_death(args['--birth_rate'],
                           args['--death_rate'],
                           birth_rate_sd=args['--birth_rate_sd'],
                           death_rate_sd=args['--death_rate_sd'],
                           num_extant_tips=len(taxa))

    # writing tree
    tree.write_to_stream(sys.stdout, outfmt)
def runTest(self):
    """PureCoalescentTreeTest -- tree generation without checking [TODO: checks]"""
    mock_rng = MockRandom()
    # 100 taxa labelled t1 .. t100.
    labels = ["t{}".format(number) for number in range(1, 101)]
    taxon_namespace = dendropy.TaxonNamespace(labels)
    kingman_tree = coalescent.pure_kingman_tree(taxon_namespace, rng=mock_rng)
    assert kingman_tree._debug_tree_is_valid()
def testBoundTaxonNamespaceDefault(self):
    # With a namespace attached up front, every subsequent read must reuse
    # it: the dataset keeps exactly one namespace, which grows only when a
    # source brings new labels.
    ds = dendropy.DataSet()
    attached_tns = dendropy.TaxonNamespace()
    ds.attach_taxon_namespace(attached_tns)
    self.assertEqual(len(ds.taxon_namespaces), 1)
    self.assertIs(ds.taxon_namespaces[0], ds.attached_taxon_namespace)

    def read_and_expect(expected_size, **read_kwargs):
        # Read one source, then check the namespace count and its size.
        ds.read(**read_kwargs)
        self.assertEqual(len(ds.taxon_namespaces), 1)
        self.assertEqual(len(ds.taxon_namespaces[0]), expected_size)

    read_and_expect(
        33,
        path=pathmap.mixed_source_path(
            'reference_single_taxonset_dataset.nex'),
        schema="nexus")
    read_and_expect(
        33,
        path=pathmap.tree_source_path('pythonidae.mle.nex'),
        schema="nexus")
    read_and_expect(
        33,
        path=pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        schema="newick")
    read_and_expect(
        147,
        path=pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        schema="fasta",
        data_type="protein")
def generate_star_tree2(num_tips=10, branch_length=1):
    """Build a star tree whose root and tips all carry taxa.

    The root is assigned taxon ``s0`` with ``X = 0`` and ``time = 0``. Each
    of the ``num_tips`` children gets taxon ``s1`` .. ``s<num_tips>``, edge
    length ``branch_length``, ``time = branch_length`` and a trait ``X``
    drawn from ``random.gauss(0, 1)``.

    The previous implementation never advanced its node counter, so it
    re-initialised the root on every iteration and attached no tips at all.

    Parameters
    ----------
    num_tips : int, optional
        Number of children attached to the root (default 10).
    branch_length : number, optional
        Length of every root-to-tip edge (default 1).

    Returns
    -------
    dendropy.Tree
    """
    names = ["s" + str(i) for i in range(num_tips + 1)]
    taxon_namespace = dendropy.TaxonNamespace(names)
    tree = dendropy.Tree(taxon_namespace=taxon_namespace)

    root = tree.seed_node
    root.taxon = taxon_namespace.get_taxon("s" + str(0))
    root.X = 0
    root.time = 0

    for index in range(1, num_tips + 1):
        node = dendropy.Node(
            taxon=taxon_namespace.get_taxon("s" + str(index)))
        node.edge_length = branch_length
        node.X = random.gauss(0, 1)
        node.time = branch_length
        root.add_child(node)
    return tree
def test_attached_taxon_namespace(self):
    # Reading a multi-taxa file with an explicit namespace: the checks are
    # delegated to ``verify_attached_taxon_namespace``.
    expected_tns = dendropy.TaxonNamespace()
    source_path = pathmap.mixed_source_path('multitaxa_mesquite.nex')
    dataset = dendropy.DataSet.get_from_path(
        source_path, "nexus", taxon_namespace=expected_tns)
    self.verify_attached_taxon_namespace(dataset, expected_tns)
def generate_birthdeath_tree(num_extinct, br, dr):
    """Simulate a birth-death tree and label every node with a taxon.

    The tree is simulated with extinct tips retained and flagged. All nodes
    (not only leaves) are then assigned taxa ``s1``, ``s2``, ... in preorder
    from a fresh namespace, and the tree is passed through ``prune_nodes``
    and ``calculate_times``.

    Parameters
    ----------
    num_extinct : int
        Forwarded as ``num_extinct_tips``.
    br : number
        Birth rate.
    dr : number
        Death rate.

    Returns
    -------
    The tree returned by ``calculate_times``.
    """
    tree = treesim.birth_death_tree(
        birth_rate=br,
        death_rate=dr,
        num_extinct_tips=num_extinct,
        is_retain_extinct_tips=True,
        is_add_extinct_attr=True)

    # One label per node, numbered from 1 in preorder.
    node_total = sum(1 for _ in tree.preorder_node_iter())
    labels = ["s" + str(number) for number in range(1, node_total + 1)]
    tree.taxon_namespace = dendropy.TaxonNamespace(labels)

    for number, node in enumerate(tree.preorder_node_iter(), start=1):
        node.taxon = tree.taxon_namespace.get_taxon("s" + str(number))

    tree = prune_nodes(tree)
    # distance to root
    tree = calculate_times(tree)
    return tree
def getDiploid(self):
    """
    Let the user choose the diploid species.

    Shows a warning and returns if no input files are loaded. Otherwise
    builds a taxon namespace from ``self.taxa_names``, opens the diploid
    list dialog with it, and stores the dialog's selection in
    ``self.diploidList`` when the dialog is accepted.
    """
    # Guard: nothing to choose from until data has been uploaded.
    if len(self.inputFiles) == 0:
        QMessageBox.warning(self, "Warning",
                            "Please select a file type and upload data!",
                            QMessageBox.Ok)
        return

    # Create a taxon namespace from the current set of taxa names.
    namespace = dendropy.TaxonNamespace()
    for name in list(self.taxa_names):
        namespace.add_taxon(dendropy.Taxon(name))

    dialog = diploidList.DiploidListDlg(namespace, self.diploidList, self)
    if dialog.exec_():
        # Accepted: update the diploid species list.
        self.diploidList = dialog.getDiploidSpeciesList()
def setUp(self):
    """Load a rooted tree and a continuous character matrix on shared taxa.

    Sets ``self.tree`` (parsed from a Newick string) and
    ``self.char_matrix`` (parsed from a NEXUS DATA block); both use the same
    taxon namespace.
    """
    # NOTE(review): both literals are reproduced exactly as they appear in
    # this file; confirm against the upstream fixture before editing them.
    newick_source = "[&R] ((((H**o:0.21,Bogus1:0.23,Pongo:0.21)N1:0.28,Bogus2:0.49,Macaca:0.49)N2:0.13,Bogus3:0.62,Ateles:0.62)N3:0.38,Galago:1.00)N4:0.0;"
    nexus_source = """ #NEXUS BEGIN DATA; DIMENSIONS NTAX=8 NCHAR=2; FORMAT DATATYPE = CONTINUOUS GAP = - MISSING = ?; MATRIX H**o 4.09434 4.74493 Pongo 3.61092 3.33220 Macaca 2.37024 3.36730 Ateles 2.02815 2.89037 Galago -1.46968 2.30259 Bogus1 2.15 2.15 Bogus2 2.15 2.15 Bogus3 2.15 2.15 ; END; """
    shared_taxa = dendropy.TaxonNamespace()
    self.tree = dendropy.Tree.get_from_string(
        newick_source, 'newick', taxon_namespace=shared_taxa)
    self.char_matrix = dendropy.ContinuousCharacterMatrix.get_from_string(
        nexus_source, 'nexus', taxon_namespace=shared_taxa)
def rf_weighted(tree_object1, tree_object2):
    """Return the weighted Robinson-Foulds distance between two trees.

    Each input is serialised with its own ``newick(root)`` method, re-read
    by DendroPy as a force-rooted tree on a shared taxon collection, and
    compared with ``weighted_robinson_foulds_distance``.

    The shared taxon collection is passed as ``taxon_namespace`` for
    DendroPy major version 4 and later, and as ``taxon_set`` for earlier
    versions. Previously only the exact majors '3' and '4' were handled, so
    any other version (e.g. 5.x) failed with unbound tree variables.

    Parameters
    ----------
    tree_object1, tree_object2 : objects exposing ``root`` and
        ``newick(root)``.

    Returns
    -------
    The distance reported by DendroPy.
    """
    tree_newick1 = tree_object1.newick(tree_object1.root) + ";"
    tree_newick2 = tree_object2.newick(tree_object2.root) + ";"

    try:
        major_version = int(dendropy.__version__.split(".")[0])
    except ValueError:
        # Unparseable version string: assume the current API.
        major_version = 4

    # The same taxa object is used for both trees so that their
    # bipartitions are comparable.
    if major_version >= 4:
        taxa_kwargs = {"taxon_namespace": dendropy.TaxonNamespace()}
    else:
        taxa_kwargs = {"taxon_set": dendropy.TaxonSet()}

    tree1 = dendropy.Tree.get(data=tree_newick1,
                              schema='newick',
                              rooting='force-rooted',
                              **taxa_kwargs)
    tree2 = dendropy.Tree.get(data=tree_newick2,
                              schema='newick',
                              rooting='force-rooted',
                              **taxa_kwargs)
    tree1.encode_bipartitions()
    tree2.encode_bipartitions()
    return dendropy.calculate.treecompare.weighted_robinson_foulds_distance(
        tree1, tree2)
def dist_from_files(cl1, cl2, distance_method=False):
    """Compute a tree distance between two Newick tree files.

    The first tree is read from ``treedir + "/" + cl1 +
    ".phy_phyml_tree.txt"`` (``treedir`` is a module-level name); the second
    is read from the path ``cl2`` using the first tree's taxon namespace.

    Parameters
    ----------
    cl1 : str
        Name used to build the first tree's path.
    cl2 : str
        Path of the second tree file.
    distance_method : str
        One of ``"symmetric"``, ``"weightedRF"``, ``"euclidean"``,
        ``"quartet"`` or ``"triplet"``.

    Returns
    -------
    The value returned by the selected distance function.

    Notes
    -----
    Prints an error and exits with status 1 when ``distance_method`` is
    omitted or unknown. The "not found" message was previously a bare
    string expression and was never shown. The lookup is now the only
    statement guarded against ``KeyError``, so a ``KeyError`` raised inside
    a distance function is no longer misreported as an unknown method.
    """
    if distance_method is False:
        print("ERROR: must provide distance method")
        sys.exit(1)

    distance_functions = {
        "symmetric": dp.calculate.treecompare.symmetric_difference,
        "weightedRF":
        dp.calculate.treecompare.weighted_robinson_foulds_distance,
        "euclidean": dp.calculate.treecompare.euclidean_distance,
        "quartet": quartet_distance,
        "triplet": triplet_distance,
    }
    # Resolve the metric before touching the file system so an unknown
    # method fails fast.
    try:
        compute_distance = distance_functions[distance_method]
    except KeyError:
        print("ERROR: distance method {0} not found".format(distance_method))
        sys.exit(1)

    t1 = dp.Tree.get_from_path(src=treedir + "/" + cl1 +
                               ".phy_phyml_tree.txt",
                               schema="newick")
    t2 = dp.Tree.get_from_path(src=cl2,
                               schema="newick",
                               taxon_namespace=t1.taxon_namespace)
    return compute_distance(t1, t2)
def get_random_tree():
    """Simulate a tree and annotate it with per-group leaf sets and subtrees.

    Ten taxa are created for every id in
    ``AssemblageInducedTreeManagerTests.GROUP_IDS`` (labels are the id
    followed by 0-9) and a birth-death tree is simulated over them. For each
    group the tree gains, in ``GROUP_IDS`` order:

    - an entry in ``assemblage_leaf_sets``: the set of leaf nodes whose
      label starts with the group id;
    - an entry in ``assemblage_classification_regime_subtrees``: the tree
      extracted with a filter accepting untaxoned nodes and nodes of that
      group.
    """
    # Import retained from the original implementation.
    from dendropy.model import birthdeath
    group_ids = AssemblageInducedTreeManagerTests.GROUP_IDS

    tns = dendropy.TaxonNamespace()
    for group_id in group_ids:
        for member_number in range(10):
            tns.require_taxon(
                label="{}{}".format(group_id, member_number))

    tree = dendropy.simulate.birth_death_tree(
        birth_rate=0.1,
        death_rate=0.0,
        taxon_namespace=tns,
        num_extant_tips=len(tns))
    tree.assemblage_leaf_sets = []
    tree.assemblage_classification_regime_subtrees = []

    for group_id in group_ids:

        def in_group(nd):
            # Accept nodes without a taxon and nodes of the current group.
            return nd.taxon is None or nd.taxon.label.startswith(group_id)

        group_subtree = tree.extract_tree(node_filter_fn=in_group)
        group_leaves = set()
        for leaf_nd in tree.leaf_node_iter():
            if not in_group(leaf_nd):
                continue
            assert leaf_nd.taxon.label.startswith(group_id)
            group_leaves.add(leaf_nd)
        tree.assemblage_leaf_sets.append(group_leaves)
        tree.assemblage_classification_regime_subtrees.append(group_subtree)

    assert len(tree.assemblage_classification_regime_subtrees) == len(
        AssemblageInducedTreeManagerTests.GROUP_IDS)
    return tree
def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Constructs labels for given files. (Best phylogeny reconstruction method)

    For every alignment the NJ, UPGMA and RAxML trees stored in
    ``tree_path`` are compared with the reference tree by symmetric
    difference; the label is the index of the closest one
    (0 = NJ, 1 = UPGMA, 2 = RAxML).

    :param files: an iterable with file paths to alignments; it is
        materialised once, so one-shot iterators are accepted
    :param etalon_tree: the path to etalon tree, relative to ``tree_path``
    :param tree_path: a directory, where built trees will be stored
    :param rebuild: set it True, if you need to rebuild trees or build them
        from scratch
    :return: numpy array with one label per file
    """
    # ``files`` is traversed twice and its length is needed below, so a
    # generator would previously have failed or been exhausted.
    files = list(files)
    tree_path = osp.abspath(tree_path)  # raxml needs absolute paths
    # Tree files are keyed by the alignment's base name up to the first dot.
    names = [osp.basename(path).split(".")[0] for path in files]

    if rebuild:
        calculator = TreeConstruction.DistanceCalculator('blosum62')
        dist_constructor = TreeConstruction.DistanceTreeConstructor()
        # construct all trees with UPGMA, NJ and raxml
        for path, name in zip(files, names):
            aln = AlignIO.read(path, 'fasta')
            # Computed once and shared by both distance-based methods.
            dist_matrix = calculator.get_distance(aln)
            tree = dist_constructor.upgma(dist_matrix)
            Phylo.write(tree,
                        osp.join(tree_path, 'upgma_{}.tre'.format(name)),
                        'newick')
            tree = dist_constructor.nj(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'nj_{}.tre'.format(name)),
                        'newick')
            raxml = RaxmlCommandline(sequences=osp.abspath(path),
                                     model='PROTCATWAG',
                                     name='{}.tre'.format(name),
                                     threads=3,
                                     working_dir=tree_path)
            _, stderr = raxml()
            print(stderr)
            print('{} finished'.format(name))

    # get best tree
    tns = dendropy.TaxonNamespace()
    act_tree = dendropy.Tree.get_from_path(osp.join(tree_path, etalon_tree),
                                           "newick",
                                           taxon_namespace=tns)
    act_tree.encode_bipartitions()

    # Column order defines the meaning of the returned labels.
    candidate_patterns = ("nj_{}.tre", "upgma_{}.tre",
                          "RAxML_bestTree.{}.tre")
    distances = np.zeros(shape=(len(files), len(candidate_patterns)))
    for row, name in enumerate(names):
        for column, pattern in enumerate(candidate_patterns):
            candidate = dendropy.Tree.get_from_path(
                osp.join(tree_path, pattern.format(name)),
                "newick",
                taxon_namespace=tns)
            distances[row, column] = (
                dendropy.calculate.treecompare.symmetric_difference(
                    candidate, act_tree))
    return distances.argmin(1)
def test_probs(self):
    # For every reference tree, branch-length configuration and speciation
    # rate in the JSON fixture, the marginal probability computed for each
    # species configuration must match the stored value to 8 places.
    reference_path = os.path.join(
        _pathmap.TESTS_DATA_DIR, "marginal_probability_of_species.json")
    with open(reference_path) as src:
        test_ref = json.load(src)

    for test_tree_set in test_ref:
        taxon_namespace = dendropy.TaxonNamespace(
            test_tree_set["taxon_namespace"])
        tree = model.LineageTree.get(
            data=test_tree_set["tree"],
            schema="newick",
            taxon_namespace=taxon_namespace,
        )
        tree.encode_bipartitions()

        for brlen_config in test_tree_set["branch_length_configurations"]:
            # Apply this configuration's branch lengths, keyed by split
            # bitmask (stored as strings in the JSON).
            for bitmask_text, br_len in brlen_config["branch_lengths"].items():
                split_bitmask = int(bitmask_text)
                edge_map = tree.split_bitmask_edge_map
                assert split_bitmask in edge_map, split_bitmask
                edge_map[split_bitmask].length = br_len

            rate_configs = brlen_config["speciation_rate_configurations"]
            for speciation_rate_config in rate_configs:
                tree.speciation_completion_rate = speciation_rate_config[
                    "speciation_rate"]
                species_configs = speciation_rate_config[
                    "species_configurations"]
                for species_configuration in species_configs:
                    expected_probability = species_configuration[
                        "probability"]
                    obs_probability = tree.calc_marginal_probability_of_species(
                        species_configuration["species"])
                    self.assertAlmostEqual(
                        expected_probability, obs_probability, 8)
def get_species_tree(self, ntax=10):
    """Simulate a species tree over ``ntax`` taxa and sanity-check its ages.

    Divergence ages and population sizes are drawn from a ``MockRandom``
    instance, which is also passed to ``popgensim.pop_gen_tree``. The
    positive node ages found on the resulting tree are compared with the
    requested ages, both in ascending order.

    Returns the simulated species tree.
    """
    mock_rng = MockRandom()
    ages = sorted(mock_rng.randint(1000, 10000) for _ in range(ntax))
    pop_sizes = [mock_rng.randint(1000, 10000) for _ in range(2 * ntax + 1)]
    taxon_namespace = dendropy.TaxonNamespace(
        ["t{}".format(number + 1) for number in range(ntax)])
    species_tree = popgensim.pop_gen_tree(
        taxon_namespace=taxon_namespace,
        ages=ages,
        num_genes=4,
        pop_sizes=pop_sizes,
        rng=mock_rng)

    # Collect the ages of all nodes above the tips.
    observed_ages = []
    for node in species_tree.postorder_node_iter():
        age = node.distance_from_tip()
        if age > 0:
            observed_ages.append(age)
    observed_ages.sort()
    for position, observed in enumerate(observed_ages):
        # NOTE(review): the difference is signed, so an observed age larger
        # than requested always passes -- confirm whether abs() was intended.
        assert (ages[position] - observed) < 10e-6

    # Edge population sizes are gathered and sorted but not checked further.
    observed_pop_sizes = sorted(
        edge.pop_size for edge in species_tree.postorder_edge_iter())
    return species_tree
def compareDendropyTrees(tr1, tr2):
    """Compare two trees on their shared leaf set.

    If the leaf label sets differ, BOTH input trees are pruned in place to
    the shared labels and migrated to a new common taxon namespace before
    comparison.

    Parameters
    ----------
    tr1, tr2 : dendropy trees whose leaves all carry labelled taxa.

    Returns
    -------
    tuple
        ``(nl, ei1, ei2, fp, fn, rf)``: number of shared leaves, number of
        internal edges of each tree (seed edge excluded), false positives,
        false negatives, and ``(fp + fn) / (ei1 + ei2)``. When neither tree
        has an internal edge, ``rf`` is reported as ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    from dendropy.calculate.treecompare \
        import false_positives_and_negatives

    lb1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    lb2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}
    com = list(lb1.intersection(lb2))

    # Prune only when at least one tree has leaves outside the shared set.
    if len(com) != len(lb1) or len(com) != len(lb2):
        tns = dendropy.TaxonNamespace(com)
        tr1.retain_taxa_with_labels(com)
        tr1.migrate_taxon_namespace(tns)
        tr2.retain_taxa_with_labels(com)
        tr2.migrate_taxon_namespace(tns)

    tr1.update_bipartitions()
    tr2.update_bipartitions()

    nl = len(com)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))
    [fp, fn] = false_positives_and_negatives(tr1, tr2)

    total_internal_edges = ei1 + ei2
    # Without internal edges there is nothing to disagree on.
    rf = float(fp + fn) / total_internal_edges if total_internal_edges else 0.0
    return (nl, ei1, ei2, fp, fn, rf)
def bootstrap_consensus_tree(corpus, trees=None, consensus_level=0.5):
    """Build a consensus tree from a collection of bootstrap trees.

    Each input tree's dendrogram is converted to an ETE tree and written as
    Newick into a temporary directory. The files are re-read with DendroPy
    on a namespace built from ``corpus.titles``, their splits are counted,
    and a consensus tree is assembled from splits whose frequency reaches
    ``consensus_level``.

    Parameters
    ----------
    corpus : object exposing ``titles``
        Titles are used as leaf labels.
    trees : iterable, optional
        Objects exposing ``dendrogram.to_ete(labels=...)``. At least one is
        required.
    consensus_level : float, optional
        Minimum split frequency, passed as ``min_freq``.

    Returns
    -------
    EteTree
        Consensus tree parsed from the Newick string of the DendroPy result.

    Raises
    ------
    ValueError
        If ``trees`` is missing or empty (previously an opaque
        ``IndexError``).
    """
    import shutil

    # ``None`` replaces the former mutable ``[]`` default.
    trees = list(trees) if trees is not None else []
    if not trees:
        raise ValueError(
            "bootstrap_consensus_tree requires at least one tree")

    tmp_dir = mkdtemp()
    try:
        for idx, tree in enumerate(trees):
            t = tree.dendrogram.to_ete(labels=corpus.titles)
            t.write(outfile=tmp_dir + '/tree_' + str(idx) + '.newick')

        tns = dendropy.TaxonNamespace(corpus.titles, label="label")
        # Sorted so the trees are loaded in a reproducible order.
        loaded_trees = [
            dendropy.Tree.get(path=filename,
                              schema='newick',
                              preserve_underscores=True,
                              taxon_namespace=tns)
            for filename in sorted(glob.glob(tmp_dir + '/*.newick'))
        ]
    finally:
        # The intermediate Newick files are not needed once loaded; the
        # directory used to be left behind.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    tsum = TreeSummarizer(support_as_labels=True,
                          support_as_edge_lengths=False,
                          support_as_percentages=True,
                          add_node_metadata=True,
                          weighted_splits=True)
    taxon_namespace = loaded_trees[0].taxon_namespace
    split_distribution = dendropy.SplitDistribution(
        taxon_namespace=taxon_namespace)
    tsum.count_splits_on_trees(loaded_trees,
                               split_distribution=split_distribution,
                               is_bipartitions_updated=False)
    tree = tsum.tree_from_splits(
        split_distribution,
        min_freq=consensus_level,
        rooted=False,
        include_edge_lengths=False)  # this param is crucial
    ete_tree = EteTree(tree.as_string("newick").replace('[&U] ', '') + ';')
    return ete_tree