def life_ott(fo):
    '''
    Makes a bogus ott for the taxonomy. Basically this gives all the taxa the
    parent 'life' for later use in the TAG algorithm.
    '''
    output = sys.stdout
    dataset = dendropy.DataSet()
    try:
        dataset.read(stream=fo, schema="Newick")
    except DataParseError as dfe:
        raise ValueError(str(dfe))
    tree_list = dataset.tree_lists[0]
    parent_id = '805080'
    branch_counter = 0
    tree_labels = set()
    for tree in tree_list:
        encode_splits(tree)
        tree_mask = tree.seed_node.edge.split_bitmask
        assert tree_mask is not None
        tree_tax = set(split_to_list(tree_mask))
        split_list = []
        for node in tree.leaf_iter():
            tree_labels.add(node.taxon.label)
    output.write('805080\t|\t\t|\tlife\t|\tno rank\t|\tncbi:1,gbif:0\t|\t\t|\t\t|\t\t|\t\n')
    for i in tree_labels:
        name, ottoid = i.split('@')
        output.write(ottoid + '\t|\t' + parent_id + '\t|\t' + name + '\t|\tspecies\t|\tncbi:1\t|\t\t|\t\n')
def inplace_strict_consensus_merge(trees_to_merge, rooted=False, gordons_supertree=False):
    """Returns a tree that is the strict consensus merger of the input trees.
    """
    tree_list = list(trees_to_merge)
    del trees_to_merge[1:]
    nTrees = len(tree_list)
    _LOG.debug('%d Trees to merge:\n%s\n' % (nTrees, '\n'.join([str(i) for i in tree_list])))
    if nTrees < 2:
        return tree_list[0]
    tree_iter = iter(tree_list)
    to_modify = tree_iter.next()

    if rooted:
        raise NotImplementedError("Rooted SCM is not implemented")
    else:
        to_modify.deroot()
    encode_splits(to_modify)
    if _IS_DEBUG_LOGGING:
        assert to_modify._debug_tree_is_valid(check_splits=False)

    for to_consume in tree_iter:
        if not rooted:
            to_consume.deroot()
        encode_splits(to_consume)
        if _IS_DEBUG_LOGGING:
            assert to_consume._debug_tree_is_valid(check_splits=True)
        add_to_scm(to_modify, to_consume, rooted, gordons_supertree=gordons_supertree)
        if _IS_DEBUG_LOGGING:
            assert to_modify._debug_tree_is_valid(check_splits=False)

    return to_modify
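# Hypothetical usage sketch for inplace_strict_consensus_merge, not part of the
# original module: it assumes DendroPy 3.x and that this module's names are
# importable. The newick strings and taxon labels are made up for illustration;
# the input trees share a TaxonSet and have at least two leaves in common.
import dendropy

_scm_taxa = dendropy.TaxonSet()
_scm_trees = dendropy.TreeList.get_from_string(
    "((a,b),(c,(d,e))); ((a,b),(c,(d,f)));",
    "newick",
    taxon_set=_scm_taxa)
# pass a plain list copy: the function deletes all but the first element in place
_scm_merged = inplace_strict_consensus_merge(list(_scm_trees))
print _scm_merged.as_newick_string()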
def tree_from_splits(self,
        split_distribution,
        min_freq=0.5,
        include_edge_lengths=True):
    """Returns a consensus tree from splits in `split_distribution`.

    If include_edge_length_var is True, then the sample variance of the
    edge length will also be calculated and will be stored as
    a length_var attribute.
    """
    taxon_set = split_distribution.taxon_set
    taxa_mask = taxon_set.all_taxa_bitmask()
    if self.weighted_splits:
        split_freqs = split_distribution.weighted_split_frequencies
    else:
        split_freqs = split_distribution.split_frequencies
    is_rooted = split_distribution.is_rooted
    #include_edge_lengths = self.support_as_labels and include_edge_lengths
    if self.support_as_edge_lengths and include_edge_lengths:
        raise Exception("Cannot map support as edge lengths if edge lengths are to be set on consensus tree")
    to_try_to_add = []
    _almost_one = lambda x: abs(x - 1.0) <= 0.0000001
    for s, freq in split_freqs.iteritems():
        if (min_freq is None) or (freq > min_freq) or (_almost_one(min_freq) and _almost_one(freq)):
            to_try_to_add.append((freq, s))
    to_try_to_add.sort(reverse=True)
    splits_for_tree = [i[1] for i in to_try_to_add]
    con_tree = treesplit.tree_from_splits(splits=splits_for_tree,
            taxon_set=taxon_set,
            is_rooted=is_rooted)
    treesplit.encode_splits(con_tree)

    if include_edge_lengths:
        split_edge_lengths = {}
        for split, edges in split_distribution.split_edge_lengths.items():
            if len(edges) > 0:
                mean, var = mean_and_sample_variance(edges)
                elen = mean
            else:
                elen = None
            split_edge_lengths[split] = elen
    else:
        split_edge_lengths = None

    for node in con_tree.postorder_node_iter():
        split = node.edge.split_bitmask
        if split in split_freqs:
            self.map_split_support_to_node(node=node, split_support=split_freqs[split])
        if include_edge_lengths and split in split_distribution.split_edge_lengths:
            edges = split_distribution.split_edge_lengths[split]
            if len(edges) > 0:
                mean, var = mean_and_sample_variance(edges)
                elen = mean
            else:
                elen = None
            node.edge.length = elen
    return con_tree
def false_positives_and_negatives(reference_tree, test_tree):
    """
    False pos = splits in test_tree NOT in reference_tree
    False neg = splits in reference_tree NOT in test_tree
    """
    sym_diff = 0
    false_positives = 0
    false_negatives = 0
    if reference_tree.taxon_set is not test_tree.taxon_set:
        raise TypeError("Trees have different TaxonSet objects: %s vs. %s" \
                % (hex(id(reference_tree.taxon_set)), hex(id(test_tree.taxon_set))))
    if not hasattr(reference_tree, "split_edges"):
        treesplit.encode_splits(reference_tree)
    if not hasattr(test_tree, "split_edges"):
        treesplit.encode_splits(test_tree)
    for split in reference_tree.split_edges:
        if split in test_tree.split_edges:
            pass
        else:
            false_negatives = false_negatives + 1
            sym_diff = sym_diff + 1
    for split in test_tree.split_edges:
        if split in reference_tree.split_edges:
            pass
        else:
            false_positives = false_positives + 1
            sym_diff = sym_diff + 1
    return false_positives, false_negatives
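# Hypothetical usage sketch for false_positives_and_negatives, not part of the
# original module (assumes DendroPy 3.x; the newick strings are made up). The
# two trees must share the same TaxonSet or a TypeError is raised.
import dendropy

_fpfn_taxa = dendropy.TaxonSet()
_fpfn_ref = dendropy.Tree.get_from_string("((a,b),(c,(d,e)));", "newick", taxon_set=_fpfn_taxa)
_fpfn_test = dendropy.Tree.get_from_string("((a,c),(b,(d,e)));", "newick", taxon_set=_fpfn_taxa)
fp, fn = false_positives_and_negatives(_fpfn_ref, _fpfn_test)
print "false positives:", fp, "false negatives:", fn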
def runTest(self):
    tree_list = dendropy.TreeList(
        stream=StringIO("""((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
((t5:2.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
((t5:0.161175,t6:0.161175):0.392293,((t2:0.075411,(t4:0.104381,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):0.028969):0.065840,t3:0.170221):0.383247);
"""),
        schema="newick")
    for i in tree_list:
        encode_splits(i)
    self.assertAlmostEqual(treecalc.euclidean_distance(tree_list[0], tree_list[1]), 2.0)
    self.assertAlmostEqual(treecalc.euclidean_distance(tree_list[0], tree_list[2]), math.sqrt(2.0))
    self.assertAlmostEqual(treecalc.euclidean_distance(tree_list[0], tree_list[3]), 0.97103099999999998)
    self.assertAlmostEqual(treecalc.euclidean_distance(tree_list[1], tree_list[2]), math.sqrt(6.0))
    self.assertAlmostEqual(treecalc.euclidean_distance(tree_list[1], tree_list[3]), 2.2232636377544162)
    self.assertAlmostEqual(treecalc.euclidean_distance(tree_list[2], tree_list[3]), 1.000419513484718)
def run(self):
    while not self.kill_received:
        try:
            source = self.work_queue.get_nowait()
        except Queue.Empty:
            break
        self.send_info("Received task: '%s'." % source, wrap=False)
        fsrc = open(source, "rU")
        for tidx, tree in enumerate(tree_source_iter(fsrc,
                schema=self.schema,
                taxon_set=self.taxon_set,
                as_rooted=self.is_rooted,
                store_tree_weights=self.weighted_trees)):
            if tidx >= self.tree_offset:
                if (self.log_frequency == 1) or (tidx > 0 and self.log_frequency > 0 and tidx % self.log_frequency == 0):
                    self.send_info("(processing) '%s': tree at offset %d" % (source, tidx), wrap=False)
                treesplit.encode_splits(tree)
                self.split_distribution.count_splits_on_tree(tree)
                if self.calc_tree_probs:
                    self.topology_counter.count(tree, tree_splits_encoded=True)
            else:
                if (self.log_frequency == 1) or (tidx > 0 and self.log_frequency > 0 and tidx % self.log_frequency == 0):
                    self.send_info("(processing) '%s': tree at offset %d (skipping)" % (source, tidx), wrap=False)
            if self.kill_received:
                break
        if self.kill_received:
            break
        self.send_info("Completed task: '%s'." % (source), wrap=False)
    if self.kill_received:
        self.send_warning("Terminating in response to kill request.")
    else:
        self.result_split_dist_queue.put(self.split_distribution)
        self.result_topology_hash_map_queue.put(self.topology_counter.topology_hash_map)
def runTest(self):
    ref = dendropy.Tree(stream=StringIO("((t5,t6),((t4,(t2,t1)),t3));"), schema="newick")
    taxon_set = ref.taxon_set
    encode_splits(ref)
    o_tree = dendropy.Tree(stream=StringIO("((t1,t2),((t4,(t5,t6)),t3));"), schema="newick", taxon_set=taxon_set)
    encode_splits(o_tree)
    self.assertEqual(treecalc.symmetric_difference(o_tree, ref), 2)
def calc(self, tree=None, create_midpoints=None):
    """
    Calculates the distances. Note that the path length (in number of steps)
    between taxa that span the root will be off by one if the tree is unrooted.
    """
    if tree is not None:
        self.tree = tree
    assert self.tree is not None
    if not hasattr(self.tree, "split_edges"):
        treesplit.encode_splits(self.tree)
    self.taxon_set = self.tree.taxon_set
    self._pat_dists = {}
    self._path_steps = {}
    for i1, t1 in enumerate(self.taxon_set):
        self._pat_dists[t1] = {}
        self._path_steps[t1] = {}
        self._mrca[t1] = {}
    self.max_dist = None
    self.max_dist_taxa = None
    self.max_dist_nodes = None
    for node in self.tree.postorder_node_iter():
        children = node.child_nodes()
        if len(children) == 0:
            node.desc_paths = {node: (0, 0)}
        else:
            node.desc_paths = {}
            for cidx1, c1 in enumerate(children):
                for desc1, (desc1_plen, desc1_psteps) in c1.desc_paths.items():
                    node.desc_paths[desc1] = (desc1_plen + c1.edge.length, desc1_psteps + 1)
                    for c2 in children[cidx1 + 1:]:
                        for desc2, (desc2_plen, desc2_psteps) in c2.desc_paths.items():
                            self._mrca[desc1.taxon][desc2.taxon] = c1.parent_node
                            pat_dist = node.desc_paths[desc1][0] + desc2_plen + c2.edge.length
                            self._pat_dists[desc1.taxon][desc2.taxon] = pat_dist
                            path_steps = node.desc_paths[desc1][1] + desc2_psteps + 1
                            self._path_steps[desc1.taxon][desc2.taxon] = path_steps
                            if pat_dist > self.max_dist:
                                self.max_dist = pat_dist
                                midpoint = float(pat_dist) / 2
                                if midpoint - node.desc_paths[desc1][0] <= 0:
                                    self.max_dist_nodes = (desc1, desc2)
                                    self.max_dist_taxa = (desc1.taxon, desc2.taxon)
                                else:
                                    self.max_dist_nodes = (desc2, desc1)
                                    self.max_dist_taxa = (desc2.taxon, desc1.taxon)
                del (c1.desc_paths)
def kernelOfTest(self, trees):
    expected = trees[-1]
    input = trees[:-1]
    _LOG.debug('input = %s' % str(input))
    output = inplace_strict_consensus_merge(input)
    encode_splits(output)
    encode_splits(expected)
    if symmetric_difference(expected, output) != 0:
        self.fail("\n%s\n!=\n%s" % (str(output), str(expected)))
def testReferenceTree(self):
    ref_tree_list = datagen.reference_tree_list()
    t_tree_list = dendropy.TreeList()
    for ref_tree in ref_tree_list:
        treesplit.encode_splits(ref_tree)
        splits = ref_tree.split_edges.keys()
        t_tree = treesplit.tree_from_splits(splits=splits,
                taxon_set=ref_tree_list.taxon_set,
                is_rooted=ref_tree.is_rooted)
        self.assertEqual(ref_tree.symmetric_difference(t_tree), 0)
def assertDistinctButEqualTree(self, tree1, tree2, **kwargs):
    otaxa = tree1.taxon_set
    ts = dendropy.TaxonSet()
    tree1.reindex_taxa(ts, clear=True)
    tree2.reindex_taxa(ts)
    self.assertIs(tree1.taxon_set, tree2.taxon_set)
    self.assertIsNot(tree1.taxon_set, otaxa)
    self.assertDistinctButEqual(tree1.taxon_set, otaxa, **kwargs)
    treesplit.encode_splits(tree1)
    treesplit.encode_splits(tree2)
    rfdist = treecalc.robinson_foulds_distance(tree1, tree2)
    self.assertAlmostEqual(rfdist, 0)
def testPatDistFunc(self):
    encode_splits(self.tree)

    def _chk_distance(t1, t2, exp_distance):
        tax1 = self.tree.taxon_set.get_taxon(label=t1)
        tax2 = self.tree.taxon_set.get_taxon(label=t2)
        pd = treecalc.patristic_distance(self.tree, tax1, tax2)
        self.assertEqual(pd, exp_distance)

    _chk_distance("a", "b", 2)
    _chk_distance("a", "c", 4)
    _chk_distance("b", "c", 4)
    _chk_distance("a", "d", 6)
    _chk_distance("f", "d", 4)
    _chk_distance("c", "d", 6)
def count(self, tree, tree_splits_encoded=False):
    """
    Logs/registers a tree.
    """
    if not tree_splits_encoded:
        treesplit.encode_splits(tree)
    topology = self.hash_topology(tree)
    if topology not in self.topology_hash_map:
        self.topology_hash_map[topology] = 1
    else:
        self.topology_hash_map[topology] = self.topology_hash_map[topology] + 1
    self.total_trees_counted += 1
def countSplits(self, tc, is_rooted):
    _LOG.info(tc[0] + "; " + tc[1])
    tree_filepaths = [pathmap.tree_source_path(tc[0])]
    taxa_filepath = pathmap.tree_source_path(tc[1])
    paup_sd = paup.get_split_distribution(tree_filepaths,
            taxa_filepath,
            is_rooted=is_rooted,
            burnin=0)
    taxon_set = paup_sd.taxon_set
    dp_sd = treesplit.SplitDistribution(taxon_set=taxon_set)
    dp_sd.ignore_edge_lengths = True
    dp_sd.ignore_node_ages = True
    dp_sd.is_rooted = is_rooted
    _LOG.debug("Taxon set: %s" % [t.label for t in taxon_set])
    taxa_mask = taxon_set.all_taxa_bitmask()
    taxon_set.lock()
    for tree_filepath in tree_filepaths:
        for tree in dataio.tree_source_iter(stream=open(tree_filepath, "rU"),
                schema='nexus',
                taxon_set=taxon_set,
                as_rooted=is_rooted):
            self.assertIs(tree.taxon_set, dp_sd.taxon_set)
            self.assertIs(tree.taxon_set, taxon_set)
            treesplit.encode_splits(tree)
            dp_sd.count_splits_on_tree(tree)
    self.assertEqual(dp_sd.total_trees_counted, paup_sd.total_trees_counted)
    # SplitsDistribution counts trivial splits, whereas PAUP*
    # contree does not, so the following will not work
    # assert len(dp_sd.splits) == len(paup_sd.splits),\
    #     "dp = %d, sd = %d" % (len(dp_sd.splits), len(paup_sd.splits))
    taxa_mask = taxon_set.all_taxa_bitmask()
    for split in dp_sd.splits:
        if not treesplit.is_trivial_split(split, taxa_mask):
            self.assertIn(split, paup_sd.splits)
            self.assertEqual(dp_sd.split_counts[split], paup_sd.split_counts[split])
            paup_sd.splits.remove(split)
    # if any splits remain, they were not
    # in dp_sd or were trivial
    remaining_splits = list(paup_sd.splits)
    for split in remaining_splits:
        if treesplit.is_trivial_split(split, taxa_mask):
            paup_sd.splits.remove(split)
    self.assertEqual(len(paup_sd.splits), 0)
def testUltrametricTrees(self):
    tree_files = [
        "pythonidae.beast.summary.tre",
        "primates.beast.mcct.medianh.tre"
    ]
    for tree_file in tree_files:
        ref_tree = dendropy.Tree.get_from_path(pathmap.tree_source_path(tree_file),
                "nexus",
                as_rooted=True)
        treesplit.encode_splits(ref_tree)
        splits = ref_tree.split_edges.keys()
        t_tree = treesplit.tree_from_splits(splits=splits,
                taxon_set=ref_tree.taxon_set,
                is_rooted=ref_tree.is_rooted)
        treesplit.encode_splits(t_tree)
        self.assertEqual(ref_tree.symmetric_difference(t_tree), 0)
def long_branch_symmdiff(trees_to_compare, edge_len_threshold, copy_trees=False, rooted=False):
    """Returns a matrix of the symmetric differences between trees after all
    internal edges with lengths < `edge_len_threshold` have been collapsed.

    If `copy_trees` is True then the trees will be copied first (if False,
    then the trees will have their short edges collapsed on exit).
    """
    if copy_trees:
        tree_list = [copy.copy(i) for i in trees_to_compare]
    else:
        tree_list = list(trees_to_compare)
    n_trees = len(tree_list)
    _LOG.debug('%d Trees to compare:\n%s\n' % (n_trees, '\n'.join([str(i) for i in tree_list])))
    if n_trees < 2:
        return [0 for t in tree_list]
    f_r = []
    for tree in tree_list:
        to_collapse = []
        encode_splits(tree)
        for edge in tree.preorder_edge_iter(filter_fn=Edge.is_internal):
            elen = edge.length
            if elen is not None and elen < edge_len_threshold:
                to_collapse.append(edge)
        for edge in to_collapse:
            collapse_edge(edge)
        f_r.append(tree.is_rooted)
        tree.is_rooted = bool(rooted)
        encode_splits(tree)
    sd_row = [0] * n_trees
    sd_mat = [list(sd_row) for i in xrange(n_trees)]
    for i, tree_one in enumerate(tree_list[:-1]):
        for col_count, tree_two in enumerate(tree_list[1 + i:]):
            j = i + 1 + col_count
            sd = symmetric_difference(tree_one, tree_two)
            sd_mat[i][j] = sd
            sd_mat[j][i] = sd
    if not copy_trees:
        for r, tree in itertools.izip(f_r, tree_list):
            tree.is_rooted = r
    return sd_mat
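# Hypothetical usage sketch for long_branch_symmdiff, not part of the original
# module (assumes DendroPy 3.x and that the module-level helpers it relies on --
# Edge, collapse_edge, symmetric_difference, encode_splits -- are importable).
# Internal edges shorter than the 0.01 threshold are collapsed before comparison.
import dendropy

_lbs_taxa = dendropy.TaxonSet()
_lbs_trees = dendropy.TreeList.get_from_string(
    "((a:1,b:1):0.001,(c:1,d:1):1); ((a:1,c:1):1,(b:1,d:1):1);",
    "newick",
    taxon_set=_lbs_taxa)
print long_branch_symmdiff(_lbs_trees, 0.01)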
def runTest(self):
    n = '(Basichlsac,(Lamprothma,Mougeotisp),(((Haplomitr2,Petalaphy),((Angiopteri,(((Azollacaro,((Dennstasam,(Oleandrapi,Polypodapp)),Dicksonant)),Vittarifle),Botrychbit)),(Isoetesmel,((((Agathismac,Agathisova),Pseudotsu),(((Libocedrus,Juniperusc),Callitris),Athrotaxi)),((Liriodchi,Nelumbo),Sagittari))))),Thuidium));'
    trees = dendropy.TreeList(stream=StringIO(n + n), schema="newick")
    ref = trees[0]
    changing = trees[1]
    rng = RepeatedRandom()
    treesplit.encode_splits(ref)
    treesplit.encode_splits(changing)
    orig_root = changing.seed_node
    for i in xrange(50):
        treemanip.randomly_rotate(changing, rng=rng)
        self.assertNotEqual(str(changing), n)
        self.assertEqual(orig_root, changing.seed_node)
        changing.debug_check_tree(logger_obj=_LOG, splits=True)
        if treecalc.symmetric_difference(ref, changing) != 0:
            self.fail("\n%s\n!=\n%s" % (str(ref), str(changing)))
def map_split_support_to_tree(self, tree, split_distribution):
    "Maps splits support to the given tree."
    if self.weighted_splits:
        split_freqs = split_distribution.weighted_split_frequencies
    else:
        split_freqs = split_distribution.split_frequencies
    tree.reindex_taxa(taxon_set=split_distribution.taxon_set)
    assert tree.taxon_set is split_distribution.taxon_set
    treesplit.encode_splits(tree)
    for split in tree.split_edges:
        if split in split_freqs:
            split_support = split_freqs[split]
        else:
            split_support = 0.0
        self.map_split_support_to_node(tree.split_edges[split].head_node, split_support)
    return tree
def calc(self, tree=None, create_midpoints=None):
    """
    Calculates the distances.
    """
    if tree is not None:
        self.tree = tree
    assert self.tree is not None
    if not hasattr(self.tree, "split_edges"):
        treesplit.encode_splits(self.tree)
    self.taxon_set = self.tree.taxon_set
    self._pat_dists = {}
    for i1, t1 in enumerate(self.taxon_set):
        self._pat_dists[t1] = {}
        self._mrca[t1] = {}
    self.max_dist = None
    self.max_dist_taxa = None
    self.max_dist_nodes = None
    for node in self.tree.postorder_node_iter():
        children = node.child_nodes()
        if len(children) == 0:
            node.desc_paths = {node: 0}
        else:
            node.desc_paths = {}
            for cidx1, c1 in enumerate(children):
                for desc1, desc1_plen in c1.desc_paths.items():
                    node.desc_paths[desc1] = desc1_plen + c1.edge.length
                    for c2 in children[cidx1 + 1:]:
                        for desc2, desc2_plen in c2.desc_paths.items():
                            pat_dist = node.desc_paths[desc1] + desc2_plen + c2.edge.length
                            self._pat_dists[desc1.taxon][desc2.taxon] = pat_dist
                            self._mrca[desc1.taxon][desc2.taxon] = c1.parent_node
                            if pat_dist > self.max_dist:
                                self.max_dist = pat_dist
                                midpoint = float(pat_dist) / 2
                                if midpoint - node.desc_paths[desc1] <= 0:
                                    self.max_dist_nodes = (desc1, desc2)
                                    self.max_dist_taxa = (desc1.taxon, desc2.taxon)
                                else:
                                    self.max_dist_nodes = (desc2, desc1)
                                    self.max_dist_taxa = (desc2.taxon, desc1.taxon)
                del (c1.desc_paths)
def count_splits_on_trees(self, tree_iterator, split_distribution=None, trees_splits_encoded=False):
    """
    Given an iterator over trees, a SplitDistribution object (a new one, or
    the one passed as an argument) is returned, collating the split data
    from the trees.
    """
    if split_distribution is None:
        split_distribution = treesplit.SplitDistribution()
    taxon_set = split_distribution.taxon_set
    for tree_idx, tree in enumerate(tree_iterator):
        self.total_trees_counted += 1
        if taxon_set is None:
            assert (split_distribution.taxon_set is None)
            split_distribution.taxon_set = tree.taxon_set
            taxon_set = tree.taxon_set
        else:
            assert (taxon_set is tree.taxon_set)
        if not trees_splits_encoded:
            treesplit.encode_splits(tree)
        split_distribution.count_splits_on_tree(tree)
    return split_distribution
def patristic_distance(tree, taxon1, taxon2):
    """
    Given a tree with splits encoded, and two taxa on that tree, returns the
    patristic distance between the two. Much less efficient than constructing
    a PatristicDistanceMatrix object when many pairwise distances are needed.
    """
    if not hasattr(tree, "split_edges"):
        treesplit.encode_splits(tree)
    mrca = tree.mrca(taxa=[taxon1, taxon2])
    dist = 0
    n = tree.find_node(lambda x: x.taxon == taxon1)
    while n != mrca:
        if n.edge.length is not None:
            dist += n.edge.length
        n = n.parent_node
    n = tree.find_node(lambda x: x.taxon == taxon2)
    while n != mrca:
        if n.edge.length is not None:
            dist += n.edge.length
        n = n.parent_node
    return dist
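# Hypothetical usage sketch for patristic_distance, not part of the original
# module (assumes DendroPy 3.x; the newick string and labels are made up).
import dendropy

_pd_tree = dendropy.Tree.get_from_string("((a:1,b:1):1,(c:1,d:1):1);", "newick")
_pd_t1 = _pd_tree.taxon_set.get_taxon(label="a")
_pd_t2 = _pd_tree.taxon_set.get_taxon(label="c")
print patristic_distance(_pd_tree, _pd_t1, _pd_t2)  # expected: 4.0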
def runTest(self):
    tree_list = dendropy.TreeList(
        stream=StringIO("""((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);"""),
        schema="newick")
    for i in tree_list:
        _LOG.debug(i.get_indented_form())
        treesplit.encode_splits(i)
        _LOG.debug(i.get_indented_form(splits=True))
        i.debug_check_tree(splits=True, logger_obj=_LOG)
    root1 = tree_list[0].seed_node
    root1e = root1.edge
    self.assertEqual(treesplit.split_to_list(root1e.split_bitmask), range(6))
    self.assertEqual(treesplit.split_to_list(root1e.split_bitmask, one_based=True), range(1, 7))
    self.assertEqual(treesplit.split_to_list(root1e.split_bitmask, mask=21, one_based=True), [1, 3, 5])
    self.assertEqual(treesplit.split_to_list(root1e.split_bitmask, mask=21), [0, 2, 4])
    self.assertEqual(treesplit.count_bits(root1e.split_bitmask), 6)
    fc1 = root1.child_nodes()[0]
    fc1e = fc1.edge
    self.assertEqual(treesplit.split_to_list(fc1e.split_bitmask), [0, 1])
    self.assertEqual(treesplit.split_to_list(fc1e.split_bitmask, one_based=True), [1, 2])
    self.assertEqual(treesplit.split_to_list(fc1e.split_bitmask, mask=0x15, one_based=True), [1])
    self.assertEqual(treesplit.split_to_list(fc1e.split_bitmask, mask=0x15), [0])
    self.assertEqual(treesplit.count_bits(fc1e.split_bitmask), 2)
taxa = dendropy.TaxonSet()
true_tree = dendropy.Tree.get_from_path(sys.argv[1], "Newick", taxon_set=taxa)  # true tree (bigtree)
mrp_tree = dendropy.Tree.get_from_path(sys.argv[2], "Nexus", taxon_set=taxa)    # MRP tree
mrp_con = dendropy.Tree.get_from_path(sys.argv[3], "Nexus", taxon_set=taxa)     # MRP consensus tree
sas_tree = dendropy.Tree.get_from_path(sys.argv[4], "Newick", taxon_set=taxa)   # SAS tree
to_prune = []
for node in mrp_tree.leaf_nodes():
    if node.taxon.label == 'roottaxon':
        to_prune.append(node)
assert (len(to_prune) == 1)
included = set([node.taxon for node in mrp_tree.leaf_nodes()])
#print "number of leaves", len(included)
#print mrp_tree.leaf_nodes()
prune_tree_to_included(sas_tree, included)
prune_tree_to_included(true_tree, included)
encode_splits(true_tree)
encode_splits(mrp_tree)
encode_splits(sas_tree)
mrp_distance = true_tree.false_positives_and_negatives(mrp_tree)
sas_distance = true_tree.false_positives_and_negatives(sas_tree)
mrp_to_mrpcon = mrp_tree.false_positives_and_negatives(mrp_con)
sas_to_mrpcon = sas_tree.false_positives_and_negatives(mrp_con)
#print sas_tree.as_ascii_plot()
#print mrp_tree.as_ascii_plot()
#print true_tree.as_ascii_plot()
print "mrp to mrp con", mrp_to_mrpcon[0], mrp_to_mrpcon[1]
print "sas to mrp con", sas_to_mrpcon[0], sas_to_mrpcon[1]
print "true MRP", mrp_distance[0], mrp_distance[1]
print "true SAS", sas_distance[0], sas_distance[1]
mrp_to_sas = mrp_tree.false_positives_and_negatives(sas_tree)
print "MRP SAS ", mrp_to_sas[0], mrp_to_sas[1]
def add_to_scm(to_modify, to_consume, rooted=False, gordons_supertree=False):
    """Adds the tree `to_consume` to the tree `to_modify` in a strict consensus
    merge operation. Both trees must have had encode_splits called on them."""
    assert (to_modify.taxon_set is to_consume.taxon_set)
    taxon_set = to_consume.taxon_set
    if rooted:
        raise NotImplementedError("rooted form of add_to_scm not implemented")
    to_mod_root = to_modify.seed_node
    to_mod_split = to_mod_root.edge.split_bitmask

    to_consume_root = to_consume.seed_node
    to_consume_split = to_consume_root.edge.split_bitmask

    leaf_intersection = to_mod_split & to_consume_split
    if _IS_DEBUG_LOGGING:
        _LOG.debug("add_to_scm:\n %s\n + %s\n%s" % (str(to_modify), str(to_consume), format_split(leaf_intersection, taxon_set=taxon_set)))

    n_common_leaves = count_bits(leaf_intersection)
    if n_common_leaves < 2:
        _LOG.error('trees must have at least 2 common leaves')
        raise ValueError('trees must have at least 2 common leaves')
    if n_common_leaves == 2:
        # SCM with 2 leaves in common results in a polytomy
        collapse_clade(to_mod_root)
        collapse_clade(to_consume_root)
        leaves_to_steal = [c for c in to_consume_root.child_nodes() if not (leaf_intersection & c.edge.split_bitmask)]
        for leaf in leaves_to_steal:
            to_mod_root.add_child(leaf)
            to_mod_root.edge.split_bitmask |= leaf.edge.split_bitmask
        to_modify.split_edges = {to_mod_root.edge.split_bitmask: to_mod_root.edge}
        for child in to_mod_root.child_nodes():
            to_modify.split_edges[child.edge.split_bitmask] = child.edge
        return

    # at least 3 leaves in common
    tmse = to_modify.split_edges

    to_mod_relevant_splits = {}
    to_consume_relevant_splits = {}
    if not rooted:
        if _IS_DEBUG_LOGGING:
            to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
            to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)

        reroot_on_lowest_common_index_path(to_modify, leaf_intersection)
        reroot_on_lowest_common_index_path(to_consume, leaf_intersection)

        if _IS_DEBUG_LOGGING:
            to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
            to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)

        to_mod_root = to_modify.seed_node
        assert (to_mod_root.edge.split_bitmask == to_mod_split)
        to_consume_root = to_consume.seed_node
        assert (to_consume_root.edge.split_bitmask == to_consume_split)

    for s, e in tmse.iteritems():
        s = e.split_bitmask
        masked = s & leaf_intersection
        if masked and masked != leaf_intersection:
            e_list = to_mod_relevant_splits.setdefault(masked, [])
            e_list.append((s, e))

    for s, e in to_consume.split_edges.iteritems():
        s = e.split_bitmask
        masked = s & leaf_intersection
        if masked and masked != leaf_intersection:
            e_list = to_consume_relevant_splits.setdefault(masked, [])
            e_list.append((s, e))

    # Because each of these paths radiates away from the root (none of the paths
    #   cross the root), the split_bitmasks for deeper edges will be supersets
    #   of the split_bitmasks for shallower nodes. Thus if we reverse sort we
    #   get the edges in the order root->tip
    for split, path in to_mod_relevant_splits.iteritems():
        path.sort(reverse=True)
        t = [i[1] for i in path]
        del path[:]
        path.extend(t)
    for split, path in to_consume_relevant_splits.iteritems():
        path.sort(reverse=True)
        t = [i[1] for i in path]
        del path[:]
        path.extend(t)
    if _IS_DEBUG_LOGGING:
        to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
        to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)

    # first we'll collapse all paths in the common leafset in to_modify that
    #   are not in to_consume
    _collapse_paths_not_found(to_mod_relevant_splits, to_consume_relevant_splits, tmse)
    # Now we'll collapse all paths in the common leafset in to_consume that
    #   are not in to_modify
    _collapse_paths_not_found(to_consume_relevant_splits, to_mod_relevant_splits)

    # first we'll deal with subtrees that are:
    #   - not in the leaf intersection set, and
    #   - attached to "relevant" nodes
    # We simply move these subtrees from the to_consume tree to the appropriate
    #   node in to_modify
    to_steal = [i for i in to_consume_root.child_nodes() if (i.edge.split_bitmask & leaf_intersection) == 0]
    for child in to_steal:
        to_mod_root.add_child(child)
        to_mod_root.edge.split_bitmask |= child.edge.split_bitmask

    for masked_split, to_consume_path in to_consume_relevant_splits.iteritems():
        to_mod_path = to_mod_relevant_splits.get(masked_split)
        if _IS_DEBUG_LOGGING and to_mod_path is None:  #to_mod_path is None:
            _LOG.debug("%s = mask" % format_split(leaf_intersection, taxon_set=taxon_set))
            _LOG.debug("%s = masked" % format_split(masked_split, taxon_set=taxon_set))
            _LOG.debug("%s = raw" % format_split(to_consume_path[-1].split_bitmask, taxon_set=taxon_set))
            for k, v in to_mod_relevant_splits.iteritems():
                _LOG.debug("%s in to_mod_relevant_splits" % format_split(k, taxon_set=taxon_set))
        assert to_mod_path is not None
        to_mod_head = to_mod_path[-1].head_node
        to_mod_head_edge = to_mod_head.edge
        to_consume_head = to_consume_path[-1].head_node
        for child in to_consume_head.child_nodes():
            if (child.edge.split_bitmask & leaf_intersection) == 0:
                # child is the root of a subtree that has no children in the leaf_intersection
                to_mod_head.add_child(child)
                to_mod_head_edge.split_bitmask |= child.edge.split_bitmask
        if len(to_consume_path) > 1:
            if len(to_mod_path) > 1:
                # collision
                if gordons_supertree:
                    for edge in to_mod_path[2:]:
                        p = edge.tail_node
                        c = edge.head_node
                        sibs = p.child_nodes()
                        for sib in sibs:
                            _LOG.debug("sib is %s" % (sib.compose_newick()))
                            if sib is not c:
                                if not sib.is_leaf():
                                    collapse_clade(sib)
                                    collapse_edge(sib.edge)
                        collapse_edge(p.edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            _LOG.debug("child is %s" % (child.compose_newick()))
                            if child is not avoid:
                                mid_node.add_child(child)
                                collapse_clade(child)
                                if not child.is_leaf():
                                    collapse_edge(child.edge)
                                mid_node.edge.split_bitmask |= child.edge.split_bitmask
                else:
                    for edge in to_mod_path[1:-1]:
                        collapse_edge(edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            if child is not avoid:
                                mid_node.add_child(child)
                                mid_node.edge.split_bitmask |= child.edge.split_bitmask
            else:
                # we have to move the subtrees from to_consume to to_modify
                to_mod_edge = to_mod_path[0]
                to_mod_tail, to_mod_head = to_mod_edge.tail_node, to_mod_edge.head_node
                deepest_edge_to_move = to_consume_path[0]
                deepest_node_to_move = deepest_edge_to_move.head_node
                tipmost_edge_to_move = to_consume_path[-1]
                tipmost_node_to_move = tipmost_edge_to_move.tail_node
                prev_head = tipmost_edge_to_move.head_node

                to_mod_tail.add_child(deepest_node_to_move)
                to_mod_tail.remove_child(to_mod_head)
                tipmost_node_to_move.add_child(to_mod_head)
                tipmost_node_to_move.remove_child(prev_head)
    encode_splits(to_modify)
dataset = DataSet()
try:
    dataset.read(stream=fo, schema="Newick")
except DataParseError as dfe:
    raise ValueError(str(dfe))
if len(dataset.taxon_sets) != 1:
    raise ValueError("Expecting one set of taxa in %s" % f)
if len(dataset.tree_lists) != 1:
    raise ValueError("Expecting one tree in %s" % f)
taxon_set = dataset.taxon_sets[0]
tree_list = dataset.tree_lists[0]
number_of_taxon = len(taxon_set)
branch_counter = 0
code_list = [StringIO() for i in taxon_set]
for tree in tree_list:
    encode_splits(tree)
    tree_mask = tree.seed_node.edge.split_bitmask
    assert tree_mask is not None
    tree_tax = set(split_to_list(tree_mask))
    #print tree_tax
    split_list = []
    for node in tree.postorder_internal_node_iter():
        if node.parent_node is not None:
            branch_counter += 1
            split_set = set(split_to_list(node.edge.split_bitmask))
            split_list.append(split_set)
    for i, stream in enumerate(code_list):
        if i in tree_tax:
            for split in split_list:
                if i in split:
                    stream.write('1')
def runTest(self):
    taxon_set = dendropy.TaxonSet([str(i + 1) for i in range(5)])
    tree_list = dendropy.TreeList(
        stream=StringIO("""
        (5,((4,3),2),1);
        (5,(4,3,2),1);
        (5,((4,3),2),1);
        (5,(4,3),2,1);
        (5,((4,3),2),1);
        (5,4,3,2,1);
        """),
        schema="newick",
        taxon_set=taxon_set)
    tree = tree_list[0]
    expected_tree = tree_list[1]
    treesplit.encode_splits(tree)
    all_cm = tree.seed_node.edge.split_bitmask
    split_to_target = 0xA
    treemanip.collapse_conflicting(tree.seed_node, split_to_target, all_cm)
    treesplit.encode_splits(tree)
    treesplit.encode_splits(expected_tree)
    self.assertEqual(treecalc.symmetric_difference(tree, expected_tree), 0)

    tree = tree_list[2]
    expected_tree = tree_list[3]
    treesplit.encode_splits(tree)
    all_cm = tree.seed_node.edge.split_bitmask
    split_to_target = 0x3
    treemanip.collapse_conflicting(tree.seed_node, split_to_target, all_cm)
    treesplit.encode_splits(tree)
    treesplit.encode_splits(expected_tree)
    self.assertEqual(treecalc.symmetric_difference(tree, expected_tree), 0)

    tree = tree_list[4]
    expected_tree = tree_list[5]
    treesplit.encode_splits(tree)
    all_cm = tree.seed_node.edge.split_bitmask
    split_to_target = 0x5
    treemanip.collapse_conflicting(tree.seed_node, split_to_target, all_cm)
    treesplit.encode_splits(tree)
    treesplit.encode_splits(expected_tree)
    self.assertEqual(treecalc.symmetric_difference(tree, expected_tree), 0)
def get_length_diffs(tree1,
        tree2,
        edge_length_attr="length",
        value_type=float,
        split_length_diff_map=False):
    """
    Returns a list of tuples, with the first element of each tuple representing
    the length of the branch subtending a particular split on ``tree1``, and
    the second element the length of the same branch on ``tree2``. If a
    particular split is found on one tree but not in the other, a value of zero
    is used for the missing split.
    """
    length_diffs = []
    split_length_diffs = {}
    if tree1.taxon_set is not tree2.taxon_set:
        raise TypeError("Trees have different TaxonSet objects: %s vs. %s" \
                % (hex(id(tree1.taxon_set)), hex(id(tree2.taxon_set))))
    if not hasattr(tree1, "split_edges"):
        treesplit.encode_splits(tree1)
    if not hasattr(tree2, "split_edges"):
        treesplit.encode_splits(tree2)
    split_edges2_copy = dict(tree2.split_edges) # O(n*(2*bind + dict_item_cost))
    split_edges1_ref = tree1.split_edges
    for split, edge in split_edges1_ref.iteritems(): # O n : 2*bind
        elen1 = getattr(edge, edge_length_attr) # attr + bind
        if elen1 is None:
            elen1 = 0 # worst-case: bind
        value1 = value_type(elen1) # ctor + bind
        try:
            e2 = split_edges2_copy.pop(split) # attr + dict_lookup + bind
            elen2 = getattr(e2, edge_length_attr) # attr + bind
            if elen2 is None:
                # allow root edge to have split with no value: raise error if not root edge
                if e2.tail_node is None:
                    elen2 = 0.0
                else:
                    raise ValueError("Edge length attribute is 'None': Tree: %s ('%s'), Split: %s" % (tree2.oid, tree2.label, tree2.taxon_set.split_as_newick_string(split)))
        except KeyError: # excep
            elen2 = 0.0
        value2 = value_type(elen2) # ctor + bind # best case
        length_diffs.append((value1, value2)) # ctor + listappend
        split_length_diffs[split] = length_diffs[-1]

    for split, edge in split_edges2_copy.iteritems(): # best-case not executed, worst case O(n) : 2*bind
        elen2 = getattr(edge, edge_length_attr) # attr + bind
        if elen2 is None:
            elen2 = 0
        value2 = value_type(elen2) # ctor + bind
        e1 = split_edges1_ref.get(split) # attr + dict_lookup + bind
        if e1 is None:
            elen1 = 0.0
        else:
            elen1 = getattr(e1, edge_length_attr) # attr + bind
            if elen1 is None:
                # allow root edge to have split with no value: raise error if not root edge
                if e1.tail_node is None:
                    elen1 = 0.0
                else:
                    raise ValueError("Edge length attribute is 'None': Tree: %s ('%s'), Split: %s" % (tree1.oid, tree1.label, split))
        #elen1 = 0
        value1 = value_type(elen1)
        length_diffs.append((value1, value2)) # ctor + listappend
        split_length_diffs[split] = length_diffs[-1]

    # the numbers below do not reflect additions to the code to protect against
    #   edges with length None
    # loops
    #   best-case:
    #     O(n * (dict_lookup + 3*attr + 3*ctor + 7*bind + listappend))
    #   worst-case:
    #     separated: O(n * (2*dict_lookup + 4*attr + 3*ctor + 8*bind + listappend + excep) + n*(2*dict_lookup + 4*attr + 3*ctor + 8*bind + listappend))
    #     or:
    #     O(2n*(2*dict_lookup + 4*attr + 3*ctor + 8*bind + listappend + 0.5*excep))
    # total
    #   best-case:
    #     O(n * (dict_lookup + 3*attr + 3*ctor + 8*bind + listappend + dict_item_cost))
    #   worst-case:
    #     O(2n*(2*dict_lookup + 4*attr + 3*ctor + 9*bind + listappend + 0.5*(dict_item_cost + excep))
    if split_length_diff_map:
        return length_diffs, split_length_diffs
    else:
        return length_diffs
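# Hypothetical usage sketch for get_length_diffs, not part of the original module
# (assumes DendroPy 3.x; the newick strings are made up). Both trees must share
# a TaxonSet; splits present in only one tree get a length of zero on the other.
import dendropy

_gld_taxa = dendropy.TaxonSet()
_gld_t1 = dendropy.Tree.get_from_string("((a:1,b:2):1,(c:1,d:1):2);", "newick", taxon_set=_gld_taxa)
_gld_t2 = dendropy.Tree.get_from_string("((a:1,b:1):1,(c:2,d:1):1);", "newick", taxon_set=_gld_taxa)
for elen1, elen2 in get_length_diffs(_gld_t1, _gld_t2):
    print elen1, elen2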
def do_sim(birth_rate, death_rate, num_leaves, rng=None):
    temp_dir = tempfile.mkdtemp()
    model_tree = treesim.birth_death(birth_rate=birth_rate,
            death_rate=death_rate,
            ntax=num_leaves,
            rng=rng)

    ################################################################################
    # Calling seq-gen
    mtf = os.path.join(temp_dir, 'simtree')
    print "temp_dir =", temp_dir
    treefile_obj = open(mtf, 'w')
    treefile_obj.write("%s;\n" % str(model_tree))
    # CLOSING THE FILE IS IMPORTANT! This flushes buffers, assuring that the data
    # will be written to the filesystem before seq-gen is invoked.
    treefile_obj.close()

    import subprocess
    command_line = ['seq-gen',
                    '-mHKY',
                    '-on',
                    ]
    if os.environ.get('TREE_INF_TEST_RAND_NUMBER_SEED'):
        # use the seed supplied via the environment, if any
        sg_seed = int(os.environ['TREE_INF_TEST_RAND_NUMBER_SEED'])
    else:
        if rng is None:
            sg_seed = random.randint(0, 100000)
        else:
            sg_seed = rng.randint(0, 100000)
    command_line.append('-z%d' % sg_seed)
    command_line.append('simtree')
    seq_gen_proc = subprocess.Popen(command_line,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=temp_dir)
    dataset = seq_gen_proc.communicate()[0]
    # seq-gen does not exit with an error code when it fails. I don't know why!!
    if seq_gen_proc.returncode != 0 or len(dataset) == 0:
        sys.exit('seq-gen failed!\n')

    sd = os.path.join(temp_dir, 'simdata.nex')
    d = open(sd, 'w')
    d.write(dataset)
    # CLOSING THE FILE IS IMPORTANT! This flushes buffers, assuring that the data
    # will be written to the filesystem before PAUP is invoked.
    d.close()

    ################################################################################
    # PAUP
    pcf = os.path.join(temp_dir, 'execute_paup.nex')
    pc = open(pcf, 'w')
    pc.write('''execute simdata.nex ;
hsearch nomultrees ;
savetree file=inferred.tre format=NEXUS;
quit;
''')
    pc.close()
    paup_proc = subprocess.Popen(['paup', '-n', pcf],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=temp_dir)
    (o, e) = paup_proc.communicate()
    paup_output = os.path.join(temp_dir, 'inferred.tre')
    if paup_proc.returncode != 0 or not os.path.exists(paup_output):
        sys.exit(e)

    # read the inferred tree, binding it to the model tree's taxon set (the file is NEXUS)
    inf_tree_list = TreeList.get_from_path(paup_output, "NEXUS", taxon_set=model_tree.taxon_set)
    assert len(inf_tree_list) == 1
    inferred_tree = inf_tree_list[0]

    # determine which splits were missed
    treesplit.encode_splits(inferred_tree)
    treesplit.encode_splits(model_tree)
    missing = model_tree.find_missing_splits(inferred_tree)

    # sort the nodes of the true tree by depth and ask whether or not they were recovered
    node_depth_TF_list = []
    for node in model_tree.postorder_node_iter():
        children = node.child_nodes()
        if children and node.parent_node:
            first_child = children[0]
            node.depth = first_child.depth + first_child.edge.length
            if node.edge.split_bitmask in missing:
                recovered = 0
            else:
                recovered = 1
            node_depth_TF_list.append((node.depth, node.edge.length, recovered))
        else:
            node.depth = 0.0
    node_depth_TF_list.sort()

    os.remove(pcf)
    os.remove(paup_output)
    os.remove(sd)
    os.remove(mtf)
    os.rmdir(temp_dir)
    return node_depth_TF_list
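# --- Usage sketch (not part of the original function) -------------------------
# A small, hypothetical driver for do_sim(): run one simulated replicate and
# report how many internal edges of the true tree the PAUP search recovered,
# split into shallow and deep halves by node depth. Note that seq-gen and paup
# must be on the PATH for do_sim() to run at all; the rate and leaf-count
# values below are arbitrary examples.
import random

if __name__ == "__main__":
    rng = random.Random(42)
    results = do_sim(birth_rate=1.0, death_rate=0.5, num_leaves=32, rng=rng)
    # results is a depth-sorted list of (node_depth, edge_length, recovered) tuples
    recovered = sum(rec for (depth, elen, rec) in results)
    print "recovered %d of %d internal edges" % (recovered, len(results))
    halfway = len(results) // 2
    for label, chunk in [("shallow", results[:halfway]), ("deep", results[halfway:])]:
        if chunk:
            frac = sum(rec for (d, e, rec) in chunk) / float(len(chunk))
            print "%s half: %.2f recovered" % (label, frac)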
def process_sources_serial(
    support_filepaths,
    schema,
    is_rooted,
    ignore_node_ages,
    ultrametricity_precision,
    calc_tree_probs,
    weighted_trees,
    tree_offset,
    log_frequency,
    messenger,
):
    """
    Returns a SplitDistribution object summarizing all trees found in
    `support_filepaths`.
    """
    messenger.send_info("Running in serial mode.")
    taxon_set = dendropy.TaxonSet()
    split_distribution = treesplit.SplitDistribution(taxon_set=taxon_set)
    split_distribution.ignore_node_ages = ignore_node_ages
    split_distribution.is_rooted = is_rooted
    split_distribution.ultrametricity_precision = ultrametricity_precision
    topology_counter = treesum.TopologyCounter()
    if support_filepaths is None or len(support_filepaths) == 0:
        messenger.send_info("Reading trees from standard input.")
        srcs = [sys.stdin]
    else:
        messenger.send_info("%d source(s) to be processed." % len(support_filepaths))
        # do not want to have all files open at the same time
        # srcs = [open(f, "rU") for f in support_filepaths]
        # store filepaths, to open individually in loop
        srcs = support_filepaths
    for sidx, src in enumerate(srcs):
        # hack needed because we do not want to open all input files at the
        # same time; if not a file object, assume it is a file path and create
        # corresponding file object
        if not isinstance(src, file):
            src = open(src, "rU")
        name = getattr(src, "name", "<stdin>")
        messenger.send_info("Processing %d of %d: '%s'" % (sidx + 1, len(srcs), name), wrap=False)
        for tidx, tree in enumerate(
            tree_source_iter(
                src,
                schema=schema,
                taxon_set=taxon_set,
                store_tree_weights=weighted_trees,
                as_rooted=is_rooted,
            )
        ):
            if tidx >= tree_offset:
                if (log_frequency == 1) or (tidx > 0 and log_frequency > 0 and tidx % log_frequency == 0):
                    messenger.send_info("(processing) '%s': tree at offset %d" % (name, tidx), wrap=False)
                treesplit.encode_splits(tree)
                split_distribution.count_splits_on_tree(tree)
                topology_counter.count(tree, tree_splits_encoded=True)
            else:
                if (log_frequency == 1) or (tidx > 0 and log_frequency > 0 and tidx % log_frequency == 0):
                    messenger.send_info("(processing) '%s': tree at offset %d (skipping)" % (name, tidx), wrap=False)
        try:
            src.close()
        except ValueError:
            # "I/O operation on closed file" if we try to close sys.stdin
            pass
    messenger.send_info("Serial processing of %d source(s) completed." % len(srcs))
    return split_distribution, topology_counter
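# --- Illustration (not part of the original function) -------------------------
# The burn-in / progress-logging cadence inside the tree loop above, factored
# out as a pure function so it can be checked in isolation. The function name is
# hypothetical; the two conditions are copied directly from the loop: a tree is
# counted once its offset reaches tree_offset, and a progress line is emitted at
# every log_frequency-th offset (or for every tree when log_frequency == 1).
def tree_actions(tidx, tree_offset, log_frequency):
    counted = tidx >= tree_offset
    logged = (log_frequency == 1) or (tidx > 0 and log_frequency > 0 and tidx % log_frequency == 0)
    return counted, logged

assert tree_actions(0, tree_offset=10, log_frequency=5) == (False, False)   # burn-in, not a log point
assert tree_actions(5, tree_offset=10, log_frequency=5) == (False, True)    # burn-in, logged as skipped
assert tree_actions(12, tree_offset=10, log_frequency=5) == (True, False)   # counted silently
assert tree_actions(15, tree_offset=10, log_frequency=5) == (True, True)    # counted and logged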
def get_length_diffs(tree1,
        tree2,
        edge_length_attr="length",
        value_type=float,
        split_length_diff_map=False):
    """
    Returns a list of tuples, with the first element of each tuple representing
    the length of the branch subtending a particular split on ``tree1``, and
    the second element the length of the same branch on ``tree2``. If a
    particular split is found on one tree but not in the other, a value of zero
    is used for the missing split.
    """
    length_diffs = []
    split_length_diffs = {}
    if tree1.taxon_set is not tree2.taxon_set:
        raise TypeError("Trees have different TaxonSet objects: %s vs. %s" \
                % (hex(id(tree1.taxon_set)), hex(id(tree2.taxon_set))))
    if not hasattr(tree1, "split_edges"):
        treesplit.encode_splits(tree1)
    if not hasattr(tree2, "split_edges"):
        treesplit.encode_splits(tree2)
    split_edges2_copy = dict(tree2.split_edges) # O(n*(2*bind + dict_item_cost))
    split_edges1_ref = tree1.split_edges
    for split, edge in split_edges1_ref.iteritems(): # O(n) : 2*bind
        elen1 = getattr(edge, edge_length_attr) # attr + bind
        if elen1 is None:
            elen1 = 0 # worst-case: bind
        value1 = value_type(elen1) # ctor + bind
        try:
            e2 = split_edges2_copy.pop(split) # attr + dict_lookup + bind
            elen2 = getattr(e2, edge_length_attr) # attr + bind
            if elen2 is None:
                # allow root edge to have split with no value: raise error if not root edge
                if e2.tail_node is None:
                    elen2 = 0.0
                else:
                    raise ValueError("Edge length attribute is 'None': Tree: %s ('%s'), Split: %s"
                            % (tree2.oid, tree2.label, tree2.taxon_set.split_as_newick_string(split)))
        except KeyError: # excep
            elen2 = 0.0
        value2 = value_type(elen2) # ctor + bind # best case
        length_diffs.append((value1, value2)) # ctor + listappend
        split_length_diffs[split] = length_diffs[-1]

    for split, edge in split_edges2_copy.iteritems(): # best-case not executed, worst case O(n) : 2*bind
        elen2 = getattr(edge, edge_length_attr) # attr + bind
        if elen2 is None:
            elen2 = 0
        value2 = value_type(elen2) # ctor + bind
        e1 = split_edges1_ref.get(split) # attr + dict_lookup + bind
        if e1 is None:
            elen1 = 0.0
        else:
            elen1 = getattr(e1, edge_length_attr) # attr + bind
            if elen1 is None:
                # allow root edge to have split with no value: raise error if not root edge
                if e1.tail_node is None:
                    elen1 = 0.0
                else:
                    raise ValueError("Edge length attribute is 'None': Tree: %s ('%s'), Split: %s"
                            % (tree1.oid, tree1.label, split))
                #elen1 = 0
        value1 = value_type(elen1)
        length_diffs.append((value1, value2)) # ctor + listappend
        split_length_diffs[split] = length_diffs[-1]

    # the numbers below do not reflect additions to the code to protect against
    # edges with length None
    # loops
    #   best-case:
    #       O(n * (dict_lookup + 3*attr + 3*ctor + 7*bind + listappend))
    #   worst-case:
    #       separated: O(n * (2*dict_lookup + 4*attr + 3*ctor + 8*bind + listappend + excep)
    #                   + n * (2*dict_lookup + 4*attr + 3*ctor + 8*bind + listappend))
    #       or:
    #       O(2n * (2*dict_lookup + 4*attr + 3*ctor + 8*bind + listappend + 0.5*excep))
    # total
    #   best-case:
    #       O(n * (dict_lookup + 3*attr + 3*ctor + 8*bind + listappend + dict_item_cost))
    #   worst-case:
    #       O(2n * (2*dict_lookup + 4*attr + 3*ctor + 9*bind + listappend + 0.5*(dict_item_cost + excep)))
    if split_length_diff_map:
        return length_diffs, split_length_diffs
    else:
        return length_diffs
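# --- Usage sketch (not part of the original function) -------------------------
# A short, hypothetical example showing how the (length-on-tree1, length-on-tree2)
# pairs returned by get_length_diffs() could be reduced to a sum-of-squared-
# differences ("branch score") statistic. The Newick strings and the DendroPy
# 3.x-style TaxonSet/Tree.get_from_string() calls are assumptions made for
# illustration; the two trees must share a TaxonSet, as the function requires.
import dendropy

taxa = dendropy.TaxonSet()
tree_a = dendropy.Tree.get_from_string("((A:1,B:1):1,(C:1,D:1):1);", "newick", taxon_set=taxa)
tree_b = dendropy.Tree.get_from_string("((A:1,C:1):2,(B:1,D:1):2);", "newick", taxon_set=taxa)
diffs = get_length_diffs(tree_a, tree_b)
branch_score = sum((e1 - e2) ** 2 for e1, e2 in diffs)
print "sum of squared branch-length differences:", branch_score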