def monophyletic_partition_discordance(tree, taxon_set_partition):
    """
    Returns the number of deep coalescences on tree `tree` that would result
    if the taxa in `tax_sets` formed K mutually-exclusive monophyletic groups,
    where K = len(tax_sets).

    `tree` must provide `postorder_node_iter()`; each of its nodes must
    provide `child_nodes()` and a `taxon` attribute.

    `taxon_set_partition` == TaxonSetPartition; its `subsets()` are the
    groups against which the tree is scored.

    The function builds a condensed copy of `tree` in which every leaf is
    relabelled with a taxon standing for the subset that contains it, and in
    which sibling tips carrying the same subset taxon are merged. The value
    returned is the number of leaves of that condensed tree minus the number
    of subsets.

    Raises ValueError if a leaf of `tree` carries a taxon that is not a
    member of any subset of the partition.
    """
    tax_sets = taxon_set_partition.subsets()

    # One stand-in taxon per subset, labelled with the subset's index.
    dc_tree = dataobject.Tree()
    dc_tree.taxon_set = dataobject.TaxonSet()
    for idx in range(len(tax_sets)):
        dc_tree.taxon_set.append(dataobject.Taxon(label=str(idx)))

    def _get_dc_taxon(nd):
        # Map a leaf of the source tree to the stand-in taxon of its subset.
        for idx, tax_set in enumerate(tax_sets):
            if nd.taxon in tax_set:
                return dc_tree.taxon_set[idx]
        # The original code used `assert "<message>"` here, which can never
        # fail because a non-empty string is truthy; raise explicitly instead.
        raise ValueError("taxon not found in partition: '%s'"
                         % getattr(nd.taxon, "label", None))

    # Maps each source node to its counterpart in the condensed tree; an entry
    # is dropped once the counterpart has been folded into its parent.
    src_dc_map = {}
    for snd in tree.postorder_node_iter():
        nnd = dataobject.Node()
        src_dc_map[snd] = nnd
        children = snd.child_nodes()
        if not children:
            nnd.taxon = _get_dc_taxon(snd)
            continue

        # Distinct subset taxa found on children that are (collapsed) tips.
        taxa_set = []
        for cnd in children:
            dc_node = src_dc_map[cnd]
            if len(dc_node.child_nodes()) > 1:
                # Child still spans several groups: keep it as a subtree.
                nnd.add_child(dc_node)
            else:
                ctax = dc_node.taxon
                if ctax is not None and ctax not in taxa_set:
                    taxa_set.append(ctax)
                del src_dc_map[cnd]

        if len(taxa_set) > 1:
            # Several groups meet at this node: one fresh tip per group.
            for t in taxa_set:
                cnd = dataobject.Node()
                cnd.taxon = t
                nnd.add_child(cnd)
        elif len(nnd.child_nodes()) == 0:
            # Every child belongs to the same group: collapse into one tip.
            nnd.taxon = taxa_set[0]
        elif len(taxa_set) == 1:
            cnd = dataobject.Node()
            cnd.taxon = taxa_set[0]
            nnd.add_child(cnd)

    # The last node visited in post-order is the root of the source tree.
    dc_tree.seed_node = nnd
    return len(dc_tree.leaf_nodes()) - len(tax_sets)
def _ncl_taxa_block_to_native(self, ncl_tb):
    """
    Return the native taxon set corresponding to the NCL taxa block `ncl_tb`.

    Results are cached in `self.ncl_taxa_to_native`, keyed by the block's
    instance identifier string. On a cache miss the labels are read from the
    NCL block; they go into `self._taxa_to_fill` when one is pending (it is
    then cleared), otherwise into a newly created TaxonSet.
    """
    _LOG.debug("Converting NCL taxa block to native")
    _LOG.debug("calling NxsTaxaBlock.GetInstanceIdentifierString()")
    block_id = ncl_tb.GetInstanceIdentifierString()
    _LOG.debug("got %s" % block_id)

    native = self.ncl_taxa_to_native.get(block_id)
    if native is None:
        _LOG.debug("calling NxsTaxaBlock.GetAllLabels()")
        taxon_labels = ncl_tb.GetAllLabels()
        _LOG.debug("labels = %s" % ' '.join(taxon_labels))

        if self._taxa_to_fill is not None:
            # A caller-supplied taxon set is waiting to be populated; use it
            # once and clear the pending slot before filling it.
            native = self._taxa_to_fill
            self._taxa_to_fill = None
            native.extend(
                [dataobject.Taxon(label=name) for name in taxon_labels])
        else:
            native = dataobject.TaxonSet(taxon_labels)

        self.ncl_taxa_to_native[block_id] = native

    return native
def constrained_kingman(pop_tree,
                        gene_tree_list=None,
                        rng=None,
                        gene_node_label_func=None,
                        num_genes_attr='num_genes',
                        pop_size_attr='pop_size',
                        decorate_original_tree=False):
    """
    Given a population tree, `pop_tree` this will return a *pair of trees*: a
    gene tree simulated on this population tree based on Kingman's
    n-coalescent, and population tree with the additional attribute
    'gene_nodes' on each node, which is a list of uncoalesced nodes from the
    gene tree associated with the given node from the population tree.

    `pop_tree` should be a DendroPy Tree object or an object of a class
    derived from this. Each of its leaf nodes must carry the attribute named
    by `num_genes_attr` (by default `num_genes`): the number of gene samples
    from that population in the present.

    `pop_size_attr` is the attribute name of the edges of `pop_tree` that
    specify the population size. By default it is `pop_size`. This should
    specify the effective *haploid* population size; i.e., number of genes in
    the population: 2 * N in a diploid population of N individuals, or N in a
    haploid population of N individuals. If `pop_size` is 1 or 0 or None, then
    the edge lengths of `pop_tree` are taken to be in haploid population
    units; i.e. where 1 unit equals 2N generations for a diploid population of
    size N, or N generations for a haploid population of size N. Otherwise the
    edge lengths of `pop_tree` are taken to be in generations. A non-root edge
    without the attribute is simulated with a population size of 1. The root
    edge uses its own attribute when it has one; when it does not, the size
    used for the most recently processed edge is reused (1 if there was none).

    If `gene_tree_list` is given, then the gene tree is added to the tree
    block, and the tree block's taxa block will be used to manage the gene
    tree's `taxa`.

    `gene_node_label_func` is a function that takes two arguments (a string
    and an integer, respectively, where the string is the containing species
    taxon label and the integer is the gene index, starting at 1) and returns
    a label for the corresponding gene node. The default produces
    "<label>_<two-digit index>".

    `rng` is the random number generator handed to the coalescent simulation;
    GLOBAL_RNG is used when it is None.

    If `decorate_original_tree` is True, then the list of uncoalesced nodes at
    each node of the population tree is added to the original (input)
    population tree instead of a copy. Note that the leaves of the input tree
    receive their `gene_nodes` list in either case, because that happens
    before the copy is taken.

    Note that this function does very much the same thing as
    `contained_coalescent()`, but provides a very different API.
    """
    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_tree_list is not None:
        gtaxa = gene_tree_list.taxon_set
    else:
        gtaxa = dataobject.TaxonSet()

    if gene_node_label_func is None:
        gene_node_label_func = lambda x, y: "%s_%02d" % (x, y)

    # We create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes with the leaf through the labels
    # of their taxa.
    for leaf in pop_tree.leaf_iter():
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            gene_node = dataobject.Node()
            gene_node.taxon = gtaxa.require_taxon(
                label=gene_node_label_func(leaf.taxon.label, gene_count + 1))
            gene_nodes.append(gene_node)
        leaf.gene_nodes = gene_nodes

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For each edge
    # visited, we take the genes found in the child nodes, and run the
    # coalescent simulation on them, bounded by the length of the edge. Any
    # genes that have not yet coalesced at the end of this period are added
    # to the genes of the tail (parent) node of the edge.
    if decorate_original_tree:
        working_poptree = pop_tree
    else:
        # start with a new (deep) copy of the population tree so as not to
        # change the original tree any further
        working_poptree = copy.deepcopy(pop_tree)

    # start with a new tree
    gene_tree = dataobject.Tree()
    gene_tree.taxon_set = gtaxa

    # Previously `pop_size` was only bound while processing non-root edges,
    # so a population tree consisting of the root alone failed with an
    # unbound local, and the root edge's own population size was never read.
    pop_size = 1
    for edge in working_poptree.postorder_edge_iter():
        head = edge.head_node
        is_root_edge = head.parent_node is None

        if hasattr(edge, pop_size_attr):
            pop_size = getattr(edge, pop_size_attr)
        elif not is_root_edge:
            # this means all our time will be in population units
            pop_size = 1
        # else: root edge without the attribute -- keep the value in effect

        if is_root_edge:
            # mrca root: run unconstrained coalescent
            if len(head.gene_nodes) > 1:
                final = coalescent.coalesce(nodes=head.gene_nodes,
                                            pop_size=pop_size,
                                            period=None,
                                            rng=rng)
            else:
                final = head.gene_nodes
            gene_tree.seed_node = final[0]
        else:
            uncoal = coalescent.coalesce(nodes=head.gene_nodes,
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            if not hasattr(edge.tail_node, 'gene_nodes'):
                edge.tail_node.gene_nodes = []
            edge.tail_node.gene_nodes.extend(uncoal)

    gene_tree.is_rooted = True
    if gene_tree_list is not None:
        gene_tree_list.append(gene_tree)
    return gene_tree, working_poptree
def birth_death(birth_rate, death_rate, birth_rate_sd=0.0, death_rate_sd=0.0, **kwargs):
    """
    Returns a birth-death tree with birth rate specified by `birth_rate`, and
    death rate specified by `death_rate`, with edge lengths in continuous
    (real) units.

    `birth_rate_sd` is the standard deviation of the normally-distributed
    mutation added to the birth rate as it is inherited by daughter nodes; if
    0, birth rate does not evolve on the tree.

    `death_rate_sd` is the standard deviation of the normally-distributed
    mutation added to the death rate as it is inherited by daughter nodes; if
    0, death rate does not evolve on the tree.

    Tree growth is controlled by one or more of the following arguments, of
    which at least one must be specified:

        - If `ntax` is given as a keyword argument, tree is grown until the
          number of tips == ntax.
        - If `taxon_set` is given as a keyword argument, tree is grown until
          the number of tips == len(taxon_set), and the taxa are assigned
          randomly to the tips.
        - If 'max_time' is given as a keyword argument, tree is grown for
          a maximum of `max_time`.
        - If `gsa_ntax` is given then the tree will be simulated up to this
          number of tips (or 0 tips), then a tree will be randomly selected
          from the intervals which correspond to times at which the tree had
          exactly `ntax` leaves (or len(taxon_set) tips), each interval being
          chosen with probability proportional to its duration. This allows
          for simulations according to the "General Sampling Approach" of
          [citeHartmannWS2010]_

    Also accepts a Tree object (with valid branch lengths) as an argument
    passed using the keyword `tree`: if given, then this tree will be used;
    otherwise a new one will be created.

    If `assign_taxa` is False, then taxa will *not* be assigned to the tips;
    otherwise (default), taxa will be assigned. Assignment is delegated to
    `tree.randomly_assign_taxa`, which this function always calls with
    `create_required_taxa=True`.

    Under some conditions, it is possible for all lineages on a tree to go
    extinct. In this case, if the keyword argument `repeat_until_success` is
    `True` (the default), then a new branching process is initiated. If
    `False`, then a TreeSimTotalExtinctionException is raised.

    A Random() object or equivalent can be passed using the `rng` keyword;
    otherwise GLOBAL_RNG is used.

    Raises ValueError if no termination condition is given, if `gsa_ntax` is
    given without `ntax`/`taxon_set`, if `gsa_ntax` is smaller than the target
    number of taxa, or if both `tree` and a different `taxon_set` are given.

    .. [citeHartmannWS2010] Hartmann, Wong, and Stadler "Sampling Trees from
        Evolutionary Models" Systematic Biology. 2010. 59(4). 465-476
    """
    target_num_taxa = kwargs.get('ntax')
    max_time = kwargs.get('max_time')
    taxon_set = kwargs.get('taxon_set')
    if (target_num_taxa is None) and (taxon_set is not None):
        target_num_taxa = len(taxon_set)
    elif taxon_set is None:
        taxon_set = dataobject.TaxonSet()
    gsa_ntax = kwargs.get('gsa_ntax')
    terminate_at_full_tree = False
    if target_num_taxa is None:
        if gsa_ntax is not None:
            raise ValueError(
                "When 'gsa_ntax' is used, either 'ntax' or 'taxon_set' must be used")
        if max_time is None:
            raise ValueError(
                "At least one of the following must be specified: 'ntax', 'taxon_set', or 'max_time'")
    else:
        if gsa_ntax is None:
            # Plain (non-GSA) run: stop the first time the target is reached.
            terminate_at_full_tree = True
            gsa_ntax = 1 + target_num_taxa
        elif gsa_ntax < target_num_taxa:
            raise ValueError("gsa_ntax must be greater than target_num_taxa")
    repeat_until_success = kwargs.get('repeat_until_success', True)
    rng = kwargs.get('rng', GLOBAL_RNG)

    # initialize tree
    if "tree" in kwargs:
        tree = kwargs['tree']
        if "taxon_set" in kwargs and kwargs['taxon_set'] is not tree.taxon_set:
            raise ValueError("Cannot specify both `tree` and `taxon_set`")
    else:
        tree = dataobject.Tree(taxon_set=taxon_set)
        tree.is_rooted = True
        tree.seed_node.edge.length = 0.0
        tree.seed_node.birth_rate = birth_rate
        tree.seed_node.death_rate = death_rate

    # grow tree
    leaf_nodes = tree.leaf_nodes()
    curr_num_leaves = len(leaf_nodes)
    total_time = 0

    # For the GSA simulations targetted_time_slices is a list of tuples: the
    # first element is the duration that the simulation spent at the
    # (targetted) number of taxa, the second a list of edge information. The
    # edge information pairs every terminal edge of the tree with the length
    # that edge had at the beginning of the time slice.
    targetted_time_slices = []
    extinct_tips = []
    while True:
        # When gsa_ntax is set (always the case once a target number of taxa
        # is known) the loop exits on the leaf count, on the full-tree slice
        # below, or on extinction; max_time then only gates whether an event
        # is applied further down.
        if gsa_ntax is None:
            assert (max_time is not None)
            if total_time >= max_time:
                break
        elif curr_num_leaves >= gsa_ntax:
            break

        # get vector of birth/death rates, and associate with nodes/events
        event_rates = []
        event_nodes = []
        for nd in leaf_nodes:
            # Nodes of a caller-supplied tree may lack rate attributes.
            if not hasattr(nd, 'birth_rate'):
                nd.birth_rate = birth_rate
            if not hasattr(nd, 'death_rate'):
                nd.death_rate = death_rate
            event_rates.append(nd.birth_rate)
            event_nodes.append((nd, True))  # birth event = True
            event_rates.append(nd.death_rate)
            event_nodes.append((nd, False))  # birth event = False; i.e. death

        # total rate of any birth/death, and waiting time drawn from it
        rate_of_any_event = sum(event_rates)
        waiting_time = rng.expovariate(rate_of_any_event)

        if (gsa_ntax is not None) and (curr_num_leaves == target_num_taxa):
            edge_and_start_length = [(nd.edge, nd.edge.length)
                                     for nd in leaf_nodes]
            targetted_time_slices.append((waiting_time, edge_and_start_length))
            if terminate_at_full_tree:
                break

        # add waiting time to nodes
        for nd in leaf_nodes:
            try:
                nd.edge.length += waiting_time
            except TypeError:
                # edge length was not set (None)
                nd.edge.length = waiting_time
        total_time += waiting_time

        # if event occurs within time constraints
        if max_time is None or total_time <= max_time:

            # normalize rates into selection weights
            event_rates = [rate / rate_of_any_event for rate in event_rates]

            # select node/event and process
            nd, birth_event = probability.weighted_choice(event_nodes, event_rates, rng=rng)
            leaf_nodes.remove(nd)
            curr_num_leaves -= 1
            if birth_event:
                # speciation: two daughters inherit (possibly mutated) rates
                c1 = nd.new_child()
                c2 = nd.new_child()
                c1.edge.length = 0
                c2.edge.length = 0
                c1.birth_rate = nd.birth_rate + rng.gauss(0, birth_rate_sd)
                c1.death_rate = nd.death_rate + rng.gauss(0, death_rate_sd)
                c2.birth_rate = nd.birth_rate + rng.gauss(0, birth_rate_sd)
                c2.death_rate = nd.death_rate + rng.gauss(0, death_rate_sd)
                leaf_nodes.append(c1)
                leaf_nodes.append(c2)
                curr_num_leaves += 2
            else:
                # extinction
                if curr_num_leaves > 0:
                    # remember the tip; it is pruned after the simulation
                    extinct_tips.append(nd)
                else:
                    if (gsa_ntax is not None):
                        if (len(targetted_time_slices) > 0):
                            # the target size was visited before the tree
                            # died out, so a slice can still be sampled
                            break
                    if not repeat_until_success:
                        raise TreeSimTotalExtinctionException()
                    # We are going to basically restart the simulation because
                    # the tree has gone extinct (without reaching the
                    # specified ntax)
                    leaf_nodes = [tree.seed_node]
                    curr_num_leaves = 1
                    for nd in tree.seed_node.child_nodes():
                        treemanip.prune_subtree(tree, nd, delete_outdegree_one=False)
                    extinct_tips = []
                    total_time = 0
            assert (curr_num_leaves == len(leaf_nodes))

    if gsa_ntax is not None:
        total_duration_at_target_n_tax = 0.0
        for time_slice in targetted_time_slices:
            total_duration_at_target_n_tax += time_slice[0]
        r = rng.random() * total_duration_at_target_n_tax
        selected_slice = None
        for time_slice in targetted_time_slices:
            r -= time_slice[0]
            if r < 0.0:
                selected_slice = time_slice
                # Stop at the first slice whose cumulative duration exceeds
                # the draw; without this every later slice also satisfied
                # r < 0 and the last slice was always the one selected.
                break
        assert (selected_slice is not None)
        edges_at_slice = selected_slice[1]
        last_waiting_time = selected_slice[0]
        # Rewind the tree to the selected slice: discard everything that
        # descended from the edges that were terminal at that time.
        for e, prev_length in edges_at_slice:
            daughter_nd = e.head_node
            for nd in daughter_nd.child_nodes():
                treemanip.prune_subtree(tree, nd, delete_outdegree_one=False)
                try:
                    extinct_tips.remove(nd)
                except ValueError:
                    # nd was not recorded as an extinct tip
                    pass
            try:
                extinct_tips.remove(daughter_nd)
            except ValueError:
                # daughter_nd was not recorded as an extinct tip
                pass
            e.length = prev_length + last_waiting_time

    for nd in extinct_tips:
        bef = len(tree.leaf_nodes())
        # Climb to the topmost ancestor that has this lineage as its only
        # descendant, so that no childless internal node is left behind.
        while (nd.parent_node is not None) and (len(
                nd.parent_node.child_nodes()) == 1):
            _LOG.debug("Will be pruning %d rather than its only child (%d)" % (id(nd.parent_node), id(nd)))
            nd = nd.parent_node
        if nd.parent_node:
            treemanip.prune_subtree(tree, nd, delete_outdegree_one=False)
        _LOG.debug("After prune (went from %d to %d leaves):\n%s" % (bef, len(tree.leaf_nodes()), tree.as_ascii_plot(plot_metric='length', show_internal_node_labels=True)))
    tree.delete_outdegree_one_nodes()

    if kwargs.get("assign_taxa", True):
        tree.randomly_assign_taxa(create_required_taxa=True, rng=rng)

    # return
    return tree
def discrete_birth_death(birth_rate, death_rate, birth_rate_sd=0.0, death_rate_sd=0.0, **kwargs):
    """
    Returns a birth-death tree with birth rate specified by `birth_rate`, and
    death rate specified by `death_rate`, with edge lengths in discrete
    (integer) units.

    `birth_rate_sd` is the standard deviation of the normally-distributed
    mutation added to the birth rate as it is inherited by daughter nodes; if
    0, birth rate does not evolve on the tree.

    `death_rate_sd` is the standard deviation of the normally-distributed
    mutation added to the death rate as it is inherited by daughter nodes; if
    0, death rate does not evolve on the tree.

    Tree growth is controlled by one or more of the following arguments, of
    which at least one must be specified:

        - If `ntax` is given as a keyword argument, tree is grown until the
          number of tips reaches ntax.
        - If `taxon_set` is given as a keyword argument, tree is grown until
          the number of tips reaches len(taxon_set), and the taxa are assigned
          randomly to the tips.
        - If 'max_time' is given as a keyword argument, tree is grown for
          `max_time` number of generations.

    If more than one of the above is given, then tree growth will terminate
    when *any* of the termination conditions are met.

    Also accepts a Tree object (with valid branch lengths) as an argument
    passed using the keyword `tree`: if given, then this tree will be used;
    otherwise a new one will be created.

    If `assign_taxa` is False, then taxa will *not* be assigned to the tips;
    otherwise (default), taxa will be assigned. Assignment is delegated to
    `tree.randomly_assign_taxa`, which this function always calls with
    `create_required_taxa=True`.

    Under some conditions, it is possible for all lineages on a tree to go
    extinct. In this case, if the keyword argument `repeat_until_success` is
    `True`, then the generation counter is reset and the process carries on
    from the seed node. If `False` (default), then a
    TreeSimTotalExtinctionException is raised.

    A Random() object or equivalent can be passed using the `rng` keyword;
    otherwise GLOBAL_RNG is used.

    Raises ValueError if none of `ntax`, `taxon_set` or `max_time` is given,
    or if both `tree` and a different `taxon_set` are given.
    """
    if 'ntax' not in kwargs \
            and 'taxon_set' not in kwargs \
            and 'max_time' not in kwargs:
        raise ValueError(
            "At least one of the following must be specified: 'ntax', 'taxon_set', or 'max_time'")
    target_num_taxa = None
    taxon_set = None
    target_num_gens = kwargs.get('max_time', None)
    if 'taxon_set' in kwargs:
        taxon_set = kwargs.get('taxon_set')
        # an explicit `ntax` takes precedence over the size of the taxon set
        target_num_taxa = kwargs.get('ntax', len(taxon_set))
    elif 'ntax' in kwargs:
        target_num_taxa = kwargs['ntax']
    if taxon_set is None:
        taxon_set = dataobject.TaxonSet()
    repeat_until_success = kwargs.get('repeat_until_success', False)
    rng = kwargs.get('rng', GLOBAL_RNG)

    # grow tree
    if "tree" in kwargs:
        tree = kwargs['tree']
        if "taxon_set" in kwargs and kwargs['taxon_set'] is not tree.taxon_set:
            raise ValueError("Cannot specify both `tree` and `taxon_set`")
    else:
        tree = dataobject.Tree(taxon_set=taxon_set)
        tree.is_rooted = True
        tree.seed_node.edge.length = 0
        tree.seed_node.birth_rate = birth_rate
        tree.seed_node.death_rate = death_rate
    leaf_nodes = tree.leaf_nodes()
    num_gens = 0
    while (target_num_taxa is None or len(leaf_nodes) < target_num_taxa) \
            and (target_num_gens is None or num_gens < target_num_gens):
        # one generation: every current tip ages by one unit and then
        # either splits, dies, or survives unchanged
        for nd in leaf_nodes:
            # Nodes of a caller-supplied tree may lack rate attributes.
            if not hasattr(nd, 'birth_rate'):
                nd.birth_rate = birth_rate
            if not hasattr(nd, 'death_rate'):
                nd.death_rate = death_rate
            try:
                nd.edge.length += 1
            except TypeError:
                # edge length was not set (None)
                nd.edge.length = 1
            u = rng.uniform(0, 1)
            if u < nd.birth_rate:
                c1 = nd.new_child()
                c2 = nd.new_child()
                c1.edge.length = 0
                c2.edge.length = 0
                c1.birth_rate = nd.birth_rate + rng.gauss(0, birth_rate_sd)
                c1.death_rate = nd.death_rate + rng.gauss(0, death_rate_sd)
                c2.birth_rate = nd.birth_rate + rng.gauss(0, birth_rate_sd)
                c2.death_rate = nd.death_rate + rng.gauss(0, death_rate_sd)
            elif u > nd.birth_rate and u < (nd.birth_rate + nd.death_rate):
                if nd is not tree.seed_node:
                    treemanip.prune_subtree(tree, nd)
                elif not repeat_until_success:
                    # all lineages are extinct: raise exception
                    raise TreeSimTotalExtinctionException()
                else:
                    # all lineages are extinct: repeat
                    num_gens = 0

        num_gens += 1
        leaf_nodes = tree.leaf_nodes()

    # If termination condition specified by ntax or taxon_set, then the last
    # split will have daughter edges of length == 0;
    # so we continue growing the edges until the next birth/death event *or*
    # the max number of generations condition is given and met
    gens_to_add = 0
    while (target_num_gens is None or num_gens < target_num_gens):
        u = rng.uniform(0, 1)
        if u < (birth_rate + death_rate):
            break
        gens_to_add += 1
        # Count the added generation against max_time as well; previously
        # num_gens never changed here, so the generation limit in the loop
        # condition could not stop the extension.
        num_gens += 1
    for nd in tree.leaf_nodes():
        nd.edge.length += gens_to_add

    if kwargs.get("assign_taxa", True):
        tree.randomly_assign_taxa(create_required_taxa=True, rng=rng)

    # return
    return tree
def tree_from_token_stream(stream_tokenizer, **kwargs):
    """
    Processes a (SINGLE) TREE statement. Assumes that the input stream is
    located at the beginning of the statement (i.e., the first non-comment
    token should be the opening parenthesis of the tree definition).

    Returns None when the tokenizer yields no token at all.

    Keyword arguments read by this function (default in parentheses):

        - `translate_dict` (None): passed to StrToTaxon when no
          `str_to_taxon` is supplied.
        - `encode_splits` (False): if True, split bitmasks are accumulated on
          the edges and stored in `tree.split_edges` while parsing; this
          requires a non-empty `taxon_set`, otherwise an Exception is raised.
        - `rooting_interpreter` (RootingInterpreter(**kwargs)): decides
          `tree.is_rooted` from the tokenizer's rooting comment.
        - `finish_node_func` (None): if given, called as
          `finish_node_func(node, tree)` when a node is closed by ',' or ')'.
        - `edge_len_type` (float): read here but not used in the portion of
          the body shown below.
        - `taxon_set` (None): a new TaxonSet is created when None.
        - `suppress_internal_node_taxa` (False): if True, labels on internal
          nodes are not resolved to taxa.
        - `store_tree_weights` (False): if True, `tree.weight` is set from
          the tokenizer's tree weight comment.
        - `extract_comment_metadata` (False): copied onto the tokenizer and
          controls whether comment metadata is attached to tree and nodes.
        - `case_sensitive_taxon_labels` (False), `allow_repeated_use`
          (False): passed to StrToTaxon when no `str_to_taxon` is supplied.
        - `str_to_taxon`: if used, must supply the StrToTaxon interface.

    Raises the tokenizer's `data_format_error` for malformed tree
    descriptions (unbalanced parentheses, missing or repeated labels, a
    comma outside the tree description).
    """
    translate_dict = kwargs.get("translate_dict", None)
    encode_splits = kwargs.get("encode_splits", False)
    # The default RootingInterpreter is constructed even when the caller
    # supplies one, because the default is an eagerly evaluated argument.
    rooting_interpreter = kwargs.get("rooting_interpreter",
                                     RootingInterpreter(**kwargs))
    finish_node_func = kwargs.get("finish_node_func", None)
    edge_len_type = kwargs.get("edge_len_type", float)
    taxon_set = kwargs.get("taxon_set", None)
    suppress_internal_node_taxa = kwargs.get("suppress_internal_node_taxa",
                                             False)
    store_tree_weights = kwargs.get("store_tree_weights", False)
    extract_comment_metadata = kwargs.get('extract_comment_metadata', False)
    case_sensitive_taxon_labels = kwargs.get('case_sensitive_taxon_labels',
                                             False)
    allow_repeated_use = kwargs.get('allow_repeated_use', False)

    # Remember the tokenizer's own setting before overriding it.
    # NOTE(review): nothing in the visible part of this function restores it
    # (including the early `return None` below) -- confirm whether the caller
    # or a later part of the function is expected to.
    stream_tokenizer_extract_comment_metadata_setting = stream_tokenizer.extract_comment_metadata
    stream_tokenizer.extract_comment_metadata = extract_comment_metadata
    if taxon_set is None:
        taxon_set = dataobject.TaxonSet()
    tree = dataobject.Tree(taxon_set=taxon_set)

    stream_tokenizer.tree_rooting_comment = None  # clear previous comment
    stream_tokenizer.clear_comment_metadata()
    token = stream_tokenizer.read_next_token()
    if not token:
        return None
    tree.is_rooted = rooting_interpreter.interpret_as_rooted(
        stream_tokenizer.tree_rooting_comment)
    #    if stream_tokenizer.tree_rooting_comment is not None:
    #        tree.is_rooted = rooting_interpreter.interpret_as_rooted(stream_tokenizer.tree_rooting_comment)
    #    elif rooting_interpreter.interpret_as_rooted(stream_tokenizer.tree_rooting_comment):
    #        tree_is_rooted = True

    if store_tree_weights and stream_tokenizer.tree_weight_comment is not None:
        try:
            # The weight is the second space-separated field of the comment
            # and may be written as a ratio ("a/b"); every '/'-separated part
            # is wrapped in float(...) so that the division is a float one.
            # NOTE(review): this eval()s text taken from the weight comment
            # of the input being parsed -- unsafe if the input is untrusted.
            # Only IndexError and ValueError are suppressed below.
            weight_expression = stream_tokenizer.tree_weight_comment.split(
                ' ')[1]
            tree.weight = eval("/".join(
                ["float(%s)" % cv for cv in weight_expression.split('/')]))
        except IndexError:
            pass
        except ValueError:
            pass
        stream_tokenizer.tree_weight_comment = None

    if encode_splits:
        if len(taxon_set) == 0:
            raise Exception("When encoding splits on a tree as it is being parsed, a "
                    + "fully pre-populated TaxonSet object must be specified using the 'taxon_set' keyword " \
                    + "to avoid taxon/split bitmask values changing as new Taxon objects are created " \
                    + "and added to the TaxonSet.")
        if tree.is_rooted:
            tree.split_edges = {}
        else:
            # Unrooted trees use a dict keyed on bitmasks normalized against
            # the mask of all taxa.
            atb = taxon_set.all_taxa_bitmask()
            d = containers.NormalizedBitmaskDict(mask=atb)
            tree.split_edges = d
        split_map = tree.split_edges

    stt = kwargs.get('str_to_taxon')
    if stt is None:
        stt = StrToTaxon(taxon_set,
                         translate_dict,
                         allow_repeated_use=allow_repeated_use,
                         case_sensitive=case_sensitive_taxon_labels)

    tree.seed_node = dataobject.Node()
    curr_node = tree.seed_node
    if encode_splits:
        # `0L` is a Python 2 long literal: an empty split bitmask.
        curr_node.edge.split_bitmask = 0L

    ### NHX format support ###
    def store_node_comments(active_node):
        # Append the comments currently held by the tokenizer to the node.
        if stream_tokenizer.comments:
            active_node.comments.extend(stream_tokenizer.comments)

    def store_comment_metadata(target):
        # Merge pending comment metadata from the tokenizer into `target`
        # (tree or node); when there is none, make sure `target` at least
        # has an empty `comment_metadata` dict.
        if extract_comment_metadata:
            if stream_tokenizer.has_comment_metadata():
                comment_metadata = stream_tokenizer.comment_metadata
                try:
                    target.comment_metadata.update(comment_metadata)
                except AttributeError:
                    target.comment_metadata = comment_metadata
                stream_tokenizer.clear_comment_metadata()
            elif not hasattr(target, "comment_metadata"):
                target.comment_metadata = {}

    # store and clear comments
    tree.comments = stream_tokenizer.comments
    stream_tokenizer.clear_comments()
    store_comment_metadata(tree)

    # Main parse loop: `curr_node` is the node currently being described and
    # `token` the token under consideration.
    while True:
        if not token or token == ';':
            # End of the statement: we must be back at the root.
            if curr_node is not tree.seed_node:
                raise stream_tokenizer.data_format_error(
                    "Unbalanced parentheses -- not enough ')' characters found in tree description"
                )
            if encode_splits:
                split_map[curr_node.edge.split_bitmask] = curr_node.edge
            break
        if token == '(':
            # Open a new clade: descend into a fresh first child.
            if not curr_node.parent_node:
                if curr_node.child_nodes():
                    raise stream_tokenizer.data_format_error(
                        "Unexpected '(' after the tree description. Expecting a label for the root or a ;"
                    )
            tmp_node = dataobject.Node()
            if encode_splits:
                tmp_node.edge.split_bitmask = 0L
            curr_node.add_child(tmp_node)
            curr_node = tmp_node
            token = stream_tokenizer.read_next_token()
            store_node_comments(curr_node)
            store_comment_metadata(curr_node)
        elif token == ',':
            # Close the current node and start a sibling under its parent.
            tmp_node = dataobject.Node()
            if curr_node.is_leaf() and not curr_node.taxon:
                #                 curr_node.taxon = taxon_set.Taxon(oid="UNAMED_" + str(id(curr_node)), label='')
                #                 taxon_set.add(curr_node.taxon)
                raise stream_tokenizer.data_format_error(
                    "Missing taxon specifier in a tree -- found either a '(,' or ',,' construct."
                )
            p = curr_node.parent_node
            if not p:
                raise stream_tokenizer.data_format_error(
                    "Comma found one the 'outside' of a newick tree description"
                )
            if encode_splits:
                # Register the finished node's split and fold it into the
                # parent's bitmask.
                tmp_node.edge.split_bitmask = 0L
                e = curr_node.edge
                u = e.split_bitmask
                split_map[u] = e
                p.edge.split_bitmask |= u
            if finish_node_func is not None:
                finish_node_func(curr_node, tree)
            p.add_child(tmp_node)
            curr_node = tmp_node
            token = stream_tokenizer.read_next_token()
            store_node_comments(curr_node)
            store_comment_metadata(curr_node)
        else:
            if token == ')':
                # Close the current node and move back up to its parent.
                if curr_node.is_leaf() and not curr_node.taxon:
                    raise stream_tokenizer.data_format_error(
                        "Missing taxon specifier in a tree -- found either a '(,' or ',,' construct."
                    )
                p = curr_node.parent_node
                if not p:
                    raise stream_tokenizer.data_format_error(
                        "Unbalanced parentheses -- too many ')' characters found in tree description"
                    )
                if encode_splits:
                    e = curr_node.edge
                    u = e.split_bitmask
                    p.edge.split_bitmask |= u
                    split_map[u] = curr_node.edge
                if finish_node_func is not None:
                    finish_node_func(curr_node, tree)
                curr_node = p
            else:
                # Any other token is treated as a label for the current node.
                is_leaf = curr_node.is_leaf()
                if is_leaf:
                    if curr_node.taxon:
                        raise stream_tokenizer.data_format_error(
                            "Multiple labels found for the same leaf (taxon '%s' and label '%s')"
                            % (str(curr_node.taxon), token))
                    try:
                        # `stt_require_taxon` is not defined in this function;
                        # presumably a helper defined elsewhere -- TODO confirm.
                        t = stt_require_taxon(stt, label=token)
                    except StrToTaxon.MultipleTaxonUseError, e:
                        raise stream_tokenizer.data_format_error(e.msg)
                else:
                    if curr_node.label:
                        raise stream_tokenizer.data_format_error(
                            "Multiple labels found for the same leaf (taxon '%s' and label '%s')"
                            % (curr_node.label, token))
                    if suppress_internal_node_taxa:
                        t = None
                    else:
                        try:
                            t = stt.get_taxon(label=token)
                        except StrToTaxon.MultipleTaxonUseError, e:
                            raise stream_tokenizer.data_format_error(e.msg)
                # NOTE(review): the body as visible here stops at this point:
                # `t` is never attached to `curr_node`, the ')' and label
                # branches do not read the next token, and the tree is never
                # returned. Presumably the remainder of the loop has been cut
                # off from this excerpt -- confirm against the full file.
def _get_contained_taxon_set(self):
    """
    Return the contained taxon set, creating and storing an empty TaxonSet
    the first time it is requested while none is set.
    """
    if self._contained_taxon_set is not None:
        return self._contained_taxon_set
    # Nothing stored yet: build the default lazily and remember it.
    self._contained_taxon_set = dataobject.TaxonSet()
    return self._contained_taxon_set