def _parse_tree_statement(self, taxon_set=None):
    """
    Processes a TREE command. Assumes that the file reader is
    positioned right after the "TREE" token in a TREE command.
    Calls on the NewickStatementParser of the trees module.
    """
    tokenizer = self.stream_tokenizer
    token = tokenizer.read_next_token()
    # A leading '*' (conventionally flagging the default tree) is consumed
    # and otherwise ignored here.
    if token == '*':
        token = tokenizer.read_next_token()
    tree_name = token
    token = tokenizer.read_next_token()
    if token != '=':
        raise self.data_format_error("Expecting '=' in definition of Tree '%s' but found '%s'" % (tree_name, token))
    # Capture comments gathered so far, before the tree parser consumes
    # the remainder of the statement.
    tree_comments = tokenizer.comments
    tree = nexustokenizer.tree_from_token_stream(
        stream_tokenizer=tokenizer,
        taxon_set=taxon_set,
        translate_dict=self.tree_translate_dict,
        encode_splits=self.encode_splits,
        rooting_interpreter=self.rooting_interpreter,
        finish_node_func=self.finish_node_func,
        extract_comment_metadata=self.extract_comment_metadata,
        store_tree_weights=self.store_tree_weights,
        preserve_underscores=self.preserve_underscores,
        suppress_internal_node_taxa=self.suppress_internal_node_taxa,
        edge_len_type=self.edge_len_type,
        case_sensitive_taxon_labels=self.case_sensitive_taxon_labels)
    tree.label = tree_name
    if tree_comments:
        tree.comments.extend(tree_comments)
    # Leave the tokenizer positioned past the statement's terminating ';'.
    if tokenizer.current_token != ';':
        tokenizer.skip_to_semicolon()
    return tree
Exemplo n.º 2
0
 def _parse_tree_statement(self, taxon_set=None):
     """
     Processes a TREE command. Assumes that the file reader is
     positioned right after the "TREE" token in a TREE command.
     Calls on the NewickStatementParser of the trees module.
     """
     name_token = self.stream_tokenizer.read_next_token()
     # Skip the optional '*' prefix (conventionally the default-tree marker).
     if name_token == '*':
         name_token = self.stream_tokenizer.read_next_token()
     tree_name = name_token
     eq_token = self.stream_tokenizer.read_next_token()
     if eq_token != '=':
         raise self.data_format_error(
             "Expecting '=' in definition of Tree '%s' but found '%s'" %
             (tree_name, eq_token))
     # Comments accumulated up to this point belong to this tree statement.
     saved_comments = self.stream_tokenizer.comments
     parse_kwargs = dict(
         stream_tokenizer=self.stream_tokenizer,
         taxon_set=taxon_set,
         translate_dict=self.tree_translate_dict,
         encode_splits=self.encode_splits,
         rooting_interpreter=self.rooting_interpreter,
         finish_node_func=self.finish_node_func,
         extract_comment_metadata=self.extract_comment_metadata,
         store_tree_weights=self.store_tree_weights,
         preserve_underscores=self.preserve_underscores,
         suppress_internal_node_taxa=self.suppress_internal_node_taxa,
         edge_len_type=self.edge_len_type,
         case_sensitive_taxon_labels=self.case_sensitive_taxon_labels)
     tree = nexustokenizer.tree_from_token_stream(**parse_kwargs)
     tree.label = tree_name
     if saved_comments:
         tree.comments.extend(saved_comments)
     # Advance past the terminating ';' if the parser stopped short of it.
     if self.stream_tokenizer.current_token != ';':
         self.stream_tokenizer.skip_to_semicolon()
     return tree
Exemplo n.º 3
0
        def _ncl_tree_tokens_to_native_tree(self,
                                            ncl_tb,
                                            taxa_block,
                                            tree_tokens,
                                            rooted_flag=None):
            """
            Builds a native tree object from a list of NCL tree tokens.

            `ncl_tb` is the NCL taxa-block handle; `taxa_block` is the native
            taxa block (derived from `ncl_tb` if None); `tree_tokens` is the
            token list for one tree; `rooted_flag` is passed through as the
            iterator's `tree_rooted` attribute. Returns None when
            `tree_tokens` is empty, otherwise the parsed tree.
            """
            if not tree_tokens:
                return None
            if taxa_block is None:
                taxa_block = self._ncl_taxa_block_to_native(ncl_tb)
            lti = ListOfTokenIterator(tree_tokens)
            lti.tree_rooted = rooted_flag
            # Build (and cache) the 1-based-label -> taxon translation table.
            # FIX: the original tested membership with the instance-identifier
            # string (GetInstanceIdentifierString()) but stored entries under
            # the ncl_tb object, so the cache never hit and the table was
            # rebuilt on every call; key the check by ncl_tb to match storage.
            if ncl_tb not in self.tree_translate_dicts:
                translate = self.tree_translate_dicts[ncl_tb] = {}
                for index, taxon in enumerate(taxa_block):
                    translate[str(index + 1)] = taxon
                    if self.encode_splits:
                        taxon.clade_mask = (1 << index)
            return nexustokenizer.tree_from_token_stream(
                lti,
                taxon_set=taxa_block,
                translate_dict=self.tree_translate_dicts[ncl_tb],
                encode_splits=self.encode_splits,
                rooting_interpreter=self.rooting_interpreter,
                finish_node_func=self.finish_node_func)
        def _ncl_tree_tokens_to_native_tree(self, ncl_tb, taxa_block, tree_tokens, rooted_flag=None):
            """
            Converts a list of NCL tree tokens into a native tree object,
            returning None when `tree_tokens` is empty. `rooted_flag` is
            carried through on the token iterator as `tree_rooted`.
            """
            if not tree_tokens:
                return None
            iid = ncl_tb.GetInstanceIdentifierString()
            if taxa_block is None:
                taxa_block = self._ncl_taxa_block_to_native(ncl_tb)
            token_iter = ListOfTokenIterator(tree_tokens)
            token_iter.tree_rooted = rooted_flag
            # NOTE(review): membership is tested with `iid` but entries are
            # stored under `ncl_tb`, so this test is effectively always true
            # and the table is rebuilt each call; behavior preserved as-is.
            if iid not in self.tree_translate_dicts:
                mapping = {}
                for index, taxon in enumerate(taxa_block):
                    mapping[str(index + 1)] = taxon
                    if self.encode_splits:
                        taxon.clade_mask = (1 << index)
                self.tree_translate_dicts[ncl_tb] = mapping
            return nexustokenizer.tree_from_token_stream(
                token_iter,
                taxon_set=taxa_block,
                translate_dict=self.tree_translate_dicts[ncl_tb],
                encode_splits=self.encode_splits,
                rooting_interpreter=self.rooting_interpreter,
                finish_node_func=self.finish_node_func)
Exemplo n.º 5
0
def tree_source_iter(stream, **kwargs):
    """
    Iterates over a NEWICK-formatted source of trees given by file-like object
    `stream`.

    Note that if `encode_splits` is True, then a `taxon_set` has to be given.
    This is because adding Taxon objects to a taxon set may invalidate split
    bitmasks. Because NEWICK tree taxa are added to a TaxonSet as they are found
    on a tree, there is a strong possibility that all split bitmasks get
    invalidated in the middle of parsing a tree. To avoid this, and, more
    importantly to avoid errors downstream in client code due to this, we
    force specification of a `taxon_set` if `encode_splits` is requested.

    The following optional keyword arguments are also recognized:

        `taxon_set`
            TaxonSet object to use when reading data.

        `as_rooted=True` (or `as_unrooted=False`)
            Unconditionally interprets all trees as rooted.

        `as_unrooted=True` (or `as_rooted=False`)
            Unconditionally interprets all trees as unrooted.

        `default_as_rooted=True` (or `default_as_unrooted=False`)
            Interprets all trees as rooted if rooting not given by `[&R]`
            or `[&U]` comments.

        `default_as_unrooted=True` (or `default_as_rooted=False`)
            Interprets all trees as unrooted if rooting not given by `[&R]`
            or `[&U]` comments.

        `edge_len_type`
            Specifies the type of the edge lengths (int or float).

        `extract_comment_metadata`
            If True, any comments that begin with '&' or '&&' associated with
            items will be processed and stored as part of the annotation set of
            the object (`annotations`). If False, this will be skipped. Defaults
            to False.

        `store_tree_weights`
            If True, process the tree weight ("[&W 1/2]") comment
            associated with each tree, if any.

        `encode_splits`
            Specifies whether or not split bitmasks will be calculated and
            attached to the edges.

        `finish_node_func`
            Is a function that will be applied to each node after it has
            been constructed.

        `case_sensitive_taxon_labels`
            If True, then taxon labels are case sensitive (different cases
            = different taxa); defaults to False.

        `allow_duplicate_taxon_labels`
            If True, then multiple identical taxon labels will be allowed.
            Defaults to False: treat multiple identical taxon labels as an
            error.

        `preserve_underscores`
            If True, unquoted underscores in labels will *not* be converted to
            spaces. Defaults to False: all underscores not protected by
            quotes will be converted to spaces.

        `suppress_internal_node_taxa`
            If False, internal node labels will be instantiated into Taxon
            objects. Defaults to True: internal node labels will *not* be
            treated as taxa.

        `hyphens_as_tokens`
            If True, hyphens will be treated as special punctuation
            characters. Defaults to False, hyphens not treated as special
            punctuation characters.

    """
    # Consume `taxon_set` so it is not passed to the tree parser a second
    # time via **kwargs below.
    taxon_set = kwargs.pop("taxon_set", None)
    if "encode_splits" in kwargs and taxon_set is None:
        raise Exception("When encoding splits on trees, a pre-populated TaxonSet instance "
                        "must be provided using the 'taxon_set' keyword to avoid taxon/split bitmask values "
                        "changing as new Taxon objects are added to the set.")
    preserve_underscores = kwargs.get('preserve_underscores', False)
    hyphens_as_tokens = kwargs.get('hyphens_as_tokens', nexustokenizer.DEFAULT_HYPHENS_AS_TOKENS)
    extract_comment_metadata = kwargs.get("extract_comment_metadata", False)
    newick_stream = nexustokenizer.NexusTokenizer(stream,
            preserve_underscores=preserve_underscores,
            hyphens_as_tokens=hyphens_as_tokens,
            extract_comment_metadata=extract_comment_metadata,
            case_sensitive_taxon_labels=kwargs.get('case_sensitive_taxon_labels', False))
    while not newick_stream.eof:
        tree = nexustokenizer.tree_from_token_stream(newick_stream, taxon_set=taxon_set, **kwargs)
        if tree is None:
            # A None result signals no further tree in the stream. Under
            # PEP 479 (Python 3.7+), raising StopIteration inside a generator
            # is converted to a RuntimeError, so terminate the generator with
            # a plain `return` instead of `raise StopIteration()`.
            return
        yield tree
Exemplo n.º 6
0
def tree_source_iter(stream, **kwargs):
    """
    Iterates over a NEWICK-formatted source of trees given by file-like object
    `stream`.

    Note that if `encode_splits` is True, then a `taxon_set` has to be given.
    This is because adding Taxon objects to a taxon set may invalidate split
    bitmasks. Because NEWICK tree taxa are added to a TaxonSet as they are found
    on a tree, there is a strong possibility that all split bitmasks get
    invalidated in the middle of parsing a tree. To avoid this, and, more
    importantly to avoid errors downstream in client code due to this, we
    force specification of a `taxon_set` if `encode_splits` is requested.

    The following optional keyword arguments are also recognized:

        `taxon_set`
            TaxonSet object to use when reading data.

        `as_rooted=True` (or `as_unrooted=False`)
            Unconditionally interprets all trees as rooted.

        `as_unrooted=True` (or `as_rooted=False`)
            Unconditionally interprets all trees as unrooted.

        `default_as_rooted=True` (or `default_as_unrooted=False`)
            Interprets all trees as rooted if rooting not given by `[&R]`
            or `[&U]` comments.

        `default_as_unrooted=True` (or `default_as_rooted=False`)
            Interprets all trees as unrooted if rooting not given by `[&R]`
            or `[&U]` comments.

        `edge_len_type`
            Specifies the type of the edge lengths (int or float).

        `extract_comment_metadata`
            If True, any comments that begin with '&' or '&&' associated with
            items will be processed and stored as part of the annotation set of
            the object (`annotations`). If False, this will be skipped. Defaults
            to False.

        `store_tree_weights`
            If True, process the tree weight ("[&W 1/2]") comment
            associated with each tree, if any.

        `encode_splits`
            Specifies whether or not split bitmasks will be calculated and
            attached to the edges.

        `finish_node_func`
            Is a function that will be applied to each node after it has
            been constructed.

        `case_sensitive_taxon_labels`
            If True, then taxon labels are case sensitive (different cases
            = different taxa); defaults to False.

        `allow_duplicate_taxon_labels`
            If True, then multiple identical taxon labels will be allowed.
            Defaults to False: treat multiple identical taxon labels as an
            error.

        `preserve_underscores`
            If True, unquoted underscores in labels will *not* be converted to
            spaces. Defaults to False: all underscores not protected by
            quotes will be converted to spaces.

        `suppress_internal_node_taxa`
            If False, internal node labels will be instantiated into Taxon
            objects. Defaults to True: internal node labels will *not* be
            treated as taxa.

        `hyphens_as_tokens`
            If True, hyphens will be treated as special punctuation
            characters. Defaults to False, hyphens not treated as special
            punctuation characters.

    """
    # Remove `taxon_set` from kwargs here so it is not forwarded twice to
    # the tree parser below.
    taxon_set = kwargs.pop("taxon_set", None)
    if "encode_splits" in kwargs and taxon_set is None:
        raise Exception("When encoding splits on trees, a pre-populated TaxonSet instance "
                        "must be provided using the 'taxon_set' keyword to avoid taxon/split bitmask values "
                        "changing as new Taxon objects are added to the set.")
    preserve_underscores = kwargs.get('preserve_underscores', False)
    hyphens_as_tokens = kwargs.get('hyphens_as_tokens',
                                   nexustokenizer.DEFAULT_HYPHENS_AS_TOKENS)
    extract_comment_metadata = kwargs.get("extract_comment_metadata", False)
    newick_stream = nexustokenizer.NexusTokenizer(
        stream,
        preserve_underscores=preserve_underscores,
        hyphens_as_tokens=hyphens_as_tokens,
        extract_comment_metadata=extract_comment_metadata,
        case_sensitive_taxon_labels=kwargs.get('case_sensitive_taxon_labels',
                                               False))
    while not newick_stream.eof:
        t = nexustokenizer.tree_from_token_stream(newick_stream,
                                                  taxon_set=taxon_set,
                                                  **kwargs)
        if t is None:
            # `None` means the stream produced no further tree. PEP 479
            # (Python 3.7+) turns a StopIteration raised inside a generator
            # into a RuntimeError, so end the generator with `return`
            # rather than `raise StopIteration()`.
            return
        yield t
Exemplo n.º 7
0
def tree_source_iter(stream, **kwargs):
    """
    Iterates over a NEWICK-formatted source of trees given by file-like object
    `stream`.

    Note that if `encode_splits` is True, then a `taxon_set` has to be given.
    This is because adding Taxon objects to a taxon set may invalidate split
    bitmasks. Because NEWICK tree taxa are added to a TaxonSet as they are found
    on a tree, there is a strong possibility that all split bitmasks get
    invalidated in the middle of parsing a tree. To avoid this, and, more
    importantly to avoid errors downstream in client code due to this, we
    force specification of a `taxon_set` if `encode_splits` is requested.

    The following optional keyword arguments are also recognized:

        - `taxon_set`: TaxonSet object to use when reading data
        - `as_rooted=True` (or `as_unrooted=False`): interprets trees as rooted
        - `as_unrooted=True` (or `as_rooted=False`): interprets trees as unrooted
        - `default_as_rooted=True` (or `default_as_unrooted=False`): interprets
           all trees as rooted if rooting not given by `[&R]` or `[&U]` comments
        - `default_as_unrooted=True` (or `default_as_rooted=False`): interprets
           all trees as unrooted if rooting not given by `[&R]` or `[&U]` comments
        - `edge_len_type`: specifies the type of the edge lengths (int or float)
        - `encode_splits`: specifies whether or not split bitmasks will be
           calculated and attached to the edges.
        - `extract_comment_metadata`: if True, any 'hot comments' (i.e.,
            comments that begin with '&') or NHX comments associated with
            items will be processed and stored as a dictionary attribute of the
            object: "comment_metadata".
        - `store_tree_weights`: if True, process the tree weight ("[&W 1/2]")
           comment associated with each tree, if any.
        - `finish_node_func`: is a function that will be applied to each node
           after it has been constructed.
        - `case_insensitive_taxon_labels`: If False, then taxon labels are
            case sensitive (different cases = different taxa); defaults to True

    """
    # Consume `taxon_set` so it is not forwarded to the tree parser a
    # second time via **kwargs below.
    taxon_set = kwargs.pop("taxon_set", None)
    if "encode_splits" in kwargs and taxon_set is None:
        raise Exception(
            "When encoding splits on trees, a pre-populated TaxonSet instance "
            "must be provided using the 'taxon_set' keyword to avoid taxon/split bitmask values "
            "changing as new Taxon objects are added to the set."
        )
    preserve_underscores = kwargs.get("preserve_underscores", False)
    hyphens_as_tokens = kwargs.get("hyphens_as_tokens", nexustokenizer.DEFAULT_HYPHENS_AS_TOKENS)
    extract_comment_metadata = kwargs.get("extract_comment_metadata", False)
    newick_stream = nexustokenizer.NexusTokenizer(
        stream,
        preserve_underscores=preserve_underscores,
        hyphens_as_tokens=hyphens_as_tokens,
        extract_comment_metadata=extract_comment_metadata,
        case_insensitive_taxon_labels=kwargs.get("case_insensitive_taxon_labels", True),
    )
    while not newick_stream.eof:
        t = nexustokenizer.tree_from_token_stream(newick_stream, taxon_set=taxon_set, **kwargs)
        if t is None:
            # A None tree signals stream exhaustion. Under PEP 479
            # (Python 3.7+), raising StopIteration inside a generator is
            # converted to a RuntimeError, so terminate with `return`
            # instead of `raise StopIteration()`.
            return
        yield t