Example #1
0
def generalized_tree_source_iter(stream, **kwargs):
    """
    Diagnoses and handles both NEXUS and NEWICK files.

    Peeks at the first token of `stream` to decide whether the data is
    NEXUS (leading '#NEXUS') or NEWICK (leading '('), rewinds the stream,
    and delegates to the appropriate tree iterator, forwarding `kwargs`.

    Raises TypeError if the stream cannot be rewound (e.g. stdin), or if
    the schema cannot be diagnosed from the first token.
    """
    stream_tokenizer = nexustokenizer.NexusTokenizer(stream)
    token = stream_tokenizer.read_next_token_ucase()
    schema = None
    # Guard against an empty stream: `token` may be None, and the original
    # unconditional `token.upper()` would raise AttributeError instead of
    # the intended diagnostic TypeError below.
    if token is not None and token.upper() == "#NEXUS":
        schema = "nexus"
    elif token == "(":
        schema = "newick"
    try:
        # Rewind so the delegated parser sees the stream from the start.
        stream_tokenizer.stream_handle.seek(0)
    except IOError:
        raise TypeError(
            "File schema of non-random access source (such as stdin) must be specified in advance."
        )
    if schema == "nexus":
        return tree_source_iter(stream, **kwargs)
    elif schema == "newick":
        return ioclient.tree_source_iter(stream, 'newick', **kwargs)
    else:
        # Bug fix: the original message contained a '%s' placeholder but
        # never interpolated the offending token into it.
        raise TypeError(
            "Cannot diagnose file schema based on first token found: '%s' (looking for '#NEXUS' or '(')"
            % token
        )
Example #2
0
 def testComments(self):
     """Bracketed comments embedded in the source must be skipped by the tokenizer."""
     source = "  i [f]t [i] y"
     reader = StringIO(source)
     tok_stream = nexustokenizer.NexusTokenizer(reader)
     for expected_token in ('i', 't', 'y'):
         self.assertEqual(expected_token, tok_stream.read_next_token())
Example #3
0
 def _prepare_to_read_from_stream(self, file_obj):
     """Bind a fresh NexusTokenizer over `file_obj`, propagating this reader's options."""
     tokenizer_settings = {
         "stream_handle": file_obj,
         "preserve_underscores": self.preserve_underscores,
         "hyphens_as_tokens": self.hyphens_as_tokens,
         "extract_comment_metadata": self.extract_comment_metadata,
     }
     self.stream_tokenizer = nexustokenizer.NexusTokenizer(**tokenizer_settings)
Example #4
0
 def tree_source_iter(self, stream):
     """
     Iterates over a NEXUS-formatted source of trees.
     Only trees will be returned, and any and all character data will
     be skipped. The iterator will span over multiple tree blocks,
     but, because our NEXUS data model implementation currently does
     not recognize multiple taxon collection definitions, taxa in
     those tree blocks will be aggregated into the same `TaxonSet` (a
     new one created, or the one passed to this method via the
     `taxon_set` argument). This behavior is similar to how multiple
     tree blocks are handled by a full NEXUS data file read.
     """
     # Discard any state left over from a previous parse.
     self.reset()
     if self.dataset is None:
         self.dataset = dataobject.DataSet()
     self.stream_tokenizer = nexustokenizer.NexusTokenizer(
         stream,
         preserve_underscores=self.preserve_underscores,
         hyphens_as_tokens=self.hyphens_as_tokens,
         extract_comment_metadata=self.extract_comment_metadata)
     token = self.stream_tokenizer.read_next_token_ucase()
     # A NEXUS source must open with the '#NEXUS' sentinel.
     if token.upper() != "#NEXUS":
         raise self.data_format_error("Expecting '#NEXUS', but found '%s'" %
                                      token)
     # Outer loop: scan the stream block-by-block until EOF.
     while not self.stream_tokenizer.eof:
         token = self.stream_tokenizer.read_next_token_ucase()
         # Advance to the next 'BEGIN' keyword (skipping inter-block text).
         while token != None and token != 'BEGIN' and not self.stream_tokenizer.eof:
             token = self.stream_tokenizer.read_next_token_ucase()
         # The token following 'BEGIN' names the block type.
         token = self.stream_tokenizer.read_next_token_ucase()
         if token == 'TAXA':
             self._parse_taxa_block()
         elif token == 'TREES':
             self.stream_tokenizer.skip_to_semicolon(
             )  # move past BEGIN command
             link_title = None
             taxon_set = None
             # The translate table is scoped to one TREES block; clear any
             # table left by a previous block.
             self.tree_translate_dict.clear()
             while not (token == 'END' or token == 'ENDBLOCK') \
                     and not self.stream_tokenizer.eof \
                     and not token==None:
                 token = self.stream_tokenizer.read_next_token_ucase()
                 if token == 'LINK':
                     link_title = self._parse_link_statement().get('taxa')
                 if token == 'TRANSLATE':
                     # Resolve the taxon set lazily on first use so that a
                     # preceding LINK statement (if any) is honored.
                     if not taxon_set:
                         taxon_set = self._get_taxon_set(link_title)
                         self._prepopulate_translate_dict(taxon_set)
                     self._parse_translate_statement(taxon_set)
                 if token == 'TREE':
                     if not taxon_set:
                         taxon_set = self._get_taxon_set(link_title)
                         self._prepopulate_translate_dict(taxon_set)
                     tree = self._parse_tree_statement(taxon_set)
                     # Lazily yield each tree as it is parsed.
                     yield tree
             self.stream_tokenizer.skip_to_semicolon(
             )  # move past END command
         else:
             # unknown block: skip statement-by-statement until its END.
             while not (token == 'END' or token == 'ENDBLOCK') \
                 and not self.stream_tokenizer.eof \
                 and not token==None:
                 self.stream_tokenizer.skip_to_semicolon()
                 token = self.stream_tokenizer.read_next_token_ucase()
     self.reset()
Example #5
0
def tree_source_iter(stream, **kwargs):
    """
    Iterates over a NEWICK-formatted source of trees given by file-like object
    `stream`

    Note that if `encode_splits` is True, then a `taxon_set` has to be given.
    This is because adding Taxon objects to a taxon set may invalidate split
    bitmasks. Because NEWICK tree taxa are added to a TaxonSet as they are found
    on a tree, there is a strong possibility that all split bitmasks get
    invalidated in the middle of parsing a tree. To avoid this, and, more
    importantly to avoid errors downstream in client code due to this, we
    force specification of a `taxon_set` if `encode_splits` is requested.

    The following optional keyword arguments are also recognized:

        `taxon_set`
            TaxonSet object to use when reading data.

        `as_rooted=True` (or `as_unrooted=False`)
            Unconditionally interprets all trees as rooted.

        `as_unrooted=True` (or `as_rooted=False`)
            Unconditionally interprets all trees as unrooted.

        `default_as_rooted=True` (or `default_as_unrooted=False`)
            Interprets all trees as rooted if rooting not given by `[&R]`
            or `[&U]` comments.

        `default_as_unrooted=True` (or `default_as_rooted=False`)
            Interprets all trees as unrooted if rooting not given by `[&R]`
            or `[&U]` comments.

        `edge_len_type`
            Specifies the type of the edge lengths (int or float).

        `extract_comment_metadata`
            If True, any comments that begin with '&' or '&&' associated with
            items will be processed and stored as part of the annotation set of
            the object (`annotations`) If False, this will be skipped. Defaults
            to False.

        `store_tree_weights`
            If True, process the tree weight ("[&W 1/2]") comment
            associated with each tree, if any.

        `encode_splits`
            Specifies whether or not split bitmasks will be calculated and
            attached to the edges.

        `finish_node_func`
            Is a function that will be applied to each node after it has
            been constructed.

        `case_sensitive_taxon_labels`
            If True, then taxon labels are case sensitive (different cases
            = different taxa); defaults to False.

        `allow_duplicate_taxon_labels`
            If True, then multiple identical taxon labels will be allowed.
            Defaults to False: treat multiple identical taxon labels as an
            error.

        `preserve_underscores`
            If True, unquoted underscores in labels will *not* converted to
            spaces. Defaults to False: all underscores not protected by
            quotes will be converted to spaces.

        `suppress_internal_node_taxa`
            If False, internal node labels will be instantantiatd into Taxon
            objects.  Defaults to True: internal node labels will *not* be
            treated as taxa.

        `hyphens_as_tokens`
            If True, hyphens will be treated as special punctuation
            characters. Defaults to False, hyphens not treated as special
            punctuation characters.

    """
    # Remove `taxon_set` from kwargs so it is not forwarded twice to the
    # tree builder below.
    taxon_set = kwargs.pop("taxon_set", None)
    if "encode_splits" in kwargs and taxon_set is None:
        raise Exception('When encoding splits on trees, a pre-populated TaxonSet instance ' \
            + "must be provided using the 'taxon_set' keyword to avoid taxon/split bitmask values "\
            + "changing as new Taxon objects are added to the set.")
    preserve_underscores = kwargs.get('preserve_underscores', False)
    hyphens_as_tokens = kwargs.get('hyphens_as_tokens',
                                   nexustokenizer.DEFAULT_HYPHENS_AS_TOKENS)
    extract_comment_metadata = kwargs.get("extract_comment_metadata", False)
    newick_stream = nexustokenizer.NexusTokenizer(
        stream,
        preserve_underscores=preserve_underscores,
        hyphens_as_tokens=hyphens_as_tokens,
        extract_comment_metadata=extract_comment_metadata,
        case_sensitive_taxon_labels=kwargs.get('case_sensitive_taxon_labels',
                                               False))
    # Yield trees one at a time until the tokenizer is exhausted or the
    # builder reports no further trees.
    while not newick_stream.eof:
        t = nexustokenizer.tree_from_token_stream(newick_stream,
                                                  taxon_set=taxon_set,
                                                  **kwargs)
        if t is not None:
            yield t
        else:
            # Bug fix: raising StopIteration inside a generator becomes a
            # RuntimeError under PEP 479 (Python 3.7+); simply ending the
            # loop is the correct way to terminate iteration.
            break
Example #6
0
    def testParseSpacy(self):
        f = """#NEXUS

BEGIN TAXA;
    DIMENSIONS NTAX=30;
    TAXLABELS
        'Anolis ahli'
        'Anolis garmani'
        'Anolis grahami'
        'Anolis valencienni'
        'Anolis lineatopus'
        'Anolis aliniger'
        'Anolis coelestinus'
        'Anolis bahorucoensis'
        'Anolis equestris'
        'Anolis luteogularis'
        'Anolis occultus'
        'Anolis barahonae'
        'Anolis cuvieri'
        'Anolis insolitus'
        'Anolis olssoni'
        'Anolis brevirostris'
        'Anolis distichus'
        'Anolis cristatellus'
        'Anolis krugi'
        'Anolis stratulus'
        'Anolis alutaceus'
        'Anolis vanidicus'
        'Anolis angusticeps'
        'Anolis paternus'
        'Anolis loysiana'
        'Anolis marcanoi'
        'Anolis strahmi'
        'Diplolaemus darwinii'
        'Anolis ophiolepis'
        'Anolis sagrei'
  ;
END;

BEGIN TREES;
    tree 'con 50 majrule' = [&U] ('Anolis ahli':0.2642130000,((('Anolis garmani':0.1068380000,'Anolis grahami':0.0863670000)1.00:0.069511,'Anolis valencienni':0.1642630000)0.87:0.020752,'Anolis lineatopus':0.1957260000)1.00:0.077682,((((((('Anolis aliniger':0.1600010000,'Anolis coelestinus':0.1932310000)1.00:0.071920,'Anolis bahorucoensis':0.2266880000)0.68:0.023043,('Anolis equestris':0.0227020000,'Anolis luteogularis':0.0306410000)1.00:0.198165,'Anolis occultus':0.4231200000)0.89:0.056277,('Anolis barahonae':0.2114890000,'Anolis cuvieri':0.1686700000)1.00:0.084190,('Anolis insolitus':0.2438820000,'Anolis olssoni':0.2568770000)1.00:0.050618)0.86:0.031679,(('Anolis brevirostris':0.1801300000,'Anolis distichus':0.1151360000)1.00:0.123136,(('Anolis cristatellus':0.2144360000,'Anolis krugi':0.1573300000)0.93:0.036788,'Anolis stratulus':0.1973470000)1.00:0.081037)1.00:0.056582)0.77:0.021826,(('Anolis alutaceus':0.1619060000,'Anolis vanidicus':0.2059960000)1.00:0.118216,(('Anolis angusticeps':0.0857100000,'Anolis paternus':0.0595110000)1.00:0.153413,'Anolis loysiana':0.1836280000)1.00:0.042858)1.00:0.057139,('Anolis marcanoi':0.2359120000,'Anolis strahmi':0.1977660000)1.00:0.141032,'Diplolaemus darwinii':0.6364930000)1.00:0.067869,('Anolis ophiolepis':0.0945010000,'Anolis sagrei':0.0967580000)1.00:0.179398)0.96:0.044895);
END;
"""
        r2 = StringIO(f)
        tokenizer = nexustokenizer.NexusTokenizer(r2)
        expected = [
            '#NEXUS', 'BEGIN', 'TAXA', ';', 'DIMENSIONS', 'NTAX', '=', '30',
            ';', 'TAXLABELS', 'Anolis ahli', 'Anolis garmani',
            'Anolis grahami', 'Anolis valencienni', 'Anolis lineatopus',
            'Anolis aliniger', 'Anolis coelestinus', 'Anolis bahorucoensis',
            'Anolis equestris', 'Anolis luteogularis', 'Anolis occultus',
            'Anolis barahonae', 'Anolis cuvieri', 'Anolis insolitus',
            'Anolis olssoni', 'Anolis brevirostris', 'Anolis distichus',
            'Anolis cristatellus', 'Anolis krugi', 'Anolis stratulus',
            'Anolis alutaceus', 'Anolis vanidicus', 'Anolis angusticeps',
            'Anolis paternus', 'Anolis loysiana', 'Anolis marcanoi',
            'Anolis strahmi', 'Diplolaemus darwinii', 'Anolis ophiolepis',
            'Anolis sagrei', ';', 'END', ';', 'BEGIN', 'TREES', ';', 'tree',
            'con 50 majrule', '=', '(', 'Anolis ahli', ':', '0.2642130000',
            ',', '(', '(', '(', 'Anolis garmani', ':', '0.1068380000', ',',
            'Anolis grahami', ':', '0.0863670000', ')', '1.00', ':',
            '0.069511', ',', 'Anolis valencienni', ':', '0.1642630000', ')',
            '0.87', ':', '0.020752', ',', 'Anolis lineatopus', ':',
            '0.1957260000', ')', '1.00', ':', '0.077682', ',', '(', '(', '(',
            '(', '(', '(', '(', 'Anolis aliniger', ':', '0.1600010000', ',',
            'Anolis coelestinus', ':', '0.1932310000', ')', '1.00', ':',
            '0.071920', ',', 'Anolis bahorucoensis', ':', '0.2266880000', ')',
            '0.68', ':', '0.023043', ',', '(', 'Anolis equestris', ':',
            '0.0227020000', ',', 'Anolis luteogularis', ':', '0.0306410000',
            ')', '1.00', ':', '0.198165', ',', 'Anolis occultus', ':',
            '0.4231200000', ')', '0.89', ':', '0.056277', ',', '(',
            'Anolis barahonae', ':', '0.2114890000', ',', 'Anolis cuvieri',
            ':', '0.1686700000', ')', '1.00', ':', '0.084190', ',', '(',
            'Anolis insolitus', ':', '0.2438820000', ',', 'Anolis olssoni',
            ':', '0.2568770000', ')', '1.00', ':', '0.050618', ')', '0.86',
            ':', '0.031679', ',', '(', '(', 'Anolis brevirostris', ':',
            '0.1801300000', ',', 'Anolis distichus', ':', '0.1151360000', ')',
            '1.00', ':', '0.123136', ',', '(', '(', 'Anolis cristatellus', ':',
            '0.2144360000', ',', 'Anolis krugi', ':', '0.1573300000', ')',
            '0.93', ':', '0.036788', ',', 'Anolis stratulus', ':',
            '0.1973470000', ')', '1.00', ':', '0.081037', ')', '1.00', ':',
            '0.056582', ')', '0.77', ':', '0.021826', ',', '(', '(',
            'Anolis alutaceus', ':', '0.1619060000', ',', 'Anolis vanidicus',
            ':', '0.2059960000', ')', '1.00', ':', '0.118216', ',', '(', '(',
            'Anolis angusticeps', ':', '0.0857100000', ',', 'Anolis paternus',
            ':', '0.0595110000', ')', '1.00', ':', '0.153413', ',',
            'Anolis loysiana', ':', '0.1836280000', ')', '1.00', ':',
            '0.042858', ')', '1.00', ':', '0.057139', ',', '(',
            'Anolis marcanoi', ':', '0.2359120000', ',', 'Anolis strahmi', ':',
            '0.1977660000', ')', '1.00', ':', '0.141032', ',',
            'Diplolaemus darwinii', ':', '0.6364930000', ')', '1.00', ':',
            '0.067869', ',', '(', 'Anolis ophiolepis', ':', '0.0945010000',
            ',', 'Anolis sagrei', ':', '0.0967580000', ')', '1.00', ':',
            '0.179398', ')', '0.96', ':', '0.044895', ')', ';', 'END', ';'
        ]
        for e in expected:
            token = tokenizer.read_next_token()
            self.assertEqual(token, e)