def generalized_tree_source_iter(stream, **kwargs):
    """
    Diagnoses and handles both NEXUS and NEWICK files.
    """
    stream_tokenizer = nexustokenizer.NexusTokenizer(stream)
    token = stream_tokenizer.read_next_token_ucase()
    schema = None
    if token == "#NEXUS":
        schema = "nexus"
    elif token == "(":
        schema = "newick"
    try:
        stream_tokenizer.stream_handle.seek(0)
    except IOError:
        raise TypeError("File schema of non-random access source (such as stdin) must be specified in advance.")
    if schema == "nexus":
        return tree_source_iter(stream, **kwargs)
    elif schema == "newick":
        return ioclient.tree_source_iter(stream, 'newick', **kwargs)
    else:
        raise TypeError("Cannot diagnose file schema based on first token found: '%s' (looking for '#NEXUS' or '(')" % token)
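# Illustrative usage sketch (not part of the library): schema diagnosis only
# needs a source that supports seek(0), so an in-memory buffer works. The
# io.StringIO import is an assumption; a Python 2 era codebase would use the
# StringIO/cStringIO module instead.
def _example_generalized_iter():
    from io import StringIO
    # A leading '(' is diagnosed as NEWICK; a leading '#NEXUS' token as NEXUS.
    for tree in generalized_tree_source_iter(StringIO("((A,B),C);")):
        print(tree)
    # A non-seekable source such as sys.stdin cannot be rewound after the
    # first token is read, so the TypeError above applies: the schema must
    # then be specified in advance to a schema-specific reader.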
def testComments(self):
    f = " i [f]t [i] y"
    r2 = StringIO(f)
    tokenizer = nexustokenizer.NexusTokenizer(r2)
    expected = ['i', 't', 'y']
    for e in expected:
        token = tokenizer.read_next_token()
        self.assertEqual(e, token)
def _prepare_to_read_from_stream(self, file_obj):
    self.stream_tokenizer = nexustokenizer.NexusTokenizer(
            stream_handle=file_obj,
            preserve_underscores=self.preserve_underscores,
            hyphens_as_tokens=self.hyphens_as_tokens,
            extract_comment_metadata=self.extract_comment_metadata)
def tree_source_iter(self, stream):
    """
    Iterates over a NEXUS-formatted source of trees.

    Only trees will be returned, and any and all character data will be
    skipped. The iterator will span multiple tree blocks, but, because our
    NEXUS data model implementation currently does not recognize multiple
    taxon collection definitions, taxa in those tree blocks will be
    aggregated into the same `TaxonSet` (a new one created, or the one
    passed to this method via the `taxon_set` argument). This behavior is
    similar to how multiple tree blocks are handled by a full NEXUS data
    file read.
    """
    self.reset()
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    self.stream_tokenizer = nexustokenizer.NexusTokenizer(stream,
            preserve_underscores=self.preserve_underscores,
            hyphens_as_tokens=self.hyphens_as_tokens,
            extract_comment_metadata=self.extract_comment_metadata)
    token = self.stream_tokenizer.read_next_token_ucase()
    if token != "#NEXUS":
        raise self.data_format_error("Expecting '#NEXUS', but found '%s'" % token)
    while not self.stream_tokenizer.eof:
        token = self.stream_tokenizer.read_next_token_ucase()
        while token is not None and token != 'BEGIN' and not self.stream_tokenizer.eof:
            token = self.stream_tokenizer.read_next_token_ucase()
        token = self.stream_tokenizer.read_next_token_ucase()
        if token == 'TAXA':
            self._parse_taxa_block()
        elif token == 'TREES':
            self.stream_tokenizer.skip_to_semicolon()  # move past BEGIN command
            link_title = None
            taxon_set = None
            self.tree_translate_dict.clear()
            while not (token == 'END' or token == 'ENDBLOCK') \
                    and not self.stream_tokenizer.eof \
                    and token is not None:
                token = self.stream_tokenizer.read_next_token_ucase()
                if token == 'LINK':
                    link_title = self._parse_link_statement().get('taxa')
                if token == 'TRANSLATE':
                    if not taxon_set:
                        taxon_set = self._get_taxon_set(link_title)
                        self._prepopulate_translate_dict(taxon_set)
                    self._parse_translate_statement(taxon_set)
                if token == 'TREE':
                    if not taxon_set:
                        taxon_set = self._get_taxon_set(link_title)
                        self._prepopulate_translate_dict(taxon_set)
                    tree = self._parse_tree_statement(taxon_set)
                    yield tree
            self.stream_tokenizer.skip_to_semicolon()  # move past END command
        else:
            # unknown block: skip statement-by-statement until the block ends
            while not (token == 'END' or token == 'ENDBLOCK') \
                    and not self.stream_tokenizer.eof \
                    and token is not None:
                self.stream_tokenizer.skip_to_semicolon()
                token = self.stream_tokenizer.read_next_token_ucase()
    self.reset()
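# Illustrative usage sketch (not part of the library): this generator method
# is defined on a NEXUS reader class not shown in this excerpt, so `reader`
# below stands for an instance of that class (hypothetical name). Trees are
# yielded one at a time; non-TREES blocks are skipped.
def _example_nexus_tree_iter(reader):
    from io import StringIO
    nexus_doc = (
        "#NEXUS\n"
        "BEGIN TAXA;\n"
        "    DIMENSIONS NTAX=3;\n"
        "    TAXLABELS A B C;\n"
        "END;\n"
        "BEGIN TREES;\n"
        "    TREE t1 = ((A,B),C);\n"
        "END;\n")
    for tree in reader.tree_source_iter(StringIO(nexus_doc)):
        print(tree)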
def tree_source_iter(stream, **kwargs):
    """
    Iterates over a NEWICK-formatted source of trees given by file-like
    object `stream`.

    Note that if `encode_splits` is True, then a `taxon_set` has to be given.
    This is because adding Taxon objects to a taxon set may invalidate split
    bitmasks. Because NEWICK tree taxa are added to a TaxonSet as they are
    found on a tree, there is a strong possibility that all split bitmasks
    get invalidated in the middle of parsing a tree. To avoid this, and, more
    importantly, to avoid errors downstream in client code due to this, we
    force specification of a `taxon_set` if `encode_splits` is requested.

    The following optional keyword arguments are also recognized:

        `taxon_set`
            TaxonSet object to use when reading data.
        `as_rooted=True` (or `as_unrooted=False`)
            Unconditionally interprets all trees as rooted.
        `as_unrooted=True` (or `as_rooted=False`)
            Unconditionally interprets all trees as unrooted.
        `default_as_rooted=True` (or `default_as_unrooted=False`)
            Interprets all trees as rooted if rooting not given by `[&R]`
            or `[&U]` comments.
        `default_as_unrooted=True` (or `default_as_rooted=False`)
            Interprets all trees as unrooted if rooting not given by `[&R]`
            or `[&U]` comments.
        `edge_len_type`
            Specifies the type of the edge lengths (int or float).
        `extract_comment_metadata`
            If True, any comments that begin with '&' or '&&' associated
            with items will be processed and stored as part of the
            annotation set of the object (`annotations`). If False, this
            will be skipped. Defaults to False.
        `store_tree_weights`
            If True, process the tree weight ("[&W 1/2]") comment
            associated with each tree, if any.
        `encode_splits`
            Specifies whether or not split bitmasks will be calculated and
            attached to the edges.
        `finish_node_func`
            Is a function that will be applied to each node after it has
            been constructed.
        `case_sensitive_taxon_labels`
            If True, then taxon labels are case sensitive (different cases
            = different taxa); defaults to False.
        `preserve_underscores`
            If True, unquoted underscores in labels will *not* be converted
            to spaces. Defaults to False: all underscores not protected by
            quotes will be converted to spaces.
        `suppress_internal_node_taxa`
            If False, internal node labels will be instantiated into Taxon
            objects. Defaults to True: internal node labels will *not* be
            treated as taxa.
        `allow_duplicate_taxon_labels`
            If True, then multiple identical taxon labels will be allowed.
            Defaults to False: treat multiple identical taxon labels as an
            error.
        `hyphens_as_tokens`
            If True, hyphens will be treated as special punctuation
            characters. Defaults to False: hyphens are not treated as
            special punctuation characters.

    """
    if "taxon_set" in kwargs:
        taxon_set = kwargs["taxon_set"]
        del kwargs["taxon_set"]
    else:
        taxon_set = None
    if "encode_splits" in kwargs and taxon_set is None:
        raise Exception("When encoding splits on trees, a pre-populated TaxonSet instance "
                + "must be provided using the 'taxon_set' keyword to avoid taxon/split bitmask values "
                + "changing as new Taxon objects are added to the set.")
    preserve_underscores = kwargs.get('preserve_underscores', False)
    hyphens_as_tokens = kwargs.get('hyphens_as_tokens', nexustokenizer.DEFAULT_HYPHENS_AS_TOKENS)
    extract_comment_metadata = kwargs.get("extract_comment_metadata", False)
    newick_stream = nexustokenizer.NexusTokenizer(stream,
            preserve_underscores=preserve_underscores,
            hyphens_as_tokens=hyphens_as_tokens,
            extract_comment_metadata=extract_comment_metadata,
            case_sensitive_taxon_labels=kwargs.get('case_sensitive_taxon_labels', False))
    while not newick_stream.eof:
        t = nexustokenizer.tree_from_token_stream(newick_stream,
                taxon_set=taxon_set,
                **kwargs)
        if t is not None:
            yield t
        else:
            return  # source exhausted; raising StopIteration inside a generator is an error under PEP 479
def testParseSpacy(self):
    f = """#NEXUS

    BEGIN TAXA;
        DIMENSIONS NTAX=30;
        TAXLABELS
            'Anolis ahli'
            'Anolis garmani'
            'Anolis grahami'
            'Anolis valencienni'
            'Anolis lineatopus'
            'Anolis aliniger'
            'Anolis coelestinus'
            'Anolis bahorucoensis'
            'Anolis equestris'
            'Anolis luteogularis'
            'Anolis occultus'
            'Anolis barahonae'
            'Anolis cuvieri'
            'Anolis insolitus'
            'Anolis olssoni'
            'Anolis brevirostris'
            'Anolis distichus'
            'Anolis cristatellus'
            'Anolis krugi'
            'Anolis stratulus'
            'Anolis alutaceus'
            'Anolis vanidicus'
            'Anolis angusticeps'
            'Anolis paternus'
            'Anolis loysiana'
            'Anolis marcanoi'
            'Anolis strahmi'
            'Diplolaemus darwinii'
            'Anolis ophiolepis'
            'Anolis sagrei'
        ;
    END;

    BEGIN TREES;
        tree 'con 50 majrule' = [&U] ('Anolis ahli':0.2642130000,((('Anolis garmani':0.1068380000,'Anolis grahami':0.0863670000)1.00:0.069511,'Anolis valencienni':0.1642630000)0.87:0.020752,'Anolis lineatopus':0.1957260000)1.00:0.077682,((((((('Anolis aliniger':0.1600010000,'Anolis coelestinus':0.1932310000)1.00:0.071920,'Anolis bahorucoensis':0.2266880000)0.68:0.023043,('Anolis equestris':0.0227020000,'Anolis luteogularis':0.0306410000)1.00:0.198165,'Anolis occultus':0.4231200000)0.89:0.056277,('Anolis barahonae':0.2114890000,'Anolis cuvieri':0.1686700000)1.00:0.084190,('Anolis insolitus':0.2438820000,'Anolis olssoni':0.2568770000)1.00:0.050618)0.86:0.031679,(('Anolis brevirostris':0.1801300000,'Anolis distichus':0.1151360000)1.00:0.123136,(('Anolis cristatellus':0.2144360000,'Anolis krugi':0.1573300000)0.93:0.036788,'Anolis stratulus':0.1973470000)1.00:0.081037)1.00:0.056582)0.77:0.021826,(('Anolis alutaceus':0.1619060000,'Anolis vanidicus':0.2059960000)1.00:0.118216,(('Anolis angusticeps':0.0857100000,'Anolis paternus':0.0595110000)1.00:0.153413,'Anolis loysiana':0.1836280000)1.00:0.042858)1.00:0.057139,('Anolis marcanoi':0.2359120000,'Anolis strahmi':0.1977660000)1.00:0.141032,'Diplolaemus darwinii':0.6364930000)1.00:0.067869,('Anolis ophiolepis':0.0945010000,'Anolis sagrei':0.0967580000)1.00:0.179398)0.96:0.044895);
    END;
    """
    r2 = StringIO(f)
    tokenizer = nexustokenizer.NexusTokenizer(r2)
    expected = [
        '#NEXUS', 'BEGIN', 'TAXA', ';', 'DIMENSIONS', 'NTAX', '=', '30', ';',
        'TAXLABELS',
        'Anolis ahli', 'Anolis garmani', 'Anolis grahami',
        'Anolis valencienni', 'Anolis lineatopus', 'Anolis aliniger',
        'Anolis coelestinus', 'Anolis bahorucoensis', 'Anolis equestris',
        'Anolis luteogularis', 'Anolis occultus', 'Anolis barahonae',
        'Anolis cuvieri', 'Anolis insolitus', 'Anolis olssoni',
        'Anolis brevirostris', 'Anolis distichus', 'Anolis cristatellus',
        'Anolis krugi', 'Anolis stratulus', 'Anolis alutaceus',
        'Anolis vanidicus', 'Anolis angusticeps', 'Anolis paternus',
        'Anolis loysiana', 'Anolis marcanoi', 'Anolis strahmi',
        'Diplolaemus darwinii', 'Anolis ophiolepis', 'Anolis sagrei',
        ';', 'END', ';',
        'BEGIN', 'TREES', ';', 'tree', 'con 50 majrule', '=',
        '(', 'Anolis ahli', ':', '0.2642130000', ',',
        '(', '(', '(', 'Anolis garmani', ':', '0.1068380000', ',',
        'Anolis grahami', ':', '0.0863670000', ')', '1.00', ':', '0.069511', ',',
        'Anolis valencienni', ':', '0.1642630000', ')', '0.87', ':', '0.020752', ',',
        'Anolis lineatopus', ':', '0.1957260000', ')', '1.00', ':', '0.077682', ',',
        '(', '(', '(', '(', '(', '(', '(',
        'Anolis aliniger', ':', '0.1600010000', ',',
        'Anolis coelestinus', ':', '0.1932310000', ')', '1.00', ':', '0.071920', ',',
        'Anolis bahorucoensis', ':', '0.2266880000', ')', '0.68', ':', '0.023043', ',',
        '(', 'Anolis equestris', ':', '0.0227020000', ',',
        'Anolis luteogularis', ':', '0.0306410000', ')', '1.00', ':', '0.198165', ',',
        'Anolis occultus', ':', '0.4231200000', ')', '0.89', ':', '0.056277', ',',
        '(', 'Anolis barahonae', ':', '0.2114890000', ',',
        'Anolis cuvieri', ':', '0.1686700000', ')', '1.00', ':', '0.084190', ',',
        '(', 'Anolis insolitus', ':', '0.2438820000', ',',
        'Anolis olssoni', ':', '0.2568770000', ')', '1.00', ':', '0.050618', ')',
        '0.86', ':', '0.031679', ',',
        '(', '(', 'Anolis brevirostris', ':', '0.1801300000', ',',
        'Anolis distichus', ':', '0.1151360000', ')', '1.00', ':', '0.123136', ',',
        '(', '(', 'Anolis cristatellus', ':', '0.2144360000', ',',
        'Anolis krugi', ':', '0.1573300000', ')', '0.93', ':', '0.036788', ',',
        'Anolis stratulus', ':', '0.1973470000', ')', '1.00', ':', '0.081037', ')',
        '1.00', ':', '0.056582', ')', '0.77', ':', '0.021826', ',',
        '(', '(', 'Anolis alutaceus', ':', '0.1619060000', ',',
        'Anolis vanidicus', ':', '0.2059960000', ')', '1.00', ':', '0.118216', ',',
        '(', '(', 'Anolis angusticeps', ':', '0.0857100000', ',',
        'Anolis paternus', ':', '0.0595110000', ')', '1.00', ':', '0.153413', ',',
        'Anolis loysiana', ':', '0.1836280000', ')', '1.00', ':', '0.042858', ')',
        '1.00', ':', '0.057139', ',',
        '(', 'Anolis marcanoi', ':', '0.2359120000', ',',
        'Anolis strahmi', ':', '0.1977660000', ')', '1.00', ':', '0.141032', ',',
        'Diplolaemus darwinii', ':', '0.6364930000', ')', '1.00', ':', '0.067869', ',',
        '(', 'Anolis ophiolepis', ':', '0.0945010000', ',',
        'Anolis sagrei', ':', '0.0967580000', ')', '1.00', ':', '0.179398', ')',
        '0.96', ':', '0.044895', ')', ';',
        'END', ';',
    ]
    for e in expected:
        token = tokenizer.read_next_token()
        self.assertEqual(token, e)