def _build_state_alphabet(self, char_block, symbols):
    """
    Create a state alphabet from `symbols` and install it on `char_block`.

    The alphabet is built with this reader's gap and missing-data
    characters (`self.gap_char`, `self.missing_char`). It becomes the only
    entry in `char_block.state_alphabets` and is also set as the block's
    default state alphabet.
    """
    alphabet = dataobject.get_state_alphabet_from_symbols(
        symbols,
        gap_symbol=self.gap_char,
        missing_symbol=self.missing_char,
    )
    char_block.state_alphabets = [alphabet]
    # The default is read back from the block (not taken from the local) so
    # that it is whatever object the block actually stored.
    char_block.default_state_alphabet = char_block.state_alphabets[0]
def read(self, stream):
    """
    Parse PHYLIP-format character data from `stream` into `self.dataset`.

    If `self.exclude_chars` is set, nothing is read and the current
    `self.dataset` is returned unchanged. Otherwise a new character matrix
    of type `self.char_matrix_type` is added to the dataset (which is
    created if needed), the description line giving the number of taxa and
    characters is parsed, and the remaining lines are handed to the
    interleaved or sequential parser according to `self.interleaved`.

    Returns the dataset.

    Raises:
        ValueError: the matrix has no state alphabets.
        NotImplementedError: the matrix has more than one state alphabet
            and no default.
        error.DataSourceError: the source is empty, or declares zero taxa
            or zero characters.
        error.DataParseError: the source has fewer than two lines.
        The error built by `self._data_parse_error` when the description
        line is not two whitespace-separated integers.
    """
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(
        char_matrix_type=self.char_matrix_type,
        taxon_set=taxon_set)

    # Standard (discrete) matrices created without an alphabet get a
    # default alphabet of the ten digits.
    if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
        self.char_matrix.state_alphabets.append(
            dataobject.get_state_alphabet_from_symbols("0123456789"))
        self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]

    if self.char_matrix.default_state_alphabet is not None:
        self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
    elif len(self.char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(self.char_matrix.state_alphabets) > 1:
        raise NotImplementedError(
            "Mixed state-alphabet matrices not supported")
    else:
        # Exactly one alphabet and no default: derive the symbol->state
        # mapping from it, as the default-alphabet branch does. Previously
        # the alphabet object itself was stored here.
        self.symbol_state_map = self.char_matrix.state_alphabets[0].symbol_state_map()

    self.stream = stream
    lines = fileutils.get_lines(self.stream)
    if len(lines) == 0:
        raise error.DataSourceError("No data in source", stream=self.stream)
    elif len(lines) < 2:
        # A description line plus one data line is the minimum; the old
        # "<= 2" test rejected exactly that case despite the message.
        raise error.DataParseError(
            "Expecting at least 2 lines in PHYLIP format data source",
            stream=self.stream)

    desc_line = lines[0]
    lines = lines[1:]
    m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
    if m is None:
        raise self._data_parse_error(
            "Invalid data description line: '%s'" % desc_line)
    self.ntax = int(m.group(1))
    self.nchar = int(m.group(2))
    if self.ntax == 0 or self.nchar == 0:
        raise error.DataSourceError("No data in source", stream=self.stream)

    if self.interleaved:
        self._parse_interleaved(lines)
    else:
        self._parse_sequential(lines)
    self.stream = None
    return self.dataset
def read(self, stream):
    """
    Parse PHYLIP-format character data from `stream` into `self.dataset`.

    If `self.exclude_chars` is set, nothing is read and the current
    `self.dataset` is returned unchanged. Otherwise a new character matrix
    of type `self.char_matrix_type` is added to the dataset (which is
    created if needed), the description line giving the number of taxa and
    characters is parsed, and the remaining lines are handed to the
    interleaved or sequential parser according to `self.interleaved`.

    Returns the dataset.

    Raises:
        ValueError: the matrix has no state alphabets.
        NotImplementedError: the matrix has more than one state alphabet
            and no default.
        error.DataSourceError: the source is empty, or declares zero taxa
            or zero characters.
        error.DataParseError: the source has fewer than two lines.
        The error built by `self._data_parse_error` when the description
        line is not two whitespace-separated integers.
    """
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)

    # Standard (discrete) matrices created without an alphabet get a
    # default alphabet of the ten digits.
    if (
        isinstance(self.char_matrix, dataobject.StandardCharacterMatrix)
        and len(self.char_matrix.state_alphabets) == 0
    ):
        self.char_matrix.state_alphabets.append(dataobject.get_state_alphabet_from_symbols("0123456789"))
        self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]

    if self.char_matrix.default_state_alphabet is not None:
        self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
    elif len(self.char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(self.char_matrix.state_alphabets) > 1:
        raise NotImplementedError("Mixed state-alphabet matrices not supported")
    else:
        # Exactly one alphabet and no default: derive the symbol->state
        # mapping from it, as the default-alphabet branch does. Previously
        # the alphabet object itself was stored here.
        self.symbol_state_map = self.char_matrix.state_alphabets[0].symbol_state_map()

    self.stream = stream
    lines = fileutils.get_lines(self.stream)
    if len(lines) == 0:
        raise error.DataSourceError("No data in source", stream=self.stream)
    elif len(lines) < 2:
        # A description line plus one data line is the minimum; the old
        # "<= 2" test rejected exactly that case despite the message.
        raise error.DataParseError("Expecting at least 2 lines in PHYLIP format data source", stream=self.stream)

    desc_line = lines[0]
    lines = lines[1:]
    m = re.match(r"\s*(\d+)\s+(\d+)\s*$", desc_line)
    if m is None:
        raise self._data_parse_error("Invalid data description line: '%s'" % desc_line)
    self.ntax = int(m.group(1))
    self.nchar = int(m.group(2))
    if self.ntax == 0 or self.nchar == 0:
        raise error.DataSourceError("No data in source", stream=self.stream)

    if self.interleaved:
        self._parse_interleaved(lines)
    else:
        self._parse_sequential(lines)
    self.stream = None
    return self.dataset
def read(self, stream):
    """
    Main file parsing driver.

    Reads FASTA-style records from `stream` (an iterable of lines) into a
    new character matrix on `self.dataset`. A line starting with '>' names
    a sequence; the non-blank lines that follow supply its symbols, with
    whitespace inside a line ignored.

    If `self.exclude_chars` is set, nothing is read and the current
    `self.dataset` is returned unchanged. When `self.simple_rows` is set,
    each sequence is stored as a plain string of symbols checked against
    the default state alphabet's legal symbols; otherwise each symbol is
    mapped to a state and stored as a `CharacterDataCell` in a
    `CharacterDataVector`.

    Returns the dataset.

    Raises:
        ValueError: the matrix has no state alphabets.
        NotImplementedError: the matrix has more than one state alphabet
            and no default.
        DataParseError: a sequence name is repeated, a name follows a
            name with no sequence in between, sequence data appears
            before any name, or a symbol is not recognized.
    """
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(
        char_matrix_type=self.char_matrix_type,
        taxon_set=taxon_set)

    # Standard (discrete) matrices created without an alphabet get a
    # default alphabet of the ten digits.
    if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
        self.char_matrix.state_alphabets.append(
            dataobject.get_state_alphabet_from_symbols("0123456789"))
        self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]

    if self.char_matrix.default_state_alphabet is not None:
        self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
    elif len(self.char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(self.char_matrix.state_alphabets) > 1:
        raise NotImplementedError(
            "Mixed state-alphabet matrices not supported")
    else:
        # Exactly one alphabet and no default: derive the symbol->state
        # mapping from it, as the default-alphabet branch does. Previously
        # the alphabet object itself was stored here.
        self.symbol_state_map = self.char_matrix.state_alphabets[0].symbol_state_map()

    curr_vec = None
    curr_taxon = None
    if self.simple_rows:
        legal_chars = self.char_matrix.default_state_alphabet.get_legal_symbols_as_str()

    for line_index, line in enumerate(stream):
        s = line.strip()
        if not s:
            continue
        if s.startswith('>'):
            # In simple-rows mode the previous record is only committed
            # to the matrix once its sequence is complete.
            if self.simple_rows and curr_taxon and curr_vec:
                self.char_matrix[curr_taxon] = "".join(curr_vec)
            name = s[1:].strip()
            curr_taxon = taxon_set.require_taxon(label=name)
            if curr_taxon in self.char_matrix:
                raise DataParseError(
                    message="Fasta error: Repeated sequence name (%s) found" % name,
                    row=line_index + 1,
                    stream=stream)
            if curr_vec is not None and len(curr_vec) == 0:
                raise DataParseError(
                    message="Fasta error: Expected sequence, but found another sequence name (%s)" % name,
                    row=line_index + 1,
                    stream=stream)
            if self.simple_rows:
                curr_vec = []
            else:
                curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                self.char_matrix[curr_taxon] = curr_vec
        elif curr_vec is None:
            raise DataParseError(
                message="Fasta error: Expecting a lines starting with > before sequences",
                row=line_index + 1,
                stream=stream)
        elif self.simple_rows:
            for col_ind, c in enumerate(s):
                c = c.strip()
                if not c:
                    continue
                if c not in legal_chars:
                    # The exception used to be constructed but never
                    # raised, so illegal symbols were silently accepted.
                    raise DataParseError(
                        message='Unrecognized sequence symbol "%s"' % c,
                        row=line_index + 1,
                        column=col_ind + 1,
                        stream=stream)
                curr_vec.append(c)
        else:
            for col_ind, c in enumerate(s):
                c = c.strip()
                if not c:
                    continue
                # Only a failed symbol lookup is reported as an
                # unrecognized symbol; errors from building or appending
                # the cell are no longer relabelled by a bare except.
                try:
                    state = self.symbol_state_map[c]
                except KeyError:
                    raise DataParseError(
                        message='Unrecognized sequence symbol "%s"' % c,
                        row=line_index + 1,
                        column=col_ind + 1,
                        stream=stream)
                curr_vec.append(dataobject.CharacterDataCell(value=state))

    # Commit the final simple-rows record, which has no following '>' line.
    if self.simple_rows and curr_taxon and curr_vec:
        self.char_matrix[curr_taxon] = "".join(curr_vec)
    return self.dataset
def read(self, stream):
    """
    Main file parsing driver.

    Reads FASTA-style records from `stream` (an iterable of lines) into a
    new character matrix on `self.dataset`. A line starting with '>' names
    a sequence; the non-blank lines that follow supply its symbols, with
    whitespace inside a line ignored.

    If `self.exclude_chars` is set, nothing is read and the current
    `self.dataset` is returned unchanged. When `self.simple_rows` is set,
    each sequence is stored as a plain string of symbols checked against
    the default state alphabet's legal symbols; otherwise each symbol is
    mapped to a state and stored as a `CharacterDataCell` in a
    `CharacterDataVector`.

    Returns the dataset.

    Raises:
        ValueError: the matrix has no state alphabets.
        NotImplementedError: the matrix has more than one state alphabet
            and no default.
        DataParseError: a sequence name is repeated, a name follows a
            name with no sequence in between, sequence data appears
            before any name, or a symbol is not recognized.
    """
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)

    # Standard (discrete) matrices created without an alphabet get a
    # default alphabet of the ten digits.
    if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
        self.char_matrix.state_alphabets.append(dataobject.get_state_alphabet_from_symbols("0123456789"))
        self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]

    if self.char_matrix.default_state_alphabet is not None:
        self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
    elif len(self.char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(self.char_matrix.state_alphabets) > 1:
        raise NotImplementedError("Mixed state-alphabet matrices not supported")
    else:
        # Exactly one alphabet and no default: derive the symbol->state
        # mapping from it, as the default-alphabet branch does. Previously
        # the alphabet object itself was stored here.
        self.symbol_state_map = self.char_matrix.state_alphabets[0].symbol_state_map()

    curr_vec = None
    curr_taxon = None
    if self.simple_rows:
        legal_chars = self.char_matrix.default_state_alphabet.get_legal_symbols_as_str()

    for line_index, line in enumerate(stream):
        s = line.strip()
        if not s:
            continue
        if s.startswith('>'):
            # In simple-rows mode the previous record is only committed
            # to the matrix once its sequence is complete.
            if self.simple_rows and curr_taxon and curr_vec:
                self.char_matrix[curr_taxon] = "".join(curr_vec)
            name = s[1:].strip()
            curr_taxon = taxon_set.require_taxon(label=name)
            if curr_taxon in self.char_matrix:
                raise DataParseError(message="Fasta error: Repeated sequence name (%s) found" % name, row=line_index + 1, stream=stream)
            if curr_vec is not None and len(curr_vec) == 0:
                raise DataParseError(message="Fasta error: Expected sequence, but found another sequence name (%s)" % name, row=line_index + 1, stream=stream)
            if self.simple_rows:
                curr_vec = []
            else:
                curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                self.char_matrix[curr_taxon] = curr_vec
        elif curr_vec is None:
            raise DataParseError(message="Fasta error: Expecting a lines starting with > before sequences", row=line_index + 1, stream=stream)
        elif self.simple_rows:
            for col_ind, c in enumerate(s):
                c = c.strip()
                if not c:
                    continue
                if c not in legal_chars:
                    # The exception used to be constructed but never
                    # raised, so illegal symbols were silently accepted.
                    raise DataParseError(message='Unrecognized sequence symbol "%s"' % c, row=line_index + 1, column=col_ind + 1, stream=stream)
                curr_vec.append(c)
        else:
            for col_ind, c in enumerate(s):
                c = c.strip()
                if not c:
                    continue
                # Only a failed symbol lookup is reported as an
                # unrecognized symbol; errors from building or appending
                # the cell are no longer relabelled by a bare except.
                try:
                    state = self.symbol_state_map[c]
                except KeyError:
                    raise DataParseError(message='Unrecognized sequence symbol "%s"' % c, row=line_index + 1, column=col_ind + 1, stream=stream)
                curr_vec.append(dataobject.CharacterDataCell(value=state))

    # Commit the final simple-rows record, which has no following '>' line.
    if self.simple_rows and curr_taxon and curr_vec:
        self.char_matrix[curr_taxon] = "".join(curr_vec)
    return self.dataset
def _build_state_alphabet(self, char_block, symbols):
    """
    Build a state alphabet for `symbols` and attach it to `char_block`.

    The reader's own gap and missing-data characters (`self.gap_char`,
    `self.missing_char`) are used when constructing the alphabet. After the
    call, `char_block.state_alphabets` holds just this alphabet and
    `char_block.default_state_alphabet` refers to it.
    """
    new_alphabet = dataobject.get_state_alphabet_from_symbols(
        symbols,
        gap_symbol=self.gap_char,
        missing_symbol=self.missing_char,
    )
    char_block.state_alphabets = [new_alphabet]
    # Take the default from what the block now holds rather than from the
    # local variable, so the two attributes always agree.
    char_block.default_state_alphabet = char_block.state_alphabets[0]
def _ncl_characters_block_to_native(self, taxa_block, ncl_cb, ncl_nxs_reader_handle):
    """
    Convert an NCL characters block into a native character matrix.

    The matrix type is chosen by `_ncl_datatype_enum_to_dendropy` from the
    block's NCL datatype, and `taxa_block` becomes its taxon set. NCL's
    numeric state codes are translated into this library's state objects,
    and one `CharacterDataVector` is created per taxon. The vectors are
    left empty when `self.exclude_chars` is set.

    Parameters:
        taxa_block: the taxon set the matrix rows are keyed on; it must
            have as many taxa as the raw NCL matrix has rows.
        ncl_cb: the NCL characters block to convert.
        ncl_nxs_reader_handle: the NCL reader; only used by the currently
            disabled charset/exset reporting at the end.

    Returns the new character matrix, or None (after logging a warning)
    when the NCL block is of mixed datatype.

    Raises:
        ValueError: an NCL multi-state code has no state with the same
            set of member states in the matrix's alphabet.
    """
    raw_matrix = ncl_cb.GetRawDiscreteMatrixRef()
    if ncl_cb.IsMixedType():
        _LOG.warn("Mixed datatype character blocks are not supported in Dendropy. Skipping...")
        return None
    char_block_type = _ncl_datatype_enum_to_dendropy(ncl_cb.GetDataType())
    mapper = ncl_cb.GetDatatypeMapperForCharRef(0)
    symbols = mapper.GetSymbols()
    state_codes_mapping = mapper.GetPythonicStateVectors()

    char_block = char_block_type()
    char_block.taxon_set = taxa_block
    if isinstance(char_block, dataobject.StandardCharacterMatrix):
        # Standard matrices get an alphabet built from the NCL symbols;
        # other matrix types keep whatever default alphabet they have.
        sa = dataobject.get_state_alphabet_from_symbols(
            symbols=symbols,
            gap_symbol='-',
            missing_symbol='?'
        )
        char_block.state_alphabets = [sa]
        char_block.default_state_alphabet = char_block.state_alphabets[0]
    symbol_state_map = char_block.default_state_alphabet.symbol_state_map()

    # Index i of this list is the state for NCL numeric code i. The
    # fundamental symbols come first, in NCL's symbol order.
    ncl_numeric_code_to_state = [symbol_state_map[s] for s in symbols]

    # Codes after the fundamental symbols, excluding the final two, are
    # matched to the state whose member states form the same set.
    # NOTE(review): presumably NCL lists the gap and missing codes last,
    # which is why the last two entries are skipped here - confirm.
    for sc in state_codes_mapping[len(symbols):-2]:
        search = set(ncl_numeric_code_to_state[fundamental_state]
                     for fundamental_state in sc)
        for state in symbol_state_map.values():
            ms = state.member_states
            if ms and set(ms) == search:
                ncl_numeric_code_to_state.append(state)
                break
        else:
            raise ValueError("NCL datatype cannot be coerced into datatype because ambiguity code for %s is missing " % str(search))
    ncl_numeric_code_to_state.append(symbol_state_map['-'])
    ncl_numeric_code_to_state.append(symbol_state_map['?'])

    assert (len(raw_matrix) == len(taxa_block))
    for row_ind, taxon in enumerate(taxa_block):
        v = dataobject.CharacterDataVector(taxon=taxon)
        raw_row = raw_matrix[row_ind]
        char_block[taxon] = v
        if not self.exclude_chars:
            for c in raw_row:
                state = ncl_numeric_code_to_state[c]
                v.append(dataobject.CharacterDataCell(value=state))

    #dataset.characters_blocks.append(char_block)
    # Exset/charset reporting is switched off; both flags are hard-coded.
    supporting_exsets = False
    supporting_charset_exsets = False

    if supporting_exsets:
        s = ncl_cb.GetExcludedIndexSet()
        print("Excluded chars = %s" % str(nclwrapper.NxsSetReader.GetSetAsVector(s)))
    if supporting_charset_exsets:
        _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
        nab = ncl_nxs_reader_handle.GetNumAssumptionsBlocks(ncl_cb)
        for k in range(nab):
            _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
            a = ncl_nxs_reader_handle.GetAssumptionsBlock(ncl_cb, k)
            cs = a.GetCharSetNames()
            print("CharSets have the names  %s" % str(cs))
    return char_block
def _ncl_characters_block_to_native(self, taxa_block, ncl_cb, ncl_nxs_reader_handle):
    """
    Convert an NCL characters block into a native character matrix.

    The matrix type is chosen by `_ncl_datatype_enum_to_dendropy` from the
    block's NCL datatype, and `taxa_block` becomes its taxon set. NCL's
    numeric state codes are translated into this library's state objects,
    and one `CharacterDataVector` is created per taxon. The vectors are
    left empty when `self.exclude_chars` is set.

    Parameters:
        taxa_block: the taxon set the matrix rows are keyed on; it must
            have as many taxa as the raw NCL matrix has rows.
        ncl_cb: the NCL characters block to convert.
        ncl_nxs_reader_handle: the NCL reader; only used by the currently
            disabled charset/exset reporting at the end.

    Returns the new character matrix, or None (after logging a warning)
    when the NCL block is of mixed datatype.

    Raises:
        ValueError: an NCL multi-state code has no state with the same
            set of member states in the matrix's alphabet.
    """
    raw_matrix = ncl_cb.GetRawDiscreteMatrixRef()
    if ncl_cb.IsMixedType():
        _LOG.warn(
            "Mixed datatype character blocks are not supported in Dendropy. Skipping..."
        )
        return None
    char_block_type = _ncl_datatype_enum_to_dendropy(
        ncl_cb.GetDataType())
    mapper = ncl_cb.GetDatatypeMapperForCharRef(0)
    symbols = mapper.GetSymbols()
    state_codes_mapping = mapper.GetPythonicStateVectors()

    char_block = char_block_type()
    char_block.taxon_set = taxa_block
    if isinstance(char_block, dataobject.StandardCharacterMatrix):
        # Standard matrices get an alphabet built from the NCL symbols;
        # other matrix types keep whatever default alphabet they have.
        sa = dataobject.get_state_alphabet_from_symbols(
            symbols=symbols, gap_symbol='-', missing_symbol='?')
        char_block.state_alphabets = [sa]
        char_block.default_state_alphabet = char_block.state_alphabets[0]
    symbol_state_map = char_block.default_state_alphabet.symbol_state_map()

    # Index i of this list is the state for NCL numeric code i. The
    # fundamental symbols come first, in NCL's symbol order.
    ncl_numeric_code_to_state = [symbol_state_map[s] for s in symbols]

    # Codes after the fundamental symbols, excluding the final two, are
    # matched to the state whose member states form the same set.
    # NOTE(review): presumably NCL lists the gap and missing codes last,
    # which is why the last two entries are skipped here - confirm.
    for sc in state_codes_mapping[len(symbols):-2]:
        search = set(ncl_numeric_code_to_state[fundamental_state]
                     for fundamental_state in sc)
        for state in symbol_state_map.values():
            ms = state.member_states
            if ms and set(ms) == search:
                ncl_numeric_code_to_state.append(state)
                break
        else:
            raise ValueError(
                "NCL datatype cannot be coerced into datatype because ambiguity code for %s is missing "
                % str(search))
    ncl_numeric_code_to_state.append(symbol_state_map['-'])
    ncl_numeric_code_to_state.append(symbol_state_map['?'])

    assert (len(raw_matrix) == len(taxa_block))
    for row_ind, taxon in enumerate(taxa_block):
        v = dataobject.CharacterDataVector(taxon=taxon)
        raw_row = raw_matrix[row_ind]
        char_block[taxon] = v
        if not self.exclude_chars:
            for c in raw_row:
                state = ncl_numeric_code_to_state[c]
                v.append(dataobject.CharacterDataCell(value=state))

    #dataset.characters_blocks.append(char_block)
    # Exset/charset reporting is switched off; both flags are hard-coded.
    supporting_exsets = False
    supporting_charset_exsets = False

    if supporting_exsets:
        s = ncl_cb.GetExcludedIndexSet()
        print("Excluded chars = %s" % str(
            nclwrapper.NxsSetReader.GetSetAsVector(s)))
    if supporting_charset_exsets:
        _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
        nab = ncl_nxs_reader_handle.GetNumAssumptionsBlocks(ncl_cb)
        for k in range(nab):
            _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
            a = ncl_nxs_reader_handle.GetAssumptionsBlock(ncl_cb, k)
            cs = a.GetCharSetNames()
            print("CharSets have the names  %s" % str(cs))
    return char_block