def _read(self, stream, taxon_namespace_factory=None, tree_list_factory=None, char_matrix_factory=None, state_alphabet_factory=None, global_annotations_target=None):
    """
    Parse one PHYLIP-format data source into a single character matrix.

    Parameters
    ----------
    stream : file-like or source accepted by ``filesys.get_lines``
        Source of the PHYLIP data; also stored on ``self.stream``.
    taxon_namespace_factory : callable
        Called as ``taxon_namespace_factory(label=None)`` to create the
        taxon namespace for the matrix.
    tree_list_factory : callable
        Not used by this method.
    char_matrix_factory : callable
        Called with ``self.data_type`` plus ``label`` and
        ``taxon_namespace`` keywords (and ``default_state_alphabet`` when
        one is configured for "standard" data) to create the matrix.
    state_alphabet_factory : callable
        Only called for "standard" data without a configured default
        alphabet, to build a 0-9 alphabet with "?" and "-" symbols.
    global_annotations_target : object
        Not used by this method.

    Returns
    -------
    ``self.Product`` instance whose ``char_matrices`` holds the one matrix.

    Raises
    ------
    TypeError
        If ``self.data_type`` is ``None``.
    error.DataParseError
        If the source is empty, has no data line after the description
        line, has a malformed description line, or declares zero
        taxa/characters. ``self._taxon_error`` is invoked when the number
        of taxa processed differs from the declared number.
    """
    self.reset()
    self.stream = stream
    self.taxon_namespace = taxon_namespace_factory(label=None)
    if self.data_type is None:
        raise TypeError("Data type must be specified for this schema")
    if self.data_type == "standard" and self.default_state_alphabet is not None:
        self.char_matrix = char_matrix_factory(
            self.data_type,
            label=None,
            taxon_namespace=self.taxon_namespace,
            default_state_alphabet=self.default_state_alphabet,
        )
    else:
        self.char_matrix = char_matrix_factory(
            self.data_type,
            label=None,
            taxon_namespace=self.taxon_namespace)
        if self.data_type == "standard":
            # No alphabet was configured: fall back to a numeric one.
            state_alphabet = state_alphabet_factory(
                fundamental_states="0123456789",
                no_data_symbol="?",
                gap_symbol="-",
                case_sensitive=False)
            self.char_matrix.state_alphabets.append(state_alphabet)

    lines = filesys.get_lines(stream)
    if len(lines) == 0:
        raise error.DataParseError("No data in source", stream=self.stream)
    elif len(lines) < 2:
        # A description line plus at least one data line is required; the
        # check now matches the "at least 2 lines" wording of the message.
        raise error.DataParseError(
            "Expecting at least 2 lines in PHYLIP format data source",
            stream=self.stream)

    desc_line = lines[0]
    lines = lines[1:]
    # Raw string: "\s" and "\d" are regex classes, not string escapes.
    m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
    if m is None:
        raise self._data_parse_error(
            "Invalid data description line: '%s'" % desc_line)
    self.ntax = int(m.group(1))
    self.nchar = int(m.group(2))
    if self.ntax == 0 or self.nchar == 0:
        raise error.DataParseError("No data in source", stream=self.stream)

    if self.interleaved:
        self._parse_interleaved(lines)
    else:
        self._parse_sequential(lines)

    if len(self.taxa_processed) != self.ntax:
        self._taxon_error(num_expected=self.ntax, found=self.taxa_processed)

    product = self.Product(
        taxon_namespaces=None,
        tree_lists=None,
        char_matrices=[self.char_matrix])
    return product
def _read(self, stream, taxon_namespace_factory=None, tree_list_factory=None, char_matrix_factory=None, state_alphabet_factory=None, global_annotations_target=None):
    """
    Read a source holding several PHYLIP data blocks, one matrix per block.

    The whole stream is read into memory and cut at every position matched
    by group 1 of ``MultiPhylipReader.data_block_start_pattern``. Each
    slice (from one start position up to the next, or to the end of the
    data) is wrapped in a ``StringIO`` and handed to
    ``self._phylip_reader._read`` together with all factory arguments.

    Returns a ``self.Product`` whose ``char_matrices`` is the concatenation
    of the matrices produced for every block, in source order.

    Raises ``error.DataParseError`` when no block start is found.
    """
    text = stream.read()
    block_starts = [
        found.start(1)
        for found in MultiPhylipReader.data_block_start_pattern.finditer(text)
    ]
    if not block_starts:
        raise error.DataParseError("No PHYLIP data blocks found in source", stream=stream)

    # Every block ends where the next one begins; the last runs to the end.
    block_ends = block_starts[1:] + [len(text)]

    collected = []
    for begin, end in zip(block_starts, block_ends):
        block_result = self._phylip_reader._read(
            stream=StringIO(text[begin:end]),
            taxon_namespace_factory=taxon_namespace_factory,
            tree_list_factory=tree_list_factory,
            char_matrix_factory=char_matrix_factory,
            state_alphabet_factory=state_alphabet_factory,
            global_annotations_target=global_annotations_target,
        )
        collected.extend(block_result.char_matrices)

    return self.Product(
        taxon_namespaces=None,
        tree_lists=None,
        char_matrices=collected)
def read(self, stream):
    """
    Parse PHYLIP-format data from ``stream`` into a new character matrix
    created on ``self.dataset``, and return the dataset.

    When ``self.exclude_chars`` is true, nothing is read and the current
    ``self.dataset`` is returned as-is. A ``DataSet`` is created if
    ``self.dataset`` is ``None``.

    Raises
    ------
    ValueError
        If the matrix has neither a default state alphabet nor any state
        alphabets.
    NotImplementedError
        If the matrix has no default state alphabet and more than one
        state alphabet.
    error.DataSourceError
        If the source has no lines, or declares zero taxa/characters.
    error.DataParseError
        If there is no data line after the description line, or the
        description line is malformed.
    """
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(
        char_matrix_type=self.char_matrix_type,
        taxon_set=taxon_set)

    # A standard matrix with no alphabets gets a numeric default.
    if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
        self.char_matrix.state_alphabets.append(
            dataobject.get_state_alphabet_from_symbols("0123456789"))
        self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]

    if self.char_matrix.default_state_alphabet is not None:
        self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
    elif len(self.char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(self.char_matrix.state_alphabets) > 1:
        raise NotImplementedError(
            "Mixed state-alphabet matrices not supported")
    else:
        # Store the symbol->state mapping, as the default-alphabet branch
        # does; previously the alphabet object itself was stored here.
        self.symbol_state_map = self.char_matrix.state_alphabets[0].symbol_state_map()

    self.stream = stream
    lines = fileutils.get_lines(self.stream)
    if len(lines) == 0:
        raise error.DataSourceError("No data in source", stream=self.stream)
    elif len(lines) < 2:
        # A description line plus at least one data line is required; the
        # check now matches the "at least 2 lines" wording of the message.
        raise error.DataParseError(
            "Expecting at least 2 lines in PHYLIP format data source",
            stream=self.stream)

    desc_line = lines[0]
    lines = lines[1:]
    # Raw string: "\s" and "\d" are regex classes, not string escapes.
    m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
    if m is None:
        raise self._data_parse_error(
            "Invalid data description line: '%s'" % desc_line)
    self.ntax = int(m.group(1))
    self.nchar = int(m.group(2))
    if self.ntax == 0 or self.nchar == 0:
        raise error.DataSourceError("No data in source", stream=self.stream)

    if self.interleaved:
        self._parse_interleaved(lines)
    else:
        self._parse_sequential(lines)
    self.stream = None
    return self.dataset
def _taxon_error(self, num_expected, found):
    """
    Raise ``error.DataParseError`` reporting a taxon-count mismatch.

    The message states how many taxa were expected, how many were found
    (prefixed with "only " when fewer than expected), and lists the found
    items, using singular/plural nouns as appropriate.

    Parameters
    ----------
    num_expected : int
        Number of taxa that should have been present.
    found : sized iterable
        The taxa actually encountered; each is rendered with ``format``.
    """
    num_found = len(found)
    expected_noun = "taxon" if num_expected == 1 else "taxa"
    found_noun = "taxon" if num_found == 1 else "taxa"
    shortfall = "only " if num_expected > num_found else ""
    listing = ", ".join("{}".format(t) for t in found)
    message = "{} {} expected but {}{} {} found: {}".format(
        num_expected,
        expected_noun,
        shortfall,
        num_found,
        found_noun,
        listing)
    raise error.DataParseError(message)
def parse_char_matrix(self, nxchars):
    """
    Given an XmlElement representing a nexml characters block, this
    instantiates and returns a corresponding DendroPy CharacterMatrix object.

    Parameters
    ----------
    nxchars : XmlElement
        The characters-block element. Its 'label', 'oid' and 'otus'
        attributes are read, along with its format, matrix, row, cell/seq
        and annotation children.

    Returns
    -------
    The matrix created by ``self._char_matrix_factory``, populated with
    one sequence per row of the block.

    Raises
    ------
    Exception
        If the block names no taxon namespace, names an unknown one, or
        has an unsupported character type.
    error.DataParseError
        If a row refers to an undefined taxon, a cell/sequence position
        has no character column defined, a cell lacks or mis-references
        its 'char' attribute, or a sequence symbol is not in the column's
        state alphabet.
    """
    # Reset the per-block lookup tables.
    self._id_state_alphabet_map = {}
    self._id_state_map = {}
    self._id_chartype_map = {}
    self._char_types = []
    self._chartype_id_to_pos_map = {}

    matrix_label = nxchars.get('label', None)
    char_matrix_oid = nxchars.get('oid', '')

    # Resolve the taxon namespace. The matrix does not exist yet, so the
    # messages below use the label read from the element.
    otus_id = nxchars.get('otus', None)
    if otus_id is None:
        raise Exception(
            "Character Block %s (\"%s\"): Taxon namespace not specified"
            % (char_matrix_oid, matrix_label))
    taxon_namespace = self._id_taxon_namespace_map.get(otus_id, None)
    # Compare against None so that a registered namespace that happens to
    # evaluate as false (e.g. an empty one) is not reported as missing.
    if taxon_namespace is None:
        raise Exception(
            "Character Block %s (\"%s\"): Specified taxon namespace not found"
            % (char_matrix_oid, matrix_label))

    # Map the nexml character type onto a data type name.
    nxchartype = nxchars.parse_type()
    data_type = None
    for prefix, type_name in (
            ('Dna', "dna"),
            ('Rna', "rna"),
            ('Protein', "protein"),
            ('Restriction', "restriction"),
            ('Standard', "standard"),
            ('Continuous', "continuous")):
        if nxchartype.startswith(prefix):
            data_type = type_name
            break
    if data_type is None:
        raise Exception(
            "Character Block %s (\"%s\"): Character type '%s' not supported"
            % (char_matrix_oid, matrix_label, nxchartype))
    extra_kwargs = {}
    if data_type == "standard":
        # Standard matrices are created without a default alphabet; the
        # alphabets are set up from the format element (or created) below.
        extra_kwargs["default_state_alphabet"] = None

    char_matrix = self._char_matrix_factory(
        data_type,
        taxon_namespace=taxon_namespace,
        label=matrix_label,
        **extra_kwargs)

    # Annotations are collected into a list before processing.
    for annotation in list(nxchars.findall_annotations()):
        self._parse_annotations(char_matrix, annotation)

    # State / column definitions.
    nxformat = nxchars.find_char_format()
    if nxformat is not None:
        self.parse_characters_format(nxformat, data_type, char_matrix)
    elif data_type == "standard":
        self.create_standard_character_alphabet(char_matrix)

    nxmatrix = nxchars.find_char_matrix()
    for annotation in list(nxmatrix.findall_annotations()):
        self._parse_annotations(char_matrix.taxon_seq_map, annotation)

    def resolve_chartype(nxcell, row_id, taxon):
        # Return (chartype_id, chartype) for a cell, validating 'char'.
        chartype_id = nxcell.get('char', None)
        if chartype_id is None:
            raise error.DataParseError(
                message="'char' attribute missing for cell: cell markup must indicate character column type for character"
                + " (matrix = '%s' row='%s', taxon='%s')" % (char_matrix_oid, row_id, taxon.label))
        if chartype_id not in self._id_chartype_map:
            raise error.DataParseError(
                message="Character type ('<char>') with id '%s' referenced but not found for character" % chartype_id
                + " (matrix = '%s' row='%s', taxon='%s')" % (char_matrix_oid, row_id, taxon.label))
        return chartype_id, self._id_chartype_map[chartype_id]

    uses_seq_markup = nxchartype.endswith('Seqs')

    for nxrow in nxmatrix.findall_char_row():
        row_id = nxrow.get('id', None)
        taxon_id = nxrow.get('otu', None)
        try:
            taxon = self._id_taxon_map[(otus_id, taxon_id)]
        except KeyError:
            raise error.DataParseError(
                message='Character Block %s (\"%s\"): Taxon with id "%s" not defined in taxa block "%s"'
                % (char_matrix_oid, matrix_label, taxon_id, otus_id))

        character_vector = char_matrix.new_sequence(taxon=taxon)
        for annotation in list(nxrow.findall_annotations()):
            self._parse_annotations(character_vector, annotation)

        if data_type == "continuous":
            if uses_seq_markup:
                seq = nxrow.find_char_seq()
                if seq is not None:
                    # Values are whitespace-separated; split() treats any
                    # run of spaces, tabs or line breaks as one separator.
                    for col_idx, token in enumerate(seq.split()):
                        if len(self._char_types) <= col_idx:
                            raise error.DataParseError(
                                message="Character column/type ('<char>') not defined for character in position"
                                + " %d (matrix = '%s' row='%s', taxon='%s')" % (col_idx + 1, char_matrix_oid, row_id, taxon.label))
                        character_vector.append(
                            character_value=float(token),
                            character_type=self._char_types[col_idx])
            else:
                for nxcell in nxrow.findall_char_cell():
                    chartype_id, chartype = resolve_chartype(nxcell, row_id, taxon)
                    character_vector.append(
                        character_value=float(nxcell.get('state')),
                        character_type=chartype)
        else:
            if uses_seq_markup:
                seq = nxrow.find_char_seq()
                if seq is not None:
                    # One symbol per column; drop all embedded whitespace.
                    seq = "".join(seq.split())
                    for col_idx, symbol in enumerate(seq):
                        # Check the column exists before indexing by it, so
                        # an over-long row gives a parse error.
                        if len(self._char_types) <= col_idx:
                            raise error.DataParseError(
                                message="Character column/type ('<char>') not defined for character in position"
                                + " %d (row='%s', taxon='%s')" % (col_idx + 1, row_id, taxon.label))
                        state_alphabet = char_matrix.character_types[col_idx].state_alphabet
                        try:
                            state = state_alphabet[symbol]
                        except KeyError:
                            raise error.DataParseError(
                                message="Character Block row '%s', character position %s: State with symbol '%s' in sequence '%s' not defined"
                                % (row_id, col_idx, symbol, seq))
                        character_vector.append(
                            character_value=state,
                            character_type=self._char_types[col_idx])
            else:
                for nxcell in nxrow.findall_char_cell():
                    chartype_id, chartype = resolve_chartype(nxcell, row_id, taxon)
                    state_alphabet = chartype.state_alphabet
                    pos_idx = self._chartype_id_to_pos_map[chartype_id]
                    state = self._id_state_map[(state_alphabet, nxcell.get('state', None))]
                    # Cells are placed by column position, not by order of
                    # appearance in the row.
                    character_vector.set_at(
                        pos_idx,
                        character_value=state,
                        character_type=chartype)

        char_matrix[taxon] = character_vector

    return char_matrix