def __init__(self, row, column, taxon_set, max_taxa, label):
    """
    Records the taxon set and the rejected label on the exception, then
    initializes the ``DataParseError`` base with a message that names the
    rejected label, the declared number of taxa and the labels of the taxa
    already present in ``taxon_set``.
    """
    self.taxon_set = taxon_set
    self.label = label
    # Render each taxon label as text before embedding the list in the message.
    defined_labels = [("%s" % taxon.label) for taxon in taxon_set]
    template = "Cannot add '%s': Declared number of taxa (%d) already defined: %s"
    message = template % (label, max_taxa, str(defined_labels))
    DataParseError.__init__(self, message=message, row=row, column=column)
def __init__(self, row, column, taxon_set, max_taxa, label):
    """
    Stores ``taxon_set`` and ``label`` on the instance and passes a
    descriptive message, together with ``row`` and ``column``, on to
    ``DataParseError.__init__``.
    """
    self.taxon_set = taxon_set
    self.label = label
    # Collect the textual form of every label already in the taxon set.
    existing_labels = []
    for member in taxon_set:
        existing_labels.append("%s" % member.label)
    details = (label, max_taxa, str(existing_labels))
    DataParseError.__init__(
        self,
        message="Cannot add '%s': Declared number of taxa (%d) already defined: %s" % details,
        row=row,
        column=column,
    )
def data_format_error(self, message):
    """
    Builds a ``DataParseError`` for ``message`` tagged with the values of
    ``self.current_line_number`` and ``self.current_col_number``.

    The exception object is returned, not raised.
    """
    line_number = self.current_line_number
    col_number = self.current_col_number
    return DataParseError(message=message, row=line_number, column=col_number)
def read(self, stream):
    """
    Main file parsing driver.

    Reads FASTA-formatted lines from ``stream`` into a new character matrix
    created on ``self.dataset`` and returns ``self.dataset``. If
    ``self.exclude_chars`` is set, nothing is parsed and ``self.dataset``
    is returned as it is.

    A line starting with '>' opens a new sequence named by the rest of that
    line; the non-blank lines that follow supply its symbols (whitespace
    within a line is skipped). When ``self.simple_rows`` is set, each
    sequence is stored as a plain string of symbols; otherwise it is stored
    as a ``dataobject.CharacterDataVector`` of
    ``dataobject.CharacterDataCell`` objects.

    Raises ``ValueError`` if the matrix has no state alphabets,
    ``NotImplementedError`` if it has more than one and no default, and
    ``DataParseError`` for a repeated sequence name, a sequence name with
    no sequence data, sequence data before any name, or an unrecognized
    symbol.
    """
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(
        char_matrix_type=self.char_matrix_type,
        taxon_set=taxon_set)

    # A standard matrix without any alphabet gets a digits-only default.
    if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
        self.char_matrix.state_alphabets.append(
            dataobject.get_state_alphabet_from_symbols("0123456789"))
        self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]

    if self.char_matrix.default_state_alphabet is not None:
        self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
    elif len(self.char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(self.char_matrix.state_alphabets) > 1:
        raise NotImplementedError(
            "Mixed state-alphabet matrices not supported")
    else:
        # NOTE(review): this stores the alphabet object itself, not the
        # result of symbol_state_map() as in the first branch -- confirm
        # that it supports lookup by symbol.
        self.symbol_state_map = self.char_matrix.state_alphabets[0]

    curr_vec = None
    curr_taxon = None

    if self.simple_rows:
        legal_chars = self.char_matrix.default_state_alphabet.get_legal_symbols_as_str()

    for line_index, line in enumerate(stream):
        s = line.strip()
        if not s:
            continue
        if s.startswith('>'):
            # In simple-row mode the previous sequence is only committed to
            # the matrix once the next header (or end of input) is reached.
            if self.simple_rows and curr_taxon and curr_vec:
                self.char_matrix[curr_taxon] = "".join(curr_vec)
            name = s[1:].strip()
            curr_taxon = taxon_set.require_taxon(label=name)
            if curr_taxon in self.char_matrix:
                raise DataParseError(
                    message="Fasta error: Repeated sequence name (%s) found" % name,
                    row=line_index + 1,
                    stream=stream)
            if curr_vec is not None and len(curr_vec) == 0:
                raise DataParseError(
                    message="Fasta error: Expected sequence, but found another sequence name (%s)" % name,
                    row=line_index + 1,
                    stream=stream)
            if self.simple_rows:
                curr_vec = []
            else:
                curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                self.char_matrix[curr_taxon] = curr_vec
        elif curr_vec is None:
            raise DataParseError(
                message="Fasta error: Expecting a lines starting with > before sequences",
                row=line_index + 1,
                stream=stream)
        elif self.simple_rows:
            for col_ind, c in enumerate(s):
                c = c.strip()
                if not c:
                    continue
                # Reject illegal symbols, as the non-simple branch below
                # does; the error must be raised, not merely constructed.
                if c not in legal_chars:
                    raise DataParseError(
                        message='Unrecognized sequence symbol "%s"' % c,
                        row=line_index + 1,
                        column=col_ind + 1,
                        stream=stream)
                curr_vec.append(c)
        else:
            for col_ind, c in enumerate(s):
                c = c.strip()
                if not c:
                    continue
                # Only the symbol lookup is guarded, so an unrelated failure
                # while building the cell is not reported as a bad symbol.
                # ``Exception`` (rather than a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                try:
                    state = self.symbol_state_map[c]
                except Exception:
                    raise DataParseError(
                        message='Unrecognized sequence symbol "%s"' % c,
                        row=line_index + 1,
                        column=col_ind + 1,
                        stream=stream)
                curr_vec.append(dataobject.CharacterDataCell(value=state))

    # Commit the final sequence in simple-row mode.
    if self.simple_rows and curr_taxon and curr_vec:
        self.char_matrix[curr_taxon] = "".join(curr_vec)
    return self.dataset
def _read(self,
          stream,
          taxon_namespace_factory=None,
          tree_list_factory=None,
          char_matrix_factory=None,
          state_alphabet_factory=None,
          global_annotations_target=None):
    """
    Parses FASTA-formatted lines from ``stream`` into a single character
    matrix and returns it wrapped in ``self.Product``.

    The taxon namespace comes from ``taxon_namespace_factory`` and the
    matrix from ``char_matrix_factory``; ``self.default_state_alphabet`` is
    passed to the latter only when ``self.data_type`` is "standard" and the
    alphabet is not None. Symbols are translated through the matrix's
    ``default_state_alphabet.full_symbol_state_map``. ``tree_list_factory``,
    ``state_alphabet_factory`` and ``global_annotations_target`` are
    accepted but not used here.

    Raises ``TypeError`` if ``self.data_type`` is None, and
    ``DataParseError`` for a repeated sequence name, a sequence name with
    no sequence data, sequence data before any name, or an unrecognized
    symbol.
    """
    taxon_namespace = taxon_namespace_factory(label=None)
    if self.data_type is None:
        raise TypeError("Data type must be specified for this schema")

    matrix_kwargs = {"label": None, "taxon_namespace": taxon_namespace}
    if self.data_type == "standard" and self.default_state_alphabet is not None:
        matrix_kwargs["default_state_alphabet"] = self.default_state_alphabet
    char_matrix = char_matrix_factory(self.data_type, **matrix_kwargs)

    state_lookup = char_matrix.default_state_alphabet.full_symbol_state_map
    active_sequence = None
    for line_number, raw_line in enumerate(stream, 1):
        text = raw_line.strip()
        if not text:
            continue

        if text.startswith('>'):
            seq_name = text[1:].strip()
            taxon = taxon_namespace.require_taxon(label=seq_name)
            if taxon in char_matrix:
                raise DataParseError(
                    message="FASTA error: Repeated sequence name ('{}') found".format(seq_name),
                    line_num=line_number,
                    stream=stream)
            # A new header while the previous sequence is still empty means
            # that sequence never received any data.
            if active_sequence is not None and len(active_sequence) == 0:
                raise DataParseError(
                    message="FASTA error: Expected sequence, but found another sequence name ('{}')".format(seq_name),
                    line_num=line_number,
                    stream=stream)
            active_sequence = char_matrix[taxon]
            continue

        if active_sequence is None:
            raise DataParseError(
                message="FASTA error: Expecting a lines starting with > before sequences",
                line_num=line_number,
                stream=stream)

        # Translate the whole line first so that nothing from a line with a
        # bad symbol is added to the sequence.
        line_states = []
        for column_number, symbol in enumerate(text, 1):
            symbol = symbol.strip()
            if not symbol:
                continue
            try:
                state = state_lookup[symbol]
            except KeyError:
                raise DataParseError(
                    message="Unrecognized sequence symbol '{}'".format(symbol),
                    line_num=line_number,
                    col_num=column_number,
                    stream=stream)
            line_states.append(state)
        active_sequence.extend(line_states)

    return self.Product(
        taxon_namespaces=None,
        tree_lists=None,
        char_matrices=[char_matrix])
def __init__(self, *args, **kwargs):
    """
    Initializes the exception by passing all positional and keyword
    arguments through, unchanged, to ``DataParseError.__init__``.
    """
    DataParseError.__init__(self, *args, **kwargs)