Exemplo n.º 1
0
 def _process_chars(self, char_group, char_block, symbol_state_map, taxon):
     if self.exclude_chars:
         return
     if not char_group:
         return
     char_group = self._parse_nexus_multistate(char_group)
     for char in char_group:
         if len(char) == 1:
             try:
                 state = symbol_state_map[char.upper()]
             except KeyError:
                 if self.match_char is not None \
                     and char.upper() == self.match_char.upper():
                     state = char_block[0][len(char_block[taxon])].value
                 else:
                     raise self.data_format_error(
                         "Unrecognized (single) state encountered in '%s': '%s' is not defined in %s"
                         % ("".join(char_group), char,
                            symbol_state_map.keys()))
         else:
             if hasattr(char, "open_tag"):
                 state = self._get_state_for_multistate_char(
                     char, char_block.default_state_alphabet)
             else:
                 raise self.data_format_error(
                     "Multiple character state without multi-state mark-up: '%s'"
                     % char)
         if state is None:
             raise self.data_format_error(
                 "Unrecognized state encountered: '%s'" % char)
         char_block[taxon].append(dataobject.CharacterDataCell(value=state))
Exemplo n.º 2
0
 def _parse_sequence_from_line(self, current_taxon, line, line_index):
     for c in line:
         if c in [' ', '\t']:
             continue
         try:
             state = self.symbol_state_map[c.upper()]
         except KeyError:
             if not self.ignore_invalid_chars:
                 raise self._data_parse_error(
                     "Invalid state symbol for taxon '%s': '%s'" %
                     (current_taxon.label, c),
                     line_index=line_index)
         else:
             self.char_matrix[current_taxon].append(
                 dataobject.CharacterDataCell(value=state))
Exemplo n.º 3
0
 def _process_continuous_matrix_data(self, char_block):
     taxon_set = char_block.taxon_set
     token = self.stream_tokenizer.read_next_token()
     while token != ';' and not self.stream_tokenizer.eof:
         taxon = self._get_taxon(taxon_set=taxon_set, label=token)
         if taxon not in char_block:
             char_block[taxon] = dataobject.CharacterDataVector(taxon=taxon)
             if self.interleave:
                 raise NotImplementedError(
                     "Continuous characters in NEXUS schema not yet supported"
                 )
             else:
                 while len(
                         char_block[taxon]
                 ) < self.file_specified_nchar and not self.stream_tokenizer.eof:
                     char_group = self.stream_tokenizer.read_next_token(
                         ignore_punctuation="-+")
                     char_block[taxon].append(
                         dataobject.CharacterDataCell(
                             value=float(char_group)))
                 if len(char_block[taxon]) < self.file_specified_nchar:
                     raise self.data_format_error("Insufficient characters given for taxon '%s': expecting %d but only found %d ('%s')" \
                         % (taxon.label, self.file_specified_nchar, len(char_block[taxon]), char_block[taxon].symbols_as_string()))
                 token = self.stream_tokenizer.read_next_token()
Exemplo n.º 4
0
    def read(self, stream):
        """
        Main file parsing driver: parse FASTA-formatted lines from ``stream``
        into a new character matrix of ``self.dataset``.

        Lines starting with ``>`` open a new sequence named by the rest of
        the line; all other non-blank lines are sequence data for the most
        recently opened sequence. When ``self.simple_rows`` is set, each
        row is stored as a plain string of symbols; otherwise each symbol is
        mapped to a state and stored as a ``CharacterDataCell``.

        Returns ``self.dataset`` (unchanged, and without reading ``stream``,
        when ``self.exclude_chars`` is set).

        Raises:
            ValueError: if the matrix has no state alphabet.
            NotImplementedError: if the matrix has no default state alphabet
                and more than one candidate alphabet.
            DataParseError: on a repeated sequence name, a sequence name
                that directly follows an empty sequence, sequence data
                before any ``>`` line, or an unrecognized sequence symbol.
        """

        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
        if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
            self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[
                0]

        # Pick the single alphabet used to interpret symbols.
        if self.char_matrix.default_state_alphabet is not None:
            state_alphabet = self.char_matrix.default_state_alphabet
        elif len(self.char_matrix.state_alphabets) == 0:
            raise ValueError("No state alphabets defined")
        elif len(self.char_matrix.state_alphabets) > 1:
            raise NotImplementedError(
                "Mixed state-alphabet matrices not supported")
        else:
            state_alphabet = self.char_matrix.state_alphabets[0]
        # Always store the symbol -> state mapping, never the alphabet
        # object itself, so the lookups below behave the same on every path.
        self.symbol_state_map = state_alphabet.symbol_state_map()

        curr_vec = None
        curr_taxon = None

        if self.simple_rows:
            # Use the alphabet chosen above: the matrix may have no default
            # alphabet when the single-alphabet fallback was taken.
            legal_chars = state_alphabet.get_legal_symbols_as_str()

        for line_index, line in enumerate(stream):
            s = line.strip()
            if not s:
                continue
            if s.startswith('>'):
                # Flush the previous simple row before starting a new one.
                if self.simple_rows and curr_taxon and curr_vec:
                    self.char_matrix[curr_taxon] = "".join(curr_vec)
                name = s[1:].strip()
                curr_taxon = taxon_set.require_taxon(label=name)
                if curr_taxon in self.char_matrix:
                    raise DataParseError(
                        message="Fasta error: Repeated sequence name (%s) found"
                        % name,
                        row=line_index + 1,
                        stream=stream)
                if curr_vec is not None and len(curr_vec) == 0:
                    raise DataParseError(
                        message=
                        "Fasta error: Expected sequence, but found another sequence name (%s)"
                        % name,
                        row=line_index + 1,
                        stream=stream)
                if self.simple_rows:
                    curr_vec = []
                else:
                    curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                    self.char_matrix[curr_taxon] = curr_vec
            elif curr_vec is None:
                raise DataParseError(
                    message=
                    "Fasta error: Expecting a lines starting with > before sequences",
                    row=line_index + 1,
                    stream=stream)
            elif self.simple_rows:
                for col_ind, c in enumerate(s):
                    c = c.strip()
                    if not c:
                        continue
                    if c not in legal_chars:
                        # The error used to be constructed but never
                        # raised, so illegal symbols were silently kept.
                        raise DataParseError(
                            message='Unrecognized sequence symbol "%s"' %
                            c,
                            row=line_index + 1,
                            column=col_ind + 1,
                            stream=stream)
                    curr_vec.append(c)
            else:
                for col_ind, c in enumerate(s):
                    c = c.strip()
                    if not c:
                        continue
                    try:
                        state = self.symbol_state_map[c]
                    except KeyError:
                        # Only a failed lookup means an unknown symbol;
                        # other errors must not be disguised as parse errors.
                        raise DataParseError(
                            message='Unrecognized sequence symbol "%s"' %
                            c,
                            row=line_index + 1,
                            column=col_ind + 1,
                            stream=stream)
                    curr_vec.append(
                        dataobject.CharacterDataCell(value=state))
        # Flush the final simple row, which has no following '>' line.
        if self.simple_rows and curr_taxon and curr_vec:
            self.char_matrix[curr_taxon] = "".join(curr_vec)
        return self.dataset
Exemplo n.º 5
0
        def _ncl_characters_block_to_native(self, taxa_block, ncl_cb,
                                            ncl_nxs_reader_handle):
            """
            Converts an NCL characters block (``ncl_cb``) into a native
            character matrix whose taxon set is ``taxa_block``.

            Returns the populated character block, or ``None`` (after
            logging a warning) if the NCL block is of mixed datatype.
            Rows are created for every taxon in ``taxa_block``; cells are
            only filled in when ``self.exclude_chars`` is false.

            Raises ``ValueError`` if an NCL ambiguity code has no state in
            the native alphabet with the same set of member states.

            ``ncl_nxs_reader_handle`` is only used by the assumptions-block
            debugging code at the end, which is currently disabled.
            """
            raw_matrix = ncl_cb.GetRawDiscreteMatrixRef()
            if ncl_cb.IsMixedType():
                _LOG.warn(
                    "Mixed datatype character blocks are not supported in Dendropy.  Skipping..."
                )
                return None
            char_block_type = _ncl_datatype_enum_to_dendropy(
                ncl_cb.GetDataType())
            # The datatype mapper of character 0 is used for the whole block.
            mapper = ncl_cb.GetDatatypeMapperForCharRef(0)
            symbols = mapper.GetSymbols()
            state_codes_mapping = mapper.GetPythonicStateVectors()

            char_block = char_block_type()
            char_block.taxon_set = taxa_block
            if isinstance(char_block, dataobject.StandardCharacterMatrix):
                # Standard matrices get an alphabet built from the NCL
                # symbols, with '-' as gap and '?' as missing.
                sa = dataobject.get_state_alphabet_from_symbols(
                    symbols=symbols, gap_symbol='-', missing_symbol='?')
                char_block.state_alphabets = [sa]
                char_block.default_state_alphabet = char_block.state_alphabets[
                    0]
            symbol_state_map = char_block.default_state_alphabet.symbol_state_map(
            )

            # Lookup table indexed by NCL numeric state code, built in three
            # parts: fundamental symbols, ambiguity codes, then '-' and '?'.
            ncl_numeric_code_to_state = []
            for s in symbols:
                ncl_numeric_code_to_state.append(symbol_state_map[s])
            # NOTE(review): the slice skips the first len(symbols) entries and
            # the last two; presumably those are the fundamental states and
            # the gap/missing codes appended below -- confirm against NCL.
            for sc in state_codes_mapping[len(symbols):-2]:
                # Native states corresponding to this code's member states.
                search = set()
                for fundamental_state in sc:
                    search.add(ncl_numeric_code_to_state[fundamental_state])
                found = False
                # Find a native state whose member states match exactly.
                for sym, state in symbol_state_map.iteritems():
                    ms = state.member_states
                    if ms:
                        possible = set(ms)
                        if possible == search:
                            found = True
                            ncl_numeric_code_to_state.append(state)
                            break
                if not found:
                    raise ValueError(
                        "NCL datatype cannot be coerced into datatype because ambiguity code for %s is missing "
                        % str(search))
            ncl_numeric_code_to_state.append(symbol_state_map['-'])
            ncl_numeric_code_to_state.append(symbol_state_map['?'])

            # Rows of the raw matrix are paired with taxa by position.
            assert (len(raw_matrix) == len(taxa_block))
            for row_ind, taxon in enumerate(taxa_block):
                v = dataobject.CharacterDataVector(taxon=taxon)
                raw_row = raw_matrix[row_ind]
                char_block[taxon] = v
                if not self.exclude_chars:
                    for c in raw_row:
                        state = ncl_numeric_code_to_state[c]
                        v.append(dataobject.CharacterDataCell(value=state))

            #dataset.characters_blocks.append(char_block)
            # Both flags are hard-coded to False, so the two diagnostic
            # sections below never run.
            supporting_exsets = False
            supporting_charset_exsets = False

            if supporting_exsets:
                s = ncl_cb.GetExcludedIndexSet()
                print "Excluded chars =", str(
                    nclwrapper.NxsSetReader.GetSetAsVector(s))
            if supporting_charset_exsets:
                # NOTE(review): both debug messages name GetNumTaxaBlocks(),
                # but the calls made are GetNumAssumptionsBlocks() and
                # GetAssumptionsBlock().
                _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
                nab = ncl_nxs_reader_handle.GetNumAssumptionsBlocks(ncl_cb)
                for k in xrange(nab):
                    _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
                    a = ncl_nxs_reader_handle.GetAssumptionsBlock(ncl_cb, k)
                    cs = a.GetCharSetNames()
                    print "CharSets have the names ", str(cs)
            return char_block