Пример #1
0
 def _build_state_alphabet(self, char_block, symbols):
     """
     Install a single state alphabet built from ``symbols`` on ``char_block``.

     The alphabet is created by ``dataobject.get_state_alphabet_from_symbols``
     with ``self.gap_char`` and ``self.missing_char`` as the gap and missing
     symbols. It replaces whatever alphabets ``char_block`` had before and
     becomes the block's default alphabet.
     """
     alphabet = dataobject.get_state_alphabet_from_symbols(
         symbols, gap_symbol=self.gap_char, missing_symbol=self.missing_char)
     char_block.state_alphabets = [alphabet]
     # Read the default back from the block instead of reusing the local.
     char_block.default_state_alphabet = char_block.state_alphabets[0]
Пример #2
0
    def read(self, stream):
        """
        Read PHYLIP-formatted character data from ``stream``.

        A new character matrix of type ``self.char_matrix_type`` is added to
        ``self.dataset`` (which is created first if it is ``None``). The first
        line of the source must hold two integers, the number of taxa and the
        number of characters; the remaining lines are handed to
        ``self._parse_interleaved`` or ``self._parse_sequential`` depending
        on ``self.interleaved``.

        Returns ``self.dataset``. When ``self.exclude_chars`` is set, nothing
        is read and the dataset is returned unchanged.

        Raises:
            ValueError: the matrix has no state alphabet at all.
            NotImplementedError: the matrix has several state alphabets and
                no default one.
            error.DataSourceError: the source is empty, or declares zero
                taxa or zero characters.
            error.DataParseError: the source has fewer than two lines.
            The exception built by ``self._data_parse_error`` when the
            description line is malformed.
        """
        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
        # Standard matrices come without an alphabet; fall back to the digits.
        if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
                and len(self.char_matrix.state_alphabets) == 0:
            self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = \
                self.char_matrix.state_alphabets[0]
        if self.char_matrix.default_state_alphabet is not None:
            self.symbol_state_map = \
                self.char_matrix.default_state_alphabet.symbol_state_map()
        elif len(self.char_matrix.state_alphabets) == 0:
            raise ValueError("No state alphabets defined")
        elif len(self.char_matrix.state_alphabets) > 1:
            raise NotImplementedError(
                "Mixed state-alphabet matrices not supported")
        else:
            # Exactly one alphabet and no default: use that alphabet's
            # symbol map (not the alphabet object itself).
            self.symbol_state_map = \
                self.char_matrix.state_alphabets[0].symbol_state_map()

        self.stream = stream
        lines = fileutils.get_lines(self.stream)
        if len(lines) == 0:
            raise error.DataSourceError("No data in source",
                                        stream=self.stream)
        elif len(lines) < 2:
            # A description line plus at least one data line is required.
            raise error.DataParseError(
                "Expecting at least 2 lines in PHYLIP format data source",
                stream=self.stream)

        desc_line = lines[0]
        lines = lines[1:]
        m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
        if m is None:
            raise self._data_parse_error(
                "Invalid data description line: '%s'" % desc_line)
        self.ntax = int(m.group(1))
        self.nchar = int(m.group(2))
        if self.ntax == 0 or self.nchar == 0:
            raise error.DataSourceError("No data in source",
                                        stream=self.stream)
        if self.interleaved:
            self._parse_interleaved(lines)
        else:
            self._parse_sequential(lines)
        self.stream = None
        return self.dataset
Пример #3
0
    def read(self, stream):
        """
        Read PHYLIP-formatted character data from ``stream``.

        A new character matrix of type ``self.char_matrix_type`` is added to
        ``self.dataset`` (created first if it is ``None``). The first line of
        the source must contain two integers (number of taxa, number of
        characters); the remaining lines are passed to
        ``self._parse_interleaved`` or ``self._parse_sequential`` according
        to ``self.interleaved``.

        Returns ``self.dataset``. When ``self.exclude_chars`` is set, the
        stream is not touched and the dataset is returned as it is.

        Raises:
            ValueError: the matrix has no state alphabet at all.
            NotImplementedError: several state alphabets and no default one.
            error.DataSourceError: empty source, or zero taxa / characters.
            error.DataParseError: fewer than two lines in the source.
            The exception built by ``self._data_parse_error`` when the
            description line does not match the expected layout.
        """
        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
        # Standard matrices come without an alphabet; fall back to the digits.
        if (
            isinstance(self.char_matrix, dataobject.StandardCharacterMatrix)
            and len(self.char_matrix.state_alphabets) == 0
        ):
            self.char_matrix.state_alphabets.append(dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]
        if self.char_matrix.default_state_alphabet is not None:
            self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
        elif len(self.char_matrix.state_alphabets) == 0:
            raise ValueError("No state alphabets defined")
        elif len(self.char_matrix.state_alphabets) > 1:
            raise NotImplementedError("Mixed state-alphabet matrices not supported")
        else:
            # Single alphabet without a default: take its symbol map, not
            # the alphabet object itself.
            self.symbol_state_map = self.char_matrix.state_alphabets[0].symbol_state_map()

        self.stream = stream
        lines = fileutils.get_lines(self.stream)
        if len(lines) == 0:
            raise error.DataSourceError("No data in source", stream=self.stream)
        elif len(lines) < 2:
            # Need the description line plus at least one line of data.
            raise error.DataParseError("Expecting at least 2 lines in PHYLIP format data source", stream=self.stream)

        desc_line = lines[0]
        lines = lines[1:]
        m = re.match(r"\s*(\d+)\s+(\d+)\s*$", desc_line)
        if m is None:
            raise self._data_parse_error("Invalid data description line: '%s'" % desc_line)
        self.ntax = int(m.group(1))
        self.nchar = int(m.group(2))
        if self.ntax == 0 or self.nchar == 0:
            raise error.DataSourceError("No data in source", stream=self.stream)
        if self.interleaved:
            self._parse_interleaved(lines)
        else:
            self._parse_sequential(lines)
        self.stream = None
        return self.dataset
Пример #4
0
    def read(self, stream):
        """
        Main file parsing driver: read FASTA records from ``stream``.

        A line starting with ``>`` opens a new record whose label is the rest
        of that line; the non-blank lines that follow carry its symbols.
        With ``self.simple_rows`` set, every row is stored in the matrix as a
        plain string of validated symbols; otherwise it is stored as a
        ``CharacterDataVector`` of ``CharacterDataCell`` objects obtained
        through ``self.symbol_state_map``.

        Returns ``self.dataset``. When ``self.exclude_chars`` is set, the
        stream is not read and the dataset is returned unchanged.

        Raises:
            ValueError: the matrix has no state alphabet at all.
            NotImplementedError: several state alphabets and no default one.
            DataParseError: a repeated sequence name, a name that directly
                follows another name with no sequence in between, sequence
                data before the first name, or an unrecognized symbol.
        """

        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
        # Standard matrices come without an alphabet; fall back to the digits.
        if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
                and len(self.char_matrix.state_alphabets) == 0:
            self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = \
                self.char_matrix.state_alphabets[0]

        # Pick the one alphabet that drives both the symbol map and the
        # legal-symbol check, so the two can never disagree.
        state_alphabet = self.char_matrix.default_state_alphabet
        if state_alphabet is None:
            if len(self.char_matrix.state_alphabets) == 0:
                raise ValueError("No state alphabets defined")
            if len(self.char_matrix.state_alphabets) > 1:
                raise NotImplementedError(
                    "Mixed state-alphabet matrices not supported")
            state_alphabet = self.char_matrix.state_alphabets[0]
        self.symbol_state_map = state_alphabet.symbol_state_map()

        simple_rows = self.simple_rows
        legal_chars = None
        if simple_rows:
            legal_chars = state_alphabet.get_legal_symbols_as_str()

        curr_vec = None
        curr_taxon = None

        for line_index, line in enumerate(stream):
            s = line.strip()
            if not s:
                continue
            row = line_index + 1
            if s.startswith('>'):
                # Simple rows are buffered as a list; flush the finished one.
                if simple_rows and curr_taxon and curr_vec:
                    self.char_matrix[curr_taxon] = "".join(curr_vec)
                name = s[1:].strip()
                curr_taxon = taxon_set.require_taxon(label=name)
                if curr_taxon in self.char_matrix:
                    raise DataParseError(
                        message="Fasta error: Repeated sequence name (%s) found"
                        % name,
                        row=row,
                        stream=stream)
                if curr_vec is not None and len(curr_vec) == 0:
                    raise DataParseError(
                        message=
                        "Fasta error: Expected sequence, but found another sequence name (%s)"
                        % name,
                        row=row,
                        stream=stream)
                if simple_rows:
                    curr_vec = []
                else:
                    curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                    self.char_matrix[curr_taxon] = curr_vec
                continue

            if curr_vec is None:
                raise DataParseError(
                    message=
                    "Fasta error: Expecting a lines starting with > before sequences",
                    row=row,
                    stream=stream)

            for col_ind, c in enumerate(s):
                # Whitespace inside a sequence line is skipped, not stored.
                c = c.strip()
                if not c:
                    continue
                if simple_rows:
                    if c not in legal_chars:
                        raise DataParseError(
                            message='Unrecognized sequence symbol "%s"' % c,
                            row=row,
                            column=col_ind + 1,
                            stream=stream)
                    curr_vec.append(c)
                else:
                    # Only the lookup is guarded, so failures while building
                    # the cell are not misreported as bad symbols.
                    try:
                        state = self.symbol_state_map[c]
                    except KeyError:
                        raise DataParseError(
                            message='Unrecognized sequence symbol "%s"' % c,
                            row=row,
                            column=col_ind + 1,
                            stream=stream)
                    curr_vec.append(dataobject.CharacterDataCell(value=state))

        # Flush the last buffered simple row.
        if simple_rows and curr_taxon and curr_vec:
            self.char_matrix[curr_taxon] = "".join(curr_vec)
        return self.dataset
Пример #5
0
    def read(self, stream):
        """
        Main file parsing driver: read FASTA records from ``stream``.

        A line starting with ``>`` opens a new record whose label is the rest
        of that line; the non-blank lines after it carry the record's symbols.
        With ``self.simple_rows`` set, rows are stored in the matrix as plain
        strings of validated symbols; otherwise each row is a
        ``CharacterDataVector`` of ``CharacterDataCell`` objects looked up in
        ``self.symbol_state_map``.

        Returns ``self.dataset``. When ``self.exclude_chars`` is set, the
        stream is not read and the dataset is returned unchanged.

        Raises:
            ValueError: the matrix has no state alphabet at all.
            NotImplementedError: several state alphabets and no default one.
            DataParseError: a repeated sequence name, a name directly after
                another name with no sequence in between, sequence data
                before the first name, or an unrecognized symbol.
        """

        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(char_matrix_type=self.char_matrix_type,
                taxon_set=taxon_set)
        # Standard matrices come without an alphabet; fall back to the digits.
        if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
                and len(self.char_matrix.state_alphabets) == 0:
            self.char_matrix.state_alphabets.append(dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]

        # One alphabet feeds both the symbol map and the legal-symbol check.
        state_alphabet = self.char_matrix.default_state_alphabet
        if state_alphabet is None:
            if len(self.char_matrix.state_alphabets) == 0:
                raise ValueError("No state alphabets defined")
            if len(self.char_matrix.state_alphabets) > 1:
                raise NotImplementedError("Mixed state-alphabet matrices not supported")
            state_alphabet = self.char_matrix.state_alphabets[0]
        self.symbol_state_map = state_alphabet.symbol_state_map()

        simple_rows = self.simple_rows
        legal_chars = None
        if simple_rows:
            legal_chars = state_alphabet.get_legal_symbols_as_str()

        curr_vec = None
        curr_taxon = None

        for line_index, line in enumerate(stream):
            s = line.strip()
            if not s:
                continue
            row = line_index + 1
            if s.startswith('>'):
                # Simple rows are buffered as a list; flush the finished one.
                if simple_rows and curr_taxon and curr_vec:
                    self.char_matrix[curr_taxon] = "".join(curr_vec)
                name = s[1:].strip()
                curr_taxon = taxon_set.require_taxon(label=name)
                if curr_taxon in self.char_matrix:
                    raise DataParseError(message="Fasta error: Repeated sequence name (%s) found" % name, row=row, stream=stream)
                if curr_vec is not None and len(curr_vec) == 0:
                    raise DataParseError(message="Fasta error: Expected sequence, but found another sequence name (%s)" % name, row=row, stream=stream)
                if simple_rows:
                    curr_vec = []
                else:
                    curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                    self.char_matrix[curr_taxon] = curr_vec
                continue

            if curr_vec is None:
                raise DataParseError(message="Fasta error: Expecting a lines starting with > before sequences", row=row, stream=stream)

            for col_ind, c in enumerate(s):
                # Whitespace inside a sequence line is skipped, not stored.
                c = c.strip()
                if not c:
                    continue
                if simple_rows:
                    if c not in legal_chars:
                        raise DataParseError(message='Unrecognized sequence symbol "%s"' % c, row=row, column=col_ind + 1, stream=stream)
                    curr_vec.append(c)
                else:
                    # Guard only the lookup so that errors raised while
                    # building the cell are not reported as bad symbols.
                    try:
                        state = self.symbol_state_map[c]
                    except KeyError:
                        raise DataParseError(message='Unrecognized sequence symbol "%s"' % c, row=row, column=col_ind + 1, stream=stream)
                    curr_vec.append(dataobject.CharacterDataCell(value=state))

        # Flush the last buffered simple row.
        if simple_rows and curr_taxon and curr_vec:
            self.char_matrix[curr_taxon] = "".join(curr_vec)
        return self.dataset
 def _build_state_alphabet(self, char_block, symbols):
     """
     Give ``char_block`` one state alphabet derived from ``symbols``.

     ``dataobject.get_state_alphabet_from_symbols`` builds the alphabet,
     with ``self.gap_char`` as gap symbol and ``self.missing_char`` as
     missing-data symbol. The new alphabet becomes the block's only alphabet
     and also its default one.
     """
     alphabet_options = {
         "gap_symbol": self.gap_char,
         "missing_symbol": self.missing_char,
     }
     new_alphabet = dataobject.get_state_alphabet_from_symbols(
         symbols, **alphabet_options)
     char_block.state_alphabets = [new_alphabet]
     # Read the default back from the block instead of reusing the local.
     char_block.default_state_alphabet = char_block.state_alphabets[0]
        def _ncl_characters_block_to_native(self, taxa_block, ncl_cb, ncl_nxs_reader_handle):
            """
            Build a native character matrix from the NCL characters block
            ``ncl_cb``.

            The matrix class is chosen by ``_ncl_datatype_enum_to_dendropy``
            from the block's datatype. The new matrix is attached to
            ``taxa_block`` and receives one ``CharacterDataVector`` per taxon;
            the vectors are left empty when ``self.exclude_chars`` is set.

            Returns the new matrix, or ``None`` (after logging a warning)
            when the NCL block is of mixed datatype.

            Raises ``ValueError`` when an NCL multi-state code has no state
            in the matrix alphabet with exactly the same member states.
            """
            raw_matrix = ncl_cb.GetRawDiscreteMatrixRef()
            if ncl_cb.IsMixedType():
                _LOG.warn("Mixed datatype character blocks are not supported in Dendropy.  Skipping...")
                return None
            matrix_class = _ncl_datatype_enum_to_dendropy(ncl_cb.GetDataType())
            mapper = ncl_cb.GetDatatypeMapperForCharRef(0)
            symbols = mapper.GetSymbols()
            state_codes_mapping = mapper.GetPythonicStateVectors()

            char_block = matrix_class()
            char_block.taxon_set = taxa_block
            if isinstance(char_block, dataobject.StandardCharacterMatrix):
                # Standard matrices get an alphabet built from NCL's symbols.
                char_block.state_alphabets = [
                    dataobject.get_state_alphabet_from_symbols(
                        symbols=symbols, gap_symbol='-', missing_symbol='?')
                ]
                char_block.default_state_alphabet = char_block.state_alphabets[0]
            symbol_state_map = char_block.default_state_alphabet.symbol_state_map()

            # Lookup table indexed by NCL numeric state code. It is filled in
            # three steps: one entry per symbol, then one per multi-state
            # code, and finally the gap and missing states.
            code_to_state = [symbol_state_map[sym] for sym in symbols]
            for member_codes in state_codes_mapping[len(symbols):-2]:
                wanted = set(code_to_state[code] for code in member_codes)
                for _sym, candidate in symbol_state_map.iteritems():
                    members = candidate.member_states
                    if members and set(members) == wanted:
                        code_to_state.append(candidate)
                        break
                else:
                    raise ValueError("NCL datatype cannot be coerced into datatype because ambiguity code for %s is missing " % str(wanted))
            code_to_state.append(symbol_state_map['-'])
            code_to_state.append(symbol_state_map['?'])

            assert (len(raw_matrix) == len(taxa_block))
            for row_ind, taxon in enumerate(taxa_block):
                vec = dataobject.CharacterDataVector(taxon=taxon)
                raw_row = raw_matrix[row_ind]
                char_block[taxon] = vec
                if self.exclude_chars:
                    continue
                for code in raw_row:
                    vec.append(dataobject.CharacterDataCell(value=code_to_state[code]))

            # Both switches are hard-wired off, so the diagnostic output
            # below never runs.
            supporting_exsets = False
            supporting_charset_exsets = False

            if supporting_exsets:
                excluded = ncl_cb.GetExcludedIndexSet()
                print(" ".join(["Excluded chars =", str(nclwrapper.NxsSetReader.GetSetAsVector(excluded))]))
            if supporting_charset_exsets:
                _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
                nab = ncl_nxs_reader_handle.GetNumAssumptionsBlocks(ncl_cb)
                for k in xrange(nab):
                    _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
                    assumptions = ncl_nxs_reader_handle.GetAssumptionsBlock(ncl_cb, k)
                    cs = assumptions.GetCharSetNames()
                    print(" ".join(["CharSets have the names ", str(cs)]))
            return char_block
Пример #8
0
        def _ncl_characters_block_to_native(self, taxa_block, ncl_cb,
                                            ncl_nxs_reader_handle):
            """
            Convert the NCL characters block ``ncl_cb`` into a native
            character matrix.

            ``_ncl_datatype_enum_to_dendropy`` picks the matrix class from the
            block's datatype. The matrix is bound to ``taxa_block`` and gets
            one ``CharacterDataVector`` per taxon; with ``self.exclude_chars``
            set the vectors stay empty.

            Returns the matrix, or ``None`` (after a logged warning) if the
            NCL block is of mixed datatype.

            Raises ``ValueError`` if an NCL multi-state code has no state in
            the matrix alphabet with exactly the same member states.
            """
            raw_matrix = ncl_cb.GetRawDiscreteMatrixRef()
            if ncl_cb.IsMixedType():
                _LOG.warn(
                    "Mixed datatype character blocks are not supported in Dendropy.  Skipping..."
                )
                return None
            datatype = ncl_cb.GetDataType()
            matrix_class = _ncl_datatype_enum_to_dendropy(datatype)
            mapper = ncl_cb.GetDatatypeMapperForCharRef(0)
            symbols = mapper.GetSymbols()
            state_codes_mapping = mapper.GetPythonicStateVectors()

            char_block = matrix_class()
            char_block.taxon_set = taxa_block
            if isinstance(char_block, dataobject.StandardCharacterMatrix):
                # Standard matrices get an alphabet built from NCL's symbols.
                alphabet = dataobject.get_state_alphabet_from_symbols(
                    symbols=symbols, gap_symbol='-', missing_symbol='?')
                char_block.state_alphabets = [alphabet]
                char_block.default_state_alphabet = \
                    char_block.state_alphabets[0]
            default_alphabet = char_block.default_state_alphabet
            symbol_state_map = default_alphabet.symbol_state_map()

            # Table indexed by NCL numeric state code: symbols first, then
            # the multi-state codes, then the gap and missing states.
            code_to_state = [symbol_state_map[sym] for sym in symbols]
            for member_codes in state_codes_mapping[len(symbols):-2]:
                wanted = set(code_to_state[code] for code in member_codes)
                for _sym, candidate in symbol_state_map.iteritems():
                    members = candidate.member_states
                    if members and set(members) == wanted:
                        code_to_state.append(candidate)
                        break
                else:
                    raise ValueError(
                        "NCL datatype cannot be coerced into datatype because ambiguity code for %s is missing "
                        % str(wanted))
            code_to_state.append(symbol_state_map['-'])
            code_to_state.append(symbol_state_map['?'])

            assert (len(raw_matrix) == len(taxa_block))
            for row_ind, taxon in enumerate(taxa_block):
                vec = dataobject.CharacterDataVector(taxon=taxon)
                raw_row = raw_matrix[row_ind]
                char_block[taxon] = vec
                if self.exclude_chars:
                    continue
                for code in raw_row:
                    cell = dataobject.CharacterDataCell(
                        value=code_to_state[code])
                    vec.append(cell)

            # Both switches are hard-wired off, so the diagnostic output
            # below never runs.
            supporting_exsets = False
            supporting_charset_exsets = False

            if supporting_exsets:
                excluded = ncl_cb.GetExcludedIndexSet()
                print(" ".join([
                    "Excluded chars =",
                    str(nclwrapper.NxsSetReader.GetSetAsVector(excluded))
                ]))
            if supporting_charset_exsets:
                _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
                nab = ncl_nxs_reader_handle.GetNumAssumptionsBlocks(ncl_cb)
                for k in xrange(nab):
                    _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
                    assumptions = ncl_nxs_reader_handle.GetAssumptionsBlock(
                        ncl_cb, k)
                    cs = assumptions.GetCharSetNames()
                    print(" ".join(["CharSets have the names ", str(cs)]))
            return char_block