예제 #1
0
 def _read(self,
           stream,
           taxon_namespace_factory=None,
           tree_list_factory=None,
           char_matrix_factory=None,
           state_alphabet_factory=None,
           global_annotations_target=None):
     """
     Parse one PHYLIP-format character matrix from ``stream``.

     The first line of the source must consist of two whitespace-separated
     integers, which are stored as ``self.ntax`` and ``self.nchar``. The
     remaining lines are passed to ``self._parse_interleaved`` or
     ``self._parse_sequential`` depending on ``self.interleaved``.

     Arguments:
         stream: data source handed to ``filesys.get_lines``.
         taxon_namespace_factory: callable invoked as ``factory(label=None)``
             to create the taxon namespace for the matrix.
         tree_list_factory: accepted for interface compatibility; not used
             in this method.
         char_matrix_factory: callable invoked with the data type name plus
             ``label`` and ``taxon_namespace`` keywords (and
             ``default_state_alphabet`` when one is configured for
             "standard" data) to create the character matrix.
         state_alphabet_factory: callable used to build a "0123456789"
             alphabet for "standard" data when no default state alphabet
             is configured.
         global_annotations_target: accepted for interface compatibility;
             not used in this method.

     Returns:
         ``self.Product`` instance whose ``char_matrices`` is a one-element
         list holding the parsed matrix.

     Raises:
         TypeError: if ``self.data_type`` is None.
         error.DataParseError: if the source is empty or too short, or if
             it declares zero taxa or zero characters.
         The error built by ``self._data_parse_error`` if the description
         line is malformed.
     """
     self.reset()
     self.stream = stream
     self.taxon_namespace = taxon_namespace_factory(label=None)
     if self.data_type is None:
         raise TypeError("Data type must be specified for this schema")
     if self.data_type == "standard" and self.default_state_alphabet is not None:
         self.char_matrix = char_matrix_factory(
             self.data_type,
             label=None,
             taxon_namespace=self.taxon_namespace,
             default_state_alphabet=self.default_state_alphabet,
         )
     else:
         self.char_matrix = char_matrix_factory(
             self.data_type,
             label=None,
             taxon_namespace=self.taxon_namespace)
         if self.data_type == "standard":
             # No alphabet was supplied for "standard" data: fall back to
             # the ten digit symbols.
             state_alphabet = state_alphabet_factory(
                 fundamental_states="0123456789",
                 no_data_symbol="?",
                 gap_symbol="-",
                 case_sensitive=False)
             self.char_matrix.state_alphabets.append(state_alphabet)
     lines = filesys.get_lines(stream)
     if not lines:
         raise error.DataParseError("No data in source", stream=self.stream)
     # NOTE(review): this rejects a source of exactly two lines although the
     # message says "at least 2 lines"; threshold kept as-is because the
     # line-splitting behavior of filesys.get_lines is not visible here.
     elif len(lines) <= 2:
         raise error.DataParseError(
             "Expecting at least 2 lines in PHYLIP format data source",
             stream=self.stream)
     desc_line = lines[0]
     lines = lines[1:]
     # Raw string: '\s' and '\d' are invalid escape sequences in a plain
     # string literal and trigger a warning on current Python versions.
     m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
     if m is None:
         raise self._data_parse_error(
             "Invalid data description line: '%s'" % desc_line)
     self.ntax = int(m.group(1))
     self.nchar = int(m.group(2))
     if self.ntax == 0 or self.nchar == 0:
         raise error.DataParseError("No data in source", stream=self.stream)
     if self.interleaved:
         self._parse_interleaved(lines)
     else:
         self._parse_sequential(lines)
     # The number of taxa actually read must match the declared count.
     if len(self.taxa_processed) != self.ntax:
         self._taxon_error(num_expected=self.ntax,
                           found=self.taxa_processed)
     product = self.Product(taxon_namespaces=None,
                            tree_lists=None,
                            char_matrices=[self.char_matrix])
     return product
예제 #2
0
 def _read(self,
           stream,
           taxon_namespace_factory=None,
           tree_list_factory=None,
           char_matrix_factory=None,
           state_alphabet_factory=None,
           global_annotations_target=None):
     """
     Read every PHYLIP data block found in ``stream``.

     The whole stream is read into memory and cut at each position matched
     by ``MultiPhylipReader.data_block_start_pattern`` (group 1). Each
     slice is wrapped in a ``StringIO`` and parsed by
     ``self._phylip_reader._read`` with the same factory arguments that
     were passed to this method.

     Returns a ``self.Product`` whose ``char_matrices`` collects the
     matrices of all blocks in source order. Raises
     ``error.DataParseError`` if no block start is found.
     """
     text = stream.read()
     block_starts = [
         found.start(1)
         for found in MultiPhylipReader.data_block_start_pattern.finditer(text)
     ]
     if len(block_starts) == 0:
         raise error.DataParseError("No PHYLIP data blocks found in source",
                                    stream=stream)
     # Each block ends where the next one starts; the last one runs to the
     # end of the text.
     block_ends = block_starts[1:] + [len(text)]
     collected = []
     for begin, finish in zip(block_starts, block_ends):
         block_stream = StringIO(text[begin:finish])
         parsed = self._phylip_reader._read(
             stream=block_stream,
             taxon_namespace_factory=taxon_namespace_factory,
             tree_list_factory=tree_list_factory,
             char_matrix_factory=char_matrix_factory,
             state_alphabet_factory=state_alphabet_factory,
             global_annotations_target=global_annotations_target,
         )
         collected.extend(parsed.char_matrices)
     return self.Product(taxon_namespaces=None,
                         tree_lists=None,
                         char_matrices=collected)
예제 #3
0
    def read(self, stream):
        """
        Read PHYLIP-format character data from ``stream`` into a new
        character matrix of ``self.dataset`` and return the dataset.

        If ``self.exclude_chars`` is set, nothing is read and the current
        ``self.dataset`` is returned unchanged. Otherwise a dataset is
        created when none exists, a character matrix of
        ``self.char_matrix_type`` is added to it, and
        ``self.symbol_state_map`` is set up from the matrix's state
        alphabet before the data lines are parsed.

        Raises:
            ValueError: if the matrix has no state alphabets.
            NotImplementedError: if the matrix has more than one state
                alphabet and no default.
            error.DataSourceError: if the source is empty or declares zero
                taxa or characters.
            error.DataParseError: if the source is too short.
            The error built by ``self._data_parse_error`` if the
            description line is malformed.
        """
        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
        if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
            # Standard matrices with no alphabet get a digit alphabet.
            self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[
                0]
        if self.char_matrix.default_state_alphabet is not None:
            self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map(
            )
        elif len(self.char_matrix.state_alphabets) == 0:
            raise ValueError("No state alphabets defined")
        elif len(self.char_matrix.state_alphabets) > 1:
            raise NotImplementedError(
                "Mixed state-alphabet matrices not supported")
        else:
            # Store the symbol->state mapping, as the default-alphabet
            # branch above does, rather than the alphabet object itself.
            self.symbol_state_map = self.char_matrix.state_alphabets[
                0].symbol_state_map()

        self.stream = stream
        lines = fileutils.get_lines(self.stream)
        if not lines:
            raise error.DataSourceError("No data in source",
                                        stream=self.stream)
        # NOTE(review): this rejects a source of exactly two lines although
        # the message says "at least 2 lines"; threshold kept as-is because
        # the behavior of fileutils.get_lines is not visible here.
        elif len(lines) <= 2:
            raise error.DataParseError(
                "Expecting at least 2 lines in PHYLIP format data source",
                stream=self.stream)

        desc_line = lines[0]
        lines = lines[1:]
        # Raw string: '\s' and '\d' are invalid escape sequences in a plain
        # string literal and trigger a warning on current Python versions.
        m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
        if m is None:
            raise self._data_parse_error(
                "Invalid data description line: '%s'" % desc_line)
        self.ntax = int(m.group(1))
        self.nchar = int(m.group(2))
        if self.ntax == 0 or self.nchar == 0:
            raise error.DataSourceError("No data in source",
                                        stream=self.stream)
        if self.interleaved:
            self._parse_interleaved(lines)
        else:
            self._parse_sequential(lines)
        # Drop the stream reference once parsing has succeeded.
        self.stream = None
        return self.dataset
예제 #4
0
 def _taxon_error(self, num_expected, found):
     """
     Raise ``error.DataParseError`` describing a taxon-count mismatch.

     ``num_expected`` is the declared number of taxa and ``found`` is the
     sized collection of taxa actually read; each element is rendered with
     ``"{}".format`` in the message. The word "only" is inserted when
     fewer taxa were found than expected. This method always raises.
     """
     num_found = len(found)
     expected_noun = "taxon" if num_expected == 1 else "taxa"
     found_noun = "taxon" if num_found == 1 else "taxa"
     qualifier = "only " if num_expected > num_found else ""
     listing = ", ".join("{}".format(item) for item in found)
     message = "{} {} expected but {}{} {} found: {}".format(
         num_expected, expected_noun, qualifier, num_found, found_noun,
         listing)
     raise error.DataParseError(message)
예제 #5
0
    def parse_char_matrix(self, nxchars):
        """
        Given an XmlElement representing a nexml characters block, this
        instantiates a corresponding DendroPy CharacterMatrix object through
        ``self._char_matrix_factory`` and fills it with one sequence per
        row element of the block.

        The per-block lookup tables (``_id_state_alphabet_map``,
        ``_id_state_map``, ``_id_chartype_map``, ``_char_types`` and
        ``_chartype_id_to_pos_map``) are reset at the start of every call.

        Raises:
            Exception: if the block names no taxon namespace, names one
                that is not registered in ``self._id_taxon_namespace_map``,
                or has an unsupported character type.
            error.DataParseError: if a row references an undefined taxon,
                a sequence position or cell has no character column/type
                defined, or a sequence symbol is missing from the column's
                state alphabet.

        NOTE(review): no ``return`` statement is visible in this method;
        confirm how callers obtain the populated matrix.
        """

        # clear
        self._id_state_alphabet_map = {}
        self._id_state_map = {}
        self._id_chartype_map = {}
        self._char_types = []
        self._chartype_id_to_pos_map = {}

        # initialize
        label = nxchars.get('label', None)
        char_matrix_oid = nxchars.get('oid', '')

        # set up taxa
        # The messages below use ``label``: ``char_matrix`` does not exist
        # yet at this point, so reading ``char_matrix.label`` would raise
        # UnboundLocalError instead of the intended error.
        otus_id = nxchars.get('otus', None)
        if otus_id is None:
            raise Exception(
                "Character Block %s (\"%s\"): Taxon namespace not specified" %
                (char_matrix_oid, label))
        taxon_namespace = self._id_taxon_namespace_map.get(otus_id, None)
        if not taxon_namespace:
            raise Exception(
                "Character Block %s (\"%s\"): Specified taxon namespace not found"
                % (char_matrix_oid, label))

        # character matrix instantiation
        nxchartype = nxchars.parse_type()
        extra_kwargs = {}
        if nxchartype.startswith('Dna'):
            data_type = "dna"
        elif nxchartype.startswith('Rna'):
            data_type = "rna"
        elif nxchartype.startswith('Protein'):
            data_type = "protein"
        elif nxchartype.startswith('Restriction'):
            data_type = "restriction"
        elif nxchartype.startswith('Standard'):
            data_type = "standard"
            extra_kwargs["default_state_alphabet"] = None
        elif nxchartype.startswith('Continuous'):
            data_type = "continuous"
        else:
            raise Exception(
                "Character Block %s (\"%s\"): Character type '%s' not supported"
                % (char_matrix_oid, label, nxchartype))
        char_matrix = self._char_matrix_factory(
            data_type,
            taxon_namespace=taxon_namespace,
            label=label,
            **extra_kwargs)

        # annotation processing
        annotations = [i for i in nxchars.findall_annotations()]
        for annotation in annotations:
            self._parse_annotations(char_matrix, annotation)

        # get state mappings
        nxformat = nxchars.find_char_format()
        if nxformat is not None:
            self.parse_characters_format(nxformat, data_type, char_matrix)
        elif data_type == "standard":
            self.create_standard_character_alphabet(char_matrix)

        nxmatrix = nxchars.find_char_matrix()
        annotations = [i for i in nxmatrix.findall_annotations()]
        for annotation in annotations:
            self._parse_annotations(char_matrix.taxon_seq_map, annotation)

        # The type name decides whether rows are whole sequences or
        # individual cells; it does not change between rows.
        rows_are_seqs = nxchartype.endswith('Seqs')

        for nxrow in nxmatrix.findall_char_row():
            row_id = nxrow.get('id', None)
            taxon_id = nxrow.get('otu', None)
            try:
                taxon = self._id_taxon_map[(otus_id, taxon_id)]
            except KeyError:
                # The locally read ``char_matrix_oid`` is used in all row
                # messages, matching the cell-level messages further down.
                raise error.DataParseError(
                    message=
                    'Character Block %s (\"%s\"): Taxon with id "%s" not defined in taxa block "%s"'
                    % (char_matrix_oid, char_matrix.label, taxon_id, otus_id))

            character_vector = char_matrix.new_sequence(taxon=taxon)
            annotations = [i for i in nxrow.findall_annotations()]
            for annotation in annotations:
                self._parse_annotations(character_vector, annotation)

            if data_type == "continuous":
                if rows_are_seqs:
                    seq = nxrow.find_char_seq()
                    if seq is not None:
                        # split() with no argument separates on any run of
                        # whitespace (spaces, tabs, line breaks).
                        for col_idx, char in enumerate(seq.split()):
                            if len(self._char_types) <= col_idx:
                                raise error.DataParseError(
                                    message="Character column/type ('<char>') not defined for character in position"
                                    " %d (matrix = '%s' row='%s', taxon='%s')"
                                    % (col_idx + 1, char_matrix_oid, row_id, taxon.label))
                            character_vector.append(
                                character_value=float(char),
                                character_type=self._char_types[col_idx])
                else:
                    for nxcell in nxrow.findall_char_cell():
                        chartype_id = nxcell.get('char', None)
                        if chartype_id is None:
                            raise error.DataParseError(
                                message="'char' attribute missing for cell: cell markup must indicate character column type for character"
                                " (matrix = '%s' row='%s', taxon='%s')"
                                % (char_matrix_oid, row_id, taxon.label))
                        if chartype_id not in self._id_chartype_map:
                            raise error.DataParseError(
                                message="Character type ('<char>') with id '%s' referenced but not found for character"
                                " (matrix = '%s' row='%s', taxon='%s')"
                                % (chartype_id, char_matrix_oid, row_id, taxon.label))
                        chartype = self._id_chartype_map[chartype_id]
                        character_vector.append(
                            character_value=float(nxcell.get('state')),
                            character_type=chartype)
            else:
                if rows_are_seqs:
                    seq = nxrow.find_char_seq()
                    if seq is not None:
                        # Discrete sequences hold one symbol per column, so
                        # spaces and line breaks are dropped.
                        seq = seq.replace(' ',
                                          '').replace('\n',
                                                      '').replace('\r', '')
                        for col_idx, char in enumerate(seq):
                            state_alphabet = char_matrix.character_types[
                                col_idx].state_alphabet
                            try:
                                state = state_alphabet[char]
                            except KeyError:
                                raise error.DataParseError(
                                    message="Character Block row '%s', character position %s: State with symbol '%s' in sequence '%s' not defined"
                                    % (row_id, col_idx, char, seq))
                            if len(self._char_types) <= col_idx:
                                raise error.DataParseError(
                                    message="Character column/type ('<char>') not defined for character in position"
                                    " %d (row='%s', taxon='%s')"
                                    % (col_idx + 1, row_id, taxon.label))
                            character_type = self._char_types[col_idx]
                            character_vector.append(
                                character_value=state,
                                character_type=character_type)
                else:
                    for nxcell in nxrow.findall_char_cell():
                        chartype_id = nxcell.get('char', None)
                        if chartype_id is None:
                            raise error.DataParseError(
                                message="'char' attribute missing for cell: cell markup must indicate character column type for character"
                                " (matrix = '%s' row='%s', taxon='%s')"
                                % (char_matrix_oid, row_id, taxon.label))
                        if chartype_id not in self._id_chartype_map:
                            raise error.DataParseError(
                                message="Character type ('<char>') with id '%s' referenced but not found for character"
                                " (matrix = '%s' row='%s', taxon='%s')"
                                % (chartype_id, char_matrix_oid, row_id, taxon.label))
                        chartype = self._id_chartype_map[chartype_id]
                        state_alphabet = chartype.state_alphabet
                        pos_idx = self._chartype_id_to_pos_map[chartype_id]
                        # States are keyed by (alphabet, state id) pairs.
                        state = self._id_state_map[(state_alphabet,
                                                    nxcell.get('state', None))]
                        character_vector.set_at(pos_idx,
                                                character_value=state,
                                                character_type=chartype)

            char_matrix[taxon] = character_vector