Пример #1
0
    def __init__(self, hit, gene_ref, gene_status):
        """
        Bind a profile match to the model gene it is attached to, together with a status.

        :param hit: a match between a hmm profile and a replicon
        :type hit: :class:`macsypy.hit.CoreHit` object
        :param gene_ref: The ModelGene linked to this hit.
                         The ModelGene has the same name as the CoreGene,
                         but one hit can be linked to several ModelGene (several Models).
                         To know for which gene this hit plays a role use
                         :meth:`macsypy.gene.ModelGene.alternate_of` ::

                            hit.gene_ref.alternate_of()

        :type gene_ref: :class:`macsypy.gene.ModelGene` object
        :param gene_status: the status stored on the hit (kept as given, not validated here)
        :type gene_status: :class:`macsypy.gene.GeneStatus` object
        :raise MacsypyError: if *hit* is not a CoreHit or *gene_ref* is not a ModelGene
        """
        owner = self.__class__.__name__

        if isinstance(hit, CoreHit):
            self._hit = hit
        else:
            raise MacsypyError(
                f"The {owner} 'hit' argument must be a CoreHit not {type(hit)}."
            )

        if isinstance(gene_ref, ModelGene):
            self.gene_ref = gene_ref
        else:
            raise MacsypyError(
                f"The {owner} 'gene_ref' argument must be a ModelGene "
                f"not {type(gene_ref)}.")

        self.status = gene_status
Пример #2
0
    def __init__(self, path=None, profile_dir=None, def_dir=None, profile_suffix='.hmm', relative_path=False):
        """
        Locate the profiles and the definitions of a model family.

        Either *path* (an installed model family) or the pair *profile_dir* / *def_dir* must be given.

        :param str path: if it's an installed model, path is the absolute path to a model family.
                     otherwise path is None, and profile_dir and def_dir must be specified.
        :param str profile_dir: the absolute path to the directory which contains the hmm profiles files.
        :param str def_dir: The absolute path to the directory which contains the models definitions (xml files)
                            or submodels.
        :param str profile_suffix: the suffix of hmm files
        :param bool relative_path: True if you want to work with relative path, False to work with absolute path.
        :raise: MacsypyError if path is set and profile_dir or def_dir is set
        :raise: MacsypyError if profile_dir is set but not def_dir and vice versa
        """
        # --- arguments consistency ---
        if path:
            if profile_dir or def_dir:
                clashing = 'profile_dir' if profile_dir else 'def_dir'
                raise MacsypyError(f"'path' and '{clashing}' are incompatible arguments")
        elif not (profile_dir and def_dir):
            raise MacsypyError("if 'profile_dir' is specified 'def_dir' must be specified_too and vice versa")

        self.path = path
        # the family is named after the installed directory, or after the definitions directory
        self.name = os.path.basename(def_dir if path is None else path)

        # --- profiles ---
        if not profile_dir:
            profile_dir = os.path.join(path, 'profiles')
        self._profiles = self._scan_profiles(profile_dir,
                                             profile_suffix=profile_suffix,
                                             relative_path=relative_path)

        # --- definitions ---
        self._definitions = {}
        if def_dir:
            # explicit definitions directory: every xml file directly inside it is a model
            import glob
            for xml_path in glob.glob(os.path.join(def_dir, '*.xml')):
                model_name = os.path.basename(os.path.splitext(xml_path)[0])
                if not relative_path:
                    xml_path = os.path.abspath(xml_path)
                location = DefinitionLocation(name=model_name,
                                              path=xml_path)
                self._definitions[location.name] = location
        else:
            # installed family: scan each entry of <path>/definitions
            def_dir = os.path.join(self.path, 'definitions')
            for entry in os.listdir(def_dir):
                location = self._scan_definitions(def_path=os.path.join(def_dir, entry))
                if not location:
                    # _scan_definitions can return None if a dir is empty
                    continue
                # prefix the fully qualified names with the family name
                location.fqn = f"{self.name}{_separator}{location.fqn}"
                if location.subdefinitions:
                    for sub_location in location.subdefinitions.values():
                        sub_location.fqn = f"{self.name}{_separator}{sub_location.fqn}"
                self._definitions[location.name] = location
Пример #3
0
    def _build_my_indexes(self, index_dir):
        """
        Write the macsyfinder index of the sequence file into *index_dir*.

        Layout of the generated ``<name>.idx`` file:
         - the first line is the path of the indexed sequence file
         - then one entry per sequence:
           sequence id, sequence length and sequence rank (starting at 1),
           joined with ``self._field_separator``

        :param index_dir: the directory where the index file is created
        :return: the path of the index file
        :raise MacsypyError: if anything goes wrong while reading the sequences or writing the index
        """
        index_file = os.path.join(index_dir, self.name + ".idx")
        try:
            with open(self._fasta_path, 'r') as fasta_file, open(index_file, 'w') as my_base:
                my_base.write(self._fasta_path + '\n')
                sep = self._field_separator
                # the rank is the 1-based position of the sequence in the file
                for rank, (seq_id, _comment, length) in enumerate(fasta_iter(fasta_file), start=1):
                    my_base.write(f"{seq_id}{sep}{length:d}{sep}{rank:d}\n")
                my_base.flush()
        except Exception as err:
            msg = f"unable to index the sequence dataset: {self.cfg.sequence_db()} : {err}"
            _log.critical(msg, exc_info=True)
            raise MacsypyError(msg) from err
        return index_file
Пример #4
0
    def __init__(self, hit, gene_ref=None, gene_status=None, counterpart=None):
        """
        Build a hit whose gene_ref must be tagged as ``multi_system``.

        :param hit: a match between a hmm profile and a replicon
        :type hit: :class:`macsypy.hit.CoreHit` object
        :param gene_ref: The ModelGene linked to this hit.
                         The ModelGene has the same name as the CoreGene,
                         but one hit can be linked to several ModelGene (several Models).
                         To know for which gene this hit plays a role use
                         :meth:`macsypy.gene.ModelGene.alternate_of` ::

                            hit.gene_ref.alternate_of()

        :type gene_ref: :class:`macsypy.gene.ModelGene` object
        :param gene_status:
        :type gene_status: :class:`macsypy.gene.GeneStatus` object
        :param counterpart: the other occurrences of the gene or exchangeable in the replicon
        :type counterpart: list of :class:`macsypy.hit.CoreHit`
        :raise MacsypyError: if the gene_ref of the hit is not tagged as multi_system
        """
        super().__init__(hit,
                         gene_ref=gene_ref,
                         gene_status=gene_status,
                         counterpart=counterpart)

        if not self.gene_ref.multi_system:
            # Report the gene_ref actually stored on the instance: the 'gene_ref' argument
            # defaults to None, in which case 'gene_ref.name' would raise AttributeError
            # and hide the real error.
            # NOTE(review): presumably the parent constructor derives gene_ref from 'hit'
            # when the argument is omitted - confirm against the parent class.
            msg = f"{hit.id} cannot be a multi systems, gene_ref '{self.gene_ref.name}' not tag as multi_system"
            _log.critical(msg)
            raise MacsypyError(msg)
Пример #5
0
    def _parse_section(self, section_node, allowed_elements):
        """
        Convert the children of a configuration node into a dictionary.

        :param section_node: the node whose children hold the options (each child exposes ``tag`` and ``text``)
        :param allowed_elements: The elements allowed in this section.
                                 Only these elements are parsed and end up in the final dictionary,
                                 any other element is reported with a warning and skipped.
        :type allowed_elements: a dict with options name as keys and function to parse the element
        :return: the parsed options, with the element names as keys
        :rtype: dict
        :raise MacsypyError: if a parsing function raises TypeError or ValueError
        """
        section = {}
        for child in section_node:
            tag = child.tag
            if tag not in allowed_elements:
                _log.warning(
                    f"unknown element '{tag}' in '{self._path}' ignore it."
                )
                continue
            try:
                parsed = allowed_elements[tag](child.text)
            except (TypeError, ValueError) as err:
                msg = f"The model configuration file '{self._path}' cannot be parsed: {err}"
                _log.critical(msg)
                raise MacsypyError(msg) from None
            section[tag] = parsed
        return section
Пример #6
0
def _validator(cast_func, raw, default, sequence=False):
    """
    Convert a raw answer with *cast_func*, falling back on *default* for an empty answer.

    :param cast_func: the function used to convert the value (or each item of a list)
    :param raw: the value to convert; an empty string means "use the default"
    :param default: the value used when *raw* is an empty string
    :param bool sequence: if True a non empty *raw* is split on ',' and each stripped item is converted
    :return: the converted value, or the list of converted items when the value to convert is a list
    :raise MacsypyError: if *raw* is empty and *default* is None, or if *cast_func* raises ValueError
    """
    if raw == '':
        if default is None:
            raise MacsypyError('Please enter some value')
        raw = default
    elif sequence:
        raw = [item.strip() for item in raw.split(',')]

    try:
        if isinstance(raw, list):
            return [cast_func(item) for item in raw]
        return cast_func(raw)
    except ValueError as err:
        raise MacsypyError(f'Invalid value: {err}') from err
Пример #7
0
 def parse_cut_ga(value):
     """
     Convert the text of a 'cut_ga' element into a boolean.

     The comparison is case insensitive.

     :param str value: the text to parse, one of 'True', '1', 'False', '0'
     :return: True for 'true' or '1', False for 'false' or '0'
     :rtype: bool
     :raise MacsypyError: if *value* is none of the accepted spellings
     """
     # 'value' is text, so the numeric spellings must be compared as the strings '1' and '0':
     # a str never equals the int 1 or 0.
     normalized = value.lower()
     if normalized in ('true', '1'):
         return True
     if normalized in ('false', '0'):
         return False
     # NOTE(review): 'self' is not a parameter of this function; presumably it is captured
     # from the enclosing method - confirm.
     msg = f"cannot parse 'cut_ga' element in '{self._path}' expect True, 1, False, 0 got : '{value}'"
     _log.critical(msg)
     raise MacsypyError(msg)
Пример #8
0
    def _fill_gembase_min_max(self, topology, default_topology):
        """
        For each replicon_name of a gembase dataset, it fills the internal dictionary (``self._DB``)
        with a namedtuple RepliconInfo built from: the topology, the rank of the first sequence,
        the rank of the last sequence and the list of (seq_name, seq_length) of the replicon.

        :param topology: the topologies for each replicon
                         (parsed from the file specified with the option --topology-file)
        :type topology: dict
        :param default_topology: the topology provided by the config.replicon_topology,
                                 used for the replicons which are not in *topology*
        :type default_topology: string
        :raise MacsypyError: if no "last rank" is available for a replicon (see the notes in the body)
        """
        def grp_replicon(entry):
            """
            Key function used to group the index entries by replicon.

            In gembase the identifier of a fasta sequence follows the schema:
            <replicon-name>_<seq-name> with possibly '_' inside the <replicon-name>
            but not in the <seq-name>.
            So grp_replicon returns the <replicon-name> part of the first field of the entry,
            which allows grouping the sequences belonging to the same replicon.
            """
            return "_".join(entry[0].split('_')[:-1])

        def parse_seq_id(seq_id):
            """
            Parse a gembase sequence id (as found in the .idx).
            seq_id has the format <replicon-name>_<seq-name> with possibly '_' inside the <replicon-name>
            but not in the <seq-name>.

            :return: the tuple (replicon_name, seq_name)
            """
            *replicon_name, seq_name = seq_id.split('_')
            replicon_name = "_".join(replicon_name)
            return replicon_name, seq_name

        # keep only the group iterators, the grouping key is recomputed below with parse_seq_id
        # NOTE(review): assumes 'groupby' is itertools.groupby, which only groups *consecutive*
        # entries sharing the same key - confirm the index is ordered by replicon.
        replicons = (x[1] for x in groupby(self._idx, grp_replicon))
        for replicon in replicons:
            genes = []
            # the first entry of the group gives the replicon name and the lowest rank (_min)
            seq_id, seq_length, _min = next(replicon)

            replicon_name, seq_name = parse_seq_id(seq_id)
            genes.append((seq_name, seq_length))
            for seq_id, seq_length, rank in replicon:
                # pass all sequence of the replicon until the last one
                _, seq_name = parse_seq_id(seq_id)
                genes.append((seq_name, seq_length))
            # here seq_id / seq_length describe the last entry seen for this replicon
            _, seq_name = parse_seq_id(seq_id)
            try:
                # 'rank' is only bound by the loop above: reading it raises UnboundLocalError
                # when that loop has never run since the beginning of this method
                # (i.e. the very first replicon holds a single sequence).
                # NOTE(review): 'rank' is not reset between replicons, so for a later
                # single-sequence replicon it still holds the value left by a previous replicon
                # and no error is raised - confirm this is intended.
                _max = rank
            except UnboundLocalError:
                msg = f"Error during sequence-db '{self.cfg.sequence_db()}' parsing. Are you sure db-type is 'gembase'?"
                _log.critical(msg)
                raise MacsypyError(msg) from None
            # NOTE(review): the last sequence was already appended (by the loop above, or before it
            # for a single-sequence replicon), so this appends it a second time - confirm.
            genes.append((seq_name, seq_length))
            # a topology given explicitly for this replicon takes precedence over the default one
            if replicon_name in topology:
                self._DB[replicon_name] = RepliconInfo(topology[replicon_name],
                                                       _min, _max, genes)
            else:
                self._DB[replicon_name] = RepliconInfo(default_topology, _min,
                                                       _max, genes)
Пример #9
0
 def __init__(self, hit, gene_ref=None, gene_status=None, counterpart=None):
     """
     Build the hit either from a CoreHit (with its gene_ref and gene_status) or from a ModelHit.

     :param hit: the hit to wrap
     :type hit: :class:`macsypy.hit.CoreHit` or :class:`macsypy.hit.ModelHit` object
     :param gene_ref: the gene linked to the hit; required (truthy) when *hit* is a CoreHit,
                      ignored when *hit* is a ModelHit (``hit.gene_ref`` is used instead)
     :param gene_status: the status of the gene; required (truthy) when *hit* is a CoreHit,
                         ignored when *hit* is a ModelHit
     :param counterpart: the value assigned to ``self.counterpart``
     :raise MacsypyError: if *hit* is a CoreHit and *gene_ref* or *gene_status* is missing,
                          or if *hit* is neither a CoreHit nor a ModelHit
     """
     if isinstance(hit, CoreHit):
         if not (gene_ref and gene_status):
             raise MacsypyError(
                 f"Cannot Create a {self.__class__.__name__} hit from "
                 f"CoreHit ({hit.gene.name}, {hit.position}) "
                 "without specifying 'gene_ref' and 'gene_status'")
         super().__init__(hit, gene_ref, gene_status)
     elif isinstance(hit, ModelHit):
         # NOTE(review): the status is taken from hit.gene_ref, not from the ModelHit itself -
         # confirm this is the intended source.
         super().__init__(hit.hit,
                          gene_ref=hit.gene_ref,
                          gene_status=hit.gene_ref.status)
     else:
         # Without this branch the parent constructor was silently skipped and the
         # instance was left without its hit, gene_ref and status.
         raise MacsypyError(
             f"The {self.__class__.__name__} 'hit' argument must be a CoreHit or a ModelHit "
             f"not {type(hit)}.")
     self.counterpart = counterpart
Пример #10
0
    def _get_model_conf_node(self):
        """
        Parse the file ``self._path`` and give access to its root element.

        :return: the document root of model_conf
        :raise MacsypyError: if the file cannot be read or parsed, whatever the reason
        """
        try:
            return Et.parse(self._path).getroot()
        except Exception as err:
            msg = f"unable to parse model configuration '{self._path}' : {err}"
            _log.critical(msg)
            raise MacsypyError(msg) from None
Пример #11
0
    def __iter__(self):
        """
        Iterate over the entries of the index file.

        The index must have been built before iterating.

        :raise MacsypyError: if the indexes are not built, or if a line cannot be split into 3 fields
        :return: an iterator on the indexes, each item is (sequence id, length as int, rank as int)
        """
        idx_path = self.find_my_indexes()
        if idx_path is None:
            raise MacsypyError("Build index before to use it.")
        with open(idx_path) as idx_file:
            # skip the header: the first line holds the path to the data, it is not an index entry
            next(idx_file)
            for line in idx_file:
                try:
                    seq_id, raw_length, raw_rank = line.split(self._field_separator)
                except Exception as err:
                    raise MacsypyError(
                        f"fail to parse database index {idx_path} at line: {line}",
                        err) from err
                yield seq_id, int(raw_length), int(raw_rank)
Пример #12
0
    def counterpart(self, counterparts):
        """
        Set the counterparts of this hit.

        A falsy value (None, empty sequence) resets the counterparts to an empty set.
        Otherwise every hit must play a role for the same gene as this hit, i.e.
        ``hit.gene_ref.alternate_of().name`` must be equal to ``self.gene_ref.alternate_of().name``.

        :param counterparts: the hits to register as counterparts
        :raise MacsypyError: if at least one hit is not compatible with this hit
        """
        if not counterparts:
            self._counterpart = set()
            return

        expected_name = self.gene_ref.alternate_of().name
        # Compare the names by value: two equal strings are not guaranteed to be the same object,
        # so an identity test ('is') can reject compatible hits.
        if all(hit.gene_ref.alternate_of().name == expected_name for hit in counterparts):
            self._counterpart = set(counterparts)
        else:
            msg = f"Try to set counterpart for hit '{self.gene_ref.name}' with non compatible hits: " \
                  f"{[hit.gene_ref.name for hit in counterparts]}"
            _log.error(msg)
            raise MacsypyError(msg)
Пример #13
0
    def _build_my_indexes(self):
        """
        Write the macsyfinder index next to the sequence-db file.

        The index is the file ``<name>.idx`` created in the directory of ``self.cfg.sequence_db()``.
        It holds one entry per line, each line having this format:
        sequence id;sequence length;sequence rank (the rank starts at 1)

        :raise MacsypyError: if anything goes wrong while reading the sequences or writing the index
        """
        try:
            with open(self._fasta_path, 'r') as fasta_file:
                idx_path = os.path.join(os.path.dirname(self.cfg.sequence_db()),
                                        self.name + ".idx")
                with open(idx_path, 'w') as my_base:
                    # the rank is the 1-based position of the sequence in the file
                    for rank, (seq_id, _comment, length) in enumerate(fasta_iter(fasta_file), start=1):
                        my_base.write(f"{seq_id};{length:d};{rank:d}\n")
        except Exception as err:
            msg = f"unable to index the sequence dataset: {self.cfg.sequence_db()} : {err}"
            _log.critical(msg, exc_info=True)
            raise MacsypyError(msg)
Пример #14
0
def get_best_hit_4_func(function, hits, key='score'):
    """
    Select the best hit among several ones encoding for the same function.

    The hits whose gene_ref is named *function* are preferred: the other hits (exchangeables)
    are considered only when there is no such hit.
    The best hit is then chosen according to *key*:

        * score: the highest is the best
        * i_eval: the lowest is the best
        * profile_coverage: the highest is the best

    When several hits are tied, the first one in *hits* is returned.

    :param str function: the name of the function fulfilled by the hits (all hits must have same function)
    :param hits: the hits to filter.
    :type hits: sequence of :class:`macsypy.hit.ModelHit` object
    :param str key: The criterion used to select the best hit 'score', 'i_eval', 'profile_coverage'
    :return: the best hit
    :rtype: :class:`macsypy.hit.ModelHit` object
    :raise MacsypyError: if *key* is not one of the supported criteria
    """
    originals = [hit for hit in hits if hit.gene_ref.name == function]
    candidates = originals if originals else [hit for hit in hits if hit.gene_ref.name != function]

    if key in ('score', 'profile_coverage'):
        best_first = True  # the higher, the better
    elif key == 'i_eval':
        best_first = False  # the lower, the better
    else:
        raise MacsypyError(
            f'The criterion for Loners comparison {key} does not exist or is not available.\n'
        )
    candidates.sort(key=attrgetter(key), reverse=best_first)
    return candidates[0]
Пример #15
0
def get_best_hits(hits, key='score'):
    """
    If several hits match the same protein, keep only the best match.

    Two hits match the same protein when they share the same (replicon_name, position).
    The best match is chosen according to *key*:

        * score: the highest is the best
        * i_eval: the lowest is the best
        * profile_coverage: the highest is the best

    When several hits are tied, the first one in *hits* is kept.

    :param hits: the hits to filter.
    :type hits: [ :class:`macsypy.hit.CoreHit` object, ...]
    :param str key: The criterion used to select the best hit 'score', 'i_eval', 'profile_coverage'
    :return: the list of the best hits, one per protein, in order of first appearance of the protein
    :rtype: [ :class:`macsypy.hit.CoreHit` object, ...]
    :raise MacsypyError: if *key* is not one of the supported criteria (checked only when there are hits)
    """
    by_protein = {}
    for hit in hits:
        by_protein.setdefault((hit.replicon_name, hit.position), []).append(hit)

    best_hits = []
    for candidates in by_protein.values():
        if key in ('score', 'profile_coverage'):
            best_first = True  # the higher, the better
        elif key == 'i_eval':
            best_first = False  # the lower, the better
        else:
            raise MacsypyError(
                f'The criterion for Hits comparison {key} does not exist or is not available.\n'
                f'It must be either "score", "i_eval" or "profile_coverage".')
        candidates.sort(key=attrgetter(key), reverse=best_first)
        best_hits.append(candidates[0])
    return best_hits