def __init__(self, hit, gene_ref, gene_status):
    """
    Bind a core hit to the model gene it stands for.

    :param hit: a match between a hmm profile and a replicon
    :type hit: :class:`macsypy.hit.CoreHit` object
    :param gene_ref: the ModelGene linked to this hit.
                     The ModelGene has the same name as the CoreGene, but one hit
                     can be linked to several ModelGene (several models).
                     To know for which gene this hit plays a role use
                     :meth:`macsypy.gene.ModelGene.alternate_of` ::

                         hit.gene_ref.alternate_of()

    :type gene_ref: :class:`macsypy.gene.ModelGene` object
    :param gene_status: stored unchanged in ``self.status``
    :type gene_status: :class:`macsypy.gene.GeneStatus` object
    :raise MacsypyError: if *hit* is not a CoreHit or *gene_ref* is not a ModelGene
    """
    # Each argument is validated right before it is stored, in the same
    # order as before: hit first, then gene_ref.
    if isinstance(hit, CoreHit):
        self._hit = hit
    else:
        raise MacsypyError(
            f"The {self.__class__.__name__} 'hit' argument must be a CoreHit not {type(hit)}."
        )

    if isinstance(gene_ref, ModelGene):
        self.gene_ref = gene_ref
    else:
        raise MacsypyError(
            f"The {self.__class__.__name__} 'gene_ref' argument must be a ModelGene "
            f"not {type(gene_ref)}.")

    self.status = gene_status
def __init__(self, path=None, profile_dir=None, def_dir=None, profile_suffix='.hmm', relative_path=False):
    """
    Locate the profiles and the definitions of a model family.

    Two mutually exclusive ways of describing the location are supported:
    either *path* alone (installed model, the sub-directories ``profiles`` and
    ``definitions`` are derived from it), or *profile_dir* and *def_dir* together.

    :param str path: if it's an installed model, path is the absolute path to a model family.
                     Otherwise path is None, and profile_dir and def_dir must be specified.
    :param str profile_dir: the absolute path to the directory which contains the hmm profiles files.
    :param str def_dir: the absolute path to the directory which contains the models definitions
                        (xml files) or submodels.
    :param str profile_suffix: the suffix of hmm files
    :param bool relative_path: True to work with relative paths, False to work with absolute paths.
    :raise: MacsypyError if path is set and profile_dir or def_dir is set
    :raise: MacsypyError if profile_dir is set but not def_dir and vice versa
    """
    if path and (profile_dir or def_dir):
        raise MacsypyError("'path' and '{}' are incompatible arguments".format(
            'profile_dir' if profile_dir else 'def_dir'))
    if not path and not (profile_dir and def_dir):
        raise MacsypyError("if 'profile_dir' is specified 'def_dir' must be specified_too and vice versa")

    self.path = path
    # The family is named after the installed directory when there is one,
    # otherwise after the definitions directory.
    self.name = os.path.basename(path if path is not None else def_dir)

    if not profile_dir:
        profile_dir = os.path.join(path, 'profiles')
    self._profiles = self._scan_profiles(profile_dir,
                                         profile_suffix=profile_suffix,
                                         relative_path=relative_path)

    self._definitions = {}
    if not def_dir:
        # Installed layout: every entry of <path>/definitions is handed to _scan_definitions.
        def_dir = os.path.join(self.path, 'definitions')
        for entry in os.listdir(def_dir):
            location = self._scan_definitions(def_path=os.path.join(def_dir, entry))
            if not location:
                # _scan_definitions can return None if a dir is empty
                continue
            # Prefix the fully qualified names with the family name.
            location.fqn = f"{self.name}{_separator}{location.fqn}"
            if location.subdefinitions:
                for sub_location in location.subdefinitions.values():
                    sub_location.fqn = f"{self.name}{_separator}{sub_location.fqn}"
            self._definitions[location.name] = location
    else:
        # Explicit directories: only the xml files directly inside def_dir are registered.
        import glob
        for xml_path in glob.glob(os.path.join(def_dir, '*.xml')):
            model_name = os.path.basename(os.path.splitext(xml_path)[0])
            if not relative_path:
                xml_path = os.path.abspath(xml_path)
            location = DefinitionLocation(name=model_name, path=xml_path)
            self._definitions[location.name] = location
def _build_my_indexes(self, index_dir):
    """
    Build macsyfinder indexes and store them in a file.

    The file format is the following:

     - the first line is the path of the sequence-db indexed
     - then one entry per line, each line having this format:

         sequence id;sequence length;sequence rank

       (the fields are joined with ``self._field_separator``)

    :param str index_dir: the directory where the ``<name>.idx`` file is written
    :return: the path of the index file
    :raise MacsypyError: if anything fails while reading the sequences or writing the index
    """
    index_file = os.path.join(index_dir, self.name + ".idx")
    try:
        # The fasta file is opened first, then the index file, as before.
        with open(self._fasta_path, 'r') as fasta_file, open(index_file, 'w') as my_base:
            my_base.write(self._fasta_path + '\n')
            # The rank is the 1-based position of the sequence in the fasta file.
            for seq_nb, (seq_id, _comment, length) in enumerate(fasta_iter(fasta_file), start=1):
                my_base.write(
                    f"{seq_id}{self._field_separator}{length:d}{self._field_separator}{seq_nb:d}\n"
                )
            my_base.flush()
    except Exception as err:
        msg = f"unable to index the sequence dataset: {self.cfg.sequence_db()} : {err}"
        _log.critical(msg, exc_info=True)
        raise MacsypyError(msg) from err
    return index_file
def __init__(self, hit, gene_ref=None, gene_status=None, counterpart=None):
    """
    Build a hit whose model gene must be tagged ``multi_system``.

    All arguments are forwarded unchanged to the parent constructor; afterwards
    ``self.gene_ref.multi_system`` is checked and the construction is rejected
    when it is falsy.

    :param hit: a match between a hmm profile and a replicon
    :type hit: :class:`macsypy.hit.CoreHit` object
    :param gene_ref: the ModelGene linked to this hit.
                     The ModelGene has the same name as the CoreGene, but one hit
                     can be linked to several ModelGene (several models).
                     To know for which gene this hit plays a role use
                     :meth:`macsypy.gene.ModelGene.alternate_of` ::

                         hit.gene_ref.alternate_of()

    :type gene_ref: :class:`macsypy.gene.ModelGene` object
    :param gene_status:
    :type gene_status: :class:`macsypy.gene.GeneStatus` object
    :param counterpart: the other occurrences of the gene or exchangeable in the replicon
    :type counterpart: list of :class:`macsypy.hit.CoreHit`
    :raise MacsypyError: if the gene_ref of the hit is not tagged as multi_system
    """
    super().__init__(hit, gene_ref=gene_ref, gene_status=gene_status, counterpart=counterpart)
    if not self.gene_ref.multi_system:
        # Report the gene through self.gene_ref (the object that was just tested):
        # the 'gene_ref' argument is optional and may be None, in which case
        # 'gene_ref.name' would raise AttributeError while building the message.
        msg = f"{hit.id} cannot be a multi systems, gene_ref '{self.gene_ref.name}' not tag as multi_system"
        _log.critical(msg)
        raise MacsypyError(msg)
def _parse_section(self, section_node, allowed_elements):
    """
    Parse a node containing configuration options and their values.

    :param section_node: the node whose children are the options to parse
    :param allowed_elements: the elements allowed in this section.
                             Only these elements are parsed and put in the final dictionary;
                             any other child is reported with a warning and skipped.
    :type allowed_elements: a dict with options name as keys and function to parse the element
    :return: the parsed values, keyed by element name
    :rtype: dict
    :raise MacsypyError: if a parsing function raises TypeError or ValueError
    """
    section = {}
    for child in section_node:
        element = child.tag
        if element not in allowed_elements:
            _log.warning(
                f"unknown element '{element}' in '{self._path}' ignore it."
            )
            continue
        try:
            # The parser receives the raw text of the element.
            parsed = allowed_elements[element](child.text)
        except (TypeError, ValueError) as err:
            msg = f"The model configuration file '{self._path}' cannot be parsed: {err}"
            _log.critical(msg)
            raise MacsypyError(msg) from None
        section[element] = parsed
    return section
def _validator(cast_func, raw, default, sequence=False):
    """
    Convert a raw answer with *cast_func*, falling back on *default* for an empty answer.

    :param cast_func: callable applied to the value (or to each item of a list)
    :param raw: the value to convert; the empty string means "use the default"
    :param default: the value used when *raw* is the empty string
    :param bool sequence: if True a non empty *raw* is split on ',' and every
                          stripped item is converted separately
    :return: the converted value, or the list of converted values
    :raise MacsypyError: if *raw* is empty and *default* is None,
                         or if *cast_func* raises ValueError
    """
    if raw == '':
        if default is None:
            raise MacsypyError('Please enter some value')
        # The default is used as is (it is not split, even when sequence is True).
        raw = default
    elif sequence:
        raw = [item.strip() for item in raw.split(',')]

    try:
        if isinstance(raw, list):
            return [cast_func(item) for item in raw]
        return cast_func(raw)
    except ValueError as err:
        raise MacsypyError(f'Invalid value: {err}') from err
def parse_cut_ga(value):
    """
    Convert the text of a 'cut_ga' element into a boolean.

    The comparison is case insensitive. Accepted spellings are
    'True' / '1' and 'False' / '0'.

    :param str value: the text to parse
    :return: True or False
    :rtype: bool
    :raise MacsypyError: if *value* is not one of the accepted spellings
    """
    normalized = value.lower()
    # str.lower() always returns a str, so the numeric spellings must be
    # compared as the strings '1' and '0' (comparing against the ints 1 / 0
    # never matches).
    if normalized in ('true', '1'):
        return True
    elif normalized in ('false', '0'):
        return False
    else:
        # NOTE(review): 'self' is not a parameter of this function; it is
        # presumably resolved from an enclosing scope not visible here.
        msg = f"cannot parse 'cut_ga' element in '{self._path}' expect True, 1, False, 0 got : '{value}'"
        _log.critical(msg)
        raise MacsypyError(msg)
def _fill_gembase_min_max(self, topology, default_topology):
    """
    For each replicon_name of a gembase dataset, fill the internal dictionary
    ``self._DB`` with a namedtuple RepliconInfo.

    The index entries (``self._idx``) are grouped by replicon name; for each
    group the third field of the first entry is kept as ``_min`` and the
    third field of the last entry seen by the inner loop as ``_max``.

    :param topology: the topologies for each replicon
                     (parsed from the file specified with the option --topology-file)
    :type topology: dict
    :param default_topology: the topology provided by the config.replicon_topology,
                             used for the replicons that are not in *topology*
    :type default_topology: string
    :raise MacsypyError: if ``rank`` has never been bound when ``_max`` is computed
                         (see the review notes in the body)
    """
    def grp_replicon(entry):
        """
        In gembase the identifier of a fasta sequence follows this schema:
        <replicon-name>_<seq-name>, possibly with '_' inside the <replicon_name>
        but not in the <seq-name>.
        So grp_replicon allows to group sequences belonging to the same replicon:
        it returns everything before the last '_' of the first field of *entry*.
        """
        return "_".join(entry[0].split('_')[:-1])

    def parse_seq_id(seq_id):
        """
        Parse a gembase sequence id (.idx).

        seq_id has the following format: <replicon-name>_<seq-name>, possibly
        with '_' inside the <replicon_name> but not in the <seq-name>.

        :return: the tuple (replicon_name, seq_name)
        """
        *replicon_name, seq_name = seq_id.split('_')
        replicon_name = "_".join(replicon_name)
        return replicon_name, seq_name

    # groupby only merges consecutive entries: entries of one replicon are
    # presumably contiguous in the index -- TODO confirm.
    replicons = (x[1] for x in groupby(self._idx, grp_replicon))
    for replicon in replicons:
        genes = []
        # The first entry of the group gives the replicon name and _min.
        seq_id, seq_length, _min = next(replicon)
        replicon_name, seq_name = parse_seq_id(seq_id)
        genes.append((seq_name, seq_length))
        for seq_id, seq_length, rank in replicon:
            # walk through the remaining sequences of the replicon; after the
            # loop seq_id, seq_length and rank hold the values of the last one
            _, seq_name = parse_seq_id(seq_id)
            genes.append((seq_name, seq_length))
        _, seq_name = parse_seq_id(seq_id)
        try:
            # 'rank' is only bound by the inner loop above.
            # NOTE(review): the UnboundLocalError can only fire when the very first
            # replicon holds a single sequence; for a later single-sequence replicon
            # 'rank' still holds the value left by the previous replicon, so _max
            # is stale -- confirm whether this is intended.
            _max = rank
        except UnboundLocalError:
            msg = f"Error during sequence-db '{self.cfg.sequence_db()}' parsing. Are you sure db-type is 'gembase'?"
            _log.critical(msg)
            raise MacsypyError(msg) from None
        # NOTE(review): the last sequence has already been appended (by the loop,
        # or before it for a single-sequence replicon), so this adds it to
        # 'genes' a second time -- confirm whether the duplicate is intended.
        genes.append((seq_name, seq_length))
        if replicon_name in topology:
            self._DB[replicon_name] = RepliconInfo(topology[replicon_name], _min, _max, genes)
        else:
            self._DB[replicon_name] = RepliconInfo(default_topology, _min, _max, genes)
def __init__(self, hit, gene_ref=None, gene_status=None, counterpart=None):
    """
    Build the hit either from a CoreHit or from an already built ModelHit.

    :param hit: the hit to build from.
                With a CoreHit, *gene_ref* and *gene_status* are mandatory.
                With a ModelHit, the core hit, the gene_ref and the status are
                taken from *hit* itself (``hit.hit``, ``hit.gene_ref`` and
                ``hit.gene_ref.status``) and the *gene_ref* / *gene_status*
                arguments are ignored.
    :param gene_ref: the ModelGene linked to this hit (used with a CoreHit only)
    :param gene_status: the status of the gene (used with a CoreHit only)
    :param counterpart: the counterparts of this hit, assigned to ``self.counterpart``
    :raise MacsypyError: if *hit* is a CoreHit and *gene_ref* or *gene_status* is missing,
                         or if *hit* is neither a CoreHit nor a ModelHit
    """
    if isinstance(hit, CoreHit):
        if not (gene_ref and gene_status):
            raise MacsypyError(
                f"Cannot Create a {self.__class__.__name__} hit from "
                f"CoreHit ({hit.gene.name}, {hit.position}) "
                "without specifying 'gene_ref' and 'gene_status'")
        super().__init__(hit, gene_ref, gene_status)
    elif isinstance(hit, ModelHit):
        super().__init__(hit.hit, gene_ref=hit.gene_ref, gene_status=hit.gene_ref.status)
    else:
        # Without this branch the parent constructor was silently skipped and a
        # half-initialised object was returned (or an unrelated AttributeError
        # was raised later on); fail early with an explicit message instead.
        raise MacsypyError(
            f"The {self.__class__.__name__} 'hit' argument must be a CoreHit or a ModelHit "
            f"not {type(hit)}.")
    self.counterpart = counterpart
def _get_model_conf_node(self):
    """
    Parse the file at ``self._path`` and find the root of the document.

    :return: the document root of model_conf
    :raise MacsypyError: if the file cannot be read or parsed, whatever the reason
    """
    try:
        return Et.parse(self._path).getroot()
    except Exception as err:
        msg = f"unable to parse model configuration '{self._path}' : {err}"
        _log.critical(msg)
        # The original exception is deliberately hidden from the traceback.
        raise MacsypyError(msg) from None
def __iter__(self):
    """
    Iterate over the entries of the index file.

    To use it the index must be built.

    :raise MacsypyError: if the indexes are not built,
                         or if a line cannot be split in exactly three fields
    :return: an iterator on the indexes, each item is the tuple
             (sequence id, sequence length, sequence rank), length and rank being integers
    """
    path = self.find_my_indexes()
    if path is None:
        raise MacsypyError("Build index before to use it.")

    with open(path) as idx_file:
        # The first line of the index is the path to the data,
        # it is not an index entry: skip it.
        next(idx_file)
        for line in idx_file:
            try:
                seq_id, raw_length, raw_rank = line.split(self._field_separator)
            except Exception as err:
                raise MacsypyError(
                    f"fail to parse database index {path} at line: {line}", err) from err
            # int() tolerates the trailing newline kept in the last field.
            yield seq_id, int(raw_length), int(raw_rank)
def counterpart(self, counterparts):
    """
    Set the counterparts of this hit.

    :param counterparts: the hits to register as counterparts.
                         A falsy value (None, empty sequence) resets the
                         counterparts to an empty set.
                         Every hit must play the role of the same gene as this
                         hit, i.e. ``hit.gene_ref.alternate_of().name`` must be
                         equal to ``self.gene_ref.alternate_of().name``.
    :raise MacsypyError: if at least one hit is not compatible with this hit
    """
    if not counterparts:
        self._counterpart = set()
        return

    expected_name = self.gene_ref.alternate_of().name
    # Names are compared by value ('=='): identity ('is') between two equal
    # strings is an implementation detail and cannot be relied upon.
    if all(hit.gene_ref.alternate_of().name == expected_name for hit in counterparts):
        self._counterpart = set(counterparts)
    else:
        msg = f"Try to set counterpart for hit '{self.gene_ref.name}' with non compatible hits: " \
              f"{[hit.gene_ref.name for hit in counterparts]}"
        _log.error(msg)
        raise MacsypyError(msg)
def _build_my_indexes(self):
    """
    Build macsyfinder indexes and store them in a file.

    The index file is named ``<name>.idx`` and is written in the directory
    of the sequence-db given by the configuration.
    The file format is the following:

     - one entry per line, with each line having this format:

         sequence id;sequence length;sequence rank

    :raise MacsypyError: if anything fails while reading the sequences or writing the index
    """
    try:
        with open(self._fasta_path, 'r') as fasta_file:
            # The index path is computed only once the fasta file is opened, as before.
            index_path = os.path.join(os.path.dirname(self.cfg.sequence_db()), self.name + ".idx")
            with open(index_path, 'w') as my_base:
                # The rank is the 1-based position of the sequence in the fasta file.
                for seq_nb, (seq_id, _comment, length) in enumerate(fasta_iter(fasta_file), start=1):
                    my_base.write(f"{seq_id};{length:d};{seq_nb:d}\n")
    except Exception as err:
        msg = f"unable to index the sequence dataset: {self.cfg.sequence_db()} : {err}"
        _log.critical(msg, exc_info=True)
        raise MacsypyError(msg)
def get_best_hit_4_func(function, hits, key='score'):
    """
    Select the best hit among several ones encoding for the same function.

    The hits whose gene_ref is named exactly *function* are preferred; the
    other ones (the exchangeables) are considered only when there is none.
    The remaining hits are then ranked on one of:

        * score (the highest is the best)
        * i_eval (the lowest is the best)
        * profile_coverage (the highest is the best)

    :param str function: the name of the function fulfilled by the hits
                         (all hits must have the same function)
    :param hits: the hits to filter.
    :type hits: sequence of :class:`macsypy.hit.ModelHit` object
    :param str key: the criterion used to select the best hit:
                    'score', 'i_eval' or 'profile_coverage'
    :return: the best hit
    :rtype: :class:`macsypy.hit.ModelHit` object
    :raise MacsypyError: if *key* is not one of the supported criteria
    """
    originals = []
    exchangeables = []
    for hit in hits:
        bucket = originals if hit.gene_ref.name == function else exchangeables
        bucket.append(hit)
    candidates = originals or exchangeables

    if key in ('score', 'profile_coverage'):
        best_first = True
    elif key == 'i_eval':
        best_first = False
    else:
        raise MacsypyError(
            f'The criterion for Loners comparison {key} does not exist or is not available.\n'
        )
    # The sort is stable: among equally good hits the first one met wins.
    candidates.sort(key=attrgetter(key), reverse=best_first)
    return candidates[0]
def get_best_hits(hits, key='score'):
    """
    If several hits match the same protein, keep only the best match.

    Two hits match the same protein when they share the same
    (replicon_name, position). The best one is chosen on one of:

        * score (the highest is the best)
        * i_eval (the lowest is the best)
        * profile_coverage (the highest is the best)

    :param hits: the hits to filter.
    :type hits: [ :class:`macsypy.hit.CoreHit` object, ...]
    :param str key: the criterion used to select the best hit:
                    'score', 'i_eval' or 'profile_coverage'
    :return: the list of the best hits, one per protein,
             in the order the proteins are first met in *hits*
    :rtype: [ :class:`macsypy.hit.CoreHit` object, ...]
    :raise MacsypyError: if *key* is not one of the supported criteria
                         (only checked when there is at least one hit)
    """
    hits_by_protein = {}
    for hit in hits:
        hits_by_protein.setdefault((hit.replicon_name, hit.position), []).append(hit)

    best_hits = []
    for same_protein in hits_by_protein.values():
        if key in ('score', 'profile_coverage'):
            best_first = True
        elif key == 'i_eval':
            best_first = False
        else:
            raise MacsypyError(
                f'The criterion for Hits comparison {key} does not exist or is not available.\n'
                f'It must be either "score", "i_eval" or "profile_coverage".')
        # The sort is stable: among equally good hits the first one met wins.
        same_protein.sort(key=attrgetter(key), reverse=best_first)
        best_hits.append(same_protein[0])
    return best_hits