def test_search_recover(self):
    """Check that a second run over a previous job's output recovers identical hits.

    The first job performs a genuine hmmsearch; the second job points the
    configuration at the first job's working dir and must reproduce the same
    hit through the recover code path (hmmer itself is disabled).
    """
    # --- first job: real search using hmmsearch ---
    gene_name = "abc"
    c_gene_abc = CoreGene(self.model_location, gene_name, self.profile_factory)
    report = search_genes([c_gene_abc], self.cfg)
    expected_hit = [
        Hit(c_gene_abc, "ESCO030p01_000260", 706, "ESCO030p01", 26,
            float(1.000e-200), float(660.800), float(1.000), float(0.714),
            160, 663)
    ]

    # --- second job: must go through the recover inner function ---
    # disable hmmer so the test cannot silently fall back to a real search
    self.cfg.hmmer = lambda: "hmmer_disable"
    # point previous_run at the first job and give the second job its own out dir
    previous_job_path = self.cfg.working_dir()
    self.cfg.previous_run = lambda: previous_job_path
    self.cfg.out_dir = lambda: os.path.join(self.tmp_dir, 'job_2')
    os.mkdir(self.cfg.out_dir())

    # rerun with the previous run's results; the profile attached to the gene
    # caches its report (gene._profile._report), so rebuild the factory/gene
    # to reset it before searching again
    self.profile_factory = ProfileFactory(self.cfg)
    c_gene_abc = CoreGene(self.model_location, gene_name, self.profile_factory)
    report = search_genes([c_gene_abc], self.cfg)
    self.assertEqual(len(report), 1)
    self.assertEqual(expected_hit[0], report[0].hits[0])
def test_search(self):
    """A plain hmmsearch on gene 'abc' must yield exactly one report with the expected hit."""
    gene_name = "abc"
    c_gene_abc = CoreGene(self.model_location, gene_name, self.profile_factory)
    report = search_genes([c_gene_abc], self.cfg)

    # the single hit expected on replicon ESCO030p01 for this profile
    expected_hit = [
        Hit(c_gene_abc, "ESCO030p01_000260", 706, "ESCO030p01", 26,
            float(1.000e-200), float(660.800), float(1.000), float(0.714),
            160, 663)
    ]
    self.assertEqual(len(report), 1)
    self.assertEqual(expected_hit[0], report[0].hits[0])
def search_systems(config, model_bank, gene_bank, profile_factory, logger):
    """
    Do the job, this function is the orchestrator of all the macsyfinder mechanics
    at the end several files are produced containing the results

      - macsyfinder.conf: The set of variables used to run this job
      - macsyfinder.systems: The list of the potential systems
      - macsyfinder.rejected_cluster: The list of all clusters and cluster
        combinations which have been rejected and the reason
      - macsyfinder.log: the copy of the standard output

    :param config: The MacSyFinder Configuration
    :type config: :class:`macsypy.config.Config` object
    :param model_bank: The bank populated with the available models
    :type model_bank: :class:`macsypy.model.ModelBank` object
    :param gene_bank: the bank containing all genes
    :type gene_bank: :class:`macsypy.gene.GeneBank` object
    :param profile_factory: The profile factory
    :type profile_factory: :class:`macsypy.gene.ProfileFactory`
    :param logger: The logger use to display information to the user.
                   It must be initialized. see :func:`macsypy.init_logger`
    :type logger: :class:`colorlog.Logger` object
    :return: the systems and rejected clusters found
    :rtype: ([:class:`macsypy.system.System`, ...],
             [:class:`macsypy.cluster.RejectedCluster`, ...])
    """
    working_dir = config.working_dir()
    config.save(path_or_buf=os.path.join(working_dir, config.cfg_name))

    # register every model location found under the configured models dir
    registry = ModelRegistry()
    models_loc_available = scan_models_dir(config.models_dir(),
                                           profile_suffix=config.profile_suffix(),
                                           relative_path=config.relative_path())
    for model_loc in models_loc_available:
        registry.add(model_loc)

    # build indexes on the sequence db (forced rebuild if config.idx is set)
    idx = Indexes(config)
    idx.build(force=config.idx)

    # create models
    parser = DefinitionParser(config, model_bank, gene_bank, registry, profile_factory)
    try:
        models_def_to_detect = get_def_to_detect(config.models(), registry)
    except KeyError as err:
        sys.exit(f"macsyfinder: {err}")
    parser.parse(models_def_to_detect)

    # BUGFIX: the original f-string was missing the separator between the
    # literal and the path ("...working_dir{working_dir}"), which produced a
    # garbled log line such as "working_dir/tmp/job1".
    logger.info(f"MacSyFinder's results will be stored in working_dir: {working_dir}")
    logger.info(f"Analysis launched on {config.sequence_db()} for model(s):")
    for m in models_def_to_detect:
        logger.info(f"\t- {m.fqn}")

    models_to_detect = [model_bank[model_loc.fqn] for model_loc in models_def_to_detect]

    # gather every gene to search: all categories of each model, plus their
    # exchangeables (formerly homologs/analogs) since those can replace a gene
    all_genes = []
    for model in models_to_detect:
        genes = model.mandatory_genes + model.accessory_genes \
                + model.neutral_genes + model.forbidden_genes
        ex_genes = []
        for g in genes:
            ex_genes += g.exchangeables
        all_genes += (genes + ex_genes)

    #############################################
    # this part of code is executed in parallel
    #############################################
    try:
        all_reports = search_genes(all_genes, config)
    except Exception as err:
        sys.exit(str(err))
    #############################################
    # end of parallel code
    #############################################

    all_hits = [hit for subl in [report.hits for report in all_reports] for hit in subl]

    if not all_hits:
        # No hits detected
        return [], []

    # It's important to keep this sorting to have in last all_hits version
    # the hits with the same replicon_name and position sorted by score
    # the best score in first
    hits_by_replicon = {}
    for hit in all_hits:
        # idiomatic grouping: setdefault replaces the explicit membership test
        hits_by_replicon.setdefault(hit.replicon_name, []).append(hit)

    for rep_name in hits_by_replicon:
        hits_by_replicon[rep_name] = get_best_hits(hits_by_replicon[rep_name], key='score')
        hits_by_replicon[rep_name].sort(key=attrgetter('position'))

    models_to_detect = sorted(models_to_detect, key=attrgetter('name'))
    db_type = config.db_type()
    if db_type in ('ordered_replicon', 'gembase'):
        systems, rejected_clusters = _search_in_ordered_replicon(
            hits_by_replicon, models_to_detect, config, logger)
        return systems, rejected_clusters
    elif db_type == "unordered":
        likely_systems, rejected_hits = _search_in_unordered_replicon(
            hits_by_replicon, models_to_detect, logger)
        return likely_systems, rejected_hits
    else:
        # config validation upstream should make this unreachable
        assert False, f"dbtype have an invalid value {db_type}"