class TestReport(MacsyTest):
    """
    Fixture building a scratch output directory, a Config and a sequence
    index under the system temporary directory.
    """

    def setUp(self):
        """
        Create a fresh 'test_macsyfinder_Report' directory in the system temp
        dir, copy the test sequence base into it, then build the Config,
        the model location, the ProfileFactory and the sequence indexes.
        """
        # Keyword arguments are stored in the order given, so the namespace
        # ends up with the same attributes as one filled in one by one.
        args = argparse.Namespace(
            db_type='gembase',
            models_dir=self.find_data('models'),
            res_search_dir=tempfile.gettempdir(),
            log_level=30,  # numeric value of logging.WARNING
        )
        scratch_dir = os.path.join(args.res_search_dir, 'test_macsyfinder_Report')
        args.out_dir = scratch_dir

        # Always start from an empty directory, whatever a previous run left.
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir)
        os.mkdir(scratch_dir)

        # Work on a private copy of the sequence base inside the scratch dir.
        reference_db = self.find_data("base", "test_base.fa")
        shutil.copy(reference_db, scratch_dir)
        args.sequence_db = os.path.join(scratch_dir, os.path.basename(reference_db))

        self.cfg = Config(MacsyDefaults(), args)
        hmmer_out_dir = os.path.join(self.cfg.out_dir(), self.cfg.hmmer_dir())
        os.mkdir(hmmer_out_dir)

        self.model_name = 'foo'
        model_path = os.path.join(args.models_dir, self.model_name)
        self.model_location = ModelLocation(path=model_path)

        # The ProfileFactory must be reset here: it behaves like a singleton,
        # so other tests are influenced by it and by its configuration
        # (for instance search_genes gets profiles without hmmer_exe).
        self.profile_factory = ProfileFactory(self.cfg)

        indexes = Indexes(self.cfg)
        indexes.build()

    def tearDown(self):
        """Remove the working directory; cleanup is best effort only."""
        try:
            working_dir = self.cfg.working_dir()
            shutil.rmtree(working_dir)
        except Exception:
            # Deliberately ignored: a failed cleanup must not fail the test.
            pass
def main(args=None, log_level=None) -> None:
    """
    Main entry point of macsyprofile.

    Parse the hmmer outputs found in a previous run directory and write
    the resulting hits, one per line after a header, to a report file.

    :param args: the arguments passed on the command line without the program name
                 (``sys.argv[1:]`` is used when None)
    :type args: List of string
    :param log_level: the output verbosity; when None it is derived from
                      the parsed ``verbosity`` option
    :type log_level: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raises FileNotFoundError: if the previous run path does not exist
    :raises ValueError: if the previous run path is not a directory,
                        if the directory of ``--out`` does not exist,
                        if the report already exists and ``--force`` is not set,
                        or if the models cannot be located
    """
    global _log

    if args is None:
        args = sys.argv[1:]
    parsed_args = parse_args(args)
    if log_level is None:
        log_level = verbosity_to_log_level(parsed_args.verbosity)
    _log = init_logger(log_level, out=(not parsed_args.mute))

    # --- validate the directory of the previous run --------------------
    # sys.tracebacklimit is set to 0 before each raise below, and the
    # exceptions carry no message: the explanation goes through the logger.
    if not os.path.exists(parsed_args.previous_run):
        _log.critical(f"{parsed_args.previous_run}: No such directory.")
        sys.tracebacklimit = 0
        raise FileNotFoundError() from None
    if not os.path.isdir(parsed_args.previous_run):
        _log.critical(f"{parsed_args.previous_run} is not a directory.")
        sys.tracebacklimit = 0
        raise ValueError() from None

    # --- configuration -------------------------------------------------
    defaults = MacsyDefaults(i_evalue_sel=1.0e9, coverage_profile=-1.0)
    cfg = Config(defaults, parsed_args)

    msf_run_path = cfg.previous_run()
    hmmer_results = os.path.join(msf_run_path, cfg.hmmer_dir())
    hmm_suffix = cfg.res_search_suffix()
    profile_suffix = cfg.profile_suffix()

    # --- where to write the report -------------------------------------
    if not parsed_args.out:
        profile_report_path = os.path.join(cfg.previous_run(), 'hmm_coverage.tsv')
    else:
        profile_report_path = os.path.normpath(parsed_args.out)
        dirname = os.path.normpath(os.path.dirname(parsed_args.out))
        # Only the existence of the target directory is checked here.
        if not os.path.exists(dirname):
            _log.critical(f"The {dirname} directory is not writable")
            sys.tracebacklimit = 0
            raise ValueError() from None

    already_there = os.path.exists(profile_report_path)
    if already_there and not parsed_args.force:
        _log.critical(
            f"The file {profile_report_path} already exists. \n"
            f"Remove it or specify a new output name --out or use --force option"
        )
        sys.tracebacklimit = 0
        raise ValueError() from None

    # --- inputs: hmmer outputs and profiles directory ------------------
    search_pattern = os.path.join(hmmer_results, f"{parsed_args.pattern}{hmm_suffix}")
    hmmer_files = sorted(glob.glob(search_pattern))

    try:
        model_familly_name = cfg.models()[0]
        candidates = [os.path.join(root, model_familly_name) for root in cfg.models_dir()]
        existing = [path for path in candidates if os.path.exists(path)]
        # The last existing location wins; IndexError when there is none.
        model_dir = existing[-1]
        profiles_dir = os.path.join(model_dir, 'profiles')
    except IndexError:
        _log.critical(
            f"Cannot find models in conf file {msf_run_path}. "
            f"May be these results have been generated with an old version of macsyfinder."
        )
        sys.tracebacklimit = 0
        raise ValueError() from None

    _log.debug(f"hmmer_files: {hmmer_files}")

    # --- parse every hmmer output and write the report -----------------
    all_hits = []
    with open(profile_report_path, 'w') as prof_out:
        print(header(args), file=prof_out)

        for hmmer_out_path in hmmer_files:
            _log.info(f"parsing {hmmer_out_path}")
            gene_name = get_gene_name(hmmer_out_path, hmm_suffix)
            profile_path = os.path.join(profiles_dir, f"{gene_name}{profile_suffix}")
            gene_profile_len = get_profile_len(profile_path)
            hmm = HmmProfile(gene_name, gene_profile_len, hmmer_out_path, cfg)
            all_hits.extend(hmm.parse())

        if not all_hits:
            _log.info("No hit found")
        else:
            if parsed_args.best_hits:
                # It is important to keep this sorting so that, in the final
                # version of all_hits, the hits sharing the same replicon_name
                # and position are sorted by score, the best score first.
                hits_by_replicon = {}
                for hit in all_hits:
                    hits_by_replicon.setdefault(hit.replicon_name, []).append(hit)

                all_hits = []
                for replicon_hits in hits_by_replicon.values():
                    best = get_best_hits(replicon_hits, key=parsed_args.best_hits)
                    all_hits.extend(sorted(best, key=lambda h: h.position))

            all_hits.sort(key=lambda h: (h.gene_name, h.replicon_name, h.position, h.score))
            _log.info(f"found {len(all_hits)} hits")
            for hit in all_hits:
                print(hit, file=prof_out)
            _log.info(f"result is in \n'{profile_report_path}'")