# Exactly one of --taxids / --taxa has to be supplied: a value counts as
# "given" when it is neither None nor the empty string.
taxids_given = args.taxids is not None and args.taxids != ""
taxa_given = args.taxa is not None and args.taxa != ""

if not taxids_given and not taxa_given:
    print(colorify(f'Either --taxids or --taxa parameter is required', 'red'))
    sys.exit(1)

if taxids_given and taxa_given:
    print(colorify(f'Use either --taxids or --taxa, not both', 'red'))
    sys.exit(1)

##
# Data path: the environment variable is applied first and --data_dir last,
# so the command-line value is the one left in effect when both are set.
env_data_dir = os.environ.get("EGGNOG_DATA_DIR")
if env_data_dir is not None:
    set_data_path(env_data_dir)

if args.data_dir:
    set_data_path(args.data_dir)

data_path = get_data_path()

# http://eggnog5.embl.de/download/eggnog_5.0/e5.proteomes.faa
if not pexists(get_eggnog_proteins_file()):
    # Only prompt the user when --allyes was not given.
    download_confirmed = args.allyes or ask(
        f"Download eggnog5 proteins to {data_path}? ~9GB (It is required to create new databases)"
    ) == 'y'
    if download_confirmed:
        print(colorify(f'Downloading eggnog5 proteins file to {data_path}...', 'green'))
def parse_args(parser):
    """Parse the command line with ``parser`` and validate/normalise the result.

    Besides parsing, this function:
      * applies the data path (``EGGNOG_DATA_DIR`` first, then ``--data_dir``,
        so the command-line value is the one left in effect when both are set);
      * handles the early-exit options (``--version``, ``--list_taxa``);
      * checks the option combinations required by each search mode;
      * copies the generic thresholds onto the per-search-tool attributes;
      * converts comma-separated taxa and the GO evidence keyword into the
        collections stored back on ``args``;
      * computes ``args.cpus_per_worker``.

    Args:
        parser: the argument parser; only its ``parse_args()`` and ``error()``
            methods are used here.

    Returns:
        The parsed namespace, with the derived attributes described above.

    Raises:
        EmapperException: if a required database is not present.
        ValueError: on an invalid ``--go_evidence``, ``--pfam_transfer`` or
            ``--pfam_realign`` value.
        SystemExit: after printing the version or the taxa list. Invalid
            option combinations are reported through ``parser.error()``.
    """
    args = parser.parse_args()

    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.version:
        try:
            version = get_full_version_info()
        except Exception:
            # Best effort: fall back to the plain version string.
            version = get_version()
        print(version)
        sys.exit(0)

    args.call_info = get_call_info()

    if args.list_taxa:
        from eggnogmapper.vars import LEVEL_DEPTH, LEVEL_DICT, LEVEL_NAMES, LEVEL_PARENTS
        print("tax_name\ttax_id\tdepth\tparents\tparents_names")
        for tax_name, tax_id in LEVEL_DICT.items():
            depth = LEVEL_DEPTH.get(tax_id, "-")
            parents = LEVEL_PARENTS.get(tax_id, "-")
            parents_names = [LEVEL_NAMES.get(x, "-") for x in parents]
            print(f"{tax_name}\t{tax_id}\t{depth}\t{','.join(parents)}\t{','.join(parents_names)}")
        sys.exit(0)

    # --cpu 0 means "use every CPU reported by the system".
    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # translate
    if args.itype in [ITYPE_GENOME, ITYPE_META, ITYPE_PROTS] and args.translate == True:
        parser.error('"--translate" only can be used with "--itype CDS"')

    # Gene prediction
    if args.training_genome is not None and args.training_file is None:
        parser.error('"--training_genome requires --training_file"')

    if args.training_genome is None and args.training_file is not None:
        if not os.path.isfile(args.training_file):
            parser.error('"--training_file must point to an existing file, if no --training_genome is provided."')

    # Search modes
    if args.mode == SEARCH_MODE_DIAMOND:
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise EmapperException()

        if args.input is not None:
            if args.annotate_hits_table is not None:
                print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_DIAMOND}", 'blue'))
                args.annotate_hits_table = None
        else:
            # the default -m is diamond, but we will consider -m no_search as default when
            # --annotate_hits_table has been provided and -i has not been provided
            if args.annotate_hits_table is not None:
                print(colorify(f"Assuming -m {SEARCH_MODE_NO_SEARCH}", 'blue'))
                args.mode = SEARCH_MODE_NO_SEARCH
            else:
                parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.resume == True:
            print(colorify("Diamond jobs cannot be resumed. --resume will be ignored.", 'blue'))
            args.resume = False

    elif args.mode == SEARCH_MODE_MMSEQS2:
        mmseqs_db = args.mmseqs_db if args.mmseqs_db else get_eggnog_mmseqs_db()
        if not pexists(mmseqs_db):
            print(colorify('MMseqs2 database %s not present. Use download_eggnog_database.py to fetch it' % mmseqs_db, 'red'))
            raise EmapperException()

        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.resume == True:
            print(colorify("MMseqs2 jobs cannot be resumed. --resume will be ignored.", 'blue'))
            args.resume = False

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_MMSEQS2}", 'blue'))
            args.annotate_hits_table = None

    elif args.mode == SEARCH_MODE_HMMER:

        # if args.usemem == True:
        #     total_workers = args.num_workers * args.num_servers
        #     if args.cpu < total_workers:
        #         parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
        #     if args.cpu % total_workers != 0:
        #         parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")
        #     args.cpus_per_worker = int(args.cpu / total_workers)
        #     sys.stderr.write(f"CPUs per worker: {args.cpus_per_worker}\n")
        # else:
        #     args.cpus_per_worker = args.cpu

        if not args.input:
            parser.error('An input file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        # Hmmer database
        # NOTE: hmmer database format, name and checking if exists is done within hmmer module
        if not args.db:
            parser.error('HMMER mode requires a target database (-d, --database).')

        if args.itype == ITYPE_CDS:
            args.translate = True

        if (args.itype == ITYPE_GENOME or args.itype == ITYPE_META) and args.genepred == GENEPRED_MODE_SEARCH:
            parser.error('HMMER mode is not compatible with "--genepred search" option.')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_HMMER}", 'blue'))
            args.annotate_hits_table = None

        # The literal string "none" is normalised to None.
        if args.clean_overlaps is not None:
            if args.clean_overlaps == "none":
                args.clean_overlaps = None

    elif args.mode == SEARCH_MODE_CACHE:
        if args.cache_file is None:
            parser.error('A file with annotations and md5 of queries is required (-c FILE)')
        if args.decorate_gff != DECORATE_GFF_NONE:
            print(colorify("WARNING: no GFF will be created for cache-based annotations. It is not implemented yet, sorry.", 'red'))

        if args.no_annot == True:
            parser.error(f'Cache mode (-m {SEARCH_MODE_CACHE}) should be used to annotate.')

    elif args.mode == SEARCH_MODE_NO_SEARCH:
        if args.no_annot == False and not args.annotate_hits_table:
            parser.error(f'No search mode (-m {SEARCH_MODE_NO_SEARCH}) requires a hits table to annotate (--annotate_hits_table FILE.seed_orthologs)')
        if args.md5 == True and args.input is None:
            parser.error(f'--md5 requires an input FASTA file (-i FASTA).')
        # if args.no_annot == True and args.report_orthologs == False:
        #     parser.error(f'Nothing to do if running in no search mode (-m {SEARCH_MODE_NO_SEARCH}), with --no_annot and without --report_orthologs.')

    else:
        parser.error(f'unrecognized search mode (-m {args.mode})')

    # Search thresholds: the generic options are copied onto the per-tool attributes.
    args.dmnd_evalue = args.mmseqs_evalue = args.hmm_evalue = args.evalue
    # NOTE: the last target used to be the local name ``args_hmm_score``, so
    # ``args.hmm_score`` was never set from --score.
    args.dmnd_score = args.mmseqs_score = args.hmm_score = args.score
    args.qcov = args.query_cover

    # Annotation options
    if args.no_annot == False or args.report_orthologs == True:
        if not pexists(get_eggnogdb_file()):
            print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
            raise EmapperException()

        args.tax_scope_mode, args.tax_scope_id = __parse_tax_scope(args.tax_scope)

        if args.target_taxa is not None:
            args.target_taxa = args.target_taxa.split(",")
        if args.excluded_taxa is not None:
            args.excluded_taxa = args.excluded_taxa.split(",")

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = {"EXP", "IDA", "IPI", "IMP", "IGI", "IEP"}
        args.go_excluded = {"ND", "IEA"}

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = {"ND", "IEA"}

    elif args.go_evidence == 'all':
        args.go_evidence = None
        args.go_excluded = None

    else:
        raise ValueError('Invalid --go_evidence value')

    # PFAM annotation options
    if args.pfam_transfer in [PFAM_TRANSFER_BEST_OG, PFAM_TRANSFER_NARROWEST_OG, PFAM_TRANSFER_SEED_ORTHOLOG]:
        pass
    else:
        raise ValueError(f'Invalid --pfam_transfer option {args.pfam_transfer}')

    if args.pfam_realign == PFAM_REALIGN_NONE:
        pass
    elif args.pfam_realign == PFAM_REALIGN_REALIGN or args.pfam_realign == PFAM_REALIGN_DENOVO:
        if not args.input:
            parser.error(f'An input fasta file is required (-i) for --pfam_realign {args.pfam_realign}')
    else:
        raise ValueError(f'Invalid --pfam_realign option {args.pfam_realign}')

    # CPUs are split evenly among all the workers of all the servers.
    total_workers = args.num_workers * args.num_servers
    if total_workers < 1:
        # Checked first: a zero product would raise ZeroDivisionError in the modulo below.
        parser.error(f"Total workers ({total_workers}) must be at least 1 (num_workers x num_servers).")
    if args.cpu < total_workers:
        parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
    if args.cpu % total_workers != 0:
        parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")

    args.cpus_per_worker = args.cpu // total_workers

    return args
help='simulate and print commands. Nothing is downloaded') parser.add_argument('-q', action="store_true", dest='quiet', help='quiet_mode') parser.add_argument("--data_dir", metavar='', type=existing_dir, help='Directory to use for DATA_PATH.') args = parser.parse_args() if args.data_dir: set_data_path(args.data_dir) # if args.force or not pexists(pjoin(get_data_path(), 'og2level.tsv.gz')): # print colorify('Downloading "og2level.tsv.gz" at %s' %get_data_path(), 'green') # download_og2level() # if 'all' in args.dbs: # args.dbs = EGGNOG_DATABASES if args.force or not pexists(pjoin(get_data_path(), 'eggnog.db')): if args.allyes or ask("Download main annotation database?") == 'y': print colorify( 'Downloading "eggnog.db" at %s...' % get_data_path(), 'green') download_annotations() else: print 'Skipping'
def parse_args(parser):
    """Parse the command line with ``parser`` and validate/normalise the result.

    Besides parsing, this function:
      * applies the data path (``EGGNOG_DATA_DIR`` first, then ``--data_dir``,
        so the command-line value is the one left in effect when both are set);
      * handles the early-exit options (``--version``, ``--list_taxa``);
      * sets the multiprocessing start method from ``args.mp_start_method``;
      * checks the option combinations required by each search mode;
      * copies the generic thresholds onto the per-search-tool attributes;
      * converts comma-separated taxa and the GO evidence keyword into the
        collections stored back on ``args``;
      * computes ``args.cpus_per_worker``.

    Args:
        parser: the argument parser; only its ``parse_args()`` and ``error()``
            methods are used here.

    Returns:
        The parsed namespace, with the derived attributes described above.

    Raises:
        EmapperException: if a required database is not present.
        ValueError: on an invalid ``--go_evidence`` or ``--pfam_realign`` value.
        SystemExit: after printing the version or the taxa list. Invalid
            option combinations are reported through ``parser.error()``.
    """
    args = parser.parse_args()

    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.version:
        try:
            version = get_full_version_info()
        except Exception:
            # Best effort: fall back to the plain version string.
            version = get_version()
        print(version)
        sys.exit(0)

    args.call_info = get_call_info()

    if args.list_taxa:
        print_taxa()
        sys.exit(0)

    # --cpu 0 means "use every CPU reported by the system".
    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()
    multiprocessing.set_start_method(args.mp_start_method)

    if args.resume == True and args.override == True:
        parser.error('Only one of --resume or --override is allowed.')

    # Gene prediction
    if args.training_genome is not None and args.training_file is None:
        parser.error('"--training_genome requires --training_file"')

    if args.training_genome is None and args.training_file is not None:
        if not os.path.isfile(args.training_file):
            parser.error('"--training_file must point to an existing file, if no --training_genome is provided."')

    # Search modes
    if args.mode == SEARCH_MODE_DIAMOND:
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise EmapperException()

        if args.input is not None:
            if args.annotate_hits_table is not None:
                print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_DIAMOND}", 'blue'))
                args.annotate_hits_table = None
        else:
            # the default -m is diamond, but we will consider -m no_search as default when
            # --annotate_hits_table has been provided and -i has not been provided
            if args.annotate_hits_table is not None:
                print(colorify(f"Assuming -m {SEARCH_MODE_NO_SEARCH}", 'blue'))
                args.mode = SEARCH_MODE_NO_SEARCH
            else:
                parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

    elif args.mode == SEARCH_MODE_MMSEQS2:
        mmseqs_db = args.mmseqs_db if args.mmseqs_db else get_eggnog_mmseqs_db()
        if not pexists(mmseqs_db):
            print(colorify('MMseqs2 database %s not present. Use download_eggnog_database.py to fetch it' % mmseqs_db, 'red'))
            raise EmapperException()

        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_MMSEQS2}", 'blue'))
            args.annotate_hits_table = None

    elif args.mode == SEARCH_MODE_HMMER:
        if not args.input:
            parser.error('An input file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        # Hmmer database
        # NOTE: hmmer database format, name and checking if exists is done within hmmer module
        if not args.db:
            parser.error('HMMER mode requires a target database (-d, --database).')

        if args.itype == ITYPE_CDS:
            args.translate = True

        if (args.itype == ITYPE_GENOME or args.itype == ITYPE_META) and args.genepred == GENEPRED_MODE_SEARCH:
            parser.error('HMMER mode is not compatible with "--genepred search" option.')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_HMMER}", 'blue'))
            args.annotate_hits_table = None

        # The literal string "none" is normalised to None.
        if args.clean_overlaps is not None:
            if args.clean_overlaps == "none":
                args.clean_overlaps = None

    elif args.mode == SEARCH_MODE_CACHE:
        if args.cache_file is None:
            parser.error('A file with annotations and md5 of queries is required (-c FILE)')
        if args.decorate_gff != DECORATE_GFF_NONE:
            print(colorify("WARNING: no GFF will be created for cache-based annotations. It is not implemented yet, sorry.", 'red'))

        if args.no_annot == True:
            parser.error(f'Cache mode (-m {SEARCH_MODE_CACHE}) should be used to annotate.')

    elif args.mode == SEARCH_MODE_NO_SEARCH:
        if args.no_annot == False and not args.annotate_hits_table:
            parser.error(f'No search mode (-m {SEARCH_MODE_NO_SEARCH}) requires a hits table to annotate (--annotate_hits_table FILE.seed_orthologs)')
        if args.md5 == True and args.input is None:
            parser.error(f'--md5 requires an input FASTA file (-i FASTA).')

    else:
        parser.error(f'unrecognized search mode (-m {args.mode})')

    # Search thresholds: the generic options are copied onto the per-tool attributes.
    args.dmnd_evalue = args.mmseqs_evalue = args.hmm_evalue = args.evalue
    # NOTE: the last target used to be the local name ``args_hmm_score``, so
    # ``args.hmm_score`` was never set from --score.
    args.dmnd_score = args.mmseqs_score = args.hmm_score = args.score
    args.qcov = args.query_cover

    # Annotation options
    if args.no_annot == False or args.report_orthologs == True:
        if not pexists(get_eggnogdb_file()):
            print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
            raise EmapperException()

        args.tax_scope_ids = parse_tax_scope(args.tax_scope)

        if args.target_taxa is not None:
            args.target_taxa = args.target_taxa.split(",")
        if args.excluded_taxa is not None:
            args.excluded_taxa = args.excluded_taxa.split(",")

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = {"EXP", "IDA", "IPI", "IMP", "IGI", "IEP"}
        args.go_excluded = {"ND", "IEA"}

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = {"ND", "IEA"}

    elif args.go_evidence == 'all':
        args.go_evidence = None
        args.go_excluded = None

    else:
        raise ValueError('Invalid --go_evidence value')

    # PFAM annotation options
    if args.pfam_realign == PFAM_REALIGN_NONE:
        pass
    elif args.pfam_realign == PFAM_REALIGN_REALIGN or args.pfam_realign == PFAM_REALIGN_DENOVO:
        if not args.input:
            parser.error(f'An input fasta file is required (-i) for --pfam_realign {args.pfam_realign}')
    else:
        raise ValueError(f'Invalid --pfam_realign option {args.pfam_realign}')

    # CPUs are split evenly among all the workers of all the servers.
    total_workers = args.num_workers * args.num_servers
    if total_workers < 1:
        # Checked first: a zero product would raise ZeroDivisionError in the modulo below.
        parser.error(f"Total workers ({total_workers}) must be at least 1 (num_workers x num_servers).")
    if args.cpu < total_workers:
        parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
    if args.cpu % total_workers != 0:
        parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")

    args.cpus_per_worker = args.cpu // total_workers

    return args