def main(args: List[str] = sys.argv[1:]) -> None:
    """Parse ``args``, make sure the master can be reached, and run the subcommand.

    Steps, in order:

    1. If ``DET_ADMIN`` is present in the environment, an extra ``delete``
       subcommand is appended to ``experiment_args_description.subs``.
    2. The parser is built with ``make_parser()`` and ``args`` is parsed. If no
       ``func`` was selected, usage is printed and the process exits with
       status 2.
    3. If a ``master.crt`` file exists under ``auth.get_config_path()``,
       ``REQUESTS_CA_BUNDLE`` is pointed at it.
    4. ``check_version`` is called. On a ``requests`` ``SSLError`` the master's
       certificate is fetched, its SHA256 fingerprint is shown to the user and,
       if accepted, the certificate is written to ``master.crt`` and trusted
       for the retry.
    5. ``parsed_args.func(parsed_args)`` is invoked.

    Failures inside steps 4-5 are reported in red through ``parser.exit`` with
    status 1; a ``KeyboardInterrupt`` anywhere after the parser exists exits
    with status 3.

    Args:
        args: Command-line arguments without the program name. NOTE: the
            default is evaluated once, when this function is defined.
    """
    # TODO(#1690): Refactor admin command(s) to a separate CLI tool.
    if "DET_ADMIN" in os.environ:
        experiment_args_description.subs.append(
            Cmd(
                "delete",
                experiment.delete_experiment,
                "delete experiment",
                [
                    Arg("experiment_id", help="delete experiment"),
                    Arg(
                        "--yes",
                        action="store_true",
                        default=False,
                        help="automatically answer yes to prompts",
                    ),
                ],
            )
        )

    # Build the parser *before* entering the ``try``: the KeyboardInterrupt
    # handler at the bottom needs ``parser``. When it was created inside the
    # ``try``, an interrupt during construction reached the handler with
    # ``parser`` still unbound and died with UnboundLocalError instead.
    parser = make_parser()

    try:
        argcomplete.autocomplete(parser)

        parsed_args = parser.parse_args(args)

        def die(message: str, always_print_traceback: bool = False) -> None:
            """Print ``message`` in red and exit with status 1 via ``parser.exit``.

            The active exception's traceback is printed first when
            ``always_print_traceback`` is set or ``DET_DEBUG`` is one of
            "true", "1" or "yes" (case-insensitive).
            """
            if always_print_traceback or os.getenv("DET_DEBUG", "").lower() in (
                "true",
                "1",
                "yes",
            ):
                import traceback

                traceback.print_exc()

            parser.exit(1, colored(message + "\n", "red"))

        v = vars(parsed_args)
        if not v.get("func"):
            parser.print_usage()
            parser.exit(2, "{}: no subcommand specified\n".format(parser.prog))

        # Trust a previously stored master certificate, if there is one.
        cert_fn = str(auth.get_config_path().joinpath("master.crt"))
        if os.path.exists(cert_fn):
            os.environ["REQUESTS_CA_BUNDLE"] = cert_fn

        try:
            try:
                check_version(parsed_args)
            except requests.exceptions.SSLError:
                # An SSLError usually means that we queried a master over HTTPS and got an
                # untrusted cert, so allow the user to store and trust the current cert. (It
                # could also mean that we tried to talk HTTPS on the HTTP port, but
                # distinguishing that based on the exception is annoying, and we'll figure
                # that out in the next step anyway.)
                addr = api.parse_master_address(parsed_args.master)
                check_not_none(addr.hostname)
                check_not_none(addr.port)
                try:
                    cert_pem_data = ssl.get_server_certificate(
                        (cast(str, addr.hostname), cast(int, addr.port))
                    )
                except ssl.SSLError:
                    die(
                        "Tried to connect over HTTPS but couldn't get a certificate from the "
                        "master; consider using HTTP"
                    )

                # Fingerprint is the SHA256 of the DER encoding, shown as
                # colon-separated two-character groups.
                cert_hash = hashlib.sha256(ssl.PEM_cert_to_DER_cert(cert_pem_data)).hexdigest()
                cert_fingerprint = ":".join(chunks(cert_hash, 2))

                if not render.yes_or_no(
                    "The master sent an untrusted certificate with this SHA256 fingerprint:\n"
                    "{}\nDo you want to trust this certificate from now on?".format(
                        cert_fingerprint
                    )
                ):
                    die("Unable to verify master certificate")

                # Persist the accepted certificate and trust it for this process too.
                with open(cert_fn, "w") as out:
                    out.write(cert_pem_data)
                os.environ["REQUESTS_CA_BUNDLE"] = cert_fn

                check_version(parsed_args)

            parsed_args.func(parsed_args)
        except KeyboardInterrupt:
            # Let the outer handler deal with interrupts; a bare ``raise``
            # re-raises the active exception unchanged.
            raise
        except (api.errors.BadRequestException, api.errors.BadResponseException) as e:
            die("Failed to {}: {}".format(parsed_args.func.__name__, e))
        except api.errors.CorruptTokenCacheException:
            die(
                "Failed to login: Attempted to read a corrupted token cache. "
                "The store has been deleted; please try again."
            )
        except Exception:
            # Anything unexpected: always show the traceback to aid debugging.
            die("Failed to {}".format(parsed_args.func.__name__), always_print_traceback=True)
    except KeyboardInterrupt:
        parser.exit(3, colored("Interrupting...\n", "red"))
# Declarative description of the `experiment` command group. Every `Cmd` is
# given a name, a handler (or None when the command only groups nested
# sub-commands), a one-line description and a list of `Arg`s / nested `Cmd`s.
# NOTE(review): the "|" in "e|xperiment" and the space-separated names such as
# "list-trials lt" presumably declare abbreviations/aliases -- confirm against
# the `Cmd` implementation.
args_description = Cmd(
    "e|xperiment",
    None,
    "manage experiments",
    [
        # Inspection commands.
        Cmd(
            "list",
            list_experiments,
            "list experiments",
            [
                Arg(
                    "--all",
                    "-a",
                    action="store_true",
                    help="show all experiments (including archived and other users')",
                ),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
            is_default=True,
        ),
        Cmd(
            "config",
            config,
            "display experiment config",
            [experiment_id_arg("experiment ID")],
        ),
        Cmd(
            "describe",
            describe,
            "describe experiment",
            [
                Arg(
                    "experiment_ids",
                    help="comma-separated list of experiment IDs to describe",
                ),
                Arg("--metrics", action="store_true", help="display full metrics"),
                Group(
                    Arg("--csv", action="store_true", help="print as CSV"),
                    Arg("--json", action="store_true", help="print as JSON"),
                    Arg("--outdir", type=Path, help="directory to save output"),
                ),
            ],
        ),
        Cmd(
            "download-model-def",
            download_model_def,
            "download model definition",
            [
                experiment_id_arg("experiment ID"),
                Arg("--output-dir", type=Path, help="output directory", default="."),
            ],
        ),
        Cmd(
            "list-trials lt",
            list_trials,
            "list trials of experiment",
            [
                experiment_id_arg("experiment ID"),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
        ),
        Cmd(
            "list-checkpoints lc",
            checkpoint.list,
            "list checkpoints of experiment",
            [
                experiment_id_arg("experiment ID"),
                Arg(
                    "--best",
                    type=int,
                    help="Return the best N checkpoints for this experiment. "
                    "If this flag is used, only checkpoints with an associated "
                    "validation metric will be considered.",
                ),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
        ),
        # Create command.
        Cmd(
            "create",
            create,
            "create experiment",
            [
                Arg(
                    "config_file",
                    type=FileType("r"),
                    help="experiment config file (.yaml)",
                ),
                Arg(
                    "model_def",
                    type=Path,
                    help="file or directory containing model definition",
                ),
                Arg(
                    "-g",
                    "--git",
                    action="store_true",
                    help="Associate git metadata with this experiment. This "
                    "flag assumes that git is installed, a .git repository "
                    "exists in the model definition directory, and that the "
                    "git working tree of that repository is empty.",
                ),
                Arg(
                    "--local",
                    action="store_true",
                    help="Create the experiment in local mode instead of submitting it to the "
                    "cluster. For more information, see documentation on det.experimental.create()",
                ),
                Arg(
                    "--template",
                    type=str,
                    help="name of template to apply to the experiment configuration",
                ),
                Group(
                    Arg(
                        "-f",
                        "--follow-first-trial",
                        action="store_true",
                        help="follow the logs of the first trial that is created",
                    ),
                    Arg(
                        "--paused",
                        action="store_true",
                        help="do not activate the experiment",
                    ),
                    Arg(
                        "-t",
                        "--test-mode",
                        action="store_true",
                        help="Test the experiment configuration and model "
                        "definition by creating and scheduling a very small "
                        "experiment. This command will verify that a training "
                        "step and validation step run successfully and that "
                        "checkpoints can be saved. The test experiment will "
                        "be archived on creation.",
                    ),
                ),
            ],
        ),
        # Lifecycle management commands.
        Cmd(
            "activate",
            activate,
            "activate experiment",
            [experiment_id_arg("experiment ID to activate")],
        ),
        Cmd(
            "cancel",
            cancel,
            "cancel experiment",
            [experiment_id_arg("experiment ID to cancel")],
        ),
        Cmd(
            "pause",
            pause,
            "pause experiment",
            [experiment_id_arg("experiment ID to pause")],
        ),
        Cmd(
            "archive",
            archive,
            "archive experiment",
            [experiment_id_arg("experiment ID to archive")],
        ),
        Cmd(
            "unarchive",
            unarchive,
            "unarchive experiment",
            [experiment_id_arg("experiment ID to unarchive")],
        ),
        Cmd(
            "download",
            download,
            "download checkpoints for an experiment",
            [
                # Help text previously read "experiment ID to cancel", a
                # copy-paste from the `cancel` command.
                experiment_id_arg("experiment ID to download checkpoints for"),
                Arg(
                    "-o",
                    "--output-dir",
                    type=str,
                    default="checkpoints",
                    help="Desired top level directory for the checkpoints. "
                    "Checkpoints will be downloaded to "
                    "<output_dir>/<checkpoint_uuid>/<checkpoint_files>.",
                ),
                Arg(
                    "--top-n",
                    type=int,
                    default=1,
                    # The adjacent literals are concatenated verbatim, so each
                    # piece must carry its own trailing space.
                    help="The number of checkpoints to download for the "
                    "experiment. The checkpoints are sorted by validation "
                    "metric as defined by --sort-by and --smaller-is-better. "
                    "This command will select the best N checkpoints from the "
                    "top performing N trials of the experiment.",
                ),
                Arg(
                    "--sort-by",
                    type=str,
                    default=None,
                    help="The name of the validation metric to sort on. Without --sort-by, the "
                    "experiment's searcher metric is assumed. If this argument is specified, "
                    "--smaller-is-better must also be specified.",
                ),
                Arg(
                    "--smaller-is-better",
                    # Parse the textual boolean given on the command line into
                    # a real bool via distutils' strtobool.
                    type=lambda s: bool(distutils.util.strtobool(s)),
                    default=None,
                    help="The sort order for metrics when using --sort-by. For "
                    "example, 'accuracy' would require passing '--smaller-is-better false'. If "
                    "--sort-by is specified, this argument must be specified.",
                ),
                Arg(
                    "-q",
                    "--quiet",
                    action="store_true",
                    help="Only print the paths to the checkpoints.",
                ),
            ],
        ),
        Cmd(
            "kill",
            kill_experiment,
            "kill experiment",
            [Arg("experiment_id", help="experiment ID")],
        ),
        Cmd(
            "wait",
            wait,
            "wait for experiment to reach terminal state",
            [
                experiment_id_arg("experiment ID"),
                Arg(
                    "--polling-interval",
                    type=int,
                    default=5,
                    help="the interval (in seconds) to poll for updated state",
                ),
            ],
        ),
        # Attribute setting commands.
        Cmd(
            "label",
            None,
            "manage experiment labels",
            [
                Cmd(
                    "add",
                    add_label,
                    "add label",
                    [
                        experiment_id_arg("experiment ID"),
                        Arg("label", help="label"),
                    ],
                ),
                Cmd(
                    "remove",
                    remove_label,
                    "remove label",
                    [
                        experiment_id_arg("experiment ID"),
                        Arg("label", help="label"),
                    ],
                ),
            ],
        ),
        Cmd(
            "set",
            None,
            "set experiment attributes",
            [
                Cmd(
                    "description",
                    set_description,
                    "set experiment description",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg("description", help="experiment description"),
                    ],
                ),
                Cmd(
                    "gc-policy",
                    set_gc_policy,
                    "set experiment GC policy and run GC",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg(
                            "--save-experiment-best",
                            type=int,
                            required=True,
                            help="number of best checkpoints per experiment "
                            "to save",
                        ),
                        Arg(
                            "--save-trial-best",
                            type=int,
                            required=True,
                            help="number of best checkpoints per trial to save",
                        ),
                        Arg(
                            "--save-trial-latest",
                            type=int,
                            required=True,
                            help="number of latest checkpoints per trial to save",
                        ),
                        Arg(
                            "--yes",
                            action="store_true",
                            default=False,
                            help="automatically answer yes to prompts",
                        ),
                    ],
                ),
                Cmd(
                    "max-slots",
                    set_max_slots,
                    "set `max_slots` of experiment",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg("max_slots", type=none_or_int, help="max slots"),
                    ],
                ),
                Cmd(
                    "weight",
                    set_weight,
                    "set `weight` of experiment",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg("weight", type=float, help="weight"),
                    ],
                ),
            ],
        ),
    ],
)
help="master address", metavar="address", default=get_default_master_address()), Arg("-v", "--version", action="version", help="print CLI version and exit", version="%(prog)s {}".format(determined_cli.__version__)), experiment.args_description, checkpoint.args_description, Cmd( "task", None, "manage tasks (commands, experiments, notebooks, shells, tensorboards)", [ Cmd("list", list_tasks, "list tasks in cluster", [ Arg("--csv", action="store_true", help="print as CSV"), ], is_default=True), ]), Cmd("preview-search", preview_search, "preview search", [ Arg("config_file", type=FileType("r"), help="experiment config file (.yaml)") ]), ] # type: List[object] # fmt: on all_args_description = (args_description + master_args_description +
# Declarative description of the `experiment` command group. Every `Cmd` is
# given a name, a second positional that is either a callable or None (None is
# used here only for commands that contain nested `Cmd`s), a one-line
# description and a list of `Arg`s / `Group`s / nested `Cmd`s.
# NOTE(review): the "|" in "e|xperiment" and the space-separated names such as
# "list-trials lt" presumably declare abbreviations/aliases, and
# `is_default=True` presumably selects the sub-command used when none is
# given -- confirm against the `Cmd` implementation.
args_description = Cmd(
    "e|xperiment",
    None,
    "manage experiments",
    [
        # Inspection commands.
        Cmd(
            "list",
            list_experiments,
            "list experiments",
            [
                Arg(
                    "--all",
                    "-a",
                    action="store_true",
                    help="show all experiments (including archived and other users')",
                ),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
            is_default=True,
        ),
        Cmd("config", config, "display experiment config",
            [experiment_id_arg("experiment ID")]),
        Cmd(
            "describe",
            describe,
            "describe experiment",
            [
                # Unlike the other commands, this one takes several IDs in a
                # single comma-separated positional.
                Arg("experiment_ids",
                    help="comma-separated list of experiment IDs to describe"),
                Arg("--metrics", action="store_true", help="display full metrics"),
                # Output options. NOTE(review): how `Group` constrains its
                # members is not visible here -- confirm.
                Group(
                    Arg("--csv", action="store_true", help="print as CSV"),
                    Arg("--json", action="store_true", help="print as JSON"),
                    Arg("--outdir", type=Path, help="directory to save output"),
                ),
            ],
        ),
        Cmd(
            "download-model-def",
            download_model_def,
            "download model definition",
            [
                experiment_id_arg("experiment ID"),
                Arg("--output-dir", type=Path, help="output directory", default="."),
            ],
        ),
        Cmd(
            "list-trials lt",
            list_trials,
            "list trials of experiment",
            [
                experiment_id_arg("experiment ID"),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
        ),
        # Handled by the `checkpoint` module rather than a function in this one.
        Cmd(
            "list-checkpoints lc",
            checkpoint.list,
            "list checkpoints of experiment",
            [
                experiment_id_arg("experiment ID"),
                Arg(
                    "--best",
                    type=int,
                    help="Return the best N checkpoints for this experiment. "
                    "If this flag is used, only checkpoints with an associated "
                    "validation metric will be considered.",
                ),
                Arg(
                    "-d",
                    "--download-dir",
                    type=Path,
                    help="download the listed checkpoints to this directory. "
                    "The resources of each checkpoint will be saved in a "
                    "subdirectory labeled with the experiment ID, trial ID, "
                    "and step ID. This flag is only supported for experiments "
                    "configured to use S3 or GCS checkpoint storage.",
                ),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
        ),
        # Create command.
        Cmd(
            "create",
            create,
            "create experiment",
            [
                # Two required positionals: the config file (opened for
                # reading by argparse's FileType) and the model definition path.
                Arg("config_file", type=FileType("r"),
                    help="experiment config file (.yaml)"),
                Arg("model_def", type=Path,
                    help="file or directory containing model definition"),
                Arg(
                    "-g",
                    "--git",
                    action="store_true",
                    help="Associate git metadata with this experiment. This "
                    "flag assumes that git is installed, a .git repository "
                    "exists in the model definition directory, and that the "
                    "git working tree of that repository is empty.",
                ),
                Arg(
                    "--local",
                    action="store_true",
                    help="Create the experiment in local mode instead of submitting it to the "
                    "cluster. For more information, see documentation on det.experimental.create() "
                    "and det.experimental.Mode.LOCAL",
                ),
                Arg(
                    "--template",
                    type=str,
                    help="name of template to apply to the experiment configuration",
                ),
                # Post-creation behaviour flags, grouped together.
                Group(
                    Arg(
                        "-f",
                        "--follow-first-trial",
                        action="store_true",
                        help="follow the logs of the first trial that is created",
                    ),
                    Arg("--paused", action="store_true",
                        help="do not activate the experiment"),
                    Arg(
                        "-t",
                        "--test-mode",
                        action="store_true",
                        help="Test the experiment configuration and model "
                        "definition by creating and scheduling a very small "
                        "experiment. This command will verify that a training "
                        "step and validation step run successfully and that "
                        "checkpoints can be saved. The test experiment will "
                        "be archived on creation.",
                    ),
                ),
            ],
        ),
        # Lifecycle management commands.
        Cmd(
            "activate",
            activate,
            "activate experiment",
            [experiment_id_arg("experiment ID to activate")],
        ),
        Cmd("cancel", cancel, "cancel experiment",
            [experiment_id_arg("experiment ID to cancel")]),
        Cmd("pause", pause, "pause experiment",
            [experiment_id_arg("experiment ID to pause")]),
        Cmd(
            "archive",
            archive,
            "archive experiment",
            [experiment_id_arg("experiment ID to archive")],
        ),
        Cmd(
            "unarchive",
            unarchive,
            "unarchive experiment",
            [experiment_id_arg("experiment ID to unarchive")],
        ),
        # NOTE(review): `kill` declares its ID with a plain `Arg` instead of
        # `experiment_id_arg` like its siblings -- confirm this is intentional.
        Cmd("kill", kill_experiment, "kill experiment",
            [Arg("experiment_id", help="experiment ID")]),
        Cmd(
            "wait",
            wait,
            "wait for experiment to reach terminal state",
            [
                experiment_id_arg("experiment ID"),
                Arg(
                    "--polling-interval",
                    type=int,
                    default=5,
                    help="the interval (in seconds) to poll for updated state",
                ),
            ],
        ),
        # Attribute setting commands.
        # `label` and `set` carry no callable of their own; they only hold
        # nested sub-commands.
        Cmd(
            "label",
            None,
            "manage experiment labels",
            [
                Cmd(
                    "add",
                    add_label,
                    "add label",
                    [
                        experiment_id_arg("experiment ID"),
                        Arg("label", help="label")
                    ],
                ),
                Cmd(
                    "remove",
                    remove_label,
                    "remove label",
                    [
                        experiment_id_arg("experiment ID"),
                        Arg("label", help="label")
                    ],
                ),
            ],
        ),
        Cmd(
            "set",
            None,
            "set experiment attributes",
            [
                Cmd(
                    "description",
                    set_description,
                    "set experiment description",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg("description", help="experiment description"),
                    ],
                ),
                Cmd(
                    "gc-policy",
                    set_gc_policy,
                    "set experiment GC policy and run GC",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        # All three retention counts are mandatory options.
                        Arg(
                            "--save-experiment-best",
                            type=int,
                            required=True,
                            help="number of best checkpoints per experiment "
                            "to save",
                        ),
                        Arg(
                            "--save-trial-best",
                            type=int,
                            required=True,
                            help="number of best checkpoints per trial to save",
                        ),
                        Arg(
                            "--save-trial-latest",
                            type=int,
                            required=True,
                            help="number of latest checkpoints per trial to save",
                        ),
                        Arg(
                            "--yes",
                            action="store_true",
                            default=False,
                            help="automatically answer yes to prompts",
                        ),
                    ],
                ),
                Cmd(
                    "max-slots",
                    set_max_slots,
                    "set `max_slots` of experiment",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        # NOTE(review): `none_or_int` presumably accepts either
                        # an integer or a "none" marker -- confirm.
                        Arg("max_slots", type=none_or_int, help="max slots"),
                    ],
                ),
                Cmd(
                    "weight",
                    set_weight,
                    "set `weight` of experiment",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg("weight", type=float, help="weight"),
                    ],
                ),
            ],
        ),
    ],
)