def die(message: str, always_print_traceback: bool = False) -> None:
    """
    Report a fatal error and terminate through ``parser.exit`` with status 1.

    Arguments:
        message: Text to display; a trailing newline is appended and the result is
            colored red before being handed to ``parser.exit``.
        always_print_traceback: When ``True``, the active exception's traceback is
            printed regardless of debug mode. Otherwise it is printed only if
            ``debug_mode()`` is truthy.
    """
    # Short-circuit is intentional: debug_mode() is only consulted when the caller
    # did not explicitly ask for a traceback.
    wants_traceback = always_print_traceback or debug_mode()
    if wants_traceback:
        import traceback

        traceback.print_exc()

    # NOTE(review): `parser` is not a parameter here; it must be supplied by an
    # enclosing or module scope -- confirm against the rest of the file.
    parser.exit(1, colored(message + "\n", "red"))
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Build an instance of a Trial class outside of a managed training run.

    This is a convenience for exercising trial logic while debugging in any
    development environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir: The checkpoint directory that the trial will use for
            loading and saving checkpoints.
        config: An optional experiment configuration that is used to initialize
            the :class:`determined.TrialContext`. If not specified, a minimal
            default is used.
        hparams: An optional dictionary forwarded, together with ``config``, to
            the local execution environment factory (presumably the trial's
            hyperparameters -- confirm against ``det._make_local_execution_env``).

    Returns:
        The object produced by calling ``trial_def`` with a freshly constructed
        trial context.
    """
    # NOTE(review): `checkpoint_dir` is accepted but not referenced anywhere in
    # this body -- confirm whether it should be forwarded to the local environment.
    debug_logging = util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    determined.common.set_logger(debug_logging)

    local_env = det._make_local_execution_env(
        managed_training=False,
        test_mode=False,
        config=config,
        hparams=hparams,
    )
    # The factory yields three values; the rendezvous info is not needed here.
    env, _rendezvous_info, hvd_config = local_env

    context = trial_def.trial_context_class(env, hvd_config)
    return trial_def(context)
def init_native(
    trial_def: Optional[Type[det.Trial]] = None,
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    """
    Configure logging, then dispatch to either a local one-batch test or cluster mode.

    Arguments:
        trial_def: Optional Trial class, forwarded to the selected backend.
        controller_cls: Optional TrialController class, forwarded to the selected backend.
        native_context_cls: Optional NativeContext class, forwarded to the selected backend.
        config: Optional experiment configuration dictionary. It is also used to
            decide whether debug logging is enabled.
        local: When ``True``, run ``test_one_batch`` locally; otherwise delegate to
            ``_init_cluster_mode``.
        test: Forwarded to cluster mode. In local mode a warning is logged when this
            is ``False``, because only testing is performed locally.
        context_dir: Context directory; resolved to an absolute path for local runs
            and passed through unchanged for cluster mode.
        command: Forwarded to cluster mode only.
        master_url: Forwarded to cluster mode only.

    Returns:
        Whatever the selected backend (``test_one_batch`` or ``_init_cluster_mode``)
        returns.
    """
    debug_logging = util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    determined.common.set_logger(debug_logging)

    if not local:
        return _init_cluster_mode(
            trial_def=trial_def,
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )

    # Local execution only supports the abbreviated test loop; tell the user when
    # they asked for full training.
    if not test:
        logging.warning("local training is not supported, testing instead")

    resolved_context = pathlib.Path(context_dir).resolve()
    with det._local_execution_manager(resolved_context):
        return test_one_batch(
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            trial_class=trial_def,
            config=config,
        )
def create(
    trial_def: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    # TODO: Add a reference to the local development tutorial.
    """
    Create an experiment.

    Arguments:
        trial_def:
            A class definition implementing the :class:`determined.Trial`
            interface.

        config:
            A dictionary representing the experiment configuration to be
            associated with the experiment.

        local:
            A boolean indicating if training should be done locally. When
            ``False``, the experiment will be submitted to the Determined
            cluster. Defaults to ``False``.

        test:
            A boolean indicating if the experiment should be shortened to a
            minimal loop of training on a small amount of data, performing
            validation, and checkpointing. ``test=True`` is useful for quick
            iteration during model porting or debugging because common errors
            will surface more quickly. Defaults to ``False``.

        context_dir:
            A string filepath that defines the context directory. All model
            code will be executed with this as the current working directory.

            When ``local=False``, this argument is required. All files in this
            directory will be uploaded to the Determined cluster. The total
            size of this directory must be under 96 MB.

            When ``local=True``, this argument is optional and defaults to
            the current working directory.

        command:
            A list of strings that is used as the entrypoint of the training
            script in the Determined task environment. When executing this
            function via a Python script, this argument is inferred to be
            ``sys.argv`` by default. When executing this function via IPython
            or Jupyter notebook, this argument is required.

            Example: When creating an experiment by running
            ``python train.py --flag value``, the default command is inferred
            as ``["train.py", "--flag", "value"]``.

        master_url:
            An optional string to use as the Determined master URL when
            ``local=False``. If not specified, will be inferred from the
            environment variable ``DET_MASTER``.

    Raises:
        NotImplementedError: If ``local=True`` is combined with ``test=False``.
        det.errors.StopLoadingImplementation: In cluster mode when already running
            on the cluster, after the trial class has been recorded.
    """
    # Reject the unsupported combination before touching logging or anything else.
    if local and not test:
        raise NotImplementedError(
            "det.create(local=True, test=False) is not yet implemented. Please set local=False "
            "or test=True."
        )

    debug_logging = util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    determined.common.set_logger(debug_logging)

    if not local:
        if load.RunpyGlobals.is_initialized():
            # Cluster mode, now on the cluster; actually train.
            load.RunpyGlobals.set_runpy_trial_result(trial_def)
            raise det.errors.StopLoadingImplementation()

        # Cluster mode, but still running locally; submit the experiment.
        _submit_experiment(
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )
        return None

    # Local test mode.
    resolved_context = pathlib.Path(context_dir).resolve()
    with det._local_execution_manager(resolved_context):
        return test_one_batch(trial_class=trial_def, config=config)
def main(args: Optional[List[str]] = None) -> None:
    """
    CLI entry point: build the argument parser, parse ``args`` and run the subcommand.

    Arguments:
        args: Command-line arguments without the program name. When ``None`` (the
            default), ``sys.argv[1:]`` is read at call time rather than once at
            import time, so later changes to ``sys.argv`` are honored.

    Exit statuses produced by this function:
        1: a subcommand failed (reported through the nested ``die`` helper).
        2: no subcommand was specified.
        3: the user interrupted the CLI (``KeyboardInterrupt``).
    """
    if args is None:
        args = sys.argv[1:]

    # TODO: we lazily import "det deploy" but in the future we'd want to lazily import everything.
    parser = make_parser()

    full_cmd, aliases = generate_aliases(deploy_cmd.name)
    is_deploy_cmd = bool(args) and args[0] in (*aliases, full_cmd)
    if is_deploy_cmd:
        from determined.deploy.cli import args_description as deploy_args_description

        add_args(parser, [deploy_args_description])
    else:
        add_args(parser, all_args_description)

    try:
        argcomplete.autocomplete(parser)

        parsed_args = parser.parse_args(args)

        def die(message: str, always_print_traceback: bool = False) -> None:
            """Optionally print the active traceback, then exit with status 1."""
            if always_print_traceback or debug_mode():
                import traceback

                traceback.print_exc(file=sys.stderr)

            parser.exit(1, colored(message + "\n", "red"))

        v = vars(parsed_args)
        if not v.get("func"):
            parser.print_usage()
            parser.exit(2, "{}: no subcommand specified\n".format(parser.prog))

        try:
            # For `det deploy`, skip interaction with master.
            if is_deploy_cmd:
                parsed_args.func(parsed_args)
                return

            # Configure the CLI's Cert singleton.
            certs.cli_cert = certs.default_load(parsed_args.master)

            try:
                check_version(parsed_args)
            except requests.exceptions.SSLError:
                # An SSLError usually means that we queried a master over HTTPS and got an untrusted
                # cert, so allow the user to store and trust the current cert. (It could also mean
                # that we tried to talk HTTPS on the HTTP port, but distinguishing that based on the
                # exception is annoying, and we'll figure that out in the next step anyway.)
                addr = api.parse_master_address(parsed_args.master)
                check_not_none(addr.hostname)
                check_not_none(addr.port)
                try:
                    ctx = SSL.Context(SSL.TLSv1_2_METHOD)
                    # The socket is only needed long enough to read the peer's certificate
                    # chain; the `with` block guarantees it is closed on every path.
                    with socket.socket() as sock:
                        conn = SSL.Connection(ctx, sock)
                        conn.set_tlsext_host_name(cast(str, addr.hostname).encode())
                        conn.connect(
                            cast(Sequence[Union[str, int]], (addr.hostname, addr.port))
                        )
                        conn.do_handshake()
                        cert_pem_data = "".join(
                            crypto.dump_certificate(crypto.FILETYPE_PEM, cert).decode()
                            for cert in conn.get_peer_cert_chain()
                        )
                except crypto.Error:
                    die(
                        "Tried to connect over HTTPS but couldn't get a certificate from the "
                        "master; consider using HTTP"
                    )

                cert_hash = hashlib.sha256(ssl.PEM_cert_to_DER_cert(cert_pem_data)).hexdigest()
                cert_fingerprint = ":".join(chunks(cert_hash, 2))

                if not render.yes_or_no(
                    "The master sent an untrusted certificate chain with this SHA256 fingerprint:\n"
                    "{}\nDo you want to trust this certificate from now on?".format(
                        cert_fingerprint
                    )
                ):
                    die("Unable to verify master certificate")

                certs.CertStore(certs.default_store()).set_cert(parsed_args.master, cert_pem_data)
                # Reconfigure the CLI's Cert singleton, but preserve the certificate name.
                old_cert_name = certs.cli_cert.name
                certs.cli_cert = certs.Cert(cert_pem=cert_pem_data, name=old_cert_name)

                check_version(parsed_args)

            parsed_args.func(parsed_args)
        except KeyboardInterrupt:
            # Let the outer handler deal with interrupts; a bare raise keeps the
            # original traceback intact.
            raise
        except (api.errors.BadRequestException, api.errors.BadResponseException) as e:
            die("Failed to {}: {}".format(parsed_args.func.__name__, e))
        except api.errors.CorruptTokenCacheException:
            die(
                "Failed to login: Attempted to read a corrupted token cache. "
                "The store has been deleted; please try again."
            )
        except EnterpriseOnlyError as e:
            die(f"Determined Enterprise Edition is required for this functionality: {e}")
        except Exception:
            die("Failed to {}".format(parsed_args.func.__name__), always_print_traceback=True)
    except KeyboardInterrupt:
        # die() may not be defined yet.
        if debug_mode():
            import traceback

            traceback.print_exc(file=sys.stderr)
        print(colored("Interrupting...\n", "red"), file=sys.stderr)
        # sys.exit is always available; the bare `exit` builtin is injected by the
        # `site` module and may be missing (e.g. `python -S`, frozen executables).
        sys.exit(3)