def local_experiment(args: Namespace) -> None:
    """Load the configured trial class and pass it to ``test_one_batch`` locally.

    Reads ``args.test_mode``, ``args.config_file``, ``args.config`` and
    ``args.model_def``.

    Raises:
        NotImplementedError: if ``args.test_mode`` is falsy, or if the
            configured entrypoint is rejected by
            ``det.util.match_legacy_trial_class``.
    """
    if not args.test_mode:
        raise NotImplementedError(
            "Local training mode (--local mode without --test mode) is not yet supported. Please "
            "try local test mode by adding the --test flag or cluster training mode by removing "
            "the --local flag."
        )

    config = _parse_config_file_or_exit(args.config_file, args.config)
    entrypoint = config["entrypoint"]

    # --local --test mode only makes sense for the legacy trial entrypoints. Otherwise the user
    # would just run their training script directly.
    is_legacy_trial = det.util.match_legacy_trial_class(entrypoint)
    if not is_legacy_trial:
        raise NotImplementedError(
            "Local test mode (--local --test) is only supported for Trial-like entrypoints. "
            "Script-like entrypoints are not supported, but maybe you can just invoke your script "
            "directly?"
        )

    debug_enabled = bool(config.get("debug", False))
    set_logger(debug_enabled)

    model_dir = args.model_def.resolve()
    with _local_execution_manager(model_dir):
        trial_cls = determined.load.trial_class_from_entrypoint(entrypoint)
        determined.experimental.test_one_batch(trial_class=trial_cls, config=config)
def _load_trial_on_local(
    context_dir: pathlib.Path,
    training: bool,
    config: Dict[str, Any],
    hparams: Dict[str, Any],
) -> Tuple[Type[det.Trial], det.TrialContext]:
    """Load the trial class named by ``config["entrypoint"]`` and build its context.

    All work happens inside ``det._local_execution_manager(context_dir)``.

    Returns:
        A ``(trial_class, trial_context)`` pair, where the context is created
        by ``trial_class.trial_context_class``.
    """
    with det._local_execution_manager(context_dir):
        entrypoint = config["entrypoint"]
        trial_cls = load.load_trial_implementation(entrypoint)

        local_env = det._make_local_execution_env(training, config, hparams)
        # The middle element (rendezvous info) is not consumed by this variant.
        env, _rendezvous_info, hvd_config = local_env

        context = trial_cls.trial_context_class(env, hvd_config)
        return trial_cls, context
def _load_trial_on_local(
    context_dir: pathlib.Path,
    managed_training: bool,
    config: Dict[str, Any],
    hparams: Dict[str, Any],
) -> Tuple[Type[det.Trial], det.TrialContext]:
    """Resolve the trial class from ``config["entrypoint"]`` and build its context.

    All work happens inside ``det._local_execution_manager(context_dir)``. The
    local execution environment is always requested with ``test_mode=False``.

    Returns:
        A ``(trial_class, trial_context)`` pair, where the context is created
        by ``trial_class.trial_context_class``.
    """
    with det._local_execution_manager(context_dir):
        entrypoint = config["entrypoint"]
        trial_cls = load.trial_class_from_entrypoint(entrypoint)

        local_env = det._make_local_execution_env(
            managed_training=managed_training,
            test_mode=False,
            config=config,
            hparams=hparams,
        )
        env, rendezvous, hvd_cfg = local_env

        context = trial_cls.trial_context_class(env, hvd_cfg, rendezvous)
        return trial_cls, context
def test_test_one_batch() -> None:
    """Call ``experimental.test_one_batch`` on ``XORTrial`` from its own directory.

    The local execution manager is entered with the directory that contains
    the ``pytorch_xor_model`` module.
    """
    model_dir = pathlib.Path(pytorch_xor_model.__file__).parent
    hyperparameters = {
        "hidden_size": 2,
        "learning_rate": 0.5,
        "global_batch_size": 4,
    }

    with det._local_execution_manager(model_dir):
        experimental.test_one_batch(
            trial_class=pytorch_xor_model.XORTrial,
            config={"hyperparameters": hyperparameters},
        )
def local_experiment(args: Namespace) -> None:
    """Load the configured trial class and pass it to ``test_one_batch`` locally.

    Reads ``args.test_mode``, ``args.config_file``, ``args.config`` and
    ``args.model_def``.

    Raises:
        NotImplementedError: if ``args.test_mode`` is falsy.
    """
    if not args.test_mode:
        raise NotImplementedError(
            "Local training mode (--local mode without --test mode) is not yet supported. Please "
            "try local test mode by adding the --test flag or cluster training mode by removing "
            "the --local flag."
        )

    config = _parse_config_file_or_exit(args.config_file, args.config)

    debug_enabled = bool(config.get("debug", False))
    set_logger(debug_enabled)

    model_dir = args.model_def.resolve()
    with _local_execution_manager(model_dir):
        entrypoint = config["entrypoint"]
        trial_cls = determined.load.trial_class_from_entrypoint(entrypoint)
        determined.experimental.test_one_batch(trial_class=trial_cls, config=config)
def local_experiment(args: Namespace) -> None:
    """Load the configured trial class and pass it to ``test_one_batch`` locally.

    The ``determined`` package is imported lazily inside the function. If that
    import fails, a hint is printed and the ``ImportError`` is re-raised.

    Reads ``args.test_mode``, ``args.config_file`` and ``args.model_def``.

    Raises:
        ImportError: if the ``determined`` package cannot be imported.
        NotImplementedError: if ``args.test_mode`` is falsy.
    """
    try:
        import determined as det
        from determined import experimental, load
    except ImportError as err:
        print("--local requires that the `determined` package is installed.")
        raise err

    if not args.test_mode:
        raise NotImplementedError(
            "Local training mode (--local mode without --test mode) is not yet supported. Please "
            "try local test mode by adding the --test flag or cluster training mode by removing "
            "the --local flag."
        )

    config = _parse_config_file_or_exit(args.config_file)

    # NOTE(review): the local import above binds only the alias `det`; this call
    # relies on the name `determined` being bound at module scope -- confirm.
    debug_enabled = bool(config.get("debug", False))
    determined.common.set_logger(debug_enabled)

    model_dir = args.model_def.resolve()
    with det._local_execution_manager(model_dir):
        entrypoint = config["entrypoint"]
        trial_cls = load.trial_class_from_entrypoint(entrypoint)
        experimental.test_one_batch(trial_class=trial_cls, config=config)
def init_native(
    trial_def: Optional[Type[det.Trial]] = None,
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    """Dispatch to local single-batch testing or to cluster-mode initialization.

    Logging is configured first; debug is enabled when either
    ``util.debug_mode()`` or the experiment config's ``debug_enabled()`` is
    truthy.

    When ``local`` is falsy, every argument except ``local`` is forwarded to
    ``_init_cluster_mode`` and its result is returned. When ``local`` is
    truthy, ``test_one_batch`` runs inside the local execution manager rooted
    at the resolved ``context_dir``; if ``test`` is falsy a warning is logged
    first, because local training is not supported.
    """
    debug_enabled = util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    determined.common.set_logger(debug_enabled)

    if not local:
        return _init_cluster_mode(
            trial_def=trial_def,
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )

    if not test:
        logging.warning("local training is not supported, testing instead")

    resolved_dir = pathlib.Path(context_dir).resolve()
    with det._local_execution_manager(resolved_dir):
        return test_one_batch(
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            trial_class=trial_def,
            config=config,
        )
def create(
    trial_def: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    # TODO: Add a reference to the local development tutorial.
    """
    Create an experiment.

    Arguments:
        trial_def:
            A class definition implementing the :class:`determined.Trial`
            interface.

        config:
            A dictionary representing the experiment configuration to be
            associated with the experiment.

        local:
            A boolean indicating if training should be done locally. When
            ``False``, the experiment will be submitted to the Determined
            cluster. Defaults to ``False``.

        test:
            A boolean indicating if the experiment should be shortened to a
            minimal loop of training on a small amount of data, performing
            validation, and checkpointing. ``test=True`` is useful for quick
            iteration during model porting or debugging because common errors
            will surface more quickly. Defaults to ``False``.

        context_dir:
            A string filepath that defines the context directory. All model
            code will be executed with this as the current working directory.

            When ``local=False``, this argument is required. All files in this
            directory will be uploaded to the Determined cluster. The total
            size of this directory must be under 96 MB.

            When ``local=True``, this argument is optional and defaults to the
            current working directory.

        command:
            A list of strings that is used as the entrypoint of the training
            script in the Determined task environment. When executing this
            function via a Python script, this argument is inferred to be
            ``sys.argv`` by default. When executing this function via IPython
            or Jupyter notebook, this argument is required.

            Example: When creating an experiment by running
            ``python train.py --flag value``, the default command is inferred
            as ``["train.py", "--flag", "value"]``.

        master_url:
            An optional string to use as the Determined master URL when
            ``local=False``. If not specified, will be inferred from the
            environment variable ``DET_MASTER``.

    Raises:
        NotImplementedError: if ``local=True`` is combined with ``test=False``.
    """
    if local and not test:
        raise NotImplementedError(
            "det.create(local=True, test=False) is not yet implemented. Please set local=False "
            "or test=True."
        )

    debug_enabled = util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    determined.common.set_logger(debug_enabled)

    if not local:
        if load.RunpyGlobals.is_initialized():
            # Cluster mode, now on the cluster; actually train.
            load.RunpyGlobals.set_runpy_trial_result(trial_def)
            raise det.errors.StopLoadingImplementation()

        # Cluster mode, but still running locally; submit the experiment.
        _submit_experiment(
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )
        return None

    # Local test mode.
    resolved_dir = pathlib.Path(context_dir).resolve()
    with det._local_execution_manager(resolved_dir):
        return test_one_batch(trial_class=trial_def, config=config)