def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Create a trial instance from a Trial class definition. This can be a
    useful utility for debugging your trial logic in any development
    environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            The checkpoint directory that the trial will use for loading and
            saving checkpoints.
        config:
            An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. If not specified, a minimal
            default is used.
        hparams:
            An optional dictionary of hyperparameter values used to initialize
            the trial context.
    """
    # Enable debug logging when either the environment requests it or the
    # experiment config itself turns it on.
    determined_common.set_logger(
        util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    )
    # Rendezvous info is irrelevant for a local, single-process trial, so the
    # middle element of the tuple is discarded.
    env, _, hvd_config = det._make_local_execution_env(False, config, hparams)
    trial_context = trial_def.trial_context_class(env, hvd_config)
    return trial_def(trial_context)
def test_one_batch(
    trial_class: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Train and evaluate a single batch of the given Trial locally, as a quick
    smoke test of the trial logic before submitting to a cluster.

    Arguments:
        trial_class: A class definition that inherits from the det.Trial interface.
        config: An optional experiment configuration; a minimal default is used
            when omitted.
    """
    # Override the scheduling_unit value to 1 so exactly one batch is run.
    config = {**(config or {}), "scheduling_unit": 1}

    logging.info("Running a minimal test experiment locally")
    # Use the context-manager form so the temporary checkpoint directory is
    # removed even when the controller raises (the previous version leaked it
    # on the failure path).
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        env, rendezvous_info, hvd_config = det._make_local_execution_env(
            managed_training=True, test_mode=True, config=config, limit_gpus=1
        )
        workloads = _make_test_workloads(
            pathlib.Path(checkpoint_dir).joinpath("checkpoint"), env.experiment_config
        )
        logging.info(f"Using hyperparameters: {env.hparams}.")
        logging.debug(f"Using a test experiment config: {env.experiment_config}.")

        # Case 2: test one batch for Trial implementation.
        controller = load.load_trial(
            trial_class=trial_class,
            env=env,
            workloads=workloads,
            load_path=None,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )
        controller.run()

    logging.info(
        "Note: to submit an experiment to the cluster, change local parameter to False"
    )
def _load_trial_on_local(
    context_dir: pathlib.Path,
    training: bool,
    config: Dict[str, Any],
    hparams: Dict[str, Any],
) -> Tuple[Type[det.Trial], det.TrialContext]:
    """
    Resolve the trial class named by ``config["entrypoint"]`` and construct a
    matching trial context for local execution.

    Returns a ``(trial_class, trial_context)`` pair.
    """
    with det._local_execution_manager(context_dir):
        # Import the user's Trial subclass from the configured entrypoint.
        loaded_class = load.load_trial_implementation(config["entrypoint"])
        env, rendezvous_info, hvd_config = det._make_local_execution_env(
            training, config, hparams
        )
        ctx = loaded_class.trial_context_class(env, hvd_config)
        return loaded_class, ctx
def _load_trial_on_local(
    context_dir: pathlib.Path,
    managed_training: bool,
    config: Dict[str, Any],
    hparams: Dict[str, Any],
) -> Tuple[Type[det.Trial], det.TrialContext]:
    """
    Resolve the trial class named by ``config["entrypoint"]`` and construct a
    matching trial context for local execution.

    Returns a ``(trial_class, trial_context)`` pair.
    """
    with det._local_execution_manager(context_dir):
        # Import the user's Trial subclass from the configured entrypoint.
        loaded_class = load.trial_class_from_entrypoint(config["entrypoint"])
        env, rendezvous_info, hvd_config = det._make_local_execution_env(
            managed_training=managed_training,
            test_mode=False,
            config=config,
            hparams=hparams,
        )
        ctx = loaded_class.trial_context_class(env, hvd_config, rendezvous_info)
        return loaded_class, ctx
def test_one_batch(
    trial_class: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Train and evaluate a single batch of the given Trial locally, as a quick
    smoke test of the trial logic before submitting to a cluster.

    Arguments:
        trial_class: A class definition that inherits from the det.Trial interface.
        config: An optional experiment configuration; a minimal default is used
            when omitted.
    """
    # Force a scheduling unit of exactly one batch.
    merged_config = dict(config or {})
    merged_config["scheduling_unit"] = 1

    logging.info("Running a minimal test experiment locally")
    with tempfile.TemporaryDirectory() as ckpt_dir:
        core_context, env = det._make_local_execution_env(
            managed_training=True,
            test_mode=True,
            config=merged_config,
            checkpoint_dir=ckpt_dir,
            limit_gpus=1,
        )
        workloads = _make_test_workloads(env.experiment_config)
        logging.info(f"Using hyperparameters: {env.hparams}.")
        logging.debug(f"Using a test experiment config: {env.experiment_config}.")

        backend = det._DistributedBackend()
        controller_cls = trial_class.trial_controller_class
        assert controller_cls is not None
        controller_cls.pre_execute_hook(env, backend)

        context = trial_class.trial_context_class(core_context, env)
        logging.info(f"Creating {trial_class.__name__}.")
        trial = trial_class(context)

        controller = controller_cls.from_trial(
            trial_inst=trial,
            context=context,
            env=env,
            workloads=workloads,
        )
        controller.run()

        logging.info("The test experiment passed.")
        logging.info(
            "Note: to submit an experiment to the cluster, change local parameter to False"
        )
def from_config(cls, config: Dict[str, Any]) -> "TrialContext":
    """
    Create a context object suitable for debugging outside of Determined.

    An example for a subclass of
    :class:`~determined.pytorch._pytorch_trial.PyTorchTrial`:

    .. code-block:: python

        config = { ... }
        context = det.pytorch.PyTorchTrialContext.from_config(config)
        my_trial = MyPyTorchTrial(context)

        train_ds = my_trial.build_training_data_loader()
        for epoch_idx in range(3):
            for batch_idx, batch in enumerate(train_ds):
                metrics = my_trial.train_batch(batch, epoch_idx, batch_idx)
                ...

    An example for a subclass of
    :class:`~determined.keras._tf_keras_trial.TFKerasTrial`:

    .. code-block:: python

        config = { ... }
        context = det.keras.TFKerasTrialContext.from_config(config)
        my_trial = tf_keras_one_var_model.OneVarTrial(context)

        model = my_trial.build_model()
        model.fit(my_trial.build_training_data_loader())
        eval_metrics = model.evaluate(my_trial.build_validation_data_loader())

    Arguments:
        config: An experiment config file, in dictionary form.
    """
    # Build an unmanaged local execution environment; /tmp serves as a
    # throwaway checkpoint directory for debugging sessions.
    env_pair = det._make_local_execution_env(
        managed_training=False,
        test_mode=False,
        config=config,
        checkpoint_dir="/tmp",
        limit_gpus=1,
    )
    return cls(*env_pair)
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Deprecated: please use your TrialContext's .from_config() method instead.

    Create a trial instance from a Trial class definition. This can be a
    useful utility for debugging your trial logic in any development
    environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            The checkpoint directory that the trial will use for loading and
            saving checkpoints.
        config:
            An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. If not specified, a minimal
            default is used.
        hparams:
            An optional dictionary of hyperparameter values used to initialize
            the trial context.
    """
    # Surface the deprecation at the call site; FutureWarning is shown to end
    # users by default.
    warnings.warn(
        "det.experimental.create_trial_instance() is now deprecated. Please use\n"
        "your TrialContext's .from_config() method instead. Example\n"
        "\n"
        "    context = PyTorchTrialContext.from_config()\n"
        "    my_trial = MyPyTorchTrial(context)\n",
        FutureWarning,
    )
    # Enable debug logging when either the environment requests it or the
    # experiment config itself turns it on.
    determined.common.set_logger(
        util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    )
    env, rendezvous_info, hvd_config = det._make_local_execution_env(
        managed_training=False, test_mode=False, config=config, hparams=hparams
    )
    trial_context = trial_def.trial_context_class(
        env, hvd_config, rendezvous_info=rendezvous_info
    )
    return trial_def(trial_context)
def test_one_batch(
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    trial_class: Optional[Type[det.Trial]] = None,
    config: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Train a single batch locally as a smoke test, either from a Trial class
    (pass ``trial_class``) or from a Native-API pair (pass both
    ``controller_cls`` and ``native_context_cls``).

    Returns the native context in the Native case, otherwise None.

    Raises:
        errors.InternalException: if neither API's arguments were supplied.
    """
    # Override the scheduling_unit value to 1 so exactly one batch is run.
    config = {**(config or {}), "scheduling_unit": 1}

    logging.info("Running a minimal test experiment locally")
    checkpoint_dir = tempfile.TemporaryDirectory()
    env, rendezvous_info, hvd_config = det._make_local_execution_env(
        managed_training=True, test_mode=True, config=config, limit_gpus=1
    )
    workloads = _make_test_workloads(
        pathlib.Path(checkpoint_dir.name).joinpath("checkpoint"), env.experiment_config
    )
    logging.info(f"Using hyperparameters: {env.hparams}.")
    logging.debug(f"Using a test experiment config: {env.experiment_config}.")

    if native_context_cls is not None and controller_cls is not None:
        # Case 1: test one batch for Native implementation.
        controller_cls.pre_execute_hook(env=env, hvd_config=hvd_config)
        context = native_context_cls(
            env=env,
            hvd_config=hvd_config,
            rendezvous_info=rendezvous_info,
        )

        def train_fn() -> None:
            controller = cast(Type[det.TrialController], controller_cls).from_native(
                context=context,
                env=env,
                workloads=workloads,
                load_path=None,
                rendezvous_info=rendezvous_info,
                hvd_config=hvd_config,
            )
            # Always remove the temp checkpoint dir, even when the controller
            # raises (the previous version leaked it on failure).
            try:
                controller.run()
            finally:
                checkpoint_dir.cleanup()

        context._set_train_fn(train_fn)
        logging.info(
            "Note: to submit an experiment to the cluster, change local parameter to False"
        )
        return context
    elif trial_class is not None:
        # Case 2: test one batch for Trial implementation.
        controller = load.load_controller_from_trial(
            trial_class=trial_class,
            env=env,
            workloads=workloads,
            load_path=None,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )
        # Clean up even when the controller raises.
        try:
            controller.run()
        finally:
            checkpoint_dir.cleanup()
        logging.info(
            "Note: to submit an experiment to the cluster, change local parameter to False"
        )
    else:
        # Neither API was supplied: release the temp dir before raising (the
        # previous version leaked it on this path).
        checkpoint_dir.cleanup()
        raise errors.InternalException(
            "Must provide a trial_def if using Trial API or "
            "a controller_cls and a native_context_cls if using Native API."
        )