Example #1
    def __init__(self, trial_inst: det.Trial, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        assert isinstance(
            trial_inst,
            DeepSpeedTrial), "DeepSpeedTrialController needs a DeepSpeedTrial"
        self.trial = trial_inst
        self.context = cast(det_ds.DeepSpeedTrialContext, self.context)
        self.context._set_determined_profiler(self.prof)
        if torch.cuda.is_available():
            self.prof._set_sync_device(self._sync_device)
        self.callbacks = self.trial.build_callbacks()

        if len(self.context.models) == 0:
            raise det.errors.InvalidExperimentException(
                "Must have at least one model engine. "
                "This might be caused by not wrapping your model with wrap_model_engine()"
            )

        self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core, self.env,
                self.context.models[0].train_batch_size())

        self.steps_completed = self.env.steps_completed
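
For context, the wrap_model_engine() call named in the error message above happens in user code, not in the controller. A minimal sketch of a DeepSpeedTrial __init__ that satisfies the check, assuming a hypothetical MyModel and a hypothetical ds_config.json DeepSpeed config file:

import deepspeed
import determined.pytorch.deepspeed as det_ds

class MyDeepSpeedTrial(det_ds.DeepSpeedTrial):
    def __init__(self, context: det_ds.DeepSpeedTrialContext) -> None:
        self.context = context
        model = MyModel()  # hypothetical user model
        # deepspeed.initialize returns (engine, optimizer, dataloader, lr_scheduler).
        model_engine, _, _, _ = deepspeed.initialize(
            model=model,
            model_parameters=model.parameters(),
            config="ds_config.json",  # hypothetical config file
        )
        # wrap_model_engine() registers the engine, so context.models is
        # non-empty when the controller's __init__ above runs its check.
        self.model_engine = self.context.wrap_model_engine(model_engine)
    # (train_batch, evaluate_batch, and the data loader methods are omitted.)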
Example #2
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check_startup_hook_ran = self.env.hparams.get("check_startup_hook_ran", False)
        if check_startup_hook_ran:
            check.true(os.path.isfile("startup-hook-ran"), "File should exist.")

        self.chaos = random.SystemRandom()
        self._batch_size = self.context.get_per_slot_batch_size()
        self.chaos_probability = self.env.hparams.get("chaos_probability", 0)
        self.chaos_probability_train = self.env.hparams.get("chaos_probability_train")
        self.chaos_probability_validate = self.env.hparams.get("chaos_probability_validate")
        self.chaos_probability_checkpoint = self.env.hparams.get("chaos_probability_checkpoint")
        self.nan_probability_validate = self.env.hparams.get("nan_probability_validate", 0)
        self.fail_on_first_validation = self.env.hparams.get("fail_on_first_validation", "")
        self.fail_on_checkpoint_save = self.env.hparams.get("fail_on_checkpoint_save", "")
        self.validation_set_size = self.env.hparams.get("validation_set_size", 32 * 32)
        self.train_batch_secs = self.env.hparams.get("training_batch_seconds", 0)
        self.validation_secs = self.env.hparams.get(
            "validation_seconds",
            self.validation_set_size * self.train_batch_secs / self._batch_size,
        )
        self.num_training_metrics = self.env.hparams.get("num_training_metrics", 1)
        assert self.num_training_metrics > 0
        self.num_validation_metrics = self.env.hparams.get("num_validation_metrics", 1)
        assert self.num_validation_metrics > 0
        self.save_secs = self.env.hparams.get("save_checkpoint_seconds", 0)
        self.load_secs = self.env.hparams.get("load_checkpoint_secs", 0)
        self.metrics_progression = self.env.hparams.get("metrics_progression", "decreasing")
        assert self.metrics_progression in ("increasing", "decreasing", "constant")
        self.metrics_base = self.env.hparams.get("metrics_base", 0.9)
        assert 0 < self.metrics_base < 1
        self.metrics_sigma = self.env.hparams.get("metrics_sigma", 0.0)
        assert 0 <= self.metrics_sigma
        self.write_null = self.env.hparams.get("write_null", False)

        self.request_stop = self.env.hparams.get("request_stop", False)

        self.non_chief_exit_immediately = self.env.hparams.get("non_chief_exit_immediately", False)

        self.wlsq = None
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core, self.env, self.context.get_global_batch_size()
            )

        self.steps_completed = self.env.steps_completed

        if self.env.latest_checkpoint is not None:
            with self.context._core.checkpoint.restore_path(
                self.env.latest_checkpoint
            ) as load_path:
                self.load(pathlib.Path(load_path))
        else:
            self.trained_steps = collections.Counter()
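
Every hyperparameter this chaos trial consumes is visible in the .get() calls above. A hypothetical hparams section exercising the failure-injection paths might look like:

# Hypothetical values; each key mirrors a self.env.hparams.get(...) call above.
hparams = {
    "chaos_probability": 0.1,            # base failure rate for any workload
    "chaos_probability_validate": 0.5,   # per-workload-type override
    "nan_probability_validate": 0.05,
    "metrics_progression": "decreasing", # increasing, decreasing, or constant
    "metrics_base": 0.9,                 # must satisfy 0 < metrics_base < 1
    "metrics_sigma": 0.1,                # must be >= 0
    "num_training_metrics": 2,
    "num_validation_metrics": 2,
    "request_stop": False,
}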
Example #3
    def __init__(
        self,
        estimator: tf.estimator.Estimator,
        user_train_spec: tf.estimator.TrainSpec,
        val_spec: tf.estimator.EvalSpec,
        serving_input_receiver_fns: Dict[str,
                                         estimator.ServingInputReceiverFn],
        context: estimator.EstimatorTrialContext,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(context, *args, **kwargs)

        # Catch the case where the estimator has been configured to use a
        # tf.distribute.Strategy, as this can conflict with Determined's
        # distributed training and lead to crashes/OOM. We cannot reliably
        # tell the user that this was the cause of their failure, because
        # user code may crash during build_estimator(), before reaching this
        # point. train_distribute is valid if it is None or if it is an
        # empty tf.contrib.distribute.DistributeConfig.
        if estimator.config.train_distribute is not None:
            check.is_none(
                estimator.config.train_distribute.train_distribute,
                f"TensorFlow's approach to distributed training can conflict with "
                f"Determined's. Currently Determined requires that the train_distribute "
                f"field of the RunConfig not be set. Your estimator has "
                f"train_distribute={str(estimator.config.train_distribute.train_distribute)}",
            )
            check.is_none(
                estimator.config.train_distribute.eval_distribute,
                f"TensorFlow's approach to distributed training can conflict with "
                f"Determined's. Currently Determined requires that the eval_distribute "
                f"field of the RunConfig not be set. Your estimator has "
                f"eval_distribute={str(estimator.config.train_distribute.eval_distribute)}",
            )
        if self.context.distributed.size > 1:
            assert (
                self.use_horovod
            ), "Estimator trial must be run with a horovod backend if distributed training"

        self.estimator = estimator
        self.user_train_spec = user_train_spec
        self.val_spec = val_spec
        self.serving_input_receiver_fns = serving_input_receiver_fns

        self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core,
                self.env,
                self.context.get_global_batch_size(),
            )

        self._init_model()
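
The train_distribute/eval_distribute checks above rule out a tf.distribute strategy on the RunConfig. A sketch of the safe construction, with my_model_fn as a hypothetical model function:

import tensorflow as tf

# Safe: the default RunConfig leaves train_distribute and eval_distribute
# unset, so the checks above pass.
config = tf.estimator.RunConfig(model_dir="/tmp/estimator")  # hypothetical dir
est = tf.estimator.Estimator(model_fn=my_model_fn, config=config)

# Rejected by the checks above: a strategy here conflicts with Determined's
# horovod-based distributed training.
# bad = tf.estimator.RunConfig(train_distribute=tf.distribute.MirroredStrategy())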
Example #4
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        self.value = self.env.hparams["starting_base_value"]
        self.training_structure = self.env.hparams["training_structure"]
        self.training_structure["inf"] = math.inf
        self.training_structure["nan"] = math.nan
        self.training_structure["nanarray"] = np.array([math.nan, math.nan])
        self.validation_structure = self.env.hparams["validation_structure"]
        self.validation_structure["neg_inf"] = -1 * math.inf
        self.gain_per_batch = 0

        self.wlsq = None
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core, self.env,
                self.context.get_global_batch_size())
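
The metric structures come straight from the hyperparameters; __init__ then injects inf/nan entries into the training structure and -inf into the validation structure. A hypothetical hparams section for this trial:

# Hypothetical values matching the keys read above.
hparams = {
    "starting_base_value": 0.9,
    "training_structure": {"loss": 0.0, "accuracy": 0.0},
    "validation_structure": {"validation_loss": 0.0},
}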
Example #5
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        self.value = self.env.hparams["starting_base_value"]
        self.training_structure = self.env.hparams["training_structure"]
        self.validation_structure = self.env.hparams["validation_structure"]
        self.gain_per_batch = self.env.hparams["gain_per_batch"]

        self.wlsq = None
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core, self.env,
                self.context.get_global_batch_size())

        self.steps_completed = self.env.steps_completed

        if self.env.latest_checkpoint is not None:
            with self.context._core.checkpoint.restore_path(
                    self.env.latest_checkpoint) as load_path:
                self.load(pathlib.Path(load_path))
Example #6
    def __init__(self, trial_inst: det.Trial, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check.is_instance(trial_inst, PyTorchTrial,
                          "PyTorchTrialController needs a PyTorchTrial")
        self.trial = cast(PyTorchTrial, trial_inst)
        self.context = cast(pytorch.PyTorchTrialContext, self.context)
        self.context._set_determined_profiler(self.prof)
        if torch.cuda.is_available():
            self.prof._set_sync_device(self._sync_device)
        self.callbacks = self.trial.build_callbacks()

        check.gt_eq(
            len(self.context.models),
            1,
            "Must have at least one model. "
            "This might be caused by not wrapping your model with wrap_model()",
        )
        check.gt_eq(
            len(self.context.optimizers),
            1,
            "Must have at least one optimizer. "
            "This might be caused by not wrapping your optimizer with wrap_optimizer()",
        )
        self._check_evaluate_implementation()

        self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core,
                self.env,
                self.context.get_global_batch_size(),
            )

        self.steps_completed = self.env.steps_completed

        # Currently only horovod and torch backends are supported for distributed training
        if self.context.distributed.size > 1:
            assert (self.use_horovod or self.use_torch
                    ), "Must use horovod or torch for distributed training"
Example #7
    def __init__(
        self,
        model: tf.keras.models.Model,
        session: tf.compat.v1.ConfigProto,
        train_config: keras.TFKerasTrainConfig,
        trial: "TFKerasTrial",
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)

        self.model = model
        self.session = session
        self.trial = trial

        # Configure optimizers, done for backwards compatibility.
        self.context._select_optimizers()

        keras._check_if_aggregation_frequency_will_work(
            model=self.model,
            use_horovod=self.use_horovod,
            aggregation_frequency=self.context._aggregation_frequency,
        )

        self.training_data = train_config.training_data
        self.validation_data = train_config.validation_data

        # Support the deprecated SequenceAdapter API.
        if isinstance(self.training_data, keras.SequenceAdapter):
            self.context._configure_fit(
                workers=self.training_data.workers,
                use_multiprocessing=self.training_data.use_multiprocessing,
                max_queue_size=self.training_data.max_queue_size,
            )
            # Use the provided Sequence directly.
            self.training_data = self.training_data.sequence
        if isinstance(self.validation_data, keras.SequenceAdapter):
            # Ignore these settings and use the same settings as for the fit call.
            self.validation_data = self.validation_data.sequence

        if self.context.distributed.size > 1:
            assert self.use_horovod, (
                "TF Keras trial must be launched with a horovod backend if "
                "doing distributed training"
            )

        self._check_training_data()
        self._check_validation_data()

        self.enqueuers = []  # type: List[keras._Enqueuer]

        self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core,
                self.env,
                self.context.get_global_batch_size(),
            )

        # If a load path is provided, load weights and restore the data location.
        self.multiplexer_load_state = None  # type: Optional[Dict]
        if self.env.latest_checkpoint is not None:
            logging.info(f"Restoring trial from checkpoint {self.env.latest_checkpoint}")
            with self.context._core.checkpoint.restore_path(
                self.env.latest_checkpoint
            ) as load_path:
                self._load(load_path)

        self._configure_callbacks(train_config.callbacks)

        self.train_response_func = None  # type: Optional[workload.ResponseFunc]
        self.train_workload_metrics = []  # type: List[Dict[str, Any]]
        self.train_workload_batches = 0
        self.train_workload_inputs = 0
        self.train_workload_len = 0
        self.test_inputs = 0

        self.steps_completed = self.env.steps_completed
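
The SequenceAdapter branch above exists only for backward compatibility: it copies the adapter's settings into the fit configuration and then unwraps the raw Sequence. In current user code the same settings can be passed directly through the trial context, e.g. via configure_fit(). A sketch, assuming a hypothetical MySequence(tf.keras.utils.Sequence):

# Inside a TFKerasTrial subclass.
def build_training_data_loader(self) -> tf.keras.utils.Sequence:
    # Equivalent to the deprecated SequenceAdapter(workers=..., ...) wrapper.
    self.context.configure_fit(
        workers=4,
        use_multiprocessing=True,
        max_queue_size=10,
    )
    return MySequence()  # hypothetical Sequence subclass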