Example #1
def _init_native(
    controller_cls: Type[det.TrialController],
    native_context_cls: Type[det.NativeContext],
    config: Optional[Dict[str, Any]] = None,
    mode: Mode = Mode.CLUSTER,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    if Mode(mode) == Mode.CLUSTER:
        if load.RunpyGlobals.is_initialized():
            controller_cls.pre_execute_hook(
                env=load.RunpyGlobals.get_instance().env,
                hvd_config=load.RunpyGlobals.get_instance().hvd_config,
            )
            context = native_context_cls(
                env=load.RunpyGlobals.get_instance().env,
                hvd_config=load.RunpyGlobals.get_instance().hvd_config,
            )
            load.RunpyGlobals.set_runpy_native_result(context, controller_cls)
            context._set_train_fn(_stop_loading_implementation)
            return context

        else:
            create_experiment(
                config=config, context_dir=context_dir, command=command, master_url=master_url
            )
            print("Exiting the program after submitting the experiment.")
            sys.exit(0)

    elif Mode(mode) == Mode.LOCAL:
        print("Starting a test experiment locally.")
        checkpoint_dir = tempfile.TemporaryDirectory()
        env, workloads, rendezvous_info, hvd_config = make_test_experiment_env(
            checkpoint_dir=pathlib.Path(checkpoint_dir.name), config=config
        )
        print(
            f"Using a modified test config: {env.experiment_config}.\n"
            f"Using a set of random hyperparameter values: {env.hparams}."
        )
        controller_cls.pre_execute_hook(env=env, hvd_config=hvd_config)
        context = native_context_cls(env=env, hvd_config=hvd_config)

        def train_fn() -> None:
            controller = controller_cls.from_native(
                context=context,
                env=env,
                workloads=workloads,
                load_path=None,
                rendezvous_info=rendezvous_info,
                hvd_config=hvd_config,
            )
            controller.run()
            checkpoint_dir.cleanup()

        context._set_train_fn(train_fn)
        return context

    else:
        raise errors.InvalidExperimentException("Must use either local mode or cluster mode.")
Example #2
            def fit_generator(wrapper, *args: Any, **kwargs: Any) -> None:
                if not self.compile_args:
                    raise errors.InvalidExperimentException(
                        "Must call .compile before calling .fit_generator().")

                fit_generator_args = inspect.signature(model.fit_generator).bind(*args, **kwargs)
                fit_generator_args.apply_defaults()

                training_data = keras.SequenceAdapter(
                    fit_generator_args.arguments["generator"],
                    use_multiprocessing=fit_generator_args.arguments["use_multiprocessing"],
                    workers=fit_generator_args.arguments["workers"],
                )
                validation_data = keras.SequenceAdapter(
                    fit_generator_args.arguments["validation_data"],
                    use_multiprocessing=fit_generator_args.arguments["use_multiprocessing"],
                    workers=fit_generator_args.arguments["workers"],
                )

                self.train_config = TFKerasTrainConfig(
                    training_data=training_data,
                    validation_data=validation_data,
                    callbacks=fit_generator_args.arguments["callbacks"],
                )

                if train_fn:
                    train_fn()
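
The wrapper relies on inspect.signature(...).bind(...) plus apply_defaults() to recover every argument of the original call by name, including defaults the caller never passed. A standalone sketch of that technique, with a toy fit_generator signature standing in for model.fit_generator:

import inspect

def fit_generator(generator, workers=1, use_multiprocessing=False):
    ...

bound = inspect.signature(fit_generator).bind("my-generator", workers=4)
bound.apply_defaults()
# bound.arguments now maps every parameter name to a value, e.g.:
# {'generator': 'my-generator', 'workers': 4, 'use_multiprocessing': False}
print(bound.arguments)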
Example #3
def _submit_experiment(
    config: Optional[Dict[str, Any]],
    context_dir: str,
    command: Optional[List[str]],
    test: bool = False,
    master_url: Optional[str] = None,
) -> int:
    if context_dir == "":
        raise errors.InvalidExperimentException("The context directory must not be empty.")

    context_path = pathlib.Path(context_dir)
    config = {**constants.DEFAULT_EXP_CFG, **(config or {})}
    config.setdefault("internal", {})
    config["internal"]["native"] = {"command": _set_command_default(context_path, command)}
    logging.info(f"Creating an experiment with config: {config}")

    if master_url is None:
        master_url = util.get_default_master_address()

    exp_context = context.Context.from_local(context_path)

    # When a requested_user isn't specified to initialize_session(), the
    # authentication module will attempt to use the token store to grab the
    # current logged-in user. If there is no logged in user found, it will
    # default to constants.DEFAULT_DETERMINED_USER.
    auth.initialize_session(master_url, requested_user=None, try_reauth=True)

    if test:
        return api.create_test_experiment_and_follow_logs(master_url, config, exp_context)
    else:
        return api.create_experiment_and_follow_logs(master_url, config, exp_context)
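
The config handling above is a shallow dict merge: user-supplied top-level keys replace the defaults wholesale, and the internal.native.command entry is filled in afterwards. A small sketch with a made-up DEFAULT_EXP_CFG (the real defaults live in Determined's constants module):

DEFAULT_EXP_CFG = {"description": "native experiment", "resources": {"slots_per_trial": 1}}
user_config = {"resources": {"slots_per_trial": 8}}

config = {**DEFAULT_EXP_CFG, **user_config}  # shallow merge: user keys win
config.setdefault("internal", {})
config["internal"]["native"] = {"command": ["train.py", "--epochs", "3"]}

# The merge is not recursive: the user's "resources" dict replaced the
# default one entirely rather than being merged key by key.
print(config["resources"])  # {'slots_per_trial': 8}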
Example #4
def _set_command_default(context_dir: pathlib.Path,
                         command: Optional[List[str]] = None) -> List[str]:
    if not command or len(command) == 0:
        if _in_ipython():
            raise errors.InvalidExperimentException(
                "Must specify the location of the notebook file "
                "relative to the context directory when in notebook.")

        exp_path = pathlib.Path(sys.argv[0]).resolve()
        exp_rel_path = exp_path.relative_to(context_dir.resolve())
        # Note: pathlib's Path.suffix includes the leading dot (e.g. ".py").
        if exp_rel_path.suffix not in {".py", ".ipynb"}:
            raise errors.InvalidExperimentException(
                "Command must begin with a file with the suffix .py or .ipynb. "
                "Found {}".format(exp_rel_path))

        command = [str(exp_rel_path), *_get_current_args()]

    return command
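
The command inference above boils down to two pathlib operations: making the entrypoint from sys.argv[0] relative to the context directory, and checking its suffix. A quick illustration with hypothetical paths; note that Path.suffix keeps the leading dot, which is why the check compares against ".py" / ".ipynb":

import pathlib

context_dir = pathlib.Path("/home/alice/project")
exp_path = pathlib.Path("/home/alice/project/models/train.py")

exp_rel_path = exp_path.relative_to(context_dir)
print(exp_rel_path)         # models/train.py
print(exp_rel_path.suffix)  # .py  (leading dot included)

# An entrypoint outside the context directory makes relative_to() raise
# ValueError, so the script must live inside context_dir for the default
# command to be inferred.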
Example #5
            def compile(wrapper, *args: Any, **kwargs: Any) -> None:
                bound_arguments = inspect.signature(model.compile).bind(*args, **kwargs)
                bound_arguments.apply_defaults()

                if "optimizer" not in bound_arguments.arguments:
                    raise errors.InvalidExperimentException(
                        "Must have 'optimizer' in arguments of .compile().")

                self.compile_args = bound_arguments
Example #6
            def fit_generator(wrapper, *args: Any, **kwargs: Any) -> None:
                if not self.compile_args:
                    raise errors.InvalidExperimentException(
                        "Must call .compile before calling .fit_generator().")

                fit_generator_args = inspect.signature(model.fit_generator).bind(*args, **kwargs)
                fit_generator_args.apply_defaults()

                training_data = fit_generator_args.arguments["generator"]

                if fit_generator_args.arguments["validation_data"] is None:
                    raise errors.InvalidExperimentException(
                        "Determined requires validation_data in the call to fit_generator()."
                    )

                validation_data = keras._adapt_data_from_data_loader(
                    input_data=fit_generator_args.arguments["validation_data"],
                    batch_size=self.env.per_slot_batch_size,
                )

                self.train_config = TFKerasTrainConfig(
                    training_data=training_data,
                    validation_data=validation_data,
                    callbacks=fit_generator_args.arguments["callbacks"],
                )

                self.configure_fit(
                    verbose=fit_generator_args.arguments["verbose"],
                    class_weight=fit_generator_args.arguments["class_weight"],
                    shuffle=fit_generator_args.arguments["shuffle"],
                    workers=fit_generator_args.arguments["workers"],
                    use_multiprocessing=fit_generator_args.arguments["use_multiprocessing"],
                    max_queue_size=fit_generator_args.arguments["max_queue_size"],
                )

                if train_fn:
                    train_fn()
Example #7
def create_experiment(
    config: Optional[Dict[str, Any]],
    context_dir: str,
    command: Optional[List[str]],
    test_mode: bool = False,
    master_url: Optional[str] = None,
) -> Optional[int]:
    """Submit an experiment to the Determined master.

    Alternatively, use det.create() with ``Mode.CLUSTER``.

    Args:
        master_url (Optional[str]): The URL of the Determined master node. If
            None (default), the master address is inferred from the
            environment.

    Returns:
        The ID of the created experiment.
    """
    if context_dir == "":
        raise errors.InvalidExperimentException(
            "The context directory must not be empty.")

    context_path = pathlib.Path(context_dir)
    config = {**constants.DEFAULT_EXP_CFG, **(config or {})}
    config.setdefault("internal", {})
    config["internal"]["native"] = {
        "command": set_command_default(context_path, command)
    }
    print("Creating an experiment with config: {}".format(config))

    if master_url is None:
        master_url = util.get_default_master_address()

    exp_context = context.Context.from_local(context_path)

    # When a requested_user isn't specified to initialize_session(), the
    # authentication module will attempt to use the token store to grab the
    # current logged-in user. If there is no logged in user found, it will
    # default to constants.DEFAULT_DETERMINED_USER.
    auth.initialize_session(master_url, requested_user=None, try_reauth=True)

    if test_mode:
        exp_id = api.create_test_experiment(master_url, config, exp_context)
    else:
        exp_id = api.create_experiment(master_url, config, exp_context)
    print("Created experiment {}".format(exp_id))

    return exp_id
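
A hedged usage sketch for create_experiment(): the config keys, paths, and master URL below are illustrative, and master_url can be omitted to fall back to the address inferred from the environment.

exp_id = create_experiment(
    config={"description": "native-api example"},  # merged over DEFAULT_EXP_CFG
    context_dir="model_def/",                      # uploaded as the experiment context
    command=["train.py", "--batch-size", "64"],    # entrypoint inside context_dir
    master_url="http://localhost:8080",            # omit to use the default master address
)
print("Submitted experiment", exp_id)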
Example #8
            def fit(wrapper, *args: Any, **kwargs: Any) -> None:
                """Communicate a model, data, and other training configuration with the harness.

                    Parameters:
                        The same as tf.keras.Model.fit, except that this function only handles
                        the following kinds of data:

                    x: Input data. It could be:
                        1) A Numpy array (or array-like), or a list of arrays (in case the model
                        has multiple inputs).
                        2) A dict mapping input names to the corresponding array, if the model
                        has named inputs.
                        3) A tf.data dataset. Should return a tuple of either (inputs, targets) or
                        (inputs, targets, sample_weights).
                        4) A keras.utils.Sequence returning (inputs, targets) or (inputs, targets,
                        sample weights).

                    y: Target data. Like the input data x, it could be a Numpy array (or arrays).
                        If x is a dataset or keras.utils.Sequence instance, y should not be
                        specified (since targets will be obtained from x).

                    validation_data: Data on which to evaluate the loss and any model metrics
                        at the end of each epoch. The model will not be trained on this data.
                        validation_data will override validation_split. validation_data could be:
                        1) tuple (x_val, y_val) of Numpy arrays
                        2) tuple (x_val, y_val, val_sample_weights) of Numpy arrays
                        3) a dataset.
                        For the first two cases, batch_size must be provided. For the last
                        case, validation_steps could be provided.
                """
                if not self.compile_args:
                    raise errors.InvalidExperimentException(
                        "Must call .compile before calling .fit()."
                    )

                fit_args = inspect.signature(model.fit).bind(*args, **kwargs)
                fit_args.apply_defaults()

                training_data = keras._adapt_data_from_fit_args(
                    x=fit_args.arguments["x"],
                    y=fit_args.arguments["y"],
                    sample_weight=fit_args.arguments["sample_weight"],
                    batch_size=self.env.per_slot_batch_size,
                )

                if fit_args.arguments["validation_data"] is None:
                    raise errors.InvalidExperimentException(
                        "Determined requires validation_data in the call to fit()."
                    )

                validation_data = keras._adapt_data_from_data_loader(
                    input_data=fit_args.arguments["validation_data"],
                    batch_size=self.env.per_slot_batch_size,
                )

                self.train_config = TFKerasTrainConfig(
                    training_data=training_data,
                    validation_data=validation_data,
                    callbacks=fit_args.arguments["callbacks"],
                )

                self.configure_fit(
                    verbose=fit_args.arguments["verbose"],
                    shuffle=fit_args.arguments["shuffle"],
                    class_weight=fit_args.arguments["class_weight"],
                    workers=fit_args.arguments["workers"],
                    use_multiprocessing=fit_args.arguments["use_multiprocessing"],
                    max_queue_size=fit_args.arguments["max_queue_size"],
                )

                if train_fn:
                    train_fn()
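
The compile()/fit()/fit_generator() wrappers in these examples follow one pattern: replace a bound method on the user's model so the call is recorded instead of executed, and run training later via train_fn. A self-contained sketch of that interception pattern; ToyModel and recorded are illustrative, not Determined APIs.

import functools

class ToyModel:
    def fit(self, x, epochs=1):
        print("really training")

model = ToyModel()
recorded = {}
original_fit = model.fit

@functools.wraps(original_fit)
def fit(*args, **kwargs):
    # Capture the call instead of training immediately.
    recorded["fit"] = (args, kwargs)

model.fit = fit                 # shadow the bound method on this instance
model.fit([1, 2, 3], epochs=5)  # no training happens here
print(recorded)                 # {'fit': (([1, 2, 3],), {'epochs': 5})}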
Example #9
def create(
    trial_def: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
    mode: Mode = Mode.CLUSTER,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> None:
    # TODO: Add a reference to the local development tutorial.
    """
    Create an experiment.

    Arguments:
        trial_def:
            A class definition implementing the ``det.Trial`` interface.
        config:
            A dictionary representing the experiment configuration to be
            associated with the experiment.
        mode:
            The :py:class:`determined.experimental.Mode` used when creating
            an experiment:

            1. ``Mode.CLUSTER`` (default): Submit the experiment to a remote
            Determined cluster.

            2. ``Mode.LOCAL``: Test the experiment in the calling
            Python process for local development / debugging purposes.
            Run through a minimal loop of training, validation, and checkpointing steps.

        context_dir:
            A string filepath that defines the context directory. All model
            code will be executed with this as the current working directory.

            In CLUSTER mode, this argument is required. All files in this
            directory will be uploaded to the Determined cluster. The total
            size of this directory must be under 96 MB.

            In LOCAL mode, this argument is optional and assumed to be the
            current working directory by default.
        command:
            A list of strings that is used as the entrypoint of the training
            script in the Determined task environment. When executing this
            function via a Python script, this argument is inferred to be
            ``sys.argv`` by default. When executing this function via IPython
            or a Jupyter notebook, this argument is required.

            Example: When creating an experiment by running "python train.py
            --flag value", the default command is inferred as ["train.py",
            "--flag", "value"].

        master_url:
            An optional string to use as the Determined master URL when
            submitting to a cluster (``Mode.CLUSTER``). If not specified, it
            will be inferred from the environment variable ``DET_MASTER``.
    """

    if Mode(mode) == Mode.CLUSTER:
        if load.RunpyGlobals.is_initialized():
            load.RunpyGlobals.set_runpy_trial_result(
                trial_def, cast(Type[det.TrialController], trial_def.trial_controller_class)
            )
            _stop_loading_implementation()

        else:
            create_experiment(
                config=config, context_dir=context_dir, command=command, master_url=master_url
            )

    elif Mode(mode) == Mode.LOCAL:
        context_path = pathlib.Path(context_dir) if context_dir else pathlib.Path.cwd()
        test_one_batch(context_path, trial_class=trial_def, config=config)
    else:
        raise errors.InvalidExperimentException("Must use either local mode or cluster mode.")