Пример #1
0
def grid_search(train_fn,
                grid_dict,
                direction=Direction.MAX,
                name='no-name',
                local_logdir=False,
                description=None,
                optimization_key='metric'):
    """
    *Parallel Experiment*

    Run grid search evolution to explore a predefined set of hyperparameter combinations.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :train_fn: the function to run, must return a metric
        :grid_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'grid_search', 'PARALLEL_EXPERIMENTS', json.dumps(grid_dict),
            description, app_id, direction, optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        grid_params = experiment_utils.grid_params(grid_dict)

        logdir, best_param, best_metric, return_dict = grid_search_impl._run(
            sc,
            train_fn,
            run_id,
            grid_params,
            direction=direction,
            local_logdir=local_logdir,
            name=name,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Пример #2
0
def mirrored(train_fn,
             name='no-name',
             local_logdir=False,
             description=None,
             evaluator=False,
             metric_key=None):
    """
    *Distributed Training*

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training, local_logdir=True)

    Args:
        :train_fn: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    num_workers = util.num_executors()
    if evaluator:
        assert num_workers > 2, "number of workers must be atleast 3 if evaluator is set to True"

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'mirrored', 'DISTRIBUTED_TRAINING', None, description,
            app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = mirrored_impl._run(sc,
                                                 train_fn,
                                                 run_id,
                                                 local_logdir=local_logdir,
                                                 name=name,
                                                 evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(experiment_json, metric, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Пример #3
0
def random_search(train_fn,
                  boundary_dict,
                  direction=Direction.MAX,
                  samples=10,
                  name='no-name',
                  local_logdir=False,
                  description=None,
                  optimization_key='metric'):
    """

    *Parallel Experiment*

    Run an Experiment contained in *train_fn* for configured number of random samples controlled by the *samples* parameter. Each hyperparameter is contained in *boundary_dict* with the key
    corresponding to the name of the hyperparameter and a list containing two elements defining the lower and upper bound.
    The experiment must return a metric corresponding to how 'good' the given hyperparameter combination is.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction='max')

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction='max', optimization_key='accuracy')


    Args:
        :train_fn: The function to run
        :boundary_dict: dict containing hyperparameter name and corresponding boundaries, each experiment randomize a value in the boundary range.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minize the returned metric
        :samples: the number of random samples to evaluate for each hyperparameter given the boundaries, for example samples=3 would result in 3 hyperparameter combinations in total to evaluate
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'random_search', 'PARALLEL_EXPERIMENTS',
            json.dumps(boundary_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = r_search_impl._run(
            sc,
            train_fn,
            run_id,
            boundary_dict,
            samples,
            direction=direction,
            local_logdir=local_logdir,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Пример #4
0
def launch(train_fn,
           args_dict=None,
           name='no-name',
           local_logdir=False,
           description=None,
           metric_key=None):
    """

    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *train_fn* one time with no arguments or multiple times with different arguments if
    *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>> experiment.launch(train_nn)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> experiment.launch(train_nn)

    Args:
        :train_fn: The function to run
        :args_dict: If specified will run the same function multiple times with different arguments, {'a':[1,2], 'b':[5,3]} would run the function two times with arguments (1,5) and (2,3) provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = None
        if args_dict:
            experiment_json = experiment_utils._populate_experiment(
                name, 'launch', 'EXPERIMENT', json.dumps(args_dict),
                description, app_id, None, None)
        else:
            experiment_json = experiment_utils._populate_experiment(
                name, 'launch', 'EXPERIMENT', None, description, app_id, None,
                None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = launcher._run(sc, train_fn, run_id, args_dict,
                                            local_logdir)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(experiment_json, metric, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)
        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Пример #5
0
def parameter_server(map_fun,
                     name='no-name',
                     local_logdir=False,
                     description=None,
                     evaluator=False):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:f
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    """
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps cannot be greater than num_executors (i.e. num_executors == num_ps + num_workers)"
    if evaluator:
        assert num_executors - num_ps > 2, "number of workers must be atleast 3 if evaluator is set to True"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'parameter_server', 'DISTRIBUTED_TRAINING', None,
            description, app_id, None, None)

        experiment_json = experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = ps_impl._run(sc,
                                           map_fun,
                                           run_id,
                                           local_logdir=local_logdir,
                                           name=name,
                                           evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(experiment_json, None, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Пример #6
0
def differential_evolution(objective_function,
                           boundary_dict,
                           direction=Direction.MAX,
                           generations=4,
                           population=6,
                           mutation=0.5,
                           crossover=0.7,
                           name='no-name',
                           local_logdir=False,
                           description=None,
                           optimization_key='metric'):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    import tensorflow
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the correspond value should be a list of two elements. The first element being the lower bound for the parameter and the the second element the upper bound.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        diff_evo_impl.run_id = run_id

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'differential_evolution', 'PARALLEL_EXPERIMENTS',
            json.dumps(boundary_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = diff_evo_impl._run(
            objective_function,
            boundary_dict,
            direction=direction,
            generations=generations,
            population=population,
            mutation=mutation,
            crossover=crossover,
            cleanup_generations=False,
            local_logdir=local_logdir,
            name=name,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict

    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Пример #7
0
        def _target_function(self):

            try:
                time_earlystop_check = (
                    time.time()
                )  # only used by earlystop-supporting experiments

                while not self.worker_done:
                    trial = None
                    # get a message
                    try:
                        msg = self._message_q.get_nowait()
                    except queue.Empty:
                        msg = {"type": None}

                    if self.earlystop_check != NoStoppingRule.earlystop_check:
                        if (time.time() -
                                time_earlystop_check) >= self.es_interval:
                            time_earlystop_check = time.time()

                            # pass currently running trials to early stop component
                            if len(self._final_store) > self.es_min:
                                self._log("Check for early stopping.")
                                try:
                                    to_stop = self.earlystop_check(
                                        self._trial_store,
                                        self._final_store,
                                        self.direction,
                                    )
                                except Exception as e:
                                    self._log(e)
                                    to_stop = []
                                if len(to_stop) > 0:
                                    self._log(
                                        "Trials to stop: {}".format(to_stop))
                                for trial_id in to_stop:
                                    self.get_trial(trial_id).set_early_stop()

                    # depending on message do the work
                    # 1. METRIC
                    if msg["type"] == "METRIC":
                        # append executor logs if in the message
                        logs = msg.get("logs", None)
                        if logs is not None:
                            with self.log_lock:
                                self.executor_logs = self.executor_logs + logs

                        if msg["trial_id"] is not None and msg[
                                "data"] is not None:
                            self.get_trial(msg["trial_id"]).append_metric(
                                msg["data"])

                    # 2. BLACKLIST the trial
                    elif msg["type"] == "BLACK":
                        trial = self.get_trial(msg["trial_id"])
                        with trial.lock:
                            trial.status = Trial.SCHEDULED
                            self.server.reservations.assign_trial(
                                msg["partition_id"], msg["trial_id"])

                    # 3. FINAL
                    elif msg["type"] == "FINAL":
                        # set status
                        # get trial only once
                        trial = self.get_trial(msg["trial_id"])

                        logs = msg.get("logs", None)
                        if logs is not None:
                            with self.log_lock:
                                self.executor_logs = self.executor_logs + logs

                        # finalize the trial object
                        with trial.lock:
                            trial.status = Trial.FINALIZED
                            trial.final_metric = msg["data"]
                            trial.duration = experiment_utils._seconds_to_milliseconds(
                                time.time() - trial.start)

                        # move trial to the finalized ones
                        self._final_store.append(trial)
                        self._trial_store.pop(trial.trial_id)

                        # update result dictionary
                        self._update_result(trial)
                        # keep for later in case tqdm doesn't work
                        self.maggy_log = self._update_maggy_log()
                        self._log(self.maggy_log)

                        hopshdfs.dump(
                            trial.to_json(),
                            self.log_dir + "/" + trial.trial_id +
                            "/trial.json",
                        )

                        # assign new trial
                        if self.experiment_type == "optimization":
                            trial = self.optimizer.get_suggestion(trial)
                        elif self.experiment_type == "ablation":
                            trial = self.ablator.get_trial(trial)
                        if trial is None:
                            self.server.reservations.assign_trial(
                                msg["partition_id"], None)
                            self.experiment_done = True
                        elif trial == "IDLE":
                            self.add_message({
                                "type":
                                "IDLE",
                                "partition_id":
                                msg["partition_id"],
                                "idle_start":
                                time.time(),
                            })
                            self.server.reservations.assign_trial(
                                msg["partition_id"], None)
                        else:
                            with trial.lock:
                                trial.start = time.time()
                                trial.status = Trial.SCHEDULED
                                self.server.reservations.assign_trial(
                                    msg["partition_id"], trial.trial_id)
                                self.add_trial(trial)

                    # 4. Let executor be idle
                    elif msg["type"] == "IDLE":
                        # execute only every 0.1 seconds but do not block thread
                        if (self.experiment_type == "optimization"
                                and time.time() - msg["idle_start"] > 0.1):
                            trial = self.optimizer.get_suggestion()
                            if trial is None:
                                self.server.reservations.assign_trial(
                                    msg["partition_id"], None)
                                self.experiment_done = True
                            elif trial == "IDLE":
                                # reset timeout
                                msg["idle_start"] = time.time()
                                self.add_message(msg)
                            else:
                                with trial.lock:
                                    trial.start = time.time()
                                    trial.status = Trial.SCHEDULED
                                    self.server.reservations.assign_trial(
                                        msg["partition_id"], trial.trial_id)
                                    self.add_trial(trial)
                        elif self.experiment_type == "optimization":
                            self.add_message(msg)

                    # 4. REG
                    elif msg["type"] == "REG":
                        if self.experiment_type == "optimization":
                            trial = self.optimizer.get_suggestion()
                        elif self.experiment_type == "ablation":
                            trial = self.ablator.get_trial()
                        if trial is None:
                            self.server.reservations.assign_trial(
                                msg["partition_id"], None)
                            self.experiment_done = True
                        elif trial == "IDLE":
                            # reset timeout
                            msg["idle_start"] = time.time()
                            self.add_message(msg)
                        else:
                            with trial.lock:
                                trial.start = time.time()
                                trial.status = Trial.SCHEDULED
                                self.server.reservations.assign_trial(
                                    msg["partition_id"], trial.trial_id)
                                self.add_trial(trial)
            except Exception as exc:
                # Exception can't be propagated to parent thread
                # therefore log the exception and fail experiment
                self._log(exc)
                self.exception = exc
                self.server.stop()
Пример #8
0
    def finalize(self, job_end):

        results = ""

        if self.experiment_type == "optimization":

            _ = self.optimizer.finalize_experiment(self._final_store)

            self.job_end = job_end

            self.duration = experiment_utils._seconds_to_milliseconds(
                self.job_end - self.job_start)

            self.duration_str = experiment_utils._time_diff(
                self.job_start, self.job_end)

            results = ("\n------ " + self.optimizer.name() +
                       " Results ------ direction(" + self.direction + ") \n"
                       "BEST combination " +
                       json.dumps(self.result["best_hp"]) + " -- metric " +
                       str(self.result["best_val"]) + "\n"
                       "WORST combination " +
                       json.dumps(self.result["worst_hp"]) + " -- metric " +
                       str(self.result["worst_val"]) + "\n"
                       "AVERAGE metric -- " + str(self.result["avg"]) + "\n"
                       "EARLY STOPPED Trials -- " +
                       str(self.result["early_stopped"]) + "\n"
                       "Total job time " + self.duration_str + "\n")

        elif self.experiment_type == "ablation":

            _ = self.ablator.finalize_experiment(self._final_store)
            self.job_end = job_end

            self.duration = experiment_utils._seconds_to_milliseconds(
                self.job_end - self.job_start)

            self.duration_str = experiment_utils._time_diff(
                self.job_start, self.job_end)

            results = ("\n------ " + self.ablator.name() +
                       " Results ------ \n" + "BEST Config Excludes " +
                       json.dumps(self.result["best_config"]) + " -- metric " +
                       str(self.result["best_val"]) + "\n" +
                       "WORST Config Excludes " +
                       json.dumps(self.result["worst_config"]) +
                       " -- metric " + str(self.result["worst_val"]) + "\n" +
                       "AVERAGE metric -- " + str(self.result["avg"]) + "\n" +
                       "Total Job Time " + self.duration_str + "\n")

        print(results)

        self._log(results)

        hopshdfs.dump(
            json.dumps(self.result, default=util.json_default_numpy),
            self.log_dir + "/result.json",
        )
        sc = hopsutil._find_spark().sparkContext
        hopshdfs.dump(self.json(sc), self.log_dir + "/maggy.json")

        return self.result
Пример #9
0
def lagom(
    map_fun,
    name="no-name",
    experiment_type="optimization",
    searchspace=None,
    optimizer=None,
    direction="max",
    num_trials=1,
    ablation_study=None,
    ablator=None,
    optimization_key="metric",
    hb_interval=1,
    es_policy="median",
    es_interval=300,
    es_min=10,
    description="",
):
    """Launches a maggy experiment, which depending on `experiment_type` can
    either be a hyperparameter optimization or an ablation study experiment.
    Given a search space, objective and a model training procedure `map_fun`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    :param map_fun: User defined experiment containing the model training.
    :type map_fun: function
    :param name: A user defined experiment identifier.
    :type name: str
    :param experiment_type: Type of Maggy experiment, either 'optimization'
        (default) or 'ablation'.
    :type experiment_type: str
    :param searchspace: A maggy Searchspace object from which samples are
        drawn.
    :type searchspace: Searchspace
    :param optimizer: The optimizer is the part generating new trials.
    :type optimizer: str, AbstractOptimizer
    :param direction: If set to ‘max’ the highest value returned will
        correspond to the best solution, if set to ‘min’ the opposite is true.
    :type direction: str
    :param num_trials: the number of trials to evaluate given the search space,
        each containing a different hyperparameter combination
    :type num_trials: int
    :param ablation_study: Ablation study object. Can be None for optimization
        experiment type.
    :type ablation_study: AblationStudy
    :param ablator: Ablator to use for experiment type 'ablation'.
    :type ablator: str, AbstractAblator
    :param optimization_key: Name of the metric to be optimized
    :type optimization_key: str, optional
    :param hb_interval: The heartbeat interval in seconds from trial executor
        to experiment driver, defaults to 1
    :type hb_interval: int, optional
    :param es_policy: The earlystopping policy, defaults to 'median'
    :type es_policy: str, optional
    :param es_interval: Frequency interval in seconds to check currently
        running trials for early stopping, defaults to 300
    :type es_interval: int, optional
    :param es_min: Minimum number of trials finalized before checking for
        early stopping, defaults to 10
    :type es_min: int, optional
    :param description: A longer description of the experiment.
    :type description: str, optional
    :raises RuntimeError: An experiment is currently running.
    :return: A dictionary indicating the best trial and best hyperparameter
        combination with it's performance metric
    :rtype: dict
    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    job_start = time.time()
    sc = hopsutil._find_spark().sparkContext
    exp_driver = None

    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        app_id, run_id = util._validate_ml_id(app_id, run_id)

        # start run
        running = True
        experiment_utils._set_ml_id(app_id, run_id)

        # create experiment dir
        experiment_utils._create_experiment_dir(app_id, run_id)

        tensorboard._register(experiment_utils._get_logdir(app_id, run_id))

        num_executors = util.num_executors(sc)

        # start experiment driver
        if experiment_type == "optimization":

            assert num_trials > 0, "number of trials should be greater " + "than zero"
            tensorboard._write_hparams_config(
                experiment_utils._get_logdir(app_id, run_id), searchspace
            )

            if num_executors > num_trials:
                num_executors = num_trials

            exp_driver = experimentdriver.ExperimentDriver(
                "optimization",
                searchspace=searchspace,
                optimizer=optimizer,
                direction=direction,
                num_trials=num_trials,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                es_policy=es_policy,
                es_interval=es_interval,
                es_min=es_min,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )

            exp_function = exp_driver.optimizer.name()

        elif experiment_type == "ablation":
            exp_driver = experimentdriver.ExperimentDriver(
                "ablation",
                ablation_study=ablation_study,
                ablator=ablator,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )
            # using exp_driver.num_executor since
            # it has been set using ablator.get_number_of_trials()
            # in experiment.py
            if num_executors > exp_driver.num_executors:
                num_executors = exp_driver.num_executors

            exp_function = exp_driver.ablator.name()
        else:
            running = False
            raise RuntimeError(
                "Unknown experiment_type:"
                "should be either 'optimization' or 'ablation', "
                "But it is '{0}'".format(str(experiment_type))
            )

        nodeRDD = sc.parallelize(range(num_executors), num_executors)

        # Do provenance after initializing exp_driver, because exp_driver does
        # the type checks for optimizer and searchspace
        sc.setJobGroup(os.environ["ML_ID"], "{0} | {1}".format(name, exp_function))

        experiment_json = experiment_utils._populate_experiment(
            name,
            exp_function,
            "MAGGY",
            exp_driver.searchspace.json(),
            description,
            app_id,
            direction,
            optimization_key,
        )

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, "CREATE"
        )

        util._log(
            "Started Maggy Experiment: {0}, {1}, run {2}".format(name, app_id, run_id)
        )

        exp_driver.init(job_start)

        server_addr = exp_driver.server_addr

        # Force execution on executor, since GPU is located on executor
        nodeRDD.foreachPartition(
            trialexecutor._prepare_func(
                app_id,
                run_id,
                experiment_type,
                map_fun,
                server_addr,
                hb_interval,
                exp_driver._secret,
                optimization_key,
                experiment_utils._get_logdir(app_id, run_id),
            )
        )
        job_end = time.time()

        result = exp_driver.finalize(job_end)
        best_logdir = (
            experiment_utils._get_logdir(app_id, run_id) + "/" + result["best_id"]
        )

        util._finalize_experiment(
            experiment_json,
            float(result["best_val"]),
            app_id,
            run_id,
            "FINISHED",
            exp_driver.duration,
            experiment_utils._get_logdir(app_id, run_id),
            best_logdir,
            optimization_key,
        )

        util._log("Finished Experiment")

        return result

    except:  # noqa: E722
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - job_start)
        )
        if exp_driver:
            if exp_driver.exception:
                raise exp_driver.exception
        raise
    finally:
        # grace period to send last logs to sparkmagic
        # sparkmagic hb poll intervall is 5 seconds, therefore wait 6 seconds
        time.sleep(6)
        # cleanup spark jobs
        if running and exp_driver is not None:
            exp_driver.stop()
        run_id += 1
        running = False
        sc.setJobGroup("", "")

    return result