def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            #Arguments
            if args_dict:
                param_string, params, args = experiment_utils.build_parameters(
                    map_fun, executor_num, args_dict)
                hdfs_exec_logdir, hdfs_appid_logdir = experiment_utils._create_experiment_subdirectories(
                    app_id,
                    run_id,
                    param_string,
                    'random_search',
                    params=params)
                logfile = experiment_utils._init_logger(hdfs_exec_logdir)
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
                print(devices._get_gpu_info())
                print(
                    '-------------------------------------------------------')
                print('Started running task ' + param_string)
                task_start = time.time()
                retval = map_fun(*args)
                task_end = time.time()
                experiment_utils._handle_return(retval, hdfs_exec_logdir,
                                                optimization_key, logfile)
                time_str = 'Finished task ' + param_string + ' - took ' + experiment_utils._time_diff(
                    task_start, task_end)
                print(time_str)
                print('Returning metric ' + str(retval))
                print(
                    '-------------------------------------------------------')
        except:
            raise
        finally:
            experiment_utils._cleanup(tensorboard, t)
示例#2
0
    def _wrapper_fun(iter):
        """

        Args:
            :iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        global local_logdir_bool

        try:
            #Arguments
            if args_dict:
                param_string, params, args = experiment_utils.build_parameters(map_fun, executor_num, args_dict)
                val = _get_return_file(param_string, app_id, generation_id, run_id)
                hdfs_exec_logdir, hdfs_appid_logdir = experiment_utils._create_experiment_subdirectories(app_id, run_id, param_string, 'differential_evolution', sub_type='generation.' + str(generation_id), params=params)
                logfile = experiment_utils._init_logger(hdfs_exec_logdir)
                tb_hdfs_path, tb_pid = tensorboard._register(hdfs_exec_logdir, hdfs_appid_logdir, executor_num, local_logdir=local_logdir_bool)
                print(devices._get_gpu_info())
                print('-------------------------------------------------------')
                print('Started running task ' + param_string)
                if val is not None:
                    val = json.loads(val)
                task_start = time.time()
                if val is None:
                    val = map_fun(*args)
                task_end = time.time()
                time_str = 'Finished task ' + param_string + ' - took ' + experiment_utils._time_diff(task_start, task_end)
                print(time_str)
                experiment_utils._handle_return(val, hdfs_exec_logdir, opt_key, logfile)
                print('Returning metric ' + str(val))
                print('-------------------------------------------------------')
        except:
            raise
        finally:
            experiment_utils._cleanup(tensorboard, t)
示例#3
0
def _start_run():
    global running
    global app_id
    global run_id
    running = True
    experiment_utils._set_ml_id(app_id, run_id)
示例#4
0
    def _wrapper_fun(iter):
        """
        Wraps the user supplied training function in order to be passed to the
        Spark Executors.

        Args:
            iter:

        Returns:

        """
        experiment_utils._set_ml_id(app_id, run_id)

        # get task context information to determine executor identifier
        partition_id, task_attempt = util.get_partition_attempt_id()

        client = rpc.Client(server_addr, partition_id, task_attempt,
                            hb_interval, secret)
        log_file = (log_dir + "/executor_" + str(partition_id) + "_" +
                    str(task_attempt) + ".log")

        # save the builtin print
        original_print = __builtin__.print

        reporter = Reporter(log_file, partition_id, task_attempt,
                            original_print)

        def maggy_print(*args, **kwargs):
            """Maggy custom print() function."""
            original_print(*args, **kwargs)
            reporter.log(" ".join(str(x) for x in args), True)

        # override the builtin print
        __builtin__.print = maggy_print

        try:
            client_addr = client.client_addr

            host_port = client_addr[0] + ":" + str(client_addr[1])

            exec_spec = {}
            exec_spec["partition_id"] = partition_id
            exec_spec["task_attempt"] = task_attempt
            exec_spec["host_port"] = host_port
            exec_spec["trial_id"] = None

            reporter.log("Registering with experiment driver", False)
            client.register(exec_spec)

            client.start_heartbeat(reporter)

            # blocking
            trial_id, parameters = client.get_suggestion(reporter)

            while not client.done:
                if experiment_type == "ablation":
                    ablation_params = {
                        "ablated_feature":
                        parameters.get("ablated_feature", "None"),
                        "ablated_layer":
                        parameters.get("ablated_layer", "None"),
                    }
                    parameters.pop("ablated_feature")
                    parameters.pop("ablated_layer")

                tb_logdir = log_dir + "/" + trial_id
                trial_log_file = tb_logdir + "/output.log"
                reporter.set_trial_id(trial_id)

                # If trial is repeated, delete trial directory, except log file
                if hopshdfs.exists(tb_logdir):
                    util._clean_dir(tb_logdir, [trial_log_file])
                else:
                    hopshdfs.mkdir(tb_logdir)

                reporter.init_logger(trial_log_file)
                tensorboard._register(tb_logdir)
                if experiment_type == "ablation":
                    hopshdfs.dump(
                        json.dumps(ablation_params,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                else:
                    hopshdfs.dump(
                        json.dumps(parameters,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                try:
                    reporter.log("Starting Trial: {}".format(trial_id), False)
                    reporter.log("Trial Configuration: {}".format(parameters),
                                 False)

                    if experiment_type == "optimization":
                        tensorboard._write_hparams(parameters, trial_id)

                    sig = inspect.signature(map_fun)
                    if sig.parameters.get("reporter", None):
                        retval = map_fun(**parameters, reporter=reporter)
                    else:
                        retval = map_fun(**parameters)

                    if experiment_type == "optimization":
                        tensorboard._write_session_end()

                    retval = util._handle_return_val(retval, tb_logdir,
                                                     optimization_key,
                                                     trial_log_file)

                except exceptions.EarlyStopException as e:
                    retval = e.metric
                    reporter.log("Early Stopped Trial.", False)

                reporter.log("Finished Trial: {}".format(trial_id), False)
                reporter.log("Final Metric: {}".format(retval), False)
                client.finalize_metric(retval, reporter)

                # blocking
                trial_id, parameters = client.get_suggestion(reporter)

        except:  # noqa: E722
            reporter.log(traceback.format_exc(), False)
            raise
        finally:
            reporter.close_logger()
            client.stop()
            client.close()
示例#5
0
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        is_chief = False
        logdir = None
        tb_hdfs_path = None
        try:
            host = experiment_utils._get_ip_address()

            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]

            client = allreduce_reservation.Client(server_addr)
            host_port = host + ":" + str(port)

            client.register({"worker": host_port, "index": executor_num})
            cluster = client.await_reservations()
            tmp_socket.close()
            client.close()

            task_index = experiment_utils._find_index(host_port, cluster)

            if task_index == -1:
                cluster["task"] = {"type": "chief", "index": 0}
            else:
                cluster["task"] = {"type": "worker", "index": task_index}

            evaluator_node = None
            if evaluator:
                last_worker_index = len(cluster["cluster"]["worker"]) - 1
                evaluator_node = cluster["cluster"]["worker"][
                    last_worker_index]
                cluster["cluster"]["evaluator"] = [evaluator_node]
                del cluster["cluster"]["worker"][last_worker_index]
                if evaluator_node == host_port:
                    cluster["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster))

            if num_executors > 1:
                os.environ["TF_CONFIG"] = json.dumps(cluster)

            is_chief = (cluster["task"]["type"] == "chief")

            is_evaluator = (cluster["task"]["type"] == "evaluator")

            if is_chief:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tb_hdfs_path, tb_pid = tensorboard._register(
                    logdir, logdir, executor_num, local_logdir=local_logdir)
            elif is_evaluator:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tensorboard.events_logdir = logdir

            logfile = experiment_utils._init_logger(
                experiment_utils._get_logdir(app_id, run_id),
                role=cluster["task"]["type"],
                index=cluster["task"]["index"])

            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task')
            task_start = time.time()
            retval = map_fun()

            if is_chief:
                experiment_utils._handle_return_simple(
                    retval, experiment_utils._get_logdir(app_id, run_id),
                    logfile)

            task_end = time.time()
            time_str = 'Finished task - took ' + experiment_utils._time_diff(
                task_start, task_end)
            print(time_str)
            print('-------------------------------------------------------')
        except:
            raise
        finally:
            experiment_utils._cleanup(tensorboard, t)
示例#6
0
 def set_ml_id(self, app_id, run_id):
     return experiment_utils._set_ml_id(app_id, run_id)
示例#7
0
def lagom(
    map_fun,
    name="no-name",
    experiment_type="optimization",
    searchspace=None,
    optimizer=None,
    direction="max",
    num_trials=1,
    ablation_study=None,
    ablator=None,
    optimization_key="metric",
    hb_interval=1,
    es_policy="median",
    es_interval=300,
    es_min=10,
    description="",
):
    """Launches a maggy experiment, which depending on `experiment_type` can
    either be a hyperparameter optimization or an ablation study experiment.
    Given a search space, objective and a model training procedure `map_fun`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    :param map_fun: User defined experiment containing the model training.
    :type map_fun: function
    :param name: A user defined experiment identifier.
    :type name: str
    :param experiment_type: Type of Maggy experiment, either 'optimization'
        (default) or 'ablation'.
    :type experiment_type: str
    :param searchspace: A maggy Searchspace object from which samples are
        drawn.
    :type searchspace: Searchspace
    :param optimizer: The optimizer is the part generating new trials.
    :type optimizer: str, AbstractOptimizer
    :param direction: If set to ‘max’ the highest value returned will
        correspond to the best solution, if set to ‘min’ the opposite is true.
    :type direction: str
    :param num_trials: the number of trials to evaluate given the search space,
        each containing a different hyperparameter combination
    :type num_trials: int
    :param ablation_study: Ablation study object. Can be None for optimization
        experiment type.
    :type ablation_study: AblationStudy
    :param ablator: Ablator to use for experiment type 'ablation'.
    :type ablator: str, AbstractAblator
    :param optimization_key: Name of the metric to be optimized
    :type optimization_key: str, optional
    :param hb_interval: The heartbeat interval in seconds from trial executor
        to experiment driver, defaults to 1
    :type hb_interval: int, optional
    :param es_policy: The earlystopping policy, defaults to 'median'
    :type es_policy: str, optional
    :param es_interval: Frequency interval in seconds to check currently
        running trials for early stopping, defaults to 300
    :type es_interval: int, optional
    :param es_min: Minimum number of trials finalized before checking for
        early stopping, defaults to 10
    :type es_min: int, optional
    :param description: A longer description of the experiment.
    :type description: str, optional
    :raises RuntimeError: An experiment is currently running.
    :return: A dictionary indicating the best trial and best hyperparameter
        combination with it's performance metric
    :rtype: dict
    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    job_start = time.time()
    sc = hopsutil._find_spark().sparkContext
    exp_driver = None

    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        app_id, run_id = util._validate_ml_id(app_id, run_id)

        # start run
        running = True
        experiment_utils._set_ml_id(app_id, run_id)

        # create experiment dir
        experiment_utils._create_experiment_dir(app_id, run_id)

        tensorboard._register(experiment_utils._get_logdir(app_id, run_id))

        num_executors = util.num_executors(sc)

        # start experiment driver
        if experiment_type == "optimization":

            assert num_trials > 0, "number of trials should be greater " + "than zero"
            tensorboard._write_hparams_config(
                experiment_utils._get_logdir(app_id, run_id), searchspace
            )

            if num_executors > num_trials:
                num_executors = num_trials

            exp_driver = experimentdriver.ExperimentDriver(
                "optimization",
                searchspace=searchspace,
                optimizer=optimizer,
                direction=direction,
                num_trials=num_trials,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                es_policy=es_policy,
                es_interval=es_interval,
                es_min=es_min,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )

            exp_function = exp_driver.optimizer.name()

        elif experiment_type == "ablation":
            exp_driver = experimentdriver.ExperimentDriver(
                "ablation",
                ablation_study=ablation_study,
                ablator=ablator,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )
            # using exp_driver.num_executor since
            # it has been set using ablator.get_number_of_trials()
            # in experiment.py
            if num_executors > exp_driver.num_executors:
                num_executors = exp_driver.num_executors

            exp_function = exp_driver.ablator.name()
        else:
            running = False
            raise RuntimeError(
                "Unknown experiment_type:"
                "should be either 'optimization' or 'ablation', "
                "But it is '{0}'".format(str(experiment_type))
            )

        nodeRDD = sc.parallelize(range(num_executors), num_executors)

        # Do provenance after initializing exp_driver, because exp_driver does
        # the type checks for optimizer and searchspace
        sc.setJobGroup(os.environ["ML_ID"], "{0} | {1}".format(name, exp_function))

        experiment_json = experiment_utils._populate_experiment(
            name,
            exp_function,
            "MAGGY",
            exp_driver.searchspace.json(),
            description,
            app_id,
            direction,
            optimization_key,
        )

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, "CREATE"
        )

        util._log(
            "Started Maggy Experiment: {0}, {1}, run {2}".format(name, app_id, run_id)
        )

        exp_driver.init(job_start)

        server_addr = exp_driver.server_addr

        # Force execution on executor, since GPU is located on executor
        nodeRDD.foreachPartition(
            trialexecutor._prepare_func(
                app_id,
                run_id,
                experiment_type,
                map_fun,
                server_addr,
                hb_interval,
                exp_driver._secret,
                optimization_key,
                experiment_utils._get_logdir(app_id, run_id),
            )
        )
        job_end = time.time()

        result = exp_driver.finalize(job_end)
        best_logdir = (
            experiment_utils._get_logdir(app_id, run_id) + "/" + result["best_id"]
        )

        util._finalize_experiment(
            experiment_json,
            float(result["best_val"]),
            app_id,
            run_id,
            "FINISHED",
            exp_driver.duration,
            experiment_utils._get_logdir(app_id, run_id),
            best_logdir,
            optimization_key,
        )

        util._log("Finished Experiment")

        return result

    except:  # noqa: E722
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - job_start)
        )
        if exp_driver:
            if exp_driver.exception:
                raise exp_driver.exception
        raise
    finally:
        # grace period to send last logs to sparkmagic
        # sparkmagic hb poll intervall is 5 seconds, therefore wait 6 seconds
        time.sleep(6)
        # cleanup spark jobs
        if running and exp_driver is not None:
            exp_driver.stop()
        run_id += 1
        running = False
        sc.setJobGroup("", "")

    return result
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i


        experiment_utils._set_ml_id(app_id, run_id)

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        role = None
        logdir = None
        tb_hdfs_path = None

        client = parameter_server_reservation.Client(server_addr)

        try:
            host = experiment_utils._get_ip_address()

            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            exec_spec = {}
            if executor_num < num_ps:
                exec_spec["task_type"] = "ps"
            else:
                exec_spec["task_type"] = "worker"
            exec_spec["host_port"] = host_port
            exec_spec["gpus_present"] = devices.get_num_gpus() > 0

            client.register(exec_spec)

            cluster = client.await_reservations()

            tmp_socket.close()

            role, index = experiment_utils._find_task_and_index(host_port, cluster)

            cluster_spec = {}
            cluster_spec["cluster"] = cluster
            cluster_spec["task"] = {"type": role, "index": index}

            evaluator_node = None
            if evaluator:
                last_worker_index = len(cluster_spec["cluster"]["worker"])-1
                evaluator_node = cluster_spec["cluster"]["worker"][last_worker_index]
                cluster_spec["cluster"]["evaluator"] = [evaluator_node]
                del cluster_spec["cluster"]["worker"][last_worker_index]
                if evaluator_node == host_port:
                    role = "evaluator"
                    cluster_spec["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster_spec))
            os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

            logfile = experiment_utils._init_logger(experiment_utils._get_logdir(app_id, run_id), role=role, index=cluster_spec["task"]["index"])

            dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir'

            is_chief = (cluster["task"]["type"] == "chief")
            if is_chief:
                hdfs.mkdir(dist_logdir)
                tensorboard._register(dist_logdir, experiment_utils._get_logdir(app_id, run_id), executor_num, local_logdir=local_logdir)
            else:
                tensorboard.events_logdir = dist_logdir
                
            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task')
            task_start = time.time()

            retval=None
            if role == "ps":
                ps_thread = threading.Thread(target=lambda: map_fun())
                ps_thread.start()
                client.await_all_workers_finished()
            else:
                retval = map_fun()

            if role == "chief":
                experiment_utils._handle_return_simple(retval, experiment_utils._get_logdir(app_id, run_id), logfile)

            task_end = time.time()
            time_str = 'Finished task - took ' + experiment_utils._time_diff(task_start, task_end)
            print(time_str)
            print('-------------------------------------------------------')
        except:
            raise
        finally:
            if role != "ps":
                client.register_worker_finished()
            client.close()
            experiment_utils._cleanup(tensorboard, t)