예제 #1
0
def _run(sc,
         map_fun,
         run_id,
         local_logdir=False,
         name="no-name",
         evaluator=False):
    """
    Run *map_fun* on every executor for ParameterServerStrategy training.

    A reservation server is started and its address is handed to the
    executors through the function built by ``_prepare_func``. One Spark
    partition is created per executor reported by ``util.num_executors()``.

    Args:
        sc: the SparkContext used to schedule the tasks
        map_fun: the user function executed on the executors
        run_id: id of the current run, used to resolve the log directory
        local_logdir: forwarded unchanged to ``_prepare_func``
        name: experiment name used in the Spark job group description
        evaluator: forwarded unchanged to ``_prepare_func``

    Returns:
        A tuple ``(logdir, outputs)``; ``outputs`` is the parsed content of
        ``<logdir>/.outputs.json`` or None when that file does not exist

    """
    app_id = str(sc.applicationId)

    num_executions = util.num_executors()

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Make SparkUI intuitive by grouping jobs
    sc.setJobGroup(
        os.environ['ML_ID'],
        "{} | ParameterServerStrategy - Distributed Training".format(name))

    server = parameter_server_reservation.Server(num_executions)

    server_addr = server.start()

    num_ps = util.num_param_servers()

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, local_logdir, server_addr,
                      num_ps, evaluator))

    logdir = experiment_utils._get_logdir(app_id, run_id)

    print('Finished Experiment \n')

    path_to_return = logdir + '/.outputs.json'
    if pydoop.hdfs.path.exists(path_to_return):
        # The context manager closes the file, no explicit close() is needed
        with pydoop.hdfs.open(path_to_return, "r") as fi:
            contents = fi.read()
        return logdir, json.loads(contents)

    return logdir, None
예제 #2
0
def _run(sc,
         map_fun,
         run_id,
         args_dict,
         samples,
         direction=Direction.MAX,
         local_logdir=False,
         name="no-name",
         optimization_key=None):
    """
    Run a random search over the boundaries given in *args_dict*.

    For every hyperparameter, *samples* random values are drawn between its
    lower and upper bound (integers when both bounds are ``int``, floats
    otherwise). Duplicate combinations are dropped by ``_remove_duplicates``
    and each remaining combination is run in its own Spark partition.

    Args:
        sc: the SparkContext used to schedule the trials
        map_fun: the user function to evaluate for each combination
        run_id: id of the current run, used to resolve the log directory
        args_dict: dict mapping each hyperparameter name to a two element
            list ``[lower_bound, upper_bound]``
        samples: number of random values to draw per hyperparameter
        direction: Direction.MAX or Direction.MIN, selects which result is
            reported as the best one
        local_logdir: forwarded unchanged to ``_prepare_func``
        name: experiment name used in the Spark job group description
        optimization_key: forwarded to ``_prepare_func`` and
            ``experiment_utils._get_best``

    Returns:
        A tuple ``(best_dir, best_params, best_val, return_dict)`` for the
        best combination according to *direction*

    Raises:
        ValueError: if *direction* is invalid, a boundary list does not have
            exactly two elements, or a bound is neither int nor float

    """

    app_id = str(sc.applicationId)

    # Validate up front so an invalid direction fails before any trial is run
    # (same check as the grid search implementation)
    if direction.upper() not in (Direction.MAX, Direction.MIN):
        raise ValueError('Invalid direction ' + direction +
                         ', must be Direction.MAX or Direction.MIN')

    for bounds in args_dict.values():
        if len(bounds) != 2:
            raise ValueError(
                'Boundary list must contain exactly two elements, [lower_bound, upper_bound] for each hyperparameter'
            )

    random_dict = {}
    for hp in args_dict:
        lower_bound = args_dict[hp][0]
        upper_bound = args_dict[hp][1]

        assert lower_bound < upper_bound, "lower bound: " + str(
            lower_bound) + " must be less than upper bound: " + str(
                upper_bound)

        # Exact type checks (not isinstance) so that bool bounds are rejected
        if type(lower_bound) is int and type(upper_bound) is int:
            random_values = [
                random.randint(lower_bound, upper_bound)
                for _ in range(samples)
            ]
        elif (type(lower_bound) in (int, float)
              and type(upper_bound) in (int, float)):
            random_values = [
                random.uniform(lower_bound, upper_bound)
                for _ in range(samples)
            ]
        else:
            raise ValueError('Only float and int is currently supported')

        random_dict[hp] = random_values

    random_dict, new_samples = _remove_duplicates(random_dict, samples)

    sc.setJobGroup(os.environ['ML_ID'], "{} | Random Search".format(name))
    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(new_samples), new_samples)

    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, random_dict, local_logdir,
                      optimization_key))

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    exp_dir = experiment_utils._get_logdir(app_id, run_id)

    max_val, max_hp, min_val, min_hp, avg, max_return_dict, min_return_dict = experiment_utils._get_best(
        random_dict, new_samples, arg_names, arg_count, exp_dir,
        optimization_key)

    # direction was validated above, so it is either MAX or MIN here
    if direction.upper() == Direction.MAX:
        param_combination = max_hp
        best_val = str(max_val)
        return_dict = max_return_dict
    else:
        param_combination = min_hp
        best_val = str(min_val)
        return_dict = min_return_dict

    print('Finished Experiment \n')

    best_dir = exp_dir + '/' + param_combination

    return best_dir, experiment_utils._get_params_dict(
        best_dir), best_val, return_dict
예제 #3
0
def mirrored(map_fun,
             name='no-name',
             local_logdir=False,
             description=None,
             evaluator=False):
    """
    *Distributed Training*

    Runs *map_fun* through ``mirrored_impl._run`` and records the run as a
    'mirrored' experiment.

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training, local_logdir=True)

    Args:
        :map_fun: function containing the code that uses MirroredStrategy
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    """
    global running, app_id, experiment_json, run_id

    assert util.num_param_servers() == 0, "number of parameter servers should be 0"

    # Only one experiment may be active at a time
    if running:
        raise RuntimeError("An experiment is currently running.")

    worker_count = util.num_executors()
    assert not evaluator or worker_count > 2, "number of workers must be atleast 3 if evaluator is set to True"

    started_at = time.time()
    sc = util._find_spark().sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        run_dir = experiment_utils._get_logdir(app_id, run_id)
        hdfs.mkdir(run_dir)

        experiment_json = experiment_utils._populate_experiment(
            name, 'mirrored', 'DISTRIBUTED_TRAINING', None, description,
            app_id, None, None)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = mirrored_impl._run(
            sc,
            map_fun,
            run_id,
            local_logdir=local_logdir,
            name=name,
            evaluator=evaluator)

        elapsed_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)
        experiment_utils._finalize_experiment(
            experiment_json, None, app_id, run_id, 'FINISHED', elapsed_ms,
            logdir, None, None)

        return logdir, return_dict
    except:
        # Report the elapsed time to the handler, then let the error propagate
        failed_after_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)
        _exception_handler(failed_after_ms)
        raise
    finally:
        _end_run(sc)
예제 #4
0
def collective_all_reduce(map_fun,
                          name='no-name',
                          local_logdir=False,
                          description=None,
                          evaluator=False):
    """
    *Distributed Training*

    Sets up the cluster to run CollectiveAllReduceStrategy.

    The user does not have to set TF_CONFIG, it is exported in the background.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...CollectiveAllReduceStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.collective_all_reduce(distributed_training, local_logdir=True)

    Args:
        :map_fun: the function containing code to run CollectiveAllReduceStrategy
        :name: the name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    """
    global running, app_id, experiment_json, run_id

    ps_count = util.num_param_servers()
    executor_count = util.num_executors()

    assert ps_count == 0, "number of parameter servers should be 0"
    assert executor_count > 1, "number of workers (executors) should be greater than 1"
    assert not evaluator or executor_count > 2, "number of workers must be atleast 3 if evaluator is set to True"

    # Only one experiment may be active at a time
    if running:
        raise RuntimeError("An experiment is currently running.")

    started_at = time.time()
    sc = util._find_spark().sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        run_dir = experiment_utils._get_logdir(app_id, run_id)
        hdfs.mkdir(run_dir)

        experiment_json = experiment_utils._populate_experiment(
            name, 'collective_all_reduce', 'DISTRIBUTED_TRAINING', None,
            description, app_id, None, None)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = allreduce_impl._run(
            sc,
            map_fun,
            run_id,
            local_logdir=local_logdir,
            name=name,
            evaluator=evaluator)

        elapsed_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)
        experiment_utils._finalize_experiment(
            experiment_json, None, app_id, run_id, 'FINISHED', elapsed_ms,
            logdir, None, None)

        return logdir, return_dict
    except:
        # Report the elapsed time to the handler, then let the error propagate
        failed_after_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)
        _exception_handler(failed_after_ms)
        raise
    finally:
        _end_run(sc)
예제 #5
0
def launch(map_fun,
           args_dict=None,
           name='no-name',
           local_logdir=False,
           description=None,
           metric_key=None):
    """

    *Experiment* or *Parallel Experiment*

    Runs the Experiment in *map_fun* once without arguments, or several times
    with different arguments when *args_dict* is given.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>> experiment.launch(train_nn)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> experiment.launch(train_nn)

    Args:
        :map_fun: The function to run
        :args_dict: If specified will run the same function multiple times with different arguments, {'a':[1,2], 'b':[5,3]} would run the function two times with arguments (1,5) and (2,3) provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        HDFS path in your project where the experiment is stored, together
        with the return value handed back by ``launcher._run``

    """
    global running, app_id, experiment_json, run_id

    assert util.num_param_servers() == 0, "number of parameter servers should be 0"

    # Only one experiment may be active at a time
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    started_at = time.time()
    sc = util._find_spark().sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        run_dir = experiment_utils._get_logdir(app_id, run_id)
        hdfs.mkdir(run_dir)

        # Reset the global before it is repopulated for this run
        experiment_json = None
        # The arguments are only serialized when args_dict is non-empty
        serialized_args = json.dumps(args_dict) if args_dict else None
        experiment_json = experiment_utils._populate_experiment(
            name, 'launch', 'EXPERIMENT', serialized_args, description,
            app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = launcher._run(sc, map_fun, run_id, args_dict,
                                            local_logdir)
        elapsed_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)

        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(
            experiment_json, metric, app_id, run_id, 'FINISHED', elapsed_ms,
            logdir, None, None)
        return logdir, return_dict
    except:
        # Report the elapsed time to the handler, then let the error propagate
        failed_after_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)
        _exception_handler(failed_after_ms)
        raise
    finally:
        _end_run(sc)
예제 #6
0
def differential_evolution(objective_function,
                           boundary_dict,
                           direction=Direction.MAX,
                           generations=4,
                           population=6,
                           mutation=0.5,
                           crossover=0.7,
                           name='no-name',
                           local_logdir=False,
                           description=None,
                           optimization_key='metric'):
    """
    *Parallel Experiment*

    Explore the search space of each hyperparameter with differential evolution to find the best hyperparameter combination.
    The objective function is treated as a blackbox returning a metric for a given hyperparameter combination.
    That metric is used to judge how 'good' the combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    import tensorflow
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the corresponding value is a list of two elements, the lower bound of the parameter followed by its upper bound
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """
    global running, app_id, experiment_json, run_id

    assert util.num_param_servers() == 0, "number of parameter servers should be 0"

    # Only one experiment may be active at a time
    if running:
        raise RuntimeError("An experiment is currently running.")

    started_at = time.time()
    sc = util._find_spark().sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        # The implementation module reads the run id from its own attribute
        diff_evo_impl.run_id = run_id

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        serialized_bounds = json.dumps(boundary_dict)
        experiment_json = experiment_utils._populate_experiment(
            name, 'differential_evolution', 'PARALLEL_EXPERIMENTS',
            serialized_bounds, description, app_id, direction,
            optimization_key)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = diff_evo_impl._run(
            objective_function,
            boundary_dict,
            direction=direction,
            generations=generations,
            population=population,
            mutation=mutation,
            crossover=crossover,
            cleanup_generations=False,
            local_logdir=local_logdir,
            name=name,
            optimization_key=optimization_key)

        elapsed_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)

        run_dir = experiment_utils._get_logdir(app_id, run_id)
        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED',
            elapsed_ms, run_dir, logdir, optimization_key)

        return logdir, best_param, return_dict

    except:
        # Report the elapsed time to the handler, then let the error propagate
        failed_after_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)
        _exception_handler(failed_after_ms)
        raise
    finally:
        _end_run(sc)
예제 #7
0
    def _wrapper_fun(iter):
        """
        Executor-side task: join the all-reduce cluster and run ``map_fun``.

        The task registers ``host:port`` with the reservation server at
        ``server_addr``, waits for the full cluster description, assigns its
        own task entry (chief, worker or evaluator), exports it as
        ``TF_CONFIG`` when more than one executor takes part, and then calls
        ``map_fun``. Only the chief hands the return value to
        ``experiment_utils._handle_return_simple``.

        Args:
            iter: iterator over the partition content; the last element is
                used as the executor number

        Returns:
            None

        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            host = experiment_utils._get_ip_address()

            # Bind to port 0 to let the OS pick a free port for this task
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            client = None
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]

                client = allreduce_reservation.Client(server_addr)
                host_port = host + ":" + str(port)

                client.register({"worker": host_port, "index": executor_num})
                cluster = client.await_reservations()
            finally:
                # Release the socket and the client even if the reservation
                # step fails, so a failing task does not leak them
                tmp_socket.close()
                if client is not None:
                    client.close()

            task_index = experiment_utils._find_index(host_port, cluster)

            # An index of -1 marks this task as the chief
            if task_index == -1:
                cluster["task"] = {"type": "chief", "index": 0}
            else:
                cluster["task"] = {"type": "worker", "index": task_index}

            if evaluator:
                # The last worker is moved out of the worker list and
                # becomes the evaluator
                last_worker_index = len(cluster["cluster"]["worker"]) - 1
                evaluator_node = cluster["cluster"]["worker"][
                    last_worker_index]
                cluster["cluster"]["evaluator"] = [evaluator_node]
                del cluster["cluster"]["worker"][last_worker_index]
                if evaluator_node == host_port:
                    cluster["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster))

            if num_executors > 1:
                os.environ["TF_CONFIG"] = json.dumps(cluster)

            is_chief = (cluster["task"]["type"] == "chief")

            is_evaluator = (cluster["task"]["type"] == "evaluator")

            if is_chief:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tensorboard._register(
                    logdir, logdir, executor_num, local_logdir=local_logdir)
            elif is_evaluator:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tensorboard.events_logdir = logdir

            logfile = experiment_utils._init_logger(
                experiment_utils._get_logdir(app_id, run_id),
                role=cluster["task"]["type"],
                index=cluster["task"]["index"])

            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task')
            task_start = time.time()
            retval = map_fun()

            if is_chief:
                experiment_utils._handle_return_simple(
                    retval, experiment_utils._get_logdir(app_id, run_id),
                    logfile)

            task_end = time.time()
            time_str = 'Finished task - took ' + experiment_utils._time_diff(
                task_start, task_end)
            print(time_str)
            print('-------------------------------------------------------')
        finally:
            experiment_utils._cleanup(tensorboard, t)
    def _wrapper_fun(iter):
        """
        Executor-side task: join the parameter server cluster and run ``map_fun``.

        Executors whose number is below ``num_ps`` register as parameter
        servers, all others as workers. After the reservation server has
        returned the full cluster, the task exports ``TF_CONFIG`` and calls
        ``map_fun``. A parameter server runs ``map_fun`` in a separate thread
        and waits until all workers have finished. Only the chief hands the
        return value to ``experiment_utils._handle_return_simple``.

        Args:
            iter: iterator over the partition content; the last element is
                used as the executor number

        Returns:
            None

        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        # Read in the finally clause, so it must exist before the try block
        role = None

        client = parameter_server_reservation.Client(server_addr)

        try:
            host = experiment_utils._get_ip_address()

            # Bind to port 0 to let the OS pick a free port for this task
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]
                host_port = host + ":" + str(port)

                exec_spec = {}
                if executor_num < num_ps:
                    exec_spec["task_type"] = "ps"
                else:
                    exec_spec["task_type"] = "worker"
                exec_spec["host_port"] = host_port
                exec_spec["gpus_present"] = devices.get_num_gpus() > 0

                client.register(exec_spec)

                cluster = client.await_reservations()
            finally:
                # Release the socket even if the reservation step fails
                tmp_socket.close()

            role, index = experiment_utils._find_task_and_index(host_port, cluster)

            cluster_spec = {}
            cluster_spec["cluster"] = cluster
            cluster_spec["task"] = {"type": role, "index": index}

            if evaluator:
                # The last worker is moved out of the worker list and
                # becomes the evaluator
                last_worker_index = len(cluster_spec["cluster"]["worker"])-1
                evaluator_node = cluster_spec["cluster"]["worker"][last_worker_index]
                cluster_spec["cluster"]["evaluator"] = [evaluator_node]
                del cluster_spec["cluster"]["worker"][last_worker_index]
                if evaluator_node == host_port:
                    role = "evaluator"
                    cluster_spec["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster_spec))
            os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

            logfile = experiment_utils._init_logger(experiment_utils._get_logdir(app_id, run_id), role=role, index=cluster_spec["task"]["index"])

            dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir'

            # The task entry lives in cluster_spec, not in the raw cluster
            # mapping, so the role resolved above is used for the chief check
            is_chief = (role == "chief")
            if is_chief:
                hdfs.mkdir(dist_logdir)
                tensorboard._register(dist_logdir, experiment_utils._get_logdir(app_id, run_id), executor_num, local_logdir=local_logdir)
            else:
                tensorboard.events_logdir = dist_logdir

            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task')
            task_start = time.time()

            retval = None
            if role == "ps":
                # Run the user function in the background and block until
                # the workers report that they are done
                ps_thread = threading.Thread(target=map_fun)
                ps_thread.start()
                client.await_all_workers_finished()
            else:
                retval = map_fun()

            if is_chief:
                experiment_utils._handle_return_simple(retval, experiment_utils._get_logdir(app_id, run_id), logfile)

            task_end = time.time()
            time_str = 'Finished task - took ' + experiment_utils._time_diff(task_start, task_end)
            print(time_str)
            print('-------------------------------------------------------')
        finally:
            # Every task that is not a parameter server reports completion,
            # also when it failed
            if role != "ps":
                client.register_worker_finished()
            client.close()
            experiment_utils._cleanup(tensorboard, t)
예제 #9
0
 def log_searchspace(self, app_id, run_id, searchspace):
     """Pass *searchspace* and the run's log directory to ``tensorboard._write_hparams_config``."""
     run_logdir = experiment_utils._get_logdir(app_id, run_id)
     tensorboard._write_hparams_config(run_logdir, searchspace)
예제 #10
0
 def init_ml_tracking(self, app_id, run_id):
     """Pass the log directory of the given run to ``tensorboard._register``."""
     run_logdir = experiment_utils._get_logdir(app_id, run_id)
     tensorboard._register(run_logdir)
예제 #11
0
def _run(sc,
         map_fun,
         run_id,
         args_dict,
         direction=Direction.MAX,
         local_logdir=False,
         name="no-name",
         optimization_key=None):
    """
    Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Every value of *args_dict* is a list with one entry per execution; the
    i-th execution uses the i-th entry of every list, so all lists must have
    the same length.

    Args:
        sc: the SparkContext used to schedule the executions
        map_fun: the user function to evaluate for each combination
        run_id: id of the current run, used to resolve the log directory
        args_dict: dict mapping each argument name to its list of values
        direction: Direction.MAX or Direction.MIN, selects which result is
            reported as the best one
        local_logdir: forwarded unchanged to ``_prepare_func``
        name: experiment name used in the Spark job group description
        optimization_key: forwarded to ``_prepare_func`` and
            ``experiment_utils._get_best``

    Returns:
        A tuple ``(best_dir, best_params, best_val, return_dict)`` for the
        best combination according to *direction*

    Raises:
        ValueError: if *direction* is invalid, *args_dict* is empty, or the
            argument lists differ in length

    """
    app_id = str(sc.applicationId)

    if direction.upper() not in (Direction.MAX, Direction.MIN):
        raise ValueError('Invalid direction ' + direction +
                         ', must be Direction.MAX or Direction.MIN')

    arg_lists = list(args_dict.values())
    if not arg_lists:
        # Report an empty dictionary explicitly instead of an IndexError
        raise ValueError('args_dict must contain at least one argument list')

    # All lists must agree on the number of executions
    num_executions = len(arg_lists[0])
    if any(len(arg_list) != num_executions for arg_list in arg_lists):
        raise ValueError(
            'Length of each function argument list must be equal')

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Make SparkUI intuitive by grouping jobs
    sc.setJobGroup(os.environ['ML_ID'], "{} | Grid Search".format(name))

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir,
                      optimization_key))

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    exp_dir = experiment_utils._get_logdir(app_id, run_id)

    max_val, max_hp, min_val, min_hp, avg, max_return_dict, min_return_dict = experiment_utils._get_best(
        args_dict, num_executions, arg_names, arg_count, exp_dir,
        optimization_key)

    # direction was validated above, so it is either MAX or MIN here
    if direction.upper() == Direction.MAX:
        param_combination = max_hp
        best_val = str(max_val)
        return_dict = max_return_dict
    else:
        param_combination = min_hp
        best_val = str(min_val)
        return_dict = min_return_dict

    print('Finished Experiment \n')

    best_dir = exp_dir + '/' + param_combination

    return best_dir, experiment_utils._get_params_dict(
        best_dir), best_val, return_dict
예제 #12
0
def lagom(
    map_fun,
    name="no-name",
    experiment_type="optimization",
    searchspace=None,
    optimizer=None,
    direction="max",
    num_trials=1,
    ablation_study=None,
    ablator=None,
    optimization_key="metric",
    hb_interval=1,
    es_policy="median",
    es_interval=300,
    es_min=10,
    description="",
):
    """Launches a maggy experiment, which depending on `experiment_type` can
    either be a hyperparameter optimization or an ablation study experiment.
    Given a search space, objective and a model training procedure `map_fun`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    The module-level globals `running`, `app_id`, `run_id` and
    `experiment_json` are updated by this call; `run_id` is incremented and
    `running` is reset when the call ends, whether it succeeded or failed.

    :param map_fun: User defined experiment containing the model training.
    :type map_fun: function
    :param name: A user defined experiment identifier.
    :type name: str
    :param experiment_type: Type of Maggy experiment, either 'optimization'
        (default) or 'ablation'.
    :type experiment_type: str
    :param searchspace: A maggy Searchspace object from which samples are
        drawn.
    :type searchspace: Searchspace
    :param optimizer: The optimizer is the part generating new trials.
    :type optimizer: str, AbstractOptimizer
    :param direction: If set to 'max' the highest value returned will
        correspond to the best solution, if set to 'min' the opposite is true.
    :type direction: str
    :param num_trials: the number of trials to evaluate given the search space,
        each containing a different hyperparameter combination
    :type num_trials: int
    :param ablation_study: Ablation study object. Can be None for optimization
        experiment type.
    :type ablation_study: AblationStudy
    :param ablator: Ablator to use for experiment type 'ablation'.
    :type ablator: str, AbstractAblator
    :param optimization_key: Name of the metric to be optimized
    :type optimization_key: str, optional
    :param hb_interval: The heartbeat interval in seconds from trial executor
        to experiment driver, defaults to 1
    :type hb_interval: int, optional
    :param es_policy: The earlystopping policy, defaults to 'median'
    :type es_policy: str, optional
    :param es_interval: Frequency interval in seconds to check currently
        running trials for early stopping, defaults to 300
    :type es_interval: int, optional
    :param es_min: Minimum number of trials finalized before checking for
        early stopping, defaults to 10
    :type es_min: int, optional
    :param description: A longer description of the experiment.
    :type description: str, optional
    :raises RuntimeError: An experiment is currently running, or
        `experiment_type` is neither 'optimization' nor 'ablation'.
    :raises AssertionError: `num_trials` is not greater than zero for an
        'optimization' experiment.
    :return: A dictionary indicating the best trial and best hyperparameter
        combination with its performance metric
    :rtype: dict
    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    job_start = time.time()
    sc = hopsutil._find_spark().sparkContext
    exp_driver = None

    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        app_id, run_id = util._validate_ml_id(app_id, run_id)

        # start run
        running = True
        experiment_utils._set_ml_id(app_id, run_id)

        # create experiment dir
        experiment_utils._create_experiment_dir(app_id, run_id)

        tensorboard._register(experiment_utils._get_logdir(app_id, run_id))

        num_executors = util.num_executors(sc)

        # start experiment driver
        if experiment_type == "optimization":

            assert num_trials > 0, "number of trials should be greater than zero"
            tensorboard._write_hparams_config(
                experiment_utils._get_logdir(app_id, run_id), searchspace
            )

            # Never use more executors than there are trials to run.
            num_executors = min(num_executors, num_trials)

            exp_driver = experimentdriver.ExperimentDriver(
                "optimization",
                searchspace=searchspace,
                optimizer=optimizer,
                direction=direction,
                num_trials=num_trials,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                es_policy=es_policy,
                es_interval=es_interval,
                es_min=es_min,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )

            exp_function = exp_driver.optimizer.name()

        elif experiment_type == "ablation":
            exp_driver = experimentdriver.ExperimentDriver(
                "ablation",
                ablation_study=ablation_study,
                ablator=ablator,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )
            # using exp_driver.num_executor since
            # it has been set using ablator.get_number_of_trials()
            # in experiment.py
            num_executors = min(num_executors, exp_driver.num_executors)

            exp_function = exp_driver.ablator.name()
        else:
            # Reset the flag first so the `finally` clause does not try to
            # stop a driver that was never created.
            running = False
            raise RuntimeError(
                "Unknown experiment_type:"
                "should be either 'optimization' or 'ablation', "
                "But it is '{0}'".format(str(experiment_type))
            )

        # One partition per executor, so each executor runs one task.
        nodeRDD = sc.parallelize(range(num_executors), num_executors)

        # Do provenance after initializing exp_driver, because exp_driver does
        # the type checks for optimizer and searchspace
        sc.setJobGroup(os.environ["ML_ID"], "{0} | {1}".format(name, exp_function))

        # NOTE(review): for 'ablation' no searchspace is passed to the driver
        # above -- confirm the driver still exposes `searchspace` in that case.
        experiment_json = experiment_utils._populate_experiment(
            name,
            exp_function,
            "MAGGY",
            exp_driver.searchspace.json(),
            description,
            app_id,
            direction,
            optimization_key,
        )

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, "CREATE"
        )

        util._log(
            "Started Maggy Experiment: {0}, {1}, run {2}".format(name, app_id, run_id)
        )

        exp_driver.init(job_start)

        server_addr = exp_driver.server_addr

        # Force execution on executor, since GPU is located on executor
        nodeRDD.foreachPartition(
            trialexecutor._prepare_func(
                app_id,
                run_id,
                experiment_type,
                map_fun,
                server_addr,
                hb_interval,
                exp_driver._secret,
                optimization_key,
                experiment_utils._get_logdir(app_id, run_id),
            )
        )
        job_end = time.time()

        result = exp_driver.finalize(job_end)
        best_logdir = (
            experiment_utils._get_logdir(app_id, run_id) + "/" + result["best_id"]
        )

        util._finalize_experiment(
            experiment_json,
            float(result["best_val"]),
            app_id,
            run_id,
            "FINISHED",
            exp_driver.duration,
            experiment_utils._get_logdir(app_id, run_id),
            best_logdir,
            optimization_key,
        )

        util._log("Finished Experiment")

        return result

    except:  # noqa: E722
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - job_start)
        )
        # Prefer the exception recorded by the driver, if any, over the one
        # that surfaced here.
        if exp_driver and exp_driver.exception:
            raise exp_driver.exception
        raise
    finally:
        # grace period to send last logs to sparkmagic
        # sparkmagic hb poll interval is 5 seconds, therefore wait 6 seconds
        time.sleep(6)
        # cleanup spark jobs
        if running and exp_driver is not None:
            exp_driver.stop()
        run_id += 1
        running = False
        sc.setJobGroup("", "")
    # Every path through the try statement either returns or raises, so
    # nothing can run after it.
예제 #13
0
def _run(sc,
         map_fun,
         run_id,
         args_dict=None,
         local_logdir=False,
         name="no-name"):
    """

    Args:
        sc:
        map_fun:
        args_dict:
        local_logdir:
        name:

    Returns:

    """

    app_id = str(sc.applicationId)

    if args_dict == None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError(
                    'Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    sc.setJobGroup(os.environ['ML_ID'],
                   "{} | Launcher running experiment".format(name))
    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    # For single run return .return if exists
    if args_dict == None:
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    elif num_executions == 1:
        arg_count = six.get_function_code(map_fun).co_argcount
        arg_names = six.get_function_code(map_fun).co_varnames
        argIndex = 0
        param_string = ''
        while arg_count > 0:
            param_name = arg_names[argIndex]
            param_val = args_dict[param_name][0]
            param_string += str(param_name) + '=' + str(param_val) + '&'
            arg_count -= 1
            argIndex += 1
        param_string = param_string[:-1]
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/' + param_string + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    else:
        return experiment_utils._get_logdir(app_id, run_id), None
예제 #14
0
    def _wrapper_fun(iter):
        """
        Run map_fun once on this partition and record its return value.

        Uses app_id, run_id, map_fun, args_dict and local_logdir from the
        enclosing scope. When args_dict is truthy, map_fun is called with the
        arguments built for this executor and logs go to a per-combination
        subdirectory; otherwise map_fun is called without arguments and logs
        go to the experiment log directory itself.

        Args:
            iter: iterator over the partition's elements; the last element is
                used as the executor number

        Returns:
            None

        """
        separator = '-------------------------------------------------------'

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        hdfs_exec_logdir = experiment_utils._get_logdir(app_id, run_id)

        # The GPU utilization printer is only started when a GPU is present;
        # the thread object is handed to _cleanup either way.
        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            has_params = bool(args_dict)
            if has_params:
                param_string, params, args = experiment_utils.build_parameters(
                    map_fun, executor_num, args_dict)
                hdfs_exec_logdir, hdfs_appid_logdir = experiment_utils._create_experiment_subdirectories(
                    app_id, run_id, param_string, 'grid_search', params=params)
                logfile = experiment_utils._init_logger(hdfs_exec_logdir)
                # Return value is unpacked as before but not used further here
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
            else:
                args = ()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_exec_logdir,
                    executor_num,
                    local_logdir=local_logdir)
                logfile = experiment_utils._init_logger(hdfs_exec_logdir)

            print(devices._get_gpu_info())
            print(separator)
            if has_params:
                print('Started running task ' + param_string)
            else:
                print('Started running task')

            task_start = time.time()
            retval = map_fun(*args)
            task_end = time.time()

            experiment_utils._handle_return_simple(retval,
                                                   hdfs_exec_logdir,
                                                   logfile)
            if has_params:
                time_str = 'Finished task ' + param_string + ' - took ' + experiment_utils._time_diff(
                    task_start, task_end)
            else:
                time_str = 'Finished task - took ' + experiment_utils._time_diff(
                    task_start, task_end)
            print(time_str)
            print(separator)
        finally:
            # Exceptions propagate unchanged; cleanup always runs
            experiment_utils._cleanup(tensorboard, t)
예제 #15
0
def random_search(train_fn,
                  boundary_dict,
                  direction=Direction.MAX,
                  samples=10,
                  name='no-name',
                  local_logdir=False,
                  description=None,
                  optimization_key='metric'):
    """

    *Parallel Experiment*

    Run an Experiment contained in *train_fn* for configured number of random samples controlled by the *samples* parameter. Each hyperparameter is contained in *boundary_dict* with the key
    corresponding to the name of the hyperparameter and a list containing two elements defining the lower and upper bound.
    The experiment must return a metric corresponding to how 'good' the given hyperparameter combination is.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.random_search(train_nn, boundary_dict, direction='max')

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.random_search(train_nn, boundary_dict, direction='max', optimization_key='accuracy')


    Args:
        :train_fn: The function to run
        :boundary_dict: dict containing hyperparameter name and corresponding boundaries, each experiment randomize a value in the boundary range.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :samples: the number of random samples to evaluate for each hyperparameter given the boundaries, for example samples=3 would result in 3 hyperparameter combinations in total to evaluate
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    Raises:
        :AssertionError: if the number of parameter servers is not 0
        :RuntimeError: if an experiment is already running

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'random_search', 'PARALLEL_EXPERIMENTS',
            json.dumps(boundary_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        # Forward `name` so the implementation does not fall back to its
        # default experiment name (grid_search forwards it the same way).
        logdir, best_param, best_metric, return_dict = r_search_impl._run(
            sc,
            train_fn,
            run_id,
            boundary_dict,
            samples,
            direction=direction,
            local_logdir=local_logdir,
            name=name,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        # Hand the elapsed time to the handler, then let the original
        # exception propagate to the caller.
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
예제 #16
0
def grid_search(train_fn,
                grid_dict,
                direction=Direction.MAX,
                name='no-name',
                local_logdir=False,
                description=None,
                optimization_key='metric'):
    """
    *Parallel Experiment*

    Run a grid search over a predefined set of hyperparameter values.
    *train_fn* is treated as a black box: it is called with one hyperparameter combination
    and returns a metric telling how 'good' that combination was.

    Example usage:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :train_fn: the function to run, must return a metric
        :grid_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """
    global running
    global app_id
    global experiment_json
    global run_id

    assert util.num_param_servers() == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running.")

    started_at = time.time()
    sc = util._find_spark().sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        # Describe the experiment, then attach that description on creation
        experiment_json = experiment_utils._populate_experiment(
            name,
            'grid_search',
            'PARALLEL_EXPERIMENTS',
            json.dumps(grid_dict),
            description,
            app_id,
            direction,
            optimization_key)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        param_grid = experiment_utils.grid_params(grid_dict)

        outcome = grid_search_impl._run(
            sc,
            train_fn,
            run_id,
            param_grid,
            direction=direction,
            local_logdir=local_logdir,
            name=name,
            optimization_key=optimization_key)
        logdir, best_param, best_metric, return_dict = outcome

        elapsed_ms = experiment_utils._seconds_to_milliseconds(
            time.time() - started_at)

        experiment_utils._finalize_experiment(
            experiment_json,
            best_metric,
            app_id,
            run_id,
            'FINISHED',
            elapsed_ms,
            experiment_utils._get_logdir(app_id, run_id),
            logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        # Report the elapsed time to the handler, then re-raise unchanged
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() -
                                                      started_at))
        raise
    finally:
        _end_run(sc)
예제 #17
0
 def get_logdir(self, app_id, run_id):
     """Return the log directory that experiment_utils resolves for app_id and run_id."""
     logdir = experiment_utils._get_logdir(app_id, run_id)
     return logdir