Exemplo n.º 1
0
def visualize(hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS. This is intended for use after running TensorFlow jobs to visualize
    them all in the same TensorBoard. tflauncher.launch returns the path in HopsFS which should be handed as argument for this method to visualize all runs.

    Args:
      :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """

    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
        os.makedirs(logdir)
    else:
        os.makedirs(logdir)

        #find free port
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()

    tb_path = util._find_tensorboard()

    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen([
        pypath, tb_path,
        "--logdir=%s" % logdir,
        "--port=%d" % tb_port,
        "--host=%s" % "0.0.0.0"
    ],
                               env=tb_env,
                               preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir(
    ) + "/" + app_id + "/TensorBoard.visualize"
    #dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if not extension == '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    tb_proc.wait()
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
Exemplo n.º 2
0
    def write_featureframe(self):
        """
        Writes a dataframe of data as a training dataset on HDFS in the petastorm format

        Returns:
            None

        Raises:
              :ValueError: if not petastorm schema was provided
        """
        spark = util._find_spark()
        if constants.PETASTORM_CONFIG.SCHEMA in self.petastorm_args:
            schema = self.petastorm_args[constants.PETASTORM_CONFIG.SCHEMA]
            del self.petastorm_args[constants.PETASTORM_CONFIG.SCHEMA]
        else:
            raise ValueError(
                "Required petastorm argument 'schema' is not defined in petastorm_args dict"
            )
        if constants.PETASTORM_CONFIG.FILESYSTEM_FACTORY in self.petastorm_args:
            filesystem_factory = self.petastorm_args[
                constants.PETASTORM_CONFIG.FILESYSTEM_FACTORY]
            del self.petastorm_args[
                constants.PETASTORM_CONFIG.FILESYSTEM_FACTORY]
        else:
            filesystem_factory = lambda: pa.hdfs.connect(
                driver=constants.PETASTORM_CONFIG.LIBHDFS)
        with materialize_dataset(spark,
                                 self.path,
                                 schema,
                                 filesystem_factory=filesystem_factory,
                                 **self.petastorm_args):
            self.df.write.mode(self.write_mode).parquet(self.path)
Exemplo n.º 3
0
def num_executors(sc):
    """
    Get the number of executors configured for Jupyter

    :param sc: The SparkContext to take the executors from.
    :type sc: [SparkContext
    :return: Number of configured executors for Jupyter
    :rtype: int
    """
    sc = hopsutil._find_spark().sparkContext
    try:
        return int(sc._conf.get("spark.dynamicAllocation.maxExecutors"))
    except:  # noqa: E722
        raise RuntimeError(
            "Failed to find spark.dynamicAllocation.maxExecutors property, \
            please select your mode as either Experiment, Parallel \
            Experiments or Distributed Training.")
Exemplo n.º 4
0
def _set_spark_hadoop_conf(json_content):
    spark = None
    if constants.ENV_VARIABLES.SPARK_IS_DRIVER in os.environ:
        spark = util._find_spark()
    if spark is not None:
        sc = spark.sparkContext
        sc._jsc.hadoopConfiguration().set(
            constants.S3_CONFIG.S3_CREDENTIAL_PROVIDER_ENV,
            constants.S3_CONFIG.S3_TEMPORARY_CREDENTIAL_PROVIDER)
        sc._jsc.hadoopConfiguration().set(
            constants.S3_CONFIG.S3_ACCESS_KEY_ENV,
            json_content[constants.REST_CONFIG.JSON_ACCESS_KEY_ID])
        sc._jsc.hadoopConfiguration().set(
            constants.S3_CONFIG.S3_SECRET_KEY_ENV,
            json_content[constants.REST_CONFIG.JSON_SECRET_KEY_ID])
        sc._jsc.hadoopConfiguration().set(
            constants.S3_CONFIG.S3_SESSION_KEY_ENV,
            json_content[constants.REST_CONFIG.JSON_SESSION_TOKEN_ID])
Exemplo n.º 5
0
def _convert_dataframe_to_spark(dataframe):
    """
    Helper method for converting a user-provided dataframe into a spark dataframe

    Args:
        :dataframe: the input dataframe (supported types are spark rdds, spark dataframes, pandas dataframes,
                    python 2D lists, and numpy 2D arrays)

    Returns:
        the dataframe convertd to a spark dataframe

    Raises:
        :CouldNotConvertDataframe: in case the provided dataframe could not be converted to a spark dataframe
    """
    spark = util._find_spark()
    if isinstance(dataframe, pd.DataFrame):
        sc = spark.sparkContext
        sql_context = SQLContext(sc)
        return sql_context.createDataFrame(dataframe)
    if isinstance(dataframe, list):
        dataframe = np.array(dataframe)
    if isinstance(dataframe, np.ndarray):
        if dataframe.ndim != 2:
            raise CouldNotConvertDataframe(
                "Cannot convert numpy array that do not have two dimensions to a dataframe. "
                "The number of dimensions are: {}".format(dataframe.ndim))
        num_cols = dataframe.shape[1]
        dataframe_dict = {}
        for n_col in list(range(num_cols)):
            col_name = "col_" + str(n_col)
            dataframe_dict[col_name] = dataframe[:, n_col]
        pandas_df = pd.DataFrame(dataframe_dict)
        sc = spark.sparkContext
        sql_context = SQLContext(sc)
        return sql_context.createDataFrame(pandas_df)
    if isinstance(dataframe, RDD):
        return dataframe.toDF()
    if isinstance(dataframe, DataFrame):
        return dataframe
    raise CouldNotConvertDataframe(
        "The provided dataframe type is not recognized. Supported types are: spark rdds, spark dataframes, "
        "pandas dataframes, python 2D lists, and numpy 2D arrays. The provided dataframe has type: {}"
        .format(type(dataframe)))
Exemplo n.º 6
0
def launch(map_fun, args_dict=None, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """

    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *map_fun* one time with no arguments or multiple times with different arguments if
    *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    logdir = tensorboard.logdir()
    >>>    # code for preprocessing, training and exporting model
    >>>    # optionally return a value for the experiment which is registered in Experiments service
    >>> experiment.launch(train_nn)

    Args:
        :map_fun: The function to run
        :args_dict: If specified will run the same function multiple times with different arguments, {'a':[1,2], 'b':[5,3]}
         would run the function two times with arguments (1,5) and (2,3) provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        launcher.run_id = launcher.run_id + 1

        versioned_path = util._version_resources(versioned_resources, launcher._get_logdir(app_id))

        experiment_json = None
        if args_dict:
            experiment_json = util._populate_experiment(sc, name, 'experiment', 'launcher', launcher._get_logdir(app_id), json.dumps(args_dict), versioned_path, description)
        else:
            experiment_json = util._populate_experiment(sc, name, 'experiment', 'launcher', launcher._get_logdir(app_id), None, versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, tensorboard_logdir = launcher._launch(sc, map_fun, args_dict, local_logdir)

        util._version_resources(versioned_resources, launcher._get_logdir(app_id))

        if retval:
            experiment_json = util._finalize_experiment(experiment_json, None, retval)
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
            return tensorboard_logdir

        experiment_json = util._finalize_experiment(experiment_json, None, None)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id +=1
        running = False
        sc.setJobGroup("", "")
    return tensorboard_logdir
Exemplo n.º 7
0
def parameter_server(map_fun,
                     name='no-name',
                     local_logdir=False,
                     description=None,
                     evaluator=False):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:f
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    """
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps cannot be greater than num_executors (i.e. num_executors == num_ps + num_workers)"
    if evaluator:
        assert num_executors - num_ps > 2, "number of workers must be atleast 3 if evaluator is set to True"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'parameter_server', 'DISTRIBUTED_TRAINING', None,
            description, app_id, None, None)

        experiment_json = experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = ps_impl._run(sc,
                                           map_fun,
                                           run_id,
                                           local_logdir=local_logdir,
                                           name=name,
                                           evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(experiment_json, None, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Exemplo n.º 8
0
def differential_evolution(objective_function,
                           boundary_dict,
                           direction=Direction.MAX,
                           generations=4,
                           population=6,
                           mutation=0.5,
                           crossover=0.7,
                           name='no-name',
                           local_logdir=False,
                           description=None,
                           optimization_key='metric'):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    import tensorflow
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the correspond value should be a list of two elements. The first element being the lower bound for the parameter and the the second element the upper bound.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        diff_evo_impl.run_id = run_id

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'differential_evolution', 'PARALLEL_EXPERIMENTS',
            json.dumps(boundary_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = diff_evo_impl._run(
            objective_function,
            boundary_dict,
            direction=direction,
            generations=generations,
            population=population,
            mutation=mutation,
            crossover=crossover,
            cleanup_generations=False,
            local_logdir=local_logdir,
            name=name,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict

    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Exemplo n.º 9
0
def mirrored(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training* single machine - multiple GPUs

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training)

    Args:
        :map_fun: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        mirrored_impl.run_id = mirrored_impl.run_id + 1

        versioned_path = util._version_resources(versioned_resources, mirrored_impl._get_logdir(app_id))

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'mirrored', mirrored_impl._get_logdir(app_id), None, versioned_path, description)

        util._version_resources(versioned_resources, mirrored_impl._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, logdir = mirrored_impl._launch(sc, map_fun, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, None, retval)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id +=1
        running = False
        sc.setJobGroup("", "")

    return logdir
Exemplo n.º 10
0
def begin(name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    Start a custom Experiment, at the end of the experiment call *end(metric)*.

    *IMPORTANT* - This call should not be combined with other functions in the experiment module, other than *end*.
    Other experiment functions such as *grid_search* manages the *begin* and *end* functions internally

    Example usage:

    >>> from hops import experiment
    >>> experiment.begin(name='calculate pi')
    >>> # Code to calculate pi
    >>> pi = calc_pi()
    >>> experiment.end(pi)

    Args:
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.stop() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        global run_id
        global driver_tensorboard_hdfs_path

        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        run_id = run_id + 1

        versioned_path = util._version_resources(versioned_resources, _get_logdir(app_id))

        experiment_json = None

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'begin', _get_logdir(app_id), None, versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(app_id, run_id, None, 'begin')

        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())

        hopshdfs._init_logger()

        driver_tensorboard_hdfs_path,_ = tensorboard._register(hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)
    except:
        _exception_handler()
        raise

    return driver_tensorboard_hdfs_path
Exemplo n.º 11
0
def parameter_server(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps cannot be greater than num_executors (i.e. num_executors == num_ps + num_workers)"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        ps.run_id = ps.run_id + 1

        versioned_path = util._version_resources(versioned_resources, ps._get_logdir(app_id))

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'parameter_server', ps._get_logdir(app_id), None, versioned_path, description)

        util._version_resources(versioned_resources, ps._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, logdir = ps._launch(sc, map_fun, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, None, retval)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id +=1
        running = False
        sc.setJobGroup("", "")

    return logdir
Exemplo n.º 12
0
def random_search(map_fun,
                  boundary_dict,
                  direction='max',
                  samples=10,
                  name='no-name',
                  local_logdir=False,
                  versioned_resources=None,
                  description=None):
    """

    *Parallel Experiment*

    Run an Experiment contained in *map_fun* for configured number of random samples controlled by the *samples* parameter. Each hyperparameter is contained in *boundary_dict* with the key
    corresponding to the name of the hyperparameter and a list containing two elements defining the lower and upper bound.
    The experiment must return a metric corresponding to how 'good' the given hyperparameter combination is.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.random_search(train_nn, boundary_dict, samples=14, direction='max')

    Args:
        :map_fun: The function to run
        :boundary_dict: dict containing hyperparameter name and corresponding boundaries, each experiment randomize a value in the boundary range.
        :direction: If set to 'max' the highest value returned will correspond to the best solution, if set to 'min' the opposite is true
        :samples: the number of random samples to evaluate for each hyperparameter given the boundaries
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        r_search.run_id = r_search.run_id + 1

        versioned_path = util._version_resources(versioned_resources,
                                                 r_search._get_logdir(app_id))

        experiment_json = None

        experiment_json = util._populate_experiment(
            sc, name, 'experiment', 'random_search',
            r_search._get_logdir(app_id), json.dumps(boundary_dict),
            versioned_path, description)

        util._version_resources(versioned_resources,
                                r_search._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                          experiment_json)

        tensorboard_logdir, param, metric = r_search._launch(
            sc,
            map_fun,
            boundary_dict,
            samples,
            direction=direction,
            local_logdir=local_logdir)

        experiment_json = util._finalize_experiment(experiment_json, param,
                                                    metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                          experiment_json)

        return tensorboard_logdir

    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")
    return tensorboard_logdir
Exemplo n.º 13
0
def grid_search(train_fn,
                grid_dict,
                direction=Direction.MAX,
                name='no-name',
                local_logdir=False,
                description=None,
                optimization_key='metric'):
    """
    *Parallel Experiment*

    Run grid search evolution to explore a predefined set of hyperparameter combinations.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :train_fn: the function to run, must return a metric
        :grid_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'grid_search', 'PARALLEL_EXPERIMENTS', json.dumps(grid_dict),
            description, app_id, direction, optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        grid_params = experiment_utils.grid_params(grid_dict)

        logdir, best_param, best_metric, return_dict = grid_search_impl._run(
            sc,
            train_fn,
            run_id,
            grid_params,
            direction=direction,
            local_logdir=local_logdir,
            name=name,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Exemplo n.º 14
0
def random_search(train_fn,
                  boundary_dict,
                  direction=Direction.MAX,
                  samples=10,
                  name='no-name',
                  local_logdir=False,
                  description=None,
                  optimization_key='metric'):
    """

    *Parallel Experiment*

    Run an Experiment contained in *train_fn* for configured number of random samples controlled by the *samples* parameter. Each hyperparameter is contained in *boundary_dict* with the key
    corresponding to the name of the hyperparameter and a list containing two elements defining the lower and upper bound.
    The experiment must return a metric corresponding to how 'good' the given hyperparameter combination is.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction='max')

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction='max', optimization_key='accuracy')


    Args:
        :train_fn: The function to run
        :boundary_dict: dict containing hyperparameter name and corresponding boundaries, each experiment randomize a value in the boundary range.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minize the returned metric
        :samples: the number of random samples to evaluate for each hyperparameter given the boundaries, for example samples=3 would result in 3 hyperparameter combinations in total to evaluate
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'random_search', 'PARALLEL_EXPERIMENTS',
            json.dumps(boundary_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = r_search_impl._run(
            sc,
            train_fn,
            run_id,
            boundary_dict,
            samples,
            direction=direction,
            local_logdir=local_logdir,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Exemplo n.º 15
0
    def finalize(self, job_end):

        results = ""

        if self.experiment_type == "optimization":

            _ = self.optimizer.finalize_experiment(self._final_store)

            self.job_end = job_end

            self.duration = experiment_utils._seconds_to_milliseconds(
                self.job_end - self.job_start)

            self.duration_str = experiment_utils._time_diff(
                self.job_start, self.job_end)

            results = ("\n------ " + self.optimizer.name() +
                       " Results ------ direction(" + self.direction + ") \n"
                       "BEST combination " +
                       json.dumps(self.result["best_hp"]) + " -- metric " +
                       str(self.result["best_val"]) + "\n"
                       "WORST combination " +
                       json.dumps(self.result["worst_hp"]) + " -- metric " +
                       str(self.result["worst_val"]) + "\n"
                       "AVERAGE metric -- " + str(self.result["avg"]) + "\n"
                       "EARLY STOPPED Trials -- " +
                       str(self.result["early_stopped"]) + "\n"
                       "Total job time " + self.duration_str + "\n")

        elif self.experiment_type == "ablation":

            _ = self.ablator.finalize_experiment(self._final_store)
            self.job_end = job_end

            self.duration = experiment_utils._seconds_to_milliseconds(
                self.job_end - self.job_start)

            self.duration_str = experiment_utils._time_diff(
                self.job_start, self.job_end)

            results = ("\n------ " + self.ablator.name() +
                       " Results ------ \n" + "BEST Config Excludes " +
                       json.dumps(self.result["best_config"]) + " -- metric " +
                       str(self.result["best_val"]) + "\n" +
                       "WORST Config Excludes " +
                       json.dumps(self.result["worst_config"]) +
                       " -- metric " + str(self.result["worst_val"]) + "\n" +
                       "AVERAGE metric -- " + str(self.result["avg"]) + "\n" +
                       "Total Job Time " + self.duration_str + "\n")

        print(results)

        self._log(results)

        hopshdfs.dump(
            json.dumps(self.result, default=util.json_default_numpy),
            self.log_dir + "/result.json",
        )
        sc = hopsutil._find_spark().sparkContext
        hopshdfs.dump(self.json(sc), self.log_dir + "/maggy.json")

        return self.result
Exemplo n.º 16
0
Arquivo: rpc.py Projeto: carlee0/maggy
    def start(self, exp_driver):
        """
        Start listener in a background thread.

        Returns:
            address of the Server as a tuple of (host, port)
        """
        global server_host_port

        server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        if not server_host_port:
            server_sock.bind(("", 0))
            # hostname may not be resolvable but IP address probably will be
            host = experiment_utils._get_ip_address()
            port = server_sock.getsockname()[1]
            server_host_port = (host, port)

            # register this driver with Hopsworks
            sc = hopsutil._find_spark().sparkContext
            app_id = str(sc.applicationId)

            method = hopsconstants.HTTP_CONFIG.HTTP_POST
            resource_url = (hopsconstants.DELIMITERS.SLASH_DELIMITER +
                            hopsconstants.REST_CONFIG.HOPSWORKS_REST_RESOURCE +
                            hopsconstants.DELIMITERS.SLASH_DELIMITER +
                            "maggy" +
                            hopsconstants.DELIMITERS.SLASH_DELIMITER +
                            "drivers")
            json_contents = {
                "hostIp": host,
                "port": port,
                "appId": app_id,
                "secret": exp_driver._secret,
            }
            json_embeddable = json.dumps(json_contents)
            headers = {
                hopsconstants.HTTP_CONFIG.HTTP_CONTENT_TYPE:
                hopsconstants.HTTP_CONFIG.HTTP_APPLICATION_JSON
            }

            try:
                response = hopsutil.send_request(method,
                                                 resource_url,
                                                 data=json_embeddable,
                                                 headers=headers)

                if (response.status_code // 100) != 2:
                    print("No connection to Hopsworks for logging.")
                    exp_driver._log("No connection to Hopsworks for logging.")
            except Exception as e:
                print("Connection failed to Hopsworks. No logging.")
                exp_driver._log(e)
                exp_driver._log("Connection failed to Hopsworks. No logging.")
        else:
            server_sock.bind(server_host_port)
        server_sock.listen(10)

        def _listen(self, sock, driver):
            CONNECTIONS = []
            CONNECTIONS.append(sock)

            while not self.done:
                read_socks, _, _ = select.select(CONNECTIONS, [], [], 60)
                for sock in read_socks:
                    if sock == server_sock:
                        client_sock, client_addr = sock.accept()
                        CONNECTIONS.append(client_sock)
                        _ = client_addr
                    else:
                        try:
                            msg = self.receive(sock)

                            # raise exception if secret does not match
                            # so client socket gets closed
                            if not secrets.compare_digest(
                                    msg["secret"], exp_driver._secret):
                                exp_driver._log("SERVER secret: {}".format(
                                    exp_driver._secret))
                                exp_driver._log(
                                    "ERROR: wrong secret {}".format(
                                        msg["secret"]))
                                raise Exception

                            self._handle_message(sock, msg, driver)
                        except Exception as e:
                            _ = e
                            sock.close()
                            CONNECTIONS.remove(sock)

            server_sock.close()

        t = threading.Thread(target=_listen,
                             args=(self, server_sock, exp_driver))
        t.daemon = True
        t.start()

        return server_host_port
Exemplo n.º 17
0
def _run(train_fn,
         search_dict,
         direction=Direction.MAX,
         generations=4,
         population=6,
         mutation=0.5,
         crossover=0.7,
         cleanup_generations=False,
         local_logdir=False,
         name="no-name",
         optimization_key=None):
    """

    Args:
        :train_fn:
        :search_dict:
        :direction:
        :generations:
        :population:
        :mutation:
        :crossover:
        :cleanup_generations:
        :local_logdir:
        :name:
        :optimization_key:

    Returns:

    """

    global run_id
    global local_logdir_bool
    local_logdir_bool = local_logdir

    global spark
    spark = util._find_spark()

    global objective_function
    objective_function = train_fn

    global cleanup
    cleanup = cleanup_generations

    global opt_key
    opt_key = optimization_key

    argcount = six.get_function_code(train_fn).co_argcount
    arg_names = six.get_function_code(train_fn).co_varnames

    ordered_arr = []

    app_id = spark.sparkContext.applicationId

    arg_lists = list(search_dict.values())
    for i in range(len(arg_lists)):
        if len(arg_lists[i]) != 2:
            raise ValueError(
                'Boundary list must contain exactly two elements, [lower_bound, upper_bound] for float/int '
                'or [category1, category2] in the case of strings')

    assert population > 3, 'population should be greater than 3'
    assert generations > 1, 'generations should be greater than 1'

    argIndex = 0
    while argcount != 0:
        ordered_arr.append(
            (arg_names[argIndex], search_dict[arg_names[argIndex]]))
        argcount = argcount - 1
        argIndex = argIndex + 1

    ordered_dict = OrderedDict(ordered_arr)

    bounds_list = []
    types_list = []

    for entry in ordered_dict:
        bounds_list.append((ordered_dict[entry][0], ordered_dict[entry][1]))

        if isinstance(ordered_dict[entry][0], int):
            types_list.append('int')
        elif isinstance(ordered_dict[entry][0], float):
            types_list.append('float')
        else:
            types_list.append('cat')

    global diff_evo
    diff_evo = DifferentialEvolution(_execute_all,
                                     bounds_list,
                                     types_list,
                                     ordered_dict,
                                     direction=direction,
                                     generations=generations,
                                     population=population,
                                     crossover=crossover,
                                     mutation=mutation,
                                     name=name)

    root_dir = experiment_utils._get_experiments_dir() + "/" + str(
        app_id) + "_" + str(run_id)

    best_param, best_metric = diff_evo._solve(root_dir)

    param_string = ''
    for hp in best_param:
        param_string = param_string + hp + '&'
    param_string = param_string[:-1]

    best_exp_logdir, return_dict = _get_best(str(root_dir), direction)

    print('Finished Experiment \n')

    return best_exp_logdir, experiment_utils._get_params_dict(
        best_exp_logdir), best_metric, return_dict
Exemplo n.º 18
0
def lagom(
    map_fun,
    name="no-name",
    experiment_type="optimization",
    searchspace=None,
    optimizer=None,
    direction="max",
    num_trials=1,
    ablation_study=None,
    ablator=None,
    optimization_key="metric",
    hb_interval=1,
    es_policy="median",
    es_interval=300,
    es_min=10,
    description="",
):
    """Launches a maggy experiment, which depending on `experiment_type` can
    either be a hyperparameter optimization or an ablation study experiment.
    Given a search space, objective and a model training procedure `map_fun`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    :param map_fun: User defined experiment containing the model training.
    :type map_fun: function
    :param name: A user defined experiment identifier.
    :type name: str
    :param experiment_type: Type of Maggy experiment, either 'optimization'
        (default) or 'ablation'.
    :type experiment_type: str
    :param searchspace: A maggy Searchspace object from which samples are
        drawn.
    :type searchspace: Searchspace
    :param optimizer: The optimizer is the part generating new trials.
    :type optimizer: str, AbstractOptimizer
    :param direction: If set to ‘max’ the highest value returned will
        correspond to the best solution, if set to ‘min’ the opposite is true.
    :type direction: str
    :param num_trials: the number of trials to evaluate given the search space,
        each containing a different hyperparameter combination
    :type num_trials: int
    :param ablation_study: Ablation study object. Can be None for optimization
        experiment type.
    :type ablation_study: AblationStudy
    :param ablator: Ablator to use for experiment type 'ablation'.
    :type ablator: str, AbstractAblator
    :param optimization_key: Name of the metric to be optimized
    :type optimization_key: str, optional
    :param hb_interval: The heartbeat interval in seconds from trial executor
        to experiment driver, defaults to 1
    :type hb_interval: int, optional
    :param es_policy: The earlystopping policy, defaults to 'median'
    :type es_policy: str, optional
    :param es_interval: Frequency interval in seconds to check currently
        running trials for early stopping, defaults to 300
    :type es_interval: int, optional
    :param es_min: Minimum number of trials finalized before checking for
        early stopping, defaults to 10
    :type es_min: int, optional
    :param description: A longer description of the experiment.
    :type description: str, optional
    :raises RuntimeError: An experiment is currently running.
    :return: A dictionary indicating the best trial and best hyperparameter
        combination with it's performance metric
    :rtype: dict
    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    job_start = time.time()
    sc = hopsutil._find_spark().sparkContext
    exp_driver = None

    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        app_id, run_id = util._validate_ml_id(app_id, run_id)

        # start run
        running = True
        experiment_utils._set_ml_id(app_id, run_id)

        # create experiment dir
        experiment_utils._create_experiment_dir(app_id, run_id)

        tensorboard._register(experiment_utils._get_logdir(app_id, run_id))

        num_executors = util.num_executors(sc)

        # start experiment driver
        if experiment_type == "optimization":

            assert num_trials > 0, "number of trials should be greater " + "than zero"
            tensorboard._write_hparams_config(
                experiment_utils._get_logdir(app_id, run_id), searchspace
            )

            if num_executors > num_trials:
                num_executors = num_trials

            exp_driver = experimentdriver.ExperimentDriver(
                "optimization",
                searchspace=searchspace,
                optimizer=optimizer,
                direction=direction,
                num_trials=num_trials,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                es_policy=es_policy,
                es_interval=es_interval,
                es_min=es_min,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )

            exp_function = exp_driver.optimizer.name()

        elif experiment_type == "ablation":
            exp_driver = experimentdriver.ExperimentDriver(
                "ablation",
                ablation_study=ablation_study,
                ablator=ablator,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )
            # using exp_driver.num_executor since
            # it has been set using ablator.get_number_of_trials()
            # in experiment.py
            if num_executors > exp_driver.num_executors:
                num_executors = exp_driver.num_executors

            exp_function = exp_driver.ablator.name()
        else:
            running = False
            raise RuntimeError(
                "Unknown experiment_type:"
                "should be either 'optimization' or 'ablation', "
                "But it is '{0}'".format(str(experiment_type))
            )

        nodeRDD = sc.parallelize(range(num_executors), num_executors)

        # Do provenance after initializing exp_driver, because exp_driver does
        # the type checks for optimizer and searchspace
        sc.setJobGroup(os.environ["ML_ID"], "{0} | {1}".format(name, exp_function))

        experiment_json = experiment_utils._populate_experiment(
            name,
            exp_function,
            "MAGGY",
            exp_driver.searchspace.json(),
            description,
            app_id,
            direction,
            optimization_key,
        )

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, "CREATE"
        )

        util._log(
            "Started Maggy Experiment: {0}, {1}, run {2}".format(name, app_id, run_id)
        )

        exp_driver.init(job_start)

        server_addr = exp_driver.server_addr

        # Force execution on executor, since GPU is located on executor
        nodeRDD.foreachPartition(
            trialexecutor._prepare_func(
                app_id,
                run_id,
                experiment_type,
                map_fun,
                server_addr,
                hb_interval,
                exp_driver._secret,
                optimization_key,
                experiment_utils._get_logdir(app_id, run_id),
            )
        )
        job_end = time.time()

        result = exp_driver.finalize(job_end)
        best_logdir = (
            experiment_utils._get_logdir(app_id, run_id) + "/" + result["best_id"]
        )

        util._finalize_experiment(
            experiment_json,
            float(result["best_val"]),
            app_id,
            run_id,
            "FINISHED",
            exp_driver.duration,
            experiment_utils._get_logdir(app_id, run_id),
            best_logdir,
            optimization_key,
        )

        util._log("Finished Experiment")

        return result

    except:  # noqa: E722
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - job_start)
        )
        if exp_driver:
            if exp_driver.exception:
                raise exp_driver.exception
        raise
    finally:
        # grace period to send last logs to sparkmagic
        # sparkmagic hb poll intervall is 5 seconds, therefore wait 6 seconds
        time.sleep(6)
        # cleanup spark jobs
        if running and exp_driver is not None:
            exp_driver.stop()
        run_id += 1
        running = False
        sc.setJobGroup("", "")

    return result
Exemplo n.º 19
0
def launch(train_fn,
           args_dict=None,
           name='no-name',
           local_logdir=False,
           description=None,
           metric_key=None):
    """

    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *train_fn* one time with no arguments or multiple times with different arguments if
    *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>> experiment.launch(train_nn)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> experiment.launch(train_nn)

    Args:
        :train_fn: The function to run
        :args_dict: If specified will run the same function multiple times with different arguments, {'a':[1,2], 'b':[5,3]} would run the function two times with arguments (1,5) and (2,3) provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = None
        if args_dict:
            experiment_json = experiment_utils._populate_experiment(
                name, 'launch', 'EXPERIMENT', json.dumps(args_dict),
                description, app_id, None, None)
        else:
            experiment_json = experiment_utils._populate_experiment(
                name, 'launch', 'EXPERIMENT', None, description, app_id, None,
                None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = launcher._run(sc, train_fn, run_id, args_dict,
                                            local_logdir)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(experiment_json, metric, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)
        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Exemplo n.º 20
0
def differential_evolution(objective_function, boundary_dict, direction = 'max', generations=10, population=10, mutation=0.5, crossover=0.7, cleanup_generations=False, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate':[0.01, 0.2], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction='max')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the correspond value should be a list of two elements. The first element being the lower bound for the parameter and the the second element the upper bound.
        :direction: 'max' to maximize the returned metric, 'min' to minize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :cleanup_generations: remove previous generations from HDFS, only keep the last 2
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True
        spark = util._find_spark()
        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        diff_evo.run_id = diff_evo.run_id + 1

        versioned_path = util._version_resources(versioned_resources, diff_evo._get_logdir(app_id))

        experiment_json = None
        experiment_json = util._populate_experiment(sc, name, 'experiment', 'differential_evolution', diff_evo._get_logdir(app_id), json.dumps(boundary_dict), versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        tensorboard_logdir, best_param, best_metric = diff_evo._search(spark, objective_function, boundary_dict, direction=direction, generations=generations, popsize=population, mutation=mutation, crossover=crossover, cleanup_generations=cleanup_generations, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, best_param, best_metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        best_param_dict = util._convert_to_dict(best_param)

    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id +=1
        running = False
        sc.setJobGroup("", "")

    return tensorboard_logdir, best_param_dict
Exemplo n.º 21
0
def mirrored(train_fn,
             name='no-name',
             local_logdir=False,
             description=None,
             evaluator=False,
             metric_key=None):
    """
    *Distributed Training*

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training, local_logdir=True)

    Args:
        :train_fn: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    num_workers = util.num_executors()
    if evaluator:
        assert num_workers > 2, "number of workers must be atleast 3 if evaluator is set to True"

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'mirrored', 'DISTRIBUTED_TRAINING', None, description,
            app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = mirrored_impl._run(sc,
                                                 train_fn,
                                                 run_id,
                                                 local_logdir=local_logdir,
                                                 name=name,
                                                 evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(experiment_json, metric, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
Exemplo n.º 22
0
def grid_search(map_fun, args_dict, direction='max', name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run multiple experiments and test a grid of hyperparameters for a neural network to maximize e.g. a Neural Network's accuracy.

    The following example will run *train_nn* with 6 different hyperparameter combinations

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate':[0.1, 0.3], 'dropout': [0.4, 0.6, 0.1]}
    >>> def train_nn(learning_rate, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction='max')

    The following values will be injected in the function and run and evaluated.

        - (learning_rate=0.1, dropout=0.4)
        - (learning_rate=0.1, dropout=0.6)
        - (learning_rate=0.1, dropout=0.1)
        - (learning_rate=0.3, dropout=0.4)
        - (learning_rate=0.3, dropout=0.6)
        - (learning_rate=0.3, dropout=0.1)

    Args:
        :map_fun: the function to run, must return a metric
        :args_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: 'max' to maximize the returned metric, 'min' to minize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        gs.run_id = gs.run_id + 1

        versioned_path = util._version_resources(versioned_resources, gs._get_logdir(app_id))

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'grid_search', gs._get_logdir(app_id), json.dumps(args_dict), versioned_path, description)

        util._version_resources(versioned_resources, gs._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        grid_params = util.grid_params(args_dict)

        tensorboard_logdir, param, metric = gs._grid_launch(sc, map_fun, grid_params, direction=direction, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, param, metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id +=1
        running = False
        sc.setJobGroup("", "")

    return tensorboard_logdir