Example no. 1
def _populate_experiment(sc, model_name, module, function, logdir, hyperparameter_space, versioned_resources, description):
    """
    Args:
         :sc:
         :model_name:
         :module:
         :function:
         :logdir:
         :hyperparameter_space:
         :versioned_resources:
         :description:

    Returns:

    """
    user = None
    if constants.ENV_VARIABLES.HOPSWORKS_USER_ENV_VAR in os.environ:
        user = os.environ[constants.ENV_VARIABLES.HOPSWORKS_USER_ENV_VAR]
    return json.dumps({'project': hdfs.project_name(),
                       'user': user,
                       'name': model_name,
                       'module': module,
                       'function': function,
                       'status': 'RUNNING',
                       'app_id': sc.applicationId,
                       'start': datetime.now().isoformat(),
                       'memory_per_executor': str(sc._conf.get("spark.executor.memory")),
                       'gpus_per_executor': str(sc._conf.get("spark.executor.gpus")),
                       'executors': str(num_executors()),
                       'logdir': logdir,
                       'hyperparameter_space': hyperparameter_space,
                       'versioned_resources': versioned_resources,
                       'description': description})
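A hedged sketch of how this private helper might be invoked from driver code (all argument values below are illustrative placeholders, not part of the original):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# Placeholder values; hyperparameter_space would be a JSON string if set.
experiment_json = _populate_experiment(
    sc, model_name='mnist', module='experiment', function='launcher',
    logdir='hdfs:///Projects/demo/Experiments/app_1',
    hyperparameter_space=None, versioned_resources=None,
    description='example run')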
Example no. 2
def end(metric=None):
    """
    End a custom Experiment previously registered with *begin* and register a metric to associate with it.

    Args:
        :metric: The metric to associate with the Experiment

    """
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id
    if not running:
        raise RuntimeError(
            "An experiment is not running. Did you forget to call experiment.begin()?"
        )
    try:
        if metric:
            experiment_json = util._finalize_experiment(
                experiment_json, None, str(metric))
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                              experiment_json)
        else:
            experiment_json = util._finalize_experiment(
                experiment_json, None, None)
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                              experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False
        handle = hopshdfs.get()

        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])

        if tensorboard.local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util._store_local_tensorboard(local_tb, tensorboard.events_logdir)

        if tensorboard.endpoint is not None and tensorboard.endpoint != '' \
                and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
        hopshdfs._kill_logger()
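For reference, a minimal sketch of how *begin* and *end* pair up on the driver (assuming a Hopsworks notebook where the hops package is available; calc_pi is a placeholder):

from hops import experiment

def calc_pi():
    # placeholder standing in for the user's real training/evaluation code
    return 3.14159

experiment.begin(name='calculate pi')
pi = calc_pi()
experiment.end(pi)  # registers the metric and finalizes the experiment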
Example no. 3
    def json(self, sc):
        """Get all relevant experiment information in JSON format.
        """
        user = None
        if hopsconstants.ENV_VARIABLES.HOPSWORKS_USER_ENV_VAR in os.environ:
            user = os.environ[
                hopsconstants.ENV_VARIABLES.HOPSWORKS_USER_ENV_VAR]

        experiment_json = {
            "project": hopshdfs.project_name(),
            "user": user,
            "name": self.name,
            "module": "maggy",
            "app_id": str(sc.applicationId),
            "start": time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime(self.job_start)),
            "memory_per_executor": str(sc._conf.get("spark.executor.memory")),
            "gpus_per_executor": str(sc._conf.get("spark.executor.gpus")),
            "executors": self.num_executors,
            "logdir": self.log_dir,
            # 'versioned_resources': versioned_resources,
            "description": self.description,
            "experiment_type": self.experiment_type,
        }

        if self.experiment_type == "optimization":
            experiment_json["hyperparameter_space"] = json.dumps(
                self.searchspace.to_dict())
            experiment_json["function"] = self.optimizer.name()
        elif self.experiment_type == "ablation":
            experiment_json["ablation_study"] = json.dumps(
                self.ablation_study.to_dict())
            experiment_json["ablator"] = self.ablator.name()

        if self.experiment_done:
            experiment_json["status"] = "FINISHED"
            experiment_json["finished"] = time.strftime(
                "%Y-%m-%dT%H:%M:%S", time.localtime(self.job_end))
            experiment_json["duration"] = self.duration
            if self.experiment_type == "optimization":
                experiment_json["hyperparameter"] = json.dumps(
                    self.result["best_hp"])
            experiment_json["metric"] = self.result["best_val"]

        else:
            experiment_json["status"] = "RUNNING"

        return json.dumps(experiment_json, default=util.json_default_numpy)
Example no. 4
def set_auth_header(headers):
    if constants.ENV_VARIABLES.REMOTE_ENV_VAR in os.environ:
        headers[constants.HTTP_CONFIG.
                HTTP_AUTHORIZATION] = "ApiKey " + get_api_key_aws(
                    hdfs.project_name())
    else:
        headers[
            constants.HTTP_CONFIG.HTTP_AUTHORIZATION] = "Bearer " + get_jwt()
Example no. 5
def get_serving_endpoint(model, project=None):

    endpoint = os.environ['REST_ENDPOINT']

    if 'http' in endpoint:
        last_index = endpoint.rfind('/')
        endpoint = endpoint[last_index+1:]

    host_port_pair = endpoint.split(':')

    # SSL hardcoded to disabled for now
    os.environ['SSL_ENABLED'] = 'false'

    if os.environ['SSL_ENABLED'] == 'true':
        connection = http.HTTPSConnection(str(host_port_pair[0]), int(host_port_pair[1]))
    else:
        connection = http.HTTPConnection(str(host_port_pair[0]), int(host_port_pair[1]))

    headers = {'Content-type': 'application/json'}

    material_passwd = os.getcwd() + '/material_passwd'

    if not os.path.exists(material_passwd):
        raise AssertionError('material_passwd is not present in current working directory')

    with open(material_passwd) as f:
        keyStorePwd = f.read()

    k_certificate = os.getcwd() + '/k_certificate'

    if not os.path.exists(k_certificate):
        raise AssertionError('k_certificate is not present in current working directory')

    with open(k_certificate, 'rb') as f:
        # read as bytes and base64-encode to a str so it is JSON-serializable
        keyStore = base64.b64encode(f.read()).decode('ascii')

    if not project:
        project = hdfs.project_name()

    json_contents = {'project': project,
                     'model': model,
                     'keyStorePwd': keyStorePwd,
                     'keyStore': keyStore
                     }

    json_embeddable = json.dumps(json_contents)

    connection.request('POST', '/hopsworks-api/api/appservice/tfserving', json_embeddable, headers)

    response = connection.getresponse()
    respBody = response.read()
    responseObject = json.loads(respBody)

    host = responseObject['host']
    port = responseObject['port']

    return str(host) + ':' + str(port)
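Illustrative use, assuming the REST_ENDPOINT environment variable and the certificate material described above are present in the working directory ('mnist' is a placeholder model name):

endpoint = get_serving_endpoint('mnist')
print('TF Serving reachable at ' + endpoint)  # e.g. 'host:port'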
Example no. 6
def exception_handler():
    global running
    global experiment_json
    if running and experiment_json is not None:
        experiment_json = json.loads(experiment_json)
        experiment_json['status'] = "FAILED"
        experiment_json['finished'] = datetime.now().isoformat()
        experiment_json = json.dumps(experiment_json)
        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)
Example no. 7
def exit_handler():
    global experiment_json
    global elastic_id
    if running and experiment_json is not None:
        experiment_json = json.loads(experiment_json)
        experiment_json['status'] = "KILLED"
        experiment_json['finished'] = datetime.now().isoformat()
        experiment_json = json.dumps(experiment_json)
        util.put_elastic(hopshdfs.project_name(), app_id,
                         str('dist' + str(elastic_id)), experiment_json)
Example no. 8
def _do_get_project_training_datasets_sink():
    """
    Gets the project's default location for storing training datasets in HopsFS

    Returns:
        the project's default hopsfs location for storing training datasets

    """
    project_name = hdfs.project_name()
    training_datasets_sink = project_name + constants.FEATURE_STORE.TRAINING_DATASETS_SUFFIX
    return training_datasets_sink
Example no. 9
def get_elasticsearch_index(index):
    """
    Get the valid elasticsearch index for later use. This helper method prefixes the index name with the project name.

    Args:
        :index: the elasticsearch index to interact with.

    Returns:
        A valid elasticsearch index name.
    """
    return hdfs.project_name() + "_" + index
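For example, the prefixed name can be handed straight to an Elasticsearch client (the elasticsearch package and the 'logs' index below are assumptions, not part of the original):

from elasticsearch import Elasticsearch

es = Elasticsearch()                     # connection details omitted
index = get_elasticsearch_index('logs')  # yields '<project>_logs'
hits = es.search(index=index, body={'query': {'match_all': {}}})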
Example no. 10
def _do_get_project_featurestore():
    """
    Gets the project's featurestore name (project_featurestore)

    Returns:
        the project's featurestore name

    """
    project_name = hdfs.project_name()
    featurestore_name = project_name.lower() + constants.FEATURE_STORE.FEATURESTORE_SUFFIX
    return featurestore_name
Example no. 11
def end(metric=None):
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id
    if not running:
        raise RuntimeError(
            "An experiment is not running. Did you forget to call experiment.begin()?"
        )
    try:
        if metric:
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       str(metric))
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
        else:
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       None)
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
    except:
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False
        handle = hopshdfs.get()

        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])

        if tensorboard.local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util.store_local_tensorboard(local_tb, tensorboard.events_logdir)

        if tensorboard.endpoint is not None and tensorboard.endpoint != '' \
                and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
        hopshdfs.kill_logger()
Example no. 12
def set_auth_header(headers):
    """
    Set authorization header for HTTP requests to Hopsworks, depending if setup is remote or not.

    Args:
        http headers
    """
    if constants.ENV_VARIABLES.REMOTE_ENV_VAR in os.environ:
        headers[constants.HTTP_CONFIG.
                HTTP_AUTHORIZATION] = "ApiKey " + get_api_key_aws(
                    hdfs.project_name())
    else:
        headers[
            constants.HTTP_CONFIG.HTTP_AUTHORIZATION] = "Bearer " + get_jwt()
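A sketch of combining the helper with an outgoing request (the requests library and the URL are illustrative assumptions):

import requests

headers = {'Content-type': 'application/json'}
set_auth_header(headers)  # adds either an ApiKey or a Bearer token in place
r = requests.get('https://hopsworks.example.com/hopsworks-api/api/project',
                 headers=headers)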
Example no. 13
def _exit_handler():
    """

    Returns:

    """
    global running
    global experiment_json
    if running and experiment_json is not None:
        experiment_json = json.loads(experiment_json)
        experiment_json['status'] = "KILLED"
        experiment_json['finished'] = datetime.now().isoformat()
        experiment_json = json.dumps(experiment_json)
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
Example no. 14
def download_model(name, version=None, project_name=None, overwrite=False):
    """
    Download from the Hopsworks Models dataset an archive (zip file) containing the model artifacts.
    You first need to use the project.connect function to connect to Hopsworks.
    If the Models dataset where the model resides was shared from another project,
    then you need to specify the name of the project that the Models dataset was shared from.

    For example if you run this:

    >>> from hops import model
    >>> # If connecting from an external client, you need to connect to Hopsworks
    >>> project.connect(...) # see project module for documentation
    >>> model.download_model('mnist')

    Args:
        :name: name of the model
        :version: version of the model. If omitted, all versions of the model will be included in the archive.
        :project_name: name of the project that owns the model. By default, this is the current project running
        the experiment
        :overwrite: Whether to overwrite the model archive file if it already exists

    Returns:
        A zip file containing the model artifacts

    Raises:
        :ModelArchiveExists: if the model archive that contains the model artifacts already exists
    """
    if project_name is None:
        project_name = hdfs.project_name()

    # Check if model archive already exists and if it should be deleted, otherwise return an error
    model_dir = '/Projects/' + project_name + "/Models/" + name
    if version is not None:
        model_dir += "/" + str(version)
        name += str(version)
    archive_path = model_dir + ".zip"
    name += ".zip"
    if dataset.path_exists(archive_path):
        if overwrite:
            dataset.delete(archive_path, block=True)
        else:
            raise ModelArchiveExists(
                "Model archive file already exists at {}. Either set overwrite=True or remove the file manually."
                .format(archive_path))

    print("Preparing the model archive...")
    dataset.compress(model_dir, block=True, project_name=project_name)
    print("Downloading the model archive...")
    dataset.download(archive_path, file=name)
Example no. 15
def _populate_experiment_model(model, project=None):
    """
    Args:
         :model:
         :project_name:

    Returns:

    """

    if project is None:
        project = hdfs.project_name()
    return {
        'id': os.environ['ML_ID'],
        'model': model,
        'modelProjectName': project
    }
Example no. 16
def log(line, level='info', logger=None, thread="default"):
    # For logging to work you need to add this to logstash and restart the service
    # input {
    #   tcp {
    #     port => 5000
    #     codec => json
    #   }
    # }
    #
    # Falls back to normal printing if all else fails

    # add extra field to logstash message
    if logger is not None:
        mlogger = logger
    else:
        # May need to add the executor id here as well if there are multiple executors
        mlogger = _get_logger("executor-logger-%s" %
                              os.environ['CONTAINER_ID'])
        if not mlogger:
            print("Could not obtain a logger, skipping log call")
            return
        if hasattr(mlogger, "_no_logging"):
            print(line)
            return True

    if hasattr(mlogger, "_no_logging"):
        print(line)
        return True

    import datetime

    extra = {
        'application': [hdfs.project_name().lower(), "jupyter", "notebook", "executor128"],
        'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        'priority': level
        # 'thread': thread
    }
    getattr(mlogger, level)('%s', line, extra=extra)
    return True
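Illustrative call, assuming the CONTAINER_ID environment variable is set as on Hopsworks executors:

log('training started', level='info')  # falls back to print() if logging fails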
Example no. 17
def exists(serving_name):
    """
    Checks if there exists a serving with the given name

    Example use-case:

    >>> from hops import serving
    >>> serving.exists(serving_name)

    Args:
        :serving_name: the name of the serving

    Returns:
           True if the serving exists, otherwise False
    """
    try:
        return get_id(serving_name) is not None
    except ServingNotFound:
        print("No serving with name {} was found in the project {}".format(
            serving_name, hdfs.project_name()))
        return False
Example no. 18
def _populate_experiment(model_name, function, type, hp, description, app_id,
                         direction, optimization_key):
    """
    Args:
         :sc:
         :model_name:
         :module:
         :function:
         :logdir:
         :hyperparameter_space:
         :description:

    Returns:

    """
    jobName = None
    if constants.ENV_VARIABLES.JOB_NAME_ENV_VAR in os.environ:
        jobName = os.environ[constants.ENV_VARIABLES.JOB_NAME_ENV_VAR]

    kernelId = None
    if constants.ENV_VARIABLES.KERNEL_ID_ENV_VAR in os.environ:
        kernelId = os.environ[constants.ENV_VARIABLES.KERNEL_ID_ENV_VAR]

    if model_name == 'no-name' and jobName:
        model_name = jobName

    return {
        'id': os.environ['ML_ID'],
        'name': model_name,
        'projectName': hdfs.project_name(),
        'description': description,
        'state': 'RUNNING',
        'function': function,
        'experimentType': type,
        'appId': app_id,
        'direction': direction,
        'optimizationKey': optimization_key,
        'jobName': jobName,
        'kernelId': kernelId
    }
Example no. 19
def populate_experiment(sc, model_name, module, function, logdir,
                        hyperparameter_space, versioned_resources,
                        description):
    user = None
    if 'HOPSWORKS_USER' in os.environ:
        user = os.environ['HOPSWORKS_USER']
    return json.dumps({
        'project': hdfs.project_name(),
        'user': user,
        'name': model_name,
        'module': module,
        'function': function,
        'status': 'RUNNING',
        'app_id': sc.applicationId,
        'start': datetime.now().isoformat(),
        'memory_per_executor': str(sc._conf.get("spark.executor.memory")),
        'gpus_per_executor': str(sc._conf.get("spark.executor.gpus")),
        'executors': str(sc._conf.get("spark.executor.instances")),
        'logdir': logdir,
        'hyperparameter_space': hyperparameter_space,
        'versioned_resources': versioned_resources,
        'description': description
    })
Example no. 20
def mirrored(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training* single machine - multiple GPUs

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training)

    Args:
        :map_fun: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        mirrored_impl.run_id = mirrored_impl.run_id + 1

        versioned_path = util._version_resources(versioned_resources, mirrored_impl._get_logdir(app_id))

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'mirrored', mirrored_impl._get_logdir(app_id), None, versioned_path, description)

        util._version_resources(versioned_resources, mirrored_impl._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, logdir = mirrored_impl._launch(sc, map_fun, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, None, retval)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")

    return logdir
Example no. 21
def begin(name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    Start a custom Experiment, at the end of the experiment call *end(metric)*.

    *IMPORTANT* - This call should not be combined with other functions in the experiment module, other than *end*.
    Other experiment functions such as *grid_search* manage the *begin* and *end* calls internally.

    Example usage:

    >>> from hops import experiment
    >>> experiment.begin(name='calculate pi')
    >>> # Code to calculate pi
    >>> pi = calc_pi()
    >>> experiment.end(pi)

    Args:
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.stop() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        global run_id
        global driver_tensorboard_hdfs_path

        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        run_id = run_id + 1

        versioned_path = util._version_resources(versioned_resources, _get_logdir(app_id))

        experiment_json = None

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'begin', _get_logdir(app_id), None, versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(app_id, run_id, None, 'begin')

        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())

        hopshdfs._init_logger()

        driver_tensorboard_hdfs_path,_ = tensorboard._register(hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)
    except:
        _exception_handler()
        raise

    return driver_tensorboard_hdfs_path
Example no. 22
def parameter_server(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps must be less than num_executors (i.e. num_executors == num_ps + num_workers)"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        ps.run_id = ps.run_id + 1

        versioned_path = util._version_resources(versioned_resources, ps._get_logdir(app_id))

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'parameter_server', ps._get_logdir(app_id), None, versioned_path, description)

        util._version_resources(versioned_resources, ps._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, logdir = ps._launch(sc, map_fun, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, None, retval)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")

    return logdir
Example no. 23
def launch(map_fun, args_dict=None, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """

    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *map_fun* one time with no arguments or multiple times with different arguments if
    *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    logdir = tensorboard.logdir()
    >>>    # code for preprocessing, training and exporting model
    >>>    # optionally return a value for the experiment which is registered in Experiments service
    >>> experiment.launch(train_nn)

    Args:
        :map_fun: The function to run
        :args_dict: If specified, the function is run multiple times with different arguments: {'a':[1,2], 'b':[5,3]}
         would run the function two times with arguments (1,5) and (2,3), provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        launcher.run_id = launcher.run_id + 1

        versioned_path = util._version_resources(versioned_resources, launcher._get_logdir(app_id))

        experiment_json = None
        if args_dict:
            experiment_json = util._populate_experiment(sc, name, 'experiment', 'launcher', launcher._get_logdir(app_id), json.dumps(args_dict), versioned_path, description)
        else:
            experiment_json = util._populate_experiment(sc, name, 'experiment', 'launcher', launcher._get_logdir(app_id), None, versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, tensorboard_logdir = launcher._launch(sc, map_fun, args_dict, local_logdir)

        util._version_resources(versioned_resources, launcher._get_logdir(app_id))

        if retval:
            experiment_json = util._finalize_experiment(experiment_json, None, retval)
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
            return tensorboard_logdir

        experiment_json = util._finalize_experiment(experiment_json, None, None)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")
    return tensorboard_logdir
Example no. 24
    def project_name(self):
        return hopshdfs.project_name()
Example no. 25
def run(sc,
        map_fun,
        tf_args,
        num_executors,
        num_ps,
        tensorboard=False,
        input_mode=InputMode.TENSORFLOW,
        log_dir=None,
        driver_ps_nodes=False,
        master_node=None,
        reservation_timeout=600,
        name='no-name',
        local_logdir=False,
        versioned_resources=None,
        description=None,
        queues=['input', 'output', 'error']):
    """Starts the TensorFlowOnSpark cluster and Runs the TensorFlow "main" function on the Spark executors

  Args:
    :sc: SparkContext
    :map_fun: user-supplied TensorFlow "main" function
    :tf_args: ``argparse`` args, or command-line ``ARGV``.  These will be passed to the ``map_fun``.
    :num_executors: number of Spark executors.  This should match your Spark job's ``--num_executors``.
    :num_ps: number of Spark executors which are reserved for TensorFlow PS nodes.  All other executors will be used as TensorFlow worker nodes.
    :tensorboard: boolean indicating if the chief worker should spawn a Tensorboard server.
    :input_mode: TFCluster.InputMode
    :log_dir: directory to save tensorboard event logs.  If None, defaults to a fixed path on local filesystem.
    :driver_ps_nodes: run the PS nodes on the driver locally instead of on the Spark executors; this helps maximize computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps
    :master_node: name of the "master" or "chief" node in the cluster_template, used for `tf.estimator` applications.
    :reservation_timeout: number of seconds after which cluster reservation times out (600 sec default)
    :name: name of the experiment
    :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
    :versioned_resources: A list of HDFS paths of resources to version with this experiment
    :description: a longer description for the experiment
    :queues: *INTERNAL_USE*

  Returns:
    A TFCluster object representing the started cluster.
  """

    # In Hopsworks we always want TensorBoard to be enabled:
    global elastic_id
    global running
    global run_id
    tb = True
    elastic_id = elastic_id + 1
    run_id = run_id + 1
    running = True

    logging.info(
        "Reserving TFSparkNodes {0}".format("w/ TensorBoard" if tb else ""))
    assert num_ps < num_executors

    if driver_ps_nodes:
        raise Exception(
            'running PS nodes on driver is not supported and not needed on Hops Hadoop, since we have GPU scheduling.'
        )

    if log_dir:
        raise Exception(
            'No need to specify log_dir directory, we save TensorBoard events in the directory returned by tensorboard.logdir for you'
        )

    # build a cluster_spec template using worker_nums
    cluster_template = {}
    cluster_template['ps'] = range(num_ps)
    if master_node is None:
        cluster_template['worker'] = range(num_ps, num_executors)
    else:
        cluster_template[master_node] = range(num_ps, num_ps + 1)
        if num_executors > num_ps + 1:
            cluster_template['worker'] = range(num_ps + 1, num_executors)
    logging.info("cluster_template: {}".format(cluster_template))

    # get default filesystem from spark
    defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    # strip trailing "root" slash from "file:///" to be consistent w/ "hdfs://..."
    if defaultFS.startswith(
            "file://") and len(defaultFS) > 7 and defaultFS.endswith("/"):
        defaultFS = defaultFS[:-1]

    # get current working dir of spark launch
    working_dir = os.getcwd()

    # start a server to listen for reservations and broadcast cluster_spec
    server = reservation.Server(num_executors)
    server_addr = server.start()

    # start TF nodes on all executors
    logging.info("Starting TensorFlow on executors")
    cluster_meta = {
        'id': random.getrandbits(64),
        'cluster_template': cluster_template,
        'num_executors': num_executors,
        'default_fs': defaultFS,
        'working_dir': working_dir,
        'server_addr': server_addr
    }

    nodeRDD = sc.parallelize(range(num_executors), num_executors)
    global app_id
    app_id = sc.applicationId
    global experiment_json

    versioned_path = util.version_resources(versioned_resources,
                                            get_logdir(app_id))

    experiment_json = None
    experiment_json = util.populate_experiment(sc, name, 'TFCluster', 'run',
                                               get_logdir(app_id), None,
                                               versioned_path, description)

    util.put_elastic(hopshdfs.project_name(), app_id,
                     str('dist' + str(elastic_id)), experiment_json)

    # start TF on a background thread (on Spark driver) to allow for feeding job
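    # NOTE: tf_status is assumed to be a module-level dict (e.g. tf_status = {})
    # shared with TFCluster.shutdown(); it is not defined in this snippet.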

    def _start(status):
        try:
            nodeRDD.foreachPartition(
                TFSparkNode.run(map_fun,
                                tf_args,
                                cluster_meta,
                                tb,
                                None,
                                app_id,
                                run_id,
                                queues,
                                local_logdir=local_logdir,
                                background=(input_mode == InputMode.SPARK)))
        except Exception as e:
            logging.error("Exception in TF background thread")
            status['error'] = str(e)
            exception_handler()

    t = threading.Thread(target=_start, args=(tf_status, ))
    # run as daemon thread so that in spark mode main thread can exit
    # if feeder spark stage fails and main thread can't do explicit shutdown
    t.daemon = True

    t.start()

    # wait for executors to check GPU presence
    logging.info("Waiting for GPU presence check to start")
    gpus_present = server.await_gpu_check()
    logging.info("All GPU checks completed")

    # wait for executors to register and start TFNodes before continuing
    logging.info("Waiting for TFSparkNodes to start")
    cluster_info = server.await_reservations(sc, tf_status,
                                             reservation_timeout)
    logging.info("All TFSparkNodes started")

    # print cluster_info and extract TensorBoard URL
    tb_url = None
    for node in cluster_info:
        logging.info(node)
        if node['tb_port'] != 0:
            tb_url = "http://{0}:{1}".format(node['host'], node['tb_port'])

    if tb_url is not None:
        logging.info(
            "========================================================================================"
        )
        logging.info("")
        logging.info("TensorBoard running at:       {0}".format(tb_url))
        logging.info("")
        logging.info(
            "========================================================================================"
        )

    # since our "primary key" for each executor's TFManager is (host, executor_id), sanity check for duplicates

    # Note: this may occur if Spark retries failed Python tasks on the same executor.
    tb_nodes = set()
    for node in cluster_info:
        node_id = (node['host'], node['executor_id'])
        if node_id in tb_nodes:
            raise Exception(
                "Duplicate cluster node id detected (host={0}, executor_id={1}). "
                .format(node_id[0], node_id[1]) + "Please ensure that:\n" +
                "1. Number of executors >= number of TensorFlow nodes\n" +
                "2. Number of tasks per executor is 1\n" +
                "3. TFCluster.shutdown() is successfully invoked when done.")
        else:
            tb_nodes.add(node_id)

    # create TFCluster object
    cluster = TFCluster()
    cluster.sc = sc
    cluster.meta = cluster_meta
    cluster.nodeRDD = nodeRDD
    cluster.cluster_info = cluster_info
    cluster.cluster_meta = cluster_meta
    cluster.input_mode = input_mode
    cluster.queues = queues
    cluster.server = server

    return cluster
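A hedged end-to-end sketch of driving this cluster from PySpark (the map function and executor counts below are placeholders):

from pyspark.sql import SparkSession

def main_fun(argv, ctx):
    # placeholder for the user's TensorFlow "main" function
    pass

spark = SparkSession.builder.getOrCreate()
cluster = run(spark.sparkContext, main_fun, tf_args=None,
              num_executors=4, num_ps=1, name='tf-demo')
cluster.shutdown()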
Example no. 26
    def shutdown(self, ssc=None):
        """Stops the distributed TensorFlow cluster.

    Args:
      :ssc: *For Streaming applications only*. Spark StreamingContext
    """
        logging.info("Stopping TensorFlow nodes")

        # identify ps/workers
        ps_list, worker_list = [], []
        for node in self.cluster_info:
            if node['job_name'] == 'ps':
                ps_list.append(node)
            else:
                worker_list.append(node)

        if ssc is not None:
            # Spark Streaming
            done = False
            while not done:
                done = ssc.awaitTerminationOrTimeout(1)
                if not done and self.server.done:
                    logging.info("Server done, stopping StreamingContext")
                    ssc.stop(stopSparkContext=False, stopGraceFully=True)
                done = done or self.server.done
        else:
            # in TENSORFLOW mode, there is no "data feeding" job, only a "start" job, so we must wait for the TensorFlow workers
            # to complete all tasks, while accounting for any PS tasks which run indefinitely.
            if self.input_mode == InputMode.TENSORFLOW:
                count = 0
                done = False
                while not done:
                    st = self.sc.statusTracker()
                    jobs = st.getActiveJobsIds()
                    if len(jobs) > 0:
                        stages = st.getActiveStageIds()
                        for i in stages:
                            si = st.getStageInfo(i)
                            if si.numActiveTasks == len(ps_list):
                                # if we only have PS tasks left, check that we see this condition a couple times
                                count += 1
                                done = (count >= 3)
                                time.sleep(5)
                    else:
                        done = True
                        global running
                        running = False

            # shutdown queues and managers for "worker" executors.
            # note: in SPARK mode, this job will immediately queue up behind the "data feeding" job.
            # in TENSORFLOW mode, this will only run after all workers have finished.
            workers = len(worker_list)
            workerRDD = self.sc.parallelize(range(workers), workers)
            workerRDD.foreachPartition(
                TFSparkNode.shutdown(self.cluster_info, self.queues))

        # exit Spark application w/ err status if TF job had any errors
        if 'error' in tf_status:
            logging.error("Exiting Spark application with error status.")
            exception_handler()
            self.sc.cancelAllJobs()
            #self.sc.stop()
            #sys.exit(1)
        global experiment_json
        global app_id
        experiment_json = util.finalize_experiment(experiment_json, None, None)

        util.put_elastic(hopshdfs.project_name(), app_id,
                         str('dist' + str(elastic_id)), experiment_json)

        logging.info("Shutting down cluster")
        # shutdown queues and managers for "PS" executors.
        # note: we have to connect/shutdown from the spark driver, because these executors are "busy" and won't accept any other tasks.
        for node in ps_list:
            addr = node['addr']
            authkey = node['authkey']
            m = TFManager.connect(addr, authkey)
            q = m.get_queue('control')
            q.put(None)
            q.join()

        # wait for all jobs to finish
        while True:
            time.sleep(5)
            st = self.sc.statusTracker()
            jobs = st.getActiveJobsIds()
            if len(jobs) == 0:
                break

    def tensorboard_url(self):
        """
        Utility function to get the TensorBoard URL.
        """
        tb_url = None
        for node in self.cluster_info:
            if node['tb_port'] != 0 and node['job_name'] == 'worker' \
                    and node['task_index'] == 0:
                tb_url = "http://{0}:{1}".format(node['host'], node['tb_port'])
        return tb_url
Example no. 27
def start_beam_jobserver(flink_session_name,
                         artifacts_dir="Resources",
                         jobserver_jar=None,
                         sdk_worker_parallelism=1):
    """
    Start the Java Beam job server that connects to the flink session cluster. User needs to provide the
    job name that started the Flink session and optionally the worker parallelism.

    Args:
      :flink_session_name: Job name that runs the Flink session.
      :artifacts_dir: Hopsworks dataset where Beam job artifacts are staged (default: "Resources").
      :jobserver_jar: Path to the Beam job server jar; if None, the jar under the Flink configuration directory is used.
      :sdk_worker_parallelism: Default parallelism for SDK worker processes. This option is only applied when the
      pipeline option sdkWorkerParallelism is set to 0. Default is 1. If 0, worker parallelism will be dynamically
      decided by the runner. See also: sdkWorkerParallelism Pipeline Option (default: 1). For further documentation,
      please refer to the Apache Beam docs.
    Returns:
        artifact_port, expansion_port, job_host, job_port, jobserver.pid
    """
    if jobserver_jar is None:
        jobserver_jar = os.path.join(
            util.get_flink_conf_dir(),
            "beam-runners-flink-1.8-job-server-2.15.0.jar")
    # Get Flink master URL (flink session cluster) from an ExecutionDTO
    method = constants.HTTP_CONFIG.HTTP_GET
    resource_url = constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   hopsfs.project_id() + constants.DELIMITERS.SLASH_DELIMITER + \
                   "jobs" + constants.DELIMITERS.SLASH_DELIMITER + \
                   flink_session_name + constants.DELIMITERS.SLASH_DELIMITER + \
                   "executions" + \
                   "?limit=1&offset=0&sort_by=submissionTime:desc"
    response = util.send_request(method, resource_url)
    response_object = response.json()
    flink_master_url = response_object['items'][0]['flinkMasterURL']
    artifact_port = randint(10000, 65000)
    expansion_port = randint(10000, 65000)
    job_port = randint(10000, 65000)
    job_host = socket.getfqdn()
    log_base_path = ""
    if 'LOG_DIRS' in os.environ:
        log_base_path += os.environ['LOG_DIRS'] + "/"

    beam_jobserver_log = log_base_path + "beamjobserver-" + hopsfs.project_name().lower() + "-" + flink_session_name + \
                          "-" + str(job_port) + ".log"
    # launch the job server, redirecting its stdout/stderr to the log file
    with open(beam_jobserver_log, "wb") as out:
        jobserver = subprocess.Popen(
            [
                "java", "-jar", jobserver_jar,
                "--artifacts-dir=%s" % hopsfs.project_path() + artifacts_dir,
                "--flink-master-url=%s" % flink_master_url,
                "--artifact-port=%d" % artifact_port,
                "--expansion-port=%d" % expansion_port,
                "--job-host=%s" % job_host,
                "--job-port=%d" % job_port,
                "--sdk-worker-parallelism=%d" % sdk_worker_parallelism
            ],
            stdout=out,
            stderr=out,
            preexec_fn=util._on_executor_exit('SIGTERM'))
    global clusters
    clusters.append(flink_session_name)
    global jobserver_host
    jobserver_host = job_host
    global jobserver_port
    jobserver_port = job_port
    return {
        "jobserver_log": beam_jobserver_log,
        "artifact_port": artifact_port,
        "expansion_port": expansion_port,
        "job_host": job_host,
        "job_port": job_port,
        "jobserver.pid": jobserver.pid
    }
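Illustrative call, assuming a Flink session job named 'flink-session' is already running in the project:

info = start_beam_jobserver('flink-session')
print('Beam job server at %s:%d' % (info['job_host'], info['job_port']))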
Example no. 28
def differential_evolution(objective_function, boundary_dict, direction = 'max', generations=10, population=10, mutation=0.5, crossover=0.7, cleanup_generations=False, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate':[0.01, 0.2], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction='max')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the corresponding value should be a list of two elements, the first being the lower bound and the second the upper bound for the parameter
        :direction: 'max' to maximize the returned metric, 'min' to minimize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :cleanup_generations: remove previous generations from HDFS, only keep the last 2
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True
        spark = util._find_spark()
        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        diff_evo.run_id = diff_evo.run_id + 1

        versioned_path = util._version_resources(versioned_resources, diff_evo._get_logdir(app_id))

        experiment_json = None
        experiment_json = util._populate_experiment(sc, name, 'experiment', 'differential_evolution', diff_evo._get_logdir(app_id), json.dumps(boundary_dict), versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        tensorboard_logdir, best_param, best_metric = diff_evo._search(spark, objective_function, boundary_dict, direction=direction, generations=generations, popsize=population, mutation=mutation, crossover=crossover, cleanup_generations=cleanup_generations, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, best_param, best_metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        best_param_dict = util._convert_to_dict(best_param)

    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")

    return tensorboard_logdir, best_param_dict
Example no. 29
# Copyright (C) 2020, Logical Clocks AB. All rights reserved
# !/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
from hops import hdfs

arguments = len(sys.argv) - 1
position = 1

print("Parameters%s" % (len(sys.argv) - 1))
while (arguments >= position):
    print("Parameter %i: %s" % (position, sys.argv[position]))
    position = position + 1

td_proj_name = hdfs.project_name()
if arguments >= 1:
    td_proj_name = sys.argv[1]
td_ds = td_proj_name + '_Training_Datasets'
td = 'mnist_td_1'
if arguments >= 2:
    td = sys.argv[2]

model_proj_name = None
if arguments >= 3:
    model_proj_name = sys.argv[3]

model_name = 'mnist_model'
if arguments >= 4:
    model_name = sys.argv[4]
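The script reads up to four positional arguments; a hypothetical invocation for local testing (script name and all values are placeholders — on Hopsworks the arguments come from the job configuration):

import subprocess

subprocess.run(['python', 'fetch_td.py',
                'demo_project', 'mnist_td_1', 'demo_project', 'mnist_model'])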
Example no. 30
def grid_search(map_fun, args_dict, direction='max', name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run multiple experiments to test a grid of hyperparameters for a neural network, e.g. to maximize its accuracy.

    The following example will run *train_nn* with 6 different hyperparameter combinations

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate':[0.1, 0.3], 'dropout': [0.4, 0.6, 0.1]}
    >>> def train_nn(learning_rate, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction='max')

    The following values will be injected into the function, and each combination run and evaluated:

        - (learning_rate=0.1, dropout=0.4)
        - (learning_rate=0.1, dropout=0.6)
        - (learning_rate=0.1, dropout=0.1)
        - (learning_rate=0.3, dropout=0.4)
        - (learning_rate=0.3, dropout=0.6)
        - (learning_rate=0.3, dropout=0.1)

    Args:
        :map_fun: the function to run, must return a metric
        :args_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: 'max' to maximize the returned metric, 'min' to minimize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        gs.run_id = gs.run_id + 1

        versioned_path = util._version_resources(versioned_resources, gs._get_logdir(app_id))

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'grid_search', gs._get_logdir(app_id), json.dumps(args_dict), versioned_path, description)

        util._version_resources(versioned_resources, gs._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        grid_params = util.grid_params(args_dict)

        tensorboard_logdir, param, metric = gs._grid_launch(sc, map_fun, grid_params, direction=direction, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, param, metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")

    return tensorboard_logdir