def _populate_experiment(sc, model_name, module, function, logdir, hyperparameter_space, versioned_resources, description):
    """Build the initial experiment JSON document published to the backend.

    Args:
        :sc: SparkContext of the running application
        :model_name: name of the experiment/model
        :module: module the experiment was launched from
        :function: entry-point function name
        :logdir: HDFS log directory of the experiment
        :hyperparameter_space: serialized hyperparameter space (may be None)
        :versioned_resources: versioned resource paths (may be None)
        :description: free-text description of the experiment

    Returns:
        JSON string describing the experiment, with status RUNNING
    """
    # Resolve the launching user from the environment when available.
    user_env_key = constants.ENV_VARIABLES.HOPSWORKS_USER_ENV_VAR
    user = os.environ.get(user_env_key)
    record = {
        'project': hdfs.project_name(),
        'user': user,
        'name': model_name,
        'module': module,
        'function': function,
        'status': 'RUNNING',
        'app_id': sc.applicationId,
        'start': datetime.now().isoformat(),
        'memory_per_executor': str(sc._conf.get("spark.executor.memory")),
        'gpus_per_executor': str(sc._conf.get("spark.executor.gpus")),
        'executors': str(num_executors()),
        'logdir': logdir,
        'hyperparameter_space': hyperparameter_space,
        'versioned_resources': versioned_resources,
        'description': description,
    }
    return json.dumps(record)
def end(metric=None):
    """
    End a custom Experiment previously registered with *begin* and register a metric to associate with it.

    Args:
        :metric: The metric to associate with the Experiment

    Raises:
        :RuntimeError: if no experiment is currently running
    """
    # Module-level experiment bookkeeping state.
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id
    if not running:
        raise RuntimeError(
            "An experiment is not running. Did you forget to call experiment.end()?"
        )
    try:
        if metric:
            # Record the final metric value before publishing to Elastic.
            experiment_json = util._finalize_experiment(
                experiment_json, None, str(metric))
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                              experiment_json)
        else:
            experiment_json = util._finalize_experiment(
                experiment_json, None, None)
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                              experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        # Cleanup always runs: bump the Elastic id and clear the running flag.
        elastic_id += 1
        running = False
        handle = hopshdfs.get()
        # Kill the TensorBoard process spawned for this experiment, if any.
        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])
        if tensorboard.local_logdir_bool:
            # TensorBoard wrote events locally; copy them back to HDFS.
            local_tb = tensorboard.local_logdir_path
            util._store_local_tensorboard(local_tb, tensorboard.events_logdir)
        # Remove the TensorBoard endpoint file from HDFS if it exists.
        if not tensorboard.endpoint == None and not tensorboard.endpoint == '' \
                and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
        hopshdfs._kill_logger()
def json(self, sc):
    """Get all relevant experiment information in JSON format.

    Args:
        :sc: SparkContext of the running application

    Returns:
        JSON string describing this experiment; the fields present depend on
        the experiment type (optimization vs ablation) and on whether the
        experiment has finished.
    """
    # Resolve the launching user from the environment when available.
    user = None
    if hopsconstants.ENV_VARIABLES.HOPSWORKS_USER_ENV_VAR in os.environ:
        user = os.environ[
            hopsconstants.ENV_VARIABLES.HOPSWORKS_USER_ENV_VAR]
    experiment_json = {
        "project": hopshdfs.project_name(),
        "user": user,
        "name": self.name,
        "module": "maggy",
        "app_id": str(sc.applicationId),
        "start": time.strftime("%Y-%m-%dT%H:%M:%S",
                               time.localtime(self.job_start)),
        "memory_per_executor": str(sc._conf.get("spark.executor.memory")),
        "gpus_per_executor": str(sc._conf.get("spark.executor.gpus")),
        "executors": self.num_executors,
        "logdir": self.log_dir,
        # 'versioned_resources': versioned_resources,
        "description": self.description,
        "experiment_type": self.experiment_type,
    }
    # Optimization and ablation experiments carry different extra payloads.
    if self.experiment_type == "optimization":
        experiment_json["hyperparameter_space"] = json.dumps(
            self.searchspace.to_dict())
        experiment_json["function"] = self.optimizer.name()
    elif self.experiment_type == "ablation":
        experiment_json["ablation_study"] = json.dumps(
            self.ablation_study.to_dict())
        experiment_json["ablator"] = self.ablator.name()
    if self.experiment_done:
        experiment_json["status"] = "FINISHED"
        experiment_json["finished"] = time.strftime(
            "%Y-%m-%dT%H:%M:%S", time.localtime(self.job_end))
        experiment_json["duration"] = self.duration
        if self.experiment_type == "optimization":
            # Include the best hyperparameters and metric found so far.
            experiment_json["hyperparameter"] = json.dumps(
                self.result["best_hp"])
            experiment_json["metric"] = self.result["best_val"]
    else:
        experiment_json["status"] = "RUNNING"
    return json.dumps(experiment_json, default=util.json_default_numpy)
def set_auth_header(headers):
    """Attach the proper Authorization header for Hopsworks REST calls.

    Remote (external) clients authenticate with an API key, while
    in-cluster clients authenticate with the current user's JWT.

    Args:
        :headers: dict of HTTP headers, mutated in place
    """
    auth_key = constants.HTTP_CONFIG.HTTP_AUTHORIZATION
    if constants.ENV_VARIABLES.REMOTE_ENV_VAR in os.environ:
        headers[auth_key] = "ApiKey " + get_api_key_aws(hdfs.project_name())
    else:
        headers[auth_key] = "Bearer " + get_jwt()
def get_serving_endpoint(model, project=None):
    """Resolve the host:port endpoint serving *model*.

    Reads the crypto material (keystore password and keystore) from the
    current working directory, posts it together with the project and model
    names to the Hopsworks appservice REST API, and returns the serving
    endpoint from the response.

    Args:
        :model: name of the served model
        :project: project name; defaults to the current project

    Returns:
        "<host>:<port>" string of the TF serving endpoint

    Raises:
        :AssertionError: if the crypto material files are missing from the
            current working directory
    """
    endpoint = os.environ['REST_ENDPOINT']
    if 'http' in endpoint:
        last_index = endpoint.rfind('/')
        endpoint = endpoint[last_index+1:]
    host_port_pair = endpoint.split(':')
    #hardcode disabled for now
    os.environ['SSL_ENABLED'] = 'false'
    if os.environ['SSL_ENABLED'] == 'true':
        connection = http.HTTPSConnection(str(host_port_pair[0]), int(host_port_pair[1]))
    else:
        connection = http.HTTPConnection(str(host_port_pair[0]), int(host_port_pair[1]))
    headers = {'Content-type': 'application/json'}
    material_passwd = os.getcwd() + '/material_passwd'
    if not os.path.exists(material_passwd):
        raise AssertionError('material_passwd is not present in current working directory')
    with open(material_passwd) as f:
        keyStorePwd = f.read()
    k_certificate = os.getcwd() + '/k_certificate'
    # FIX: this previously re-checked material_passwd instead of k_certificate,
    # so a missing keystore slipped past validation and failed later on open().
    if not os.path.exists(k_certificate):
        raise AssertionError('k_certificate is not present in current working directory')
    # FIX: read the keystore as bytes and decode the base64 output to str:
    # on Python 3, b64encode() rejects str input and its bytes result is not
    # JSON-serializable.
    with open(k_certificate, 'rb') as f:
        keyStore = base64.b64encode(f.read()).decode('ascii')
    if not project:
        project = hdfs.project_name()
    json_contents = {'project': project,
                     'model': model,
                     'keyStorePwd': keyStorePwd,
                     'keyStore': keyStore
                     }
    json_embeddable = json.dumps(json_contents)
    connection.request('POST', '/hopsworks-api/api/appservice/tfserving', json_embeddable, headers)
    response = connection.getresponse()
    respBody = response.read()
    responseObject = json.loads(respBody)
    host = responseObject['host']
    port = responseObject['port']
    return str(host) + ':' + str(port)
def exception_handler():
    """Mark the currently running experiment as FAILED in Elastic.

    No-op when no experiment is running or no experiment document exists.
    """
    global running
    global experiment_json
    if not (running and experiment_json is not None):
        return
    # Rewrite the stored document with a terminal FAILED status.
    doc = json.loads(experiment_json)
    doc['status'] = "FAILED"
    doc['finished'] = datetime.now().isoformat()
    experiment_json = json.dumps(doc)
    util.put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
def exit_handler():
    """Mark the running experiment as KILLED in Elastic on interpreter exit.

    No-op when no experiment is running or no experiment document exists.
    """
    global experiment_json
    global elastic_id
    if not (running and experiment_json is not None):
        return
    # Rewrite the stored document with a terminal KILLED status.
    doc = json.loads(experiment_json)
    doc['status'] = "KILLED"
    doc['finished'] = datetime.now().isoformat()
    experiment_json = json.dumps(doc)
    # Distributed experiments are keyed with a 'dist' prefix in Elastic.
    util.put_elastic(hopshdfs.project_name(), app_id,
                     'dist' + str(elastic_id), experiment_json)
def _do_get_project_training_datasets_sink():
    """
    Gets the project's default location for storing training datasets in HopsFS

    Returns:
        the project's default hopsfs location for storing training datasets
    """
    # Sink path is the project name plus the fixed training-datasets suffix.
    return hdfs.project_name() + constants.FEATURE_STORE.TRAINING_DATASETS_SUFFIX
def get_elasticsearch_index(index):
    """
    Get the valid elasticsearch index for later use. This helper method prefix the index name with the project name.

    Args:
        :index: the elasticsearch index to interact with.

    Returns:
        A valid elasticsearch index name.
    """
    # Index names are namespaced per project: "<project>_<index>".
    return "{0}_{1}".format(hdfs.project_name(), index)
def _do_get_project_featurestore():
    """
    Gets the project's featurestore name (project_featurestore)

    Returns:
        the project's featurestore name
    """
    # Featurestore names are lowercase project name plus a fixed suffix.
    lowercase_project = hdfs.project_name().lower()
    return lowercase_project + constants.FEATURE_STORE.FEATURESTORE_SUFFIX
def end(metric=None):
    """End the currently running experiment, optionally registering *metric*.

    Finalizes the experiment document, publishes it to Elastic, and tears
    down TensorBoard and the HDFS logger.

    Args:
        :metric: optional metric to associate with the experiment

    Raises:
        :RuntimeError: if no experiment is currently running
    """
    # Module-level experiment bookkeeping state.
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id
    if not running:
        raise RuntimeError(
            "An experiment is not running. Did you forget to call experiment.end()?"
        )
    try:
        if metric:
            # Record the final metric value before publishing to Elastic.
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       str(metric))
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
        else:
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       None)
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
    except:
        exception_handler()
        raise
    finally:
        # Cleanup always runs: bump the Elastic id and clear the running flag.
        elastic_id += 1
        running = False
        handle = hopshdfs.get()
        # Kill the TensorBoard process spawned for this experiment, if any.
        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])
        if tensorboard.local_logdir_bool:
            # TensorBoard wrote events locally; copy them back to HDFS.
            local_tb = tensorboard.local_logdir_path
            util.store_local_tensorboard(local_tb, tensorboard.events_logdir)
        # Remove the TensorBoard endpoint file from HDFS if it exists.
        if not tensorboard.endpoint == None and not tensorboard.endpoint == '' \
                and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
        hopshdfs.kill_logger()
def set_auth_header(headers):
    """
    Set authorization header for HTTP requests to Hopsworks, depending if setup is remote or not.

    Args:
        http headers
    """
    # Remote (external) clients use an API key; in-cluster clients use a JWT.
    header_name = constants.HTTP_CONFIG.HTTP_AUTHORIZATION
    is_remote = constants.ENV_VARIABLES.REMOTE_ENV_VAR in os.environ
    if is_remote:
        headers[header_name] = "ApiKey " + get_api_key_aws(hdfs.project_name())
    else:
        headers[header_name] = "Bearer " + get_jwt()
def _exit_handler():
    """Mark the running experiment as KILLED in Elastic on interpreter exit.

    No-op when no experiment is running or no experiment document exists.

    Returns:
        None
    """
    global running
    global experiment_json
    if not (running and experiment_json is not None):
        return
    # Rewrite the stored document with a terminal KILLED status.
    doc = json.loads(experiment_json)
    doc['status'] = "KILLED"
    doc['finished'] = datetime.now().isoformat()
    experiment_json = json.dumps(doc)
    util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
def download_model(name, version=None, project_name=None, overwrite=False):
    """
    Download from the Hopsworks Models dataset an archive (zip file) containing the model artifacts.
    You first need to use the project.connect function to connect to Hopsworks.
    If the Models dataset where the model resides is a shared dataset from another project,
    then you need to specify the name of the project that owns the Models dataset was shared from.

    For example if you run this:

    >>> from hops import model
    >>> # If connecting from an external client, you need to connect to Hopsworks
    >>> project.connect(...) # see project module for documentation
    >>> model.download_model('mnist')

    Args:
        :name: name of the model
        :version: version of the model. If omitted, all versions of the model will be included in the archive.
        :project_name: name of the project parent of the model. By default, this project is the current project running the experiment
        :overwrite: Whether to overwrite the model archive file if it already exists

    Returns:
        A zip file containing the model artifacts

    Raises:
        :ModelArchiveExists: if the model archive that contains the model artifacts already exists
    """
    if project_name is None:
        project_name = hdfs.project_name()
    # Check if model archive already exists and if it should be deleted, otherwise return an error
    model_dir = '/Projects/' + project_name + "/Models/" + name
    if version is not None:
        # Target a single version; the archive name carries the version too.
        model_dir += "/" + str(version)
        name += str(version)
    archive_path = model_dir + ".zip"
    name += ".zip"
    if dataset.path_exists(archive_path):
        if overwrite:
            # Remove the stale archive before recompressing (blocking call).
            dataset.delete(archive_path, block=True)
        else:
            raise ModelArchiveExists(
                "Model archive file already exists at {}. Either set overwrite=True or remove the file manually."
                .format(archive_path))
    print("Preparing the model archive...")
    # Server-side compression of the model directory (blocking call).
    dataset.compress(model_dir, block=True, project_name=project_name)
    print("Downloading the model archive...")
    dataset.download(archive_path, file=name)
def _populate_experiment_model(model, project=None): """ Args: :model: :project_name: Returns: """ if project is None: project = hdfs.project_name() return { 'id': os.environ['ML_ID'], 'model': model, 'modelProjectName': project }
def log(line, level='info', logger=None, thread="default"):
    """Ship a log line to Logstash via the configured executor logger.

    For logging to work you need to add this to logstash and restart the service:
        input {
          tcp {
            port => 5000
            codec => json
          }
        }

    Falls back to plain printing when the logger is in no-logging mode.

    Args:
        :line: the message to log
        :level: logger method to use ('info', 'warning', ...)
        :logger: optional pre-configured logger; a per-executor logger is
            created when omitted
        :thread: currently unused, kept for backward compatibility

    Returns:
        True when the line was handed off (or printed in no-logging mode);
        None when no logger could be obtained
    """
    if logger is not None:
        mlogger = logger
    else:
        # Maybe needs to add executor here also if we have multiple
        mlogger = _get_logger("executor-logger-%s" % os.environ['CONTAINER_ID'])
    if not mlogger:
        print("Logger error returned None")
        return
    # FIX: this no-logging early-return block was duplicated verbatim in the
    # original; a single copy is sufficient. (Also dropped an unused
    # `import time`.)
    if hasattr(mlogger, "_no_logging"):
        print(line)
        return True
    import datetime
    # Extra fields are attached to the logstash message.
    extra = {
        'application': [hdfs.project_name().lower(), "jupyter", "notebook", "executor128"],
        'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        'priority': level
        #'thread' : thread
    }
    getattr(mlogger, level)('%s', line, extra=extra)
    return True
def exists(serving_name):
    """
    Checks if there exists a serving with the given name

    Example use-case:

    >>> from hops import serving
    >>> serving.exist(serving_name)

    Args:
        :serving_name: the name of the serving

    Returns:
        True if the serving exists, otherwise false
    """
    try:
        found = get_id(serving_name) is not None
    except ServingNotFound:
        # Lookup failed: report and treat as non-existent.
        print("No serving with name {} was found in the project {}".format(
            serving_name, hdfs.project_name()))
        return False
    return found
def _populate_experiment(model_name, function, type, hp, description, app_id, direction, optimization_key):
    """Build the experiment-creation payload sent to the Hopsworks backend.

    Args:
        :model_name: experiment name; replaced by the job name when it is
            'no-name' and a job name is available
        :function: entry-point function name
        :type: experiment type identifier
        :hp: hyperparameters (currently not included in the payload)
        :description: free-text description
        :app_id: Spark application id
        :direction: optimization direction
        :optimization_key: metric key being optimized

    Returns:
        dict describing the experiment with state RUNNING
    """
    jobName = os.environ.get(constants.ENV_VARIABLES.JOB_NAME_ENV_VAR)
    kernelId = os.environ.get(constants.ENV_VARIABLES.KERNEL_ID_ENV_VAR)
    # Prefer the job name over the 'no-name' placeholder when running as a job.
    if model_name == 'no-name' and jobName:
        model_name = jobName
    return {
        'id': os.environ['ML_ID'],
        'name': model_name,
        'projectName': hdfs.project_name(),
        'description': description,
        'state': 'RUNNING',
        'function': function,
        'experimentType': type,
        'appId': app_id,
        'direction': direction,
        'optimizationKey': optimization_key,
        'jobName': jobName,
        'kernelId': kernelId,
    }
def populate_experiment(sc, model_name, module, function, logdir, hyperparameter_space, versioned_resources, description):
    """Serialize the initial experiment metadata to a JSON string.

    Args:
        :sc: SparkContext of the running application
        :model_name: name of the experiment/model
        :module: module the experiment was launched from
        :function: entry-point function name
        :logdir: HDFS log directory of the experiment
        :hyperparameter_space: serialized hyperparameter space (may be None)
        :versioned_resources: versioned resource paths (may be None)
        :description: free-text description

    Returns:
        JSON string describing the experiment with status RUNNING
    """
    user = os.environ.get('HOPSWORKS_USER')
    payload = {
        'project': hdfs.project_name(),
        'user': user,
        'name': model_name,
        'module': module,
        'function': function,
        'status': 'RUNNING',
        'app_id': sc.applicationId,
        'start': datetime.now().isoformat(),
        'memory_per_executor': str(sc._conf.get("spark.executor.memory")),
        'gpus_per_executor': str(sc._conf.get("spark.executor.gpus")),
        'executors': str(sc._conf.get("spark.executor.instances")),
        'logdir': logdir,
        'hyperparameter_space': hyperparameter_space,
        'versioned_resources': versioned_resources,
        'description': description,
    }
    return json.dumps(payload)
def mirrored(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training* single machine - multiple GPUs

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training)

    Args:
        :map_fun: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored
    """
    # MirroredStrategy is single-machine: no parameter servers allowed.
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"
    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")
    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True
        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)
        mirrored_impl.run_id = mirrored_impl.run_id + 1
        # Version resources and publish the initial RUNNING document.
        versioned_path = util._version_resources(versioned_resources, mirrored_impl._get_logdir(app_id))
        experiment_json = util._populate_experiment(sc, name, 'experiment', 'mirrored', mirrored_impl._get_logdir(app_id), None, versioned_path, description)
        # NOTE(review): resources are versioned a second time here — looks
        # redundant with the call above; confirm before removing.
        util._version_resources(versioned_resources, mirrored_impl._get_logdir(app_id))
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
        # Run the user code; on return, finalize and republish the document.
        retval, logdir = mirrored_impl._launch(sc, map_fun, local_logdir=local_logdir, name=name)
        experiment_json = util._finalize_experiment(experiment_json, None, retval)
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")
    return logdir
def begin(name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    Start a custom Experiment, at the end of the experiment call *end(metric)*.

    *IMPORTANT* - This call should not be combined with other functions in the experiment module, other than *end*.
    Other experiment functions such as *grid_search* manages the *begin* and *end* functions internally

    Example usage:

    >>> from hops import experiment
    >>> experiment.begin(name='calculate pi')
    >>> # Code to calculate pi
    >>> pi = calc_pi()
    >>> experiment.end(pi)

    Args:
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored
    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.stop() to stop it.")
    try:
        global app_id
        global experiment_json
        global elastic_id
        global run_id
        global driver_tensorboard_hdfs_path
        running = True
        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)
        run_id = run_id + 1
        # Version resources and publish the initial RUNNING document.
        versioned_path = util._version_resources(versioned_resources, _get_logdir(app_id))
        experiment_json = None
        experiment_json = util._populate_experiment(sc, name, 'experiment', 'begin', _get_logdir(app_id), None, versioned_path, description)
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
        # Create the experiment directories and initialize the HDFS logger.
        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(app_id, run_id, None, 'begin')
        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
        hopshdfs._init_logger()
        # Register TensorBoard for this run; the HDFS path is what we return.
        driver_tensorboard_hdfs_path, _ = tensorboard._register(hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)
    except:
        _exception_handler()
        raise
    return driver_tensorboard_hdfs_path
def parameter_server(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored
    """
    # ParameterServerStrategy needs at least one PS, and fewer PS than executors.
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()
    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps cannot be greater than num_executors (i.e. num_executors == num_ps + num_workers)"
    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")
    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True
        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)
        ps.run_id = ps.run_id + 1
        # Version resources and publish the initial RUNNING document.
        versioned_path = util._version_resources(versioned_resources, ps._get_logdir(app_id))
        experiment_json = util._populate_experiment(sc, name, 'experiment', 'parameter_server', ps._get_logdir(app_id), None, versioned_path, description)
        # NOTE(review): resources are versioned a second time here — looks
        # redundant with the call above; confirm before removing.
        util._version_resources(versioned_resources, ps._get_logdir(app_id))
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
        # Run the user code; on return, finalize and republish the document.
        retval, logdir = ps._launch(sc, map_fun, local_logdir=local_logdir, name=name)
        experiment_json = util._finalize_experiment(experiment_json, None, retval)
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")
    return logdir
def launch(map_fun, args_dict=None, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *map_fun* one time with no arguments or multiple times with different arguments if
    *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    logdir = tensorboard.logdir()
    >>>    # code for preprocessing, training and exporting model
    >>>    # optionally return a value for the experiment which is registered in Experiments service
    >>> experiment.launch(train_nn)

    Args:
        :map_fun: The function to run
        :args_dict: If specified will run the same function multiple times with different arguments, {'a':[1,2], 'b':[5,3]}
         would run the function two times with arguments (1,5) and (2,3) provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored
    """
    # launch() is single-machine: no parameter servers allowed.
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"
    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")
    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True
        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)
        launcher.run_id = launcher.run_id + 1
        # Version resources and publish the initial RUNNING document; the
        # args_dict (if any) becomes the serialized hyperparameter space.
        versioned_path = util._version_resources(versioned_resources, launcher._get_logdir(app_id))
        experiment_json = None
        if args_dict:
            experiment_json = util._populate_experiment(sc, name, 'experiment', 'launcher', launcher._get_logdir(app_id), json.dumps(args_dict), versioned_path, description)
        else:
            experiment_json = util._populate_experiment(sc, name, 'experiment', 'launcher', launcher._get_logdir(app_id), None, versioned_path, description)
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
        # Run the user code (once, or once per argument combination).
        retval, tensorboard_logdir = launcher._launch(sc, map_fun, args_dict, local_logdir)
        util._version_resources(versioned_resources, launcher._get_logdir(app_id))
        if retval:
            # Finalize with the returned metric and exit early; the finally
            # block below still performs the cleanup.
            experiment_json = util._finalize_experiment(experiment_json, None, retval)
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
            return tensorboard_logdir
        experiment_json = util._finalize_experiment(experiment_json, None, None)
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")
    return tensorboard_logdir
def project_name(self):
    """Return the name of the project this experiment runs in."""
    current_project = hopshdfs.project_name()
    return current_project
def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mode=InputMode.TENSORFLOW, log_dir=None, driver_ps_nodes=False, master_node=None, reservation_timeout=600, name='no-name', local_logdir=False, versioned_resources=None, description=None, queues=['input', 'output', 'error']):
    """Starts the TensorFlowOnSpark cluster and Runs the TensorFlow "main" function on the Spark executors

    Args:
        :sc: SparkContext
        :map_fun: user-supplied TensorFlow "main" function
        :tf_args: ``argparse`` args, or command-line ``ARGV``.  These will be passed to the ``map_fun``.
        :num_executors: number of Spark executors.  This should match your Spark job's ``--num_executors``.
        :num_ps: number of Spark executors which are reserved for TensorFlow PS nodes.  All other executors will be used as TensorFlow worker nodes.
        :tensorboard: boolean indicating if the chief worker should spawn a Tensorboard server.
        :input_mode: TFCluster.InputMode
        :log_dir: directory to save tensorboard event logs.  If None, defaults to a fixed path on local filesystem.
        :driver_ps_nodes: run the PS nodes on the driver locally instead of on the spark executors; this help maximizing computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps
        :master_node: name of the "master" or "chief" node in the cluster_template, used for `tf.estimator` applications.
        :reservation_timeout: number of seconds after which cluster reservation times out (600 sec default)
        :queues: *INTERNAL_USE*

    Returns:
        A TFCluster object representing the started cluster.
    """
    # NOTE(review): `queues` is a mutable default argument — safe only if it
    # is never mutated anywhere; confirm.
    #in hopsworks we want the tensorboard to always be true:
    global elastic_id
    global running
    global run_id
    tb = True
    elastic_id = elastic_id + 1
    run_id = run_id + 1
    running = True
    logging.info(
        "Reserving TFSparkNodes {0}".format("w/ TensorBoard" if tb else ""))
    assert num_ps < num_executors
    if driver_ps_nodes:
        raise Exception(
            'running PS nodes on driver is not supported and not needed on Hops Hadoop, since we have GPU scheduling.')
    if log_dir:
        raise Exception(
            'No need to specify log_dir directory, we save TensorBoard events in the directory returned by tensorboard.logdir for you')
    # build a cluster_spec template using worker_nums
    cluster_template = {}
    cluster_template['ps'] = range(num_ps)
    if master_node is None:
        cluster_template['worker'] = range(num_ps, num_executors)
    else:
        # tf.estimator-style cluster: a single master, remaining are workers.
        cluster_template[master_node] = range(num_ps, num_ps + 1)
        if num_executors > num_ps + 1:
            cluster_template['worker'] = range(num_ps + 1, num_executors)
    logging.info("cluster_template: {}".format(cluster_template))
    # get default filesystem from spark
    defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    # strip trailing "root" slash from "file:///" to be consistent w/ "hdfs://..."
    if defaultFS.startswith(
            "file://") and len(defaultFS) > 7 and defaultFS.endswith("/"):
        defaultFS = defaultFS[:-1]
    # get current working dir of spark launch
    working_dir = os.getcwd()
    # start a server to listen for reservations and broadcast cluster_spec
    server = reservation.Server(num_executors)
    server_addr = server.start()
    # start TF nodes on all executors
    logging.info("Starting TensorFlow on executors")
    cluster_meta = {
        'id': random.getrandbits(64),
        'cluster_template': cluster_template,
        'num_executors': num_executors,
        'default_fs': defaultFS,
        'working_dir': working_dir,
        'server_addr': server_addr
    }
    nodeRDD = sc.parallelize(range(num_executors), num_executors)
    global app_id
    app_id = sc.applicationId
    global experiment_json
    # Version resources and publish the initial experiment document, keyed
    # with a 'dist' prefix for distributed experiments.
    versioned_path = util.version_resources(versioned_resources, get_logdir(app_id))
    experiment_json = None
    experiment_json = util.populate_experiment(sc, name, 'TFCluster', 'run', get_logdir(app_id), None, versioned_path, description)
    util.put_elastic(hopshdfs.project_name(), app_id, str('dist' + str(elastic_id)), experiment_json)

    # start TF on a background thread (on Spark driver) to allow for feeding job
    def _start(status):
        try:
            nodeRDD.foreachPartition(
                TFSparkNode.run(map_fun, tf_args, cluster_meta, tb, None,
                                app_id, run_id, queues,
                                local_logdir=local_logdir,
                                background=(input_mode == InputMode.SPARK)))
        except Exception as e:
            logging.error("Exception in TF background thread")
            status['error'] = str(e)
            exception_handler()

    # NOTE(review): tf_status is not defined in this function — presumably a
    # module-level dict shared with shutdown(); confirm it exists.
    t = threading.Thread(target=_start, args=(tf_status, ))
    # run as daemon thread so that in spark mode main thread can exit
    # if feeder spark stage fails and main thread can't do explicit shutdown
    t.daemon = True
    t.start()
    # wait for executors to check GPU presence
    logging.info("Waiting for GPU presence check to start")
    gpus_present = server.await_gpu_check()
    logging.info("All GPU checks completed")
    # wait for executors to register and start TFNodes before continuing
    logging.info("Waiting for TFSparkNodes to start")
    cluster_info = server.await_reservations(sc, tf_status, reservation_timeout)
    logging.info("All TFSparkNodes started")
    # print cluster_info and extract TensorBoard URL
    tb_url = None
    for node in cluster_info:
        logging.info(node)
        if node['tb_port'] != 0:
            tb_url = "http://{0}:{1}".format(node['host'], node['tb_port'])
    if tb_url is not None:
        logging.info(
            "========================================================================================"
        )
        logging.info("")
        logging.info("TensorBoard running at: {0}".format(tb_url))
        logging.info("")
        logging.info(
            "========================================================================================"
        )
    # since our "primary key" for each executor's TFManager is (host, executor_id), sanity check for duplicates
    # Note: this may occur if Spark retries failed Python tasks on the same executor.
    tb_nodes = set()
    for node in cluster_info:
        node_id = (node['host'], node['executor_id'])
        if node_id in tb_nodes:
            raise Exception(
                "Duplicate cluster node id detected (host={0}, executor_id={1})"
                .format(node_id[0], node_id[1]) +
                "Please ensure that:\n" +
                "1. Number of executors >= number of TensorFlow nodes\n" +
                "2. Number of tasks per executors is 1\n" +
                "3, TFCluster.shutdown() is successfully invoked when done.")
        else:
            tb_nodes.add(node_id)
    # create TFCluster object
    cluster = TFCluster()
    cluster.sc = sc
    cluster.meta = cluster_meta
    cluster.nodeRDD = nodeRDD
    cluster.cluster_info = cluster_info
    cluster.cluster_meta = cluster_meta
    cluster.input_mode = input_mode
    cluster.queues = queues
    cluster.server = server
    return cluster
def shutdown(self, ssc=None):
    """Stops the distributed TensorFlow cluster.

    Args:
        :ssc: *For Streaming applications only*. Spark StreamingContext
    """
    logging.info("Stopping TensorFlow nodes")
    # identify ps/workers
    ps_list, worker_list = [], []
    for node in self.cluster_info:
        if node['job_name'] == 'ps':
            ps_list.append(node)
        else:
            worker_list.append(node)
    if ssc is not None:
        # Spark Streaming: poll until the streaming context terminates or the
        # reservation server reports completion.
        done = False
        while not done:
            done = ssc.awaitTerminationOrTimeout(1)
            if not done and self.server.done:
                logging.info("Server done, stopping StreamingContext")
                ssc.stop(stopSparkContext=False, stopGraceFully=True)
            done = done or self.server.done
    else:
        # in TENSORFLOW mode, there is no "data feeding" job, only a "start" job, so we must wait for the TensorFlow workers
        # to complete all tasks, while accounting for any PS tasks which run indefinitely.
        if self.input_mode == InputMode.TENSORFLOW:
            count = 0
            done = False
            while not done:
                st = self.sc.statusTracker()
                jobs = st.getActiveJobsIds()
                if len(jobs) > 0:
                    stages = st.getActiveStageIds()
                    for i in stages:
                        si = st.getStageInfo(i)
                        if si.numActiveTasks == len(ps_list):
                            # if we only have PS tasks left, check that we see this condition a couple times
                            count += 1
                            done = (count >= 3)
                            time.sleep(5)
                else:
                    done = True
    global running
    running = False
    # shutdown queues and managers for "worker" executors.
    # note: in SPARK mode, this job will immediately queue up behind the "data feeding" job.
    # in TENSORFLOW mode, this will only run after all workers have finished.
    workers = len(worker_list)
    workerRDD = self.sc.parallelize(range(workers), workers)
    workerRDD.foreachPartition(
        TFSparkNode.shutdown(self.cluster_info, self.queues))
    # exit Spark application w/ err status if TF job had any errors
    # NOTE(review): tf_status is presumably a module-level dict written by the
    # background thread in run(); confirm.
    if 'error' in tf_status:
        logging.error("Exiting Spark application with error status.")
        exception_handler()
        self.sc.cancelAllJobs()
        #self.sc.stop()
        #sys.exit(1)
    global experiment_json
    global app_id
    # Finalize and republish the experiment document (keyed with 'dist').
    experiment_json = util.finalize_experiment(experiment_json, None, None)
    util.put_elastic(hopshdfs.project_name(), app_id,
                     str('dist' + str(elastic_id)), experiment_json)
    logging.info("Shutting down cluster")
    # shutdown queues and managers for "PS" executors.
    # note: we have to connect/shutdown from the spark driver, because these executors are "busy" and won't accept any other tasks.
    for node in ps_list:
        addr = node['addr']
        authkey = node['authkey']
        m = TFManager.connect(addr, authkey)
        q = m.get_queue('control')
        # A None message signals the PS node to stop.
        q.put(None)
        q.join()
    # wait for all jobs to finish
    done = False
    while not done:
        time.sleep(5)
        st = self.sc.statusTracker()
        jobs = st.getActiveJobsIds()
        if len(jobs) == 0:
            break

def tensorboard_url(self):
    """ Utility function to get Tensorboard URL """
    # The TensorBoard lives on worker 0; return its host:port URL, or None.
    tb_url = None
    for node in self.cluster_info:
        if node['tb_port'] != 0 and node[
                'job_name'] == 'worker' and node['task_index'] == 0:
            tb_url = "http://{0}:{1}".format(node['host'], node['tb_port'])
    return tb_url
def start_beam_jobserver(flink_session_name,
                         artifacts_dir="Resources",
                         jobserver_jar=None,
                         sdk_worker_parallelism=1):
    """
    Start the Java Beam job server that connects to the flink session cluster. User needs to provide the
    job name that started the Flink session and optionally the worker parallelism.

    Args:
      :flink_session_name: Job name that runs the Flink session.
      :artifacts_dir: Project dataset where Beam job artifacts are staged (default: "Resources").
      :jobserver_jar: Path to the Beam Flink job server jar; when None, the jar bundled in the
      Flink conf dir is used.
      :sdk_worker_parallelism: Default parallelism for SDK worker processes. This option is only applied when the
      pipeline option sdkWorkerParallelism is set to 0.Default is 1, If 0, worker parallelism will be dynamically
      decided by runner.See also: sdkWorkerParallelism Pipeline Option (default: 1). For further documentation,
      please refer to Apache Beam docs.

    Returns:
        Dict with the job server log path, artifact_port, expansion_port, job_host, job_port and jobserver.pid
    """
    if jobserver_jar is None:
        jobserver_jar = os.path.join(
            util.get_flink_conf_dir(),
            "beam-runners-flink-1.8-job-server-2.15.0.jar")
    # Get Flink master URL (flink session cluster) from an ExecutionDTO:
    # look up the most recent execution of the session job via the Hopsworks REST API.
    method = constants.HTTP_CONFIG.HTTP_GET
    resource_url = constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   hopsfs.project_id() + constants.DELIMITERS.SLASH_DELIMITER + \
                   "jobs" + constants.DELIMITERS.SLASH_DELIMITER + \
                   flink_session_name + constants.DELIMITERS.SLASH_DELIMITER + \
                   "executions" + \
                   "?limit=1&offset=0&sort_by=submissionTime:desc"
    response = util.send_request(method, resource_url)
    response_object = response.json()
    flink_master_url = response_object['items'][0]['flinkMasterURL']

    # Pick random high ports for the job server endpoints; the job server itself
    # will fail fast if one happens to be taken.
    artifact_port = randint(10000, 65000)
    expansion_port = randint(10000, 65000)
    job_port = randint(10000, 65000)
    job_host = socket.getfqdn()

    # Write the job server log under the YARN container log dir when available.
    log_base_path = ""
    if 'LOG_DIRS' in os.environ:
        log_base_path += os.environ['LOG_DIRS'] + "/"

    beam_jobserver_log = log_base_path + "beamjobserver-" + hopsfs.project_name().lower() + "-" + \
                         flink_session_name + "-" + str(job_port) + ".log"
    # BUGFIX: the log file used to be opened twice in "wb" mode (once for stdout,
    # once for stderr); the two truncating, independently-buffered handles clobbered
    # each other's output. Open it once and merge stderr into stdout instead.
    with open(beam_jobserver_log, "wb") as out:
        jobserver = subprocess.Popen(
            [
                "java",
                "-jar", jobserver_jar,
                "--artifacts-dir=%s" % hopsfs.project_path() + artifacts_dir,
                "--flink-master-url=%s" % flink_master_url,
                "--artifact-port=%d" % artifact_port,
                "--expansion-port=%d" % expansion_port,
                "--job-host=%s" % job_host,
                "--job-port=%d" % job_port,
                "--sdk-worker-parallelism=%d" % sdk_worker_parallelism
            ],
            stdout=out,
            stderr=subprocess.STDOUT,
            # ensure the job server dies with the executor
            preexec_fn=util._on_executor_exit('SIGTERM'))

    # Record the session and the job server endpoint in module globals so other
    # helpers can find the running job server.
    global clusters
    clusters.append(flink_session_name)
    global jobserver_host
    jobserver_host = job_host
    global jobserver_port
    jobserver_port = job_port
    return {
        "jobserver_log": beam_jobserver_log,
        "artifact_port": artifact_port,
        "expansion_port": expansion_port,
        "job_host": job_host,
        "job_port": job_port,
        "jobserver.pid": jobserver.pid
    }
def differential_evolution(objective_function, boundary_dict, direction = 'max', generations=10, population=10, mutation=0.5, crossover=0.7, cleanup_generations=False, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate':[0.01, 0.2], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction='max')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the correspond value should be a list of two elements. The first element being the lower bound for the parameter and the the second element the upper bound.
        :direction: 'max' to maximize the returned metric, 'min' to minimize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :cleanup_generations: remove previous generations from HDFS, only keep the last 2
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Raises:
        RuntimeError: if another experiment is already running.

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters
    """
    # differential evolution launches all trials as Spark tasks; parameter servers make no sense here
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    # only one experiment may run at a time per driver (module-global flag)
    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        spark = util._find_spark()
        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        # bump the per-app run counter so each run gets its own logdir
        diff_evo.run_id = diff_evo.run_id + 1

        versioned_path = util._version_resources(versioned_resources, diff_evo._get_logdir(app_id))

        # register the experiment as RUNNING in elastic before launching the search
        experiment_json = None
        experiment_json = util._populate_experiment(sc, name, 'experiment', 'differential_evolution', diff_evo._get_logdir(app_id), json.dumps(boundary_dict), versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        # run the evolutionary search; blocks until all generations are evaluated
        tensorboard_logdir, best_param, best_metric = diff_evo._search(spark, objective_function, boundary_dict, direction=direction, generations=generations, popsize=population, mutation=mutation, crossover=crossover, cleanup_generations=cleanup_generations, local_logdir=local_logdir, name=name)

        # record the winning hyperparameters and metric in elastic
        experiment_json = util._finalize_experiment(experiment_json, best_param, best_metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                          experiment_json)

        best_param_dict = util._convert_to_dict(best_param)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id +=1
        running = False
        sc.setJobGroup("", "")

    return tensorboard_logdir, best_param_dict
# Copyright (C) 2020, Logical Clocks AB. All rights reserved # !/usr/bin/env python # -*- coding: utf-8 -*- import sys from hops import hdfs arguments = len(sys.argv) - 1 position = 1 print("Parameters%s" % (len(sys.argv) - 1)) while (arguments >= position): print("Parameter %i: %s" % (position, sys.argv[position])) position = position + 1 td_proj_name = hdfs.project_name() if arguments >= 1: td_proj_name = sys.argv[1] td_ds = td_proj_name + '_Training_Datasets' td = 'mnist_td_1' if arguments >= 2: td = sys.argv[2] model_proj_name = None if arguments >= 3: model_proj_name = sys.argv[3] model_name = 'mnist_model' if arguments >= 4: model_name = sys.argv[4]
def grid_search(map_fun, args_dict, direction='max', name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run multiple experiments and test a grid of hyperparameters for a neural network to maximize e.g. a Neural Network's accuracy.

    The following example will run *train_nn* with 6 different hyperparameter combinations

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate':[0.1, 0.3], 'dropout': [0.4, 0.6, 0.1]}
    >>> def train_nn(learning_rate, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction='max')

    The following values will be injected in the function and run and evaluated.

        - (learning_rate=0.1, dropout=0.4)
        - (learning_rate=0.1, dropout=0.6)
        - (learning_rate=0.1, dropout=0.1)
        - (learning_rate=0.3, dropout=0.4)
        - (learning_rate=0.3, dropout=0.6)
        - (learning_rate=0.3, dropout=0.1)

    Args:
        :map_fun: the function to run, must return a metric
        :args_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: 'max' to maximize the returned metric, 'min' to minimize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Raises:
        RuntimeError: if another experiment is already running.

    Returns:
        HDFS path in your project where the experiment is stored
    """
    # grid search launches all trials as Spark tasks; parameter servers make no sense here
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    # only one experiment may run at a time per driver (module-global flag)
    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        # bump the per-app run counter so each run gets its own logdir
        gs.run_id = gs.run_id + 1

        # BUGFIX: resources were previously versioned twice — a second, redundant
        # util._version_resources call (whose result was discarded) ran right after
        # this one, copying the resources to HDFS again. differential_evolution
        # versions them exactly once; grid_search now does the same.
        versioned_path = util._version_resources(versioned_resources, gs._get_logdir(app_id))

        # register the experiment as RUNNING in elastic before launching the grid
        experiment_json = util._populate_experiment(sc, name, 'experiment', 'grid_search', gs._get_logdir(app_id), json.dumps(args_dict), versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        # expand the per-argument lists into the cartesian product of combinations
        grid_params = util.grid_params(args_dict)

        # run all combinations; blocks until every trial has finished
        tensorboard_logdir, param, metric = gs._grid_launch(sc, map_fun, grid_params, direction=direction, local_logdir=local_logdir, name=name)

        # record the winning hyperparameters and metric in elastic
        experiment_json = util._finalize_experiment(experiment_json, param, metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")

    return tensorboard_logdir