def __init__(self, count):
    """
    Initialise the reservation bookkeeping.

    Args:
        count: positive integer forwarded to ``Reservations``

    """
    assert count > 0
    self.reservations = Reservations(count)
    # Number handed to WorkerFinished: all executors minus the parameter servers.
    non_ps_executors = util.num_executors() - util.num_param_servers()
    self.worker_finished = WorkerFinished(non_ps_executors)
def _run(sc, map_fun, run_id, local_logdir=False, name="no-name", evaluator=False):
    """
    Run *map_fun* once per executor and return the log directory together
    with the parsed outputs file, if one was written.

    Args:
        sc: SparkContext used to schedule the job
        map_fun: function handed to ``_prepare_func`` for execution on the executors
        run_id: run identifier, forwarded to ``_prepare_func`` and used to resolve the log directory
        local_logdir: forwarded to ``_prepare_func``
        name: experiment name, shown in the Spark job group description
        evaluator: forwarded to ``_prepare_func``

    Returns:
        Tuple ``(logdir, outputs)``. *outputs* is the JSON-decoded content of
        ``<logdir>/.outputs.json`` or None when that file does not exist.

    """
    app_id = str(sc.applicationId)

    num_executions = util.num_executors()

    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Make SparkUI intuitive by grouping jobs
    sc.setJobGroup(os.environ['ML_ID'],
                   "{} | ParameterServerStrategy - Distributed Training".format(name))

    server = parameter_server_reservation.Server(num_executions)
    server_addr = server.start()

    num_ps = util.num_param_servers()

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, local_logdir, server_addr, num_ps, evaluator))

    logdir = experiment_utils._get_logdir(app_id, run_id)

    print('Finished Experiment \n')

    path_to_return = logdir + '/.outputs.json'
    if pydoop.hdfs.path.exists(path_to_return):
        # The with-statement closes the file on exit; no explicit close() needed.
        with pydoop.hdfs.open(path_to_return, "r") as fi:
            contents = fi.read()
        return logdir, json.loads(contents)

    return logdir, None
def _launch(sc, map_fun, local_logdir=False, name="no-name"):
    """
    Run *map_fun* once per executor and return the metric file content,
    if one was written, together with the log directory.

    The module-level ``run_id`` is read and forwarded to ``_prepare_func``.

    Args:
        sc: SparkContext used to schedule the job
        map_fun: function handed to ``_prepare_func`` for execution on the executors
        local_logdir: forwarded to ``_prepare_func``
        name: experiment name, shown in the Spark job group description

    Returns:
        Tuple ``(metric, logdir)``. *metric* is the content of ``<logdir>/metric``
        converted to float, or None when that file does not exist.

    """
    global run_id

    app_id = str(sc.applicationId)

    num_executions = int(sc._conf.get("spark.executor.instances"))

    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Make SparkUI intuitive by grouping jobs
    sc.setJobGroup("ParameterServerStrategy", "{} | Distributed Training".format(name))

    server = parameter_server_reservation.Server(num_executions)
    server_addr = server.start()

    num_ps = util.num_param_servers()

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, local_logdir, server_addr, num_ps))

    logdir = _get_logdir(app_id)

    path_to_metric = logdir + '/metric'
    if pydoop.hdfs.path.exists(path_to_metric):
        # The with-statement closes the file on exit; no explicit close() needed.
        with pydoop.hdfs.open(path_to_metric, "r") as fi:
            metric = float(fi.read())
        return metric, logdir

    # Only reached when no metric file was found.
    print('Finished Experiment \n')

    return None, logdir
def mirrored(train_fn, name='no-name', local_logdir=False, description=None, evaluator=False, metric_key=None):
    """
    *Distributed Training*

    Run *train_fn*, which contains the MirroredStrategy code, through
    ``mirrored_impl`` and record the experiment via ``experiment_utils``.

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training, local_logdir=True)

    Args:
        :train_fn: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, run_id

    ps_count = util.num_param_servers()
    assert ps_count == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running.")

    worker_count = util.num_executors()
    assert not evaluator or worker_count > 2, "number of workers must be atleast 3 if evaluator is set to True"

    started_at = time.time()

    def _elapsed_ms():
        # Wall-clock time since the call started, in milliseconds.
        return experiment_utils._seconds_to_milliseconds(time.time() - started_at)

    spark = util._find_spark()
    sc = spark.sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'mirrored', 'DISTRIBUTED_TRAINING', None, description, app_id, None, None)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = mirrored_impl._run(
            sc, train_fn, run_id, local_logdir=local_logdir, name=name, evaluator=evaluator)

        duration = _elapsed_ms()
        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(
            experiment_json, metric, app_id, run_id, 'FINISHED', duration, logdir, None, None)

        return logdir, return_dict
    except:
        # Catch-all on purpose: hand the elapsed time to the handler, then re-raise.
        _exception_handler(_elapsed_ms())
        raise
    finally:
        _end_run(sc)
def grid_search(train_fn, grid_dict, direction=Direction.MAX, name='no-name', local_logdir=False, description=None, optimization_key='metric'):
    """
    *Parallel Experiment*

    Run a grid search over a predefined set of hyperparameter combinations.
    *train_fn* is treated as a blackbox that returns a metric for a given
    hyperparameter combination; the metric decides how 'good' the combination was.

    Example usage:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :train_fn: the function to run, must return a metric
        :grid_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, run_id

    ps_count = util.num_param_servers()
    assert ps_count == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running.")

    started_at = time.time()

    def _elapsed_ms():
        # Wall-clock time since the call started, in milliseconds.
        return experiment_utils._seconds_to_milliseconds(time.time() - started_at)

    spark = util._find_spark()
    sc = spark.sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'grid_search', 'PARALLEL_EXPERIMENTS', json.dumps(grid_dict),
            description, app_id, direction, optimization_key)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        # Expand the dict of value lists into the combinations to run.
        combinations = experiment_utils.grid_params(grid_dict)

        logdir, best_param, best_metric, return_dict = grid_search_impl._run(
            sc, train_fn, run_id, combinations,
            direction=direction, local_logdir=local_logdir, name=name,
            optimization_key=optimization_key)

        duration = _elapsed_ms()
        experiment_dir = experiment_utils._get_logdir(app_id, run_id)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_dir, logdir, optimization_key)

        return logdir, best_param, return_dict
    except:
        # Catch-all on purpose: hand the elapsed time to the handler, then re-raise.
        _exception_handler(_elapsed_ms())
        raise
    finally:
        _end_run(sc)
def launch(train_fn, args_dict=None, name='no-name', local_logdir=False, description=None, metric_key=None):
    """
    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *train_fn* one time with no arguments, or
    multiple times with different arguments if *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>> experiment.launch(train_nn)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> experiment.launch(train_nn)

    Args:
        :train_fn: The function to run
        :args_dict: If specified will run the same function multiple times with different arguments, {'a':[1,2], 'b':[5,3]} would run the function two times with arguments (1,5) and (2,3) provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        Tuple of the HDFS path in your project where the experiment is stored and the return dict produced by ``launcher._run``

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, run_id

    ps_count = util.num_param_servers()
    assert ps_count == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    started_at = time.time()

    def _elapsed_ms():
        # Wall-clock time since the call started, in milliseconds.
        return experiment_utils._seconds_to_milliseconds(time.time() - started_at)

    spark = util._find_spark()
    sc = spark.sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        # Arguments are recorded as JSON only when some were supplied.
        serialized_args = json.dumps(args_dict) if args_dict else None
        experiment_json = experiment_utils._populate_experiment(
            name, 'launch', 'EXPERIMENT', serialized_args, description, app_id, None, None)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = launcher._run(sc, train_fn, run_id, args_dict, local_logdir)

        duration = _elapsed_ms()
        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(
            experiment_json, metric, app_id, run_id, 'FINISHED', duration, logdir, None, None)

        return logdir, return_dict
    except:
        # Catch-all on purpose: hand the elapsed time to the handler, then re-raise.
        _exception_handler(_elapsed_ms())
        raise
    finally:
        _end_run(sc)
def random_search(train_fn, boundary_dict, direction=Direction.MAX, samples=10, name='no-name', local_logdir=False, description=None, optimization_key='metric'):
    """
    *Parallel Experiment*

    Run an Experiment contained in *train_fn* for a configured number of random
    samples controlled by the *samples* parameter. Each hyperparameter is
    contained in *boundary_dict* with the key corresponding to the name of the
    hyperparameter and a list containing two elements defining the lower and
    upper bound. The experiment must return a metric corresponding to how
    'good' the given hyperparameter combination is.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.random_search(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.random_search(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :train_fn: The function to run
        :boundary_dict: dict containing hyperparameter name and corresponding boundaries, each experiment randomizes a value in the boundary range.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :samples: the number of random samples to evaluate for each hyperparameter given the boundaries, for example samples=3 would result in 3 hyperparameter combinations in total to evaluate
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, run_id

    ps_count = util.num_param_servers()
    assert ps_count == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running.")

    started_at = time.time()

    def _elapsed_ms():
        # Wall-clock time since the call started, in milliseconds.
        return experiment_utils._seconds_to_milliseconds(time.time() - started_at)

    spark = util._find_spark()
    sc = spark.sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'random_search', 'PARALLEL_EXPERIMENTS', json.dumps(boundary_dict),
            description, app_id, direction, optimization_key)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        # NOTE(review): *name* is not forwarded here although grid_search passes
        # name=name to its implementation - confirm whether that is intended.
        logdir, best_param, best_metric, return_dict = r_search_impl._run(
            sc, train_fn, run_id, boundary_dict, samples,
            direction=direction, local_logdir=local_logdir,
            optimization_key=optimization_key)

        duration = _elapsed_ms()
        experiment_dir = experiment_utils._get_logdir(app_id, run_id)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_dir, logdir, optimization_key)

        return logdir, best_param, return_dict
    except:
        # Catch-all on purpose: hand the elapsed time to the handler, then re-raise.
        _exception_handler(_elapsed_ms())
        raise
    finally:
        _end_run(sc)
def parameter_server(map_fun, name='no-name', local_logdir=False, description=None, evaluator=False):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, run_id

    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps cannot be greater than num_executors (i.e. num_executors == num_ps + num_workers)"
    if evaluator:
        # One of the non-parameter-server executors becomes the evaluator.
        assert num_executors - num_ps > 2, "number of workers must be atleast 3 if evaluator is set to True"

    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'parameter_server', 'DISTRIBUTED_TRAINING', None, description, app_id, None, None)

        # Single assignment (the original assigned the same value twice).
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = ps_impl._run(
            sc, map_fun, run_id, local_logdir=local_logdir, name=name, evaluator=evaluator)

        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        # No metric is recorded for this experiment type (None is passed).
        experiment_utils._finalize_experiment(
            experiment_json, None, app_id, run_id, 'FINISHED', duration, logdir, None, None)

        return logdir, return_dict
    except:
        # Catch-all on purpose: hand the elapsed time to the handler, then re-raise.
        _exception_handler(experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def differential_evolution(objective_function, boundary_dict, direction=Direction.MAX, generations=4, population=6, mutation=0.5, crossover=0.7, name='no-name', local_logdir=False, description=None, optimization_key='metric'):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each
    hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given
    hyperparameter combination. The returned metric is used to evaluate how
    'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    import tensorflow
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the corresponding value should be a list of two elements. The first element being the lower bound for the parameter and the second element the upper bound.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, run_id

    ps_count = util.num_param_servers()
    assert ps_count == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running.")

    started_at = time.time()

    def _elapsed_ms():
        # Wall-clock time since the call started, in milliseconds.
        return experiment_utils._seconds_to_milliseconds(time.time() - started_at)

    spark = util._find_spark()
    sc = spark.sparkContext
    try:
        app_id = str(sc.applicationId)

        _start_run()

        # The implementation module keeps its own copy of the run id.
        diff_evo_impl.run_id = run_id

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'differential_evolution', 'PARALLEL_EXPERIMENTS', json.dumps(boundary_dict),
            description, app_id, direction, optimization_key)
        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        search_options = dict(
            direction=direction,
            generations=generations,
            population=population,
            mutation=mutation,
            crossover=crossover,
            cleanup_generations=False,
            local_logdir=local_logdir,
            name=name,
            optimization_key=optimization_key,
        )
        logdir, best_param, best_metric, return_dict = diff_evo_impl._run(
            objective_function, boundary_dict, **search_options)

        duration = _elapsed_ms()
        experiment_dir = experiment_utils._get_logdir(app_id, run_id)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_dir, logdir, optimization_key)

        return logdir, best_param, return_dict
    except:
        # Catch-all on purpose: hand the elapsed time to the handler, then re-raise.
        _exception_handler(_elapsed_ms())
        raise
    finally:
        _end_run(sc)
def mirrored(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training* single machine - multiple GPUs

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training)

    Args:
        :map_fun: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, elastic_id

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    # Bound before the try so the cleanup in `finally` cannot fail with an
    # unbound name (and hide the real error) when Spark cannot be located.
    sc = None
    try:
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        mirrored_impl.run_id = mirrored_impl.run_id + 1

        versioned_path = util._version_resources(versioned_resources, mirrored_impl._get_logdir(app_id))

        experiment_json = util._populate_experiment(
            sc, name, 'experiment', 'mirrored', mirrored_impl._get_logdir(app_id),
            None, versioned_path, description)

        # NOTE(review): resources are versioned a second time here and the
        # result is discarded - confirm whether this repeat call is needed.
        util._version_resources(versioned_resources, mirrored_impl._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, logdir = mirrored_impl._launch(sc, map_fun, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, None, retval)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        # Catch-all on purpose: run the handler for any failure, then re-raise.
        _exception_handler()
        raise
    finally:
        # cleanup spark jobs
        elastic_id += 1
        running = False
        if sc is not None:
            sc.setJobGroup("", "")

    return logdir
def parameter_server(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, elastic_id

    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps cannot be greater than num_executors (i.e. num_executors == num_ps + num_workers)"

    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    # Bound before the try so the cleanup in `finally` cannot fail with an
    # unbound name (and hide the real error) when Spark cannot be located.
    sc = None
    try:
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        ps.run_id = ps.run_id + 1

        versioned_path = util._version_resources(versioned_resources, ps._get_logdir(app_id))

        experiment_json = util._populate_experiment(
            sc, name, 'experiment', 'parameter_server', ps._get_logdir(app_id),
            None, versioned_path, description)

        # NOTE(review): resources are versioned a second time here and the
        # result is discarded - confirm whether this repeat call is needed.
        util._version_resources(versioned_resources, ps._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, logdir = ps._launch(sc, map_fun, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, None, retval)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        # Catch-all on purpose: run the handler for any failure, then re-raise.
        _exception_handler()
        raise
    finally:
        # cleanup spark jobs
        elastic_id += 1
        running = False
        if sc is not None:
            sc.setJobGroup("", "")

    return logdir
def grid_search(map_fun, args_dict, direction='max', name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run multiple experiments and test a grid of hyperparameters for a neural network to maximize e.g. a Neural Network's accuracy.

    The following example will run *train_nn* with 6 different hyperparameter combinations

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate':[0.1, 0.3], 'dropout': [0.4, 0.6, 0.1]}
    >>> def train_nn(learning_rate, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction='max')

    The following values will be injected in the function and run and evaluated.

        - (learning_rate=0.1, dropout=0.4)
        - (learning_rate=0.1, dropout=0.6)
        - (learning_rate=0.1, dropout=0.1)
        - (learning_rate=0.3, dropout=0.4)
        - (learning_rate=0.3, dropout=0.6)
        - (learning_rate=0.3, dropout=0.1)

    Args:
        :map_fun: the function to run, must return a metric
        :args_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: 'max' to maximize the returned metric, 'min' to minimize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, elastic_id

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    # Bound before the try so the cleanup in `finally` cannot fail with an
    # unbound name (and hide the real error) when Spark cannot be located.
    sc = None
    try:
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        gs.run_id = gs.run_id + 1

        versioned_path = util._version_resources(versioned_resources, gs._get_logdir(app_id))

        experiment_json = util._populate_experiment(
            sc, name, 'experiment', 'grid_search', gs._get_logdir(app_id),
            json.dumps(args_dict), versioned_path, description)

        # NOTE(review): resources are versioned a second time here and the
        # result is discarded - confirm whether this repeat call is needed.
        util._version_resources(versioned_resources, gs._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        # Expand the dict of value lists into the combinations to run.
        grid_params = util.grid_params(args_dict)

        tensorboard_logdir, param, metric = gs._grid_launch(
            sc, map_fun, grid_params, direction=direction, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, param, metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        # Catch-all on purpose: run the handler for any failure, then re-raise.
        _exception_handler()
        raise
    finally:
        # cleanup spark jobs
        elastic_id += 1
        running = False
        if sc is not None:
            sc.setJobGroup("", "")

    return tensorboard_logdir
def differential_evolution(objective_function, boundary_dict, direction = 'max', generations=10, population=10, mutation=0.5, crossover=0.7, cleanup_generations=False, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate':[0.01, 0.2], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, dropout):
    >>>    import tensorflow
    >>>    # code for preprocessing, training and exporting model
    >>>    # mandatory return a value for the experiment which is registered in Experiments service
    >>>    return network.evaluate(learning_rate, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction='max')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the corresponding value should be a list of two elements. The first element being the lower bound for the parameter and the second element the upper bound.
        :direction: 'max' to maximize the returned metric, 'min' to minimize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :cleanup_generations: remove previous generations from HDFS, only keep the last 2
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters

    Raises:
        RuntimeError: if an experiment is already running
    """
    global running, app_id, experiment_json, elastic_id

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    # Bound before the try so the cleanup in `finally` cannot fail with an
    # unbound name (and hide the real error) when Spark cannot be located.
    sc = None
    try:
        running = True

        spark = util._find_spark()
        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        diff_evo.run_id = diff_evo.run_id + 1

        versioned_path = util._version_resources(versioned_resources, diff_evo._get_logdir(app_id))

        experiment_json = util._populate_experiment(
            sc, name, 'experiment', 'differential_evolution', diff_evo._get_logdir(app_id),
            json.dumps(boundary_dict), versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        # The public *population* argument is passed on under the name *popsize*.
        tensorboard_logdir, best_param, best_metric = diff_evo._search(
            spark, objective_function, boundary_dict,
            direction=direction, generations=generations, popsize=population,
            mutation=mutation, crossover=crossover,
            cleanup_generations=cleanup_generations,
            local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, best_param, best_metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        best_param_dict = util._convert_to_dict(best_param)
    except:
        # Catch-all on purpose: run the handler for any failure, then re-raise.
        _exception_handler()
        raise
    finally:
        # cleanup spark jobs
        elastic_id += 1
        running = False
        if sc is not None:
            sc.setJobGroup("", "")

    return tensorboard_logdir, best_param_dict
def launch(map_fun, args_dict=None, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *map_fun* one time with no arguments or multiple times with different arguments if
    *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>     import tensorflow
    >>>     from hops import tensorboard
    >>>     logdir = tensorboard.logdir()
    >>>     # code for preprocessing, training and exporting model
    >>>     # optionally return a value for the experiment which is registered in Experiments service
    >>> experiment.launch(train_nn)

    Args:
        :map_fun: The function to run
        :args_dict: If specified will run the same function multiple times with different arguments, {'a':[1,2], 'b':[5,3]}
         would run the function two times with arguments (1,5) and (2,3) provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    Raises:
        :AssertionError: if *util.num_param_servers()* does not return 0
        :RuntimeError: if another experiment is already marked as running

    Any exception raised while the experiment is set up or executed is re-raised after
    *_exception_handler()* has been called.
    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    global app_id
    global experiment_json
    global elastic_id

    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    # Bound before the try block so that the finally clause can tell whether a
    # SparkContext was ever obtained. Without this, a failure in
    # util._find_spark() made the cleanup raise UnboundLocalError and hide the
    # real error.
    sc = None
    try:
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        launcher.run_id = launcher.run_id + 1

        versioned_path = util._version_resources(versioned_resources, launcher._get_logdir(app_id))

        # Clear the module-level JSON first so it does not keep an earlier
        # value if _populate_experiment raises.
        experiment_json = None
        experiment_json = util._populate_experiment(
            sc, name, 'experiment', 'launcher', launcher._get_logdir(app_id),
            json.dumps(args_dict) if args_dict else None,
            versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, tensorboard_logdir = launcher._launch(sc, map_fun, args_dict, local_logdir)

        util._version_resources(versioned_resources, launcher._get_logdir(app_id))

        # NOTE(review): this is a truthiness check, so a falsy return value
        # (e.g. 0 or 0.0) is recorded as None - confirm whether
        # "is not None" is what is intended here.
        metric = retval if retval else None
        experiment_json = util._finalize_experiment(experiment_json, None, metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        return tensorboard_logdir
    except:
        _exception_handler()
        raise
    finally:
        # cleanup spark jobs
        elastic_id += 1
        running = False
        if sc is not None:
            sc.setJobGroup("", "")
def unit2():
    """
    Parse the MNIST command-line options and run *mnist_fun* on a TFCluster.

    The function reads two names it does not define itself, *spark* and
    *mnist_fun*; both must be available in the surrounding scope when it is
    called. The cluster is started in ``TFCluster.InputMode.TENSORFLOW`` mode
    and shut down immediately after ``TFCluster.run`` returns. A start and a
    stop timestamp are printed around the run.
    """
    # Several of these imports are not referenced below; they are kept (and
    # kept in this order) so that importing behaves exactly as before.
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    import argparse
    import os
    import numpy
    import sys
    import tensorflow as tf
    import threading
    from datetime import datetime
    from hops import util
    from hops import hdfs
    from tensorflowonspark import TFCluster

    spark_context = spark.sparkContext
    executor_count = util.num_executors(spark)
    ps_count = util.num_param_servers(spark)

    def _project_path(suffix):
        # Build a path below the current project's HDFS root.
        return '/Projects/' + hdfs.project_name() + suffix

    arg_parser = argparse.ArgumentParser()

    # (flags, keyword arguments) for every option, in registration order.
    option_table = (
        (("-e", "--epochs"),
         {"help": "number of epochs", "type": int, "default": 0}),
        (("-f", "--format"),
         {"help": "example format: (csv|pickle|tfr)",
          "choices": ["csv", "pickle", "tfr"], "default": "csv"}),
        (("-i", "--images"),
         {"help": "HDFS path to MNIST images in parallelized format",
          "default": _project_path('/mnist/train/images')}),
        (("-l", "--labels"),
         {"help": "HDFS path to MNIST labels in parallelized format",
          "default": _project_path('/mnist/train/labels')}),
        (("-m", "--model"),
         {"help": "HDFS path to save/load model during train/test",
          "default": "mnist_model"}),
        (("-n", "--cluster_size"),
         {"help": "number of nodes in the cluster (for Spark Standalone)",
          "type": int, "default": executor_count}),
        (("-o", "--output"),
         {"help": "HDFS path to save test/inference output",
          "default": "predictions"}),
        (("-r", "--readers"),
         {"help": "number of reader/enqueue threads", "type": int, "default": 1}),
        (("-s", "--steps"),
         {"help": "maximum number of steps", "type": int, "default": 1000}),
        (("-tb", "--tensorboard"),
         {"help": "launch tensorboard process", "action": "store_true"}),
        (("-X", "--mode"),
         {"help": "train|inference", "default": "train"}),
        (("-c", "--rdma"),
         {"help": "use rdma connection", "default": False}),
    )
    for flags, settings in option_table:
        arg_parser.add_argument(*flags, **settings)

    parsed = arg_parser.parse_args()
    print("args:", parsed)

    print("{0} ===== Start".format(datetime.now().isoformat()))
    tf_cluster = TFCluster.run(
        spark_context,
        mnist_fun,
        parsed,
        parsed.cluster_size,
        ps_count,
        parsed.tensorboard,
        TFCluster.InputMode.TENSORFLOW,
    )
    tf_cluster.shutdown()
    print("{0} ===== Stop".format(datetime.now().isoformat()))
def random_search(map_fun, boundary_dict, direction='max', samples=10, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Parallel Experiment*

    Run an Experiment contained in *map_fun* for a configured number of random samples controlled by the *samples* parameter.
    Each hyperparameter is contained in *boundary_dict* with the key corresponding to the name of the hyperparameter and
    a list containing two elements defining the lower and upper bound. The experiment must return a metric corresponding
    to how 'good' the given hyperparameter combination is.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>     import tensorflow
    >>>     # code for preprocessing, training and exporting model
    >>>     # mandatory return a value for the experiment which is registered in Experiments service
    >>>     return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.random_search(train_nn, boundary_dict, samples=14, direction='max')

    Args:
        :map_fun: The function to run
        :boundary_dict: dict containing hyperparameter name and corresponding boundaries, each experiment randomizes a value in the boundary range.
        :direction: If set to 'max' the highest value returned will correspond to the best solution, if set to 'min' the opposite is true
        :samples: the number of random samples to evaluate for each hyperparameter given the boundaries
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    Raises:
        :AssertionError: if *util.num_param_servers()* does not return 0
        :RuntimeError: if another experiment is already marked as running

    Any exception raised while the experiment is set up or executed is re-raised after
    *_exception_handler()* has been called.
    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    global app_id
    global experiment_json
    global elastic_id

    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    # Bound before the try block so that the finally clause can tell whether a
    # SparkContext was ever obtained. Without this, a failure in
    # util._find_spark() made the cleanup raise UnboundLocalError and hide the
    # real error.
    sc = None
    try:
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        r_search.run_id = r_search.run_id + 1

        versioned_path = util._version_resources(versioned_resources, r_search._get_logdir(app_id))

        # Clear the module-level JSON first so it does not keep an earlier
        # value if _populate_experiment raises.
        experiment_json = None
        experiment_json = util._populate_experiment(
            sc, name, 'experiment', 'random_search', r_search._get_logdir(app_id),
            json.dumps(boundary_dict), versioned_path, description)

        # NOTE(review): resources are versioned a second time here with the
        # same arguments as above - confirm whether both calls are needed.
        util._version_resources(versioned_resources, r_search._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        tensorboard_logdir, param, metric = r_search._launch(
            sc, map_fun, boundary_dict, samples, direction=direction, local_logdir=local_logdir)

        experiment_json = util._finalize_experiment(experiment_json, param, metric)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        return tensorboard_logdir
    except:
        _exception_handler()
        raise
    finally:
        # cleanup spark jobs
        elastic_id += 1
        running = False
        if sc is not None:
            sc.setJobGroup("", "")