def create_experiments(hyperparam_tuples):
    """Create one experiment per hyperparameter tuple for an optimizer run.

    Closes over names from the enclosing scope (experiment_name_base,
    artifacts, workspace_orig, ignore_arg, exec_filename, other_args,
    project, resources_needed, runner_args, logger).

    :param hyperparam_tuples: iterable of dicts mapping parameter name to
        value (scalar or numpy array).
    :return: list of experiment objects built by model.create_experiment.
    """
    experiments = []
    for hyperparam_tuple in hyperparam_tuples:
        # Unique name: base + random suffix + timestamp; dots are replaced
        # because they would break downstream key paths.
        experiment_name = experiment_name_base
        experiment_name += "__opt__%s__%s" % (rand_string(32),
                                              int(time.time()))
        experiment_name = experiment_name.replace('.', '_')

        workspace_new = fs_tracker.get_artifact_cache(
            'workspace', experiment_name)

        current_artifacts = artifacts.copy()
        current_artifacts.update({
            'workspace': {
                'local': workspace_new,
                'mutable': True
            }
        })

        rsync_cp(workspace_orig, workspace_new, ignore_arg, logger)

        # BUG FIX: iteritems() is Python-2-only; items() works on both 2 and 3.
        for param_name, param_value in hyperparam_tuple.items():
            if isinstance(param_value, np.ndarray):
                # Array-valued params are shipped as .npy artifacts rather
                # than substituted into the script text.
                array_filepath = '/tmp/%s.npy' % rand_string(32)
                np.save(array_filepath, param_value)
                # BUG FIX: was a bare assert (stripped under -O); raise an
                # explicit error on an artifact-name clash instead.
                if param_name in current_artifacts:
                    raise ValueError(
                        'hyperparameter %s clashes with an artifact name' %
                        param_name)
                current_artifacts[param_name] = {'local': array_filepath,
                                                 'mutable': False}
            else:
                # BUG FIX: files were opened 'rb'/'wb', which makes re.sub
                # fail on Python 3 (str pattern against bytes); use text
                # mode, consistent with the grid-search variant.
                script_path = os.path.join(workspace_new, exec_filename)
                with open(script_path, 'r') as f:
                    script_text = f.read()

                # Replace bare occurrences of the parameter name that are
                # not on the left-hand side of an assignment on the line.
                script_text = re.sub(
                    '\\b' + param_name + '\\b(?=[^=]*\\n)',
                    str(param_value),
                    script_text)

                with open(script_path, 'w') as f:
                    f.write(script_text)

        experiments.append(model.create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=experiment_name,
            project=project,
            artifacts=current_artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric))
    return experiments
def create_experiments(hyperparam_tuples):
    """Create one experiment per hyperparameter tuple (grid search).

    Closes over names from the enclosing scope (experiment_name_base,
    artifacts, exec_filename, other_args, project, resources_needed,
    runner_args).

    :param hyperparam_tuples: iterable of dicts mapping parameter name to
        scalar value; values are substituted into the script text.
    :return: list of experiment objects built by model.create_experiment.
    """
    experiments = []
    experiment_names = {}
    for hyperparam_tuple in hyperparam_tuples:
        # Build a human-readable name from the parameter values.
        experiment_name = experiment_name_base
        # BUG FIX: iteritems() is Python-2-only; items() works on both 2 and 3.
        for param_name, param_value in hyperparam_tuple.items():
            experiment_name = experiment_name + '__' + \
                param_name + '__' + str(param_value)
        experiment_name = experiment_name.replace('.', '_')

        # If the name was already used, append a _v<counter> suffix
        # (the first duplicate becomes _v2).
        if experiment_name in experiment_names:
            new_experiment_name = experiment_name
            counter = 1
            while new_experiment_name in experiment_names:
                counter += 1
                new_experiment_name = "%s_v%s" % (experiment_name, counter)
            experiment_name = new_experiment_name
        experiment_names[experiment_name] = True

        # Default to the current directory when no workspace artifact given.
        workspace_orig = artifacts['workspace']['local'] \
            if 'workspace' in artifacts else '.'
        workspace_new = fs_tracker.get_artifact_cache(
            'workspace', experiment_name)

        current_artifacts = artifacts.copy()
        current_artifacts.update({
            'workspace': {
                'local': workspace_new,
                'mutable': True
            }
        })

        shutil.copytree(workspace_orig, workspace_new)

        with open(os.path.join(workspace_new, exec_filename), 'r') as f:
            script_text = f.read()

        # Substitute each hyperparameter value for bare occurrences of its
        # name that are not on the left-hand side of an assignment.
        for param_name, param_value in hyperparam_tuple.items():
            script_text = re.sub('\\b' + param_name + '\\b(?=[^=]*\\n)',
                                 str(param_value),
                                 script_text)

        with open(os.path.join(workspace_new, exec_filename), 'w') as f:
            f.write(script_text)

        experiments.append(model.create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=experiment_name,
            project=project,
            artifacts=current_artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric))
    return experiments
def main(args=sys.argv):
    """Entry point for ``studio run``.

    Parses runner arguments up to the first ``*.py`` argument, treats that
    argument as the experiment script and everything after it as script
    arguments, then either submits a single experiment or fans out a
    hyperparameter search (grid or plugin optimizer) to workers.

    :param args: full argv-style list; defaults to sys.argv.
    :raises SystemExit: if no ``*.py`` script argument is present, or when
        run from a dirty git repo without --force-git.
    """
    logger = logging.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        # BUG FIX: the description used backslash line continuations inside
        # the string literal, embedding runs of whitespace in help output.
        description='Studio runner. '
                    'Usage: studio run <runner_arguments> '
                    'script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not commited',
        action='store_true')
    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        type=int,
        default=None)
    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        type=int,
        default=None)
    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)
    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)
    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')
    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
             'It will be captured once before the experiment is run',
        default=[], action='append')
    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')
    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)
    parser.add_argument(
        '--metric', '-m',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers a scalar value in tensorboard log ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)
    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with a one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create ' +
             'experiments and place them in the queue.',
        default=[], action='append')
    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        type=int,
        default=None)
    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')
    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)
    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
             'The name of the optimizer must either be in ' +
             'studio/optimizer_plugins ' +
             'directory or the path to the optimizer source file ' +
             'must be supplied. ',
        default='grid')
    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all." +
             "Default value is %(default)d",
        type=int,
        default=300)
    parser.add_argument(
        '--user-startup-script',
        help='Path of script to run before running the remote worker',
        default=None)
    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
             'for debugging pull requests. Default is current',
        default=None)

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')]

    if len(py_suffix_args) < 1:
        print('At least one argument should be a python script ' +
              '(end with *.py)')
        parser.print_help()
        # BUG FIX: bare exit() reported success (code 0) on a usage error;
        # exit non-zero via sys.exit so callers/scripts can detect it.
        sys.exit(1)

    script_index = py_suffix_args[0]
    runner_args = parser.parse_args(args[1:script_index])
    exec_filename, other_args = args[script_index], args[script_index + 1:]

    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    # Refuse (unless forced) to run experiments from uncommitted changes,
    # since the experiment would not be reproducible from git state.
    if git_util.is_git() and not git_util.is_clean():
        logger.warning('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if any(runner_args.hyperparam):
        # BUG FIX: was `runner_args.optimizer is "grid"` — identity
        # comparison with a string literal only works by interning
        # accident; use equality.
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed,
                logger)
            queue_name = submit_experiments(
                experiments,
                config=config,
                logger=logger,
                cloud=runner_args.cloud)
            spin_up_workers(
                runner_args,
                config,
                resources_needed,
                queue_name=queue_name,
                verbose=verbose)
        else:
            # Resolve the optimizer either from the bundled plugin
            # directory or from a user-supplied path.
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)
            # BUG FIX: was a bare assert (stripped under -O).
            if not os.path.exists(opt_modulepath):
                raise ValueError(
                    'optimizer module not found: %s' % opt_modulepath)

            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(
                opt_module,
                "Optimizer")(
                hyperparams,
                config['optimizer'],
                logger)

            # Ask/tell loop: sample a population, run it, feed fitnesses
            # back to the optimizer until it decides to stop.
            workers_started = False
            queue_name = None
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)
                queue_name = submit_experiments(
                    experiments,
                    config=config,
                    logger=logger,
                    cloud=runner_args.cloud,
                    queue_name=queue_name)

                # Workers are spun up once and reused for later populations.
                if not workers_started:
                    spin_up_workers(
                        runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                # Some optimizers accept behaviors, some only fitnesses;
                # fall back to the two-argument form on failure.
                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    logger.warning('Optimizer has no disp() method')
    else:
        # No hyperparameter sweep: submit a single experiment as-is.
        experiments = [model.create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=runner_args.experiment,
            project=runner_args.project,
            artifacts=artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric)]
        queue_name = submit_experiments(
            experiments,
            config=config,
            logger=logger,
            cloud=runner_args.cloud)
        spin_up_workers(
            runner_args,
            config,
            resources_needed,
            queue_name=queue_name,
            verbose=verbose)

    return