def __init__(self, queue_name, sub_name=None, verbose=10):
    assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ.keys()
    with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS']) as f:
        credentials = json.loads(f.read())

    project_name = credentials['project_id']

    self.logger = logging.getLogger(self.__class__.__name__)
    if verbose is not None:
        self.logger.setLevel(parse_verbosity(verbose))

    self.pubclient = pubsub.PublisherClient()
    self.subclient = pubsub.SubscriberClient()

    self.project = project_name
    self.topic_name = self.pubclient.topic_path(project_name, queue_name)
    self.logger.info("Topic name = {}".format(self.topic_name))
    try:
        self.pubtopic = self.pubclient.get_topic(self.topic_name)
    except BaseException:
        self.pubtopic = self.pubclient.create_topic(self.topic_name)
        self.logger.info('topic {} created'.format(self.topic_name))

    sub_name = sub_name if sub_name else queue_name + "_sub"
    self.logger.info("Queue name = {}".format(queue_name))
    self.logger.info("Subscription name = {}".format(sub_name))

    self.sub_name = self.subclient.subscription_path(project_name, sub_name)
    try:
        self.subclient.get_subscription(self.sub_name)
    except BaseException as e:
        self.logger.warning(e)
        self.subclient.create_subscription(self.sub_name, self.topic_name)
        self.logger.info('subscription {} created'.format(sub_name))
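# A minimal usage sketch (not part of the original module): construct the
# queue and push one payload. Assumes GOOGLE_APPLICATION_CREDENTIALS points
# at a service-account JSON file and that PubsubQueue exposes enqueue() as
# the workers below rely on; the queue name here is hypothetical.
def _example_pubsub_roundtrip():
    import json
    q = PubsubQueue('studioml_demo_queue', verbose='info')
    q.enqueue(json.dumps({'experiment': {'key': 'demo'}, 'config': {}}))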
def submit_experiments(
        experiments,
        config,
        logger,
        cloud=None,
        queue_name=None,
        python_pkg=[]):
    num_experiments = len(experiments)
    verbose = model.parse_verbosity(config['verbose'])

    start_time = time.time()
    n_workers = min(multiprocessing.cpu_count() * 2, num_experiments)
    with closing(multiprocessing.Pool(n_workers, maxtasksperchild=20)) as p:
        experiments = p.imap_unordered(
            add_experiment,
            zip([config] * num_experiments,
                [python_pkg] * num_experiments,
                experiments),
            chunksize=1)
        p.close()
        p.join()

    # for e in experiments:
    #     logger.info("Added experiment " + e.key)

    logger.info("Added %s experiments in %s seconds" %
                (num_experiments, int(time.time() - start_time)))

    queue = get_queue(queue_name, cloud, verbose)
    for e in experiments:
        queue.enqueue(json.dumps({
            'experiment': e.__dict__,
            'config': config}))

    return queue.get_name()
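# The workers further below consume messages of exactly the shape enqueued
# above; a sketch of building one by hand (the helper name is mine, and the
# experiment is assumed to be a plain object whose __dict__ is serializable):
def _example_queue_message(experiment, config):
    import json
    return json.dumps({'experiment': experiment.__dict__, 'config': config})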
def __init__(self, args):
    self.config = args.config
    if args.guest:
        self.config['database']['guest'] = True

    self.logger = logs.getLogger('LocalExecutor')
    self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
    self.logger.debug("Config: ")
    self.logger.debug(self.config)
def main(args=sys.argv):
    logger = logging.getLogger('studio-remote-worker')
    parser = argparse.ArgumentParser(
        description='Studio remote worker. '
                    'Usage: studio-remote-worker')

    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument(
        '--single-run',
        help='quit after a single run (regardless of the state of the queue)',
        action='store_true')

    parser.add_argument('--queue', help='queue name', required=True)
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: '
             'debug, info, warn, error, crit '
             'or numerical value of logger levels.',
        default=None)
    parser.add_argument(
        '--timeout', '-t',
        help='Timeout after which remote worker stops listening (in seconds)',
        type=int,
        default=-1)

    parsed_args, script_args = parser.parse_known_args(args)
    verbose = model.parse_verbosity(parsed_args.verbose)
    logger.setLevel(verbose)

    if parsed_args.queue.startswith('ec2_') or \
            parsed_args.queue.startswith('sqs_'):
        queue = SQSQueue(parsed_args.queue, verbose=verbose)
    else:
        queue = PubsubQueue(parsed_args.queue, verbose=verbose)

    logger.info('Waiting for the work in the queue...')
    timeout_before = parsed_args.timeout
    timeout_after = timeout_before if timeout_before > 0 else 0

    wait_for_messages(queue, timeout_before, logger)

    logger.info('Starting working')
    worker_loop(queue, parsed_args,
                setup_pyenv=True,
                single_experiment=parsed_args.single_run,
                fetch_artifacts=True,
                timeout=timeout_after)
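# Example invocation (a sketch; the queue name is hypothetical). A queue name
# prefixed with 'sqs_' or 'ec2_' selects SQS; anything else selects Pub/Sub:
#
#   studio-remote-worker --queue sqs_my_queue --timeout 300 --verbose info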
def main(args=sys.argv[1:]):
    parser = argparse.ArgumentParser(
        description='Studio WebUI server. '
                    'Usage: studio <arguments>')

    parser.add_argument('--config', help='configuration file', default=None)
    # parser.add_argument('--guest',
    #                     help='Guest mode (does not require db credentials)',
    #                     action='store_true')
    parser.add_argument('--port',
                        help='port to run Flask server on',
                        type=int,
                        default=5000)
    parser.add_argument('--host',
                        help='host name.',
                        default='0.0.0.0')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: '
             'debug, info, warn, error, crit '
             'or numerical value of logger levels.',
        default=None)

    args = parser.parse_args(args)
    config = model.get_config()
    if args.config:
        with open(args.config) as f:
            # safe_load avoids executing arbitrary tags in the config file
            config = yaml.safe_load(f)

    if args.verbose:
        config['verbose'] = args.verbose

    # if args.guest:
    #     config['database']['guest'] = True

    global _config
    global _db_provider
    _config = config
    _db_provider = model.get_db_provider(_config, blocking_auth=False)

    getlogger().setLevel(model.parse_verbosity(config.get('verbose')))

    global _save_auth_cookie
    _save_auth_cookie = True

    print('Starting Studio UI on port {0}'.format(args.port))
    app.run(host=args.host, port=args.port)
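# Example invocation (a sketch; the config path is hypothetical):
#
#   studio --config ~/.studioml/config.yaml --port 8080 --verbose debug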
def __init__(self, name, verbose=10, receive_timeout=300, retry_time=10):
    assert boto3 is not None
    self._client = boto3.client('sqs')

    create_q_response = self._client.create_queue(QueueName=name)
    self._queue_url = create_q_response['QueueUrl']

    self.logger = logs.getLogger('SQSQueue')
    if verbose is not None:
        self.logger.setLevel(parse_verbosity(verbose))
    self._name = name
    self.logger.info('Creating SQS queue with name ' + name)
    self.logger.info('Queue url = ' + self._queue_url)

    self._receive_timeout = receive_timeout
    self._retry_time = retry_time
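# A minimal usage sketch (not from the original source). boto3's create_queue
# is idempotent for an unchanged configuration, so constructing SQSQueue twice
# with the same name reuses the existing queue. The name is hypothetical.
def _example_sqs_queue():
    q = SQSQueue('sqs_demo_queue', verbose='info')
    return q._queue_url  # the URL boto3 assigned to the queue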
def __init__(self, args):
    self.config = model.get_config()
    if args.config:
        if isinstance(args.config, six.string_types):
            with open(args.config) as f:
                self.config.update(yaml.safe_load(f))
        else:
            self.config.update(args.config)

    if args.guest:
        self.config['database']['guest'] = True

    self.db = model.get_db_provider(self.config)
    self.logger = logging.getLogger('LocalExecutor')
    self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
    self.logger.debug("Config: ")
    self.logger.debug(self.config)
def main(args=sys.argv):
    parser = argparse.ArgumentParser(
        description='Studio worker. '
                    'Usage: studio-local-worker')

    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument('--timeout', default=0, type=int)
    parser.add_argument('--verbose', default='error')

    parsed_args, script_args = parser.parse_known_args(args)
    verbose = parse_verbosity(parsed_args.verbose)

    queue = LocalQueue(verbose=verbose)
    # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")
    # wait_for_messages(queue, parsed_args.timeout)
    returncode = worker_loop(queue, parsed_args, timeout=parsed_args.timeout)
    sys.exit(returncode)
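# Example invocation (a sketch): run against the local file-system queue,
# quitting once it drains. The config path is hypothetical.
#
#   studio-local-worker --config ~/.studioml/config.yaml --verbose info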
def main(args=sys.argv[1:]):
    logger = logs.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. '
                    'Usage: studio run <runner_arguments> '
                    'script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, '
             'random uuid will be generated',
        default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment '
             'even if changes are not committed',
        action='store_true')
    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        type=int,
        default=None)
    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment'
             ' (used to configure cloud instance)',
        type=int,
        default=None)
    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment'
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--gpuMem',
        help='Amount of GPU RAM needed to run the experiment',
        default=None)
    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment'
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)
    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)
    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage '
             'of on-demand instance price. Default is %(default)s',
        default='100%')
    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. '
             'It will be captured once before the experiment is run',
        default=[], action='append')
    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')
    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: '
             'debug, info, warn, error, crit '
             'or numerical value of logger levels.',
        default=None)
    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, '
             'and to base hyperparameter search on. '
             'Refers to a scalar value in the tensorboard log, '
             'example: --metric=val_loss[:final | :min | :max] to report '
             'validation loss at the end of the keras experiment '
             '(or smallest or largest throughout the experiment for :min '
             'and :max respectively)',
        default=None)
    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. '
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 '
             'will instantiate 10 versions of the script, replace '
             'learning_rate with one of the 10 values for learning '
             'rate that lie on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
        default=[], action='append')
    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        type=int,
        default=None)
    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment '
             'that is needed for the experiment. Only compatible with '
             'remote and cloud workers for now',
        default=[], action='append')
    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 '
             'instances directly',
        default=None)
    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use; the default is grid search. '
             'The name of the optimizer must either be in '
             'studio/optimizer_plugins '
             'directory or the path to the optimizer source file '
             'must be supplied.',
        default='grid')
    parser.add_argument(
        '--cloud-timeout',
        help='Time (in seconds) that cloud workers wait for messages. '
             'If negative, wait for the first message in the queue '
             'indefinitely and shut down as soon as no new messages are '
             'available. If zero, don\'t wait at all. '
             'Default value is %(default)d',
        type=int,
        default=300)
    parser.add_argument(
        '--user-startup-script',
        help='Path of script to run immediately '
             'before running the remote worker',
        default=None)
    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful '
             'for debugging pull requests. Default is current',
        default=None)
    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment '
             'should be killed no matter what). Examples of values '
             'might include 5h, 48h2m10s',
        default=None)
    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which '
             'experiment loses relevance and should not be started). '
             'Examples include 240h30m10s',
        default=None)
    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. '
             'Assumes that container has all dependencies installed',
        default=None)
    parser.add_argument(
        '--port',
        help='Ports to open on a cloud instance',
        default=[], action='append')

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [i for i, arg in enumerate(args)
                      if arg.endswith('.py') or '::' in arg]

    rerun = False
    if len(py_suffix_args) < 1:
        print('None of the arguments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none "
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        script_index = py_suffix_args[0]
        exec_filename, other_args = \
            args[script_index], args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.
    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        runner_args.capture_once.append(
            runner_args.container + ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warning('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed,
                logger)

            queue_name = submit_experiments(
                experiments,
                config=config,
                logger=logger,
                queue_name=runner_args.queue,
                cloud=runner_args.cloud)

            spin_up_workers(
                runner_args,
                config,
                resources_needed,
                queue_name=queue_name,
                verbose=verbose)
        else:
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(
                opt_module,
                "Optimizer")(hyperparams, config['optimizer'], logger)

            workers_started = False
            queue_name = runner_args.queue
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                queue_name = submit_experiments(
                    experiments,
                    config=config,
                    logger=logger,
                    cloud=runner_args.cloud,
                    queue_name=queue_name)

                if not workers_started:
                    spin_up_workers(
                        runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    logger.warning('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

            experiments = [experiment]
        else:
            experiments = [create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=runner_args.experiment,
                project=runner_args.project,
                artifacts=artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration)]

        queue_name = submit_experiments(
            experiments,
            config=config,
            logger=logger,
            cloud=runner_args.cloud,
            queue_name=runner_args.queue)

        spin_up_workers(
            runner_args,
            config,
            resources_needed,
            queue_name=queue_name,
            verbose=verbose)

    return
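# Example invocations of this entry point (sketches; script names, artifact
# tags and experiment keys are hypothetical):
#
#   # grid-search 10 log-spaced learning rates on EC2 spot instances,
#   # bidding 50% of the on-demand price
#   studio run --cloud=ec2spot --bid=50% \
#       --hyperparam=learning_rate:0.01:0.1:l10 train.py
#
#   # re-run a previously stored experiment by key (no .py argument,
#   # exactly one extra argument)
#   studio run my_experiment_key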
def submit_experiments(
        experiments,
        resources_needed,
        config,
        runner_args,
        logger,
        queue_name=None,
        launch_workers=True):
    num_experiments = len(experiments)
    verbose = model.parse_verbosity(config['verbose'])

    if runner_args.cloud is None:
        queue_name = 'local'
        if 'queue' in config.keys():
            queue_name = config['queue']
        if runner_args.queue:
            queue_name = runner_args.queue

    start_time = time.time()
    n_workers = min(multiprocessing.cpu_count() * 2, num_experiments)
    with closing(multiprocessing.Pool(n_workers, maxtasksperchild=20)) as p:
        experiments = p.imap_unordered(
            add_experiment,
            zip([config] * num_experiments,
                [runner_args.python_pkg] * num_experiments,
                experiments),
            chunksize=1)
        p.close()
        p.join()

    # for e in experiments:
    #     logger.info("Added experiment " + e.key)

    logger.info("Added %s experiments in %s seconds" %
                (num_experiments, int(time.time() - start_time)))

    if runner_args.cloud is not None:
        assert runner_args.cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']
        assert runner_args.queue is None, \
            '--queue argument cannot be provided with --cloud argument'
        auth_cookie = None if config['database'].get('guest') \
            else os.path.join(
                auth.TOKEN_DIR,
                config['database']['apiKey'])

        if runner_args.cloud in ['gcloud', 'gcspot']:
            if queue_name is None:
                queue_name = 'pubsub_' + str(uuid.uuid4())
            worker_manager = GCloudWorkerManager(
                runner_args=runner_args,
                auth_cookie=auth_cookie,
                zone=config['cloud']['zone'])
            queue = PubsubQueue(queue_name, verbose=verbose)

        if runner_args.cloud in ['ec2', 'ec2spot']:
            if queue_name is None:
                queue_name = 'sqs_' + str(uuid.uuid4())
            worker_manager = EC2WorkerManager(
                runner_args=runner_args,
                auth_cookie=auth_cookie)
            queue = SQSQueue(queue_name, verbose=verbose)

        if launch_workers:
            if runner_args.cloud == 'gcloud' or \
                    runner_args.cloud == 'ec2':
                num_workers = int(runner_args.num_workers) \
                    if runner_args.num_workers else 1
                for i in range(num_workers):
                    worker_manager.start_worker(
                        queue_name, resources_needed,
                        ssh_keypair=runner_args.ssh_keypair,
                        timeout=runner_args.cloud_timeout)
            else:
                assert runner_args.bid is not None
                if runner_args.num_workers:
                    start_workers = runner_args.num_workers
                    queue_upscaling = False
                else:
                    start_workers = 1
                    queue_upscaling = True

                worker_manager.start_spot_workers(
                    queue_name,
                    runner_args.bid,
                    resources_needed,
                    start_workers=start_workers,
                    queue_upscaling=queue_upscaling,
                    ssh_keypair=runner_args.ssh_keypair,
                    timeout=runner_args.cloud_timeout)
    else:
        if queue_name == 'local':
            queue = LocalQueue()
            queue.clean()
        elif queue_name.startswith('sqs_'):
            queue = SQSQueue(queue_name, verbose=verbose)
        else:
            queue = PubsubQueue(
                queue_name,
                config['database']['projectId'],
                verbose=verbose)

    for e in experiments:
        queue.enqueue(json.dumps({
            'experiment': e.__dict__,
            'config': config}))

    if queue_name == 'local':
        worker_args = ['studio-local-worker']
        if runner_args.config:
            worker_args += ['--config=' + runner_args.config]
        if runner_args.guest:
            worker_args += ['--guest']

        logger.info('worker args: {}'.format(worker_args))
        if not runner_args.num_workers or int(runner_args.num_workers) == 1:
            if 'STUDIOML_DUMMY_MODE' not in os.environ:
                local_worker.main(worker_args)
        else:
            raise NotImplementedError("Multiple local workers are not "
                                      "implemented yet")

    return queue_name
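# The else-branch above encodes the queue-naming convention used throughout:
# 'local' selects the file-system queue, an 'sqs_' prefix selects SQS, and
# anything else is treated as a Pub/Sub topic. A sketch of the same routing
# rule as a standalone helper (the helper name and project_id parameter are
# mine, not from the original source):
def _select_queue(queue_name, project_id, verbose):
    if queue_name == 'local':
        return LocalQueue()
    elif queue_name.startswith('sqs_'):
        return SQSQueue(queue_name, verbose=verbose)
    else:
        return PubsubQueue(queue_name, project_id, verbose=verbose)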
def submit_experiments(
        experiments, config, runner_args, logger, resources_needed):
    db = model.get_db_provider(config)
    verbose = model.parse_verbosity(config['verbose'])

    queue_name = 'local'
    if 'queue' in config.keys():
        queue_name = config['queue']
    if runner_args.queue:
        queue_name = runner_args.queue

    for e in experiments:
        e.pythonenv = add_packages(e.pythonenv, runner_args.python_pkg)
        db.add_experiment(e)
        logger.info("Added experiment " + e.key)

    if runner_args.cloud is not None:
        assert runner_args.cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']
        assert runner_args.queue is None, \
            '--queue argument cannot be provided with --cloud argument'
        auth_cookie = None if config['database'].get('guest') \
            else os.path.join(
                auth.token_dir,
                config['database']['apiKey'])

        if runner_args.cloud in ['gcloud', 'gcspot']:
            queue_name = 'pubsub_' + str(uuid.uuid4())
            queue = PubsubQueue(queue_name, verbose=verbose)
            worker_manager = GCloudWorkerManager(
                auth_cookie=auth_cookie,
                zone=config['cloud']['zone'])

        if runner_args.cloud in ['ec2', 'ec2spot']:
            queue_name = 'sqs_' + str(uuid.uuid4())
            queue = SQSQueue(queue_name, verbose=verbose)
            worker_manager = EC2WorkerManager(
                auth_cookie=auth_cookie)

        if runner_args.cloud == 'gcloud' or \
                runner_args.cloud == 'ec2':
            num_workers = int(runner_args.num_workers) \
                if runner_args.num_workers else 1
            for i in range(num_workers):
                worker_manager.start_worker(
                    queue_name, resources_needed,
                    ssh_keypair=runner_args.ssh_keypair,
                    timeout=runner_args.cloud_timeout)
        else:
            assert runner_args.bid is not None
            if runner_args.num_workers:
                start_workers = runner_args.num_workers
                queue_upscaling = False
            else:
                start_workers = 1
                queue_upscaling = True

            worker_manager.start_spot_workers(
                queue_name,
                runner_args.bid,
                resources_needed,
                start_workers=start_workers,
                queue_upscaling=queue_upscaling,
                ssh_keypair=runner_args.ssh_keypair,
                timeout=runner_args.cloud_timeout)
    else:
        if queue_name == 'local':
            queue = LocalQueue()
            queue.clean()
        elif queue_name.startswith('sqs_'):
            queue = SQSQueue(queue_name, verbose=verbose)
        else:
            queue = PubsubQueue(
                queue_name,
                config['database']['projectId'],
                verbose=verbose)

    for e in experiments:
        queue.enqueue(json.dumps({
            'experiment': e.__dict__,
            'config': config}))

    if queue_name == 'local':
        worker_args = ['studio-local-worker']
        if runner_args.config:
            worker_args += ['--config=' + runner_args.config]
        if runner_args.guest:
            worker_args += ['--guest']

        logger.info('worker args: {}'.format(worker_args))
        if not runner_args.num_workers or int(runner_args.num_workers) == 1:
            local_worker.main(worker_args)
        else:
            raise NotImplementedError("Multiple local workers are not "
                                      "implemented yet")
    return
def main(args=sys.argv):
    logger = logging.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. '
                    'Usage: studio run <runner_arguments> '
                    'script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, '
             'random uuid will be generated',
        default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment '
             'even if changes are not committed',
        action='store_true')
    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        default=None)
    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment'
             ' (used to configure cloud instance)',
        default=None)
    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment'
             ' (used to configure cloud instance)',
        default=None)
    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment'
             ' (used to configure cloud instance)',
        default=None)
    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)
    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, ec2 or ec2spot',
        default=None)
    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage '
             'of on-demand instance price. Default is %(default)s',
        default='100%')
    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. '
             'It will be captured once before the experiment is run',
        default=[], action='append')
    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')
    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: '
             'debug, info, warn, error, crit '
             'or numerical value of logger levels.',
        default=None)
    parser.add_argument(
        '--metric', '-m',
        help='Metric to show in the summary of the experiment, '
             'and to base hyperparameter search on. '
             'Refers to a scalar value in the tensorboard log, '
             'example: --metric=val_loss[:final | :min | :max] to report '
             'validation loss at the end of the keras experiment '
             '(or smallest or largest throughout the experiment for :min '
             'and :max respectively)',
        default=None)
    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. '
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 '
             'will instantiate 10 versions of the script, replace '
             'learning_rate with one of the 10 values for learning '
             'rate that lie on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
        default=[], action='append')
    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        default=None)
    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment '
             'that is needed for the experiment. Only compatible with '
             'remote and cloud workers for now',
        default=[], action='append')
    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 '
             'instances directly',
        default=None)
    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use; the default is grid search. '
             'The name of the optimizer must either be in '
             'studio/optimizer_plugins '
             'directory or the path to the optimizer source file '
             'must be supplied.',
        default='grid')
    parser.add_argument(
        '--cloud-timeout',
        help='Time (in seconds) that cloud workers wait for messages. '
             'If negative, wait for the first message in the queue '
             'indefinitely and shut down as soon as no new messages are '
             'available. If zero, don\'t wait at all. '
             'Default value is %(default)d',
        type=int,
        default=300)

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')]
    if len(py_suffix_args) < 1:
        print('At least one argument should be a python script '
              '(end with *.py)')
        parser.print_help()
        exit()

    script_index = py_suffix_args[0]
    runner_args = parser.parse_args(args[1:script_index])
    exec_filename, other_args = args[script_index], args[script_index + 1:]
    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)
    if runner_args.verbose:
        config['verbose'] = runner_args.verbose
    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    db = model.get_db_provider(config)

    if git_util.is_git() and not git_util.is_clean():
        logger.warning('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['cloud'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed)
            submit_experiments(
                experiments, config, runner_args, logger, resources_needed)
        else:
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            # logger.info('optimizer path: %s' % opt_modulepath)
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            hyperparam_values, log_scale_dict = get_hyperparam_values(
                runner_args)
            optimizer = getattr(opt_module, "Optimizer")(
                hyperparam_values, log_scale_dict)

            while not optimizer.stop():
                hyperparam_tuples = optimizer.ask()

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)
                submit_experiments(
                    experiments,
                    config,
                    runner_args,
                    logger,
                    resources_needed)

                fitnesses = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)
                optimizer.tell(hyperparam_tuples, fitnesses)
                # if config['verbose'] == "info" or \
                #         config['verbose'] == "debug":
                try:
                    optimizer.disp()
                except BaseException:
                    logger.warning('Optimizer has no disp() method')
    else:
        experiments = [model.create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=runner_args.experiment,
            project=runner_args.project,
            artifacts=artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric)]
        submit_experiments(
            experiments, config, runner_args, logger, resources_needed)

    db = None
    return
def worker_loop(queue, parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):
    fetch_artifacts = True

    logger = logs.getLogger('worker_loop')

    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']
        parsed_args.config = config
        if verbose:
            config['verbose'] = verbose
        else:
            verbose = model.parse_verbosity(config.get('verbose'))
        logger.setLevel(verbose)

        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(parsed_args)
        with model.get_db_provider(config) as db:

            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - indicates storage failure')
                return experiment

            experiment = retry(
                try_get_experiment, sleep_time=10, logger=logger)

            if config.get('experimentLifetime') and \
                    int(str2duration(config['experimentLifetime'])
                        .total_seconds()) + experiment.time_added < \
                    time.time():
                logger.info(
                    'Experiment expired (max lifetime of {} was exceeded)'
                    .format(config.get('experimentLifetime')))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):
                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver == 3:
                        python = 'python3'

                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(
                            experiment.pythonenv, python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(
                                    pip_diff, python, logger) != 0:
                                logger.info(
                                    "Installation of all packages together "
                                    "failed, trying one package at a time")
                                for pkg in pip_diff:
                                    pip_install_packages(
                                        [pkg], python, logger)

                    for tag, art in six.iteritems(experiment.artifacts):
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                art['local'] = retry(
                                    lambda: db.get_artifact(
                                        art, only_newer=False),
                                    sleep_time=10,
                                    logger=logger)
                            else:
                                art['local'] = retry(
                                    lambda: db.get_artifact(art),
                                    sleep_time=10,
                                    logger=logger)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due to lack of resources. Will retry')
                time.sleep(config['sleep_time'])

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))
    return retval
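# The at-least-once delivery contract used above, in miniature (a sketch,
# not part of the original source): dequeue without acknowledging, keep the
# message leased via hold() while working, and acknowledge only when done,
# so that a crashed worker lets the message reappear for another worker.
def _example_consume_one(queue, timeout=60):
    msg = queue.dequeue(acknowledge=False, timeout=timeout)
    if msg:
        payload, ack_key = msg
        try:
            pass  # ... process payload, calling queue.hold(ack_key, ...)
                  # periodically for long-running work ...
        finally:
            queue.acknowledge(ack_key)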
def worker_loop(queue, parsed_args,
                setup_pyenv=False,
                single_experiment=False,
                fetch_artifacts=False,
                timeout=0):
    logger = logging.getLogger('worker_loop')

    hold_period = 4
    while queue.has_next():
        first_exp, ack_key = queue.dequeue(acknowledge=False)

        experiment_key = json.loads(first_exp)['experiment']['key']
        config = json.loads(first_exp)['config']
        parsed_args.config = config
        verbose = model.parse_verbosity(config.get('verbose'))
        logger.setLevel(verbose)

        logger.debug(
            'Received experiment {} with config {} from the queue'.format(
                experiment_key, config))

        executor = LocalExecutor(parsed_args)
        experiment = executor.db.get_experiment(experiment_key)

        if allocate_resources(experiment, config, verbose=verbose):
            def hold_job():
                queue.hold(ack_key, hold_period)

            hold_job()
            sched = BackgroundScheduler()
            sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
            sched.start()

            try:
                if setup_pyenv:
                    logger.info('Setting up python packages for experiment')
                    pipp = subprocess.Popen(
                        ['pip', 'install'] + experiment.pythonenv,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT)
                    pipout, _ = pipp.communicate()
                    logger.info("pip output: \n" + pipout)

                for tag, art in six.iteritems(experiment.artifacts):
                    if fetch_artifacts or 'local' not in art.keys():
                        logger.info('Fetching artifact ' + tag)
                        if tag == 'workspace':
                            art['local'] = executor.db.store.get_artifact(
                                art, only_newer=False)
                        else:
                            art['local'] = \
                                executor.db.store.get_artifact(art)

                executor.run(experiment)
            finally:
                sched.shutdown()
                queue.acknowledge(ack_key)

            if single_experiment:
                logger.info('single_experiment is True, quitting')
                return
        else:
            logger.info('Cannot run experiment ' + experiment.key +
                        ' due to lack of resources. Will retry')
            time.sleep(config['sleep_time'])
            wait_for_messages(queue, timeout, logger)

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))