def _test_serving(self, data_in, expected_data_out, wrapper=None):
    self.port = randint(5000, 9000)
    server_experimentid = 'test_serving_' + str(uuid.uuid4())
    with get_local_queue_lock():
        args = [
            'studio', 'run',
            '--force-git',
            '--verbose=debug',
            '--experiment=' + server_experimentid,
            '--config=' + self.get_config_path(),
            'studio::serve_main',
            '--port=' + str(self.port),
            '--host=localhost'
        ]
        if wrapper:
            args.append('--wrapper=' + wrapper)

        subprocess.Popen(args, cwd=os.path.dirname(__file__))

    # give the served model time to spin up before sending requests
    time.sleep(60)
    try:
        retval = requests.post(
            url='http://localhost:' + str(self.port),
            json=data_in)
        data_out = retval.json()
        assert data_out == expected_data_out
    finally:
        with model.get_db_provider(
                model.get_config(self.get_config_path())) as db:
            db.stop_experiment(server_experimentid)
            time.sleep(20)
            db.delete_experiment(server_experimentid)
def test_experiment_lifetime(self):
    my_path = os.path.dirname(os.path.realpath(__file__))

    logger = logs.getLogger('test_experiment_lifetime')
    logger.setLevel(10)

    config_name = os.path.join(my_path, 'test_config_http_client.yaml')
    key = 'test_experiment_lifetime' + str(uuid.uuid4())

    with model.get_db_provider(model.get_config(config_name)) as db:
        try:
            db.delete_experiment(key)
        except Exception:
            pass

        p = subprocess.Popen(['studio', 'run',
                              '--config=' + config_name,
                              '--experiment=' + key,
                              '--force-git',
                              '--verbose=debug',
                              '--lifetime=-10m',
                              'stop_experiment.py'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             cwd=my_path)

        pout, _ = p.communicate()

        if pout:
            logger.debug("studio run output: \n" + pout.decode())

        db.delete_experiment(key)
def getResultsWithTimeout(self, timeout=0):
    total_sleep_time = 0
    sleep_time = 1

    while True:
        with model.get_db_provider(self.config) as db:
            if self.resumable:
                experiment_keys = db.get_project_experiments(
                    self.project_name).keys()
            else:
                experiment_keys = self.submitted

            for key in experiment_keys:
                e = db.get_experiment(key)
                if e is not None and e.status == 'finished':
                    self.logger.debug(
                        'Experiment {} finished, getting results'
                        .format(e.key))
                    with open(db.get_artifact(e.artifacts['retval']),
                              'rb') as f:
                        data = pickle.load(f)

                    if not self.resumable:
                        self.submitted.remove(e.key)
                    else:
                        db.delete_experiment(e.key)

                    return (e.key, data)

        if timeout == 0 or \
                (timeout > 0 and total_sleep_time > timeout):
            return None

        time.sleep(sleep_time)
        total_sleep_time += sleep_time
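# A minimal usage sketch for getResultsWithTimeout above; the `executor`
# variable is illustrative, not part of the source:
#
#     result = executor.getResultsWithTimeout(timeout=600)
#     if result is None:
#         print('no experiment finished within the timeout')
#     else:
#         key, data = result  # key of the finished experiment and its
#                             # unpickled return value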
def get_store(self, config_name='test_config.yaml'):
    config_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        config_name)
    with open(config_file) as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    return model.get_db_provider(config).store
def __enter__(self):
    with model.get_db_provider(self.config):
        pass

    if self.wm:
        self.logger.debug('Spinning up cloud workers')
        if self.use_spot:
            self.wm.start_spot_workers(
                self.queue_name,
                self.bid,
                self.resources_needed,
                start_workers=self.num_workers,
                queue_upscaling=self.queue_upscaling,
                ssh_keypair=self.ssh_keypair,
                timeout=self.cloud_timeout)
        else:
            for i in range(self.num_workers):
                self.wm.start_worker(
                    self.queue_name,
                    self.resources_needed,
                    ssh_keypair=self.ssh_keypair,
                    timeout=self.cloud_timeout)
        self.p = None
    else:
        self.logger.debug('Starting local worker')
        self.p = subprocess.Popen([
            'studio-local-worker',
            '--verbose=%s' % self.config['verbose'],
            '--timeout=' + str(self.cloud_timeout)],
            close_fds=True)

    return self
def get_firebase_provider(self, config_name='test_config.yaml'):
    config_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        config_name)
    with open(config_file) as f:
        # specify a Loader explicitly: a bare yaml.load() is deprecated
        # and unsafe on untrusted input
        config = yaml.load(f, Loader=yaml.SafeLoader)

    return model.get_db_provider(config)
def _create_artifacts(
        self, client_code_file, args_file, workspace_new, files):
    artifacts = {
        'retval': {
            'mutable': True,
            'unpack': True
        },
        'clientscript': {
            'mutable': False,
            'local': client_code_file,
            'unpack': True
        },
        'args': {
            'mutable': False,
            'local': args_file,
            'unpack': True
        },
        'workspace': {
            'mutable': False,
            'local': workspace_new,
            'unpack': True
        }
    }

    for tag, name in six.iteritems(files):
        artifacts[tag] = {}
        url_schema = re.compile('^https{0,1}://')
        s3_schema = re.compile('^s3://')
        gcs_schema = re.compile('^gs://')
        studio_schema = re.compile(
            'studio://(?P<experiment>.+)/(?P<artifact>.+)')

        if url_schema.match(name):
            artifacts[tag]['url'] = name
            artifacts[tag]['unpack'] = False
        elif s3_schema.match(name) or gcs_schema.match(name):
            artifacts[tag]['qualified'] = name
            artifacts[tag]['unpack'] = False
        elif studio_schema.match(name):
            ext_experiment_key = studio_schema.match(
                name).group('experiment')
            ext_tag = studio_schema.match(name).group('artifact')
            with model.get_db_provider(self.config) as db:
                ext_experiment = db.get_experiment(ext_experiment_key)
            artifacts[tag]['key'] = \
                ext_experiment.artifacts[ext_tag]['key']
            artifacts[tag]['unpack'] = True
        else:
            artifacts[tag]['local'] = os.path.abspath(
                os.path.expanduser(name))
            artifacts[tag]['unpack'] = True

        artifacts[tag]['mutable'] = False

    return artifacts
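# Illustrative sketch of the scheme dispatch in _create_artifacts above;
# the `files` mapping below is hypothetical input, and the inline notes
# show which artifact fields each scheme produces:
#
#     files = {
#         'weights': 'https://example.com/weights.h5',  # -> 'url', unpack=False
#         'dataset': 's3://my-bucket/data.tar.gz',      # -> 'qualified', unpack=False
#         'prev': 'studio://other_exp/retval',          # -> 'key' resolved via db, unpack=True
#         'extra': '~/local/dir',                       # -> 'local' abspath, unpack=True
#     }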
def get_provider(self, config_name=None):
    config_name = config_name if config_name else \
        self.get_default_config_name()
    config_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        config_name)
    with open(config_file) as f:
        # specify a Loader explicitly: a bare yaml.load() is deprecated
        config = yaml.load(f, Loader=yaml.SafeLoader)

    return model.get_db_provider(config)
def getResultsWithTimeout(self, timeout=0):
    total_sleep_time = 0
    sleep_time = self.sleep_time

    assert self.resumable is False

    while True:
        with model.get_db_provider(self.config) as db:
            for key, submitted_time in six.iteritems(self.submitted):
                try:
                    e = db.get_experiment(key)
                    if e is not None:
                        retval_path = db.get_artifact(
                            e.artifacts['retval'])
                        if os.path.exists(retval_path) and \
                                os.path.getmtime(retval_path) > \
                                submitted_time:
                            with open(retval_path, 'rb') as f:
                                data = pickle.load(f)

                            del self.submitted[e.key]
                            return (e.key, data)
                except BaseException as e:
                    self.logger.debug(
                        "Getting result failed due to exception:")
                    self.logger.debug(e)

            '''
            if e is not None and e.status == 'finished':
                self.logger.debug(
                    'Experiment {} finished, getting results'
                    .format(e.key))
                with open(db.get_artifact(e.artifacts['retval']),
                          'rb') as f:
                    data = pickle.load(f)

                if not self.resumable:
                    self.submitted.remove(e.key)
                else:
                    db.delete_experiment(e.key)

                return (e.key, data)
            '''

        if timeout == 0 or \
                (timeout > 0 and total_sleep_time > timeout):
            return None

        if self.p is not None:
            assert self.p.poll() is None, \
                "Executor process died, no point in waiting for results"

        time.sleep(sleep_time)
        total_sleep_time += sleep_time
def get_db():
    global _config
    global _db_provider
    global _db_provider_timestamp

    if not _db_provider or \
            not _db_provider_timestamp or \
            time.time() - _db_provider_timestamp > DB_PROVIDER_EXPIRATION:
        _db_provider = model.get_db_provider(_config, blocking_auth=False)
        _db_provider_timestamp = time.time()

    return _db_provider
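# Sketch of the caching behavior of get_db() above, assuming
# DB_PROVIDER_EXPIRATION is the module-level expiration in seconds:
#
#     db1 = get_db()   # constructs a provider and caches it
#     db2 = get_db()   # returns the same cached provider
#     # once DB_PROVIDER_EXPIRATION seconds elapse, the next call
#     # constructs a fresh provider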
def main(args=sys.argv[1:]):
    parser = argparse.ArgumentParser(
        description='Studio WebUI server. \
                     Usage: studio \
                     <arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    # parser.add_argument('--guest',
    #                     help='Guest mode (does not require db credentials)',
    #                     action='store_true')
    parser.add_argument('--port',
                        help='port to run Flask server on',
                        type=int,
                        default=5000)
    parser.add_argument('--host',
                        help='host name.',
                        default='localhost')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)

    args = parser.parse_args(args)
    config = model.get_config()
    if args.config:
        with open(args.config) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

    if args.verbose:
        config['verbose'] = args.verbose

    # if args.guest:
    #     config['database']['guest'] = True

    global _config
    global _db_provider
    _config = config
    _db_provider = model.get_db_provider(_config)

    getlogger().setLevel(parse_verbosity(config.get('verbose')))

    global _save_auth_cookie
    _save_auth_cookie = True

    print('Starting Studio UI on port {0}'.format(args.port))
    app.run(host=args.host, port=args.port)
def _list(args, cli_args):
    with model.get_db_provider(cli_args.config) as db:
        if len(args) == 0:
            experiments = db.get_user_experiments()
        elif args[0] == 'project':
            assert len(args) == 2
            experiments = db.get_project_experiments(args[1])
        elif args[0] == 'users':
            assert len(args) == 1
            users = db.get_users()
            for u in users.keys():
                print(users[u].get('email'))
            return
        elif args[0] == 'user':
            assert len(args) == 2
            users = db.get_users()
            user_ids = [u for u in users
                        if users[u].get('email') == args[1]]
            assert len(user_ids) == 1, \
                'The user with email ' + args[1] + \
                ' not found!'
            experiments = db.get_user_experiments(user_ids[0])
        elif args[0] == 'all':
            assert len(args) == 1
            users = db.get_users()
            experiments = []
            for u in users:
                experiments += db.get_user_experiments(u)
        else:
            get_logger().critical('Unknown command ' + args[0])
            return

        if cli_args.short:
            for e in experiments:
                print(e)
            return

        experiments = [db.get_experiment(e) for e in experiments]
        experiments.sort(key=lambda e: -e.time_added)

        table = [['Time added', 'Key', 'Project', 'Status']]
        for e in experiments:
            table.append([
                time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(e.time_added)),
                e.key,
                e.project,
                e.status])

        print(AsciiTable(table).table)
def test_stop_experiment(self):
    my_path = os.path.dirname(os.path.realpath(__file__))

    logger = logs.getLogger('test_stop_experiment')
    logger.setLevel(10)

    config_name = os.path.join(my_path, 'test_config_http_client.yaml')
    key = 'test_stop_experiment' + str(uuid.uuid4())

    with model.get_db_provider(model.get_config(config_name)) as db:
        try:
            db.delete_experiment(key)
        except Exception:
            pass

        p = subprocess.Popen(['studio', 'run',
                              '--config=' + config_name,
                              '--experiment=' + key,
                              '--force-git',
                              '--verbose=debug',
                              'stop_experiment.py'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             cwd=my_path)

        # wait till experiment spins up
        experiment = None
        while experiment is None or experiment.status == 'waiting':
            time.sleep(1)
            try:
                experiment = db.get_experiment(key)
            except BaseException:
                pass

        logger.info('Stopping experiment')
        db.stop_experiment(key)
        pout, _ = p.communicate()

        if pout:
            logger.debug("studio run output: \n" + pout.decode())

        db.delete_experiment(key)
def __enter__(self):
    with model.get_db_provider(self.config):
        pass

    self.p = None
    if self.wm:
        self.logger.debug('Spinning up cloud workers')
        if self.use_spot:
            self.wm.start_spot_workers(
                self.queue_name,
                self.bid,
                self.resources_needed,
                start_workers=self.num_workers,
                queue_upscaling=self.queue_upscaling,
                ssh_keypair=self.ssh_keypair,
                timeout=self.cloud_timeout)
        else:
            for i in range(self.num_workers):
                self.wm.start_worker(
                    self.queue_name,
                    self.resources_needed,
                    ssh_keypair=self.ssh_keypair,
                    timeout=self.cloud_timeout)
    elif self.queue_name is None or self.queue_name == 'local':
        self.logger.debug('Starting local worker')
        self.p = subprocess.Popen([
            'studio-local-worker',
            '--verbose=%s' % self.config['verbose'],
            '--timeout=' + str(self.cloud_timeout)],
            close_fds=True)

    # Yet another case is when the queue name is specified, but
    # cloud is not - that means running on a separately
    # managed server that listens to the queue.
    #
    # The contract is that a queue_name starting with sqs or ec2
    # is an SQS queue; otherwise, it is a PubSub queue.

    return self
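# A sketch of the queue-name contract described in the closing comment
# above (illustrative only; the actual dispatch lives elsewhere in
# studioml):
#
#     if queue_name.startswith('sqs') or queue_name.startswith('ec2'):
#         ...  # treated as an Amazon SQS queue
#     else:
#         ...  # treated as a Google PubSub queue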
def stubtest_worker(testclass,
                    experiment_name,
                    runner_args,
                    config_name,
                    test_script,
                    expected_output,
                    script_args=[],
                    queue=LocalQueue(),
                    wait_for_experiment=True,
                    delete_when_done=True,
                    test_output=True):
    my_path = os.path.dirname(os.path.realpath(__file__))
    config_name = os.path.join(my_path, config_name)
    logger = logging.getLogger('stubtest_worker')
    logger.setLevel(10)

    queue.clean()

    with model.get_db_provider(model.get_config(config_name)) as db:
        try:
            db.delete_experiment(experiment_name)
        except Exception:
            pass

    p = subprocess.Popen(['studio', 'run'] + runner_args +
                         ['--config=' + config_name,
                          '--verbose=debug',
                          '--force-git',
                          '--experiment=' + experiment_name,
                          test_script] + script_args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         close_fds=True,
                         cwd=my_path)

    pout, _ = p.communicate()

    if pout:
        # pout is bytes under Python 3; decode before concatenating
        logger.debug("studio run output: \n" + pout.decode())

    db = model.get_db_provider(model.get_config(config_name))
    experiments = [e for e in db.get_user_experiments()
                   if e.startswith(experiment_name)]
    assert len(experiments) == 1

    experiment_name = experiments[0]

    try:
        # test saved arguments
        keybase = "/experiments/" + experiment_name
        saved_args = db._get(keybase + '/args')
        if saved_args is not None:
            testclass.assertTrue(len(saved_args) == len(script_args))
            for i in range(len(saved_args)):
                testclass.assertTrue(saved_args[i] == script_args[i])
            testclass.assertTrue(db._get(keybase + '/filename') ==
                                 test_script)
        else:
            testclass.assertTrue(script_args is None or
                                 len(script_args) == 0)

        experiment = db.get_experiment(experiment_name)

        if wait_for_experiment:
            while not experiment.status == 'finished':
                time.sleep(1)
                experiment = db.get_experiment(experiment_name)

        if test_output:
            with open(db.store.get_artifact(
                    experiment.artifacts['output']), 'r') as f:
                data = f.read()
                split_data = data.strip().split('\n')
                testclass.assertEquals(split_data[-1], expected_output)

        check_workspace(testclass, db, experiment_name)

        if delete_when_done:
            db.delete_experiment(experiment_name)

        return db

    except Exception as e:
        print("Exception {} raised during test".format(e))
        print("worker output: \n {}".format(pout))
        print("Exception trace:")
        print(traceback.format_exc())
        raise e
def _kill(args, cli_args):
    with model.get_db_provider(cli_args.config) as db:
        for e in args:
            get_logger().info('Deleting experiment ' + e)
            db.delete_experiment(e)
def get_db_provider(self):
    config = model.get_config('test_config_http_client.yaml')
    config['database']['serverUrl'] = 'http://localhost:' + str(self.port)
    return model.get_db_provider(config)
def _get_provider():
    config = _get_config()
    return model.get_db_provider(config)
def main(args=sys.argv[1:]):
    logger = logs.get_logger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not commited',
        action='store_true')
    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        type=int,
        default=None)
    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        type=int,
        default=None)
    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--gpuMem',
        help='Amount of GPU RAM needed to run the experiment',
        default=None)
    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)
    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)
    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')
    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
             'It will be captured once before the experiment is run',
        default=[], action='append')
    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')
    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)
    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers a scalar value in tensorboard log ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)
    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with a one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create ' +
             'experiments and place them in the queue.',
        default=[], action='append')
    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        type=int,
        default=None)
    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')
    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)
    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
             'The name of the optimizer must either be in ' +
             'studio/optimizer_plugins ' +
             'directory or the path to the optimizer source file ' +
             'must be supplied. ',
        default='grid')
    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all. " +
             "Default value is %(default)d",
        type=int,
        default=300)
    parser.add_argument(
        '--user-startup-script',
        help='Path of script to run immediately ' +
             'before running the remote worker',
        default=None)
    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
             'for debugging pull requests. Default is current',
        default=None)
    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
             'should be killed no matter what.). Examples of values ' +
             'might include 5h, 48h2m10s',
        default=None)
    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
             'experiment loses relevance and should not be started)' +
             ' Examples include 240h30m10s',
        default=None)
    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
             'Assumes that container has all dependencies installed',
        default=None)
    parser.add_argument(
        '--port',
        help='Ports to open on a cloud instance',
        default=[], action='append')

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [i for i, arg in enumerate(args)
                      if arg.endswith('.py') or '::' in arg]

    rerun = False
    if len(py_suffix_args) < 1:
        print('None of the arguments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], \
            args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        runner_args.capture_once.append(
            runner_args.container + ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = _parse_hardware(runner_args,
                                       config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    # Set up default artifacts:
    # note that their "local" paths will be updated
    # on Experiment creation,
    # but they must have "local" field defined
    # to have storage credentials set up properly.
    artifacts = {
        'workspace': {
            'mutable': False,
            'local': os.getcwd(),
            'unpack': True
        },
        'modeldir': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'retval': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'output': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'tb': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        '_metrics': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        '_metadata': {
            'mutable': True,
            'local': '',
            'unpack': True
        }
    }

    artifacts.update(_parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(
        _parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(_parse_external_artifacts(runner_args.reuse, db))

    logger.debug("Task artifacts: %s", repr(artifacts))
    storage_creds = config.get('storage', {}).get(KEY_CREDENTIALS, None)
    _setup_artifacts_creds(artifacts, storage_creds)

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    queueLifetime = None

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            experiments = _add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed,
                logger)

            queue = model.get_queue(
                queue_name=runner_args.queue,
                cloud=runner_args.cloud,
                config=config,
                close_after=queueLifetime,
                logger=logger,
                verbose=verbose)

            queue_name = submit_experiments(
                experiments,
                config=config,
                logger=logger,
                queue=queue)

            spin_up_workers(
                runner_args,
                config,
                resources_needed,
                queue_name=queue_name,
                verbose=verbose)
        else:
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(opt_module, "Optimizer")(
                hyperparams,
                config['optimizer'],
                logger)

            workers_started = False
            queue_name = runner_args.queue
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = _add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                queue = model.get_queue(
                    queue_name=queue_name,
                    cloud=runner_args.cloud,
                    config=config,
                    close_after=queueLifetime,
                    logger=logger,
                    verbose=verbose)

                queue_name = submit_experiments(
                    experiments,
                    config=config,
                    logger=logger,
                    queue=queue)

                if not workers_started:
                    spin_up_workers(
                        runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    util.check_for_kb_interrupt()
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    util.check_for_kb_interrupt()
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

                experiments = [experiment]
        else:
            experiments = [create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=runner_args.experiment,
                project=runner_args.project,
                artifacts=artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration,
                dependency_policy=StudioDependencyPolicy())]

        queue = model.get_queue(
            queue_name=runner_args.queue,
            cloud=runner_args.cloud,
            config=config,
            close_after=queueLifetime,
            logger=logger,
            verbose=verbose)

        queue_name = submit_experiments(
            experiments,
            config=config,
            logger=logger,
            queue=queue)

        spin_up_workers(
            runner_args,
            config,
            resources_needed,
            queue_name=queue_name,
            verbose=verbose)

    return
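# Example invocations of the runner above, assembled from its argparse
# help strings (train.py is a placeholder script name, and the bid and
# resource values are illustrative):
#
#     studio run --experiment=my_exp --force-git train.py --lr 0.01
#     studio run --hyperparam=learning_rate:0.01:0.1:l10 train.py
#     studio run --cloud=ec2spot --bid=50% --gpus=1 train.py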
def get_experiment_fitnesses(experiments, optimizer, config, logger):
    with model.get_db_provider() as db:
        progbar = Progbar(len(experiments), interval=0.0)
        logger.info("Waiting for fitnesses from %s experiments" %
                    len(experiments))

        bad_line_dicts = [dict() for x in range(len(experiments))]
        has_result = [False for i in range(len(experiments))]
        fitnesses = [0.0 for i in range(len(experiments))]
        behaviors = [None for i in range(len(experiments))]

        term_criterion = config['optimizer']['termination_criterion']
        skip_gen_thres = term_criterion['skip_gen_thres']
        skip_gen_timeout = term_criterion['skip_gen_timeout']
        result_timestamp = time.time()

        while sum(has_result) < len(experiments):
            for i, experiment in enumerate(experiments):
                if float(sum(has_result)) / len(experiments) >= \
                        skip_gen_thres and \
                        time.time() - result_timestamp > skip_gen_timeout:
                    logger.warn(
                        "Skipping to next gen with %s of solutions "
                        "evaluated" %
                        (float(sum(has_result)) / len(experiments)))
                    has_result = [True] * len(experiments)
                    break

                if has_result[i]:
                    continue

                returned_experiment = db.get_experiment(
                    experiment.key, getinfo=True)
                output = db._get_experiment_logtail(returned_experiment)
                if output is None:
                    continue

                for j, line in enumerate(output):
                    if line.startswith(
                            "Traceback (most recent call last):") and \
                            j not in bad_line_dicts[i]:
                        logger.warn(
                            "Experiment %s: error discovered in output" %
                            returned_experiment.key)
                        logger.warn("".join(output[j:]))
                        bad_line_dicts[i][j] = True

                    if line.startswith("Behavior") or \
                            line.startswith("behavior"):
                        try:
                            behavior = eval(line.rstrip().split(':')[1])
                            if isinstance(behavior, np.ndarray):
                                pass
                            elif isinstance(behavior, list):
                                behavior = np.array(behavior)
                            else:
                                raise
                        except BaseException:
                            util.check_for_kb_interrupt()
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or invalid'
                                    ' behavior' % returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            behaviors[i] = behavior

                    if line.startswith("Fitness") or \
                            line.startswith("fitness"):
                        try:
                            fitness = float(line.rstrip().split(':')[1])
                            # assert fitness >= 0.0
                        except BaseException:
                            util.check_for_kb_interrupt()
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or invalid'
                                    ' fitness' % returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            if fitness < 0.0:
                                logger.warn(
                                    'Experiment %s: returned fitness'
                                    ' is less than zero, setting it'
                                    ' to zero' % returned_experiment.key)
                                fitness = 0.0

                            fitnesses[i] = fitness
                            has_result[i] = True
                            progbar.add(1)
                            result_timestamp = time.time()
                            break

            time.sleep(config['sleep_time'])

        return fitnesses, behaviors
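# get_experiment_fitnesses above scans experiment log tails for lines of
# the form "fitness: <float>" and "behavior: <python list>". A user
# script would report them like this (variable names are illustrative):
#
#     print('fitness: {}'.format(validation_accuracy))
#     print('behavior: {}'.format([speed, accuracy, model_size]))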
def worker_loop(queue, parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):
    fetch_artifacts = True

    logger = logs.get_logger('worker_loop')

    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']

        parsed_args.config = config
        if verbose:
            config['verbose'] = verbose
        else:
            verbose = model.parse_verbosity(config.get('verbose', None))

        logger.setLevel(verbose)

        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(queue, parsed_args)
        with model.get_db_provider(config) as db:
            # experiment = experiment_from_dict(data_dict['experiment'])
            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - '
                        'indicates storage failure')
                return experiment

            experiment = retry(
                try_get_experiment,
                sleep_time=10,
                logger=logger)

            if config.get('experimentLifetime', None) and \
                    int(str2duration(config['experimentLifetime'])
                        .total_seconds()) + \
                    experiment.time_added < time.time():
                logger.info(
                    'Experiment expired (max lifetime of {0} was exceeded)'
                    .format(config.get('experimentLifetime', None)))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):
                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval',
                              minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver[0] == '3':
                        python = 'python3'

                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(
                            experiment.pythonenv, python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(
                                    pip_diff, python, logger) != 0:
                                logger.info(
                                    "Installation of all packages together"
                                    " failed, trying one package at a time")
                                for pkg in pip_diff:
                                    pip_install_packages(
                                        [pkg], python, logger)

                    for tag, item in experiment.artifacts.items():
                        art: Artifact = item
                        if fetch_artifacts or art.local_path is None:
                            get_only_newer: bool = True
                            if tag == 'workspace':
                                get_only_newer = False
                            if not art.is_mutable:
                                logger.info('Fetching artifact ' + tag)
                                art.local_path = retry(
                                    lambda: db.get_artifact(
                                        art, only_newer=get_only_newer),
                                    sleep_time=10,
                                    logger=logger)
                            else:
                                logger.info(
                                    'Skipping mutable artifact ' + tag)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due to lack of resources. Will retry')
                # Debounce failed requests we cannot service yet
                time.sleep(config.get('sleep_time', 5))

    logger.info("Queue in {0} is empty, quitting".format(
        fs_tracker.get_queue_directory()))

    return retval
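# Sketch of the queue message shape that worker_loop above expects after
# json.loads; the field names are taken from the accesses in the code,
# the values are illustrative:
#
#     {
#         "experiment": {"key": "my_experiment_1234"},
#         "config": {"verbose": "debug", "experimentLifetime": "24h"}
#     }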
def stubtest_worker(
        testclass,
        experiment_name,
        runner_args,
        config_name,
        test_script,
        expected_output,
        script_args=[],
        queue=LocalQueue(),
        wait_for_experiment=True,
        delete_when_done=True,
        test_output=True,
        test_workspace=True):
    my_path = os.path.dirname(os.path.realpath(__file__))
    config_name = os.path.join(my_path, config_name)
    logger = logs.getLogger('stubtest_worker')
    logger.setLevel(10)

    queue.clean()

    with model.get_db_provider(model.get_config(config_name)) as db:
        try:
            db.delete_experiment(experiment_name)
        except Exception:
            pass

    os.environ['PYTHONUNBUFFERED'] = 'True'
    p = subprocess.Popen(['studio', 'run'] + runner_args +
                         ['--config=' + config_name,
                          '--verbose=debug',
                          '--force-git',
                          '--experiment=' + experiment_name,
                          test_script] + script_args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         close_fds=True,
                         cwd=my_path)

    pout, _ = p.communicate()

    if pout:
        logger.debug("studio run output: \n" + sixdecode(pout))

    splitpout = sixdecode(pout).split('\n')
    experiments = [line.split(' ')[-1] for line in splitpout
                   if line.startswith('studio run: submitted experiment')]
    logger.debug("added experiments: {}".format(experiments))

    db = model.get_db_provider(model.get_config(config_name))
    experiment_name = experiments[0]

    try:
        experiment = db.get_experiment(experiment_name)
        if wait_for_experiment:
            while not experiment or not experiment.status == 'finished':
                # poll at one-second intervals instead of spinning on
                # the database
                time.sleep(1)
                experiment = db.get_experiment(experiment_name)

        if test_output:
            with open(db.get_artifact(experiment.artifacts['output']),
                      'r') as f:
                data = f.read()
                split_data = data.strip().split('\n')
                print(data)
                testclass.assertEquals(split_data[-1], expected_output)

        if test_workspace:
            check_workspace(testclass, db, experiment_name)

        if delete_when_done:
            retry(lambda: db.delete_experiment(experiment_name),
                  sleep_time=10)

        return db

    except Exception as e:
        print("Exception {} raised during test".format(e))
        print("worker output: \n {}".format(pout))
        print("Exception trace:")
        print(traceback.format_exc())
        raise e
def run(self, experiment):
    if isinstance(experiment, six.string_types):
        experiment = self.db.get_experiment(experiment)
    elif not isinstance(experiment, Experiment):
        raise ValueError("Unknown type of experiment: " +
                         str(type(experiment)))

    self.logger.info("Experiment key: " + experiment.key)

    with model.get_db_provider(self.config) as db:
        db.start_experiment(experiment)

        """ Override env variables with those inside the queued message
        """
        env = dict(os.environ)
        if 'env' in self.config.keys():
            for k, v in six.iteritems(self.config['env']):
                if v is not None:
                    env[str(k)] = str(v)

        env['PYTHONUNBUFFERED'] = 'TRUE'

        fs_tracker.setup_experiment(env, experiment, clean=False)
        log_path = fs_tracker.get_artifact_cache('output', experiment.key)

        self.logger.debug('Child process environment:')
        self.logger.debug(str(env))

        sched = BackgroundScheduler()
        sched.start()

        with open(log_path, 'w') as output_file:
            python = 'python'
            if experiment.pythonver[0] == '3':
                python = 'python3'

            python = which(python)

            cmd = [python, experiment.filename] + experiment.args
            cwd = experiment.artifacts['workspace'].local_path

            container_artifact = experiment.artifacts.get('_singularity')
            if container_artifact:
                container = container_artifact.get('local')
                if not container:
                    container = container_artifact.get('qualified')

                cwd = fs_tracker.get_artifact_cache(
                    'workspace', experiment.key)

                for tag, art in six.iteritems(experiment.artifacts):
                    local_path = art.get('local')
                    if not art['mutable'] and os.path.exists(local_path):
                        os.symlink(
                            art['local'],
                            os.path.join(os.path.dirname(cwd), tag))

                if experiment.filename is not None:
                    cmd = ['singularity', 'exec', container] + cmd
                else:
                    cmd = ['singularity', 'run', container]

            self.logger.info('Running cmd: {0} in {1}'.format(cmd, cwd))

            p = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                env=env,
                cwd=cwd,
                text=True)

            def kill_subprocess():
                p.kill()

            def get_duration(tag: str):
                value = self.config.get(tag, '0m')
                return int(str2duration(value).total_seconds() / 60)

            def checkpoint():
                try:
                    db.checkpoint_experiment(experiment)
                except BaseException as e:
                    self.logger.info(e)
                    check_for_kb_interrupt()

            minutes = get_duration('saveWorkspaceFrequency')
            sched.add_job(checkpoint, 'interval', minutes=minutes)

            metrics_path = fs_tracker.get_artifact_cache(
                '_metrics', experiment.key)

            minutes = get_duration('saveMetricsFrequency')
            sched.add_job(lambda: save_metrics(metrics_path),
                          'interval', minutes=minutes)

            def kill_if_stopped():
                try:
                    db_expr = db.get_experiment(
                        experiment.key, getinfo=False)
                except BaseException:
                    check_for_kb_interrupt()
                    db_expr = None

                # Transient issues with getting experiment data might
                # result in a None value being returned; in that case
                # leave the experiment running, because we won't be able
                # to do anything else even if this experiment is stopped.
                # In any event, if the experiment runs too long, it will
                # exceed its allocated time and stop.
                if db_expr is not None:
                    if db_expr.status == 'stopped':
                        kill_subprocess()
                        return

                if experiment.max_duration is not None and \
                        time.time() > experiment.time_started + \
                        int(str2duration(experiment.max_duration)
                            .total_seconds()):
                    kill_subprocess()
                    return

                # If our task queue is signalled inactive during work
                # process execution, we need to drop the current
                # execution and exit.
                if not self.task_queue.is_active():
                    kill_subprocess()

            sched.add_job(kill_if_stopped, 'interval', seconds=10)

            while True:
                output = p.stdout.readline()
                if output == '' and p.poll() is not None:
                    break
                if output:
                    line_out = output.strip()
                    print(line_out)
                    # re-append the newline stripped above so the log
                    # file keeps one line per line of child output
                    output_file.write(line_out + '\n')

            try:
                p.wait()
            finally:
                save_metrics(metrics_path)
                sched.shutdown()
                db.checkpoint_experiment(experiment)
                db.finish_experiment(experiment)

            return p.returncode
def get_db_provider(self):
    config = model.get_config(self.client_config_file)
    config['database']['serverUrl'] = 'http://localhost:' + str(self.port)
    return model.get_db_provider(config)
def get_db_provider(self, config_name):
    config_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        config_name)
    return model.get_db_provider(model.get_config(config_file))
def _stop(args, cli_args):
    with model.get_db_provider(cli_args.config) as db:
        for e in args:
            get_logger().info('Stopping experiment ' + e)
            db.stop_experiment(e)
def studio_run(self, line, cell=None):
    script_text = []
    pickleable_ns = {}

    for varname, var in six.iteritems(self.shell.user_ns):
        if not varname.startswith('__'):
            if isinstance(var, ModuleType) and \
                    var.__name__ != 'studio.magics':
                script_text.append(
                    'import {} as {}'.format(var.__name__, varname))
            else:
                try:
                    pickle.dumps(var)
                    pickleable_ns[varname] = var
                except BaseException:
                    pass

    script_text.append(cell)
    script_text = '\n'.join(script_text)

    stub_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'run_magic.py.stub')

    with open(stub_path) as f:
        script_stub = f.read()

    script = script_stub.format(script=script_text)

    experiment_key = str(int(time.time())) + \
        "_jupyter_" + str(uuid.uuid4())

    print('Running studio with experiment key ' + experiment_key)

    config = model.get_config()
    if config['database']['type'] == 'http':
        print("Experiment progress can be viewed/shared at:")
        print("{}/experiment/{}".format(
            config['database']['serverUrl'], experiment_key))

    workspace_new = fs_tracker.get_artifact_cache(
        'workspace', experiment_key)

    rsync_cp('.', workspace_new)
    with open(os.path.join(workspace_new, '_script.py'), 'w') as f:
        f.write(script)

    ns_path = fs_tracker.get_artifact_cache('_ns', experiment_key)

    with gzip.open(ns_path, 'wb') as f:
        f.write(pickle.dumps(pickleable_ns))

    if any(line):
        runner_args = line.strip().split(' ')
    else:
        runner_args = []

    runner_args.append('--capture={}:_ns'.format(ns_path))
    runner_args.append('--capture-once=.:workspace')
    runner_args.append('--force-git')
    runner_args.append('--experiment=' + experiment_key)

    notebook_cwd = os.getcwd()
    os.chdir(workspace_new)
    print(runner_args + ['_script.py'])
    runner_main(runner_args + ['_script.py'])
    os.chdir(notebook_cwd)

    with model.get_db_provider() as db:
        while True:
            experiment = db.get_experiment(experiment_key)
            if experiment and experiment.status == 'finished':
                break
            time.sleep(10)

        new_ns_path = db.get_artifact(experiment.artifacts['_ns'])

        # the namespace artifact holds binary pickle data; open it in
        # 'rb' mode (a text-mode read would fail under Python 3)
        with open(new_ns_path, 'rb') as f:
            new_ns = pickle.loads(f.read())

    self.shell.user_ns.update(new_ns)
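# Hedged usage sketch: assuming the method above is registered as the
# %%studio_run cell magic, a notebook cell might look like this
# (variable names are illustrative):
#
#     %%studio_run --cpus=2
#     accuracy = evaluate_model(x_test, y_test)
#
# After the cell finishes, pickleable results such as `accuracy` are
# merged back into the notebook namespace.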