def _update_user(self):
    if not os.path.exists(self.token_file):
        # Refresh tokens don't expire, so once obtained
        # they can be used indefinitely.
        if self.use_email_auth:
            self.sign_in_with_email()
            self.expired = False
        else:
            self.expired = True
    else:
        # If the json file fails to load, retry a few times.
        counter = 0
        user = None
        while user is None and counter < MAX_NUM_RETRIES:
            try:
                with open(self.token_file) as f:
                    user = json.loads(f.read())
            except BaseException as e:
                check_for_kb_interrupt()
                self.logger.info(e)
                time.sleep(SLEEP_TIME)
                counter += 1

        if user is None:
            return

        self.user = user
        if time.time() > self.user.get('expiration', 0):
            # Token has expired: try to refresh it, again with retries.
            counter = 0
            while counter < MAX_NUM_RETRIES:
                try:
                    self.refresh_token(user['email'], user['refreshToken'])
                    break
                except BaseException as e:
                    check_for_kb_interrupt()
                    self.logger.info(e)
                    time.sleep(SLEEP_TIME)
                    counter += 1
        else:
            self.expired = False

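
# A sketch (not part of the original source) of the token file that
# _update_user() expects: JSON containing at least the fields read
# above -- 'email', 'refreshToken', and 'expiration' as a unix
# timestamp. The path and values here are hypothetical.
def _example_write_token_file(token_file='/tmp/studioml_token.json'):
    import json
    import time
    token = {
        'email': 'user@example.com',
        'refreshToken': '<opaque-refresh-token>',
        'expiration': time.time() + 3600,  # refreshed once this is past
    }
    with open(token_file, 'w') as f:
        f.write(json.dumps(token))
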
def get_experiment(self, experiment, getinfo=True):
    # getinfo is unused here; kept for signature compatibility
    # with other providers. Accept either an experiment key or an
    # Experiment object.
    if isinstance(experiment, str):
        key = experiment
    else:
        key = experiment.key

    headers = self._get_headers()
    try:
        request = requests.post(
            self.url + '/api/get_experiment',
            headers=headers,
            data=json.dumps({"key": key}))
        self._raise_detailed_error(request)
        data = request.json()['experiment']
        return experiment_from_dict(data)
    except BaseException as e:
        check_for_kb_interrupt()
        self.logger.info('error getting experiment {}'.format(key))
        self.logger.info(e)
        return None

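
# A minimal usage sketch (not part of the original source), assuming
# this method lives on the database provider returned by
# model.get_db_provider(), as in get_experiment_fitnesses() below;
# the experiment key is hypothetical.
def _example_get_experiment():
    with model.get_db_provider() as db:
        experiment = db.get_experiment('a1b2c3d4')
        # get_experiment() returns None on failure, so check before use.
        if experiment is None:
            print('experiment not found or request failed')
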
def __init__(self, queue_name, sub_name=None, verbose=10):
    from google.cloud import pubsub

    assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ, \
        'GOOGLE_APPLICATION_CREDENTIALS must point to a credentials file'
    with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS']) as f:
        credentials = json.loads(f.read())

    project_name = credentials['project_id']

    self.logger = logs.get_logger(self.__class__.__name__)
    if verbose is not None:
        self.logger.setLevel(parse_verbosity(verbose))

    self.pubclient = pubsub.PublisherClient()
    self.subclient = pubsub.SubscriberClient()

    self.project = project_name
    self.topic_name = self.pubclient.topic_path(project_name, queue_name)
    self.logger.info("Topic name = {}".format(self.topic_name))
    try:
        self.pubtopic = self.pubclient.get_topic(self.topic_name)
    except BaseException:
        check_for_kb_interrupt()
        # The topic does not exist yet; create it.
        self.pubtopic = self.pubclient.create_topic(self.topic_name)
        self.logger.info('topic {} created'.format(self.topic_name))

    sub_name = sub_name if sub_name else queue_name + "_sub"
    self.logger.info("Queue name = {}".format(queue_name))
    self.logger.info("Subscription name = {}".format(sub_name))

    self.sub_name = self.subclient.subscription_path(
        project_name, sub_name)
    try:
        self.subclient.get_subscription(self.sub_name)
    except BaseException as e:
        check_for_kb_interrupt()
        self.logger.warn(e)
        # The subscription does not exist yet; create it.
        self.subclient.create_subscription(self.sub_name,
                                           self.topic_name,
                                           ack_deadline_seconds=20)
        self.logger.info('subscription {} created'.format(sub_name))

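
# A construction sketch (not part of the original source). The class
# name PubsubQueue and the queue/subscription names are assumptions;
# the only hard requirement visible above is that
# GOOGLE_APPLICATION_CREDENTIALS points at a service-account JSON file
# containing a 'project_id' field.
def _example_make_queue():
    os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS',
                          '/path/to/service-account.json')
    queue = PubsubQueue('studioml_queue', sub_name='studioml_queue_sub')
    return queue
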
def get_hash(self, local_path=None):
    if local_path is None:
        local_path = self.local_path

    # Nothing to hash: fall back to generating a fresh key.
    if local_path is None or not os.path.exists(local_path):
        return self._generate_key()

    # Hash the tarred artifact rather than the raw directory.
    tar_filename = tar_artifact(local_path, self.key,
                                self.get_compression(), self.logger)
    try:
        retval = util.sha256_checksum(tar_filename)
        os.remove(tar_filename)
        self.logger.debug('deleted local artifact file %s', tar_filename)
        return retval
    except BaseException as exc:
        util.check_for_kb_interrupt()
        self.logger.error('error generating a hash for %s: %s',
                          tar_filename, repr(exc))
        return None

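
# For reference, a streaming SHA-256 helper of the kind
# util.sha256_checksum() above is expected to be (a sketch under that
# assumption, not necessarily the actual implementation):
import hashlib

def sha256_checksum_sketch(filename, blocksize=65536):
    # Read the file in fixed-size blocks so large artifact tarballs
    # never have to fit in memory at once.
    sha = hashlib.sha256()
    with open(filename, 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            sha.update(block)
    return sha.hexdigest()
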
def checkpoint():
    try:
        db.checkpoint_experiment(experiment)
    except BaseException as e:
        # Re-raise keyboard interrupts before swallowing the error,
        # matching the pattern used elsewhere in this module.
        check_for_kb_interrupt()
        self.logger.info(e)

def get_experiment_fitnesses(experiments, optimizer, config, logger):
    """Block until each experiment reports a fitness (and an optional
    behavior) in its log tail, then return both lists."""
    with model.get_db_provider() as db:
        progbar = Progbar(len(experiments), interval=0.0)
        logger.info("Waiting for fitnesses from %s experiments" %
                    len(experiments))

        bad_line_dicts = [dict() for _ in range(len(experiments))]
        has_result = [False] * len(experiments)
        fitnesses = [0.0] * len(experiments)
        behaviors = [None] * len(experiments)

        term_criterion = config['optimizer']['termination_criterion']
        skip_gen_thres = term_criterion['skip_gen_thres']
        skip_gen_timeout = term_criterion['skip_gen_timeout']
        result_timestamp = time.time()

        while sum(has_result) < len(experiments):
            for i, experiment in enumerate(experiments):
                # If enough experiments have reported and the rest have
                # been quiet for too long, skip to the next generation.
                if float(sum(has_result)) / len(experiments) >= \
                        skip_gen_thres and \
                        time.time() - result_timestamp > skip_gen_timeout:
                    logger.warn(
                        "Skipping to next gen with %s of solutions"
                        " evaluated" %
                        (float(sum(has_result)) / len(experiments)))
                    has_result = [True] * len(experiments)
                    break

                if has_result[i]:
                    continue

                returned_experiment = db.get_experiment(experiment.key,
                                                        getinfo=True)
                output = db._get_experiment_logtail(returned_experiment)
                if output is None:
                    continue

                for j, line in enumerate(output):
                    if line.startswith(
                            "Traceback (most recent call last):") and \
                            j not in bad_line_dicts[i]:
                        logger.warn("Experiment %s: error"
                                    " discovered in output" %
                                    returned_experiment.key)
                        logger.warn("".join(output[j:]))
                        bad_line_dicts[i][j] = True

                    if line.startswith("Behavior") or \
                       line.startswith("behavior"):
                        try:
                            # NOTE: eval() executes text from the
                            # experiment log; the line is expected to
                            # look like "Behavior: [0.1, 0.2]".
                            behavior = eval(line.rstrip().split(':')[1])
                            if isinstance(behavior, np.ndarray):
                                pass
                            elif isinstance(behavior, list):
                                behavior = np.array(behavior)
                            else:
                                raise ValueError(
                                    'behavior is neither a list'
                                    ' nor an ndarray')
                        except BaseException:
                            util.check_for_kb_interrupt()
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or'
                                    ' invalid behavior' %
                                    returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            behaviors[i] = behavior

                    if line.startswith("Fitness") or \
                       line.startswith("fitness"):
                        try:
                            fitness = float(line.rstrip().split(':')[1])
                            # assert fitness >= 0.0
                        except BaseException:
                            util.check_for_kb_interrupt()
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or'
                                    ' invalid fitness' %
                                    returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            if fitness < 0.0:
                                logger.warn('Experiment %s: returned'
                                            ' fitness is less than zero,'
                                            ' setting it to zero' %
                                            returned_experiment.key)
                                fitness = 0.0

                            fitnesses[i] = fitness
                            has_result[i] = True
                            progbar.add(1)
                            result_timestamp = time.time()
                            break

            time.sleep(config['sleep_time'])
        return fitnesses, behaviors

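
# The parser above defines a simple line protocol: an experiment
# reports its result by printing "Fitness: <float>" and, optionally,
# "Behavior: <list literal>" to its output log. A minimal sketch of an
# experiment script honoring that protocol (values are made up):
def _example_report_result():
    fitness = 0.87
    behavior = [0.1, 0.4, 0.2]
    print('Fitness: {}'.format(fitness))
    print('Behavior: {}'.format(behavior))
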
def main(args=sys.argv[1:]):
    logger = logs.get_logger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. '
                    'Usage: studio run <runner_arguments> '
                    'script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project',
                        default=None)
    parser.add_argument('--experiment', '-e',
                        help='Name of the experiment. If none provided, ' +
                             'a random uuid will be generated',
                        default=None)
    parser.add_argument('--guest',
                        help='Guest mode (does not require db credentials)',
                        action='store_true')
    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not committed',
        action='store_true')
    parser.add_argument('--gpus',
                        help='Number of gpus needed to run the experiment',
                        type=int, default=None)
    parser.add_argument('--cpus',
                        help='Number of cpus needed to run the experiment' +
                             ' (used to configure cloud instance)',
                        type=int, default=None)
    parser.add_argument('--ram',
                        help='Amount of RAM needed to run the experiment' +
                             ' (used to configure cloud instance),' +
                             ' ex: 10G, 10GB',
                        default=None)
    parser.add_argument('--gpuMem',
                        help='Amount of GPU RAM needed to run' +
                             ' the experiment',
                        default=None)
    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument('--queue', '-q',
                        help='Name of the remote execution queue',
                        default=None)
    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2'
             ' or ec2spot',
        default=None)
    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')
    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
             'It will be captured once before the experiment is run',
        default=[], action='append')
    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')
    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')
    parser.add_argument('--verbose', '-v',
                        help='Verbosity level. Allowed values: ' +
                             'debug, info, warn, error, crit ' +
                             'or numerical value of logger levels.',
                        default=None)
    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers to a scalar value in the tensorboard log, ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss at the end of the keras experiment ' +
             '(or the smallest or largest value throughout the experiment ' +
             'for :min and :max respectively)',
        default=None)
    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with one of the 10 values for learning ' +
             'rate that lie on a log grid from 0.01 to 0.1, create ' +
             'experiments and place them in the queue.',
        default=[], action='append')
    parser.add_argument('--num-workers',
                        help='Number of local or cloud workers to spin up',
                        type=int, default=None)
    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for the experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')
    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)
    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use; the default is grid search. ' +
             'The name of the optimizer must either be in the ' +
             'studio/optimizer_plugins ' +
             'directory or the path to the optimizer source file ' +
             'must be supplied.',
        default='grid')
    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all. " +
             "Default value is %(default)d",
        type=int, default=300)
    parser.add_argument('--user-startup-script',
                        help='Path of script to run immediately ' +
                             'before running the remote worker',
                        default=None)
    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, ' +
             'useful for debugging pull requests. Default is current',
        default=None)
    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
             'should be killed no matter what). Examples of values ' +
             'might include 5h, 48h2m10s',
        default=None)
    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
             'experiment loses relevance and should not be started). ' +
             'Examples include 240h30m10s',
        default=None)
    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
             'Assumes that container has all dependencies installed',
        default=None)
    parser.add_argument('--port',
                        help='Ports to open on a cloud instance',
                        default=[], action='append')

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [
        i for i, arg in enumerate(args)
        if arg.endswith('.py') or '::' in arg
    ]

    rerun = False
    if len(py_suffix_args) < 1:
        print('None of the arguments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], \
            args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.
    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        runner_args.capture_once.append(
            runner_args.container + ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = _parse_hardware(runner_args,
                                       config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    # Set up default artifacts:
    # note that their "local" paths will be updated
    # on Experiment creation,
    # but they must have the "local" field defined
    # to have storage credentials set up properly.
    artifacts = {
        'workspace': {
            'mutable': False,
            'local': os.getcwd(),
            'unpack': True
        },
        'modeldir': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'retval': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'output': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'tb': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        '_metrics': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        '_metadata': {
            'mutable': True,
            'local': '',
            'unpack': True
        }
    }

    artifacts.update(_parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(_parse_artifacts(runner_args.capture_once,
                                      mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(_parse_external_artifacts(runner_args.reuse, db))

    logger.debug("Task artifacts: %s", repr(artifacts))
    storage_creds = config.get('storage', {}).get(KEY_CREDENTIALS, None)
    _setup_artifacts_creds(artifacts, storage_creds)

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    queueLifetime = None

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            experiments = _add_hyperparam_experiments(exec_filename,
                                                      other_args,
                                                      runner_args,
                                                      artifacts,
                                                      resources_needed,
                                                      logger)
            queue = model.get_queue(queue_name=runner_args.queue,
                                    cloud=runner_args.cloud,
                                    config=config,
                                    close_after=queueLifetime,
                                    logger=logger,
                                    verbose=verbose)
            queue_name = submit_experiments(experiments,
                                            config=config,
                                            logger=logger,
                                            queue=queue)
            spin_up_workers(runner_args,
                            config,
                            resources_needed,
                            queue_name=queue_name,
                            verbose=verbose)
        else:
            # Resolve the optimizer either as a built-in plugin or as a
            # path to an optimizer source file.
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)
            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(opt_module,
                                "Optimizer")(hyperparams,
                                             config['optimizer'],
                                             logger)

            workers_started = False
            queue_name = runner_args.queue
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = _add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                queue = model.get_queue(queue_name=queue_name,
                                        cloud=runner_args.cloud,
                                        config=config,
                                        close_after=queueLifetime,
                                        logger=logger,
                                        verbose=verbose)
                queue_name = submit_experiments(experiments,
                                                config=config,
                                                logger=logger,
                                                queue=queue)

                if not workers_started:
                    spin_up_workers(runner_args,
                                    config,
                                    resources_needed,
                                    queue_name=queue_name,
                                    verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    util.check_for_kb_interrupt()
                    # Fall back for optimizers whose tell() does not
                    # accept behaviors.
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    util.check_for_kb_interrupt()
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

                experiments = [experiment]
        else:
            experiments = [
                create_experiment(
                    filename=exec_filename,
                    args=other_args,
                    experiment_name=runner_args.experiment,
                    project=runner_args.project,
                    artifacts=artifacts,
                    resources_needed=resources_needed,
                    metric=runner_args.metric,
                    max_duration=runner_args.max_duration,
                    dependency_policy=StudioDependencyPolicy())
            ]

        queue = model.get_queue(queue_name=runner_args.queue,
                                cloud=runner_args.cloud,
                                config=config,
                                close_after=queueLifetime,
                                logger=logger,
                                verbose=verbose)
        queue_name = submit_experiments(experiments,
                                        config=config,
                                        logger=logger,
                                        queue=queue)
        spin_up_workers(runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)

    return

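
# Example invocations (not from the original source; script, queue and
# parameter names are hypothetical), matching the usage string and the
# flags defined above:
#
#   studio run train_mnist.py --lr=0.01
#   studio run --gpus=1 --cloud=ec2spot --queue=work_queue train_mnist.py
#   studio run --optimizer=grid \
#       --hyperparam=learning_rate:0.01:0.1:l10 train_mnist.py
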
def acknowledge(self, key):
    # Remove the local file backing the message; swallow errors other
    # than keyboard interrupts.
    try:
        os.remove(key)
    except BaseException:
        check_for_kb_interrupt()