def end_expired_trials(experiment_config: dict):
    """End the trials that get_expired_trials reports for this experiment.

    Every reported trial gets the same |time_ended| timestamp. Instances
    belonging to those trials that are still listed by gcloud are deleted, and
    the updated trials are then saved in bulk. Nothing is saved when there are
    no expired trials or when instance deletion fails.

    Args:
        experiment_config: Experiment configuration. The keys read here are
            'experiment', 'max_total_time' and 'cloud_compute_zone'.

    Returns:
        None on every path.
    """
    experiment = experiment_config['experiment']
    expired_trials = get_expired_trials(experiment,
                                        experiment_config['max_total_time'])

    # One timestamp is shared by all trials ended in this pass.
    end_time = datetime_now()
    instance_names = []
    for expired_trial in expired_trials:
        instance_name = experiment_utils.get_trial_instance_name(
            experiment, expired_trial.id)
        instance_names.append(instance_name)
        expired_trial.time_ended = end_time

    # Check the list we built rather than |expired_trials| itself: the latter
    # is truthy until it has been evaluated.
    if not instance_names:
        return

    # Only try to delete instances that gcloud still lists.
    live_instances = gcloud.list_instances()
    doomed_instances = [
        name for name in instance_names if name in live_instances
    ]
    if doomed_instances:
        deleted = gcloud.delete_instances(
            doomed_instances,
            experiment_config['cloud_compute_zone'],
            write_to_stdout=False)
        if not deleted:
            # We can't tell which instances were actually removed, so leave
            # the database untouched and let the next call retry.
            return

    db_utils.bulk_save(expired_trials)
def stop_experiment(experiment_name, experiment_config_filename):
    """Delete the cloud instances that belong to |experiment_name|.

    The config is read from |experiment_config_filename|. Targets are every
    listed instance whose name starts with 'r-' + |experiment_name|, plus the
    dispatcher instance when it is listed.

    Args:
        experiment_name: Name of the experiment to stop.
        experiment_config_filename: Path of the YAML experiment config.

    Returns:
        True if there was nothing to delete or deletion succeeded, False if
        deletion failed.

    Raises:
        NotImplementedError: If the config marks this as a local experiment.
    """
    config = yaml_utils.read(experiment_config_filename)
    if config.get('local_experiment', False):
        raise NotImplementedError(
            'Local experiment stop logic is not implemented.')

    all_instances = gcloud.list_instances()
    zone = config['cloud_compute_zone']

    # NOTE(review): this is a plain prefix match, so it would presumably also
    # match instances of another experiment whose name extends this one.
    # Confirm against the instance naming scheme.
    prefix = 'r-' + experiment_name
    targets = []
    for instance_name in all_instances:
        if instance_name.startswith(prefix):
            targets.append(instance_name)

    dispatcher = experiment_utils.get_dispatcher_instance_name(
        experiment_name)
    if dispatcher in all_instances:
        targets.append(dispatcher)
    else:
        logger.warning('Dispatcher instance not running, skip.')

    if not targets:
        logger.warning('No experiment instances found, no work to do.')
        return True

    if gcloud.delete_instances(targets, zone):
        logger.info('Successfully stopped experiment.')
        return True

    logger.error('Failed to stop experiment instances.')
    return False
def delete_instances(instances, experiment_config):
    """Delete the members of |instances| that gcloud currently lists.

    Args:
        instances: Instance names that are candidates for deletion.
        experiment_config: Experiment configuration; 'cloud_compute_zone' is
            read from it.

    Returns:
        Whatever gcloud.delete_instances returns for the filtered names.
    """
    live_instances = gcloud.list_instances()

    # Drop names that are not listed so we only request deletion of instances
    # that are present.
    to_delete = []
    for name in instances:
        if name in live_instances:
            to_delete.append(name)

    zone = experiment_config['cloud_compute_zone']
    return gcloud.delete_instances(to_delete, zone)