def _setup(self):
    # initialize helper classes for interacting with GCE, GCS
    self.auth_http = instance.oauth_authorization(self.config)
    self.gce_helper = gce.Gce(self.auth_http, self.config,
                              project_id=self.config['compute']['project'])
    self.gcs_helper = gcs.Gcs(self.auth_http, self.config,
                              project_id=self.config['compute']['project'])
    self.instance_manager = instance.VMInstanceManager()

    self.instances = []
    self.launched_instances = {}
    self.user_terminated = False

    # get job id
    self.id = gen_job_id()
    self.job_name_root = 'job-%s' % (self.id)
    if self.config['update']:
        self.job_name_root = 'job-updater-%s' % (self.id)

    # setup instance completion api call
    service_not_ready = True
    while service_not_ready:
        try:
            service = discovery.build(
                'storage', self.config['compute']['api_version'],
                http=self.gce_helper.auth_http)
            self.bucket_req = service.objects().list(
                bucket=self.config['compute']['bucket'],
                prefix=self.job_name_root)
            service_not_ready = False
        except (ValueError, Exception) as e:
            logging.info('Connection failed. Retrying...')
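# Hypothetical helper (not part of the original class): a sketch of how the
# self.bucket_req listing request built in _setup() might be polled for finished
# jobs. The name poll_job_results and the resp.get('items', []) access are
# assumptions; the completion loop in launch_experiment() below reads the same
# response shape ('items' entries with a 'name' key).
def poll_job_results(bucket_req):
    """ Return names of result objects uploaded under the job prefix so far. """
    try:
        resp = bucket_req.execute()
    except Exception:
        logging.info('Connection failed. Will retry on the next poll.')
        return []
    # 'items' is absent from the response until at least one object matches the prefix
    return [item['name'] for item in resp.get('items', [])]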
def create_gce_helper(self):
    """ Create a gce helper class configured and authenticated by this object """
    # authorize local process
    auth_http = oauth_authorization(self.config, None)

    # helper for gce api calls
    gce_helper = gce.Gce(auth_http, self.config, self.project)
    return gce_helper
def get(self):
    # self.response.headers['Content-Type'] = 'text/plain'

    # Get the authorized Http object created by the decorator.
    # http = decorator.http()
    http = credentials.authorize(httplib2.Http(memcache))

    # Initialize gce.Gce.
    gce_helper = gce.Gce(http, project_id=settings['project'])

    # Create a Persistent Disk (PD), which is used as a boot disk.
    try:
        gce_helper.create_disk(DISK_NAME)
    except (gce.ApiError, gce.ApiOperationError, ValueError, Exception) as e:
        logging.error(INSERT_ERROR, {'name': DISK_NAME})
        logging.error(e)
        return

    # Start an instance with a local start-up script and boot disk.
    logging.info('Starting GCE instance')
    try:
        gce_helper.start_instance(
            INSTANCE_NAME,
            DISK_NAME,
            service_email=settings['compute']['service_email'],
            scopes=settings['compute']['scopes'],
            startup_script='startup.sh',
            metadata=[
                {'key': 'url', 'value': image_url},
                {'key': 'text', 'value': image_text},
                {'key': 'cs-bucket', 'value': bucket}])
    except gce.DiskDoesNotExistError as e:
        # Handle the missing-disk case before the broad API error clause so it is reachable.
        logging.error(INSERT_ERROR, {'name': INSTANCE_NAME})
        logging.error(e)
        return
    except (gce.ApiError, gce.ApiOperationError, ValueError, Exception) as e:
        # Delete the disk in case the instance fails to start.
        delete_resource(gce_helper.delete_disk, DISK_NAME)
        logging.error(INSERT_ERROR, {'name': INSTANCE_NAME})
        logging.error(e)
        return
def stage(self):
    """ Stages changes by creating update disks, if they don't already exist. """
    # setup vars
    compute_config = self.config_['compute']
    created_snapshots = False
    if not self.update_data_disks_:
        self.compute_update_data_disks()

    # authorize access to GCE api
    auth_http = instance.oauth_authorization(self.config_)
    gce_helper = gce.Gce(auth_http, self.config_,
                         project_id=compute_config['project'])

    # for all zones, create a disk snapshot if it doesn't already exist
    for zone, disk, update_disk_name in zip(compute_config['zones'],
                                            compute_config['data_disks'],
                                            self.update_data_disks_):
        # check for existence of the update disk (taken as a flag for the existence of an update node)
        disk_valid = gce_helper.get_disk(update_disk_name, zone)
        if not disk_valid:
            # create a snapshot of the current disk
            logging.info('Snapshotting disk %s' % (disk))
            snapshot_response = gce_helper.snapshot_disk(
                disk, compute_config['project'], zone)

            # create a disk from the snapshot
            logging.info('Creating update disk %s from snapshot %s' %
                         (update_disk_name, snapshot_response['snapshot_name']))
            gce_helper.create_disk(
                update_disk_name,
                zone=zone,
                size_gb=compute_config['disk_size_gb'],
                source_snapshot=snapshot_response['snapshot_name'])

            # delete the snapshot
            ss_del_response = gce_helper.delete_snapshot(
                snapshot_name=snapshot_response['snapshot_name'],
                project=compute_config['project'])
            created_snapshots = True
    return created_snapshots
def create_gce_helper(self):
    """ Creates a local gce helper to re-authorize with google """
    auth_http = oauth_authorization(self.config)
    gce_helper = gce.Gce(auth_http, self.config, self.project)
    return gce_helper
def push(self):
    """ Pushes changes by replacing original disks with update disks. Super critical section. """
    # setup vars
    compute_config = self.config_['compute']
    dt_now = dt.datetime.now()
    if not self.update_data_disks_:
        self.compute_update_data_disks()

    # authorize access to GCE api
    auth_http = instance.oauth_authorization(self.config_)
    gce_helper = gce.Gce(auth_http, self.config_,
                         project_id=compute_config['project'])

    for zone, disk, update_disk in zip(compute_config['zones'],
                                       compute_config['data_disks'],
                                       self.update_data_disks_):
        # check for update disk existence
        disk_response = gce_helper.get_disk(update_disk, zone)
        if not disk_response:
            logging.error('Update disk %s does not exist' % (update_disk))
            continue

        # generate backup disk filename
        backup_disk = '%s-backup-%s-%s-%s-%sh-%sm-%ss' % (
            disk, dt_now.month, dt_now.day, dt_now.year,
            dt_now.hour, dt_now.minute, dt_now.second)

        # snapshot the updated data disks
        snapshot_response = gce_helper.snapshot_disk(
            update_disk, compute_config['project'], zone)

        # delete previous disk and replace, if not in use
        disk_response = gce_helper.get_disk(disk, zone)
        if disk_response:
            if USERS_KEY not in disk_response.keys() or (
                    USERS_KEY in disk_response.keys() and
                    len(disk_response[USERS_KEY]) == 0):
                # create new disk from snapshot
                gce_helper.delete_disk(disk)
                gce_helper.create_disk(
                    disk,
                    zone=zone,
                    size_gb=compute_config['disk_size_gb'],
                    source_snapshot=snapshot_response['snapshot_name'])

                # delete update disk (don't delete if push can't be done now, otherwise changes won't be overwritten)
                gce_helper.delete_disk(update_disk)
            elif USERS_KEY in disk_response.keys() and len(disk_response[USERS_KEY]) > 0:
                # stage the push for a future time
                logging.info('Master disk in use. Staging backup disk for a future push')
                push_queue_filename = os.path.join(self.cache_dir_, PUSH_QUEUE_FILE)
                with open(push_queue_filename, 'a') as f:
                    f.write(backup_disk + '\n')
        else:
            logging.warning('Master disk was not found')

        # create backup disk from snapshot
        gce_helper.create_disk(
            backup_disk,
            zone=zone,
            size_gb=compute_config['disk_size_gb'],
            source_snapshot=snapshot_response['snapshot_name'])

        # delete the snapshot
        ss_del_response = gce_helper.delete_snapshot(
            snapshot_name=snapshot_response['snapshot_name'],
            project=compute_config['project'])
    return True
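# Hypothetical usage sketch (not from the original source): stage() and push() are
# methods of an updater object whose class is not shown here, so the update_manager
# parameter below is an assumption. The flow implied by the two methods is: create
# update disks from snapshots, apply changes on the update nodes, then swap the
# updated disks in for the master disks once they are no longer in use.
def run_disk_update(update_manager):
    """ Stage update disks, then attempt to push them over the master disks. """
    created = update_manager.stage()
    if created:
        logging.info('Update disks created; apply changes before pushing')
    # push() replaces the master disks immediately when they are idle, otherwise
    # it queues the backup disk for a future push
    update_manager.push()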
def main():
    """Perform OAuth 2 authorization, then start, list, and stop instance(s)."""
    logging.basicConfig(level=logging.INFO)

    # Load the settings for this sample app.
    settings = json.loads(open(gce.SETTINGS_FILE, 'r').read())

    # Perform OAuth 2.0 authorization flow.
    flow = flow_from_clientsecrets(settings['client_secrets'],
                                   scope=settings['compute_scope'])
    storage = Storage(settings['oauth_storage'])
    credentials = storage.get()

    # Authorize an instance of httplib2.Http.
    if credentials is None or credentials.invalid:
        credentials = run(flow, storage)
    http = httplib2.Http()
    auth_http = credentials.authorize(http)

    # Retrieve user input.
    image_url = raw_input('Enter the URL of an image [Defaults to %s]: ' % IMAGE_URL)
    if not image_url:
        image_url = IMAGE_URL
    image_text = raw_input('Enter text to add to the image [Defaults to "%s"]: ' % IMAGE_TEXT)
    if not image_text:
        image_text = IMAGE_TEXT
    bucket = raw_input('Enter a Cloud Storage bucket [Required]: ')
    if not bucket:
        logging.error('Cloud Storage bucket required.')
        return

    # Initialize gce.Gce.
    gce_helper = gce.Gce(auth_http, project_id=settings['project'])

    # Create a Persistent Disk (PD), which is used as a boot disk.
    try:
        gce_helper.create_disk(DISK_NAME)
    except (gce.ApiError, gce.ApiOperationError, ValueError, Exception) as e:
        logging.error(INSERT_ERROR, {'name': DISK_NAME})
        logging.error(e)
        return

    # Start an instance with a local start-up script and boot disk.
    logging.info('Starting GCE instance')
    try:
        gce_helper.start_instance(
            INSTANCE_NAME,
            DISK_NAME,
            service_email=settings['compute']['service_email'],
            scopes=settings['compute']['scopes'],
            startup_script='startup.sh',
            metadata=[
                {'key': 'url', 'value': image_url},
                {'key': 'text', 'value': image_text},
                {'key': 'cs-bucket', 'value': bucket}])
    except gce.DiskDoesNotExistError as e:
        # Handle the missing-disk case before the broad API error clause so it is reachable.
        logging.error(INSERT_ERROR, {'name': INSTANCE_NAME})
        logging.error(e)
        return
    except (gce.ApiError, gce.ApiOperationError, ValueError, Exception) as e:
        # Delete the disk in case the instance fails to start.
        delete_resource(gce_helper.delete_disk, DISK_NAME)
        logging.error(INSERT_ERROR, {'name': INSTANCE_NAME})
        logging.error(e)
        return

    # List all running instances.
    logging.info('These are your running instances:')
    instances = gce_helper.list_instances()
    for instance in instances:
        logging.info(instance['name'])

    logging.info('Visit http://storage.googleapis.com/%s/output.png.' % bucket)
    logging.info('It might take a minute for the output.png file to show up.')
    raw_input('Hit Enter when done to shutdown instance.')

    # Stop the instance.
    delete_resource(gce_helper.stop_instance, INSTANCE_NAME)

    # Delete the disk.
    delete_resource(gce_helper.delete_disk, DISK_NAME)

    logging.info('Remember to delete the output.png file in ' + bucket)
def launch_experiment(args, sleep_time):
    """ Perform OAuth 2 authorization, then start, list, and stop instance(s). """
    # Get total runtime
    start_time = time.time()
    launch_prep_start_time = time.time()

    # Parse arguments and load config file.
    config_file = args.config
    config = ec.ExperimentConfig(config_file)
    logging.basicConfig(level=logging.INFO)
    auth_http = oauth_authorization(config, args)

    # Retrieve / create instance data
    bucket = config['bucket']
    if not bucket:
        logging.error('Cloud Storage bucket required.')
        return
    instance_id = random_string(INSTANCE_NAME_LENGTH)
    instance_root = 'experiment-%s' % (instance_id)
    instance_name = '%s-' % (instance_root) + '%d'
    disk_name = instance_name + '-disk'
    image_name = config['compute']['image']
    run_script = config['compute']['run_script']

    # Make chunks
    chunks = make_chunks(config)
    all_num_grasps = config['all_num_grasps']
    grasp_samplers = config['grasp_samplers']

    # Initialize gce.Gce
    logging.info('Initializing GCE')
    gce_helper = gce.Gce(auth_http, config, project_id=config['project'])
    gcs_helper = gcs.Gcs(auth_http, config, project_id=config['project'])

    # Start an instance for each chunk
    num_instances = 0
    instances_per_region = 0
    zone_index = 0
    instances = []
    instance_names = []
    disk_names = []
    instance_results = []
    num_zones = len(config['compute']['zones'])

    yesno = raw_input('Create %d instances? [Y/n] ' %
                      (len(chunks) * len(grasp_samplers) * len(all_num_grasps)))
    if yesno.lower() == 'n':
        sys.exit(1)

    for chunk in chunks:
        for grasp_sampler in grasp_samplers:
            for num_grasps in all_num_grasps:
                # Create instance-specific configuration
                dataset = chunk['dataset']
                chunk_start, chunk_end = chunk['chunk']
                curr_instance_name = instance_name % num_instances
                curr_disk_name = disk_name % num_instances

                # Create instance metadata
                metadata = [
                    {'key': 'config', 'value': config.file_contents},
                    {'key': 'instance_name', 'value': curr_instance_name},
                    {'key': 'project_name', 'value': config['project']},
                    {'key': 'bucket_name', 'value': bucket},
                    # chunking metadata
                    {'key': 'dataset', 'value': dataset},
                    {'key': 'chunk_start', 'value': chunk_start},
                    {'key': 'chunk_end', 'value': chunk_end},
                    {'key': 'run_script', 'value': run_script},
                    {'key': 'num_grasps', 'value': num_grasps},
                    {'key': 'grasp_sampler', 'value': grasp_sampler}
                ]

                # Create a new instance
                logging.info('Creating GCE instance %s' % curr_instance_name)
                instances.append(
                    GceInstance(curr_instance_name, curr_disk_name, image_name,
                                config['compute']['zones'][zone_index],
                                metadata, config))

                # update loop info
                num_instances += 1
                instances_per_region += 1
                instance_names.append(curr_instance_name)
                disk_names.append(curr_disk_name)
                instance_console = ('https://console.developers.google.com/'
                                    'project/nth-clone-620/compute/instancesDetail/'
                                    'zones/us-central1-a/instances/%s/console#end') % curr_instance_name

                # switch to new region if known to be above quota
                if instances_per_region >= config['compute']['instance_quota']:
                    instances_per_region = 0
                    zone_index += 1
                    if zone_index >= num_zones:
                        logging.warning(
                            'Cannot create more instances! Capping experiment at %d instances.'
                            % (num_instances))
                        break

    # clear global q
    global instance_launch_queue
    while not instance_launch_queue.empty():
        instance_launch_queue.get()

    # launch all instances using multiprocessing
    launch_start_time = time.time()
    if config['num_processes'] == 1:
        for instance in instances:
            instance.start()
    else:
        pool = mp.Pool(min(config['num_processes'], len(instances)))
        pool.map(launch_instance, instances)
    logging.info('Done launching instances')

    # put instance launch names into a queue
    instance_results = []
    while not instance_launch_queue.empty():
        curr_instance_name = instance_launch_queue.get()
        instance_results.append('%s.tar.gz' % curr_instance_name)

    # set up service
    result_dl_start_time = time.time()
    service_not_ready = True
    while service_not_ready:
        try:
            service = discovery.build('storage', config['compute']['api_version'],
                                      http=auth_http)
            req = service.objects().list(bucket=bucket)
            service_not_ready = False
        except (ValueError, Exception) as e:
            logging.info('Connection failed. Retrying...')

    instance_results.sort()
    completed_instance_results = []
    while instance_results:
        # Wait before checking again
        done_override = wait_for_input(sleep_time, prompt='done? ')
        if done_override:
            completed_instance_results.extend(instance_results)
            instance_results = []
            break
        logging.info('Checking for completion...')

        try:
            resp = req.execute()
        except (ValueError, Exception) as e:
            logging.info('Connection failed. Retrying...')
            continue

        try:
            items = resp['items']
        except KeyError as e:
            logging.error(e)
            logging.error(resp)
            continue

        for item in items:
            if item['name'] in instance_results:
                completed_instance_results.append(item['name'])
                instance_results.remove(item['name'])
                logging.info('Instance %s completed!' % item['name'])
        logging.info('Waiting for %s', ' '.join(instance_results))

    # Delete the instances.
    delete_start_time = time.time()
    if config['num_processes'] == 1:
        for instance in instances:
            instance.stop()
    else:
        pool = mp.Pool(min(config['num_processes'], len(instances)))
        pool.map(stop_instance, instances)
    logging.info('Done stopping instances')

    # Print running instances
    all_running_instances = []
    for zone in config['compute']['zones']:
        zone_instances = gce_helper.list_instances(zone)
        lines = ['These are your running instances in zone %s:' % (zone)]
        for zone_instance in zone_instances:
            logging.info(zone_instance['name'])
            lines.append(' ' + zone_instance['name'])
        if not zone_instances:
            lines.append(' (none)')
        zone_instances_text = '\n'.join(lines)
        all_running_instances.append(zone_instances_text)
        logging.info(zone_instances_text)

    # Download the results
    download_start_time = time.time()
    store_dir, instance_result_dirs = gcs_helper.retrieve_results(
        config['bucket'], completed_instance_results, instance_root)

    # Send the user an email
    message = EMAIL_NOTIFICATION % dict(
        instance_id=instance_id,
        instance_names='\n'.join(map(lambda n: ' ' + n, instance_names)),
        experiment_config=config_file,
        script_commands=config['compute']['startup_script'],
        listinstances_output='\n\n'.join(all_running_instances))
    send_notification_email(message=message, config=config,
                            subject="Your experiment has completed.")

    # Save config file
    with open(os.path.join(store_dir, 'config.yaml'), 'w') as f:
        f.write(config.file_contents)

    # Run the results script TODO: move above the email
    result_agg_start_time = time.time()
    results_script_call = 'python %s %s %s' % (config['results_script'],
                                               config_file, store_dir)
    os.system(results_script_call)

    # get runtime
    end_time = time.time()
    total_runtime = end_time - start_time
    launch_prep_time = launch_start_time - launch_prep_start_time
    launch_time = result_dl_start_time - launch_start_time
    run_time = delete_start_time - result_dl_start_time
    delete_time = download_start_time - delete_start_time
    dl_time = result_agg_start_time - download_start_time
    agg_time = end_time - result_agg_start_time

    logging.info('Total runtime: %f' % (total_runtime))
    logging.info('Prep time: %f' % (launch_prep_time))
    logging.info('Launch time: %f' % (launch_time))
    logging.info('Run time: %f' % (run_time))
    logging.info('Delete time: %f' % (delete_time))
    logging.info('Download time: %f' % (dl_time))
    logging.info('Result aggregation time: %f' % (agg_time))
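# Hypothetical command-line driver (not part of the original source): launch_experiment()
# reads args.config directly, and oauth_authorization(config, args) may consume other
# attributes; the flag names and the 60-second default below are assumptions for
# illustration only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Launch a GCE experiment')
    parser.add_argument('config', help='path to the experiment config file')
    parser.add_argument('--sleep', type=int, default=60,
                        help='seconds to wait between completion checks (assumed default)')
    cli_args = parser.parse_args()
    launch_experiment(cli_args, cli_args.sleep)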