Example #1
    def _setup(self):
        # initialize helper classes for interacting with GCE, GCS
        self.auth_http = instance.oauth_authorization(self.config)
        self.gce_helper = gce.Gce(self.auth_http,
                                  self.config,
                                  project_id=self.config['compute']['project'])
        self.gcs_helper = gcs.Gcs(self.auth_http,
                                  self.config,
                                  project_id=self.config['compute']['project'])
        self.instance_manager = instance.VMInstanceManager()

        self.instances = []
        self.launched_instances = {}
        self.user_terminated = False

        # get job id
        self.id = gen_job_id()
        self.job_name_root = 'job-%s' % (self.id)
        if self.config['update']:
            self.job_name_root = 'job-updater-%s' % (self.id)

        # setup instance completion api call
        service_not_ready = True
        while service_not_ready:
            try:
                service = discovery.build(
                    'storage',
                    self.config['compute']['api_version'],
                    http=self.gce_helper.auth_http)
                self.bucket_req = service.objects().list(
                    bucket=self.config['compute']['bucket'],
                    prefix=self.job_name_root)
                service_not_ready = False
            except Exception as e:
                logging.info('Connection failed (%s). Retrying...' % e)
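The retry loop above spins without a delay and discards the failure cause. A backoff variant, sketched under the assumption that the same googleapiclient discovery module is in scope (the helper name is hypothetical):

import logging
import time

from googleapiclient import discovery

def build_storage_service(http, api_version, max_tries=5):
    # Retry discovery.build with exponential backoff instead of a tight loop.
    delay = 1.0
    for _ in range(max_tries):
        try:
            return discovery.build('storage', api_version, http=http)
        except Exception as e:
            logging.info('Connection failed (%s). Retrying in %.0fs...', e, delay)
            time.sleep(delay)
            delay *= 2
    raise RuntimeError('Could not build storage service after %d tries' % max_tries)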
Example #2
    def create_gce_helper(self):
        """ Create a gce helper class configured and authenticated by this object """
        # authorize local process
        auth_http = oauth_authorization(self.config, None)

        # helper for gce api calls
        gce_helper = gce.Gce(auth_http, self.config, self.project)
        return gce_helper
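Building a fresh helper per call means each caller re-authorizes instead of sharing one httplib2.Http object, which is not safe to reuse across processes. A usage sketch; the node objects and their attributes are assumptions based on the surrounding examples:

import multiprocessing as mp

def stop_node(node):
    # Re-authorize inside the worker; httplib2 connections should not
    # be shared across process boundaries.
    gce_helper = node.create_gce_helper()
    gce_helper.stop_instance(node.instance_name)  # hypothetical attribute

if __name__ == '__main__':
    pool = mp.Pool(4)
    pool.map(stop_node, nodes)  # `nodes`: objects exposing create_gce_helper()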
Example #3
    def get(self):
        # self.response.headers['Content-Type'] = 'text/plain'
        # Previously fetched the authorized Http object from the decorator:
        #   http = decorator.http()
        http = credentials.authorize(httplib2.Http(memcache))

        # Initialize gce.Gce.
        gce_helper = gce.Gce(http, project_id=settings['project'])

        # Create a Persistent Disk (PD), which is used as a boot disk.
        try:
            gce_helper.create_disk(DISK_NAME)
        except Exception as e:
            logging.error(INSERT_ERROR, {'name': DISK_NAME})
            logging.error(e)
            return

        # Start an instance with a local start-up script and boot disk.
        logging.info('Starting GCE instance')
        try:
            gce_helper.start_instance(
                INSTANCE_NAME,
                DISK_NAME,
                service_email=settings['compute']['service_email'],
                scopes=settings['compute']['scopes'],
                startup_script='startup.sh',
                metadata=[{
                    'key': 'url',
                    'value': image_url
                }, {
                    'key': 'text',
                    'value': image_text
                }, {
                    'key': 'cs-bucket',
                    'value': bucket
                }])
        except gce.DiskDoesNotExistError as e:
            # Catch the specific error first; a bare Exception handler above
            # it would swallow this case.
            logging.error(INSERT_ERROR, {'name': INSTANCE_NAME})
            logging.error(e)
            return
        except Exception as e:
            # Delete the disk in case the instance fails to start.
            delete_resource(gce_helper.delete_disk, DISK_NAME)
            logging.error(INSERT_ERROR, {'name': INSTANCE_NAME})
            logging.error(e)
            return
        """
Example #4
    def stage(self):
        """ Stages changes by creating update disks, if they don't already exist. """
        # setup vars
        compute_config = self.config_['compute']
        created_snapshots = False
        if not self.update_data_disks_:
            self.compute_update_data_disks()

        # authorize access to GCE api
        auth_http = instance.oauth_authorization(self.config_)
        gce_helper = gce.Gce(auth_http,
                             self.config_,
                             project_id=compute_config['project'])

        # for all zones, create a disk snapshot if they don't already exist
        for zone, disk, update_disk_name in zip(compute_config['zones'],
                                                compute_config['data_disks'],
                                                self.update_data_disks_):
            # check for existence of the update disk (taken as a flag for the existence of an update node)
            disk_valid = gce_helper.get_disk(update_disk_name, zone)
            if not disk_valid:
                # create a snapshot of the current disk
                logging.info('Snapshotting disk %s' % (disk))
                snapshot_response = gce_helper.snapshot_disk(
                    disk, compute_config['project'], zone)

                # create a disk from the snapshot
                logging.info(
                    'Creating update disk %s from snapshot %s' %
                    (update_disk_name, snapshot_response['snapshot_name']))
                gce_helper.create_disk(
                    update_disk_name,
                    zone=zone,
                    size_gb=compute_config['disk_size_gb'],
                    source_snapshot=snapshot_response['snapshot_name'])

                # delete the snapshot
                gce_helper.delete_snapshot(
                    snapshot_name=snapshot_response['snapshot_name'],
                    project=compute_config['project'])
                created_snapshots = True
        return created_snapshots
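get_disk doubles here as an existence probe. Without the project's wrapper, the equivalent check against the plain Compute Engine v1 API is a disks().get call that treats HTTP 404 as "absent"; a minimal sketch, assuming an already-authorized http object:

from googleapiclient import discovery, errors

def disk_exists(http, project, zone, disk_name):
    # Returns True if the disk exists, False on 404, re-raises anything else.
    compute = discovery.build('compute', 'v1', http=http)
    try:
        compute.disks().get(project=project, zone=zone, disk=disk_name).execute()
        return True
    except errors.HttpError as e:
        if e.resp.status == 404:
            return False
        raise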
Example #5
    def create_gce_helper(self):
        """ Creates a local gce helper to re-authorize with Google """
        auth_http = oauth_authorization(self.config)
        gce_helper = gce.Gce(auth_http, self.config, self.project)
        return gce_helper
Example #6
    def push(self):
        """ Pushed changes by replacing original disks with update disks. Super critical section. """
        # setup vars
        compute_config = self.config_['compute']
        dt_now = dt.datetime.now()
        if not self.update_data_disks_:
            self.compute_update_data_disks()

        # authorize access to GCE api
        auth_http = instance.oauth_authorization(self.config_)
        gce_helper = gce.Gce(auth_http,
                             self.config_,
                             project_id=compute_config['project'])

        for zone, disk, update_disk in zip(compute_config['zones'],
                                           compute_config['data_disks'],
                                           self.update_data_disks_):
            # check for update disk existence
            disk_response = gce_helper.get_disk(update_disk, zone)
            if not disk_response:
                logging.error('Update disk %s does not exist' % (update_disk))
                continue

            # generate backup disk filename
            backup_disk = '%s-backup-%s-%s-%s-%sh-%sm-%ss' % (
                disk, dt_now.month, dt_now.day, dt_now.year, dt_now.hour,
                dt_now.minute, dt_now.second)

            # snapshot the updated data disks
            snapshot_response = gce_helper.snapshot_disk(
                update_disk, compute_config['project'], zone)

            # delete previous disk and replace, if not in use
            disk_response = gce_helper.get_disk(disk, zone)
            if disk_response:
                if not disk_response.get(USERS_KEY):
                    # create new disk from snapshot
                    gce_helper.delete_disk(disk)
                    gce_helper.create_disk(
                        disk,
                        zone=zone,
                        size_gb=compute_config['disk_size_gb'],
                        source_snapshot=snapshot_response['snapshot_name'])

                    # delete update disk (don't delete if push can't be done now, otherwise changes won't be overwritten)
                    gce_helper.delete_disk(update_disk)

                else:
                    # disk is in use; stage the push for a future time
                    logging.info(
                        'Master disk in use. Staging backup disk for a future push'
                    )
                    push_queue_filename = os.path.join(self.cache_dir_,
                                                       PUSH_QUEUE_FILE)
                    with open(push_queue_filename, 'a') as f:
                        f.write(backup_disk + '\n')
            else:
                logging.warning('Master disk was not found')

            # create backup disk from snapshot
            gce_helper.create_disk(
                backup_disk,
                zone=zone,
                size_gb=compute_config['disk_size_gb'],
                source_snapshot=snapshot_response['snapshot_name'])

            # delete the snapshot
            gce_helper.delete_snapshot(
                snapshot_name=snapshot_response['snapshot_name'],
                project=compute_config['project'])
        return True
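The hand-built backup_disk name above is not zero-padded and puts the month first, so backup names do not sort chronologically. A strftime-based variant keeps them sortable; a sketch (note GCE disk names must be lowercase letters, digits, and hyphens):

import datetime as dt

def backup_disk_name(disk):
    # Zero-padded, year-first timestamp so backup names sort chronologically.
    stamp = dt.datetime.now().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')
    return '%s-backup-%s' % (disk, stamp)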
Example #7
def main():
    """Perform OAuth 2 authorization, then start, list, and stop instance(s)."""

    logging.basicConfig(level=logging.INFO)

    # Load the settings for this sample app.
    with open(gce.SETTINGS_FILE, 'r') as f:
        settings = json.load(f)

    # Perform OAuth 2.0 authorization flow.
    flow = flow_from_clientsecrets(settings['client_secrets'],
                                   scope=settings['compute_scope'])
    storage = Storage(settings['oauth_storage'])
    credentials = storage.get()

    # Authorize an instance of httplib2.Http.
    if credentials is None or credentials.invalid:
        credentials = run(flow, storage)
    http = httplib2.Http()
    auth_http = credentials.authorize(http)

    # Retrieve user input.
    image_url = raw_input('Enter the URL of an image [Defaults to %s]: ' %
                          IMAGE_URL)
    if not image_url:
        image_url = IMAGE_URL
    image_text = raw_input(
        'Enter text to add to the image [Defaults to "%s"]: ' % IMAGE_TEXT)
    if not image_text:
        image_text = IMAGE_TEXT
    bucket = raw_input('Enter a Cloud Storage bucket [Required]: ')
    if not bucket:
        logging.error('Cloud Storage bucket required.')
        return

    # Initialize gce.Gce.
    gce_helper = gce.Gce(auth_http, project_id=settings['project'])

    # Create a Persistent Disk (PD), which is used as a boot disk.
    try:
        gce_helper.create_disk(DISK_NAME)
    except Exception as e:
        logging.error(INSERT_ERROR, {'name': DISK_NAME})
        logging.error(e)
        return

    # Start an instance with a local start-up script and boot disk.
    logging.info('Starting GCE instance')
    try:
        gce_helper.start_instance(
            INSTANCE_NAME,
            DISK_NAME,
            service_email=settings['compute']['service_email'],
            scopes=settings['compute']['scopes'],
            startup_script='startup.sh',
            metadata=[{
                'key': 'url',
                'value': image_url
            }, {
                'key': 'text',
                'value': image_text
            }, {
                'key': 'cs-bucket',
                'value': bucket
            }])
    except gce.DiskDoesNotExistError as e:
        # Catch the specific error first; a bare Exception handler above it
        # would swallow this case.
        logging.error(INSERT_ERROR, {'name': INSTANCE_NAME})
        logging.error(e)
        return
    except Exception as e:
        # Delete the disk in case the instance fails to start.
        delete_resource(gce_helper.delete_disk, DISK_NAME)
        logging.error(INSERT_ERROR, {'name': INSTANCE_NAME})
        logging.error(e)
        return

    # List all running instances.
    logging.info('These are your running instances:')
    instances = gce_helper.list_instances()
    for instance in instances:
        logging.info(instance['name'])

    logging.info('Visit http://storage.googleapis.com/%s/output.png.' % bucket)
    logging.info('It might take a minute for the output.png file to show up.')
    raw_input('Hit Enter when done to shutdown instance.')

    # Stop the instance.
    delete_resource(gce_helper.stop_instance, INSTANCE_NAME)

    # Delete the disk.
    delete_resource(gce_helper.delete_disk, DISK_NAME)

    logging.info('Remember to delete the output.png file in ' + bucket)
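main() loads its configuration from gce.SETTINGS_FILE. The schema is not shown, but the keys read above imply a file shaped like this; every value below is a placeholder, not a real setting:

# Hypothetical contents of the JSON settings file, written as the
# equivalent Python dict; all values are placeholders.
settings = {
    'client_secrets': 'client_secrets.json',
    'compute_scope': 'https://www.googleapis.com/auth/compute',
    'oauth_storage': 'oauth2.dat',
    'project': 'my-project-id',
    'compute': {
        'service_email': 'default',
        'scopes': ['https://www.googleapis.com/auth/devstorage.read_write'],
    },
}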
Example #8
def launch_experiment(args, sleep_time):
    """
    Perform OAuth 2 authorization, then start, list, and stop instance(s).
    """
    # Get total runtime
    start_time = time.time()
    launch_prep_start_time = time.time()

    # Parse arguments and load config file.
    config_file = args.config
    config = ec.ExperimentConfig(config_file)
    logging.basicConfig(level=logging.INFO)
    auth_http = oauth_authorization(config, args)

    # Retrieve / create instance data
    bucket = config['bucket']
    if not bucket:
        logging.error('Cloud Storage bucket required.')
        return
    instance_id = random_string(INSTANCE_NAME_LENGTH)
    instance_root = 'experiment-%s' % (instance_id)
    instance_name = instance_root + '-%d'
    disk_name = instance_name + '-disk'
    image_name = config['compute']['image']
    run_script = config['compute']['run_script']

    # Make chunks
    chunks = make_chunks(config)
    all_num_grasps = config['all_num_grasps']
    grasp_samplers = config['grasp_samplers']

    # Initialize gce.Gce
    logging.info('Initializing GCE')
    gce_helper = gce.Gce(auth_http, config, project_id=config['project'])
    gcs_helper = gcs.Gcs(auth_http, config, project_id=config['project'])

    # Start an instance for each chunk
    num_instances = 0
    instances_per_region = 0
    zone_index = 0
    instances = []
    instance_names = []
    disk_names = []
    instance_results = []
    num_zones = len(config['compute']['zones'])

    yesno = raw_input(
        'Create %d instances? [Y/n] ' %
        (len(chunks) * len(grasp_samplers) * len(all_num_grasps)))
    if yesno.lower() == 'n':
        sys.exit(1)

    out_of_zones = False
    for chunk in chunks:
        for grasp_sampler in grasp_samplers:
            for num_grasps in all_num_grasps:
                # Create instance-specific configuration
                dataset = chunk['dataset']
                chunk_start, chunk_end = chunk['chunk']

                curr_instance_name = instance_name % num_instances
                curr_disk_name = disk_name % num_instances

                # Create instance metadata
                metadata = [
                    {
                        'key': 'config',
                        'value': config.file_contents
                    },
                    {
                        'key': 'instance_name',
                        'value': curr_instance_name
                    },
                    {
                        'key': 'project_name',
                        'value': config['project']
                    },
                    {
                        'key': 'bucket_name',
                        'value': bucket
                    },
                    # chunking metadata
                    {
                        'key': 'dataset',
                        'value': dataset
                    },
                    {
                        'key': 'chunk_start',
                        'value': chunk_start
                    },
                    {
                        'key': 'chunk_end',
                        'value': chunk_end
                    },
                    {
                        'key': 'run_script',
                        'value': run_script
                    },
                    {
                        'key': 'num_grasps',
                        'value': num_grasps
                    },
                    {
                        'key': 'grasp_sampler',
                        'value': grasp_sampler
                    }
                ]

                # Create a new instance
                logging.info('Creating GCE instance %s' % curr_instance_name)
                instances.append(
                    GceInstance(curr_instance_name, curr_disk_name, image_name,
                                config['compute']['zones'][zone_index],
                                metadata, config))

                # update loop info
                num_instances += 1
                instances_per_region += 1
                instance_names.append(curr_instance_name)
                disk_names.append(curr_disk_name)
                instance_console = (
                    'https://console.developers.google.com/'
                    'project/nth-clone-620/compute/instancesDetail/'
                    'zones/us-central1-a/instances/%s/console#end'
                ) % curr_instance_name

                # switch to new region if known to be above quota
                if instances_per_region >= config['compute']['instance_quota']:
                    instances_per_region = 0
                    zone_index += 1

                if zone_index >= num_zones:
                    logging.warning(
                        'Cannot create more instances! Capping experiment at %d instances.'
                        % (num_instances))
                    # propagate the break through all three loops; a bare
                    # break here would only exit the innermost loop
                    out_of_zones = True
                    break
            if out_of_zones:
                break
        if out_of_zones:
            break

    # clear global q
    global instance_launch_queue
    while not instance_launch_queue.empty():
        instance_launch_queue.get()

    # launch all instances using multiprocessing
    launch_start_time = time.time()
    if config['num_processes'] == 1:
        for instance in instances:
            instance.start()
    else:
        pool = mp.Pool(min(config['num_processes'], len(instances)))
        pool.map(launch_instance, instances)
    logging.info('Done launching instances')

    # put instance launch names into a queue
    instance_results = []
    while not instance_launch_queue.empty():
        curr_instance_name = instance_launch_queue.get()
        instance_results.append('%s.tar.gz' % curr_instance_name)

    # set up service
    result_dl_start_time = time.time()
    service_not_ready = True
    while service_not_ready:
        try:
            service = discovery.build('storage',
                                      config['compute']['api_version'],
                                      http=auth_http)
            req = service.objects().list(bucket=bucket)
            service_not_ready = False
        except Exception as e:
            logging.info('Connection failed (%s). Retrying...' % e)

    instance_results.sort()
    completed_instance_results = []
    while instance_results:
        # Wait before checking again
        done_override = wait_for_input(sleep_time, prompt='done? ')
        if done_override:
            completed_instance_results.extend(instance_results)
            instance_results = []
            break

        logging.info('Checking for completion...')
        try:
            resp = req.execute()
        except Exception as e:
            logging.info('Connection failed (%s). Retrying...' % e)
            continue

        try:
            items = resp['items']
        except KeyError as e:
            logging.error(e)
            logging.error(resp)
            continue

        for item in items:
            if item['name'] in instance_results:
                completed_instance_results.append(item['name'])
                instance_results.remove(item['name'])
                logging.info('Instance %s completed!' % item['name'])
        logging.info('Waiting for %s', ' '.join(instance_results))

    # Delete the instances.
    delete_start_time = time.time()
    if config['num_processes'] == 1:
        for instance in instances:
            instance.stop()
    else:
        pool = mp.Pool(min(config['num_processes'], len(instances)))
        pool.map(stop_instance, instances)
    logging.info('Done stopping instances')

    # Print running instances
    all_running_instances = []
    for zone in config['compute']['zones']:
        zone_instances = gce_helper.list_instances(zone)
        lines = ['These are your running instances in zone %s:' % (zone)]
        for zone_instance in zone_instances:
            logging.info(zone_instance['name'])
            lines.append('    ' + zone_instance['name'])
        if not zone_instances:
            lines.append('    (none)')
        zone_instances_text = '\n'.join(lines)
        all_running_instances.append(zone_instances_text)
        logging.info(zone_instances_text)

    # Download the results
    download_start_time = time.time()
    store_dir, instance_result_dirs = gcs_helper.retrieve_results(
        config['bucket'], completed_instance_results, instance_root)

    # Send the user an email
    message = EMAIL_NOTIFICATION % dict(
        instance_id=instance_id,
        instance_names='\n'.join(map(lambda n: '    ' + n, instance_names)),
        experiment_config=config_file,
        script_commands=config['compute']['startup_script'],
        listinstances_output='\n\n'.join(all_running_instances))

    send_notification_email(message=message,
                            config=config,
                            subject="Your experiment has completed.")

    # Save config file
    with open(os.path.join(store_dir, 'config.yaml'), 'w') as f:
        f.write(config.file_contents)

    # Run the results script TODO: move above the email
    result_agg_start_time = time.time()
    results_script_call = 'python %s %s %s' % (config['results_script'],
                                               config_file, store_dir)
    os.system(results_script_call)

    # get runtime
    end_time = time.time()
    total_runtime = end_time - start_time
    launch_prep_time = launch_start_time - launch_prep_start_time
    launch_time = result_dl_start_time - launch_start_time
    run_time = delete_start_time - result_dl_start_time
    delete_time = download_start_time - delete_start_time
    dl_time = result_agg_start_time - download_start_time
    agg_time = end_time - result_agg_start_time

    logging.info('Total runtime: %f' % (total_runtime))
    logging.info('Prep time: %f' % (launch_prep_time))
    logging.info('Launch time: %f' % (launch_time))
    logging.info('Run time: %f' % (run_time))
    logging.info('Delete time: %f' % (delete_time))
    logging.info('Download time: %f' % (dl_time))
    logging.info('Result aggregation time: %f' % (agg_time))
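The manual time.time() bookkeeping above is easy to desynchronize, since each phase's duration is derived from the next phase's start variable. A small standard-library timer that logs each phase as it closes; a sketch, not a drop-in replacement:

import contextlib
import logging
import time

@contextlib.contextmanager
def phase(name):
    # Log how long the enclosed block took, keyed by phase name.
    t0 = time.time()
    try:
        yield
    finally:
        logging.info('%s time: %f', name, time.time() - t0)

# Usage:
# with phase('Launch'):
#     launch_all_instances()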