示例#1
0
    def assign_hosts_to_workers(self):
        """Pair host instances with worker instances for each assignment.

        Every entry of `self.gce_project.host_worker_assignments` is checked
        before use: host and worker clusters must share a zone, the worker
        cluster must hold exactly `workers_per_host` instances per host
        instance, both clusters must agree on `high_end`, the host instance
        group must exist, and the number of workers actually found must equal
        the configured worker count. An entry failing any check is logged and
        skipped.

        Host names are recorded as soon as they are listed, i.e. before the
        actual worker count is verified. Once all entries are processed,
        `cleanup_old_assignments` is called with every recorded host name.
        """
        project = self.gce_project
        seen_hosts = set()

        for pairing in project.host_worker_assignments:
            hosts = project.get_cluster(pairing.host)
            workers = project.get_cluster(pairing.worker)

            if hosts.gce_zone != workers.gce_zone:
                logging.error('Mismatching zones for %s and %s.',
                              pairing.host, pairing.worker)
                continue

            expected_workers = hosts.instance_count * pairing.workers_per_host
            if expected_workers != workers.instance_count:
                logging.error(
                    'Invalid host/worker cluster size for %s and %s.',
                    pairing.host, pairing.worker)
                continue

            if hosts.high_end != workers.high_end:
                logging.error('Mismatching high end setting for %s and %s',
                              pairing.host, pairing.worker)
                continue

            gce = bot_manager.BotManager(project.project_id, hosts.gce_zone)
            host_group = gce.instance_group(hosts.name)
            if not host_group.exists():
                logging.error('Host instance group %s does not exist.',
                              hosts.name)
                continue

            # Collect the names of all instances managed by the host group.
            host_names = []
            for managed in host_group.list_managed_instances():
                host_names.append(_instance_name_from_url(managed['instance']))
            seen_hosts.update(host_names)

            found_workers = self.get_all_workers_in_cluster(gce, workers.name)
            found_count = len(found_workers)
            if found_count != workers.instance_count:
                logging.error(
                    'Actual number of worker instances for %s did not match. '
                    'Expected %d, got %d.', workers.name,
                    workers.instance_count, found_count)
                continue

            ndb_utils.put_multi(
                self.do_assign_hosts_to_workers(host_names, found_workers,
                                                pairing.workers_per_host))

        self.cleanup_old_assignments(seen_hosts)
def cleanup_resources():
    """Delete the test instance group, then the test instance template.

    Each deletion is attempted independently; a `NotFoundError` raised by
    either one is ignored.
    """
    gce = bot_manager.BotManager(TEST_PROJECT, TEST_ZONE)

    # The group is removed first, the template second.
    try:
        group = gce.instance_group(test_instance_group_name())
        group.delete()
    except bot_manager.NotFoundError:
        # Nothing to remove.
        pass

    try:
        template = gce.instance_template(test_instance_template_name())
        template.delete()
    except bot_manager.NotFoundError:
        # Nothing to remove.
        pass
示例#3
0
  def delete_gce_resources(self, project_info, cluster_info):
    """Remove the instance group and instance template of a cluster.

    The shared resource name is derived from `cluster_info.cluster` and
    `project_info.name`. The group is deleted before the template. A
    `NotFoundError` from either deletion is only logged at info level.
    """
    name = get_resource_name(cluster_info.cluster, project_info.name)
    gce = bot_manager.BotManager(self.gce_project.project_id,
                                 cluster_info.gce_zone)

    try:
      group = gce.instance_group(name)
      group.delete()
    except bot_manager.NotFoundError:
      logging.info('Instance group %s already deleted.', name)

    try:
      template = gce.instance_template(name)
      template.delete()
    except bot_manager.NotFoundError:
      logging.info('Instance template %s already deleted.', name)
示例#4
0
  def update_cluster(self,
                     cluster,
                     resource_name,
                     cpu_count,
                     task_tag=None,
                     disk_size_gb=None,
                     service_account=None,
                     tls_cert=None):
    """Reconcile a cluster's instance template and instance group.

    A missing instance template is created from the expected template body.
    If the instance group exists and the template is current, the group is
    resized to `cpu_count` when its `targetSize` differs, and nothing else is
    done. If the template is outdated, an existing group is deleted, the
    template is deleted and recreated, and a new group of size `cpu_count` is
    created. A missing group is created as well.

    Args:
      cluster: Cluster config; `gce_zone` and `instance_template` are read.
      resource_name: Name shared by the instance template and instance group.
      cpu_count: Desired size of the instance group.
      task_tag: Forwarded to `get_template_body`.
      disk_size_gb: Forwarded to `get_template_body`.
      service_account: Forwarded to `get_template_body`.
      tls_cert: Forwarded to `get_template_body`.
    """
    gce = bot_manager.BotManager(self.gce_project.project_id,
                                 cluster.gce_zone)
    template = gce.instance_template(resource_name)
    group = gce.instance_group(resource_name)

    # Template body this cluster is expected to have.
    wanted_body = get_template_body(
        self.gce_project,
        cluster.instance_template,
        task_tag=task_tag,
        disk_size_gb=disk_size_gb,
        service_account=service_account,
        tls_cert=tls_cert)

    stale_template = False
    if not template.exists():
      logging.info('Creating new instance template: %s', resource_name)
      template.create(wanted_body)
    else:
      # Compare what is deployed with what is expected.
      stale_template = _template_needs_update(template.get(), wanted_body,
                                              resource_name)

    group_exists = group.exists()

    if group_exists and not stale_template:
      # Template is current, so the group only ever needs a resize.
      current_size = group.get()['targetSize']
      if current_size == cpu_count:
        logging.info('No instance group size changes needed.')
        return

      logging.info('Resizing instance group %s from %d to %d.',
                   resource_name, current_size, cpu_count)
      try:
        group.resize(cpu_count, wait_for_instances=False)
      except bot_manager.OperationError as e:
        logging.error('Failed to resize instance group %s: %s',
                      resource_name, str(e))
      return

    if group_exists:
      # The group has to go before its template can be deleted.
      logging.info('Deleting instance group %s for template update.',
                   resource_name)
      try:
        group.delete()
      except bot_manager.NotFoundError:
        # Somebody else already removed it.
        pass

    if stale_template:
      logging.info('Recreating instance template: %s', resource_name)
      template.delete()
      template.create(wanted_body)

    logging.info('Creating new instance group: %s', resource_name)
    try:
      group.create(
          resource_name,
          resource_name,
          size=cpu_count,
          wait_for_instances=False)
    except bot_manager.OperationError as e:
      logging.error('Failed to create instance group %s: %s', resource_name,
                    str(e))
示例#5
0
    def update_cluster(self,
                       cluster,
                       resource_name,
                       cpu_count,
                       task_tag=None,
                       disk_size_gb=None,
                       service_account=None,
                       tls_cert=None):
        """Update the instance template and instance group of a cluster.

        A missing instance template is created from the expected template
        body. If the instance group exists and the template is current, the
        group is resized to `cpu_count` when its `targetSize` differs and its
        auto healing policy is patched when it differs from
        `cluster.auto_healing_policy`; nothing else is done. If the template
        is outdated, an existing group is deleted, the template is deleted
        and recreated, and a new group is created. A missing group is created
        as well.

        Args:
          cluster: Cluster config; `gce_zone`, `instance_template` and
            `auto_healing_policy` (a mapping with optional `health_check`
            and `initial_delay_sec` keys) are read.
          resource_name: Name shared by the instance template and group.
          cpu_count: Desired size of the instance group.
          task_tag: Forwarded to `get_template_body`.
          disk_size_gb: Forwarded to `get_template_body`.
          service_account: Forwarded to `get_template_body`.
          tls_cert: Forwarded to `get_template_body`.
        """
        manager = bot_manager.BotManager(self.gce_project.project_id,
                                         cluster.gce_zone)

        instance_template = manager.instance_template(resource_name)
        instance_group = manager.instance_group(resource_name)

        # Load expected template body.
        template_body = get_template_body(self.gce_project,
                                          cluster.instance_template,
                                          task_tag=task_tag,
                                          disk_size_gb=disk_size_gb,
                                          service_account=service_account,
                                          tls_cert=tls_cert)

        if instance_template.exists():
            # Check for updates.
            current_template_body = instance_template.get()
            template_needs_update = _template_needs_update(
                current_template_body, template_body, resource_name)
        else:
            logging.info('Creating new instance template: %s', resource_name)
            instance_template.create(template_body)
            template_needs_update = False

        if instance_group.exists():
            if template_needs_update:
                # Instance groups need to be deleted first before an instance
                # template can be deleted.
                logging.info('Deleting instance group %s for template update.',
                             resource_name)
                try:
                    instance_group.delete()
                except bot_manager.NotFoundError:
                    # Already deleted.
                    pass
            else:
                instance_group_body = instance_group.get()
                if instance_group_body['targetSize'] != cpu_count:
                    logging.info('Resizing instance group %s from %d to %d.',
                                 resource_name,
                                 instance_group_body['targetSize'], cpu_count)
                    try:
                        instance_group.resize(cpu_count,
                                              wait_for_instances=False)
                    except bot_manager.OperationError as e:
                        logging.error('Failed to resize instance group %s: %s',
                                      resource_name, str(e))

                else:
                    logging.info('No instance group size changes needed.')

                # Read the currently deployed auto healing policy using the
                # API's camelCase field names, the same convention as
                # 'targetSize' above and as the patch body built below.
                # NOTE(review): assumes instance_group.get() returns the raw
                # instance group manager resource, where policies are stored
                # as the 'autoHealingPolicies' list - confirm.
                current_policies = (
                    instance_group_body.get('autoHealingPolicies') or [{}])
                current_policy = current_policies[0]
                policy_changed = False

                # Check if needs to update health check URL.
                old_url = current_policy.get('healthCheck')
                new_url = cluster.auto_healing_policy.get('health_check')

                if new_url != old_url:
                    logging.info(
                        'Updating the health check URL in auto_healing_policy '
                        'of instance group %s from %s to %s.', resource_name,
                        old_url, new_url)
                    policy_changed = True

                # Check if needs to update initial delay.
                old_delay = current_policy.get('initialDelaySec')
                new_delay = cluster.auto_healing_policy.get(
                    'initial_delay_sec')

                if new_delay != old_delay:
                    logging.info(
                        'Updating the health check initial delay in '
                        'auto_healing_policy of instance group %s from %s '
                        'seconds to %s seconds.', resource_name, old_delay,
                        new_delay)
                    policy_changed = True

                # Send one request covering either or both changes.
                if policy_changed:
                    if new_url is None or new_delay is None:
                        # An incomplete policy is treated as "no policy".
                        auto_healing_policy = {}
                        if new_url is not None or new_delay is not None:
                            logging.warning(
                                'Deleting auto_healing_policy '
                                'because its two values (health_check, initial_delay_sec) '
                                'should never exist independently: (%s, %s)',
                                new_url, new_delay)
                    else:
                        # The two values must never exist independently, so
                        # the request always carries the complete policy, not
                        # just the field that changed.
                        auto_healing_policy = {
                            'healthCheck': new_url,
                            'initialDelaySec': new_delay,
                        }
                    try:
                        instance_group.patch_auto_healing_policies(
                            auto_healing_policy=auto_healing_policy,
                            wait_for_instances=False)
                    except bot_manager.OperationError as e:
                        logging.error(
                            'Failed to patch auto healing policies of '
                            'instance group %s: %s', resource_name, str(e))
                return

        if template_needs_update:
            logging.info('Recreating instance template: %s', resource_name)
            instance_template.delete()
            instance_template.create(template_body)

        logging.info('Creating new instance group: %s', resource_name)
        try:
            instance_group.create(
                resource_name,
                resource_name,
                size=cpu_count,
                auto_healing_policy=cluster.auto_healing_policy,
                wait_for_instances=False)
        except bot_manager.OperationError as e:
            logging.error('Failed to create instance group %s: %s',
                          resource_name, str(e))
 def setUp(self):
     """Clean up test resources, then store a bot manager on the test case."""
     cleanup_resources()
     manager = bot_manager.BotManager(TEST_PROJECT, TEST_ZONE)
     self.manager = manager