Example #1
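
All of the snippets below share the same module-level setup. A minimal sketch of the imports and logger they rely on (the exact laniakea import path and the logger name are assumptions; the Django models Instance, PoolStatusEntry, INSTANCE_STATE and POOL_STATUS_ENTRY_TYPE come from the app's models module, as the function-level imports in Example #1 show):

import logging
import socket
import ssl

import boto.ec2.instance
import boto.ec2.spotinstancerequest
import boto.exception

# Laniakea / LaniakeaCommandLine come from Mozilla's laniakea package; the
# import path below is an assumption and may differ between versions.
from laniakea import Laniakea, LaniakeaCommandLine

logger = logging.getLogger("ec2spotmanager")  # assumed logger name
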
def _terminate_pool_instances(pool, instances, config, terminateByPool=False):
    """ Terminate an instance with the given configuration """
    from .models import INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE
    instance_ids_by_region = _get_instance_ids_by_region(instances)

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region,
                            aws_access_key_id=config.aws_access_key_id,
                            aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()

            logger.exception(
                "[Pool %d] terminate_pool_instances: laniakea failure: %s",
                pool.id, msg)
            return None

        try:
            if terminateByPool:
                boto_instances = cluster.find(
                    filters={"tag:" + SPOTMGR_TAG + "-PoolId": str(pool.pk)})

                # Data consistency checks
                for boto_instance in boto_instances:
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255
                    if not (
                        (boto_instance.id in instance_ids_by_region[region]) or
                        (state_code == INSTANCE_STATE['shutting-down']
                         or state_code == INSTANCE_STATE['terminated'])):
                        logger.error(
                            "[Pool %d] Instance with EC2 ID %s (status %d) "
                            "is not in region list for region %s", pool.id,
                            boto_instance.id, state_code, region)

                cluster.terminate(boto_instances)
            else:
                logger.info("[Pool %d] Terminating %s instances in region %s",
                            pool.id, len(instance_ids_by_region[region]),
                            region)
                cluster.terminate(
                    cluster.find(instance_ids=instance_ids_by_region[region]))
        except (boto.exception.EC2ResponseError,
                boto.exception.BotoServerError, ssl.SSLError,
                socket.error) as msg:
            logger.exception(
                "[Pool %d] terminate_pool_instances: boto failure: %s",
                pool.id, msg)
            return 1
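
The helpers that group and index the database Instance objects are not shown in any example. A minimal sketch of what they are assumed to do, based on how the callers use them (shown here as module-level functions; the class-based variants expose the same logic as get_instance_ids_by_region / get_instances_by_ids methods, and the real implementations may differ):

def _get_instance_ids_by_region(instances):
    # Group the EC2 instance IDs of our Instance rows by region so that each
    # region can be handled with a single Laniakea connection.
    instance_ids_by_region = {}
    for instance in instances:
        instance_ids_by_region.setdefault(instance.ec2_region, []).append(
            instance.ec2_instance_id)
    return instance_ids_by_region


def _get_instances_by_ids(instances):
    # Map EC2 instance IDs (or spot request IDs, see Example #12) back to
    # their database rows.
    return {instance.ec2_instance_id: instance for instance in instances}
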
    def terminate_pool_instances(self,
                                 pool,
                                 instances,
                                 config,
                                 terminateByPool=False):
        """ Terminate an instance with the given configuration """
        instance_ids_by_region = self.get_instance_ids_by_region(instances)

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "terminate_pool_instances", msg))
                return None

            try:
                if terminateByPool:
                    boto_instances = cluster.find(
                        filters={"tag:SpotManager-PoolId": str(pool.pk)})

                    # Data consistency checks
                    for boto_instance in boto_instances:
                        if not ((boto_instance.id
                                 in instance_ids_by_region[region]) or
                                (boto_instance.state_code
                                 == INSTANCE_STATE['shutting-down']
                                 or boto_instance.state_code
                                 == INSTANCE_STATE['terminated'])):
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s"
                                % (pool.id, boto_instance.id,
                                   boto_instance.state_code, region))

                    cluster.terminate(boto_instances)
                else:
                    logger.info(
                        "[Pool %d] Terminating %s instances in region %s" %
                        (pool.id, len(instance_ids_by_region[region]), region))
                    cluster.terminate(
                        cluster.find(
                            instance_ids=instance_ids_by_region[region]))
            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" %
                                 (pool.id, "terminate_pool_instances", msg))
                return 1
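
Some variants compare boto_instance.state_code directly against INSTANCE_STATE, while others first mask it with & 255 because EC2 treats the high byte of the 16-bit state code as opaque. For reference, a plausible INSTANCE_STATE mapping, assuming it mirrors EC2's documented instance state codes (the real models module may define additional entries such as the 'requested' sentinel used in Example #12):

INSTANCE_STATE = {
    'pending': 0,
    'running': 16,
    'shutting-down': 32,
    'terminated': 48,
    'stopping': 64,
    'stopped': 80,
}

# Why the masking matters: a terminated instance whose high byte happens to be
# set still maps to the documented code once the low byte is isolated.
assert (256 | 48) & 255 == INSTANCE_STATE['terminated']
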
 def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
     """ Terminate an instance with the given configuration """        
     instance_ids_by_region = self.get_instance_ids_by_region(instances)
     
     for region in instance_ids_by_region:
         cluster = Laniakea(None)
         try:
             cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
         except Exception as msg:
             logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "terminate_pool_instances", msg))
             return None
     
         try:
             if terminateByPool:
                 boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk)})
                 
                 # Data consistency checks
                 for boto_instance in boto_instances:
                     assert ((boto_instance.id in instance_ids_by_region[region])
                             or (boto_instance.state_code == INSTANCE_STATE['shutting-down'] 
                             or boto_instance.state_code == INSTANCE_STATE['terminated']))
                     
                 cluster.terminate(boto_instances)
             else:
                 logger.info("[Pool %d] Terminating %s instances in region %s" % (pool.id, len(instance_ids_by_region[region]),region))
                 cluster.terminate(cluster.find(instance_ids=instance_ids_by_region[region]))
         except boto.exception.EC2ResponseError as msg:
             logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "terminate_pool_instances", msg))
             return 1
    def terminate_pool_instances(self,
                                 pool,
                                 instances,
                                 config,
                                 terminateByPool=False):
        """ Terminate an instance with the given configuration """
        instance_ids_by_region = self.get_instance_ids_by_region(instances)

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "terminate_pool_instances", msg))
                return None

            try:
                if terminateByPool:
                    boto_instances = cluster.find(
                        filters={"tag:SpotManager-PoolId": str(pool.pk)})

                    # Data consistency checks
                    for boto_instance in boto_instances:
                        assert ((boto_instance.id
                                 in instance_ids_by_region[region])
                                or (boto_instance.state_code
                                    == INSTANCE_STATE['shutting-down']
                                    or boto_instance.state_code
                                    == INSTANCE_STATE['terminated']))

                    cluster.terminate(boto_instances)
                else:
                    logger.info(
                        "[Pool %d] Terminating %s instances in region %s" %
                        (pool.id, len(instance_ids_by_region[region]), region))
                    cluster.terminate(
                        cluster.find(
                            instance_ids=instance_ids_by_region[region]))
            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" %
                                 (pool.id, "terminate_pool_instances", msg))
                return 1
    def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
        """ Terminate an instance with the given configuration """
        instance_ids_by_region = self.get_instance_ids_by_region(instances)

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "terminate_pool_instances", msg))
                return None

            try:
                if terminateByPool:
                    boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk)})

                    # Data consistency checks
                    for boto_instance in boto_instances:
                        # state_code is a 16-bit value where the high byte is
                        # an opaque internal value and should be ignored.
                        state_code = boto_instance.state_code & 255
                        if not ((boto_instance.id in instance_ids_by_region[region])
                                or (state_code == INSTANCE_STATE['shutting-down']
                                or state_code == INSTANCE_STATE['terminated'])):
                            logger.error("[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s" % (pool.id, boto_instance.id, state_code, region))

                    cluster.terminate(boto_instances)
                else:
                    logger.info("[Pool %d] Terminating %s instances in region %s" % (pool.id, len(instance_ids_by_region[region]), region))
                    cluster.terminate(cluster.find(instance_ids=instance_ids_by_region[region]))
            except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "terminate_pool_instances", msg))
                return 1
    def update_pool_instances(self, pool, config):
        """ Check the state of the instances in a pool and update it in the database """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)
        instances_left = []

        debug_boto_instance_ids_seen = set()
        debug_not_updatable_continue = set()
        debug_not_in_region = {}


        for instance_id in instances_by_ids:
            if instance_id:
                instances_left.append(instances_by_ids[instance_id])

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "update_pool_instances", msg))
                return None

            try:
                boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk)})

                for boto_instance in boto_instances:
                    # Store ID seen for debugging purposes
                    debug_boto_instance_ids_seen.add(boto_instance.id)

                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255

                    if "SpotManager-Updatable" not in boto_instance.tags or int(boto_instance.tags["SpotManager-Updatable"]) <= 0:
                        # The instance is not marked as updatable. We must not touch it because
                        # a spawning thread is still managing this instance. However, we must also
                        # remove this instance from the instances_left list if it's already in our
                        # database, because otherwise our code here would delete it from the database.
                        if boto_instance.id in instance_ids_by_region[region]:
                            instances_left.remove(instances_by_ids[boto_instance.id])
                        else:
                            debug_not_updatable_continue.add(boto_instance.id)
                        continue

                    instance = None

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if not boto_instance.id in instance_ids_by_region[region]:
                        if not ((state_code == INSTANCE_STATE['shutting-down']
                            or state_code == INSTANCE_STATE['terminated'])):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case, we try to load it directly from the database.
                            q = Instance.objects.filter(ec2_instance_id=boto_instance.id)
                            if q:
                                instance = q[0]
                                logger.error("[Pool %d] Instance with EC2 ID %s was reloaded from database." % (pool.id, boto_instance.id))
                            else:
                                logger.error("[Pool %d] Instance with EC2 ID %s is not in our database." % (pool.id, boto_instance.id))

                                # Terminate at this point; we are running in an inconsistent state
                                assert(False)
                        debug_not_in_region[boto_instance.id] = state_code
                        continue

                    instance = instances_by_ids[boto_instance.id]
                    instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != state_code:
                        instance.status_code = state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
                logger.exception("%s: boto failure: %s" % ("update_pool_instances", msg))
                return 1

        if instances_left:
            for instance in instances_left:
                if not instance.ec2_instance_id in debug_boto_instance_ids_seen:
                    logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2." % (pool.id, instance.ec2_instance_id))

                if instance.ec2_instance_id in debug_not_updatable_continue:
                    logger.error("[Pool %d] Deleting instance with EC2 ID %s from our database because it is not updatable but not in our region." % (pool.id, instance.ec2_instance_id))

                if instance.ec2_instance_id in debug_not_in_region:
                    logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has state code %s on EC2" % (pool.id, instance.ec2_instance_id, debug_not_in_region[instance.ec2_instance_id]))

                logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database." % (pool.id, instance.ec2_instance_id))
                instance.delete()
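
Every error path above builds a PoolStatusEntry field by field (entry.type = 0 here presumably corresponds to the POOL_STATUS_ENTRY_TYPE['unclassified'] spelled out in other variants). A small helper along these lines, hypothetical and not part of the original code, would remove most of that duplication:

def _log_pool_status(pool, entry_type, msg, critical=False):
    # Hypothetical convenience wrapper around the repeated PoolStatusEntry pattern.
    entry = PoolStatusEntry()
    entry.type = entry_type
    entry.pool = pool
    entry.msg = str(msg)
    entry.isCritical = critical
    entry.save()
    return entry

With that in place, the laniakea-failure branch reduces to a single call such as _log_pool_status(pool, POOL_STATUS_ENTRY_TYPE['unclassified'], msg, critical=True).
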
        def start_instances_async(pool, config, count, images, region, zone, instances):
            userdata = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)

                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Configuration error: Failed to compile userdata"
                entry.save()

                for instance in instances:
                    instance.delete()

                raise RuntimeError("start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
                boto_instances = cluster.create_spot(config.ec2_max_price, tags=config.ec2_tags, delete_on_termination=True, timeout=20 * 60)
                canceled_requests = count - len(boto_instances)

                logger.info("[Pool %d] Successfully created %s instances, %s requests timed out and were canceled" % (pool.id, len(boto_instances), canceled_requests))

                for i in range(0, len(boto_instances)):
                    instances[i].hostname = boto_instances[i].public_dns_name
                    instances[i].ec2_instance_id = boto_instances[i].id
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    instances[i].status_code = boto_instances[i].state_code & 255
                    instances[i].save()

                    assert(instances[i].ec2_instance_id != None)

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instances[i].add_tag("SpotManager-Updatable", "1")

                if canceled_requests > 0:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info("[Pool %d] Deleting instance with id %s (belongs to canceled request)" % (pool.id, instances[i].pk))
                        instances[i].delete()

                # Delete certain warnings we might have created earlier that no longer apply

                # If we ever exceeded the maximum spot instance count, we can clear
                # the warning now because we obviously succeeded in starting some instances.
                PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']).delete()

                # The same holds for temporary failures of any sort
                PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE['temporary-failure']).delete()

                # Do not delete unclassified errors here for now, so the user can see them.

            except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
                if "MaxSpotInstanceCountExceeded" in str(msg):
                    logger.warning("[Pool %d] Maximum instance count exceeded for region %s" % (pool.id, region))
                    if not PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']):
                        entry = PoolStatusEntry()
                        entry.pool = pool
                        entry.type = POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']
                        entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                        entry.save()
                elif "Service Unavailable" in str(msg):
                    logger.warning("[Pool %d] Temporary failure in region %s: %s" % (pool.id, region, msg))
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                    entry.msg = "Temporary failure occurred: %s" % msg
                    entry.save()
                else:
                    logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", msg))
                    entry = PoolStatusEntry()
                    entry.type = 0
                    entry.pool = pool
                    entry.isCritical = True
                    entry.msg = "Unclassified error occurred: %s" % msg
                    entry.save()

                # Delete all pending instances, assuming that an exception from laniakea
                # means that all instance requests failed.
                for instance in instances:
                    instance.delete()

                return
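
Note how the two halves fit together: start_instances_async only adds the "SpotManager-Updatable" tag after the Instance row has been saved, and update_pool_instances skips (and keeps) any EC2 instance that does not yet carry that tag. This handshake keeps the update pass from modifying or deleting rows that a spawning thread is still in the middle of populating.
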
 def update_pool_instances(self, pool, config):
     """ Check the state of the instances in a pool and update it in the database """
     instances = Instance.objects.filter(pool=pool)
     instance_ids_by_region = self.get_instance_ids_by_region(instances)
     instances_by_ids = self.get_instances_by_ids(instances)
     instances_left = []
     
     for instance_id in instances_by_ids:
         if instance_id:
             instances_left.append(instances_by_ids[instance_id])
     
     for region in instance_ids_by_region:
         cluster = Laniakea(None)
         try:
             cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
         except Exception as msg:
             # Log this error to the pool status messages
             entry = PoolStatusEntry()
             entry.type = 0
             entry.pool = pool
             entry.msg = str(msg)
             entry.isCritical = True
             entry.save()
             
             logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "update_pool_instances", msg))
             return None
     
         try:
             boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk), "tag:SpotManager-Updatable" : "1"})
             
             
             for boto_instance in boto_instances:
                 instance = None
                 
                 # Whenever we see an instance that is not in our instance list for that region,
                 # make sure it's a terminated instance because we should never have a running 
                 # instance that matches the search above but is not in our database.
                 if not boto_instance.id in instance_ids_by_region[region]:
                     if not ((boto_instance.state_code == INSTANCE_STATE['shutting-down'] 
                         or boto_instance.state_code == INSTANCE_STATE['terminated'])):
                         
                         # As a last resort, try to find the instance in our database.
                         # If the instance was saved to our database between the entrance
                         # to this function and the search query sent to EC2, then the instance
                         # will not be in our instances list but returned by EC2. In this
                         # case, we try to load it directly from the database.
                         q = Instance.objects.filter(ec2_instance_id = boto_instance.id)
                         if q:
                             instance = q[0]
                         else:
                             logger.error("[Pool %d] Instance with EC2 ID %s is not in our database." % (pool.id, boto_instance.id))
                                 
                             # Terminate at this point, we run in an inconsistent state
                             assert(False)
                         
                     continue
                 
                 if not instance:
                     instance = instances_by_ids[boto_instance.id]
                     instances_left.remove(instance)
                 
                 # Check the status code and update if necessary
                 if instance.status_code != boto_instance.state_code:
                     instance.status_code = boto_instance.state_code
                     instance.save()
                     
                 # If for some reason we don't have a hostname yet,
                 # update it accordingly.
                 if not instance.hostname:
                     instance.hostname = boto_instance.public_dns_name
                     instance.save()
                 
         except boto.exception.EC2ResponseError as msg:
             logger.exception("%s: boto failure: %s" % ("update_pool_instances", msg))
             return 1
     
     if instances_left:
         for instance in instances_left:
             logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2." % (pool.id, instance.ec2_instance_id))
             instance.delete()
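
The variant above pushes the updatable check into the EC2 query itself by filtering on "tag:SpotManager-Updatable": "1", so non-updatable instances never appear in the result set. The other update variants fetch everything tagged with the pool ID and check the Updatable tag client-side instead, which lets them keep such instances out of instances_left and record debugging information about why a row ends up being deleted.
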
Example #9
    def update_pool_instances(self, pool, config):
        """ Check the state of the instances in a pool and update it in the database """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)
        instances_left = []

        debug_boto_instance_ids_seen = set()
        debug_not_updatable_continue = set()
        debug_not_in_region = {}

        for instance_id in instances_by_ids:
            if instance_id:
                instances_left.append(instances_by_ids[instance_id])

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "update_pool_instances", msg))
                return None

            try:
                boto_instances = cluster.find(
                    filters={"tag:SpotManager-PoolId": str(pool.pk)})

                for boto_instance in boto_instances:
                    # Store ID seen for debugging purposes
                    debug_boto_instance_ids_seen.add(boto_instance.id)

                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255

                    if "SpotManager-Updatable" not in boto_instance.tags or int(
                            boto_instance.tags["SpotManager-Updatable"]) <= 0:
                        # The instance is not marked as updatable. We must not touch it because
                        # a spawning thread is still managing this instance. However, we must also
                        # remove this instance from the instances_left list if it's already in our
                        # database, because otherwise our code here would delete it from the database.
                        if boto_instance.id in instance_ids_by_region[region]:
                            instances_left.remove(
                                instances_by_ids[boto_instance.id])
                        else:
                            debug_not_updatable_continue.add(boto_instance.id)
                        continue

                    instance = None

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if not boto_instance.id in instance_ids_by_region[region]:
                        if not (
                            (state_code == INSTANCE_STATE['shutting-down']
                             or state_code == INSTANCE_STATE['terminated'])):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case, we try to load it directly from the database.
                            q = Instance.objects.filter(
                                ec2_instance_id=boto_instance.id)
                            if q:
                                instance = q[0]
                                logger.error(
                                    "[Pool %d] Instance with EC2 ID %s was reloaded from database."
                                    % (pool.id, boto_instance.id))
                            else:
                                logger.error(
                                    "[Pool %d] Instance with EC2 ID %s is not in our database."
                                    % (pool.id, boto_instance.id))

                                # Terminate at this point; we are running in an inconsistent state
                                assert (False)
                        debug_not_in_region[boto_instance.id] = state_code
                        continue

                    instance = instances_by_ids[boto_instance.id]
                    instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != state_code:
                        instance.status_code = state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except (boto.exception.EC2ResponseError,
                    boto.exception.BotoServerError, ssl.SSLError,
                    socket.error) as msg:
                logger.exception("%s: boto failure: %s" %
                                 ("update_pool_instances", msg))
                return 1

        if instances_left:
            for instance in instances_left:
                if not instance.ec2_instance_id in debug_boto_instance_ids_seen:
                    logger.info(
                        "[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2."
                        % (pool.id, instance.ec2_instance_id))

                if instance.ec2_instance_id in debug_not_updatable_continue:
                    logger.error(
                        "[Pool %d] Deleting instance with EC2 ID %s from our database because it is not updatable but not in our region."
                        % (pool.id, instance.ec2_instance_id))

                if instance.ec2_instance_id in debug_not_in_region:
                    logger.info(
                        "[Pool %d] Deleting instance with EC2 ID %s from our database, has state code %s on EC2"
                        % (pool.id, instance.ec2_instance_id,
                           debug_not_in_region[instance.ec2_instance_id]))

                logger.info(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database."
                    % (pool.id, instance.ec2_instance_id))
                instance.delete()
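
Both update variants finish by deleting database rows that no longer have a live counterpart on EC2. The variant above emits a separate log line for each applicable condition (never seen on EC2, not updatable, known state code outside the region) plus the final deletion message, so a single instance can produce several entries; Example #11 below collects the applicable reasons into a list and logs one consolidated message per deleted instance.
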
Example #10
        def start_instances_async(pool, config, count, images, region, zone,
                                  instances):
            userdata = LaniakeaCommandLine.handle_import_tags(
                config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(
                userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)

                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Configuration error: Failed to compile userdata"
                entry.save()

                for instance in instances:
                    instance.delete()

                raise RuntimeError(
                    "start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." %
                            (pool.id, count))
                boto_instances = cluster.create_spot(
                    config.ec2_max_price,
                    tags=config.ec2_tags,
                    delete_on_termination=True,
                    timeout=20 * 60)
                canceled_requests = count - len(boto_instances)

                logger.info(
                    "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
                    % (pool.id, len(boto_instances), canceled_requests))

                for i in range(0, len(boto_instances)):
                    instances[i].hostname = boto_instances[i].public_dns_name
                    instances[i].ec2_instance_id = boto_instances[i].id
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    instances[
                        i].status_code = boto_instances[i].state_code & 255
                    instances[i].save()

                    assert (instances[i].ec2_instance_id != None)

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instances[i].add_tag("SpotManager-Updatable", "1")

                if canceled_requests > 0:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info(
                            "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                            % (pool.id, instances[i].pk))
                        instances[i].delete()

                # Delete certain warnings we might have created earlier that no longer apply

                # If we ever exceeded the maximum spot instance count, we can clear
                # the warning now because we obviously succeeded in starting some instances.
                PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded']).delete()

                # The same holds for temporary failures of any sort
                PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE['temporary-failure']).delete()

                # Do not delete unclassified errors here for now, so the user can see them.

            except (boto.exception.EC2ResponseError,
                    boto.exception.BotoServerError, ssl.SSLError,
                    socket.error) as msg:
                if "MaxSpotInstanceCountExceeded" in str(msg):
                    logger.warning(
                        "[Pool %d] Maximum instance count exceeded for region %s"
                        % (pool.id, region))
                    if not PoolStatusEntry.objects.filter(
                            pool=pool,
                            type=POOL_STATUS_ENTRY_TYPE[
                                'max-spot-instance-count-exceeded']):
                        entry = PoolStatusEntry()
                        entry.pool = pool
                        entry.type = POOL_STATUS_ENTRY_TYPE[
                            'max-spot-instance-count-exceeded']
                        entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                        entry.save()
                elif "Service Unavailable" in str(msg):
                    logger.warning(
                        "[Pool %d] Temporary failure in region %s: %s" %
                        (pool.id, region, msg))
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                    entry.msg = "Temporary failure occurred: %s" % msg
                    entry.save()
                else:
                    logger.exception("[Pool %d] %s: boto failure: %s" %
                                     (pool.id, "start_instances_async", msg))
                    entry = PoolStatusEntry()
                    entry.type = 0
                    entry.pool = pool
                    entry.isCritical = True
                    entry.msg = "Unclassified error occurred: %s" % msg
                    entry.save()

                # Delete all pending instances, assuming that an exception from laniakea
                # means that all instance requests failed.
                for instance in instances:
                    instance.delete()

                return
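
The variant above blocks inside cluster.create_spot with a 20-minute timeout and only records instances once the spot requests have been fulfilled. Examples #11 and #12 take the asynchronous route instead: create_spot_requests (Example #12) returns request IDs immediately, each ID is stored as an Instance with status_code = INSTANCE_STATE['requested'], and a later update pass (Example #11) resolves those requests via cluster.check_spot_requests.
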
Example #11
def _update_pool_instances(pool, config):
    """Check the state of the instances in a pool and update it in the database"""
    from .models import Instance, INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE
    instances = Instance.objects.filter(pool=pool)
    instance_ids_by_region = _get_instance_ids_by_region(instances)
    instances_by_ids = _get_instances_by_ids(instances)
    instances_left = []
    instances_created = False

    debug_boto_instance_ids_seen = set()
    debug_not_updatable_continue = set()
    debug_not_in_region = {}

    for instance in instances_by_ids.values():
        if instance.status_code != INSTANCE_STATE['requested']:
            instances_left.append(instance)

    # Set the pool ID tag on the config now, in case we need to tag fulfilled spot requests below
    config.ec2_tags[SPOTMGR_TAG + '-PoolId'] = str(pool.pk)

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region,
                            aws_access_key_id=config.aws_access_key_id,
                            aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()

            logger.exception(
                "[Pool %d] update_pool_instances: laniakea failure: %s",
                pool.id, msg)
            return

        try:
            # first check status of pending spot requests
            requested = []
            for instance_id in instance_ids_by_region[region]:
                if instances_by_ids[instance_id].status_code == INSTANCE_STATE[
                        'requested']:
                    requested.append(instance_id)

            if requested:
                boto_results = cluster.check_spot_requests(
                    requested, config.ec2_tags)

                for req_id, result in zip(requested, boto_results):
                    instance = instances_by_ids[req_id]

                    if isinstance(result, boto.ec2.instance.Instance):
                        logger.info(
                            "[Pool %d] spot request fulfilled %s -> %s",
                            pool.id, req_id, result.id)

                        # spot request has been fulfilled
                        instance.hostname = result.public_dns_name
                        instance.ec2_instance_id = result.id
                        # state_code is a 16-bit value where the high byte is
                        # an opaque internal value and should be ignored.
                        instance.status_code = result.state_code & 255
                        instance.save()

                        # update local data structures to use the new instances instead
                        del instances_by_ids[req_id]
                        instances_by_ids[result.id] = instance
                        instance_ids_by_region[region].append(result.id)
                        # don't add it to instances_left yet to avoid race with adding tags

                        # Now that we saved the object into our database, mark the instance as updatable
                        # so our update code can pick it up and update it accordingly when it changes states
                        result.add_tag(SPOTMGR_TAG + "-Updatable", "1")

                        instances_created = True

                    # request object is returned in case request is closed/cancelled/failed
                    elif isinstance(
                            result,
                            boto.ec2.spotinstancerequest.SpotInstanceRequest):
                        if result.state in {"cancelled", "closed"}:
                            # this is normal, remove from DB and move on
                            logger.info("[Pool %d] spot request %s is %s",
                                        pool.id, req_id, result.state)
                            instances_by_ids[req_id].delete()
                        elif result.state in {"open", "active"}:
                            # this should not happen! warn and leave in DB in case it's fulfilled later
                            logger.warning(
                                "[Pool %d] Request %s is %s and %s.", pool.id,
                                req_id, result.status.code, result.state)
                        else:  # state=failed
                            msg = "Request %s is %s and %s." % (
                                req_id, result.status.code, result.state)

                            entry = PoolStatusEntry()
                            entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
                            entry.pool = pool
                            entry.msg = str(msg)
                            entry.isCritical = True
                            entry.save()

                            logger.error("[Pool %d] %s", pool.id, msg)
                            instances_by_ids[req_id].delete()

                    elif result is None:
                        logger.info("[Pool %d] spot request %s is still open",
                                    pool.pk, req_id)

                    else:
                        logger.warning("[Pool %d] spot request %s returned %s",
                                       pool.pk, req_id,
                                       type(result).__name__)

            boto_instances = cluster.find(
                filters={"tag:" + SPOTMGR_TAG + "-PoolId": str(pool.pk)})

            for boto_instance in boto_instances:
                # Store ID seen for debugging purposes
                debug_boto_instance_ids_seen.add(boto_instance.id)

                # state_code is a 16-bit value where the high byte is
                # an opaque internal value and should be ignored.
                state_code = boto_instance.state_code & 255

                if (SPOTMGR_TAG + "-Updatable" not in boto_instance.tags
                        or int(boto_instance.tags[SPOTMGR_TAG + "-Updatable"])
                        <= 0):
                    # The instance is not marked as updatable. We must not touch it because
                    # a spawning thread is still managing this instance. However, we must also
                    # remove this instance from the instances_left list if it's already in our
                    # database, because otherwise our code here would delete it from the database.
                    if boto_instance.id in instance_ids_by_region[region]:
                        instances_left.remove(
                            instances_by_ids[boto_instance.id])
                    else:
                        debug_not_updatable_continue.add(boto_instance.id)
                    continue

                instance = None

                # Whenever we see an instance that is not in our instance list for that region,
                # make sure it's a terminated instance because we should never have a running
                # instance that matches the search above but is not in our database.
                if boto_instance.id not in instance_ids_by_region[region]:
                    if state_code not in [
                            INSTANCE_STATE['shutting-down'],
                            INSTANCE_STATE['terminated']
                    ]:

                        # As a last resort, try to find the instance in our database.
                        # If the instance was saved to our database between the entrance
                        # to this function and the search query sent to EC2, then the instance
                        # will not be in our instances list but returned by EC2. In this
                        # case, we try to load it directly from the database.
                        q = Instance.objects.filter(
                            ec2_instance_id=boto_instance.id)
                        if q:
                            instance = q[0]
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s was reloaded from database.",
                                pool.id, boto_instance.id)
                        else:
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s is not in our database.",
                                pool.id, boto_instance.id)

                            # Terminate at this point; we are running in an inconsistent state
                            assert (False)
                    debug_not_in_region[boto_instance.id] = state_code
                    continue

                instance = instances_by_ids[boto_instance.id]
                instances_left.remove(instance)

                # Check the status code and update if necessary
                if instance.status_code != state_code:
                    instance.status_code = state_code
                    instance.save()

                # If for some reason we don't have a hostname yet,
                # update it accordingly.
                if not instance.hostname:
                    instance.hostname = boto_instance.public_dns_name
                    instance.save()

        except (boto.exception.EC2ResponseError,
                boto.exception.BotoServerError, ssl.SSLError,
                socket.error) as msg:
            if "MaxSpotInstanceCountExceeded" in str(msg):
                logger.warning(
                    "[Pool %d] update_pool_instances: Maximum instance count exceeded for region %s",
                    pool.id, region)
                if not PoolStatusEntry.objects.filter(
                        pool=pool,
                        type=POOL_STATUS_ENTRY_TYPE[
                            'max-spot-instance-count-exceeded']):
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded']
                    entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                    entry.save()
            elif "Service Unavailable" in str(msg):
                logger.warning(
                    "[Pool %d] update_pool_instances: Temporary failure in region %s: %s",
                    pool.id, region, msg)
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                entry.msg = "Temporary failure occurred: %s" % msg
                entry.save()
            else:
                logger.exception(
                    "[Pool %d] update_pool_instances: boto failure: %s",
                    pool.id, msg)
                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Unclassified error occurred: %s" % msg
                entry.save()
            return

    for instance in instances_left:
        reasons = []

        if instance.ec2_instance_id not in debug_boto_instance_ids_seen:
            reasons.append("no corresponding machine on EC2")

        if instance.ec2_instance_id in debug_not_updatable_continue:
            reasons.append("not updatable")

        if instance.ec2_instance_id in debug_not_in_region:
            reasons.append("has state code %s on EC2 but not in our region" %
                           debug_not_in_region[instance.ec2_instance_id])

        if not reasons:
            reasons.append("?")

        logger.info(
            "[Pool %d] Deleting instance with EC2 ID %s from our database: %s",
            pool.id, instance.ec2_instance_id, ", ".join(reasons))
        instance.delete()

    if instances_created:
        # Delete certain warnings we might have created earlier that no longer apply

        # If we ever exceeded the maximum spot instance count, we can clear
        # the warning now because we obviously succeeded in starting some instances.
        PoolStatusEntry.objects.filter(
            pool=pool,
            type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']
        ).delete()

        # The same holds for temporary failures of any sort
        PoolStatusEntry.objects.filter(
            pool=pool,
            type=POOL_STATUS_ENTRY_TYPE['temporary-failure']).delete()
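
Example #11 above and Example #12 below build their tag names from a SPOTMGR_TAG constant rather than hard-coding them. Judging from the literal "SpotManager-PoolId" and "SpotManager-Updatable" tags used in the other variants, it is presumably defined as follows (an assumption; the definition is not shown in the source):

SPOTMGR_TAG = "SpotManager"  # assumed value, matching the hard-coded tags elsewhere
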
Example #12
def _start_pool_instances(pool, config, count=1):
    """ Start an instance with the given configuration """
    from .models import Instance, INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE
    images = _create_laniakea_images(config)

    # Figure out where to put our instances
    try:
        (region, zone, instance_type, rejected) = _get_best_region_zone(config)
    except (boto.exception.EC2ResponseError, boto.exception.BotoServerError,
            ssl.SSLError, socket.error):
        # In case of temporary failures here, we will simply retry in the next cycle
        logger.warning("[Pool %d] Failed to acquire spot instance prices: %s.",
                       pool.id, traceback.format_exc())
        return
    except RuntimeError:
        logger.error("[Pool %d] Failed to compile userdata.", pool.id)
        entry = PoolStatusEntry()
        entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
        entry.pool = pool
        entry.isCritical = True
        entry.msg = "Configuration error: %s" % traceback.format_exc()
        entry.save()
        return

    priceLowEntries = PoolStatusEntry.objects.filter(
        pool=pool, type=POOL_STATUS_ENTRY_TYPE['price-too-low'])

    if not region:
        logger.warning(
            "[Pool %d] No allowed region was cheap enough to spawn instances.",
            pool.id)

        if not priceLowEntries:
            entry = PoolStatusEntry()
            entry.pool = pool
            entry.type = POOL_STATUS_ENTRY_TYPE['price-too-low']
            entry.msg = "No allowed region was cheap enough to spawn instances."
            for zone in rejected:
                entry.msg += "\n%s at %s" % (zone, rejected[zone])
            entry.save()
        return
    else:
        if priceLowEntries:
            priceLowEntries.delete()

    logger.debug(
        "[Pool %d] Using instance type %s in region %s with availability zone %s.",
        pool.id, instance_type, region, zone)

    try:
        userdata = LaniakeaCommandLine.handle_import_tags(
            config.ec2_userdata.decode('utf-8'))

        # Copy the userdata_macros and populate with internal variables
        ec2_userdata_macros = dict(config.ec2_userdata_macros)
        ec2_userdata_macros["EC2SPOTMANAGER_POOLID"] = str(pool.id)

        userdata = LaniakeaCommandLine.handle_tags(userdata,
                                                   ec2_userdata_macros)
        if not userdata:
            logger.error("[Pool %d] Failed to compile userdata.", pool.id)

            entry = PoolStatusEntry()
            entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
            entry.pool = pool
            entry.isCritical = True
            entry.msg = "Configuration error: Failed to compile userdata"
            entry.save()

            raise RuntimeError(
                "start_pool_instances: Failed to compile userdata")

        images["default"]['user_data'] = userdata.encode("utf-8")
        images["default"]['placement'] = zone
        images["default"]['count'] = count
        images["default"]['instance_type'] = instance_type

        cluster = Laniakea(images)
        try:
            cluster.connect(region=region,
                            aws_access_key_id=config.aws_access_key_id,
                            aws_secret_access_key=config.aws_secret_access_key)
        except ssl.SSLError as msg:
            logger.warning(
                "[Pool %d] start_pool_instances: Temporary failure in region %s: %s",
                pool.id, region, msg)
            entry = PoolStatusEntry()
            entry.pool = pool
            entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
            entry.msg = "Temporary failure occurred: %s" % msg
            entry.save()

            return

        except Exception as msg:
            logger.exception(
                "[Pool %d] start_pool_instances: laniakea failure: %s",
                pool.id, msg)

            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()

            return

        try:
            logger.info("[Pool %d] Creating %d instances...", pool.id, count)
            for ec2_request in cluster.create_spot_requests(
                    config.ec2_max_price,
                    delete_on_termination=True,
                    timeout=10 * 60):
                instance = Instance()
                instance.ec2_instance_id = ec2_request
                instance.ec2_region = region
                instance.ec2_zone = zone
                instance.status_code = INSTANCE_STATE["requested"]
                instance.pool = pool
                instance.save()

        except (boto.exception.EC2ResponseError,
                boto.exception.BotoServerError, ssl.SSLError,
                socket.error) as msg:
            if "MaxSpotInstanceCountExceeded" in str(msg):
                logger.warning(
                    "[Pool %d] start_pool_instances: Maximum instance count exceeded for region %s",
                    pool.id, region)
                if not PoolStatusEntry.objects.filter(
                        pool=pool,
                        type=POOL_STATUS_ENTRY_TYPE[
                            'max-spot-instance-count-exceeded']):
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded']
                    entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                    entry.save()
            elif "Service Unavailable" in str(msg):
                logger.warning(
                    "[Pool %d] start_pool_instances: Temporary failure in region %s: %s",
                    pool.id, region, msg)
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                entry.msg = "Temporary failure occurred: %s" % msg
                entry.save()
            else:
                logger.exception(
                    "[Pool %d] start_pool_instances: boto failure: %s",
                    pool.id, msg)
                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Unclassified error occurred: %s" % msg
                entry.save()

    except Exception as msg:
        logger.exception(
            "[Pool %d] start_pool_instances: unhandled failure: %s", pool.id,
            msg)
        raise
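
_start_pool_instances only starts count instances; how that count is chosen is outside this example. A hedged usage sketch, assuming a hypothetical desired pool size and the Instance model used elsewhere in these examples:

from .models import Instance

def _top_up_pool(pool, config, desired_size):
    # Hypothetical caller: start only as many spot instances as are missing.
    # desired_size and this top-up policy are assumptions for illustration.
    missing = desired_size - Instance.objects.filter(pool=pool).count()
    if missing > 0:
        _start_pool_instances(pool, config, count=missing)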
def start_instances_async(pool, config, count, images, region, zone, instances):
    userdata = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
    userdata = LaniakeaCommandLine.handle_tags(userdata, config.ec2_userdata_macros)
    if not userdata:
        logger.error("[Pool %d] Failed to compile userdata." % pool.id)
        raise RuntimeError("start_instances_async: Failed to compile userdata")

    images["default"]['user_data'] = userdata
    images["default"]['placement'] = zone
    images["default"]['count'] = count

    cluster = Laniakea(images)
    try:
        cluster.connect(region=region,
                        aws_access_key_id=config.aws_access_key_id,
                        aws_secret_access_key=config.aws_secret_access_key)
    except Exception as msg:
        logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", msg))

        # Log this error to the pool status messages
        entry = PoolStatusEntry()
        entry.pool = pool
        entry.msg = str(msg)
        entry.isCritical = True
        entry.save()

        # Delete all pending instances as we failed to create them
        for instance in instances:
            instance.delete()

        return

    config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

    try:
        logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
        (boto_instances, boto_pending) = cluster.create_spot(config.ec2_max_price,
                                                             tags=config.ec2_tags,
                                                             delete_on_termination=True,
                                                             timeout=20 * 60)

        logger.info("[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
                    % (pool.id, len(boto_instances), len(boto_pending)))

        assert (len(boto_instances) + len(boto_pending)) == len(instances) == count

        for i in range(len(boto_instances)):
            instances[i].hostname = boto_instances[i].public_dns_name
            instances[i].ec2_instance_id = boto_instances[i].id
            instances[i].status_code = boto_instances[i].state_code
            instances[i].save()

            assert instances[i].ec2_instance_id is not None

            # Now that we saved the object into our database, mark the instance as updatable
            # so our update code can pick it up and update it accordingly when it changes state
            boto_instances[i].add_tag("SpotManager-Updatable", "1")

        if boto_pending:
            for i in range(len(boto_instances), count):
                # Delete instances belonging to canceled spot requests
                logger.info("[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                            % (pool.id, instances[i].pk))
                instances[i].delete()

    except boto.exception.EC2ResponseError as msg:
        logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", msg))
        return
    def update_pool_instances(self, pool, config):
        """ Check the state of the instances in a pool and update it in the database """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)
        instances_left = []

        for instance_id in instances_by_ids:
            if instance_id:
                instances_left.append(instances_by_ids[instance_id])

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "update_pool_instances", msg))
                return None

            try:
                boto_instances = cluster.find(
                    filters={
                        "tag:SpotManager-PoolId": str(pool.pk),
                        "tag:SpotManager-Updatable": "1"
                    })

                for boto_instance in boto_instances:
                    instance = None

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if boto_instance.id not in instance_ids_by_region[region]:
                        if not (boto_instance.state_code
                                == INSTANCE_STATE['shutting-down']
                                or boto_instance.state_code
                                == INSTANCE_STATE['terminated']):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case, we try to load it directly from the database.
                            q = Instance.objects.filter(
                                ec2_instance_id=boto_instance.id)
                            if q:
                                instance = q[0]
                            else:
                                logger.error(
                                    "[Pool %d] Instance with EC2 ID %s is not in our database."
                                    % (pool.id, boto_instance.id))

                                # Terminate at this point, as we are running in an inconsistent state
                                assert False

                        continue

                    if not instance:
                        instance = instances_by_ids[boto_instance.id]
                        instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != boto_instance.state_code:
                        instance.status_code = boto_instance.state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except boto.exception.EC2ResponseError as msg:
                logger.exception("%s: boto failure: %s" %
                                 ("update_pool_instances", msg))
                return 1

        if instances_left:
            for instance in instances_left:
                logger.info(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2."
                    % (pool.id, instance.ec2_instance_id))
                instance.delete()
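
The state-code comparisons above (and in the other examples) rely on EC2's numeric instance states. For reference, a hypothetical INSTANCE_STATE mapping consistent with EC2's published codes; the project's actual values live in .models and may differ, and the "requested" entry is purely an assumption for the internal pre-launch state:

INSTANCE_STATE = {
    "requested": -1,      # internal placeholder state, value assumed
    "pending": 0,         # EC2 documented state codes follow
    "running": 16,
    "shutting-down": 32,
    "terminated": 48,
    "stopping": 64,
    "stopped": 80,
}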
        def start_instances_async(pool, config, count, images, region, zone,
                                  instances):
            userdata = LaniakeaCommandLine.handle_import_tags(
                config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(
                userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)
                raise RuntimeError(
                    "start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." %
                            (pool.id, count))
                (boto_instances, boto_pending) = cluster.create_spot(
                    config.ec2_max_price,
                    tags=config.ec2_tags,
                    delete_on_termination=True,
                    timeout=20 * 60)

                logger.info(
                    "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
                    % (pool.id, len(boto_instances), len(boto_pending)))

                assert (len(boto_instances) +
                        len(boto_pending)) == len(instances) == count

                for i in range(0, len(boto_instances)):
                    instances[i].hostname = boto_instances[i].public_dns_name
                    instances[i].ec2_instance_id = boto_instances[i].id
                    instances[i].status_code = boto_instances[i].state_code
                    instances[i].save()

                    assert instances[i].ec2_instance_id is not None

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instances[i].add_tag("SpotManager-Updatable", "1")

                if boto_pending:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info(
                            "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                            % (pool.id, instances[i].pk))
                        instances[i].delete()

            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" %
                                 (pool.id, "start_instances_async", msg))
                return
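
The assertion len(boto_instances) + len(boto_pending) == len(instances) == count implies the caller pre-creates one placeholder Instance row per requested machine before handing the list to start_instances_async. A hypothetical caller-side sketch; the helper name, the threading call, and the import are assumptions, while the Instance fields mirror those used in the examples above:

import threading

from .models import Instance, INSTANCE_STATE

def _request_instances(pool, config, count, images, region, zone):
    # Pre-create placeholder rows so the async worker can fill them in later.
    instances = []
    for _ in range(count):
        instance = Instance()
        instance.ec2_region = region
        instance.ec2_zone = zone
        instance.status_code = INSTANCE_STATE["requested"]
        instance.pool = pool
        instance.save()
        instances.append(instance)

    # The "async" suffix suggests the blocking spot-request work runs off the
    # main thread; a plain Thread is used here only to sketch that idea.
    threading.Thread(target=start_instances_async,
                     args=(pool, config, count, images, region, zone,
                           instances)).start()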