def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
    """Terminate the EC2 instances of a pool, one region at a time.

    The instances are grouped by region through
    ``self.get_instance_ids_by_region`` and a separate Laniakea connection
    is opened for every region.

    Args:
        pool: Pool object; ``pool.id`` is used in log messages and
            ``pool.pk`` in the EC2 tag filter.
        instances: Instances handed to ``get_instance_ids_by_region``.
        config: Object providing ``aws_access_key_id`` and
            ``aws_secret_access_key``.
        terminateByPool: If true, terminate every EC2 instance tagged with
            this pool's ID instead of only the given instance IDs.

    Returns:
        ``None`` when every region was processed or when connecting to a
        region failed; ``1`` when boto raised an ``EC2ResponseError``. In
        both failure cases the remaining regions are not processed.
    """
    instance_ids_by_region = self.get_instance_ids_by_region(instances)

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region,
                            aws_access_key_id=config.aws_access_key_id,
                            aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            logger.exception("[Pool %d] %s: laniakea failure: %s",
                             pool.id, "terminate_pool_instances", msg)
            return None

        try:
            if terminateByPool:
                boto_instances = cluster.find(
                    filters={"tag:SpotManager-PoolId": str(pool.pk)})

                # Data consistency checks. These are reported through the
                # logger rather than ``assert``: assertions vanish under
                # ``python -O`` and a failed one would abort the termination
                # of instances that carry this pool's tag.
                for boto_instance in boto_instances:
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255
                    if not (boto_instance.id in instance_ids_by_region[region]
                            or state_code == INSTANCE_STATE['shutting-down']
                            or state_code == INSTANCE_STATE['terminated']):
                        logger.error(
                            "[Pool %d] Instance with EC2 ID %s (status %d) "
                            "is not in region list for region %s",
                            pool.id, boto_instance.id, state_code, region)

                cluster.terminate(boto_instances)
            else:
                logger.info("[Pool %d] Terminating %s instances in region %s",
                            pool.id, len(instance_ids_by_region[region]),
                            region)
                cluster.terminate(
                    cluster.find(instance_ids=instance_ids_by_region[region]))
        except boto.exception.EC2ResponseError as msg:
            logger.exception("[Pool %d] %s: boto failure: %s",
                             pool.id, "terminate_pool_instances", msg)
            return 1
예제 #2
0
def _terminate_pool_instances(pool, instances, config, terminateByPool=False):
    """Shut down the EC2 instances of a pool, region by region.

    Instances are grouped per region with ``_get_instance_ids_by_region``.
    For each region a Laniakea connection is opened; a connection failure is
    stored as a critical ``PoolStatusEntry`` on the pool, logged, and ends
    the call with ``None``. With ``terminateByPool`` set, every EC2 instance
    carrying the pool's tag is terminated (unexpected ones are logged as
    errors first); otherwise only the given instance IDs are terminated.
    A boto/SSL/socket error is logged and ends the call with ``1``.
    """
    from .models import INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE

    ids_by_region = _get_instance_ids_by_region(instances)
    recoverable_errors = (
        boto.exception.EC2ResponseError,
        boto.exception.BotoServerError,
        ssl.SSLError,
        socket.error,
    )

    for region in ids_by_region:
        ec2 = Laniakea(None)
        try:
            ec2.connect(
                region=region,
                aws_access_key_id=config.aws_access_key_id,
                aws_secret_access_key=config.aws_secret_access_key,
            )
        except Exception as exc:
            # Surface the connection problem in the pool status messages.
            status = PoolStatusEntry()
            status.type = POOL_STATUS_ENTRY_TYPE['unclassified']
            status.pool = pool
            status.msg = str(exc)
            status.isCritical = True
            status.save()

            logger.exception(
                "[Pool %d] terminate_pool_instances: laniakea failure: %s",
                pool.id, exc)
            return None

        try:
            if not terminateByPool:
                logger.info(
                    "[Pool %d] Terminating %s instances in region %s",
                    pool.id, len(ids_by_region[region]), region)
                selected = ec2.find(instance_ids=ids_by_region[region])
                ec2.terminate(selected)
            else:
                pool_tag = "tag:" + SPOTMGR_TAG + "-PoolId"
                tagged = ec2.find(filters={pool_tag: str(pool.pk)})

                # Sanity pass: report tagged instances we do not know about
                # unless they are already on their way out.
                for candidate in tagged:
                    # Only the low byte of state_code carries the state; the
                    # high byte is an opaque internal value.
                    state_code = candidate.state_code & 255
                    if candidate.id in ids_by_region[region]:
                        continue
                    if state_code == INSTANCE_STATE['shutting-down']:
                        continue
                    if state_code == INSTANCE_STATE['terminated']:
                        continue
                    logger.error(
                        "[Pool %d] Instance with EC2 ID %s (status %d) is "
                        "not in region list for region %s",
                        pool.id, candidate.id, state_code, region)

                ec2.terminate(tagged)
        except recoverable_errors as exc:
            logger.exception(
                "[Pool %d] terminate_pool_instances: boto failure: %s",
                pool.id, exc)
            return 1
    def terminate_pool_instances(self,
                                 pool,
                                 instances,
                                 config,
                                 terminateByPool=False):
        """Terminate the EC2 instances of a pool, one region at a time.

        Args:
            pool: Pool object; ``pool.id`` is used in log messages and
                ``pool.pk`` in the EC2 tag filter.
            instances: Instances handed to
                ``self.get_instance_ids_by_region``.
            config: Object providing ``aws_access_key_id`` and
                ``aws_secret_access_key``.
            terminateByPool: If true, terminate every EC2 instance tagged
                with this pool's ID instead of only the given instance IDs.

        Returns:
            ``None`` when every region was processed or when connecting to a
            region failed (a critical ``PoolStatusEntry`` is saved in that
            case); ``1`` when boto raised an ``EC2ResponseError``. In both
            failure cases the remaining regions are not processed.
        """
        instance_ids_by_region = self.get_instance_ids_by_region(instances)

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s",
                                 pool.id, "terminate_pool_instances", msg)
                return None

            try:
                if terminateByPool:
                    boto_instances = cluster.find(
                        filters={"tag:SpotManager-PoolId": str(pool.pk)})

                    # Data consistency checks
                    for boto_instance in boto_instances:
                        # state_code is a 16-bit value where the high byte
                        # is an opaque internal value and should be ignored;
                        # comparing the raw value could misreport instances
                        # that are in fact shutting down or terminated.
                        state_code = boto_instance.state_code & 255
                        if not ((boto_instance.id
                                 in instance_ids_by_region[region]) or
                                (state_code
                                 == INSTANCE_STATE['shutting-down']
                                 or state_code
                                 == INSTANCE_STATE['terminated'])):
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s "
                                "(status %d) is not in region list for "
                                "region %s",
                                pool.id, boto_instance.id, state_code,
                                region)

                    cluster.terminate(boto_instances)
                else:
                    logger.info(
                        "[Pool %d] Terminating %s instances in region %s",
                        pool.id, len(instance_ids_by_region[region]), region)
                    cluster.terminate(
                        cluster.find(
                            instance_ids=instance_ids_by_region[region]))
            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s",
                                 pool.id, "terminate_pool_instances", msg)
                return 1
    def terminate_pool_instances(self,
                                 pool,
                                 instances,
                                 config,
                                 terminateByPool=False):
        """Terminate the EC2 instances of a pool, one region at a time.

        Args:
            pool: Pool object; ``pool.id`` is used in log messages and
                ``pool.pk`` in the EC2 tag filter.
            instances: Instances handed to
                ``self.get_instance_ids_by_region``.
            config: Object providing ``aws_access_key_id`` and
                ``aws_secret_access_key``.
            terminateByPool: If true, terminate every EC2 instance tagged
                with this pool's ID instead of only the given instance IDs.

        Returns:
            ``None`` when every region was processed or when connecting to a
            region failed; ``1`` when boto raised an ``EC2ResponseError``.
            In both failure cases the remaining regions are not processed.
        """
        instance_ids_by_region = self.get_instance_ids_by_region(instances)

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s",
                                 pool.id, "terminate_pool_instances", msg)
                return None

            try:
                if terminateByPool:
                    boto_instances = cluster.find(
                        filters={"tag:SpotManager-PoolId": str(pool.pk)})

                    # Data consistency checks. Reported through the logger
                    # instead of ``assert``: assertions are removed under
                    # ``python -O`` and a failing one would prevent the
                    # tagged instances from being terminated at all.
                    for boto_instance in boto_instances:
                        # state_code is a 16-bit value where the high byte
                        # is an opaque internal value and should be ignored.
                        state_code = boto_instance.state_code & 255
                        if not ((boto_instance.id
                                 in instance_ids_by_region[region])
                                or state_code
                                == INSTANCE_STATE['shutting-down']
                                or state_code
                                == INSTANCE_STATE['terminated']):
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s "
                                "(status %d) is not in region list for "
                                "region %s",
                                pool.id, boto_instance.id, state_code,
                                region)

                    cluster.terminate(boto_instances)
                else:
                    logger.info(
                        "[Pool %d] Terminating %s instances in region %s",
                        pool.id, len(instance_ids_by_region[region]), region)
                    cluster.terminate(
                        cluster.find(
                            instance_ids=instance_ids_by_region[region]))
            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s",
                                 pool.id, "terminate_pool_instances", msg)
                return 1
    def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
        """Shut down the EC2 instances of a pool, region by region.

        Instances are grouped per region with
        ``self.get_instance_ids_by_region``. A failed Laniakea connection is
        stored as a critical ``PoolStatusEntry``, logged, and ends the call
        with ``None``. With ``terminateByPool`` set, every EC2 instance
        carrying the pool's tag is terminated (unexpected ones are logged as
        errors first); otherwise only the given instance IDs are terminated.
        A boto/SSL/socket error is logged and ends the call with ``1``.
        """
        caller = "terminate_pool_instances"
        recoverable_errors = (
            boto.exception.EC2ResponseError,
            boto.exception.BotoServerError,
            ssl.SSLError,
            socket.error,
        )
        ids_by_region = self.get_instance_ids_by_region(instances)

        for region in ids_by_region:
            ec2 = Laniakea(None)
            try:
                ec2.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key,
                )
            except Exception as exc:
                # Surface the connection problem in the pool status messages.
                status = PoolStatusEntry()
                status.type = 0
                status.pool = pool
                status.msg = str(exc)
                status.isCritical = True
                status.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, caller, exc))
                return None

            try:
                if not terminateByPool:
                    logger.info("[Pool %d] Terminating %s instances in region %s"
                                % (pool.id, len(ids_by_region[region]), region))
                    selected = ec2.find(instance_ids=ids_by_region[region])
                    ec2.terminate(selected)
                else:
                    tagged = ec2.find(filters={"tag:SpotManager-PoolId": str(pool.pk)})

                    # Sanity pass: report tagged instances we do not know
                    # about unless they are already on their way out.
                    for candidate in tagged:
                        # Only the low byte of state_code carries the state;
                        # the high byte is an opaque internal value.
                        state_code = candidate.state_code & 255
                        if candidate.id in ids_by_region[region]:
                            continue
                        if state_code == INSTANCE_STATE['shutting-down']:
                            continue
                        if state_code == INSTANCE_STATE['terminated']:
                            continue
                        logger.error(
                            "[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s"
                            % (pool.id, candidate.id, state_code, region))

                    ec2.terminate(tagged)
            except recoverable_errors as exc:
                logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, caller, exc))
                return 1
    def update_pool_instances(self, pool, config):
        """Check the state of the instances in a pool and update it in the database.

        For every region that has instances of this pool in the database, the
        EC2 instances tagged with the pool's ID are fetched and compared with
        the database rows: status code and hostname are refreshed, and rows
        that were not matched by any EC2 instance are deleted at the end.

        Args:
            pool: Pool object; ``pool.id`` is used in log messages and
                ``pool.pk`` in the EC2 tag filter.
            config: Object providing ``aws_access_key_id`` and
                ``aws_secret_access_key``.

        Returns:
            ``None`` on completion or when connecting to a region failed (a
            critical ``PoolStatusEntry`` is saved in that case); ``1`` when a
            boto, SSL or socket error occurred. In both failure cases no
            database rows are deleted.

        Raises:
            AssertionError: An updatable, still-running EC2 instance carries
                this pool's tag but has no row in the database.
        """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)

        # Rows not (yet) matched by an EC2 instance; whatever is still in
        # this list after all regions were processed gets deleted.
        instances_left = [instances_by_ids[instance_id] for instance_id in instances_by_ids if instance_id]

        debug_boto_instance_ids_seen = set()
        debug_not_updatable_continue = set()
        debug_not_in_region = {}

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s", pool.id, "update_pool_instances", msg)
                return None

            try:
                boto_instances = cluster.find(filters={"tag:SpotManager-PoolId": str(pool.pk)})

                for boto_instance in boto_instances:
                    # Store ID seen for debugging purposes
                    debug_boto_instance_ids_seen.add(boto_instance.id)

                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255

                    if "SpotManager-Updatable" not in boto_instance.tags or int(boto_instance.tags["SpotManager-Updatable"]) <= 0:
                        # The instance is not marked as updatable. We must not touch it because
                        # a spawning thread is still managing this instance. However, we must also
                        # remove this instance from the instances_left list if it's already in our
                        # database, because otherwise our code here would delete it from the database.
                        if boto_instance.id in instance_ids_by_region[region]:
                            instances_left.remove(instances_by_ids[boto_instance.id])
                        else:
                            debug_not_updatable_continue.add(boto_instance.id)
                        continue

                    instance = None

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if boto_instance.id not in instance_ids_by_region[region]:
                        if not ((state_code == INSTANCE_STATE['shutting-down']
                                 or state_code == INSTANCE_STATE['terminated'])):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case, we try to load it directly from the database.
                            q = Instance.objects.filter(ec2_instance_id=boto_instance.id)
                            if q:
                                # NOTE(review): the reloaded row is not used any
                                # further; the ``continue`` below skips the update.
                                instance = q[0]
                                logger.error("[Pool %d] Instance with EC2 ID %s was reloaded from database.", pool.id, boto_instance.id)
                            else:
                                logger.error("[Pool %d] Instance with EC2 ID %s is not in our database.", pool.id, boto_instance.id)

                                # Terminate at this point, we run in an inconsistent state.
                                # Raised explicitly because a bare ``assert`` is removed
                                # under ``python -O`` and execution would silently continue.
                                raise AssertionError(
                                    "[Pool %d] Instance with EC2 ID %s is not in our database."
                                    % (pool.id, boto_instance.id))
                        debug_not_in_region[boto_instance.id] = state_code
                        continue

                    instance = instances_by_ids[boto_instance.id]
                    instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != state_code:
                        instance.status_code = state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
                logger.exception("%s: boto failure: %s", "update_pool_instances", msg)
                return 1

        # Everything still in instances_left was not matched above; log why
        # (as far as the debug bookkeeping can tell) and delete the row.
        for instance in instances_left:
            if instance.ec2_instance_id not in debug_boto_instance_ids_seen:
                logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2.", pool.id, instance.ec2_instance_id)

            if instance.ec2_instance_id in debug_not_updatable_continue:
                logger.error("[Pool %d] Deleting instance with EC2 ID %s from our database because it is not updatable but not in our region.", pool.id, instance.ec2_instance_id)

            if instance.ec2_instance_id in debug_not_in_region:
                logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has state code %s on EC2", pool.id, instance.ec2_instance_id, debug_not_in_region[instance.ec2_instance_id])

            logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database.", pool.id, instance.ec2_instance_id)
            instance.delete()
 def update_pool_instances(self, pool, config):
     """Check the state of the instances in a pool and update it in the database.

     For every region that has instances of this pool in the database, the
     EC2 instances tagged with the pool's ID and marked as updatable are
     fetched and compared with the database rows: status code and hostname
     are refreshed, and rows that were not matched by any EC2 instance are
     deleted at the end.

     Args:
         pool: Pool object; ``pool.id`` is used in log messages and
             ``pool.pk`` in the EC2 tag filter.
         config: Object providing ``aws_access_key_id`` and
             ``aws_secret_access_key``.

     Returns:
         ``None`` on completion or when connecting to a region failed (a
         critical ``PoolStatusEntry`` is saved in that case); ``1`` when boto
         raised an ``EC2ResponseError``. In both failure cases no database
         rows are deleted.

     Raises:
         AssertionError: A still-running EC2 instance matches the tag filter
             but has no row in the database.
     """
     instances = Instance.objects.filter(pool=pool)
     instance_ids_by_region = self.get_instance_ids_by_region(instances)
     instances_by_ids = self.get_instances_by_ids(instances)

     # Rows not (yet) matched by an EC2 instance; whatever is still in this
     # list after all regions were processed gets deleted.
     instances_left = [instances_by_ids[instance_id] for instance_id in instances_by_ids if instance_id]

     for region in instance_ids_by_region:
         cluster = Laniakea(None)
         try:
             cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
         except Exception as msg:
             # Log this error to the pool status messages
             entry = PoolStatusEntry()
             entry.type = 0
             entry.pool = pool
             entry.msg = str(msg)
             entry.isCritical = True
             entry.save()

             logger.exception("[Pool %d] %s: laniakea failure: %s", pool.id, "update_pool_instances", msg)
             return None

         try:
             boto_instances = cluster.find(filters={"tag:SpotManager-PoolId": str(pool.pk), "tag:SpotManager-Updatable": "1"})

             for boto_instance in boto_instances:
                 instance = None

                 # state_code is a 16-bit value where the high byte is
                 # an opaque internal value and should be ignored.
                 state_code = boto_instance.state_code & 255

                 # Whenever we see an instance that is not in our instance list for that region,
                 # make sure it's a terminated instance because we should never have a running
                 # instance that matches the search above but is not in our database.
                 if boto_instance.id not in instance_ids_by_region[region]:
                     if not ((state_code == INSTANCE_STATE['shutting-down']
                              or state_code == INSTANCE_STATE['terminated'])):

                         # As a last resort, try to find the instance in our database.
                         # If the instance was saved to our database between the entrance
                         # to this function and the search query sent to EC2, then the instance
                         # will not be in our instances list but returned by EC2. In this
                         # case, we try to load it directly from the database.
                         q = Instance.objects.filter(ec2_instance_id=boto_instance.id)
                         if q:
                             # NOTE(review): the reloaded row is not used any
                             # further; the ``continue`` below skips the update.
                             instance = q[0]
                         else:
                             logger.error("[Pool %d] Instance with EC2 ID %s is not in our database.", pool.id, boto_instance.id)

                             # Terminate at this point, we run in an inconsistent state.
                             # Raised explicitly because a bare ``assert`` is removed
                             # under ``python -O`` and execution would silently continue.
                             raise AssertionError(
                                 "[Pool %d] Instance with EC2 ID %s is not in our database."
                                 % (pool.id, boto_instance.id))

                     continue

                 if not instance:
                     instance = instances_by_ids[boto_instance.id]
                     instances_left.remove(instance)

                 # Check the status code and update if necessary
                 if instance.status_code != state_code:
                     instance.status_code = state_code
                     instance.save()

                 # If for some reason we don't have a hostname yet,
                 # update it accordingly.
                 if not instance.hostname:
                     instance.hostname = boto_instance.public_dns_name
                     instance.save()

         except boto.exception.EC2ResponseError as msg:
             logger.exception("%s: boto failure: %s", "update_pool_instances", msg)
             return 1

     # Everything still in instances_left was not matched above.
     for instance in instances_left:
         logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2.", pool.id, instance.ec2_instance_id)
         instance.delete()
예제 #8
0
    def update_pool_instances(self, pool, config):
        """Check the state of the instances in a pool and update it in the database.

        For every region that has instances of this pool in the database, the
        EC2 instances tagged with the pool's ID are fetched and compared with
        the database rows: status code and hostname are refreshed, and rows
        that were not matched by any EC2 instance are deleted at the end.

        Args:
            pool: Pool object; ``pool.id`` is used in log messages and
                ``pool.pk`` in the EC2 tag filter.
            config: Object providing ``aws_access_key_id`` and
                ``aws_secret_access_key``.

        Returns:
            ``None`` on completion or when connecting to a region failed (a
            critical ``PoolStatusEntry`` is saved in that case); ``1`` when a
            boto, SSL or socket error occurred. In both failure cases no
            database rows are deleted.

        Raises:
            AssertionError: An updatable, still-running EC2 instance carries
                this pool's tag but has no row in the database.
        """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)

        # Rows not (yet) matched by an EC2 instance; whatever is still in
        # this list after all regions were processed gets deleted.
        instances_left = [
            instances_by_ids[instance_id] for instance_id in instances_by_ids
            if instance_id
        ]

        debug_boto_instance_ids_seen = set()
        debug_not_updatable_continue = set()
        debug_not_in_region = {}

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s",
                                 pool.id, "update_pool_instances", msg)
                return None

            try:
                boto_instances = cluster.find(
                    filters={"tag:SpotManager-PoolId": str(pool.pk)})

                for boto_instance in boto_instances:
                    # Store ID seen for debugging purposes
                    debug_boto_instance_ids_seen.add(boto_instance.id)

                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255

                    if "SpotManager-Updatable" not in boto_instance.tags or int(
                            boto_instance.tags["SpotManager-Updatable"]) <= 0:
                        # The instance is not marked as updatable. We must not touch it because
                        # a spawning thread is still managing this instance. However, we must also
                        # remove this instance from the instances_left list if it's already in our
                        # database, because otherwise our code here would delete it from the database.
                        if boto_instance.id in instance_ids_by_region[region]:
                            instances_left.remove(
                                instances_by_ids[boto_instance.id])
                        else:
                            debug_not_updatable_continue.add(boto_instance.id)
                        continue

                    instance = None

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if boto_instance.id not in instance_ids_by_region[region]:
                        if not (
                            (state_code == INSTANCE_STATE['shutting-down']
                             or state_code == INSTANCE_STATE['terminated'])):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case, we try to load it directly from the database.
                            q = Instance.objects.filter(
                                ec2_instance_id=boto_instance.id)
                            if q:
                                # NOTE(review): the reloaded row is not used any
                                # further; the ``continue`` below skips the update.
                                instance = q[0]
                                logger.error(
                                    "[Pool %d] Instance with EC2 ID %s was reloaded from database.",
                                    pool.id, boto_instance.id)
                            else:
                                logger.error(
                                    "[Pool %d] Instance with EC2 ID %s is not in our database.",
                                    pool.id, boto_instance.id)

                                # Terminate at this point, we run in an inconsistent state.
                                # Raised explicitly because a bare ``assert`` is removed
                                # under ``python -O`` and execution would silently continue.
                                raise AssertionError(
                                    "[Pool %d] Instance with EC2 ID %s is not in our database."
                                    % (pool.id, boto_instance.id))
                        debug_not_in_region[boto_instance.id] = state_code
                        continue

                    instance = instances_by_ids[boto_instance.id]
                    instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != state_code:
                        instance.status_code = state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except (boto.exception.EC2ResponseError,
                    boto.exception.BotoServerError, ssl.SSLError,
                    socket.error) as msg:
                logger.exception("%s: boto failure: %s",
                                 "update_pool_instances", msg)
                return 1

        # Everything still in instances_left was not matched above; log why
        # (as far as the debug bookkeeping can tell) and delete the row.
        for instance in instances_left:
            if instance.ec2_instance_id not in debug_boto_instance_ids_seen:
                logger.info(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2.",
                    pool.id, instance.ec2_instance_id)

            if instance.ec2_instance_id in debug_not_updatable_continue:
                logger.error(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database because it is not updatable but not in our region.",
                    pool.id, instance.ec2_instance_id)

            if instance.ec2_instance_id in debug_not_in_region:
                logger.info(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database, has state code %s on EC2",
                    pool.id, instance.ec2_instance_id,
                    debug_not_in_region[instance.ec2_instance_id])

            logger.info(
                "[Pool %d] Deleting instance with EC2 ID %s from our database.",
                pool.id, instance.ec2_instance_id)
            instance.delete()
예제 #9
0
def _update_pool_instances(pool, config):
    """Check the state of the instances in a pool and update it in the database.

    Works region by region. Pending spot requests are resolved first via
    ``cluster.check_spot_requests``; afterwards every EC2 instance tagged with
    this pool's ID is compared against the pool's ``Instance`` rows. Rows that
    were expected on EC2 but were not matched by any returned instance are
    deleted at the end.

    Args:
        pool: Pool whose instances are checked. Its ``pk``/``id`` are used for
            the EC2 tag filter and in log messages.
        config: Pool configuration providing ``aws_access_key_id``,
            ``aws_secret_access_key`` and ``ec2_tags``. ``config.ec2_tags`` is
            mutated: the pool ID tag is written into it.

    Returns:
        None. When connecting to a region fails, or a boto/SSL/socket error is
        raised while talking to EC2, a ``PoolStatusEntry`` is saved and the
        function returns early without running the final cleanup.

    Raises:
        AssertionError: If EC2 reports an instance tagged for this pool that is
            neither shutting down nor terminated and has no row in the
            database.
    """
    from .models import Instance, INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE
    instances = Instance.objects.filter(pool=pool)
    instance_ids_by_region = _get_instance_ids_by_region(instances)
    instances_by_ids = _get_instances_by_ids(instances)
    instances_left = []
    instances_created = False

    debug_boto_instance_ids_seen = set()
    debug_not_updatable_continue = set()
    debug_not_in_region = {}

    # Every row that is not a pending spot request is a deletion candidate
    # until a matching EC2 instance has been seen below.
    for instance in instances_by_ids.values():
        if instance.status_code != INSTANCE_STATE['requested']:
            instances_left.append(instance)

    # set config to this pool for now in case we set tags on fulfilled spot requests
    config.ec2_tags[SPOTMGR_TAG + '-PoolId'] = str(pool.pk)

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region,
                            aws_access_key_id=config.aws_access_key_id,
                            aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()

            logger.exception(
                "[Pool %d] update_pool_instances: laniakea failure: %s",
                pool.id, msg)
            return

        try:
            # first check status of pending spot requests
            requested = []
            for instance_id in instance_ids_by_region[region]:
                if instances_by_ids[instance_id].status_code == INSTANCE_STATE[
                        'requested']:
                    requested.append(instance_id)

            if requested:
                boto_results = cluster.check_spot_requests(
                    requested, config.ec2_tags)

                for req_id, result in zip(requested, boto_results):
                    instance = instances_by_ids[req_id]

                    if isinstance(result, boto.ec2.instance.Instance):
                        logger.info(
                            "[Pool %d] spot request fulfilled %s -> %s",
                            pool.id, req_id, result.id)

                        # spot request has been fulfilled
                        instance.hostname = result.public_dns_name
                        instance.ec2_instance_id = result.id
                        # state_code is a 16-bit value where the high byte is
                        # an opaque internal value and should be ignored.
                        instance.status_code = result.state_code & 255
                        instance.save()

                        # update local data structures to use the new instances instead
                        del instances_by_ids[req_id]
                        instances_by_ids[result.id] = instance
                        instance_ids_by_region[region].append(result.id)
                        # don't add it to instances_left yet to avoid race with adding tags

                        # Now that we saved the object into our database, mark the instance as updatable
                        # so our update code can pick it up and update it accordingly when it changes states
                        result.add_tag(SPOTMGR_TAG + "-Updatable", "1")

                        instances_created = True

                    # request object is returned in case request is closed/cancelled/failed
                    elif isinstance(
                            result,
                            boto.ec2.spotinstancerequest.SpotInstanceRequest):
                        if result.state in {"cancelled", "closed"}:
                            # this is normal, remove from DB and move on
                            logger.info("[Pool %d] spot request %s is %s",
                                        pool.id, req_id, result.state)
                            instances_by_ids[req_id].delete()
                        elif result.state in {"open", "active"}:
                            # this should not happen! warn and leave in DB in case it's fulfilled later
                            logger.warning(
                                "[Pool %d] Request %s is %s and %s.", pool.id,
                                req_id, result.status.code, result.state)
                        else:  # state=failed
                            msg = "Request %s is %s and %s." % (
                                req_id, result.status.code, result.state)

                            entry = PoolStatusEntry()
                            entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
                            entry.pool = pool
                            entry.msg = str(msg)
                            entry.isCritical = True
                            entry.save()

                            logger.error("[Pool %d] %s", pool.id, msg)
                            instances_by_ids[req_id].delete()

                    elif result is None:
                        logger.info("[Pool %d] spot request %s is still open",
                                    pool.pk, req_id)

                    else:
                        logger.warning("[Pool %d] spot request %s returned %s",
                                       pool.pk, req_id,
                                       type(result).__name__)

            boto_instances = cluster.find(
                filters={"tag:" + SPOTMGR_TAG + "-PoolId": str(pool.pk)})

            for boto_instance in boto_instances:
                # Store ID seen for debugging purposes
                debug_boto_instance_ids_seen.add(boto_instance.id)

                # state_code is a 16-bit value where the high byte is
                # an opaque internal value and should be ignored.
                state_code = boto_instance.state_code & 255

                if (SPOTMGR_TAG + "-Updatable" not in boto_instance.tags
                        or int(boto_instance.tags[SPOTMGR_TAG + "-Updatable"])
                        <= 0):
                    # The instance is not marked as updatable. We must not touch it because
                    # a spawning thread is still managing this instance. However, we must also
                    # remove this instance from the instances_left list if it's already in our
                    # database, because otherwise our code here would delete it from the database.
                    if boto_instance.id in instance_ids_by_region[region]:
                        known = instances_by_ids[boto_instance.id]
                        # A spot request fulfilled earlier in this run is known by ID
                        # but was never put into instances_left, so an unconditional
                        # remove() would raise ValueError.
                        if known in instances_left:
                            instances_left.remove(known)
                    else:
                        debug_not_updatable_continue.add(boto_instance.id)
                    continue

                # Whenever we see an instance that is not in our instance list for that region,
                # make sure it's a terminated instance because we should never have a running
                # instance that matches the search above but is not in our database.
                if boto_instance.id not in instance_ids_by_region[region]:
                    if state_code not in [
                            INSTANCE_STATE['shutting-down'],
                            INSTANCE_STATE['terminated']
                    ]:

                        # As a last resort, try to find the instance in our database.
                        # If the instance was saved to our database between the entrance
                        # to this function and the search query sent to EC2, then the instance
                        # will not be in our instances list but returned by EC2. In this
                        # case, we try to load it directly from the database.
                        q = Instance.objects.filter(
                            ec2_instance_id=boto_instance.id)
                        if q:
                            # The row exists; it is only logged here and the
                            # instance is skipped for this run.
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s was reloaded from database.",
                                pool.id, boto_instance.id)
                        else:
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s is not in our database.",
                                pool.id, boto_instance.id)

                            # Terminate at this point, we run in an inconsistent state.
                            # Raised explicitly because a bare assert is stripped
                            # when Python runs with -O.
                            raise AssertionError(
                                "[Pool %d] Instance with EC2 ID %s is not in our database."
                                % (pool.id, boto_instance.id))
                    debug_not_in_region[boto_instance.id] = state_code
                    continue

                instance = instances_by_ids[boto_instance.id]
                # Freshly fulfilled spot requests are intentionally absent from
                # instances_left (see above), so only remove rows that are tracked.
                if instance in instances_left:
                    instances_left.remove(instance)

                # Check the status code and update if necessary
                if instance.status_code != state_code:
                    instance.status_code = state_code
                    instance.save()

                # If for some reason we don't have a hostname yet,
                # update it accordingly.
                if not instance.hostname:
                    instance.hostname = boto_instance.public_dns_name
                    instance.save()

        except (boto.exception.EC2ResponseError,
                boto.exception.BotoServerError, ssl.SSLError,
                socket.error) as msg:
            if "MaxSpotInstanceCountExceeded" in str(msg):
                logger.warning(
                    "[Pool %d] update_pool_instances: Maximum instance count exceeded for region %s",
                    pool.id, region)
                # Only record this warning once per pool.
                if not PoolStatusEntry.objects.filter(
                        pool=pool,
                        type=POOL_STATUS_ENTRY_TYPE[
                            'max-spot-instance-count-exceeded']):
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded']
                    entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                    entry.save()
            elif "Service Unavailable" in str(msg):
                logger.warning(
                    "[Pool %d] update_pool_instances: Temporary failure in region %s: %s",
                    pool.id, region, msg)
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                entry.msg = "Temporary failure occurred: %s" % msg
                entry.save()
            else:
                logger.exception(
                    "[Pool %d] update_pool_instances: boto failure: %s",
                    pool.id, msg)
                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Unclassified error occurred: %s" % msg
                entry.save()
            return

    # Whatever is still in instances_left was not matched by an updatable EC2
    # instance in any region; drop those rows and log why.
    for instance in instances_left:
        reasons = []

        if instance.ec2_instance_id not in debug_boto_instance_ids_seen:
            reasons.append("no corresponding machine on EC2")

        if instance.ec2_instance_id in debug_not_updatable_continue:
            reasons.append("not updatable")

        if instance.ec2_instance_id in debug_not_in_region:
            reasons.append("has state code %s on EC2 but not in our region" %
                           debug_not_in_region[instance.ec2_instance_id])

        if not reasons:
            reasons.append("?")

        logger.info(
            "[Pool %d] Deleting instance with EC2 ID %s from our database: %s",
            pool.id, instance.ec2_instance_id, ", ".join(reasons))
        instance.delete()

    if instances_created:
        # Delete certain warnings we might have created earlier that no longer apply

        # If we ever exceeded the maximum spot instance count, we can clear
        # the warning now because we obviously succeeded in starting some instances.
        PoolStatusEntry.objects.filter(
            pool=pool,
            type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']
        ).delete()

        # The same holds for temporary failures of any sort
        PoolStatusEntry.objects.filter(
            pool=pool,
            type=POOL_STATUS_ENTRY_TYPE['temporary-failure']).delete()
    def update_pool_instances(self, pool, config):
        """Check the state of the instances in a pool and update it in the database.

        For every region that holds instances of the pool, the EC2 instances
        tagged with the pool ID and marked as updatable are fetched and their
        state code and hostname are synchronised into the matching
        ``Instance`` rows. Rows that are not matched by any returned EC2
        instance are deleted at the end.

        Args:
            pool: Pool whose instances are checked. Its ``pk``/``id`` are used
                for the EC2 tag filter and in log messages.
            config: Configuration providing ``aws_access_key_id`` and
                ``aws_secret_access_key`` for the region connections.

        Returns:
            ``None`` after a complete run or when connecting to a region
            fails; ``1`` when EC2 answers with an ``EC2ResponseError``. In
            both failure cases no leftover rows are deleted.

        Raises:
            AssertionError: If EC2 reports an instance for this pool that is
                neither shutting down nor terminated and has no row in the
                database.
        """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)

        # Every row with a non-empty ID is a deletion candidate until a
        # matching EC2 instance has been seen below.
        instances_left = [
            instances_by_ids[instance_id] for instance_id in instances_by_ids
            if instance_id
        ]

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s",
                                 pool.id, "update_pool_instances", msg)
                return None

            try:
                boto_instances = cluster.find(
                    filters={
                        "tag:SpotManager-PoolId": str(pool.pk),
                        "tag:SpotManager-Updatable": "1"
                    })

                for boto_instance in boto_instances:
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored. Without
                    # the mask the comparisons below and the stored status
                    # could disagree with the INSTANCE_STATE codes.
                    state_code = boto_instance.state_code & 255

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if boto_instance.id not in instance_ids_by_region[region]:
                        if state_code not in (INSTANCE_STATE['shutting-down'],
                                              INSTANCE_STATE['terminated']):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case it exists in the database and is simply skipped for this run.
                            q = Instance.objects.filter(
                                ec2_instance_id=boto_instance.id)
                            if not q:
                                logger.error(
                                    "[Pool %d] Instance with EC2 ID %s is not in our database.",
                                    pool.id, boto_instance.id)

                                # Terminate at this point, we run in an inconsistent state.
                                # Raised explicitly because a bare assert is stripped
                                # when Python runs with -O.
                                raise AssertionError(
                                    "[Pool %d] Instance with EC2 ID %s is not in our database."
                                    % (pool.id, boto_instance.id))

                        continue

                    instance = instances_by_ids[boto_instance.id]
                    instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != state_code:
                        instance.status_code = state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except boto.exception.EC2ResponseError as msg:
                logger.exception("%s: boto failure: %s",
                                 "update_pool_instances", msg)
                return 1

        # Rows that no EC2 instance accounted for are stale; remove them.
        for instance in instances_left:
            logger.info(
                "[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2.",
                pool.id, instance.ec2_instance_id)
            instance.delete()