def start_instances_async(pool, config, count, images, region, zone, instances):
    """Request spot instances for a pool and sync the pending instance records.

    Compiles the pool's userdata, connects to EC2 through Laniakea, asks for
    ``count`` spot instances and copies hostname, EC2 instance id and state
    code onto the first ``len(boto_instances)`` entries of ``instances``.
    Entries beyond that (their spot requests timed out and were canceled)
    are deleted. On each handled failure a ``PoolStatusEntry`` is written
    (the max-spot-count warning only if none exists yet) and every entry of
    ``instances`` is deleted.

    Args:
        pool: Pool being populated; supplies ``pool.id`` for log prefixes,
            ``pool.pk`` for the ``SpotManager-PoolId`` tag and is attached to
            status entries.
        config: Pool configuration providing ``ec2_userdata``,
            ``ec2_userdata_macros``, the AWS credentials, ``ec2_max_price``
            and ``ec2_tags``. ``ec2_tags`` is modified in place.
        count: Number of spot instances to request.
        images: Image definitions handed to ``Laniakea``; the ``"default"``
            entry is modified in place (``user_data``, ``placement``,
            ``count``).
        region: Region passed to ``cluster.connect``.
        zone: Stored as the ``placement`` of the default image.
        instances: Indexable collection of pending instance records
            (presumably one per requested instance -- confirm against the
            caller).

    Raises:
        RuntimeError: If the userdata could not be compiled.
    """

    def discard_pending_instances():
        # Remove every placeholder record that was handed to this task.
        for pending in instances:
            pending.delete()

    def record_status(entry_type, message, critical=False):
        # Persist one pool status message; isCritical is only touched when
        # the caller asks for a critical entry.
        status = PoolStatusEntry()
        status.type = entry_type
        status.pool = pool
        if critical:
            status.isCritical = True
        status.msg = message
        status.save()

    compiled = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
    compiled = LaniakeaCommandLine.handle_tags(compiled, config.ec2_userdata_macros)
    if not compiled:
        logger.error("[Pool %d] Failed to compile userdata." % pool.id)
        record_status(
            POOL_STATUS_ENTRY_TYPE['config-error'],
            "Configuration error: Failed to compile userdata",
            critical=True,
        )
        discard_pending_instances()
        raise RuntimeError("start_instances_async: Failed to compile userdata")

    default_image = images["default"]
    default_image['user_data'] = compiled
    default_image['placement'] = zone
    default_image['count'] = count

    cluster = Laniakea(images)
    try:
        cluster.connect(
            region=region,
            aws_access_key_id=config.aws_access_key_id,
            aws_secret_access_key=config.aws_secret_access_key,
        )
    except Exception as err:
        logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", err))
        # Surface the error in the pool status messages, then drop the
        # pending instances because none of them could be created.
        record_status(0, str(err), critical=True)
        discard_pending_instances()
        return

    config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

    try:
        logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
        boto_instances = cluster.create_spot(
            config.ec2_max_price,
            tags=config.ec2_tags,
            delete_on_termination=True,
            timeout=20 * 60,
        )
        started = len(boto_instances)
        canceled_requests = count - started
        logger.info(
            "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
            % (pool.id, started, canceled_requests)
        )

        for idx, boto_instance in enumerate(boto_instances):
            record = instances[idx]
            record.hostname = boto_instance.public_dns_name
            record.ec2_instance_id = boto_instance.id
            # state_code is a 16-bit value where the high byte is
            # an opaque internal value and should be ignored.
            record.status_code = boto_instance.state_code & 255
            record.save()

            assert record.ec2_instance_id != None

            # The record is in our database now, so flag the EC2 instance as
            # updatable; the update code picks it up from that tag.
            boto_instance.add_tag("SpotManager-Updatable", "1")

        # Records past the started ones belong to canceled spot requests.
        for idx in range(started, count):
            logger.info(
                "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                % (pool.id, instances[idx].pk)
            )
            instances[idx].delete()

        # We got this far, so earlier quota warnings and temporary failures
        # no longer apply. Unclassified errors are deliberately kept so the
        # user can still see them.
        for stale_type in ('max-spot-instance-count-exceeded', 'temporary-failure'):
            PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE[stale_type]).delete()

    except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as err:
        reason = str(err)
        if "MaxSpotInstanceCountExceeded" in reason:
            logger.warning("[Pool %d] Maximum instance count exceeded for region %s" % (pool.id, region))
            quota_type = POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']
            # Only one quota warning per pool is kept.
            if not PoolStatusEntry.objects.filter(pool=pool, type=quota_type):
                record_status(quota_type, "Auto-selected region exceeded its maximum spot instance count.")
        elif "Service Unavailable" in reason:
            logger.warning("[Pool %d] Temporary failure in region %s: %s" % (pool.id, region, err))
            record_status(POOL_STATUS_ENTRY_TYPE['temporary-failure'], "Temporary failure occurred: %s" % err)
        else:
            logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", err))
            record_status(0, "Unclassified error occurred: %s" % err, critical=True)

        # Delete all pending instances, assuming that an exception from
        # laniakea means that all instance requests failed.
        discard_pending_instances()
        return
def start_instances_async(pool, config, count, images, region, zone, instances):
    """Request spot instances for a pool and sync the pending instance records.

    Compiles the pool's userdata, connects to EC2 through Laniakea, asks for
    ``count`` spot instances and copies hostname, EC2 instance id and state
    code onto the first ``len(boto_instances)`` entries of ``instances``.
    Entries beyond that belong to spot requests that timed out and were
    canceled; they are deleted.

    Every failure path (userdata compilation, connect, boto request) deletes
    all entries of ``instances`` so that no placeholder records are left
    behind for instances that were never started.

    Args:
        pool: Pool being populated; supplies ``pool.id`` for log prefixes,
            ``pool.pk`` for the ``SpotManager-PoolId`` tag and is attached to
            status entries.
        config: Pool configuration providing ``ec2_userdata``,
            ``ec2_userdata_macros``, the AWS credentials, ``ec2_max_price``
            and ``ec2_tags``. ``ec2_tags`` is modified in place.
        count: Number of spot instances to request.
        images: Image definitions handed to ``Laniakea``; the ``"default"``
            entry is modified in place (``user_data``, ``placement``,
            ``count``).
        region: Region passed to ``cluster.connect``.
        zone: Stored as the ``placement`` of the default image.
        instances: Sequence of pending instance records; the code asserts
            that its length equals ``count``.

    Raises:
        RuntimeError: If the userdata could not be compiled.
    """

    def discard_pending_instances():
        # Remove every placeholder record that was handed to this task.
        for pending in instances:
            pending.delete()

    userdata = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
    userdata = LaniakeaCommandLine.handle_tags(userdata, config.ec2_userdata_macros)
    if not userdata:
        logger.error("[Pool %d] Failed to compile userdata." % pool.id)
        # Nothing will ever fulfil these records, so do not leave them behind.
        discard_pending_instances()
        raise RuntimeError("start_instances_async: Failed to compile userdata")

    images["default"]['user_data'] = userdata
    images["default"]['placement'] = zone
    images["default"]['count'] = count

    cluster = Laniakea(images)
    try:
        cluster.connect(
            region=region,
            aws_access_key_id=config.aws_access_key_id,
            aws_secret_access_key=config.aws_secret_access_key,
        )
    except Exception as msg:
        logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", msg))

        # Log this error to the pool status messages
        entry = PoolStatusEntry()
        entry.pool = pool
        entry.msg = str(msg)
        entry.isCritical = True
        entry.save()

        # Delete all pending instances as we failed to create them
        discard_pending_instances()
        return

    config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

    try:
        logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
        (boto_instances, boto_pending) = cluster.create_spot(
            config.ec2_max_price,
            tags=config.ec2_tags,
            delete_on_termination=True,
            timeout=20 * 60,
        )

        logger.info(
            "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
            % (pool.id, len(boto_instances), len(boto_pending))
        )

        assert (len(boto_instances) + len(boto_pending)) == len(instances) == count

        for instance, boto_instance in zip(instances, boto_instances):
            instance.hostname = boto_instance.public_dns_name
            instance.ec2_instance_id = boto_instance.id
            # state_code is a 16-bit value whose high byte is an opaque
            # internal value; only the low byte carries the instance state.
            instance.status_code = boto_instance.state_code & 255
            instance.save()

            assert instance.ec2_instance_id is not None

            # Now that we saved the object into our database, mark the instance as updatable
            # so our update code can pick it up and update it accordingly when it changes states
            boto_instance.add_tag("SpotManager-Updatable", "1")

        # Records past the started ones belong to canceled spot requests.
        for instance in instances[len(boto_instances):count]:
            logger.info(
                "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                % (pool.id, instance.pk)
            )
            instance.delete()

    except boto.exception.EC2ResponseError as msg:
        logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", msg))

        # Delete all pending instances, assuming that an exception from
        # laniakea means that all instance requests failed (same cleanup as
        # the connect failure path above).
        discard_pending_instances()
        return
def start_instances_async(pool, config, count, images, region, zone, instances):
    """Request spot instances for a pool and sync the pending instance records.

    Compiles the pool's userdata, connects to EC2 through Laniakea, asks for
    ``count`` spot instances and copies hostname, EC2 instance id and state
    code onto the first ``len(boto_instances)`` entries of ``instances``.
    Entries beyond that (their spot requests timed out and were canceled)
    are deleted. On each handled failure a ``PoolStatusEntry`` is written
    (the max-spot-count warning only if none exists yet) and every entry of
    ``instances`` is deleted.

    Args:
        pool: Pool being populated; supplies ``pool.id`` for log prefixes,
            ``pool.pk`` for the ``SpotManager-PoolId`` tag and is attached to
            status entries.
        config: Pool configuration providing ``ec2_userdata``,
            ``ec2_userdata_macros``, the AWS credentials, ``ec2_max_price``
            and ``ec2_tags``. ``ec2_tags`` is modified in place.
        count: Number of spot instances to request.
        images: Image definitions handed to ``Laniakea``; the ``"default"``
            entry is modified in place (``user_data``, ``placement``,
            ``count``).
        region: Region passed to ``cluster.connect``.
        zone: Stored as the ``placement`` of the default image.
        instances: Indexable collection of pending instance records
            (presumably one per requested instance -- confirm against the
            caller).

    Raises:
        RuntimeError: If the userdata could not be compiled.
    """

    def discard_pending_instances():
        # Remove every placeholder record that was handed to this task.
        for pending in instances:
            pending.delete()

    def record_status(entry_type, message, critical=False):
        # Persist one pool status message; isCritical is only touched when
        # the caller asks for a critical entry.
        status = PoolStatusEntry()
        status.type = entry_type
        status.pool = pool
        if critical:
            status.isCritical = True
        status.msg = message
        status.save()

    compiled = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
    compiled = LaniakeaCommandLine.handle_tags(compiled, config.ec2_userdata_macros)
    if not compiled:
        logger.error("[Pool %d] Failed to compile userdata." % pool.id)
        record_status(
            POOL_STATUS_ENTRY_TYPE['config-error'],
            "Configuration error: Failed to compile userdata",
            critical=True,
        )
        discard_pending_instances()
        raise RuntimeError("start_instances_async: Failed to compile userdata")

    default_image = images["default"]
    default_image['user_data'] = compiled
    default_image['placement'] = zone
    default_image['count'] = count

    cluster = Laniakea(images)
    try:
        cluster.connect(
            region=region,
            aws_access_key_id=config.aws_access_key_id,
            aws_secret_access_key=config.aws_secret_access_key,
        )
    except Exception as err:
        logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", err))
        # Surface the error in the pool status messages, then drop the
        # pending instances because none of them could be created.
        record_status(0, str(err), critical=True)
        discard_pending_instances()
        return

    config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

    try:
        logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
        boto_instances = cluster.create_spot(
            config.ec2_max_price,
            tags=config.ec2_tags,
            delete_on_termination=True,
            timeout=20 * 60,
        )
        started = len(boto_instances)
        canceled_requests = count - started
        logger.info(
            "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
            % (pool.id, started, canceled_requests)
        )

        for idx, boto_instance in enumerate(boto_instances):
            record = instances[idx]
            record.hostname = boto_instance.public_dns_name
            record.ec2_instance_id = boto_instance.id
            # state_code is a 16-bit value where the high byte is
            # an opaque internal value and should be ignored.
            record.status_code = boto_instance.state_code & 255
            record.save()

            assert record.ec2_instance_id != None

            # The record is in our database now, so flag the EC2 instance as
            # updatable; the update code picks it up from that tag.
            boto_instance.add_tag("SpotManager-Updatable", "1")

        # Records past the started ones belong to canceled spot requests.
        for idx in range(started, count):
            logger.info(
                "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                % (pool.id, instances[idx].pk)
            )
            instances[idx].delete()

        # We got this far, so earlier quota warnings and temporary failures
        # no longer apply. Unclassified errors are deliberately kept so the
        # user can still see them.
        for stale_type in ('max-spot-instance-count-exceeded', 'temporary-failure'):
            PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE[stale_type]).delete()

    except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as err:
        reason = str(err)
        if "MaxSpotInstanceCountExceeded" in reason:
            logger.warning("[Pool %d] Maximum instance count exceeded for region %s" % (pool.id, region))
            quota_type = POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']
            # Only one quota warning per pool is kept.
            if not PoolStatusEntry.objects.filter(pool=pool, type=quota_type):
                record_status(quota_type, "Auto-selected region exceeded its maximum spot instance count.")
        elif "Service Unavailable" in reason:
            logger.warning("[Pool %d] Temporary failure in region %s: %s" % (pool.id, region, err))
            record_status(POOL_STATUS_ENTRY_TYPE['temporary-failure'], "Temporary failure occurred: %s" % err)
        else:
            logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", err))
            record_status(0, "Unclassified error occurred: %s" % err, critical=True)

        # Delete all pending instances, assuming that an exception from
        # laniakea means that all instance requests failed.
        discard_pending_instances()
        return
def start_instances_async(pool, config, count, images, region, zone, instances):
    """Request spot instances for a pool and sync the pending instance records.

    Compiles the pool's userdata, connects to EC2 through Laniakea, asks for
    ``count`` spot instances and copies hostname, EC2 instance id and state
    code onto the first ``len(boto_instances)`` entries of ``instances``.
    Entries beyond that belong to spot requests that timed out and were
    canceled; they are deleted.

    Every failure path (userdata compilation, connect, boto request) deletes
    all entries of ``instances`` so that no placeholder records are left
    behind for instances that were never started.

    Args:
        pool: Pool being populated; supplies ``pool.id`` for log prefixes,
            ``pool.pk`` for the ``SpotManager-PoolId`` tag and is attached to
            status entries.
        config: Pool configuration providing ``ec2_userdata``,
            ``ec2_userdata_macros``, the AWS credentials, ``ec2_max_price``
            and ``ec2_tags``. ``ec2_tags`` is modified in place.
        count: Number of spot instances to request.
        images: Image definitions handed to ``Laniakea``; the ``"default"``
            entry is modified in place (``user_data``, ``placement``,
            ``count``).
        region: Region passed to ``cluster.connect``.
        zone: Stored as the ``placement`` of the default image.
        instances: Sequence of pending instance records; the code asserts
            that its length equals ``count``.

    Raises:
        RuntimeError: If the userdata could not be compiled.
    """

    def discard_pending_instances():
        # Remove every placeholder record that was handed to this task.
        for pending in instances:
            pending.delete()

    userdata = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
    userdata = LaniakeaCommandLine.handle_tags(userdata, config.ec2_userdata_macros)
    if not userdata:
        logger.error("[Pool %d] Failed to compile userdata." % pool.id)
        # Nothing will ever fulfil these records, so do not leave them behind.
        discard_pending_instances()
        raise RuntimeError("start_instances_async: Failed to compile userdata")

    images["default"]['user_data'] = userdata
    images["default"]['placement'] = zone
    images["default"]['count'] = count

    cluster = Laniakea(images)
    try:
        cluster.connect(
            region=region,
            aws_access_key_id=config.aws_access_key_id,
            aws_secret_access_key=config.aws_secret_access_key,
        )
    except Exception as msg:
        logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", msg))

        # Log this error to the pool status messages
        entry = PoolStatusEntry()
        entry.pool = pool
        entry.msg = str(msg)
        entry.isCritical = True
        entry.save()

        # Delete all pending instances as we failed to create them
        discard_pending_instances()
        return

    config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

    try:
        logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
        (boto_instances, boto_pending) = cluster.create_spot(
            config.ec2_max_price,
            tags=config.ec2_tags,
            delete_on_termination=True,
            timeout=20 * 60,
        )

        logger.info(
            "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
            % (pool.id, len(boto_instances), len(boto_pending))
        )

        assert (len(boto_instances) + len(boto_pending)) == len(instances) == count

        for instance, boto_instance in zip(instances, boto_instances):
            instance.hostname = boto_instance.public_dns_name
            instance.ec2_instance_id = boto_instance.id
            # state_code is a 16-bit value whose high byte is an opaque
            # internal value; only the low byte carries the instance state.
            instance.status_code = boto_instance.state_code & 255
            instance.save()

            assert instance.ec2_instance_id is not None

            # Now that we saved the object into our database, mark the instance as updatable
            # so our update code can pick it up and update it accordingly when it changes states
            boto_instance.add_tag("SpotManager-Updatable", "1")

        # Records past the started ones belong to canceled spot requests.
        for instance in instances[len(boto_instances):count]:
            logger.info(
                "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                % (pool.id, instance.pk)
            )
            instance.delete()

    except boto.exception.EC2ResponseError as msg:
        logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", msg))

        # Delete all pending instances, assuming that an exception from
        # laniakea means that all instance requests failed (same cleanup as
        # the connect failure path above).
        discard_pending_instances()
        return