def _start_pool_instances(pool, config, count=1):
    """Request ``count`` spot instances for ``pool`` using ``config``.

    The function picks a region/zone/instance type via
    ``_get_best_region_zone``, compiles the EC2 userdata, connects to the
    region through Laniakea and creates the spot requests. Every request
    returned by ``create_spot_requests`` is stored as an ``Instance`` in state
    ``INSTANCE_STATE["requested"]``.

    Most failures are not raised to the caller. They are logged, recorded as a
    ``PoolStatusEntry`` on the pool where the code below does so, and the
    function returns early.

    Args:
        pool: Pool the instances belong to; ``pool.id`` is used in log
            messages and the pool is attached to created entries/instances.
        config: Configuration object providing ``ec2_userdata``,
            ``ec2_userdata_macros``, ``ec2_max_price`` and the AWS
            credentials read below.
        count: Number of instances to request (default 1).

    Returns:
        None on every path.

    Raises:
        RuntimeError: If the userdata compiles to an empty/false value. A
            ``config-error`` status entry is saved first.
        Exception: Any other exception escaping the main body is logged with
            ``logger.exception`` and re-raised unchanged.
    """
    from .models import Instance, INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE

    def _add_status_entry(entry_type, msg, critical=False):
        """Save a PoolStatusEntry of type ``entry_type`` with ``msg`` for this pool."""
        entry = PoolStatusEntry()
        entry.type = POOL_STATUS_ENTRY_TYPE[entry_type]
        entry.pool = pool
        if critical:
            # Only assign isCritical when requested so that non-critical
            # entries keep whatever default the model defines.
            entry.isCritical = True
        entry.msg = msg
        entry.save()

    images = _create_laniakea_images(config)

    # Figure out where to put our instances
    try:
        (region, zone, instance_type, rejected) = _get_best_region_zone(config)
    except (boto.exception.EC2ResponseError, boto.exception.BotoServerError,
            ssl.SSLError, socket.error):
        # In case of temporary failures here, we will retry again in the next cycle
        logger.warning("[Pool %d] Failed to acquire spot instance prices: %s.",
                       pool.id, traceback.format_exc())
        return
    except RuntimeError:
        logger.error("[Pool %d] Failed to compile userdata.", pool.id)
        _add_status_entry('config-error',
                          "Configuration error: %s" % traceback.format_exc(),
                          critical=True)
        return

    priceLowEntries = PoolStatusEntry.objects.filter(
        pool=pool, type=POOL_STATUS_ENTRY_TYPE['price-too-low'])

    if not region:
        logger.warning("[Pool %d] No allowed region was cheap enough to spawn instances.",
                       pool.id)
        # Only record the condition once; an existing entry is left untouched.
        if not priceLowEntries:
            msg_lines = ["No allowed region was cheap enough to spawn instances."]
            # Use a distinct loop name so the selected ``zone`` is not shadowed.
            msg_lines.extend("%s at %s" % (rejected_zone, rejected[rejected_zone])
                             for rejected_zone in rejected)
            _add_status_entry('price-too-low', "\n".join(msg_lines))
        return

    # A usable region was found, so any earlier price-too-low entries are stale.
    if priceLowEntries:
        priceLowEntries.delete()

    logger.debug("[Pool %d] Using instance type %s in region %s with availability zone %s.",
                 pool.id, instance_type, region, zone)

    try:
        userdata = LaniakeaCommandLine.handle_import_tags(
            config.ec2_userdata.decode('utf-8'))

        # Copy the userdata_macros and populate with internal variables
        ec2_userdata_macros = dict(config.ec2_userdata_macros)
        ec2_userdata_macros["EC2SPOTMANAGER_POOLID"] = str(pool.id)

        userdata = LaniakeaCommandLine.handle_tags(userdata, ec2_userdata_macros)
        if not userdata:
            logger.error("[Pool %d] Failed to compile userdata.", pool.id)
            _add_status_entry('config-error',
                              "Configuration error: Failed to compile userdata",
                              critical=True)
            # Caught by the outer handler below, which logs and re-raises it.
            raise RuntimeError("start_pool_instances: Failed to compile userdata")

        default_image = images["default"]
        default_image['user_data'] = userdata.encode("utf-8")
        default_image['placement'] = zone
        default_image['count'] = count
        default_image['instance_type'] = instance_type

        cluster = Laniakea(images)
        try:
            cluster.connect(region=region,
                            aws_access_key_id=config.aws_access_key_id,
                            aws_secret_access_key=config.aws_secret_access_key)
        except ssl.SSLError as exc:
            logger.warning("[Pool %d] start_pool_instances: Temporary failure in region %s: %s",
                           pool.id, region, exc)
            _add_status_entry('temporary-failure',
                              "Temporary failure occurred: %s" % exc)
            return
        except Exception as exc:
            logger.exception("[Pool %d] start_pool_instances: laniakea failure: %s",
                             pool.id, exc)
            # Log this error to the pool status messages
            _add_status_entry('unclassified', str(exc), critical=True)
            return

        try:
            logger.info("[Pool %d] Creating %d instances...", pool.id, count)
            # NOTE(review): timeout is 10 * 60, presumably seconds - confirm
            # against the laniakea API.
            spot_requests = cluster.create_spot_requests(
                config.ec2_max_price, delete_on_termination=True, timeout=10 * 60)
            for ec2_request in spot_requests:
                instance = Instance()
                instance.ec2_instance_id = ec2_request
                instance.ec2_region = region
                instance.ec2_zone = zone
                instance.status_code = INSTANCE_STATE["requested"]
                instance.pool = pool
                instance.save()
        except (boto.exception.EC2ResponseError, boto.exception.BotoServerError,
                ssl.SSLError, socket.error) as exc:
            # The failure is classified by matching substrings of the error text.
            error_text = str(exc)
            if "MaxSpotInstanceCountExceeded" in error_text:
                logger.warning(
                    "[Pool %d] start_pool_instances: Maximum instance count exceeded for region %s",
                    pool.id, region)
                # Avoid piling up duplicates of this entry for the same pool.
                if not PoolStatusEntry.objects.filter(
                        pool=pool,
                        type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']):
                    _add_status_entry(
                        'max-spot-instance-count-exceeded',
                        "Auto-selected region exceeded its maximum spot instance count.")
            elif "Service Unavailable" in error_text:
                logger.warning(
                    "[Pool %d] start_pool_instances: Temporary failure in region %s: %s",
                    pool.id, region, exc)
                _add_status_entry('temporary-failure',
                                  "Temporary failure occurred: %s" % exc)
            else:
                logger.exception("[Pool %d] start_pool_instances: boto failure: %s",
                                 pool.id, exc)
                _add_status_entry('unclassified',
                                  "Unclassified error occurred: %s" % exc,
                                  critical=True)

    except Exception as exc:
        logger.exception("[Pool %d] start_pool_instances: unhandled failure: %s",
                         pool.id, exc)
        raise