Exemplo n.º 1
0
    def prepare_template(self, availability_zone: str, subnet_id: str,
                         on_demand: bool, key_name: str):
        """Build the CloudFormation template body from ``create_ami.yaml``.

        The template is loaded from the data directory, adjusted according
        to the arguments and serialized back with ``yaml.dump``.

        Args:
            availability_zone: if truthy, stored as the launch template's
                ``Placement.AvailabilityZone``.
            subnet_id: if truthy, a single network interface (device index 0)
                is attached to this subnet.
            on_demand: if truthy, ``InstanceMarketOptions`` is removed from
                the launch template data.
            key_name: if falsy, the ``KeyName`` parameter and the launch
                template's ``KeyName`` entry are removed.

        Returns:
            The value returned by ``yaml.dump`` for the updated template.
        """
        with open(data_dir('create_ami.yaml')) as template_file:
            template = yaml.load(template_file, Loader=CfnYamlLoader)

        def launch_data():
            # Resolved lazily so the template is only traversed when one of
            # the options below actually needs it.
            launch_template = template['Resources']['SpotInstanceLaunchTemplate']
            return launch_template['Properties']['LaunchTemplateData']

        # without a key, neither the parameter nor its usage may remain
        if not key_name:
            template['Parameters'].pop('KeyName')
            launch_data().pop('KeyName')

        # pin the availability zone
        if availability_zone:
            placement = {'AvailabilityZone': availability_zone}
            launch_data()['Placement'] = placement

        # attach the instance to the requested subnet
        if subnet_id:
            network_interface = {'SubnetId': subnet_id, 'DeviceIndex': 0}
            launch_data()['NetworkInterfaces'] = [network_interface]

        # the market options are dropped for an on-demand instance
        if on_demand:
            launch_data().pop('InstanceMarketOptions')

        return yaml.dump(template, Dumper=CfnYamlDumper)
Exemplo n.º 2
0
def create_or_update_instance_profile(cf, output: AbstractOutputWriter):
    """Deploy the instance profile stack and return the profile ARN.

    The instance profile lives in a separate stack because creating an
    instance profile resource takes 2 minutes.

    Args:
        cf: CloudFormation client used for all stack operations.
        output: writer used to report progress messages.

    Returns:
        The ``OutputValue`` of the stack output whose key is ``ProfileArn``.

    Raises:
        ClientError: if updating the stack fails with an error code other
            than ``ValidationError``.
        ValueError: if the stack does not end up in ``CREATE_COMPLETE`` or
            ``UPDATE_COMPLETE`` status.
    """
    stack_name = 'spotty-instance-profile'
    with open(data_dir('create_instance_profile.yaml')) as template_file:
        template_body = template_file.read()

    # arguments shared by the create and update calls
    stack_args = {
        'StackName': stack_name,
        'TemplateBody': template_body,
        'Capabilities': ['CAPABILITY_IAM'],
    }
    waiter_config = {'Delay': 10}

    if not stack_exists(cf, stack_name):
        output.write('Creating IAM role for the instance...')

        created = cf.create_stack(OnFailure='DELETE', **stack_args)

        # block until the stack is created
        create_waiter = cf.get_waiter('stack_create_complete')
        create_waiter.wait(StackName=created['StackId'],
                           WaiterConfig=waiter_config)
    else:
        try:
            updated = cf.update_stack(**stack_args)
        except ClientError as err:
            # A ValidationError is tolerated; presumably this is how
            # CloudFormation reports that there is nothing to update
            # (TODO confirm). Any other error is propagated.
            code = err.response.get('Error', {}).get('Code', 'Unknown')
            if code != 'ValidationError':
                raise
            updated = None

        if updated:
            output.write('Updating IAM role for the instance...')

            # block until the stack is updated
            update_waiter = cf.get_waiter('stack_update_complete')
            update_waiter.wait(StackName=updated['StackId'],
                               WaiterConfig=waiter_config)

    stack = cf.describe_stacks(StackName=stack_name)['Stacks'][0]
    if stack['StackStatus'] not in ('CREATE_COMPLETE', 'UPDATE_COMPLETE'):
        raise ValueError('Stack "%s" failed.\n'
                         'Please, see CloudFormation logs for the details.' %
                         stack_name)

    matching_arns = [
        stack_output['OutputValue'] for stack_output in stack['Outputs']
        if stack_output['OutputKey'] == 'ProfileArn'
    ]
    return matching_arns[0]
Exemplo n.º 3
0
    def prepare_template(self, ec2, availability_zone: str, subnet_id: str,
                         instance_type: str, volumes: list, ports: list,
                         max_price, docker_commands):
        """Prepares CloudFormation template to run a Spot Instance.

        Loads ``run_container.yaml``, merges in the resources for the
        requested volumes, applies the placement, network, port, price and
        docker settings and serializes the result with ``yaml.dump``.

        Args:
            ec2: passed through to ``self._get_volume_resources`` and
                ``get_current_spot_price`` (presumably a boto3 EC2 client).
            availability_zone: requested availability zone; may be empty. It
                is overridden by the zone of an existing volume, if any.
            subnet_id: if truthy, the instance gets a single network
                interface in this subnet and the security groups are moved
                onto that interface.
            instance_type: only used to look up the current spot price.
            volumes: volume configurations; each one gets its own device
                letter (at most 11 volumes are supported).
            ports: ports to open in the instance security group for IPv4 and
                IPv6 (port 22 is skipped here).
            max_price: if truthy, the maximum spot price to set.
            docker_commands: if truthy, content for
                ``/tmp/docker/docker_commands.sh``.

        Returns:
            The value returned by ``yaml.dump`` for the updated template.

        Raises:
            ValueError: if there are more volumes than available device
                letters, if availability zones conflict, or if the current
                spot price is higher than ``max_price``.
        """
        # read and update CF template
        with open(data_dir('run_container.yaml')) as f:
            template = yaml.load(f, Loader=CfnYamlLoader)

        resources = template['Resources']

        def launch_template_data():
            # Looked up on demand: only some of the options below need it.
            return resources['SpotInstanceLaunchTemplate']['Properties'][
                'LaunchTemplateData']

        # ending letters for the devices (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/device_naming.html)
        device_letters = 'fghijklmnop'

        # Fail early with a clear message instead of an IndexError raised
        # in the middle of processing the volumes.
        if len(volumes) > len(device_letters):
            raise ValueError(
                'Too many volumes in the configuration: at most %d volumes '
                'can be attached to the instance.' % len(device_letters))

        # create and attach volumes
        for volume, device_letter in zip(volumes, device_letters):
            volume_resources, volume_availability_zone = self._get_volume_resources(
                ec2, volume, device_letter)

            # existing volume will be attached to the instance
            if availability_zone and volume_availability_zone and (
                    availability_zone != volume_availability_zone):
                raise ValueError(
                    'The availability zone in the configuration file doesn\'t match the availability zone '
                    'of the existing volume or you have two existing volumes in different availability '
                    'zones.')

            # update availability zone
            if volume_availability_zone:
                availability_zone = volume_availability_zone

            # update template resources
            resources.update(volume_resources)

        # set availability zone
        if availability_zone:
            launch_template_data()['Placement'] = {
                'AvailabilityZone': availability_zone,
            }

        # set subnet: security groups have to be specified on the network
        # interface, so they are moved there from the launch template data
        if subnet_id:
            launch_data = launch_template_data()
            security_group_ids = launch_data.pop('SecurityGroupIds')
            launch_data['NetworkInterfaces'] = [{
                'SubnetId': subnet_id,
                'DeviceIndex': 0,
                'Groups': security_group_ids,
            }]

        # Each retention resource must depend on every custom resource of
        # the matching type, so that the lambda updating the log group
        # retention is called after the log group was created. If there are
        # no such custom resources, the retention resource is removed.
        retention_resources = (
            ('RenameSnapshotFunctionRetention', 'Custom::SnapshotRenaming'),
            ('DeleteSnapshotFunctionRetention', 'Custom::SnapshotDeletion'),
        )
        for retention_name, custom_type in retention_resources:
            depends_on = [
                resource_name
                for resource_name, resource in resources.items()
                if resource['Type'] == custom_type
            ]
            if depends_on:
                resources[retention_name]['DependsOn'] = depends_on
            else:
                del resources[retention_name]

        # TerminateInstanceFunction lambda should depend on all volume attachments
        resources['TerminateInstance']['DependsOn'] = [
            resource_name
            for resource_name, resource in resources.items()
            if resource['Type'] == 'AWS::EC2::VolumeAttachment'
        ]

        # add ports to the security group (IPv4 and IPv6 rule per port)
        for port in set(ports):
            if port == 22:
                continue

            ingress_rules = resources['InstanceSecurityGroup']['Properties'][
                'SecurityGroupIngress']
            ingress_rules.extend([{
                'CidrIp': '0.0.0.0/0',
                'IpProtocol': 'tcp',
                'FromPort': port,
                'ToPort': port,
            }, {
                'CidrIpv6': '::/0',
                'IpProtocol': 'tcp',
                'FromPort': port,
                'ToPort': port,
            }])

        if max_price:
            # check the maximum price
            current_price = get_current_spot_price(ec2, instance_type,
                                                   availability_zone)
            if current_price > max_price:
                raise ValueError(
                    'Current price for the instance (%.04f) is higher than the maximum price in the '
                    'configuration file (%.04f).' % (current_price, max_price))

            # set maximum price
            launch_template_data()['InstanceMarketOptions']['SpotOptions'][
                'MaxPrice'] = max_price

        # set initial docker commands
        if docker_commands:
            init_config = resources['SpotInstanceLaunchTemplate']['Metadata'][
                'AWS::CloudFormation::Init']
            init_config['docker_container_config']['files'][
                '/tmp/docker/docker_commands.sh']['content'] = docker_commands

        return yaml.dump(template, Dumper=CfnYamlDumper)
Exemplo n.º 4
0
    def run(self, output: AbstractOutputWriter):
        """Create an AMI by deploying the ``create_ami.yaml`` stack.

        Reads the instance type, region, AMI name and optional key name from
        ``self._config['instance']``, creates a CloudFormation stack from the
        template and waits until the stack leaves ``CREATE_IN_PROGRESS``.

        Args:
            output: writer used to report progress and the final result.

        Raises:
            ValueError: if the instance type is not a GPU instance, if an
                AMI with the configured name already exists, or if the stack
                does not reach ``CREATE_COMPLETE``.
        """
        instance_config = self._config['instance']

        # only GPU instance types are accepted
        instance_type = instance_config['instanceType']
        if not is_gpu_instance(instance_type):
            raise ValueError('"%s" is not a GPU instance' % instance_type)

        region = instance_config['region']
        cf = boto3.client('cloudformation', region_name=region)
        ec2 = boto3.client('ec2', region_name=region)

        # the AMI name must not be taken yet
        ami_name = instance_config['amiName']
        name_filter = {'Name': 'name', 'Values': [ami_name]}
        existing_images = ec2.describe_images(Filters=[name_filter])
        if len(existing_images['Images']) > 0:
            raise ValueError('AMI with name "%s" already exists.' % ami_name)

        # load the CF template
        with open(data_dir('create_ami.yaml')) as template_file:
            template = yaml.load(template_file, Loader=CfnYamlLoader)

        stack_params = [
            {'ParameterKey': 'InstanceType', 'ParameterValue': instance_type},
            {'ParameterKey': 'ImageName', 'ParameterValue': ami_name},
        ]

        # the key is optional: either pass it as a stack parameter or strip
        # every mention of it from the template
        key_name = instance_config.get('keyName', '')
        if key_name:
            stack_params.append({'ParameterKey': 'KeyName', 'ParameterValue': key_name})
        else:
            del template['Parameters']['KeyName']
            launch_template = template['Resources']['SpotInstanceLaunchTemplate']
            del launch_template['Properties']['LaunchTemplateData']['KeyName']

        # create the stack
        stack_name = 'spotty-nvidia-docker-ami-%s' % random_string(8)
        created = cf.create_stack(
            StackName=stack_name,
            TemplateBody=yaml.dump(template, Dumper=CfnYamlDumper),
            Parameters=stack_params,
            Capabilities=['CAPABILITY_IAM'],
            OnFailure='DELETE',
        )

        output.write('Waiting for the AMI to be created...')

        # progress messages reported for individual stack resources
        progress_messages = [
            ('InstanceProfile', 'creating IAM role for the instance'),
            ('SpotInstance', 'launching the instance'),
            ('InstanceReadyWaitCondition', 'installing NVIDIA Docker'),
            ('AMICreatedWaitCondition', 'creating AMI and terminating the instance'),
        ]

        # wait until the stack is no longer being created
        status, stack = wait_stack_status_changed(
            cf,
            stack_id=created['StackId'],
            waiting_status='CREATE_IN_PROGRESS',
            resource_messages=progress_messages,
            resource_success_status='CREATE_COMPLETE',
            output=output,
        )

        if status != 'CREATE_COMPLETE':
            raise ValueError('Stack "%s" was not created.\n'
                             'See CloudFormation and CloudWatch logs for details.' % stack_name)

        new_ami_ids = [
            stack_output['OutputValue'] for stack_output in stack['Outputs']
            if stack_output['OutputKey'] == 'NewAMI'
        ]
        ami_id = new_ami_ids[0]

        output.write('\n'
                     '--------------------\n'
                     'AMI "%s" (ID=%s) was successfully created.\n'
                     'Use "spotty start" command to run a Spot Instance.\n'
                     '--------------------' % (ami_name, ami_id))