def download(self, download_filters: list, output: AbstractOutputWriter, dry_run=False):
    """Download files from the instance to the local machine through the S3 bucket.

    Args:
        download_filters: filters passed to both transfer helpers
        output: output writer
        dry_run: forwarded to the bucket lookup and to both transfer helpers
    """
    # create or get existing bucket for the project
    bucket = self.instance_deployment.bucket
    bucket_name = bucket.get_or_create_bucket(output, dry_run)

    # step 1: sync files from the instance to a temporary S3 directory
    output.write('Uploading files from the instance to S3 bucket...')
    upload_from_instance_to_s3(
        download_filters,
        self.get_ip_address(),
        self.ssh_port,
        self.ssh_user,
        self.ssh_key_path,
        dry_run=dry_run,
    )

    # step 2: sync the project with the S3 bucket
    output.write('Downloading files from S3 bucket to local...')
    download_from_s3_to_local(
        bucket_name,
        self.instance_config.name,
        self.project_config.project_dir,
        self.instance_config.region,
        download_filters,
        dry_run=dry_run,
    )
def delete_stack(self, output: AbstractOutputWriter, stack_id=None):
    """Delete the AMI stack and wait for the deletion to be completed.

    Args:
        output: output writer
        stack_id: ID of the stack to delete (for older versions of Spotty);
            when it's not set, the stack returned by ``self.get_stack()`` is used

    Raises:
        ValueError: if the stack status is not "DELETE_COMPLETE" after waiting
    """
    if stack_id:
        ami_stack = Stack.get_by_name(self._cf, stack_id)
    else:
        ami_stack = self.get_stack()

    # delete the image
    ami_stack.delete()
    output.write('Waiting for the AMI to be deleted...')

    # wait for the deletion to be completed
    with output.prefix(' '):
        ami_stack = ami_stack.wait_status_changed(
            waiting_status='DELETE_IN_PROGRESS',
            resource_messages=[],
            resource_success_status='DELETE_COMPLETE',
            output=output,
        )

    if ami_stack.status != 'DELETE_COMPLETE':
        raise ValueError('Stack "%s" not deleted.\n'
                         'See CloudFormation and CloudWatch logs for details.' % stack_id)

    output.write('\n'
                 '-----------------------------\n'
                 'AMI was successfully deleted.\n'
                 '-----------------------------')
def _delete_snapshot(snapshot: Snapshot, output: AbstractOutputWriter):
    """Try to delete a previous snapshot and report the result.

    Any exception is reported through ``output`` instead of being raised.

    Args:
        snapshot: snapshot to delete
        output: output writer
    """
    try:
        snapshot.delete()
        output.write('- previous snapshot "%s" was deleted' % snapshot.name)
    except Exception as err:
        details = (snapshot.name, str(err))
        output.write('- previous snapshot "%s" was not deleted. Error: %s' % details)
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Start the instance, or only a container on an already running instance.

    Args:
        instance_manager: manager of the instance to operate on
        args: parsed arguments; ``dry_run`` and ``container`` are read
        output: output writer

    Raises:
        InstanceNotRunningError: if ``args.container`` is set but the instance
            is not running
    """
    dry_run = args.dry_run

    def name_suffix():
        # the instance name is only added when the project has several instances
        if len(instance_manager.project_config.instances) > 1:
            return ' ' + instance_manager.instance_config.name
        return ''

    if args.container:
        # check that the instance is started
        if not instance_manager.is_running():
            raise InstanceNotRunningError(instance_manager.instance_config.name)

        # start a container on the running instance
        instance_manager.start_container(output, dry_run=dry_run)
        if dry_run:
            return

        output.write('\nContainer was successfully started.\n'
                     'Use the "spotty sh%s" command to connect to the container.\n' % name_suffix())
        return

    # start the instance
    with output.prefix('[dry-run] ' if dry_run else ''):
        instance_manager.start(output, dry_run)

    if dry_run:
        return

    suffix = name_suffix()
    output.write('\n%s\n'
                 '\nUse the "spotty sh%s" command to connect to the container.\n'
                 % (instance_manager.get_status_text(), suffix))
def delete_stack(self, output: AbstractOutputWriter):
    """Delete the deployment stack and wait until it's deleted.

    Does nothing if the stack doesn't exist.

    Args:
        output: output writer

    Raises:
        ValueError: if any step of the deletion fails; the original exception
            is attached as the cause
    """
    stack = Stack.get_by_name(self._dm, self._stack_name)
    if not stack:
        return

    output.write('Waiting for the stack to be deleted...')

    # delete the stack
    try:
        if stack.is_running:
            # stop an ongoing operation first to make sure the delete method
            # won't raise an error "Resource '...' has an ongoing conflicting operation"
            stack.stop()

        # if the docker-waiter resource is still waiting for a signal, send a failure signal
        # to be able to delete the stack
        resource = DMResource.get_by_name(self._dm, self._stack_name, self._DOCKER_WAITER_RESOURCE_NAME)

        # the lookup may return nothing when the resource doesn't exist (yet);
        # in that case there is no waiter to unblock and the stack can be deleted right away
        if resource and resource.is_in_progress:
            self._rtc.set_value(self._DOCKER_STATUS_CONFIG_RESOURCE_NAME, '/failure/1', '1')

            # wait until the stack will be created or will fail
            stack.wait_stack_done()

        stack.delete()
        stack.wait_stack_deleted()
    except Exception as e:
        raise ValueError('Stack "%s" was not deleted. Error: %s\n'
                         'See Deployment Manager logs for details.' % (self._stack_name, str(e))) from e
def _delete_ec2_volume(ec2_volume: Volume, output: AbstractOutputWriter):
    """Try to delete an EC2 volume and report the result.

    Any exception is reported through ``output`` instead of being raised.

    Args:
        ec2_volume: volume to delete
        output: output writer
    """
    try:
        ec2_volume.delete()
        output.write('- volume "%s" was deleted' % ec2_volume.name)
    except Exception as err:
        details = (ec2_volume.name, str(err))
        output.write('- volume "%s" was not deleted. Error: %s' % details)
def delete(self, output: AbstractOutputWriter):
    """Delete the AMI after an interactive confirmation.

    Args:
        output: output writer

    Raises:
        ValueError: if the "amiId" parameter is set, if the AMI stack doesn't
            exist and the AMI cannot be found by name, or if the AMI has no
            "spotty:stack-id" tag value
    """
    config = self.instance_config

    # check that the "amiId" parameter is not set
    if config.ami_id:
        raise ValueError(
            'The "amiId" parameter cannot be used for deleting an AMI.')

    # check if the AMI stack exists
    stack_id = None
    if not self.stack.get_stack():
        # try to get the stack ID from the AMI tags (for older versions of Spotty)
        ami = Image.get_by_name(self._ec2, config.ami_name)
        if not ami:
            raise ValueError('AMI with the name "%s" not found.' % config.ami_name)

        stack_id = ami.get_tag_value('spotty:stack-id')
        if not stack_id:
            raise ValueError('AMI "%s" wasn\'t created by Spotty.' % config.ami_name)

    # ask user to confirm the deletion
    answer = input('AMI "%s" will be deleted.\n'
                   'Type "y" to confirm: ' % config.ami_name)
    if answer == 'y':
        self.stack.delete_stack(output, stack_id=stack_id)
    else:
        output.write('You didn\'t confirm the operation.')
def download(self, download_filters: list, output: AbstractOutputWriter, dry_run=False):
    """Download files from the instance to the local machine through the bucket.

    Args:
        download_filters: filters passed to the data transfer helper
        output: output writer
        dry_run: when set, the upload command is built with ``dry_run=True``
            and the bucket-to-local download step is skipped

    Raises:
        ValueError: if the remote upload command exits with a non-zero code
    """
    # get the project bucket name
    bucket_name = self.bucket_manager.get_bucket().name

    # sync files from the instance to a temporary S3 directory
    output.write('Uploading files from the instance to the bucket...')
    transfer = self.data_transfer
    remote_cmd = transfer.get_upload_instance_to_bucket_command(
        bucket_name=bucket_name,
        download_filters=download_filters,
        use_sudo=not self.instance_config.container_config.run_as_host_user,
        dry_run=dry_run,
    )
    logging.debug('Remote sync command: ' + remote_cmd)

    # execute the command on the host OS
    if self.exec(remote_cmd) != 0:
        raise ValueError(
            'Failed to upload files from the instance to the bucket')

    if dry_run:
        return

    # sync the project with the S3 bucket
    output.write('Downloading files from the bucket to local...')
    self.data_transfer.download_bucket_to_local(
        bucket_name=bucket_name,
        download_filters=download_filters,
    )
def create_stack(self, template: str, output: AbstractOutputWriter):
    """Deploys a Deployment Manager template and waits for its resources.

    Args:
        template: template passed to ``Stack.create``
        output: output writer
    """
    # create a stack
    Stack.create(self._dm, self._stack_name, template)

    output.write('Waiting for the stack to be created...')

    # messages are printed in this order while the resources are being created
    resource_messages = OrderedDict([
        (self._INSTANCE_RESOURCE_NAME, 'launching the instance'),
        (self._DOCKER_WAITER_RESOURCE_NAME, 'running the Docker container'),
    ])

    # wait for the stack to be created
    with output.prefix(' '):
        wait_resources(self._dm, self._ce, self._stack_name, resource_messages,
                       instance_resource_name=self._INSTANCE_RESOURCE_NAME,
                       machine_name=self._machine_name, output=output)
def download(self, download_filters: list, output: AbstractOutputWriter, dry_run=False):
    """Download files from the instance to the local project directory with rsync.

    Args:
        download_filters: filters passed to the rsync command builder
        output: output writer
        dry_run: forwarded to the rsync command builder

    Raises:
        ValueError: if the rsync command exits with a non-zero code
    """
    output.write('Downloading files from the instance...')

    # check rsync is installed
    check_rsync_installed()

    # build the command that syncs the project with the instance
    command_args = dict(
        local_dir=self.project_config.project_dir,
        remote_dir=self.instance_config.host_project_dir,
        ssh_user=self.ssh_user,
        ssh_host=self.ssh_host,
        ssh_key_path=self.ssh_key_path,
        ssh_port=self.ssh_port,
        filters=download_filters,
        use_sudo=not self.instance_config.container_config.run_as_host_user,
        dry_run=dry_run,
    )
    rsync_cmd = get_download_command(**command_args)

    # execute the command locally
    logging.debug('rsync command: ' + rsync_cmd)
    if subprocess.call(rsync_cmd, shell=True) != 0:
        raise ValueError('Failed to download files from the instance.')
def wait_resources(dm: DMClient, ce: CEClient, deployment_name: str, resource_messages: OrderedDict,
                   instance_resource_name: str, machine_name: str, output: AbstractOutputWriter, delay: int = 5):
    """Wait until every listed deployment resource is created, one after another.

    For each entry of ``resource_messages`` the message is printed and the
    resource is polled every ``delay`` seconds until it reports being created.

    Args:
        dm: Deployment Manager client
        ce: Compute Engine client
        deployment_name: name of the deployment to watch
        resource_messages: ordered mapping "resource name -> progress message";
            must contain ``instance_resource_name``
        instance_resource_name: name of the instance resource
        machine_name: name used to look up the instance once its resource is created
        output: output writer
        delay: seconds to sleep before each poll

    Raises:
        ValueError: if the deployment reports an error, if the instance is
            missing or terminated after its resource was created, or if a
            resource fails
    """
    # make sure that the instance resource is in the messages list
    assert any(resource_name == instance_resource_name for resource_name, _ in resource_messages.items())

    created_resources = set()
    for resource_name, message in resource_messages.items():
        output.write('- %s...' % message)

        is_created = False
        while not is_created:
            sleep(delay)

            # get the resource info
            try:
                # check that the deployment is not failed
                stack = Stack.get_by_name(dm, deployment_name)
                if stack.error:
                    raise ValueError('Deployment "%s" failed.\n'
                                     'Error: %s' % (deployment_name, stack.error['message']))

                # check if the instance was preempted, terminated or deleted right after creation
                if instance_resource_name in created_resources:
                    instance = Instance.get_by_name(ce, machine_name)
                    if not instance or instance.is_terminated:
                        raise ValueError(
                            'Error: the instance was unexpectedly terminated. Please, check out the '
                            'instance logs to find out the reason.\n')

                # get resource
                resource = DMResource.get_by_name(dm, deployment_name, resource_name)
            except (ConnectionResetError, ServerNotFoundError):
                # connection problems are logged and the poll is retried
                logging.warning('Connection problem')
                continue

            # resource doesn't exist yet
            if not resource:
                continue

            # resource failed
            if resource.is_failed:
                if resource.error_message:
                    error_msg = 'Error: ' + resource.error_message
                else:
                    # plain string: it has no placeholder, so it must not be %-formatted
                    error_msg = 'Please, see Deployment Manager logs for the details.'

                raise ValueError('Deployment "%s" failed.\n%s' % (deployment_name, error_msg))

            # resource was successfully created
            is_created = resource.is_created

        created_resources.add(resource_name)
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Stop the instance and print a confirmation message.

    Args:
        instance_manager: manager of the instance to stop
        args: parsed arguments (not read here)
        output: output writer
    """
    instance_manager.stop(output)

    confirmation = ('\n'
                    '----------------------------------\n'
                    'Instance was successfully deleted.\n'
                    '----------------------------------')
    output.write(confirmation)
def stop(self, only_shutdown: bool, output: AbstractOutputWriter):
    """Shut down the instance or delete its whole deployment.

    Args:
        only_shutdown: when true, the instance is only stopped; otherwise the
            deployment is deleted
        output: output writer
    """
    deployment = self.instance_deployment

    if not only_shutdown:
        # delete the stack and apply deletion policies
        deployment.delete(output)
        return

    output.write('Shutting down the instance... ', newline=False)
    deployment.get_instance().stop()
    output.write('DONE')
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Download files matching ``args.filters`` from the instance.

    Args:
        instance_manager: manager of the instance to download from
        args: parsed arguments; ``filters`` and ``dry_run`` are read
        output: output writer
    """
    dry_run = args.dry_run

    # exclude everything first, then include only the requested patterns
    exclude_all = {'exclude': ['*']}
    include_requested = {'include': args.filters}
    filters = [exclude_all, include_requested]

    prefix = '[dry-run] ' if dry_run else ''
    with output.prefix(prefix):
        instance_manager.download(filters, output, dry_run)

    output.write('Done')
def _get_template_parameters(self, instance_profile_arn: str, instance_name: str, bucket_name: str,
                             sync_filters: list, volumes: List[AbstractInstanceVolume],
                             container: ContainerDeployment, output: AbstractOutputWriter, dry_run=False):
    """Build the dictionary of parameters for the instance stack template.

    Args:
        instance_profile_arn: value for the "InstanceProfileArn" parameter
        instance_name: instance name used for the "UploadS3Path" parameter
        bucket_name: bucket name used for the S3 path parameters
        sync_filters: filters turned into the "SyncCommandArgs" parameter
        volumes: instance volumes; their mount directories are read
        container: container deployment that provides the Docker parameters
        output: output writer
        dry_run: forwarded to the key pair lookup

    Returns:
        dict with the template parameters.

    Raises:
        ValueError: if the configured root volume size is less than the AMI size
    """
    config = self.instance_config

    # get VPC ID
    vpc_id = self.get_vpc_id()

    # get image info
    ami = self._get_ami()
    output.write('- AMI: "%s" (%s)' % (ami.name, ami.image_id))

    # check root volume size
    root_volume_size = config.root_volume_size
    if not root_volume_size:
        # if a root volume size is not specified, make it 5GB larger than the AMI size
        root_volume_size = ami.size + 5
    elif root_volume_size < ami.size:
        raise ValueError('Root volume size cannot be less than the size of AMI (%dGB).' % ami.size)

    # create key pair
    key_name = self.key_pair.get_or_create_key(dry_run)

    # get mount directories for the volumes
    mount_dirs = [volume.mount_dir for volume in volumes]

    # get Docker runtime parameters
    is_gpu = is_gpu_instance(config.instance_type)
    runtime_parameters = container.get_runtime_parameters(is_gpu)

    # print info about the Docker data root
    if config.docker_data_root:
        matching_volume_names = [volume.name for volume in volumes
                                 if is_subdir(config.docker_data_root, volume.mount_dir)]
        output.write('- Docker data will be stored on the "%s" volume' % matching_volume_names[0])

    # mount directories are passed as a single string of quoted paths
    quoted_mount_dirs = ('"%s"' % '" "'.join(mount_dirs)) if mount_dirs else ''

    return {
        'VpcId': vpc_id,
        'InstanceProfileArn': instance_profile_arn,
        'InstanceType': config.instance_type,
        'KeyName': key_name,
        'ImageId': ami.image_id,
        'RootVolumeSize': str(root_volume_size),
        'VolumeMountDirectories': quoted_mount_dirs,
        'DockerDataRootDirectory': config.docker_data_root,
        'DockerImage': container.config.image,
        'DockerfilePath': container.dockerfile_path,
        'DockerBuildContextPath': container.docker_context_path,
        'DockerRuntimeParameters': runtime_parameters,
        'DockerWorkingDirectory': container.config.working_dir,
        'InstanceNameTag': self.ec2_instance_name,
        'ProjectS3Path': get_project_s3_path(bucket_name),
        'HostProjectDirectory': container.host_project_dir,
        'SyncCommandArgs': list2cmdline(get_instance_sync_arguments(sync_filters)),
        'UploadS3Path': get_tmp_instance_s3_path(bucket_name, instance_name),
    }
def get_or_create_bucket(self, output: AbstractOutputWriter, dry_run=False):
    """Return the name of the project bucket, creating the bucket if it's missing.

    Args:
        output: output writer
        dry_run: when set, a new bucket name is generated and reported but the
            bucket itself is not created

    Returns:
        Name of the existing or newly created bucket.
    """
    bucket_name = self._find_bucket()
    if bucket_name:
        return bucket_name

    bucket_name = '-'.join([self._bucket_prefix, random_string(12), self._region])
    if not dry_run:
        create_args = {'ACL': 'private', 'Bucket': bucket_name}

        # S3 rejects an explicit "us-east-1" location constraint, so the bucket
        # configuration is only sent for the other regions
        # (see https://github.com/boto/boto3/issues/125)
        if self._region != 'us-east-1':
            create_args['CreateBucketConfiguration'] = {'LocationConstraint': self._region}

        self._s3.create_bucket(**create_args)

    output.write('Bucket "%s" was created.' % bucket_name)

    return bucket_name
def _get_instance_id(instances: List[dict], instance_name: str, output: AbstractOutputWriter):
    """Return the index of the selected instance in ``instances``.

    When ``instance_name`` is given, the instance is looked up by its "name"
    key. Otherwise the only instance is used, or the user is asked to pick one
    if there are several.

    Args:
        instances: list of instance configurations (dicts with a "name" key)
        instance_name: name of the instance, may be empty
        output: output writer

    Returns:
        Zero-based index of the instance.

    Raises:
        ValueError: if the name is not found or the entered number is invalid
    """
    if instance_name:
        # get instance ID by name
        matches = [idx for idx, instance in enumerate(instances) if instance['name'] == instance_name]
        if not matches:
            raise ValueError('Instance "%s" not found in the configuration file' % instance_name)

        return matches[0]

    if len(instances) <= 1:
        return 0

    # ask user to choose the instance
    output.write('Select the instance:\n')
    with output.prefix(' '):
        for position, instance_config in enumerate(instances, start=1):
            output.write('[%d] %s' % (position, instance_config['name']))
    output.write()

    try:
        num = int(input('Enter number: '))
        output.write()
    except ValueError:
        # a non-numeric answer is treated as an out-of-range choice
        num = 0

    if not 1 <= num <= len(instances):
        raise ValueError('The value from 1 to %d was expected.' % len(instances))

    return num - 1
def create_or_update_instance_profile(cf, output: AbstractOutputWriter):
    """Create or update the instance profile stack and return the profile ARN.

    The instance profile was moved to a separate stack because creating
    an instance profile resource takes 2 minutes.

    Args:
        cf: CloudFormation client
        output: output writer

    Returns:
        Value of the "ProfileArn" output of the stack.

    Raises:
        ClientError: if updating the stack fails with a code other than
            "ValidationError"
        ValueError: if the final stack status is neither "CREATE_COMPLETE"
            nor "UPDATE_COMPLETE"
    """
    stack_name = 'spotty-instance-profile'

    with open(data_dir('create_instance_profile.yaml')) as template_file:
        template = template_file.read()

    if not stack_exists(cf, stack_name):
        output.write('Creating IAM role for the instance...')
        created = cf.create_stack(
            StackName=stack_name,
            TemplateBody=template,
            Capabilities=['CAPABILITY_IAM'],
            OnFailure='DELETE',
        )

        # wait for the stack to be created
        cf.get_waiter('stack_create_complete').wait(StackName=created['StackId'],
                                                    WaiterConfig={'Delay': 10})
    else:
        updated = None
        try:
            updated = cf.update_stack(
                StackName=stack_name,
                TemplateBody=template,
                Capabilities=['CAPABILITY_IAM'],
            )
        except ClientError as err:
            # a "ValidationError" is tolerated and the update is skipped;
            # any other error code is re-raised
            if err.response.get('Error', {}).get('Code', 'Unknown') != 'ValidationError':
                raise err

        if updated:
            output.write('Updating IAM role for the instance...')

            # wait for the stack to be updated
            cf.get_waiter('stack_update_complete').wait(StackName=updated['StackId'],
                                                        WaiterConfig={'Delay': 10})

    info = cf.describe_stacks(StackName=stack_name)['Stacks'][0]
    if info['StackStatus'] not in ['CREATE_COMPLETE', 'UPDATE_COMPLETE']:
        raise ValueError('Stack "%s" failed.\n'
                         'Please, see CloudFormation logs for the details.' % stack_name)

    profile_arns = [row['OutputValue'] for row in info['Outputs'] if row['OutputKey'] == 'ProfileArn']

    return profile_arns[0]
def sync(self, output: AbstractOutputWriter, dry_run=False):
    """Sync the local project with the bucket and then with the instance.

    Args:
        output: output writer
        dry_run: when set, it's forwarded to the bucket lookup and the
            local-to-bucket sync, and the bucket-to-instance step is skipped
    """
    filters = self.project_config.sync_filters

    # create or get existing bucket for the project
    bucket_name = self.instance_deployment.bucket.get_or_create_bucket(output, dry_run)

    # sync the project with the bucket
    output.write('Syncing the project with the bucket...')
    sync_local_to_bucket(self.project_config.project_dir, bucket_name, filters, dry_run)

    if dry_run:
        return

    # sync the bucket with the instance
    output.write('Syncing the bucket with the instance...')
    sync_bucket_to_instance(
        self.project_config.sync_filters,
        self.get_ip_address(),
        self.ssh_port,
        self.ssh_user,
        self.ssh_key_path,
    )
def _get_volume_resource(ec2, volume: EbsVolume, output: AbstractOutputWriter):
    """Build the "AWS::EC2::Volume" template resource for a new EBS volume.

    If a snapshot with the volume's name exists, the volume is restored from
    it; otherwise an empty volume is described.

    Args:
        ec2: EC2 client used for the snapshot lookup
        volume: EBS volume configuration
        output: output writer

    Returns:
        dict with the volume resource definition.

    Raises:
        ValueError: if the specified size is less than the snapshot size, or
            if no size is specified for an empty volume
    """
    volume_name = volume.ec2_volume_name

    # the "Name" tag carries the name of the new volume
    properties = {
        'AvailabilityZone': {
            'Fn::GetAtt': ['Instance', 'AvailabilityZone'],
        },
        'Tags': [{
            'Key': 'Name',
            'Value': volume_name,
        }],
        'VolumeType': volume.type,
    }

    # check if the snapshot exists and restore the volume from it
    snapshot = Snapshot.get_by_name(ec2, volume.ec2_volume_name)
    if not snapshot:
        # empty volume will be created, check that the size is specified
        if not volume.size:
            raise ValueError('Size for the new volume is required.')

        output.write('- volume "%s" will be created' % volume.ec2_volume_name)
    else:
        # volume will be restored from the snapshot, check size of the volume
        if volume.size and (volume.size < snapshot.size):
            raise ValueError(
                'Specified size for the "%s" volume (%dGB) is less than size of the '
                'snapshot (%dGB).' % (volume.name, volume.size, snapshot.size))

        # set snapshot ID
        properties['SnapshotId'] = snapshot.snapshot_id

        output.write('- volume "%s" will be restored from the snapshot' % volume.ec2_volume_name)

    # set size of the volume
    if volume.size:
        properties['Size'] = volume.size

    return {
        'Type': 'AWS::EC2::Volume',
        'DeletionPolicy': 'Retain',
        'Properties': properties,
    }
def run(self, args: Namespace, output: AbstractOutputWriter):
    """Print spot instance prices for an instance type, sorted by price.

    Args:
        args: parsed arguments; ``region`` and ``instance_type`` are read
        output: output writer
    """
    if args.region:
        regions = [args.region]
    else:
        # get all regions
        described = boto3.client('ec2').describe_regions()
        regions = [row['RegionName'] for row in described['Regions']]

    instance_type = args.instance_type
    output.write('Getting spot instance prices for "%s"...\n' % instance_type)

    prices = []
    for region in regions:
        regional_ec2 = boto3.client('ec2', region_name=region)
        zone_prices = get_spot_prices(regional_ec2, instance_type)
        prices.extend([(price, zone) for zone, price in zone_prices.items()])

    # sort availability zones by price
    prices.sort(key=lambda entry: entry[0])

    if not prices:
        output.write('Spot instances of this type are not available.')
        return

    output.write('Price Zone')
    for price, zone in prices:
        output.write('%.04f %s' % (price, zone))
def run(self, output: AbstractOutputWriter):
    """Print spot instance prices for the requested instance type in all regions.

    The instance type is read from ``self._args.instance_type``.

    Args:
        output: output writer

    Raises:
        ValueError: if the instance type is not valid
    """
    # get all regions
    described = boto3.client('ec2').describe_regions()
    regions = [row['RegionName'] for row in described['Regions']]

    instance_type = self._args.instance_type
    if not is_valid_instance_type(instance_type):
        raise ValueError('Instance type "%s" doesn\'t exist.' % instance_type)

    output.write('Getting spot instance prices for "%s"...\n' % instance_type)

    prices = []
    for region in regions:
        regional_ec2 = boto3.client('ec2', region_name=region)
        zone_prices = get_spot_prices(regional_ec2, instance_type)
        prices.extend([(price, zone) for zone, price in zone_prices.items()])

    # sort availability zones by price
    prices.sort(key=lambda entry: entry[0])

    if not prices:
        output.write('Spot instances of this type are not available.')
        return

    output.write('Price Zone')
    for price, zone in prices:
        output.write('%.04f %s' % (price, zone))
def _get_volume_resources(volumes: List[AbstractInstanceVolume], output: AbstractOutputWriter):
    """Build template resources that create and attach the EBS volumes.

    Existing EC2 volumes are attached as they are; for missing ones a volume
    resource is added. Volumes that are not ``EbsVolume`` instances are skipped.

    Args:
        volumes: instance volumes
        output: output writer

    Returns:
        dict "resource name -> resource definition".

    Raises:
        ValueError: if an existing volume is not available or its size doesn't
            match the specified size
    """
    # ending letters for the devices (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/device_naming.html)
    # TODO: different device names on Nitro-based instances,
    # see: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/nvme-ebs-volumes.html
    device_letters = 'fghijklmnop'

    resources = {}

    # create and attach volumes
    for position, volume in enumerate(volumes):
        if not isinstance(volume, EbsVolume):
            continue

        # the letter is picked by the volume's position among all volumes
        letter = device_letters[position]
        resource_suffix = letter.upper()

        existing = volume.get_ec2_volume()
        if not existing:
            # create Volume resource
            volume_resource_name = 'Volume' + resource_suffix
            resources[volume_resource_name] = _get_volume_resource(volume, output)
            volume_id = {'Ref': volume_resource_name}
        else:
            # check if the volume is available
            if not existing.is_available():
                raise ValueError(
                    'EBS volume "%s" is not available (state: %s).' % (volume.ec2_volume_name, existing.state))

            # check size of the volume
            if volume.size and (volume.size != existing.size):
                raise ValueError(
                    'Specified size for the "%s" volume (%dGB) doesn\'t match the size of the '
                    'existing volume (%dGB).' % (volume.name, volume.size, existing.size))

            output.write('- volume "%s" (%s) will be attached' % (existing.name, existing.volume_id))
            volume_id = existing.volume_id

        # create VolumeAttachment resource
        attachment = _get_volume_attachment_resource(volume_id, '/dev/sd' + letter)
        resources['VolumeAttachment' + resource_suffix] = attachment

    return resources
def _create_stack(self, template: str, output: AbstractOutputWriter):
    """Creates the stack and waits until it will be created.

    Args:
        template: body of the stack template
        output: output writer
    """
    output.write('Creating IAM role for the instance...')

    stack_args = {
        'StackName': self._stack_name,
        'TemplateBody': template,
        'Capabilities': ['CAPABILITY_IAM'],
        'OnFailure': 'DELETE',
    }
    new_stack = Stack.create_stack(cf=self._cf, **stack_args)

    # wait for the stack to be created
    new_stack.wait_stack_created(delay=15)
def get_or_create_bucket(self, output: AbstractOutputWriter, tags: list, dry_run=False):
    """Return the name of the project bucket, creating and tagging it if it's missing.

    Args:
        output: output writer
        tags: tag set applied to a newly created bucket
        dry_run: when set, a new bucket name is generated and reported but the
            bucket itself is neither created nor tagged

    Returns:
        Name of the existing or newly created bucket.
    """
    existing_name = self._find_bucket()
    if existing_name:
        return existing_name

    name_parts = [self._bucket_prefix, random_string(12), self._region]
    bucket_name = '-'.join(name_parts)

    if not dry_run:
        # a fix for the boto3 issue: https://github.com/boto/boto3/issues/125
        # (no bucket configuration is sent for the "us-east-1" region)
        if self._region == 'us-east-1':
            location_args = {}
        else:
            location_args = {'CreateBucketConfiguration': {'LocationConstraint': self._region}}

        self._s3.create_bucket(ACL='private', Bucket=bucket_name, **location_args)
        self._s3.put_bucket_tagging(Bucket=bucket_name, Tagging={'TagSet': tags})

    output.write('Bucket "%s" was created.' % bucket_name)

    return bucket_name
def get_or_create_bucket(self, output: AbstractOutputWriter, dry_run=False):
    """Return the name of the project bucket, creating it if it's missing.

    A newly created bucket also gets the ``BUCKET_SYNC_DIR`` directory.

    Args:
        output: output writer
        dry_run: when set, a new bucket name is generated and reported but
            nothing is created

    Returns:
        Name of the existing or newly created bucket.
    """
    existing_name = self._find_bucket()
    if existing_name:
        return existing_name

    name_parts = [self._bucket_prefix, random_string(12), self._region]
    bucket_name = '-'.join(name_parts)

    if not dry_run:
        storage = self._gs
        storage.create_bucket(bucket_name, self._region)
        storage.create_dir(bucket_name, BUCKET_SYNC_DIR)

    output.write('Bucket "%s" was created.' % bucket_name)

    return bucket_name
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Sync the project with a running instance.

    Args:
        instance_manager: manager of the instance to sync with
        args: parsed arguments; ``dry_run`` is read
        output: output writer

    Raises:
        InstanceNotRunningError: if the instance is not running
    """
    # check that the instance is started
    if not instance_manager.is_running():
        raise InstanceNotRunningError(
            instance_manager.instance_config.name)

    dry_run = args.dry_run
    prefix = '[dry-run] ' if dry_run else ''

    with output.prefix(prefix):
        try:
            instance_manager.sync(output, dry_run)
        except NothingToDoError as nothing_to_do:
            # the reason is printed and "Done" is not reported
            output.write(str(nothing_to_do))
            return

    output.write('Done')
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Start the instance and print how to connect to the Docker container.

    Args:
        instance_manager: manager of the instance to start
        args: parsed arguments; ``dry_run`` is read
        output: output writer
    """
    dry_run = args.dry_run
    prefix = '[dry-run] ' if dry_run else ''

    # start the instance
    with output.prefix(prefix):
        instance_manager.start(output, dry_run)

    if dry_run:
        return

    # the instance name is only added when the project has several instances
    if len(instance_manager.project_config.instances) > 1:
        name_suffix = ' ' + instance_manager.instance_config.name
    else:
        name_suffix = ''

    output.write(
        '\nThe instance was successfully started.\n'
        '\n%s\n'
        '\nUse the "spotty ssh%s" command to connect to the Docker container.\n'
        % (instance_manager.get_status_text(), name_suffix))
def delete_stack(self, output: AbstractOutputWriter, no_wait=False):
    """Delete the stack, optionally waiting until it's deleted.

    Does nothing if the stack doesn't exist.

    Args:
        output: output writer
        no_wait: when set, the deletion is only started: nothing is printed
            and the method doesn't wait for the stack to be deleted

    Raises:
        ValueError: if deleting the stack or waiting for the deletion fails
    """
    stack = Stack.get_by_name(self._cf, self._stack_name)
    if not stack:
        return

    wait_for_deletion = not no_wait
    if wait_for_deletion:
        output.write('Waiting for the stack to be deleted...')

    # delete the stack
    try:
        stack.delete()
        if wait_for_deletion:
            stack.wait_stack_deleted()
    except Exception as err:
        raise ValueError('Stack "%s" was not deleted. Error: %s\n'
                         'See CloudFormation logs for details.' % (self._stack_name, str(err)))
def get_template_parameters(ec2, instance_config: InstanceConfig, instance_profile_arn: str, bucket_name: str,
                            key_pair_name: str, output: AbstractOutputWriter):
    """Build the dictionary of parameters for the instance stack template.

    Args:
        ec2: EC2 client used for the AMI and VPC lookups
        instance_config: instance configuration
        instance_profile_arn: value for the "InstanceProfileArn" parameter
        bucket_name: bucket name used for the "LogsS3Path" parameter
        key_pair_name: value for the "KeyName" parameter
        output: output writer

    Returns:
        dict with the template parameters.

    Raises:
        ValueError: if the configured root volume size is less than the AMI size
    """
    # get AMI
    ami = get_ami(ec2, instance_config.ami_id, instance_config.ami_name)
    output.write('- AMI: "%s" (%s)' % (ami.name, ami.image_id))

    # check root volume size
    root_volume_size = instance_config.root_volume_size
    if not root_volume_size:
        # if a root volume size is not specified, make it 5GB larger than the AMI size
        root_volume_size = ami.size + 5
    elif root_volume_size < ami.size:
        raise ValueError(
            'Root volume size cannot be less than the size of AMI (%dGB).' % ami.size)

    # only EBS volumes are considered for the Docker data root
    ebs_volumes = []
    for volume in instance_config.volumes:
        if isinstance(volume, EbsVolume):
            ebs_volumes.append(volume)

    # print info about the Docker data root
    if instance_config.docker_data_root:
        matching_volume_names = [volume.name for volume in ebs_volumes
                                 if is_subdir(instance_config.docker_data_root, volume.mount_dir)]
        output.write('- Docker data will be stored on the "%s" volume' % matching_volume_names[0])

    return {
        'VpcId': get_vpc_id(ec2, instance_config.subnet_id),
        'InstanceProfileArn': instance_profile_arn,
        'InstanceType': instance_config.instance_type,
        'KeyName': key_pair_name,
        'ImageId': ami.image_id,
        'RootVolumeSize': str(root_volume_size),
        'DockerDataRootDirectory': instance_config.docker_data_root,
        'InstanceNameTag': instance_config.ec2_instance_name,
        'HostProjectDirectory': instance_config.host_project_dir,
        'LogsS3Path': get_logs_s3_path(bucket_name, instance_config.name),
    }