def verify_stack_status(stack_name, waiting_states, successful_states):
    """
    Wait for the stack creation to be completed and notify if the stack creation fails.

    :param stack_name: the stack name that we should verify
    :param waiting_states: list of statuses to wait for
    :param successful_states: list of final statuses considered successful
    :return: True if the final status is in the successful_states list, False otherwise.
    """
    from pcluster.aws.aws_api import AWSApi  # pylint: disable=import-outside-toplevel

    status = AWSApi.instance().cfn.describe_stack(stack_name).get("StackStatus")
    resource_status = ""
    while status in waiting_states:
        status = AWSApi.instance().cfn.describe_stack(stack_name).get("StackStatus")
        events = AWSApi.instance().cfn.get_stack_events(stack_name)["StackEvents"][0]
        resource_status = (
            "Status: %s - %s" % (events.get("LogicalResourceId"), events.get("ResourceStatus"))
        ).ljust(80)
        sys.stdout.write("\r%s" % resource_status)
        sys.stdout.flush()
        time.sleep(5)
    # Print the last status update in the logs
    if resource_status != "":
        LOGGER.debug(resource_status)
    return status in successful_states
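# Hedged usage sketch (not from the source): poll a stack created elsewhere until it
# settles. The state names are standard CloudFormation stack statuses; the stack name
# is illustrative.
if not verify_stack_status(
    "parallelcluster-mycluster",
    waiting_states=["CREATE_IN_PROGRESS"],
    successful_states=["CREATE_COMPLETE"],
):
    LOGGER.error("Stack creation failed.")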
def upload_archive(bucket: str, bucket_prefix: str, archive_path: str):
    """Upload the archive at archive_path to the bucket and return its S3 URI."""
    archive_filename = os.path.basename(archive_path)
    with open(archive_path, "rb") as archive_file:
        archive_data = archive_file.read()
    AWSApi.instance().s3.put_object(bucket, archive_data, f"{bucket_prefix}/{archive_filename}")
    return f"s3://{bucket}/{bucket_prefix}/{archive_filename}"
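# Illustrative call (assumption, not from the source): upload a local tarball and keep
# the returned S3 URI for later reference. Bucket name, prefix, and path are placeholders.
archive_uri = upload_archive("my-bucket", "artifacts/node-packages", "/tmp/aws-parallelcluster-node.tgz")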
def delete_s3_artifacts(self):
    """Clean up the S3 bucket artifact directory."""
    LOGGER.debug(
        "Cleaning up S3 resources bucket_name=%s, service_name=%s, remove_artifact=%s",
        self.name,
        self._service_name,
        self._cleanup_on_deletion,
    )
    if self.artifact_directory and self._cleanup_on_deletion:
        try:
            LOGGER.info("Deleting artifacts under %s/%s", self.name, self.artifact_directory)
            AWSApi.instance().s3_resource.delete_object(bucket_name=self.name, prefix=f"{self.artifact_directory}/")
            AWSApi.instance().s3_resource.delete_object_versions(
                bucket_name=self.name, prefix=f"{self.artifact_directory}/"
            )
        except AWSClientError as e:
            LOGGER.warning(
                "Failed to delete S3 artifacts under %s/%s with error %s. Please delete them manually.",
                self.name,
                self.artifact_directory,
                str(e),
            )
def delete(self, force=False):  # noqa: C901
    """Delete the CFN stack and associated resources, and deregister the image."""
    if force or (not self._check_instance_using_image() and not self._check_image_is_shared()):
        try:
            if AWSApi.instance().cfn.stack_exists(self.image_id):
                if self.stack.imagebuilder_image_is_building:
                    raise BadRequestImageBuilderActionError(
                        "Image cannot be deleted because EC2 ImageBuilder Image has a running workflow."
                    )
                # Delete stack
                AWSApi.instance().cfn.delete_stack(self.image_id)

            if AWSApi.instance().ec2.image_exists(image_id=self.image_id, build_status_avaliable=False):
                # Deregister image
                AWSApi.instance().ec2.deregister_image(self.image.id)

                # Delete snapshots
                for snapshot_id in self.image.snapshot_ids:
                    AWSApi.instance().ec2.delete_snapshot(snapshot_id)

            # Delete S3 image directory
            try:
                self.bucket.check_bucket_exists()
                self.bucket.delete_s3_artifacts()
            except AWSClientError:
                logging.warning("S3 bucket %s does not exist, skipping image S3 artifacts deletion.", self.bucket.name)

            # Delete log group
            try:
                AWSApi.instance().logs.delete_log_group(self._log_group_name)
            except AWSClientError:
                logging.warning("Unable to delete log group %s.", self._log_group_name)
        except (AWSClientError, ImageError) as e:
            raise _imagebuilder_error_mapper(e, f"Unable to delete image and stack, due to {str(e)}")
def _ec2_run_instance(self, availability_zone: str, **kwargs):  # noqa: C901 FIXME!!!
    """Wrap the ec2 run_instances call. Useful since a successful dry-run call raises a 'DryRunOperation' error."""
    try:
        AWSApi.instance().ec2.run_instances(**kwargs)
    except AWSClientError as e:
        code = e.error_code
        message = str(e)
        subnet_id = kwargs["NetworkInterfaces"][0]["SubnetId"]
        if code == "UnsupportedOperation":
            if "does not support specifying CpuOptions" in message:
                # str.replace returns a new string, so the result must be reassigned
                message = message.replace("specifying CpuOptions", "disabling simultaneous multithreading")
            self._add_failure(message, FailureLevel.ERROR)
        elif code == "InstanceLimitExceeded":
            self._add_failure(
                "You've reached the limit on the number of instances you can run concurrently "
                f"for the configured instance type. {message}",
                FailureLevel.ERROR,
            )
        elif code == "InsufficientInstanceCapacity":
            self._add_failure(f"There is not enough capacity to fulfill your request. {message}", FailureLevel.ERROR)
        elif code == "InsufficientFreeAddressesInSubnet":
            self._add_failure(
                "The specified subnet does not contain enough free private IP addresses "
                f"to fulfill your request. {message}",
                FailureLevel.ERROR,
            )
        elif code == "InvalidParameterCombination":
            if "associatePublicIPAddress" in message:
                # Instances with multiple network interfaces cannot currently take public IPs.
                # This check is meant to warn users about this problem until services are fixed.
                self._add_failure(
                    f"The instance type {kwargs['InstanceType']} cannot take public IPs. "
                    f"Please make sure that the subnet with id '{subnet_id}' has the proper routing configuration "
                    "to allow private IPs reaching the Internet (e.g. a NAT Gateway and a valid route table).",
                    FailureLevel.WARNING,
                )
        elif code == "Unsupported" and availability_zone not in AWSApi.instance().ec2.get_supported_az_for_instance_type(
            kwargs["InstanceType"]
        ):
            # If an availability zone without the desired instance type is selected, the error code is "Unsupported",
            # so we need to spell out the specific problem ourselves.
            qualified_az = AWSApi.instance().ec2.get_supported_az_for_instance_type(kwargs["InstanceType"])
            self._add_failure(
                f"Your requested instance type ({kwargs['InstanceType']}) is not supported in the "
                f"Availability Zone ({availability_zone}) of your requested subnet ({subnet_id}). "
                f"Please retry your request by choosing a subnet in {qualified_az}. ",
                FailureLevel.ERROR,
            )
        else:
            self._add_failure(
                f"Unable to validate configuration parameters for instance type {kwargs['InstanceType']}. "
                f"Please double check your cluster configuration. {message}",
                FailureLevel.ERROR,
            )
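# Hedged sketch (assumption, not from the source) of the kwargs a validator might pass to
# _ec2_run_instance: the request mirrors the EC2 RunInstances shape with DryRun set, so a
# successful validation surfaces as a 'DryRunOperation' error inside the wrapper. All
# concrete values are illustrative.
run_instances_kwargs = {
    "InstanceType": "c5.xlarge",
    "MinCount": 1,
    "MaxCount": 1,
    "DryRun": True,
    "NetworkInterfaces": [{"DeviceIndex": 0, "SubnetId": "subnet-0123456789abcdef0"}],
}
# Inside the validator: self._ec2_run_instance("us-east-1a", **run_instances_kwargs)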
def upload_bootstrapped_file(self):
    """Upload the bootstrapped file marking that the bucket is configured successfully."""
    AWSApi.instance().s3.put_object(
        bucket_name=self.name,
        body="bucket is configured successfully.",
        key="/".join([self._root_directory, self._bootstrapped_file_name]),
    )
def _upload_file(self, content, file_name, file_type, format=S3FileFormat.YAML):
    """Upload file to S3 bucket."""
    if format == S3FileFormat.YAML:
        body = yaml.dump(content)
    elif format == S3FileFormat.JSON:
        body = json.dumps(content)
    else:
        body = content
    return AWSApi.instance().s3.put_object(
        bucket_name=self.name,
        body=body,
        key=self.get_object_key(file_type, file_name),
    )
def _validate(self, security_group_ids: List[str]):
    if security_group_ids:
        for sg_id in security_group_ids:
            try:
                AWSApi.instance().ec2.describe_security_group(sg_id)
            except AWSClientError as e:
                self._add_failure(str(e), FailureLevel.ERROR)
def check_bucket_is_bootstrapped(self):
    """Check whether the bucket is configured successfully, using the bootstrapped file as a marker."""
    AWSApi.instance().s3.head_object(
        bucket_name=self.name,
        object_name="/".join([self._root_directory, self._bootstrapped_file_name]),
    )
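# Hypothetical bootstrap flow combining check_bucket_is_bootstrapped with
# upload_bootstrapped_file above; "bucket" stands for the object exposing both methods.
# It assumes head_object raises AWSClientError when the marker object is absent,
# consistent with the error handling elsewhere in this module.
try:
    bucket.check_bucket_is_bootstrapped()
except AWSClientError:
    bucket.upload_bootstrapped_file()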
def _get_supported_batch_instance_types():
    """
    Get the instance types supported by Batch in the desired region.

    This is done by calling Batch's CreateComputeEnvironment with a bad instance type and parsing the error message.
    """
    supported_instance_types = AWSApi.instance().ec2.list_instance_types()
    supported_instance_families = _get_instance_families_from_types(supported_instance_types)
    known_exceptions = ["optimal"]
    supported_instance_types_and_families = supported_instance_types + supported_instance_families + known_exceptions
    try:
        parsed_instance_types_and_families = AWSApi.instance().batch.get_supported_instance_types_and_families()
        if _batch_instance_types_and_families_are_supported(
            parsed_instance_types_and_families, supported_instance_types_and_families
        ):
            supported_batch_types = parsed_instance_types_and_families
        else:
            supported_batch_types = supported_instance_types_and_families
    except Exception as e:
        # When the instance types supported by Batch can't be parsed from an error message,
        # log the reason for the failure and return instead a list of all instance types
        # supported in the region.
        LOGGER.debug(
            "Failed to parse supported Batch instance types from a CreateComputeEnvironment error message: %s", e
        )
        supported_batch_types = supported_instance_types_and_families
    return supported_batch_types
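# Hedged sketch of the _get_instance_families_from_types helper referenced above; the
# real implementation may differ. An instance family is the part of the type name before
# the dot (e.g. "c5" for "c5.xlarge").
def _get_instance_families_from_types(instance_types):
    return list({instance_type.split(".")[0] for instance_type in instance_types})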
def _validate(self, subnet_ids: List[str]):
    try:
        subnets = AWSApi.instance().ec2.describe_subnets(subnet_ids=subnet_ids)

        # Check that all subnets are in the same VPC
        vpc_id = None
        for subnet in subnets:
            if vpc_id is None:
                vpc_id = subnet["VpcId"]
            elif vpc_id != subnet["VpcId"]:
                self._add_failure(
                    "Subnet {0} is not in VPC {1}. Please make sure all subnets are in the same VPC.".format(
                        subnet["SubnetId"], vpc_id
                    ),
                    FailureLevel.ERROR,
                )

        # Check for DNS support in the VPC
        if not AWSApi.instance().ec2.is_enable_dns_support(vpc_id):
            self._add_failure(f"DNS Support is not enabled in the VPC {vpc_id}.", FailureLevel.ERROR)
        if not AWSApi.instance().ec2.is_enable_dns_hostnames(vpc_id):
            self._add_failure(f"DNS Hostnames are not enabled in the VPC {vpc_id}.", FailureLevel.ERROR)
    except AWSClientError as e:
        self._add_failure(str(e), FailureLevel.ERROR)
def _validate(self, hosted_zone_id, cluster_vpc, cluster_name):
    if AWSApi.instance().route53.is_hosted_zone_private(hosted_zone_id):
        vpc_ids = AWSApi.instance().route53.get_hosted_zone_vpcs(hosted_zone_id)
        if cluster_vpc not in vpc_ids:
            self._add_failure(
                f"Private Route53 hosted zone {hosted_zone_id} needs to be associated with "
                f"the VPC of the cluster: {cluster_vpc}. "
                f"The VPCs associated with the hosted zone are {vpc_ids}.",
                FailureLevel.ERROR,
            )
    else:
        self._add_failure(
            f"Hosted zone {hosted_zone_id} cannot be used. "
            f"Public Route53 hosted zones are not officially supported by ParallelCluster.",
            FailureLevel.ERROR,
        )
    domain_name = AWSApi.instance().route53.get_hosted_zone_domain_name(hosted_zone_id)
    total_length = len(cluster_name) + len(domain_name)
    if total_length > CLUSTER_NAME_AND_CUSTOM_DOMAIN_NAME_MAX_LENGTH:
        self._add_failure(
            (
                "Error: When specifying HostedZoneId, "
                f"the total length of cluster name {cluster_name} and domain name {domain_name} cannot be "
                f"longer than {CLUSTER_NAME_AND_CUSTOM_DOMAIN_NAME_MAX_LENGTH} characters; "
                f"the current length is {total_length}."
            ),
            FailureLevel.ERROR,
        )
def _validate(self, placement_group_id: str):
    if placement_group_id:
        try:
            AWSApi.instance().ec2.describe_placement_group(placement_group_id)
        except AWSClientError as e:
            self._add_failure(str(e), FailureLevel.ERROR)
def __init__(self, resource_id, log_group_name, bucket, output_dir, bucket_prefix=None, keep_s3_objects=False):
    # Check that the bucket is in the same region as the resource
    bucket_region = AWSApi.instance().s3.get_bucket_region(bucket_name=bucket)
    if bucket_region != get_region():
        raise LogsExporterError(
            f"The bucket used for exporting logs must be in the same region as the {resource_id}. "
            f"The given resource is in {get_region()}, but the bucket's region is {bucket_region}."
        )
    self.bucket = bucket
    self.log_group_name = log_group_name
    self.output_dir = output_dir
    self.keep_s3_objects = keep_s3_objects

    if bucket_prefix:
        self.bucket_prefix = bucket_prefix
        self.delete_everything_under_prefix = False
    else:
        # If the default bucket prefix is being used and there's nothing underneath that prefix already,
        # then we can delete everything under that prefix after downloading the data
        # (unless keep-s3-objects is specified).
        self.bucket_prefix = f"{resource_id}-logs-{datetime.datetime.now().strftime('%Y%m%d%H%M')}"
        self.delete_everything_under_prefix = AWSApi.instance().s3_resource.is_empty(bucket, self.bucket_prefix)
def execute(self, log_stream_prefix=None, start_time: datetime.datetime = None, end_time: datetime.datetime = None):
    """Start the export task. Returns the logs streams folder."""
    # Export logs to S3
    task_id = self._export_logs_to_s3(log_stream_prefix=log_stream_prefix, start_time=start_time, end_time=end_time)
    LOGGER.info("Log export task id: %s", task_id)
    # Download exported S3 objects to a subfolder of the output dir
    try:
        log_streams_dir = os.path.join(self.output_dir, "cloudwatch-logs")
        self._download_s3_objects_with_prefix(task_id, log_streams_dir)
        LOGGER.info("Archive of CloudWatch logs saved to %s", self.output_dir)
    except OSError:
        raise LogsExporterError(
            "Unable to download archived logs from S3. Please double-check that your filters are correct."
        )
    finally:
        if not self.keep_s3_objects:
            if self.delete_everything_under_prefix:
                delete_key = self.bucket_prefix
            else:
                delete_key = "/".join((self.bucket_prefix, task_id))
            LOGGER.debug("Cleaning up S3 bucket %s. Deleting all objects under %s", self.bucket, delete_key)
            AWSApi.instance().s3_resource.delete_objects(bucket_name=self.bucket, prefix=delete_key)
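# Illustrative end-to-end use of the exporter whose __init__ and execute methods appear
# above. The enclosing class name is not shown in this excerpt, so CloudWatchLogsExporter
# is an assumption, and every argument value is a placeholder.
exporter = CloudWatchLogsExporter(
    resource_id="mycluster",
    log_group_name="/aws/parallelcluster/mycluster",
    bucket="my-log-export-bucket",
    output_dir="/tmp/mycluster-logs",
)
exporter.execute(log_stream_prefix="ip-10-0-0")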
def list_log_streams(self, next_token: str = None):
    """
    List the image builder's log streams.

    :param next_token: Token for paginated requests.
    :returns LogStreams
    """
    try:
        log_streams = []
        if AWSApi.instance().logs.log_group_exists(self._log_group_name):
            LOGGER.debug("Listing log streams from log group %s", self._log_group_name)
            log_stream_resp = AWSApi.instance().logs.describe_log_streams(
                log_group_name=self._log_group_name, next_token=next_token
            )
            log_streams.extend(log_stream_resp["logStreams"])
            next_token = log_stream_resp.get("nextToken")
        else:
            LOGGER.debug("Log Group %s doesn't exist.", self._log_group_name)
            raise NotFoundImageBuilderActionError(
                f"Unable to find image logs, please double check if image id={self.image_id} is correct."
            )
        return LogStreams(log_streams, next_token)
    except AWSClientError as e:
        raise ImageBuilderActionError(f"Unexpected error when retrieving image's logs: {e}")
def _download_s3_objects_with_prefix(self, task_id, destdir):
    """Download all objects in the bucket with the given prefix into destdir."""
    prefix = f"{self.bucket_prefix}/{task_id}"
    LOGGER.debug("Downloading exported logs from s3 bucket %s (under key %s) to %s", self.bucket, prefix, destdir)
    for archive_object in AWSApi.instance().s3_resource.get_objects(bucket_name=self.bucket, prefix=prefix):
        decompressed_path = os.path.dirname(os.path.join(destdir, archive_object.key))
        decompressed_path = decompressed_path.replace(f"{prefix}{os.path.sep}", "")
        compressed_path = f"{decompressed_path}.gz"

        LOGGER.debug("Downloading object with key=%s to %s", archive_object.key, compressed_path)
        os.makedirs(os.path.dirname(compressed_path), exist_ok=True)
        AWSApi.instance().s3_resource.download_file(
            bucket_name=self.bucket, key=archive_object.key, output=compressed_path
        )

        # Create a decompressed copy of the downloaded archive and remove the original
        LOGGER.debug("Extracting object at %s to %s", compressed_path, decompressed_path)
        with gzip.open(compressed_path) as gfile, open(decompressed_path, "wb") as outfile:
            outfile.write(gfile.read())
        os.remove(compressed_path)
def _validate_instance_type(self, instance_type: str):
    """Validate the instance type and return its supported architectures (empty list if unsupported)."""
    if instance_type not in AWSApi.instance().ec2.list_instance_types():
        self._add_failure(
            f"The instance type '{instance_type}' is not supported.",
            FailureLevel.ERROR,
        )
        return []
    return AWSApi.instance().ec2.get_supported_architectures(instance_type)
def disable_awsbatch_compute_environment(self):
    """Disable the AWS Batch compute environment."""
    LOGGER.info("Disabling AWS Batch compute environment: %s", self.name)
    try:
        AWSApi.instance().batch.disable_compute_environment(ce_name=self.stack.batch_compute_environment)
    except Exception as e:
        raise _cluster_error_mapper(e, f"Unable to disable Batch compute environment. {str(e)}")
def _validate_no_existing_image(self):
    """Validate that no existing image or stack with the same ImageBuilder image_id exists."""
    if AWSApi.instance().ec2.image_exists(self.image_id):
        raise ConflictImageBuilderActionError(f"ParallelCluster image {self.image_id} already exists.")

    if AWSApi.instance().cfn.stack_exists(self.image_id):
        raise ConflictImageBuilderActionError(
            f"ParallelCluster build infrastructure for image {self.image_id} already exists."
        )
def _validate(self, backup_id):
    if backup_id:
        try:
            AWSApi.instance().fsx.describe_backup(backup_id)
        except AWSClientError as e:
            self._add_failure(
                "Failed to retrieve backup with Id '{0}': {1}".format(backup_id, str(e)),
                FailureLevel.ERROR,
            )
def _update_stack_template(self, template_url):
    """Update template of the running stack according to updated template."""
    try:
        AWSApi.instance().cfn.update_stack_from_url(self.stack_name, template_url)
        self._wait_for_stack_update()
    except AWSClientError as e:
        if "no updates are to be performed" in str(e).lower():
            # If updated_template was the same as the stack's current one, consider the update a success
            return
        raise e
def _validate(self, url):
    if get_url_scheme(url) == "s3":
        try:
            bucket = get_bucket_name_from_s3_url(url)
            AWSApi.instance().s3.head_bucket(bucket_name=bucket)
        except AWSClientError as e:
            self._add_failure(str(e), FailureLevel.ERROR)
    else:
        self._add_failure(f"The value '{url}' is not a valid S3 URI.", FailureLevel.ERROR)
def _validate(self, file_system_id, head_node_subnet_id):
    try:
        # Check whether there are existing mount targets on the file system
        file_system = AWSApi.instance().fsx.get_filesystem_info(file_system_id).file_system_data
        vpc_id = AWSApi.instance().ec2.get_subnet_vpc(head_node_subnet_id)

        # Check whether the file system is in the same VPC as the stack
        if file_system.get("VpcId") != vpc_id:
            self._add_failure(
                "Currently only FSx file systems that are in the same VPC as the cluster are supported. "
                "The file system provided is in {0}.".format(file_system.get("VpcId")),
                FailureLevel.ERROR,
            )

        # If there is an existing mount target in the AZ, check the inbound and outbound rules of its security groups
        network_interface_ids = file_system.get("NetworkInterfaceIds")
        if not network_interface_ids:
            self._add_failure(
                "Unable to validate FSx security groups. The given FSx file system '{0}' doesn't have "
                "Elastic Network Interfaces attached to it.".format(file_system_id),
                FailureLevel.ERROR,
            )
        else:
            network_interface_responses = AWSApi.instance().ec2.describe_network_interfaces(network_interface_ids)

            fs_access = False
            network_interfaces = [ni for ni in network_interface_responses if ni.get("VpcId") == vpc_id]
            for network_interface in network_interfaces:
                # Get the list of security group IDs
                sg_ids = [sg.get("GroupId") for sg in network_interface.get("Groups")]
                if _check_in_out_access(sg_ids, port=988):
                    fs_access = True
                    break
            if not fs_access:
                self._add_failure(
                    "The current security group settings on file system '{0}' do not satisfy mounting requirements. "
                    "The file system must be associated to a security group that allows inbound and outbound "
                    "TCP traffic through port 988.".format(file_system_id),
                    FailureLevel.ERROR,
                )
    except AWSClientError as e:
        self._add_failure(str(e), FailureLevel.ERROR)
def get_log_events(
    self,
    log_stream_name: str,
    start_time: datetime = None,
    end_time: datetime = None,
    start_from_head: bool = False,
    limit: int = None,
    next_token: str = None,
):
    """
    Get the log stream events.

    :param log_stream_name: Log stream name
    :param start_time: Start time of interval of interest for log events. ISO 8601 format: YYYY-MM-DDThh:mm:ssTZD
    :param end_time: End time of interval of interest for log events. ISO 8601 format: YYYY-MM-DDThh:mm:ssTZD
    :param start_from_head: If the value is true, the earliest log events are returned first.
        If the value is false, the latest log events are returned first. The default value is false.
    :param limit: The maximum number of log events returned. If you don't specify a value,
        the maximum is as many log events as can fit in a response size of 1 MB, up to 10,000 log events.
    :param next_token: Token for paginated requests.
    """
    if not AWSApi.instance().cfn.stack_exists(self.stack_name):
        raise NotFoundClusterActionError(f"Cluster {self.name} does not exist.")

    try:
        log_events_response = AWSApi.instance().logs.get_log_events(
            log_group_name=self.stack.log_group_name,
            log_stream_name=log_stream_name,
            end_time=datetime_to_epoch(end_time) if end_time else None,
            start_time=datetime_to_epoch(start_time) if start_time else None,
            limit=limit,
            start_from_head=start_from_head,
            next_token=next_token,
        )
        return LogStream(self.stack_name, log_stream_name, log_events_response)
    except AWSClientError as e:
        if e.message.startswith("The specified log group"):
            LOGGER.debug("Log Group %s doesn't exist.", self.stack.log_group_name)
            raise NotFoundClusterActionError(f"CloudWatch logging is not enabled for cluster {self.name}.")
        if e.message.startswith("The specified log stream"):
            LOGGER.debug("Log Stream %s doesn't exist.", log_stream_name)
            raise NotFoundClusterActionError(f"The specified log stream {log_stream_name} does not exist.")
        raise _cluster_error_mapper(e, f"Unexpected error when retrieving log events: {e}.")
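# Hedged usage sketch (not from the source): fetch up to 100 events from one stream
# within a time window. "cluster" stands for the object exposing get_log_events above;
# the stream name is illustrative, and datetime is assumed to be the datetime.datetime class.
from datetime import datetime

events = cluster.get_log_events(
    log_stream_name="ip-10-0-0-1.i-0123456789abcdef0.cfn-init",
    start_time=datetime(2023, 1, 1),
    end_time=datetime(2023, 1, 2),
    limit=100,
)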
def _validate(self, key_name: str):
    if key_name:
        try:
            AWSApi.instance().ec2.describe_key_pair(key_name)
        except AWSClientError as e:
            self._add_failure(str(e), FailureLevel.ERROR)
    else:
        self._add_failure(
            "If you do not specify a key pair, you can't connect to the instance unless you choose an AMI "
            "that is configured to allow users another way to log in.",
            FailureLevel.WARNING,
        )
def create(
    self,
    disable_rollback: bool = True,
    validator_suppressors: Set[ValidatorSuppressor] = None,
    validation_failure_level: FailureLevel = FailureLevel.ERROR,
):
    """Create the CFN stack and associated resources."""
    suppressed_validation_failures = self.validate_create_request(validator_suppressors, validation_failure_level)

    # Generate artifact directory for image
    self._generate_artifact_dir()

    creation_result = None
    artifacts_uploaded = False
    try:
        self._upload_config()

        LOGGER.info("Building ParallelCluster image: %s", self.image_id)

        # Generate cdk cfn template
        self.template_body = CDKTemplateBuilder().build_imagebuilder_template(
            image_config=self.config, image_id=self.image_id, bucket=self.bucket
        )

        # Upload generated template
        self._upload_artifacts()
        artifacts_uploaded = True

        # Stack creation
        creation_result = AWSApi.instance().cfn.create_stack_from_url(
            stack_name=self.image_id,
            template_url=self.bucket.get_cfn_template_url(
                template_name=self._s3_artifacts_dict.get("template_name")
            ),
            disable_rollback=disable_rollback,
            tags=self._get_cfn_tags(),
            capabilities="CAPABILITY_NAMED_IAM",
        )

        self.__stack = ImageBuilderStack(AWSApi.instance().cfn.describe_stack(self.image_id))
        LOGGER.debug("StackId: %s", self.stack.id)
        LOGGER.info("Status: %s", self.stack.status)

        return suppressed_validation_failures
    except Exception as e:
        LOGGER.critical(e)
        if not creation_result and artifacts_uploaded:
            # Clean up S3 artifacts if the stack was not created
            self.bucket.delete_s3_artifacts()
        raise _imagebuilder_error_mapper(e, f"ParallelCluster image build infrastructure creation failed.\n{e}")
def export_stack_events(stack_name: str, output_file: str):
    """Save CFN stack events into a file."""
    stack_events = []
    chunk = AWSApi.instance().cfn.get_stack_events(stack_name)
    stack_events.append(chunk["StackEvents"])
    while chunk.get("nextToken"):
        chunk = AWSApi.instance().cfn.get_stack_events(stack_name, next_token=chunk["nextToken"])
        stack_events.append(chunk["StackEvents"])
    with open(output_file, "w", encoding="utf-8") as cfn_events_file:
        cfn_events_file.write(json.dumps(stack_events, cls=JSONEncoder, indent=2))
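# Illustrative invocation (names are placeholders): dump every event of a stack, across
# all pages, to a JSON file for offline inspection.
export_stack_events("parallelcluster-mycluster", "/tmp/mycluster-stack-events.json")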
def get_stack_events(self, next_token: str = None):
    """
    Get the CloudFormation stack events for the cluster.

    :param next_token: Start from next_token if provided.
    """
    try:
        if not AWSApi.instance().cfn.stack_exists(self.stack_name):
            raise NotFoundClusterActionError(f"Cluster {self.name} does not exist.")
        return AWSApi.instance().cfn.get_stack_events(self.stack_name, next_token=next_token)
    except AWSClientError as e:
        raise _cluster_error_mapper(e, f"Unexpected error when retrieving stack events: {e}")
def _validate(self, bucket):
    try:
        AWSApi.instance().s3.head_bucket(bucket_name=bucket)
        # Check that versioning is enabled on the bucket
        bucket_versioning_status = AWSApi.instance().s3.get_bucket_versioning_status(bucket)
        if bucket_versioning_status != "Enabled":
            self._add_failure(
                "The specified S3 bucket {0} cannot be used by the cluster because its versioning setting "
                "is '{1}' instead of 'Enabled'. Please enable bucket versioning.".format(
                    bucket, bucket_versioning_status
                ),
                FailureLevel.ERROR,
            )
    except AWSClientError as e:
        self._add_failure(str(e), FailureLevel.ERROR)