def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None: super().__init__(scope, _id, **kwargs) # Setup SSM parameter of credentials, bucket_para, ignore_list ssm_credential_para = ssm.StringParameter.from_secure_string_parameter_attributes( self, "ssm_parameter_credentials", parameter_name=ssm_parameter_credentials, version=1) ssm_bucket_para = ssm.StringParameter(self, "s3bucket_serverless", string_value=json.dumps( bucket_para, indent=4)) ssm_parameter_ignore_list = ssm.StringParameter( self, "s3_migrate_ignore_list", string_value=ignore_list) # Setup DynamoDB ddb_file_list = ddb.Table(self, "s3migrate_serverless", partition_key=ddb.Attribute( name="Key", type=ddb.AttributeType.STRING), billing_mode=ddb.BillingMode.PAY_PER_REQUEST) # Setup SQS sqs_queue_DLQ = sqs.Queue(self, "s3migrate_serverless_Q_DLQ", visibility_timeout=core.Duration.minutes(15), retention_period=core.Duration.days(14)) sqs_queue = sqs.Queue(self, "s3migrate_serverless_Q", visibility_timeout=core.Duration.minutes(15), retention_period=core.Duration.days(14), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=3, queue=sqs_queue_DLQ)) # Setup API for Lambda to get IP address (for debug networking routing purpose) checkip = api.RestApi( self, "lambda-checkip-api", cloud_watch_role=True, deploy=True, description="For Lambda get IP address", default_integration=api.MockIntegration( integration_responses=[ api.IntegrationResponse(status_code="200", response_templates={ "application/json": "$context.identity.sourceIp" }) ], request_templates={"application/json": '{"statusCode": 200}'}), endpoint_types=[api.EndpointType.REGIONAL]) checkip.root.add_method("GET", method_responses=[ api.MethodResponse( status_code="200", response_models={ "application/json": api.Model.EMPTY_MODEL }) ]) # Setup Lambda functions handler = lam.Function(self, "s3-migrate-worker", code=lam.Code.asset("./lambda"), handler="lambda_function_worker.lambda_handler", runtime=lam.Runtime.PYTHON_3_8, memory_size=1024, timeout=core.Duration.minutes(15), tracing=lam.Tracing.ACTIVE, environment={ 'table_queue_name': ddb_file_list.table_name, 'Des_bucket_default': Des_bucket_default, 'Des_prefix_default': Des_prefix_default, 'StorageClass': StorageClass, 'checkip_url': checkip.url, 'ssm_parameter_credentials': ssm_parameter_credentials }) handler_jobsender = lam.Function( self, "s3-migrate-jobsender", code=lam.Code.asset("./lambda"), handler="lambda_function_jobsender.lambda_handler", runtime=lam.Runtime.PYTHON_3_8, memory_size=1024, timeout=core.Duration.minutes(15), tracing=lam.Tracing.ACTIVE, environment={ 'table_queue_name': ddb_file_list.table_name, 'StorageClass': StorageClass, 'checkip_url': checkip.url, 'sqs_queue': sqs_queue.queue_name, 'ssm_parameter_credentials': ssm_parameter_credentials, 'ssm_parameter_ignore_list': ssm_parameter_ignore_list.parameter_name, 'ssm_parameter_bucket': ssm_bucket_para.parameter_name }) # Allow lambda read/write DDB, SQS ddb_file_list.grant_read_write_data(handler) ddb_file_list.grant_read_write_data(handler_jobsender) sqs_queue.grant_send_messages(handler_jobsender) # SQS trigger Lambda worker handler.add_event_source(SqsEventSource(sqs_queue, batch_size=1)) # Option1: Create S3 Bucket, all new objects in this bucket will be transmitted by Lambda Worker s3bucket = s3.Bucket(self, "s3_new_migrate") s3bucket.grant_read(handler) s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED, s3n.SqsDestination(sqs_queue)) # Option2: Allow Exist S3 Buckets to be read by Lambda functions. 
    # Lambda Jobsender will scan and compare these buckets and trigger Lambda Workers to transmit
    bucket_name = ''
    for b in bucket_para:
        if bucket_name != b['src_bucket']:  # Skip duplicates if the same bucket is listed more than once
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(
                self,
                bucket_name,  # Use the bucket name as the construct id
                bucket_name=bucket_name)
            s3exist_bucket.grant_read(handler_jobsender)
            s3exist_bucket.grant_read(handler)

    # Allow Lambda to read the SSM parameters
    ssm_bucket_para.grant_read(handler_jobsender)
    ssm_credential_para.grant_read(handler)
    ssm_credential_para.grant_read(handler_jobsender)
    ssm_parameter_ignore_list.grant_read(handler_jobsender)

    # Schedule a cron event to trigger Lambda Jobsender every hour:
    event.Rule(self, 'cron_trigger_jobsender',
               schedule=event.Schedule.rate(core.Duration.hours(1)),
               targets=[target.LambdaFunction(handler_jobsender)])

    # Create Lambda log metric filters to build network traffic metrics
    handler.log_group.add_metric_filter(
        "Complete-bytes",
        metric_name="Complete-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Complete", bytes, key]'))
    handler.log_group.add_metric_filter(
        "Uploading-bytes",
        metric_name="Uploading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Uploading", bytes, key]'))
    handler.log_group.add_metric_filter(
        "Downloading-bytes",
        metric_name="Downloading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Downloading", bytes, key]'))
    lambda_metric_Complete = cw.Metric(namespace="s3_migrate", metric_name="Complete-bytes",
                                       statistic="Sum", period=core.Duration.minutes(1))
    lambda_metric_Upload = cw.Metric(namespace="s3_migrate", metric_name="Uploading-bytes",
                                     statistic="Sum", period=core.Duration.minutes(1))
    lambda_metric_Download = cw.Metric(namespace="s3_migrate", metric_name="Downloading-bytes",
                                       statistic="Sum", period=core.Duration.minutes(1))
    handler.log_group.add_metric_filter(
        "ERROR",
        metric_name="ERROR-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"ERROR"'))
    handler.log_group.add_metric_filter(
        "WARNING",
        metric_name="WARNING-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"WARNING"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate", metric_name="ERROR-Logs",
                                 statistic="Sum", period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate", metric_name="WARNING-Logs",
                                   statistic="Sum", period=core.Duration.minutes(1))

    # Dashboard to monitor SQS and Lambda
    board = cw.Dashboard(self, "s3_migrate_serverless")
    board.add_widgets(
        cw.GraphWidget(title="Lambda-NETWORK",
                       left=[lambda_metric_Download, lambda_metric_Upload, lambda_metric_Complete]),
        # TODO: here monitor all lambda concurrency not just the working one.
Limitation from CDK # Lambda now supports monitor single lambda concurrency, will change this after CDK support cw.GraphWidget(title="Lambda-all-concurrent", left=[ handler.metric_all_concurrent_executions( period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="Lambda-invocations/errors/throttles", left=[ handler.metric_invocations( period=core.Duration.minutes(1)), handler.metric_errors(period=core.Duration.minutes(1)), handler.metric_throttles(period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="Lambda-duration", left=[ handler.metric_duration(period=core.Duration.minutes(1)) ]), ) board.add_widgets( cw.GraphWidget( title="SQS-Jobs", left=[ sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="SQS-DeadLetterQueue", left=[ sqs_queue_DLQ. metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.GraphWidget(title="ERROR/WARNING Logs", left=[log_metric_ERROR], right=[log_metric_WARNING]), cw.SingleValueWidget( title="Running/Waiting and Dead Jobs", metrics=[ sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)) ], height=6)) # Alarm for queue - DLQ alarm_DLQ = cw.Alarm( self, "SQS_DLQ", metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible( ), threshold=0, comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD, evaluation_periods=1, datapoints_to_alarm=1) alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter") alarm_topic.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic)) core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_serverless")
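# --- Illustration only (not part of the stack above) -------------------------
# The metric filters above use space-delimited patterns such as
# '[info, date, sn, p="--->Complete", bytes, key]': six whitespace-separated
# fields where the fourth must be the literal "--->Complete" and the fifth
# ("$bytes") is published as the metric value. The worker Lambda therefore has
# to emit log lines shaped roughly like the sample below; the exact logging
# format of lambda_function_worker.py is an assumption here.
def sample_complete_log_line(job_sn: str, size_bytes: int, key: str) -> str:
    # e.g. "INFO 2021-01-01T00:00:00 job-0001 --->Complete 1048576 prefix/object.bin"
    return f"INFO 2021-01-01T00:00:00 {job_sn} --->Complete {size_bytes} {key}"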
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) self.platform_resources = ImportedResources(self, self.stack_name) ### ECS_APP_NAME = "octicketing-microservice" ECS_DEPLOYMENT_GROUP_NAME = "octicketingECSBlueGreen" ECS_DEPLOYMENT_CONFIG_NAME = "CodeDeployDefault.ECSLinear10PercentEvery1Minutes" ECS_DEPLOYMENT_CONFIG_ALL = "CodeDeployDefault.ECSAllAtOnce" ECS_TASKSET_TERMINATION_WAIT_TIME = 10 ECS_TASK_FAMILY_NAME = "octicketing-service" ECS_APP_LOG_GROUP_NAME = "/ecs/" + ECS_TASK_FAMILY_NAME DUMMY_APP_NAME = "hello-world-microservice" DUMMY_TASK_FAMILY_NAME = "hello-world-service" DUMMY_APP_LOG_GROUP_NAME = "/ecs/dummy-" + ECS_TASK_FAMILY_NAME DUMMY_CONTAINER_IMAGE = self.account + ".dkr.ecr." + \ self.region + ".amazonaws.com/hello-world:latest" Dmmuyvare = "" # ============================================================================= # ECR and CodeCommit repositories for the Blue/ Green deployment # ============================================================================= # ECR repository for the docker images self.octicketing_ecr_repo = aws_ecr.Repository( self, "OcticketingECRRepo", repository_name=ECS_APP_NAME, removal_policy=core.RemovalPolicy.DESTROY) self.octicketing_code_repo = aws_codecommit.Repository( self, ECS_APP_NAME + "-bg", repository_name=ECS_APP_NAME + "-bg", description=ECS_APP_NAME + "blue-green service repository") core.CfnOutput(self, 'BGRepoName', value=self.octicketing_code_repo.repository_name, export_name='OcticketingBGRepoName') core.CfnOutput(self, 'BGRepoARN', value=self.octicketing_code_repo.repository_arn, export_name='OcticketingBGRepoARN') # ============================================================================= # CODE BUILD and ECS TASK ROLES for the Blue/ Green deployment # ============================================================================= # IAM role for the Code Build project codeBuildServiceRole = aws_iam.Role( self, "codeBuildServiceRole", assumed_by=aws_iam.ServicePrincipal('codebuild.amazonaws.com')) inlinePolicyForCodeBuild = aws_iam.PolicyStatement( effect=aws_iam.Effect.ALLOW, actions=[ "ecr:GetAuthorizationToken", "ecr:BatchCheckLayerAvailability", "ecr:InitiateLayerUpload", "ecr:BatchGetImage", "ecr:GetDownloadUrlForLayer", "ecr:UploadLayerPart", "ecr:CompleteLayerUpload", "ecr:PutImage" ], resources=["*"]) codeBuildServiceRole.add_to_policy(inlinePolicyForCodeBuild) # ECS task role ecsTaskRole = aws_iam.Role( self, "ecsTaskRoleForWorkshop", assumed_by=aws_iam.ServicePrincipal('ecs-tasks.amazonaws.com')) ecsTaskRole.add_managed_policy( aws_iam.ManagedPolicy.from_aws_managed_policy_name( "service-role/AmazonECSTaskExecutionRolePolicy")) ecsTaskRole.add_managed_policy( aws_iam.ManagedPolicy.from_aws_managed_policy_name( "SecretsManagerReadWrite")) # ============================================================================= # CODE DEPLOY APPLICATION for the Blue/ Green deployment # ============================================================================= # Creating the code deploy application codeDeployApplication = codedeploy.EcsApplication( self, "OcticketingCodeDeploy") # Creating the code deploy service role codeDeployServiceRole = aws_iam.Role( self, "codeDeployServiceRole", assumed_by=aws_iam.ServicePrincipal('codedeploy.amazonaws.com')) codeDeployServiceRole.add_managed_policy( aws_iam.ManagedPolicy.from_aws_managed_policy_name( "AWSCodeDeployRoleForECS")) # IAM role for custom lambda function customLambdaServiceRole = aws_iam.Role( self, 
"codeDeployCustomLambda", assumed_by=aws_iam.ServicePrincipal('lambda.amazonaws.com')) inlinePolicyForLambda = aws_iam.PolicyStatement( effect=aws_iam.Effect.ALLOW, actions=[ "iam:PassRole", "sts:AssumeRole", "codedeploy:List*", "codedeploy:Get*", "codedeploy:UpdateDeploymentGroup", "codedeploy:CreateDeploymentGroup", "codedeploy:DeleteDeploymentGroup" ], resources=["*"]) customLambdaServiceRole.add_managed_policy( aws_iam.ManagedPolicy.from_aws_managed_policy_name( 'service-role/AWSLambdaBasicExecutionRole')) customLambdaServiceRole.add_to_policy(inlinePolicyForLambda) # Custom resource to create the deployment group createDeploymentGroupLambda = aws_lambda.Function( self, 'createDeploymentGroupLambda', code=aws_lambda.Code.from_asset("custom_resources"), runtime=aws_lambda.Runtime.PYTHON_3_8, handler='create_deployment_group.handler', role=customLambdaServiceRole, description="Custom resource to create deployment group", memory_size=128, timeout=core.Duration.seconds(60)) # ================================================================================================ # CloudWatch Alarms for 4XX errors blue4xxMetric = aws_cloudwatch.Metric( namespace='AWS/ApplicationELB', metric_name='HTTPCode_Target_4XX_Count', dimensions={ "TargetGroup": self.platform_resources.blue_target_full_name, "LoadBalancer": self.platform_resources.alb_full_name }, statistic="sum", period=core.Duration.minutes(1)) self.blue_targetAlarm = aws_cloudwatch.Alarm( self, "blue4xxErrors", alarm_name="Blue_4xx_Alarm", alarm_description= "CloudWatch Alarm for the 4xx errors of Blue target group", metric=blue4xxMetric, threshold=1, evaluation_periods=1) green4xxMetric = aws_cloudwatch.Metric( namespace='AWS/ApplicationELB', metric_name='HTTPCode_Target_4XX_Count', dimensions={ "TargetGroup": self.platform_resources.green_target_full_name, "LoadBalancer": self.platform_resources.alb_full_name }, statistic="sum", period=core.Duration.minutes(1)) self.green_targetAlarm = aws_cloudwatch.Alarm( self, "green4xxErrors", alarm_name="Green_4xx_Alarm", alarm_description= "CloudWatch Alarm for the 4xx errors of Green target group", metric=green4xxMetric, threshold=1, evaluation_periods=1) # ================================================================================================ # DUMMY TASK DEFINITION for the initial service creation # This is required for the service being made available to create the CodeDeploy Deployment Group # ================================================================================================ sampleTaskDefinition = aws_ecs.FargateTaskDefinition( self, "sampleTaskDefn", family=DUMMY_TASK_FAMILY_NAME, cpu=256, memory_limit_mib=1024, task_role=ecsTaskRole, execution_role=ecsTaskRole) sampleContainerDefn = sampleTaskDefinition.add_container( "sampleAppContainer", image=aws_ecs.ContainerImage.from_registry(DUMMY_CONTAINER_IMAGE), logging=aws_ecs.AwsLogDriver(log_group=aws_logs.LogGroup( self, "sampleAppLogGroup", log_group_name=DUMMY_APP_LOG_GROUP_NAME, removal_policy=core.RemovalPolicy.DESTROY), stream_prefix=DUMMY_APP_NAME), docker_labels={"name": DUMMY_APP_NAME}) port_mapping = aws_ecs.PortMapping(container_port=8080, protocol=aws_ecs.Protocol.TCP) sampleContainerDefn.add_port_mappings(port_mapping) # ================================================================================================ # ECS task definition using ECR image # Will be used by the CODE DEPLOY for Blue/Green deployment # ================================================================================================ 
OcticketingTaskDef = aws_ecs.FargateTaskDefinition( self, "appTaskDefn", family=ECS_TASK_FAMILY_NAME, cpu=256, memory_limit_mib=1024, task_role=ecsTaskRole, execution_role=ecsTaskRole) # ============================================================================= octicketing_cont_def = OcticketingTaskDef.add_container( "OcticketingAppContainer", image=aws_ecs.ContainerImage.from_ecr_repository( self.octicketing_ecr_repo, "latest"), logging=aws_ecs.AwsLogDriver(log_group=aws_logs.LogGroup( self, "OcticketingAppLogGroup", log_group_name=ECS_APP_LOG_GROUP_NAME, removal_policy=core.RemovalPolicy.DESTROY), stream_prefix=ECS_APP_NAME), docker_labels={"name": ECS_APP_NAME}) octicketing_cont_def.add_port_mappings(port_mapping) # ============================================================================= # ECS SERVICE for the Blue/ Green deployment # ============================================================================= OcticketingAppService = aws_ecs.FargateService( self, "OcticketingAppService", cluster=self.platform_resources.ecs_cluster, task_definition=sampleTaskDefinition, health_check_grace_period=core.Duration.seconds(10), platform_version=aws_ecs.FargatePlatformVersion.VERSION1_4, desired_count=1, deployment_controller={ "type": aws_ecs.DeploymentControllerType.CODE_DEPLOY }, service_name=ECS_APP_NAME) OcticketingAppService.connections.allow_from( self.platform_resources.alb, aws_ec2.Port.tcp(80)) OcticketingAppService.connections.allow_from( self.platform_resources.alb, aws_ec2.Port.tcp(8080)) OcticketingAppService.attach_to_application_target_group( self.platform_resources.blue_target) # ============================================================================= # CODE DEPLOY - Deployment Group CUSTOM RESOURCE for the Blue/ Green deployment # ============================================================================= core.CustomResource( self, 'customEcsDeploymentGroup', service_token=createDeploymentGroupLambda.function_arn, properties={ "ApplicationName": codeDeployApplication.application_name, "DeploymentGroupName": ECS_DEPLOYMENT_GROUP_NAME, "DeploymentConfigName": ECS_DEPLOYMENT_CONFIG_NAME, "ServiceRoleArn": codeDeployServiceRole.role_arn, "BlueTargetGroup": self.platform_resources.blue_target_name, "GreenTargetGroup": self.platform_resources.green_target_name, "ProdListenerArn": self.platform_resources.prod_listener.listener_arn, "TestListenerArn": self.platform_resources.test_listener.listener_arn, "EcsClusterName": self.platform_resources.ecs_cluster.cluster_name, "EcsServiceName": OcticketingAppService.service_name, "TerminationWaitTime": ECS_TASKSET_TERMINATION_WAIT_TIME, "BlueGroupAlarm": self.blue_targetAlarm.alarm_name, "GreenGroupAlarm": self.green_targetAlarm.alarm_name, }) ecsDeploymentGroup = codedeploy.EcsDeploymentGroup.from_ecs_deployment_group_attributes( self, "ecsDeploymentGroup", application=codeDeployApplication, deployment_group_name=ECS_DEPLOYMENT_GROUP_NAME, deployment_config=codedeploy.EcsDeploymentConfig. 
from_ecs_deployment_config_name(self, "ecsDeploymentConfig", ECS_DEPLOYMENT_CONFIG_NAME)) # ============================================================================= # CODE BUILD PROJECT for the Blue/ Green deployment # ============================================================================= # Creating the code build project OcticketingAppcodebuild = aws_codebuild.Project( self, "OcticketingAppcodebuild", role=codeBuildServiceRole, environment=aws_codebuild.BuildEnvironment( build_image=aws_codebuild.LinuxBuildImage.STANDARD_4_0, compute_type=aws_codebuild.ComputeType.SMALL, privileged=True, environment_variables={ 'REPOSITORY_URI': { 'value': self.octicketing_ecr_repo.repository_uri, 'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT }, 'TASK_EXECUTION_ARN': { 'value': ecsTaskRole.role_arn, 'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT }, 'TASK_FAMILY': { 'value': ECS_TASK_FAMILY_NAME, 'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT } }), source=aws_codebuild.Source.code_commit( repository=self.octicketing_code_repo)) # ============================================================================= # CODE PIPELINE for Blue/Green ECS deployment # ============================================================================= codePipelineServiceRole = aws_iam.Role( self, "codePipelineServiceRole", assumed_by=aws_iam.ServicePrincipal('codepipeline.amazonaws.com')) inlinePolicyForCodePipeline = aws_iam.PolicyStatement( effect=aws_iam.Effect.ALLOW, actions=[ "iam:PassRole", "sts:AssumeRole", "codecommit:Get*", "codecommit:List*", "codecommit:GitPull", "codecommit:UploadArchive", "codecommit:CancelUploadArchive", "codebuild:BatchGetBuilds", "codebuild:StartBuild", "codedeploy:CreateDeployment", "codedeploy:Get*", "codedeploy:RegisterApplicationRevision", "s3:Get*", "s3:List*", "s3:PutObject" ], resources=["*"]) codePipelineServiceRole.add_to_policy(inlinePolicyForCodePipeline) sourceArtifact = codepipeline.Artifact('sourceArtifact') buildArtifact = codepipeline.Artifact('buildArtifact') # S3 bucket for storing the code pipeline artifacts OcticketingAppArtifactsBucket = s3.Bucket( self, "OcticketingAppArtifactsBucket", encryption=s3.BucketEncryption.S3_MANAGED, block_public_access=s3.BlockPublicAccess.BLOCK_ALL) # S3 bucket policy for the code pipeline artifacts denyUnEncryptedObjectUploads = aws_iam.PolicyStatement( effect=aws_iam.Effect.DENY, actions=["s3:PutObject"], principals=[aws_iam.AnyPrincipal()], resources=[OcticketingAppArtifactsBucket.bucket_arn + "/*"], conditions={ "StringNotEquals": { "s3:x-amz-server-side-encryption": "aws:kms" } }) denyInsecureConnections = aws_iam.PolicyStatement( effect=aws_iam.Effect.DENY, actions=["s3:*"], principals=[aws_iam.AnyPrincipal()], resources=[OcticketingAppArtifactsBucket.bucket_arn + "/*"], conditions={"Bool": { "aws:SecureTransport": "false" }}) OcticketingAppArtifactsBucket.add_to_resource_policy( denyUnEncryptedObjectUploads) OcticketingAppArtifactsBucket.add_to_resource_policy( denyInsecureConnections) # Code Pipeline - CloudWatch trigger event is created by CDK codepipeline.Pipeline( self, "ecsBlueGreen", role=codePipelineServiceRole, artifact_bucket=OcticketingAppArtifactsBucket, stages=[ codepipeline.StageProps( stage_name='Source', actions=[ aws_codepipeline_actions.CodeCommitSourceAction( action_name='Source', repository=self.octicketing_code_repo, output=sourceArtifact, ) ]), codepipeline.StageProps( stage_name='Build', actions=[ aws_codepipeline_actions.CodeBuildAction( action_name='Build', 
project=OcticketingAppcodebuild, input=sourceArtifact, outputs=[buildArtifact]) ]), codepipeline.StageProps( stage_name='Deploy', actions=[ aws_codepipeline_actions.CodeDeployEcsDeployAction( action_name='Deploy', deployment_group=ecsDeploymentGroup, app_spec_template_input=buildArtifact, task_definition_template_input=buildArtifact, ) ]) ]) # ============================================================================= # Export the outputs # ============================================================================= core.CfnOutput( self, "ecsBlueGreenCodeRepo", description="Demo app code commit repository", export_name="ecsBlueGreenDemoAppRepo", value=self.octicketing_code_repo.repository_clone_url_http)
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) # Read Lambda Code): try: with open( "serverless_stacks/lambda_src/konstone_custom_metric_log_generator.py", mode="r") as f: konstone_custom_metric_fn_code = f.read() except OSError: print("Unable to read Lambda Function Code") konstone_custom_metric_fn = _lambda.Function( self, "konstoneFunction", function_name="konstone_custom_metric_fn", runtime=_lambda.Runtime.PYTHON_3_7, handler="index.lambda_handler", code=_lambda.InlineCode(konstone_custom_metric_fn_code), timeout=core.Duration.seconds(3), reserved_concurrent_executions=1, environment={ "LOG_LEVEL": "INFO", "PERCENTAGE_ERRORS": "75" }) # Create Custom Loggroup # /aws/lambda/function-name konstone_custom_metric_lg = _logs.LogGroup( self, "konstoneLoggroup", log_group_name= f"/aws/lambda/{konstone_custom_metric_fn.function_name}", removal_policy=core.RemovalPolicy.DESTROY, retention=_logs.RetentionDays.ONE_DAY, ) # Create Custom Metric Namespace third_party_error_metric = _cloudwatch.Metric( namespace=f"third-party-error-metric", metric_name="third_party_error_metric", label="Total No. of Third Party API Errors", period=core.Duration.minutes(1), statistic="Sum") # Create Custom Metric Log Filter third_party_error_metric_filter = _logs.MetricFilter( self, "thirdPartyApiErrorMetricFilter", filter_pattern=_logs.FilterPattern.boolean_value( "$.third_party_api_error", True), log_group=konstone_custom_metric_lg, metric_namespace=third_party_error_metric.namespace, metric_name=third_party_error_metric.metric_name, default_value=0, metric_value="1") # Create Third Party Error Alarm third_party_error_alarm = _cloudwatch.Alarm( self, "thirdPartyApiErrorAlarm", alarm_description= "Alert if 3rd party API has more than 2 errors in the last two minutes", alarm_name="third-party-api-alarm", metric=third_party_error_metric, comparison_operator=_cloudwatch.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD, threshold=2, evaluation_periods=2, datapoints_to_alarm=1, period=core.Duration.minutes(1), treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING)
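# --- Illustration only (not part of the stack above) -------------------------
# FilterPattern.boolean_value("$.third_party_api_error", True) matches
# structured (JSON) log events whose "third_party_api_error" field is true,
# and each match adds 1 to the custom metric. A sketch of what the generator
# Lambda might log to trigger it (the real payload of
# konstone_custom_metric_log_generator.py is an assumption here):
import json

def log_third_party_error(error_message: str) -> None:
    # CloudWatch Logs only applies the JSON filter when the whole line is valid JSON.
    print(json.dumps({"third_party_api_error": True, "error": error_message}))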
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # SNS topic for monitoring
    snstopic_monitor01 = aws_sns.Topic(self, "MonitorSnsTopic",
                                       display_name="monitor webapp",
                                       topic_name="EC2Monitor")
    # Add subscriptions to the SNS topic
    snstopic_monitor01.add_subscription(
        aws_sns_subc.EmailSubscription("*****@*****.**"))

    ## vpc block ##
    prod_config = self.node.try_get_context('envs')['prod']
    custom_vpc = aws_ec2.Vpc(
        self,
        "CustomVpcID",
        cidr=prod_config['vpc_config']['vpc_cidr'],
        max_azs=2,
        nat_gateways=1,
        subnet_configuration=[
            aws_ec2.SubnetConfiguration(
                name="PublicSubnet",
                cidr_mask=prod_config['vpc_config']['cidr_mask'],
                subnet_type=aws_ec2.SubnetType.PUBLIC),
            aws_ec2.SubnetConfiguration(
                name="PrivateSubnet",
                cidr_mask=prod_config['vpc_config']['cidr_mask'],
                subnet_type=aws_ec2.SubnetType.PRIVATE),
            aws_ec2.SubnetConfiguration(
                name="DbSubnet",
                cidr_mask=prod_config['vpc_config']['cidr_mask'],
                subnet_type=aws_ec2.SubnetType.ISOLATED)
        ])
    ## end vpc block ##

    ## ec2 block ##
    # Import user-data script
    with open("userdata_scripts/setup.sh", mode="r") as file:
        user_data = file.read()

    # Get the latest AMI for the current region
    aws_linux_ami = aws_ec2.MachineImage.latest_amazon_linux(
        generation=aws_ec2.AmazonLinuxGeneration.AMAZON_LINUX_2,
        edition=aws_ec2.AmazonLinuxEdition.STANDARD,
        storage=aws_ec2.AmazonLinuxStorage.EBS,
        virtualization=aws_ec2.AmazonLinuxVirt.HVM)

    # EC2 instance
    test_server = aws_ec2.Instance(
        self,
        "ec2id",
        instance_type=aws_ec2.InstanceType(instance_type_identifier="t2.micro"),
        instance_name="TestServer01",
        machine_image=aws_linux_ami,
        vpc=custom_vpc,
        vpc_subnets=aws_ec2.SubnetSelection(subnet_type=aws_ec2.SubnetType.PUBLIC),
        key_name="SAA-C01",
        user_data=aws_ec2.UserData.custom(user_data))

    # Allow web traffic
    test_server.connections.allow_from_any_ipv4(
        aws_ec2.Port.tcp(80), description="allow web traffic")

    # Add permissions to the instance profile
    test_server.role.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    test_server.role.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonS3ReadOnlyAccess"))
    ## end ec2 block ##

    ## lambda block ##
    # Import function code
    try:
        with open("serverless_stack/functions/function.py", mode="r") as file:
            function_body = file.read()
    except OSError:
        print('Unable to read Lambda function code')

    # Function
    function_01 = aws_lambda.Function(
        self,
        "lambdafunction01",
        function_name="LambdaTestCDK",
        runtime=aws_lambda.Runtime.PYTHON_3_6,
        handler="index.lambda_handler",
        code=aws_lambda.InlineCode(function_body),
        timeout=core.Duration.seconds(5),
        reserved_concurrent_executions=1,
        environment={
            'LOG_LEVEL': 'INFO',
            'AUTOMATION': 'SKON'
        })
    ## end lambda block ##

    ## monitor block ##
    # EC2 metric for CPU usage (the EC2 dimension name is "InstanceId")
    ec2_metric_01 = aws_cw.Metric(
        namespace="AWS/EC2",
        metric_name="CPUUtilization",
        dimensions={"InstanceId": test_server.instance_id},
        period=core.Duration.minutes(5))

    # Under-utilization alarm for EC2
    low_cpu_ec2 = aws_cw.Alarm(
        self,
        "lowcpualarm",
        alarm_description="low cpu utilization",
        alarm_name="Low-CPU-Alarm",
        actions_enabled=True,
        metric=ec2_metric_01,
        threshold=10,
        comparison_operator=aws_cw.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=1,
        datapoints_to_alarm=1,
        period=core.Duration.minutes(5),
        treat_missing_data=aws_cw.TreatMissingData.NOT_BREACHING)

    # SNS action on the EC2 alarm
    low_cpu_ec2.add_alarm_action(aws_cw_ats.SnsAction(snstopic_monitor01))

    # Lambda alarm
    function_01_alarm = aws_cw.Alarm(self, "LambdaAlarm",
                                     metric=function_01.metric_errors(),
                                     threshold=2,
                                     evaluation_periods=1,
                                     datapoints_to_alarm=1,
                                     period=core.Duration.minutes(5))
    # SNS action on the Lambda alarm
    function_01_alarm.add_alarm_action(
        aws_cw_ats.SnsAction(snstopic_monitor01))
def __init__(self, scope: core.Construct, construct_id: str, stack_log_level: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) # Add your stack resources below): # Maximum number of times, a message can be tried to be process from the queue before deleting self.max_msg_receive_cnt = 5 self.max_msg_receive_cnt_at_retry = 3 # Define Dead Letter Queue self.reliable_q_dlq = _sqs.Queue( self, "DeadLetterQueue", delivery_delay=core.Duration.seconds(100), queue_name=f"reliable_q_dlq", retention_period=core.Duration.days(2), visibility_timeout=core.Duration.seconds(10), receive_message_wait_time=core.Duration.seconds(10)) # Define Retry Queue for Reliable Q self.reliable_q_retry_1 = _sqs.Queue( self, "reliableQueueRetry1", delivery_delay=core.Duration.seconds(10), queue_name=f"reliable_q_retry_1", retention_period=core.Duration.days(2), visibility_timeout=core.Duration.seconds(10), receive_message_wait_time=core.Duration.seconds(10), dead_letter_queue=_sqs.DeadLetterQueue( max_receive_count=self.max_msg_receive_cnt_at_retry, queue=self.reliable_q_dlq)) # Primary Source Queue self.reliable_q = _sqs.Queue( self, "reliableQueue", delivery_delay=core.Duration.seconds(5), queue_name=f"reliable_q", retention_period=core.Duration.days(2), visibility_timeout=core.Duration.seconds(10), receive_message_wait_time=core.Duration.seconds(10), dead_letter_queue=_sqs.DeadLetterQueue( max_receive_count=self.max_msg_receive_cnt, queue=self.reliable_q_retry_1)) ######################################## ####### ####### ####### SQS Data Producer ####### ####### ####### ######################################## # Read Lambda Code try: with open( "stacks/back_end/serverless_sqs_producer_stack/lambda_src/sqs_data_producer.py", encoding="utf-8", mode="r") as f: data_producer_fn_code = f.read() except OSError: print("Unable to read Lambda Function Code") raise data_producer_fn = _lambda.Function( self, "sqsDataProducerFn", function_name=f"data_producer_fn_{construct_id}", description="Produce data events and push to SQS", runtime=_lambda.Runtime.PYTHON_3_7, code=_lambda.InlineCode(data_producer_fn_code), handler="index.lambda_handler", timeout=core.Duration.seconds(5), reserved_concurrent_executions=1, environment={ "LOG_LEVEL": f"{stack_log_level}", "APP_ENV": "Production", "RELIABLE_QUEUE_NAME": f"{self.reliable_q.queue_name}", "TRIGGER_RANDOM_FAILURES": "True" }) # Grant our Lambda Producer privileges to write to SQS self.reliable_q.grant_send_messages(data_producer_fn) # Create Custom Loggroup for Producer data_producer_lg = _logs.LogGroup( self, "dataProducerLogGroup", log_group_name=f"/aws/lambda/{data_producer_fn.function_name}", removal_policy=core.RemovalPolicy.DESTROY, retention=_logs.RetentionDays.ONE_DAY) # Restrict Produce Lambda to be invoked only from the stack owner account data_producer_fn.add_permission( "restrictLambdaInvocationToFhInOwnAccount", principal=_iam.AccountRootPrincipal(), action="lambda:InvokeFunction", source_account=core.Aws.ACCOUNT_ID) # Monitoring for Queue reliable_q_alarm = _cw.Alarm( self, "reliableQueueAlarm", metric=self.reliable_q.metric( "ApproximateNumberOfMessagesVisible"), statistic="sum", threshold=10, period=core.Duration.minutes(5), evaluation_periods=1, comparison_operator=_cw.ComparisonOperator.GREATER_THAN_THRESHOLD) ########################################### ################# OUTPUTS ################# ########################################### output_0 = core.CfnOutput( self, "AutomationFrom", value=f"{GlobalArgs.SOURCE_INFO}", description= "To 
know more about this automation stack, check out our github page." ) output_1 = core.CfnOutput( self, "SqsDataProducer", value= f"https://console.aws.amazon.com/lambda/home?region={core.Aws.REGION}#/functions/{data_producer_fn.function_name}", description="Produce data events and push to SQS Queue.") output_2 = core.CfnOutput( self, "ReliableQueue", value= f"https://console.aws.amazon.com/sqs/v2/home?region={core.Aws.REGION}#/queues", description="Reliable Queue")
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # Import function code
    try:
        with open("serverless_stack/functions/metric_logs_generator.py", mode="r") as file:
            function_body = file.read()
    except OSError:
        print('Unable to read Lambda function code')

    # Function
    function_01 = aws_lambda.Function(
        self,
        "lambdafunction01",
        function_name="LambdaTestCustomMetric",
        runtime=aws_lambda.Runtime.PYTHON_3_6,
        handler="index.lambda_handler",
        code=aws_lambda.InlineCode(function_body),
        timeout=core.Duration.seconds(5),
        reserved_concurrent_executions=1,
        environment={
            'LOG_LEVEL': 'INFO',
            'PERCENTAGE_ERRORS': '75'
        })

    # Attach a CloudWatch log group
    custom_metric_log_group01 = aws_logs.LogGroup(
        self,
        "cloudwatchlog01",
        log_group_name=f"/aws/lambda/{function_01.function_name}",
        removal_policy=core.RemovalPolicy.DESTROY,
        retention=aws_logs.RetentionDays.ONE_DAY)

    # Custom metric namespace
    custom_metric_namespace01 = aws_cw.Metric(
        namespace="custom-error-metric",
        metric_name="custom-error-metric",
        label="Amount of Custom API errors",
        period=core.Duration.minutes(1),
        statistic="Sum")

    # Custom metric log filter
    custom_metric_filter01 = aws_logs.MetricFilter(
        self,
        "customMetricFilter",
        filter_pattern=aws_logs.FilterPattern.boolean_value(
            "$.custom_api_error", True),
        log_group=custom_metric_log_group01,
        metric_namespace=custom_metric_namespace01.namespace,
        metric_name=custom_metric_namespace01.metric_name,
        default_value=0,
        metric_value="1")

    # Create custom alarm
    custom_metric_alarm01 = aws_cw.Alarm(
        self,
        "customMetricAlarm",
        alarm_description="Custom API errors",
        alarm_name="Custom-API-alarm",
        metric=custom_metric_namespace01,
        comparison_operator=aws_cw.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        threshold=2,
        evaluation_periods=2,
        datapoints_to_alarm=1,
        period=core.Duration.minutes(1),
        treat_missing_data=aws_cw.TreatMissingData.NOT_BREACHING)
def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None:
    super().__init__(scope, _id, **kwargs)

    ddb_file_list = ddb.Table(self, "ddb",
                              partition_key=ddb.Attribute(
                                  name="Key", type=ddb.AttributeType.STRING),
                              billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

    sqs_queue_DLQ = sqs.Queue(self, "sqs_DLQ",
                              visibility_timeout=core.Duration.minutes(15),
                              retention_period=core.Duration.days(14))
    sqs_queue = sqs.Queue(self, "sqs_queue",
                          visibility_timeout=core.Duration.minutes(15),
                          retention_period=core.Duration.days(14),
                          dead_letter_queue=sqs.DeadLetterQueue(
                              max_receive_count=100, queue=sqs_queue_DLQ))

    checkip = api.RestApi(
        self,
        "lambda-checkip-api",
        cloud_watch_role=True,
        deploy=True,
        description="For Lambda get IP address",
        default_integration=api.MockIntegration(
            integration_responses=[
                api.IntegrationResponse(
                    status_code="200",
                    response_templates={
                        "application/json": "$context.identity.sourceIp"
                    })
            ],
            request_templates={"application/json": '{"statusCode": 200}'}),
        endpoint_types=[api.EndpointType.REGIONAL])
    checkip.root.add_method("GET", method_responses=[
        api.MethodResponse(
            status_code="200",
            response_models={"application/json": api.Model.EMPTY_MODEL})
    ])

    handler = lam.Function(self, "lambdaFunction",
                           code=lam.Code.asset("./lambda"),
                           handler="lambda_function.lambda_handler",
                           runtime=lam.Runtime.PYTHON_3_8,
                           memory_size=1024,
                           timeout=core.Duration.minutes(15),
                           tracing=lam.Tracing.ACTIVE,
                           environment={
                               'table_queue_name': ddb_file_list.table_name,
                               'Des_bucket_default': Des_bucket_default,
                               'Des_prefix_default': Des_prefix_default,
                               'StorageClass': StorageClass,
                               'aws_access_key_id': aws_access_key_id,
                               'aws_secret_access_key': aws_secret_access_key,
                               'aws_access_key_region': aws_access_key_region,
                               'checkip_url': checkip.url
                           })

    ddb_file_list.grant_read_write_data(handler)
    handler.add_event_source(SqsEventSource(sqs_queue))

    s3bucket = s3.Bucket(self, "s3bucket")
    s3bucket.grant_read(handler)
    s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED,
                                    s3n.SqsDestination(sqs_queue))

    # You can import an existing bucket and grant access to the Lambda:
    # exist_s3bucket = s3.Bucket.from_bucket_name(self, "import_bucket",
    #                                             bucket_name="you_bucket_name")
    # exist_s3bucket.grant_read(handler)
    # But you have to add SQS as the imported bucket's event notification manually;
    # that is not supported by CloudFormation.
    # A workaround is to add on_cloud_trail_event for the bucket, but that triggers CloudTrail first.
    # Because the bucket is imported, the bucket event trigger to SQS and the SQS
    # permission that allows the bucket to send to it must be created manually.

    core.CfnOutput(self, "DynamoDB_Table", value=ddb_file_list.table_name)
    core.CfnOutput(self, "SQS_Job_Queue", value=sqs_queue.queue_name)
    core.CfnOutput(self, "SQS_Job_Queue_DLQ", value=sqs_queue_DLQ.queue_name)
    core.CfnOutput(self, "Worker_Lambda_Function", value=handler.function_name)
    core.CfnOutput(self, "New_S3_Bucket", value=s3bucket.bucket_name)

    # Create Lambda log metric filters to build network traffic metrics
    handler.log_group.add_metric_filter(
        "Complete-bytes",
        metric_name="Complete-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Complete", bytes, key]'))
    handler.log_group.add_metric_filter(
        "Uploading-bytes",
        metric_name="Uploading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Uploading", bytes, key]'))
    handler.log_group.add_metric_filter(
        "Downloading-bytes",
        metric_name="Downloading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
filter_pattern=logs.FilterPattern.literal( '[info, date, sn, p="--->Downloading", bytes, key]')) lambda_metric_Complete = cw.Metric(namespace="s3_migrate", metric_name="Complete-bytes", statistic="Sum", period=core.Duration.minutes(1)) lambda_metric_Upload = cw.Metric(namespace="s3_migrate", metric_name="Uploading-bytes", statistic="Sum", period=core.Duration.minutes(1)) lambda_metric_Download = cw.Metric(namespace="s3_migrate", metric_name="Downloading-bytes", statistic="Sum", period=core.Duration.minutes(1)) handler.log_group.add_metric_filter( "ERROR", metric_name="ERROR-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"ERROR"')) handler.log_group.add_metric_filter( "WARNING", metric_name="WARNING-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"WARNING"')) log_metric_ERROR = cw.Metric(namespace="s3_migrate", metric_name="ERROR-Logs", statistic="Sum", period=core.Duration.minutes(1)) log_metric_WARNING = cw.Metric(namespace="s3_migrate", metric_name="WARNING-Logs", statistic="Sum", period=core.Duration.minutes(1)) # Dashboard to monitor SQS and Lambda board = cw.Dashboard(self, "s3_migrate", dashboard_name="s3_migrate_serverless") board.add_widgets( cw.GraphWidget(title="Lambda-NETWORK", left=[ lambda_metric_Download, lambda_metric_Upload, lambda_metric_Complete ]), # TODO: here monitor all lambda concurrency not just the working one. Limitation from CDK # Lambda now supports monitor single lambda concurrency, will change this after CDK support cw.GraphWidget(title="Lambda-all-concurrent", left=[ handler.metric_all_concurrent_executions( period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="Lambda-invocations/errors/throttles", left=[ handler.metric_invocations( period=core.Duration.minutes(1)), handler.metric_errors(period=core.Duration.minutes(1)), handler.metric_throttles(period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="Lambda-duration", left=[ handler.metric_duration(period=core.Duration.minutes(1)) ]), ) board.add_widgets( cw.GraphWidget( title="SQS-Jobs", left=[ sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="SQS-DeadLetterQueue", left=[ sqs_queue_DLQ. metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.GraphWidget(title="ERROR/WARNING Logs", left=[log_metric_ERROR], right=[log_metric_WARNING]), cw.SingleValueWidget( title="Running/Waiting and Dead Jobs", metrics=[ sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. 
metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)) ], height=6)) # Alarm for queue - DLQ alarm_DLQ = cw.Alarm( self, "SQS_DLQ", alarm_name="s3-migration-serverless-SQS Dead Letter Queue", metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible( ), threshold=0, comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD, evaluation_periods=1, datapoints_to_alarm=1) alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter") alarm_topic.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic)) # Alarm for queue empty, i.e. no visible message and no in-visible message # metric_all_message = cw.MathExpression( # expression="a + b", # label="empty_queue_expression", # using_metrics={ # "a": sqs_queue.metric_approximate_number_of_messages_visible(), # "b": sqs_queue.metric_approximate_number_of_messages_not_visible() # } # ) # alarm_0 = cw.Alarm(self, "SQSempty", # alarm_name="SQS queue empty-Serverless", # metric=metric_all_message, # threshold=0, # comparison_operator=cw.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD, # evaluation_periods=3, # datapoints_to_alarm=3, # treat_missing_data=cw.TreatMissingData.IGNORE # ) # alarm_topic = sns.Topic(self, "SQS queue empty-Serverless") # alarm_topic.add_subscription(subscription=sub.EmailSubscription(alarm_email)) # alarm_0.add_alarm_action(action.SnsAction(alarm_topic)) # core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for Serverless: " + alarm_email) core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_serverless") core.CfnOutput(self, "API-checkip", value=checkip.url)
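# --- Illustration only (not part of the stack above) -------------------------
# As noted in the comments above, CloudFormation cannot attach an S3 -> SQS
# event notification to a bucket that is merely imported with
# s3.Bucket.from_bucket_name(); both the notification and the queue permission
# have to be configured out of band. A minimal boto3 sketch of that manual step,
# assuming the bucket and queue already exist (names/ARNs are placeholders):
import json
import boto3

def allow_existing_bucket_to_queue(bucket_name: str, queue_url: str, queue_arn: str) -> None:
    sqs_client = boto3.client("sqs")
    s3_client = boto3.client("s3")
    # 1. Let the bucket publish to the queue.
    policy = {
        "Version": "2012-10-17",
        "Statement": [{
            "Effect": "Allow",
            "Principal": {"Service": "s3.amazonaws.com"},
            "Action": "sqs:SendMessage",
            "Resource": queue_arn,
            "Condition": {"ArnLike": {"aws:SourceArn": f"arn:aws:s3:::{bucket_name}"}},
        }],
    }
    sqs_client.set_queue_attributes(QueueUrl=queue_url,
                                    Attributes={"Policy": json.dumps(policy)})
    # 2. Point the bucket's ObjectCreated events at the queue.
    s3_client.put_bucket_notification_configuration(
        Bucket=bucket_name,
        NotificationConfiguration={
            "QueueConfigurations": [{"QueueArn": queue_arn,
                                     "Events": ["s3:ObjectCreated:*"]}]
        })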
def __init__(self, scope: core.Construct, id: str, group_name: str, minute_duration: int, **kwargs) -> None: super().__init__(scope, id, **kwargs) # TODO: Setup alerting of failure to an SNS # TODO: Failure is not the same as a student not in a group # TODO: Streamline input data so that lambda's only get the info they really need # TODO: Comment # TODO: Need to separate unexpected errors from regular errors # Setting up monitoring schedule_stop = lambda_.Function( self, id="ScheduleStopLambda", runtime=lambda_.Runtime.PYTHON_3_7, code=lambda_.Code.from_inline( open("./resources/schedule-termination.py", 'r').read()), handler="index.handler", log_retention=logs.RetentionDays.ONE_DAY, environment=dict(GROUP_NAME=group_name), timeout=core.Duration.seconds(30)) schedule_stop.add_to_role_policy( statement=iam.PolicyStatement(actions=[ "ec2:Describe*", "iam:ListGroupsForUser", "iam:ListUsers" ], effect=iam.Effect.ALLOW, resources=["*"])) terminate_ec2 = lambda_.Function( self, id="TerminateEC2", runtime=lambda_.Runtime.PYTHON_3_7, code=lambda_.Code.from_inline( open("./resources/terminate-ec2.py", 'r').read()), handler="index.handler", log_retention=logs.RetentionDays.ONE_DAY, timeout=core.Duration.seconds(30)) terminate_ec2.add_to_role_policy( statement=iam.PolicyStatement(actions=[ "ec2:DescribeInstance*", "ec2:TerminateInstances", ], effect=iam.Effect.ALLOW, resources=["*"])) # The lambda object that will see if we should schedule. schedule_stop_task = tasks.LambdaInvoke( self, id='schedule stop', lambda_function=schedule_stop, input_path="$.detail.userIdentity", result_path="$.Payload", ) # TODO: Need to change this based on the configuration info above # Wait state to try and delete # wait_x = sfn.Wait(self, 'Wait x minutes', time=sfn.WaitTime.seconds_path("10")) wait_x = sfn.Wait(self, id='Wait x minutes', time=sfn.WaitTime.duration( core.Duration.minutes(minute_duration))) job_failed = sfn.Fail(self, id="Failed Job", cause="Error in the input", error="Error") job_finished = sfn.Succeed(self, id="Job Finished") choice = sfn.Choice(self, 'Can I delete') choice.when(sfn.Condition.boolean_equals('$.Payload.Payload', False), job_finished) choice.otherwise(wait_x) terminate_ec2_task = tasks.LambdaInvoke( self, 'terminate', lambda_function=terminate_ec2, input_path="$.detail.responseElements.instancesSet") wait_x.next(terminate_ec2_task).next(job_finished) state_definition = schedule_stop_task \ .next(choice) terminate_machine = sfn.StateMachine(self, id="State Machine", definition=state_definition) cloudwatch.Alarm(self, "EC2ScheduleAlarm", metric=terminate_machine.metric_failed(), threshold=1, evaluation_periods=1) # TODO Build Rule that monitors for EC2 creation # Any new creation, the EC2 will have to be destroyed. Including # other things? create_event = events.Rule( self, id='detect-ec2-start', description="Detects if an EC2 is created", enabled=True, event_pattern=events.EventPattern( detail_type=["AWS API Call via CloudTrail"], source=["aws.ec2"], detail={ "eventName": ["RunInstances"], "eventSource": ["ec2.amazonaws.com"] }), targets=[targets.SfnStateMachine(terminate_machine)])
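# --- Illustration only (not part of the stack above) -------------------------
# The LambdaInvoke task writes its result to "$.Payload", and the Lambda's own
# return value sits under "Payload" inside that result, which is why the Choice
# state tests "$.Payload.Payload". The schedule-stop Lambda is therefore
# expected to return a plain boolean: False ends the execution (nothing to
# terminate), anything else sends the input on to the wait-then-terminate
# branch. A sketch of that contract; the real handler in
# ./resources/schedule-termination.py is an assumption here.
def handler(event, context):
    # `event` is the CloudTrail userIdentity block (see input_path above).
    user_name = event.get("userName", "")
    return user_is_in_group(user_name)  # True -> schedule termination, False -> finish

def user_is_in_group(user_name: str) -> bool:
    # Placeholder for the IAM group membership check (iam:ListGroupsForUser).
    return bool(user_name)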
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) # Get config value for alert email email = self.node.try_get_context("email") if email == 'changeme@localhost': exit( 'ERROR: Change the email in cdk.json or pass it with -c email=changeme@localhost' ) # Create SNS for alarms to be sent to alarm_topic = sns.Topic(self, "backup_alarm", display_name="backup_alarm") # Subscribe my email so the alarms go to me alarm_topic.add_subscription(subscriptions.EmailSubscription(email)) # Create VPC to run everything in. We make this public just because we don't # want to spend $30/mo on a NAT gateway. vpc = ec2.Vpc( self, "VPC", nat_gateways=0, subnet_configuration=[ ec2.SubnetConfiguration(name="public", subnet_type=ec2.SubnetType.PUBLIC) ], ) ecs_sg = ec2.SecurityGroup(self, "ecs_sg", vpc=vpc) efs_sg = ec2.SecurityGroup(self, "efs_sg", vpc=vpc) efs_sg.add_ingress_rule( peer=ecs_sg, connection=ec2.Port.tcp(2049), description="Allow backup runner access", ) # Open this to the VPC efs_sg.add_ingress_rule( peer=ec2.Peer.ipv4("10.0.0.0/8"), connection=ec2.Port.tcp(2049), description="Allow backup runner access", ) # Define the EFS fileSystem = efs.FileSystem( self, "MyEfsFileSystem", vpc=vpc, encrypted=True, lifecycle_policy=efs.LifecyclePolicy.AFTER_7_DAYS, performance_mode=efs.PerformanceMode.GENERAL_PURPOSE, throughput_mode=efs.ThroughputMode.BURSTING, security_group=efs_sg, ) # Define the ECS task cluster = ecs.Cluster(self, "Cluster", vpc=vpc) taskDefinition = ecs.FargateTaskDefinition( self, "taskDefinition", volumes=[ ecs.Volume( name="efsvolume", efs_volume_configuration=ecs.EfsVolumeConfiguration( file_system_id=fileSystem.file_system_id, root_directory="/", transit_encryption="ENABLED", ), ) ], memory_limit_mib=8192, cpu=2048, ) log_driver = ecs.AwsLogDriver( stream_prefix="backup_runner", log_retention=logs.RetentionDays.TWO_WEEKS, ) taskDefinition.add_container( "backup-runner", image=ecs.ContainerImage.from_asset("./resources/backup_runner"), memory_limit_mib=8192, cpu=2048, logging=log_driver, ) # The previous method to add the container doesn't let us specify the mount point for the EFS, # so we have to do it here, and referencing the container that was just added. taskDefinition.default_container.add_mount_points( ecs.MountPoint(container_path="/mnt/efs", read_only=False, source_volume="efsvolume")) # Create rule to trigger this be run every 24 hours events.Rule( self, "scheduled_run", rule_name="backup_runner", # Run at 2am EST (6am UTC) every night schedule=events.Schedule.expression("cron(0 0 * * ? *)"), description="Starts the backup runner task every night", targets=[ targets.EcsTask( cluster=cluster, task_definition=taskDefinition, subnet_selection=ec2.SubnetSelection( subnet_type=ec2.SubnetType.PUBLIC), platform_version=ecs.FargatePlatformVersion. 
VERSION1_4, # Required to use EFS # Because "Latest" does not yet support EFS security_groups=[ecs_sg], ) ], ) # Create notification topic for backups backup_topic = sns.Topic(self, "backup_topic", display_name="Backup status") # Create AWS Backup vault = backup.BackupVault( self, "Vault", access_policy=iam.PolicyDocument(statements=[ iam.PolicyStatement( effect=iam.Effect.DENY, actions=[ "backup:DeleteBackupVault", "backup:DeleteRecoveryPoint", "backup:UpdateRecoveryPointLifecycle", # "backup:PutBackupVaultAccessPolicy", # This results in "Failed putting policy for Backup vault backuprunnerVaultXXX as it will lock down from further policy changes" "backup:DeleteBackupVaultAccessPolicy", "backup:DeleteBackupVaultNotifications", # "backup:PutBackupVaultNotifications", # This causes oher part of this app to fail. ], resources=["*"], principals=[iam.AnyPrincipal()], ) ]), notification_topic=alarm_topic, notification_events=[ # Monitor for some failures or access to the backups backup.BackupVaultEvents.BACKUP_JOB_EXPIRED, backup.BackupVaultEvents.BACKUP_JOB_FAILED, backup.BackupVaultEvents.COPY_JOB_FAILED, backup.BackupVaultEvents.COPY_JOB_FAILED, backup.BackupVaultEvents.COPY_JOB_STARTED, backup.BackupVaultEvents.RESTORE_JOB_COMPLETED, backup.BackupVaultEvents.RESTORE_JOB_FAILED, backup.BackupVaultEvents.RESTORE_JOB_STARTED, backup.BackupVaultEvents.RESTORE_JOB_SUCCESSFUL, ], ) plan = backup.BackupPlan.daily35_day_retention(self, "backup") plan.add_selection( "Selection", resources=[backup.BackupResource.from_efs_file_system(fileSystem)], ) # # Create metric filter for errors in the CloudWatch Logs from the ECS # METRIC_NAME = "log_errors" METRIC_NAMESPACE = "backup_runner" metric = cloudwatch.Metric(namespace=METRIC_NAMESPACE, metric_name=METRIC_NAME) error_metric = logs.MetricFilter( self, "MetricFilterId", metric_name=METRIC_NAME, metric_namespace=METRIC_NAMESPACE, log_group=log_driver.log_group, filter_pattern=logs.FilterPattern.any_term("ERROR"), metric_value="1", ) error_alarm = cloudwatch.Alarm( self, "AlarmId", metric=metric, evaluation_periods=1, actions_enabled=True, alarm_name="backuper_runner_alarm", alarm_description="Errors in backup runner", comparison_operator=cloudwatch.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD, treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING, period=core.Duration.hours(1), threshold=1, statistic="sum", ) # Connect the alarm to the SNS error_alarm.add_alarm_action(cloudwatch_actions.SnsAction(alarm_topic)) # The above doesn't give it privileges, so add them to the alarm topic resource policy. alarm_topic.add_to_resource_policy( iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=["sns:Publish"], resources=[alarm_topic.topic_arn], principals=[iam.ServicePrincipal("cloudwatch.amazonaws.com")], ))
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Read parameters from SSM
    vpcid = _ssm.StringParameter.value_from_lookup(self, "/cdk/ec2/vpc_id")
    instance_type = _ssm.StringParameter.value_from_lookup(
        self, "/cdk/ec2/instance_type")
    key_name = _ssm.StringParameter.value_from_lookup(
        self, "/cdk/ec2/key_name")
    allow_ssh_web_location = _ssm.StringParameter.value_from_lookup(
        self, "/cdk/ec2/sshLocation")

    # Get the existing VPC
    my_vpc = _ec2.Vpc.from_lookup(self, "VPC", vpc_id=vpcid)

    # Prepare security group configuration - create security group
    my_security_group = _ec2.SecurityGroup(
        self,
        "my_security_group",
        vpc=my_vpc,
        security_group_name="myfirstcdk_secgroup")

    # Add ingress rules to the security group above
    # BUG 1: Invalid IPv4 CIDR "dummy-value-for-..." - value_from_lookup() returns a
    # placeholder on the first synth, before the context lookup is resolved (see note below)
    add_securitygroup_ingress_for_22 = my_security_group.add_ingress_rule(
        peer=_ec2.Peer.ipv4(allow_ssh_web_location),
        connection=_ec2.Port.tcp(22))
    add_securitygroup_ingress_for_80 = my_security_group.add_ingress_rule(
        peer=_ec2.Peer.ipv4(allow_ssh_web_location),
        connection=_ec2.Port.tcp(80))

    # Create an IAM role with the SSM managed policy
    managed_policies = [
        _iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore")
    ]
    my_session_mgmt_role = _iam.Role(
        self,
        id="my_session_mgmt_role",
        assumed_by=_iam.ServicePrincipal(service="ec2.amazonaws.com"),
        description="SSM session management role",
        managed_policies=managed_policies,
        role_name="SessionManagerRole")

    # NOTE: `user_data` was referenced but never defined in the original snippet.
    # It is assumed here to come from a bootstrap script (path borrowed from the
    # earlier stack); adjust as needed.
    try:
        with open("userdata_scripts/setup.sh", mode="r") as f:
            user_data = f.read()
    except OSError:
        user_data = ""
        print("Unable to read user data script")

    # Create an EC2 instance with the above configuration
    ec2_instance = _ec2.Instance(
        self,
        "my_ec2_instance",
        instance_type=_ec2.InstanceType(instance_type_identifier=instance_type),
        machine_image=_ec2.MachineImage.latest_amazon_linux(),
        vpc=my_vpc,
        instance_name="MyInstance",
        key_name=key_name,
        security_group=my_security_group,
        role=my_session_mgmt_role,
        user_data=_ec2.UserData.custom(user_data))

    # Create a CloudWatch Alarm for EC2 instance CPU utilization
    metric = _cw.Metric(metric_name="CPUUtilization",
                        namespace="AWS/EC2",
                        dimensions={"InstanceId": ec2_instance.instance_id},
                        statistic="Average")
    cpu_alarm = _cw.Alarm(
        self,
        "cpu_alarm",
        alarm_name="CPUUtilizationOver15",
        alarm_description="CPU Utilization Over 15 Percent",
        evaluation_periods=3,
        threshold=15,
        period=core.Duration.seconds(60),
        metric=metric,
        datapoints_to_alarm=2,
        comparison_operator=_cw.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD)

    # CFN outputs
    ec2_instance_id = core.CfnOutput(
        self, "instance_id", value=ec2_instance.instance_id,
        description="InstanceId of the newly created EC2 instance")
    availability_zone = core.CfnOutput(
        self, "availability zone", value=ec2_instance.instance_availability_zone,
        description="Availability Zone of the newly created EC2")
    public_dns_name = core.CfnOutput(
        self, "public_dns_name", value=ec2_instance.instance_public_dns_name,
        description="Public DNSName of the newly created EC2 instance")
    public_ip = core.CfnOutput(
        self, "public_ip", value=ec2_instance.instance_public_ip,
        description="Public IP address of the newly created EC2")
    cloudwatch_alarm = core.CfnOutput(self, "Cloudwatch Alarm",
                                      value=cpu_alarm.alarm_arn,
                                      description="CPU alarm ARN")
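# --- Illustration only (not part of the stack above) -------------------------
# Regarding "BUG 1" above: _ssm.StringParameter.value_from_lookup() returns a
# placeholder string ("dummy-value-for-<parameter-name>") on the first synth,
# before the context lookup has been resolved, and _ec2.Peer.ipv4() rejects it
# because it is not a valid CIDR. One common workaround (a sketch, assuming the
# placeholder prefix shown in the error message) is to substitute a
# syntactically valid CIDR until the real value is cached in cdk.context.json:
def safe_cidr_from_lookup(looked_up_value: str, fallback: str = "0.0.0.0/0") -> str:
    # During the first synth the lookup is still the dummy token; any valid
    # CIDR keeps Peer.ipv4() happy until the real parameter value is resolved.
    if looked_up_value.startswith("dummy-value-for-"):
        return fallback
    return looked_up_value

# e.g. peer=_ec2.Peer.ipv4(safe_cidr_from_lookup(allow_ssh_web_location))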
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) # Create SNS Topic for Operations Team): konstone_ops_team = _sns.Topic( self, "konstoneOpsTeam", display_name="KonStone 24x7 On Watsapp? Support", topic_name="konstoneOpsTeam") # Add Subscription to SNS Topic konstone_ops_team.add_subscription( _subs.EmailSubscription("*****@*****.**")) # Create a MultiAZ VPC): vpc = _ec2.Vpc(self, "konstoneVpcId", cidr="10.0.0.0/24", max_azs=2, nat_gateways=0, subnet_configuration=[ _ec2.SubnetConfiguration( name="public", subnet_type=_ec2.SubnetType.PUBLIC) ]) # Read EC2 BootStrap Script try: with open("bootstrap_scripts/install_httpd.sh", mode="r") as file: user_data = file.read() except OSError: print('Unable to read UserData script') # Get the latest ami amzn_linux_ami = _ec2.MachineImage.latest_amazon_linux( generation=_ec2.AmazonLinuxGeneration.AMAZON_LINUX_2, edition=_ec2.AmazonLinuxEdition.STANDARD, storage=_ec2.AmazonLinuxStorage.EBS, virtualization=_ec2.AmazonLinuxVirt.HVM) # WebServer Instance web_server = _ec2.Instance(self, "WebServer004Id", instance_type=_ec2.InstanceType( instance_type_identifier="t2.micro"), instance_name="WebServer004", machine_image=amzn_linux_ami, vpc=vpc, vpc_subnets=_ec2.SubnetSelection( subnet_type=_ec2.SubnetType.PUBLIC), user_data=_ec2.UserData.custom(user_data)) # Allow Web Traffic to WebServer web_server.connections.allow_from_any_ipv4( _ec2.Port.tcp(80), description="Allow Web Traffic") # Add permission to web server instance profile web_server.role.add_managed_policy( _iam.ManagedPolicy.from_aws_managed_policy_name( "AmazonSSMManagedInstanceCore")) # Read Lambda Code try: with open("serverless_stacks/lambda_src/konstone_processor.py", mode="r") as f: konstone_fn_code = f.read() except OSError: print("Unable to read Lambda Function Code") # Simple Lambda Function to return event konstone_fn = _lambda.Function( self, "konstoneFunction", function_name="konstone_function", runtime=_lambda.Runtime.PYTHON_3_7, handler="index.lambda_handler", code=_lambda.InlineCode(konstone_fn_code), timeout=core.Duration.seconds(3), reserved_concurrent_executions=1, environment={ "LOG_LEVEL": "INFO", "AUTOMATION": "SKON" }) # EC2 Metric for Avg. CPU ec2_metric_for_avg_cpu = _cloudwatch.Metric( namespace="AWS/EC2", metric_name="CPUUtilization", dimensions={"InstanceId": web_server.instance_id}, period=core.Duration.minutes(5)) # Low CPU Alarm for Web Server low_cpu_alarm = _cloudwatch.Alarm( self, "lowCPUAlarm", alarm_description="Alert if CPU is less than 10%", alarm_name="low-cpu-alarm", actions_enabled=True, metric=ec2_metric_for_avg_cpu, threshold=10, comparison_operator=_cloudwatch.ComparisonOperator. LESS_THAN_OR_EQUAL_TO_THRESHOLD, evaluation_periods=1, datapoints_to_alarm=1, period=core.Duration.minutes(5), treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING) # Inform SNS on EC2 Alarm State low_cpu_alarm.add_alarm_action( _cloudwatch_actions.SnsAction(konstone_ops_team)) # Create Lambda Alarm konstone_fn_error_alarm = _cloudwatch.Alarm( self, "konstoneFunctionErrorAlarm", metric=konstone_fn.metric_errors(), threshold=2, evaluation_periods=1, datapoints_to_alarm=1, period=core.Duration.minutes(5)) # Inform SNS on Lambda Alarm State konstone_fn_error_alarm.add_alarm_action( _cloudwatch_actions.SnsAction(konstone_ops_team))
def __init__(self, scope: core.Construct, id: str, config: ContainerPipelineConfiguration, **kwargs) -> None: super().__init__(scope, id, **kwargs) #VPC vpc = ec2.Vpc(self, "TheVPC", cidr="10.0.0.0/16") #IAM roles service_task_def_exe_role = iam.Role( self, "ServiceTaskDefExecutionRole", assumed_by=iam.ServicePrincipal('ecs-tasks.amazonaws.com')) service_task_def_exe_role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( 'service-role/AmazonECSTaskExecutionRolePolicy')) service_task_def_role = iam.Role( self, 'ServiceTaskDefTaskRole', assumed_by=iam.ServicePrincipal('ecs-tasks.amazonaws.com')) code_deploy_role = iam.Role( self, "CodeDeployRole", assumed_by=iam.ServicePrincipal('codedeploy.amazonaws.com')) code_deploy_role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( 'AWSCodeDeployRoleForECS')) # Fargate cluster cluster = ecs.Cluster(scope=self, id="ecs-cluster", cluster_name=config.ProjectName + "-" + config.stage, vpc=vpc) load_balancer = elbv2.ApplicationLoadBalancer(self, "load_balancer", vpc=vpc, internet_facing=True) #Security Group service_sg = ec2.SecurityGroup(self, "service_sg", vpc=vpc) service_sg.connections.allow_from(load_balancer, ec2.Port.tcp(80)) #ECR Repo image_repo = ecr.Repository.from_repository_name( self, "image_repo", repository_name=config.ProjectName) log_group = logs.LogGroup(self, "log_group", log_group_name=config.ProjectName + "-" + config.stage, removal_policy=core.RemovalPolicy.DESTROY, retention=None) #ECS Task Def fargate_task_definition = ecs.FargateTaskDefinition( scope=self, id="fargate_task_definition", cpu=256, memory_limit_mib=512, execution_role=service_task_def_exe_role, task_role=service_task_def_role, family=config.ProjectName + "-" + config.stage) container = fargate_task_definition.add_container( id="fargate_task_container", image=ecs.ContainerImage.from_ecr_repository(repository=image_repo, tag='release')) container.add_port_mappings( ecs.PortMapping(container_port=80, host_port=80, protocol=ecs.Protocol.TCP)) #ECS Fargate Service fargate_service = ecs.FargateService( scope=self, id="fargate_service", security_group=service_sg, cluster=cluster, desired_count=5, deployment_controller=ecs.DeploymentController( type=ecs.DeploymentControllerType.CODE_DEPLOY), task_definition=fargate_task_definition, service_name=config.ProjectName + "-" + config.stage) #Main Env listern_health_check_main = elbv2.HealthCheck( healthy_http_codes='200', interval=core.Duration.seconds(5), healthy_threshold_count=2, unhealthy_threshold_count=3, timeout=core.Duration.seconds(4)) #Test Env listern_health_check_test = elbv2.HealthCheck( healthy_http_codes='200', interval=core.Duration.seconds(5), healthy_threshold_count=2, unhealthy_threshold_count=3, timeout=core.Duration.seconds(4)) listener_main = load_balancer.add_listener( "load_balancer_listener_1", port=80, ) listern_main_targets = listener_main.add_targets( "load_balancer_target_1", port=80, health_check=listern_health_check_main, targets=[fargate_service]) listener_test = load_balancer.add_listener( "load_balancer_listener_2", port=8080, ) listern_test_targets = listener_test.add_targets( "load_balancer_target_2", port=80, health_check=listern_health_check_test, targets=[fargate_service]) #Alarms: monitor 500s on target group aws_cloudwatch.Alarm( self, "TargetGroup5xx", metric=listern_main_targets.metric_http_code_target( elbv2.HttpCodeTarget.TARGET_5XX_COUNT), threshold=1, evaluation_periods=1, period=core.Duration.minutes(1)) aws_cloudwatch.Alarm( self, 
"TargetGroup25xx", metric=listern_test_targets.metric_http_code_target( elbv2.HttpCodeTarget.TARGET_5XX_COUNT), threshold=1, evaluation_periods=1, period=core.Duration.minutes(1)) #Alarms: monitor unhealthy hosts on target group aws_cloudwatch.Alarm( self, "TargetGroupUnhealthyHosts", metric=listern_main_targets.metric('UnHealthyHostCount'), threshold=1, evaluation_periods=1, period=core.Duration.minutes(1)) aws_cloudwatch.Alarm( self, "TargetGroup2UnhealthyHosts", metric=listern_test_targets.metric('UnHealthyHostCount'), threshold=1, evaluation_periods=1, period=core.Duration.minutes(1)) core.CfnOutput(self, "lburl", value=load_balancer.load_balancer_dns_name, export_name="LoadBalancerUrl")
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) yt_api_key = _ssm.StringParameter( self, 'YouTubeApiKey', parameter_name=f"/{id}/YouTubeApiKey", string_value='PLACEHOLDER') yt_list_id = _ssm.StringParameter( self, 'YouTubePlayListId', parameter_name=f"/{id}/YouTubePlayListId", string_value='PLf-O3X2-mxDmn0ikyO7OF8sPr2GDQeZXk') yt_next_page_token = _ssm.StringParameter( self, 'NextPageToken', parameter_name=f"/{id}/NextPageToken", string_value='CAEQAQ') telegram_bot_token = _ssm.StringParameter( self, 'TelegramBotToken', parameter_name=f"/{id}/TelegramBotToken", string_value='PLACEHOLDER') telegram_chat_id = _ssm.StringParameter( self, 'TelegramChatId', parameter_name=f"/{id}/TelegramChatId", string_value='PLACEHOLDER') function = _lambda.Function( self, 'FiqueEmCasaConfPublisher', code=_lambda.Code.asset('src/fique_em_casa_conf/'), handler='lambda_function.lambda_handler', runtime=_lambda.Runtime.PYTHON_3_7, timeout=core.Duration.seconds(30), retry_attempts=0, environment={ 'YT_API_KEY_SSM': yt_api_key.parameter_name, 'YT_LIST_ID_SSM': yt_list_id.parameter_name, 'YT_NEXT_PAGE_TOKEN_SSM': yt_next_page_token.parameter_name, 'TELEGRAM_BOT_TOKEN_SSM': telegram_bot_token.parameter_name, 'TELEGRAM_CHAT_ID_SSM': telegram_chat_id.parameter_name }) yt_api_key.grant_read(function) yt_list_id.grant_read(function) yt_next_page_token.grant_read(function) yt_next_page_token.grant_write(function) telegram_bot_token.grant_read(function) telegram_chat_id.grant_read(function) _events.Rule( self, 'FiqueEmCasaConfSchedule', description= "Sends one video from FiqueEmCasaConf to Telegram every day", enabled=True if 'Prod' in id else False, schedule=_events.Schedule.expression( expression='cron(0 15 * * ? *)'), targets=[_events_targets.LambdaFunction(function)]) error_notifications = _sns.Topic(self, 'ErrorNotifications') fique_em_casa_conf_alarm = _cw.Alarm( self, 'FiqueEmCasaConfErrors', metric=function.metric_errors(), threshold=0, evaluation_periods=1, comparison_operator=_cw.ComparisonOperator.GREATER_THAN_THRESHOLD) fique_em_casa_conf_alarm.add_alarm_action( _cw_actions.SnsAction(error_notifications))
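# --- Hedged sketch (assumption; the real handler lives in src/fique_em_casa_conf/) ---
# The function above receives only the SSM parameter *names* through its
# environment and is granted read (and, for NextPageToken, write) access to the
# parameters themselves. A handler written against that contract could resolve
# and update the values at runtime with boto3 roughly like this:
import os
import boto3

ssm_client = boto3.client("ssm")

def read_parameter(env_key: str) -> str:
    # Resolve the parameter whose name was injected via the environment.
    return ssm_client.get_parameter(
        Name=os.environ[env_key])["Parameter"]["Value"]

def save_next_page_token(token: str) -> None:
    # Persist the playlist cursor for the next scheduled run.
    ssm_client.put_parameter(
        Name=os.environ["YT_NEXT_PAGE_TOKEN_SSM"],
        Value=token,
        Overwrite=True)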
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) CLUSTER_NAME = self.node.try_get_context("cluster_name") NOTIFY_EMAIL = self.node.try_get_context("notify_email") SLACK_WEBHOOK_URL = self.node.try_get_context("webhook_url") if not CLUSTER_NAME or not NOTIFY_EMAIL or not SLACK_WEBHOOK_URL: logger.error( f"Required context variables for {id} were not provided!") else: # Get the log group of our postgres instance log_group = logs.LogGroup.from_log_group_name( self, "InAur01DetectionLogGroup", f"/aws/rds/cluster/{CLUSTER_NAME}/postgresql", ) # Create new metric metric = cloudwatch.Metric( namespace="LogMetrics", metric_name="InAur01DetectionFailedDbLoginAttempts", ) # Apply metric filter # Filter all metrics of failed login attempts in log logs.MetricFilter( self, "InAur01DetectionMetricFilter", log_group=log_group, metric_namespace=metric.namespace, metric_name=metric.metric_name, filter_pattern=logs.FilterPattern.all_terms( "FATAL: password authentication failed for user"), metric_value="1", ) # Create new SNS topic topic = sns.Topic(self, "InAur01DetectionTopic") # Add email subscription topic.add_subscription(subs.EmailSubscription(NOTIFY_EMAIL)) # Create new alarm for metric # Alarm will trigger if there is >= 10 failed login attempts # over a period of 30 seconds. alarm = cloudwatch.Alarm( self, "InAur01DetectionAlarm", metric=metric, threshold=10, evaluation_periods=1, period=core.Duration.seconds(30), datapoints_to_alarm=1, statistic="sum", ) # Add SNS action to alarm alarm.add_alarm_action(cw_actions.SnsAction(topic)) # Create unban lambda lambda_dir_path = os.path.join(os.getcwd(), "ir_cdk_stacks", "in_aur_01") unban_lambda = _lambda.Function( self, "InAur01ResponseUnbanFunction", runtime=_lambda.Runtime.PYTHON_3_8, handler="unban_lambda.lambda_handler", code=_lambda.Code.from_asset(lambda_dir_path), ) # Assign EC2 permissions to lambda unban_lambda.add_to_role_policy( iam.PolicyStatement( actions=["ec2:DeleteNetworkAclEntry"], effect=iam.Effect.ALLOW, resources=["*"], )) # Create stepfunction # Define a second state machine to unban the blacklisted IP after 1 hour wait_step = sfn.Wait( self, "InAur01ResponseStepWait", time=sfn.WaitTime.duration(core.Duration.hours(1)), ) unban_step = sfn.Task( self, "InAur01ResponseStepUnban", task=tasks.RunLambdaTask( unban_lambda, integration_pattern=sfn.ServiceIntegrationPattern. FIRE_AND_FORGET, ), parameters={"Payload.$": "$"}, ) statemachine = sfn.StateMachine( self, "InAur01ResponseUnbanStateMachine", definition=wait_step.next(unban_step), timeout=core.Duration.hours(1.5), ) # Create lambda function lambda_func = _lambda.Function( self, "InAur01ResponseFunction", runtime=_lambda.Runtime.PYTHON_3_8, handler="response_lambda.lambda_handler", code=_lambda.Code.from_asset(lambda_dir_path), environment={ "webhook_url": SLACK_WEBHOOK_URL, "unban_sm_arn": statemachine.state_machine_arn, "cluster_name": CLUSTER_NAME, }, ) # AWS CDK has a bug where it would not add the correct permission # to the lambda for Cloudwatch log subscription to invoke it. # Hence, we need to manually add permission to lambda. 
lambda_func.add_permission( "InAur01ResponseFunctionInvokePermission", principal=iam.ServicePrincipal("logs.amazonaws.com"), action="lambda:InvokeFunction", source_arn=log_group.log_group_arn + ":*", ) # Assign permissions to response lambda lambda_func.add_to_role_policy( iam.PolicyStatement( actions=[ "states:StartExecution", ], effect=iam.Effect.ALLOW, resources=[statemachine.state_machine_arn], )) # Assign RDS Read-only permissions to lambda lambda_func.add_to_role_policy( iam.PolicyStatement( actions=["rds:Describe*"], effect=iam.Effect.ALLOW, resources=["*"], )) # Assign EC2 permissions to lambda lambda_func.add_to_role_policy( iam.PolicyStatement( actions=[ "ec2:Describe*", "ec2:CreateNetworkAclEntry", "ec2:DeleteNetworkAclEntry", ], effect=iam.Effect.ALLOW, resources=["*"], )) # Assign CloudWatch logs permissions to lambda lambda_func.add_to_role_policy( iam.PolicyStatement( actions=[ "cloudwatch:Get*", "cloudwatch:Describe*", "logs:FilterLogEvents", "logs:DescribeMetricFilters", ], effect=iam.Effect.ALLOW, resources=["*"], )) sns_event_source = lambda_event_sources.SnsEventSource(topic) lambda_func.add_event_source(sns_event_source)
def create_all_queues(self) -> None: """ Create all STACK queues, attach subscriptions and alarms """ # General DLQs for lambdas (not API) self.create_queue(id="dead_letter_queue") general_dlq_alarm = cloudwatch.Alarm( self, "DLQAlarm", metric=self.queues_["dead_letter_queue"].metric( "ApproximateNumberOfMessagesVisible"), evaluation_periods=1, threshold=0.0, comparison_operator=ComparisonOperator.GREATER_THAN_THRESHOLD, ) general_dlq_alarm.add_alarm_action( cw_actions.SnsAction(self.topics_["alarm_topic"])) # DLQ for API lambdas self.create_queue(id="api_dead_letter_queue") api_dlq_alarm = cloudwatch.Alarm( self, "APIDLQAlarm", metric=self.queues_["api_dead_letter_queue"].metric( "ApproximateNumberOfMessagesVisible"), evaluation_periods=1, threshold=0.0, comparison_operator=ComparisonOperator.GREATER_THAN_THRESHOLD, ) api_dlq_alarm.add_alarm_action( cw_actions.SnsAction(self.topics_["alarm_topic"])) # The new_scenes_queue subscribe to CBERS 4/4A quicklooks notifications. The # STAC items are generated from the original INPE metadata file as # soon as the quicklooks are created in the PDS bucket # This code fragment creates the queue, the associated dlq and # subscribe to CBERS 4/4A quicklook notification topics self.create_queue( id="process_new_scenes_queue_dlq", retention_period=core.Duration.seconds(1209600), ) process_new_scenes_queue_alarm = cloudwatch.Alarm( self, "ProcessNewScenesQueueAlarm", metric=self.queues_["process_new_scenes_queue_dlq"].metric( "ApproximateNumberOfMessagesVisible"), evaluation_periods=1, threshold=0.0, comparison_operator=ComparisonOperator.GREATER_THAN_THRESHOLD, ) process_new_scenes_queue_alarm.add_alarm_action( cw_actions.SnsAction(self.topics_["alarm_topic"])) self.create_queue( id="new_scenes_queue", visibility_timeout=core.Duration.seconds(385), retention_period=core.Duration.seconds(1209600), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=1, queue=self.queues_["process_new_scenes_queue_dlq"]), ) # Add subscriptions for each CB4 camera sns.Topic.from_topic_arn( self, id="CB4MUX", topic_arn="arn:aws:sns:us-east-1:599544552497:NewCB4MUXQuicklook", ).add_subscription( sns_subscriptions.SqsSubscription( self.queues_["new_scenes_queue"])) sns.Topic.from_topic_arn( self, id="CB4AWFI", topic_arn="arn:aws:sns:us-east-1:599544552497:NewCB4AWFIQuicklook", ).add_subscription( sns_subscriptions.SqsSubscription( self.queues_["new_scenes_queue"])) sns.Topic.from_topic_arn( self, id="CB4PAN10M", topic_arn= "arn:aws:sns:us-east-1:599544552497:NewCB4PAN10MQuicklook", ).add_subscription( sns_subscriptions.SqsSubscription( self.queues_["new_scenes_queue"])) sns.Topic.from_topic_arn( self, id="CBPAN5M", topic_arn="arn:aws:sns:us-east-1:599544552497:NewCB4PAN5MQuicklook", ).add_subscription( sns_subscriptions.SqsSubscription( self.queues_["new_scenes_queue"])) # Subscription for CB4A (all cameras) sns.Topic.from_topic_arn( self, id="CB4A", topic_arn="arn:aws:sns:us-east-1:599544552497:NewCB4AQuicklook", ).add_subscription( sns_subscriptions.SqsSubscription( self.queues_["new_scenes_queue"])) self.create_queue( id="catalog_prefix_update_queue", visibility_timeout=core.Duration.seconds(60), retention_period=core.Duration.seconds(1209600), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=3, queue=self.queues_["dead_letter_queue"]), ) # Reconcile queue for INPE's XML metadata self.create_queue( id="consume_reconcile_queue_dlq", retention_period=core.Duration.seconds(1209600), ) consume_reconcile_queue_alarm = cloudwatch.Alarm( self, 
"ConsumeReconcileQueueAlarm", metric=self.queues_["consume_reconcile_queue_dlq"].metric( "ApproximateNumberOfMessagesVisible"), evaluation_periods=1, threshold=0.0, comparison_operator=ComparisonOperator.GREATER_THAN_THRESHOLD, ) consume_reconcile_queue_alarm.add_alarm_action( cw_actions.SnsAction(self.topics_["alarm_topic"])) self.create_queue( id="reconcile_queue", visibility_timeout=core.Duration.seconds(1000), retention_period=core.Duration.seconds(1209600), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=3, queue=self.queues_["consume_reconcile_queue_dlq"]), ) # Reconcile queue for STAC items self.create_queue( id="consume_stac_reconcile_queue_dlq", retention_period=core.Duration.seconds(1209600), ) consume_stac_reconcile_queue_alarm = cloudwatch.Alarm( self, "ConsumeStacReconcileQueueAlarm", metric=self.queues_["consume_stac_reconcile_queue_dlq"].metric( "ApproximateNumberOfMessagesVisible"), evaluation_periods=1, threshold=0.0, comparison_operator=ComparisonOperator.GREATER_THAN_THRESHOLD, ) consume_stac_reconcile_queue_alarm.add_alarm_action( cw_actions.SnsAction(self.topics_["alarm_topic"])) self.create_queue( id="stac_reconcile_queue", visibility_timeout=core.Duration.seconds(1000), retention_period=core.Duration.seconds(1209600), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=3, queue=self.queues_["consume_stac_reconcile_queue_dlq"], ), ) # Queue for STAC items to be inserted into Elasticsearch. Subscribe to the # topic with new stac items self.create_queue( id="insert_into_elasticsearch_queue", visibility_timeout=core.Duration.seconds(180), retention_period=core.Duration.seconds(1209600), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=3, queue=self.queues_["dead_letter_queue"]), ) # Subscription for new item topics self.topics_["stac_item_topic"].add_subscription( sns_subscriptions.SqsSubscription( self.queues_["insert_into_elasticsearch_queue"])) # Subscription for reconciled item topics self.topics_["reconcile_stac_item_topic"].add_subscription( sns_subscriptions.SqsSubscription( self.queues_["insert_into_elasticsearch_queue"])) # Backup queue for STAC items inserted into Elasticsearch. # This holds the same items received by "insert_into_elasticsearch_queue", # simply holding them for some time to allow recover from ES # cluster failures (see #78) # This queue subscribe only to new item topics self.create_queue( id="backup_insert_into_elasticsearch_queue", visibility_timeout=core.Duration.seconds(180), retention_period=core.Duration.days( settings.backup_queue_retention_days), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=3, queue=self.queues_["dead_letter_queue"]), ) # Subscription for new item topics self.topics_["stac_item_topic"].add_subscription( sns_subscriptions.SqsSubscription( self.queues_["backup_insert_into_elasticsearch_queue"]))
def __init__( self, scope: core.Construct, _id: str, *, vpc, bucket_para, # key_name, ddb_file_list, sqs_queue, sqs_queue_DLQ, ssm_bucket_para, ssm_credential_para, s3bucket, s3_deploy, **kwargs) -> None: super().__init__(scope, _id, **kwargs) # Create environment variable into userdata env_var = f'export table_queue_name={ddb_file_list.table_name}\n' \ f'export sqs_queue_name={sqs_queue.queue_name}\n' \ f'export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\n' env_var_st = f'echo \"export table_queue_name={ddb_file_list.table_name}\" >> /etc/rc.local\n' \ f'echo \"export sqs_queue_name={sqs_queue.queue_name}\" >> /etc/rc.local\n' \ f'echo \"export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\" >> /etc/rc.local\n' # Create log group and put group name into userdata s3_migrate_log = logs.LogGroup(self, "applog") cw_agent_config['logs']['logs_collected']['files']['collect_list'][0][ 'log_group_name'] = s3_migrate_log.log_group_name cw_agent_config['logs']['logs_collected']['files']['collect_list'][1][ 'log_group_name'] = s3_migrate_log.log_group_name cw_agent_config['metrics']['append_dimensions'][ 'AutoScalingGroupName'] = "\\${aws:AutoScalingGroupName}" cw_agent_config['metrics']['append_dimensions'][ 'InstanceId'] = "\\${aws:InstanceId}" cw_agent_config_str = json.dumps(cw_agent_config, indent=4).replace("\\\\", "\\") userdata_head = user_data_part1 + cw_agent_config_str + user_data_part2 + \ s3_deploy.bucket_name + " .\n" + env_var + env_var_st jobsender_userdata = userdata_head + user_data_jobsender_p worker_userdata = userdata_head + user_data_worker_p # Create jobsender ec2 node jobsender = autoscaling.AutoScalingGroup( self, "jobsender", instance_type=ec2.InstanceType( instance_type_identifier=jobsender_type), machine_image=linux_ami, # key_name=key_name, user_data=ec2.UserData.custom(jobsender_userdata), vpc=vpc, vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), desired_capacity=1, min_capacity=0, max_capacity=1) # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH") # Don't need SSH since we use Session Manager # Assign EC2 Policy to use SSM and CWAgent jobsender.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "AmazonSSMManagedInstanceCore")) jobsender.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "CloudWatchAgentServerPolicy")) # Create Worker Autoscaling Group worker_asg = autoscaling.AutoScalingGroup( self, "worker-asg", vpc=vpc, vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), instance_type=ec2.InstanceType( instance_type_identifier=worker_type), machine_image=linux_ami, # key_name=key_name, # Optional if use SSM-SessionManager user_data=ec2.UserData.custom(worker_userdata), desired_capacity=2, min_capacity=2, max_capacity=10, spot_price="0.5", group_metrics=[autoscaling.GroupMetrics.all()]) # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH") # Don't need SSH since we use Session Manager # Assign EC2 Policy to use SSM and CWAgent worker_asg.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "AmazonSSMManagedInstanceCore")) worker_asg.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "CloudWatchAgentServerPolicy")) # Allow EC2 access new DynamoDB Table ddb_file_list.grant_full_access(jobsender) ddb_file_list.grant_full_access(worker_asg) # Allow EC2 access new sqs and its DLQ sqs_queue.grant_consume_messages(jobsender) sqs_queue.grant_send_messages(jobsender) 
sqs_queue.grant_consume_messages(worker_asg) sqs_queue_DLQ.grant_consume_messages(jobsender) # Allow EC2 to access SSM Parameter Store, get bucket info and get credentials ssm_bucket_para.grant_read(jobsender) ssm_credential_para.grant_read(jobsender) ssm_credential_para.grant_read(worker_asg) # Allow EC2 to access source code on s3_deploy bucket s3_deploy.grant_read(jobsender) s3_deploy.grant_read(worker_asg) # Allow EC2 to access the new S3 bucket s3bucket.grant_read(jobsender) s3bucket.grant_read(worker_asg) # Allow EC2 to access existing S3 buckets for PUT mode: read-only access to the source buckets bucket_name = '' for b in bucket_para: if bucket_name != b['src_bucket']: # skip if the same bucket is listed more than once bucket_name = b['src_bucket'] s3exist_bucket = s3.Bucket.from_bucket_name( self, bucket_name, # use the bucket name as the construct id bucket_name=bucket_name) s3exist_bucket.grant_read(jobsender) s3exist_bucket.grant_read(worker_asg) # Allow EC2 to access existing S3 buckets for GET mode: read and write access to the destination buckets # bucket_name = '' # for b in bucket_para: # if bucket_name != b['des_bucket']: # skip if the same bucket is listed more than once # bucket_name = b['des_bucket'] # s3exist_bucket = s3.Bucket.from_bucket_name(self, # bucket_name, # use the bucket name as the construct id # bucket_name=bucket_name) # s3exist_bucket.grant_read_write(jobsender) # s3exist_bucket.grant_read_write(worker_asg) # Dashboard to monitor SQS and EC2 board = cw.Dashboard(self, "s3_migrate") ec2_metric_cpu_avg = cw.Metric(namespace="AWS/EC2", metric_name="CPUUtilization", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) ec2_metric_net_out = cw.MathExpression( expression= "SEARCH('{AWS/EC2, InstanceId} NetworkOut', 'Average', 60)", label="EC2-NetworkOut", using_metrics={}) autoscaling_GroupDesiredCapacity = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupDesiredCapacity", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupInServiceInstances = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupInServiceInstances", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupMinSize = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupMinSize", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupMaxSize = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupMaxSize", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) # CWAgent collected metric cwagent_mem_avg = cw.MathExpression( expression= "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" + worker_asg.auto_scaling_group_name + " AND MetricName=mem_used_percent)', 'Average', 60)", label="mem_avg", using_metrics={}) cwagent_disk_avg = cw.MathExpression( expression= "SEARCH('{CWAgent, path, InstanceId, AutoScalingGroupName, device, fstype} " "(AutoScalingGroupName=" + worker_asg.auto_scaling_group_name + " AND MetricName=disk_used_percent AND path=\"/\")', 'Average', 60)", label="disk_avg", using_metrics={}) cwagent_net_tcp = cw.MathExpression( expression= "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" + worker_asg.auto_scaling_group_name + " AND MetricName=tcp_established)', 'Average', 60)", label="tcp_conn", using_metrics={}) # CWAgent collected application logs - filter metric s3_migrate_log.add_metric_filter( "Completed-bytes", metric_name="Completed-bytes", 
metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[date, time, info, hs, p="--->Complete", bytes, key]')) s3_migrate_log.add_metric_filter( "Uploading-bytes", metric_name="Uploading-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[date, time, info, hs, p="--->Uploading", bytes, key]')) s3_migrate_log.add_metric_filter( "Downloading-bytes", metric_name="Downloading-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[date, time, info, hs, p="--->Downloading", bytes, key]')) traffic_metric_Complete = cw.Metric(namespace="s3_migrate", metric_name="Completed-bytes", statistic="Sum", period=core.Duration.minutes(1)) traffic_metric_Upload = cw.Metric(namespace="s3_migrate", metric_name="Uploading-bytes", statistic="Sum", period=core.Duration.minutes(1)) traffic_metric_Download = cw.Metric(namespace="s3_migrate", metric_name="Downloading-bytes", statistic="Sum", period=core.Duration.minutes(1)) s3_migrate_log.add_metric_filter( "ERROR", metric_name="ERROR-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"ERROR"')) s3_migrate_log.add_metric_filter( "WARNING", metric_name="WARNING-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"WARNING"')) log_metric_ERROR = cw.Metric(namespace="s3_migrate", metric_name="ERROR-Logs", statistic="Sum", period=core.Duration.minutes(1)) log_metric_WARNING = cw.Metric(namespace="s3_migrate", metric_name="WARNING-Logs", statistic="Sum", period=core.Duration.minutes(1)) board.add_widgets( cw.GraphWidget(title="S3-MIGRATION-TOTAL-TRAFFIC", left=[ traffic_metric_Complete, traffic_metric_Upload, traffic_metric_Download ], left_y_axis=cw.YAxisProps(label="Bytes/min", show_units=False)), cw.GraphWidget(title="ERROR/WARNING LOGS", left=[log_metric_ERROR], left_y_axis=cw.YAxisProps(label="Count", show_units=False), right=[log_metric_WARNING], right_y_axis=cw.YAxisProps(label="Count", show_units=False)), cw.GraphWidget( title="SQS-JOBS", left=[ sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.SingleValueWidget( title="RUNNING, WAITING & DEATH JOBS", metrics=[ sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. 
metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)) ], height=6)) board.add_widgets( cw.GraphWidget(title="EC2-AutoscalingGroup-TCP", left=[cwagent_net_tcp], left_y_axis=cw.YAxisProps(label="Count", show_units=False)), cw.GraphWidget(title="EC2-AutoscalingGroup-CPU/MEMORY", left=[ec2_metric_cpu_avg, cwagent_mem_avg], left_y_axis=cw.YAxisProps(max=100, min=0, label="%", show_units=False)), cw.GraphWidget(title="EC2-AutoscalingGroup-DISK", left=[cwagent_disk_avg], left_y_axis=cw.YAxisProps(max=100, min=0, label="%", show_units=False)), cw.SingleValueWidget(title="EC2-AutoscalingGroup-CAPACITY", metrics=[ autoscaling_GroupDesiredCapacity, autoscaling_GroupInServiceInstances, autoscaling_GroupMinSize, autoscaling_GroupMaxSize ], height=6)) board.add_widgets( cw.GraphWidget(title="EC2-NetworkOut", left=[ec2_metric_net_out], left_y_axis=cw.YAxisProps(label="Bytes/min", show_units=False))) # Autoscaling up when visible message > 100 in 5 mins worker_asg.scale_on_metric( "scaleup", metric=sqs_queue.metric_approximate_number_of_messages_visible(), scaling_steps=[ autoscaling.ScalingInterval(change=1, lower=100, upper=500), autoscaling.ScalingInterval(change=2, lower=500), autoscaling.ScalingInterval(change=0, upper=100, lower=0) ], adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY) # Alarm for queue empty and ec2 > 1 # If the queue is empty (no visible or in-flight messages) and more than one EC2 instance is running, raise the alarm and scale the group down to one instance # This can be tuned per scenario: if the Jobsender node is also used for transfers, the Auto Scaling Group could instead be set to 0 here when there are no jobs metric_all_message = cw.MathExpression( expression="IF(((a+b) == 0) AND (c >1), 0, 1)", # evaluates to 0 (alarm) when a+b == 0 and c > 1 label="empty_queue_expression", using_metrics={ "a": sqs_queue.metric_approximate_number_of_messages_visible(), "b": sqs_queue.metric_approximate_number_of_messages_not_visible(), "c": autoscaling_GroupInServiceInstances }) alarm_0 = cw.Alarm( self, "SQSempty", alarm_name= "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster", metric=metric_all_message, threshold=0, comparison_operator=cw.ComparisonOperator.
LESS_THAN_OR_EQUAL_TO_THRESHOLD, evaluation_periods=3, datapoints_to_alarm=3, treat_missing_data=cw.TreatMissingData.NOT_BREACHING) alarm_topic_empty = sns.Topic( self, "SQS queue empty and ec2 more than 1 in Cluster") # This alarm can also serve as the notification that a batch transfer has finished; raised this way it notifies only once instead of repeatedly alarm_topic_empty.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty)) # If queue empty, set autoscale down to 1 EC2 action_shutdown = autoscaling.StepScalingAction( self, "shutdown", auto_scaling_group=worker_asg, adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY) action_shutdown.add_adjustment(adjustment=1, upper_bound=0) alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown)) # While there are messages in the SQS DLQ, alarm to SNS alarm_DLQ = cw.Alarm( self, "SQS_DLQ", alarm_name= "s3-migration-cluster-SQS DLQ more than 1 message-Cluster", metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible( ), threshold=0, comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD, evaluation_periods=3, datapoints_to_alarm=3, treat_missing_data=cw.TreatMissingData.IGNORE) alarm_topic_DLQ = sns.Topic(self, "SQS DLQ more than 1 message-Cluster") alarm_topic_DLQ.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ)) # Output core.CfnOutput(self, "LogGroup", value=s3_migrate_log.log_group_name) core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_cluster") core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for cluster: " + alarm_email)
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) security_distribution_list_email = '*****@*****.**' # securityhub_instance = securityhub.CfnHub(self, 'SecurityHub') # Ensure AWS Config is enabled / Ensure CloudTrail is enabled in all Regions 2.1 - 2.8 cloudtrail_bucket_accesslogs = s3.Bucket( self, "CloudTrailS3Accesslogs", block_public_access=s3.BlockPublicAccess.BLOCK_ALL, encryption=s3.BucketEncryption.S3_MANAGED, removal_policy=core.RemovalPolicy.RETAIN) cloudtrail_bucket = s3.Bucket( self, "CloudTrailS3", block_public_access=s3.BlockPublicAccess.BLOCK_ALL, encryption=s3.BucketEncryption.S3_MANAGED, removal_policy=core.RemovalPolicy.RETAIN, server_access_logs_bucket=cloudtrail_bucket_accesslogs, ) cloudtrail_kms = kms.Key(self, "CloudTrailKey", enable_key_rotation=True) # CloudTrail - single account, not Organization trail = cloudtrail.Trail( self, "CloudTrail", enable_file_validation=True, is_multi_region_trail=True, include_global_service_events=True, send_to_cloud_watch_logs=True, cloud_watch_logs_retention=logs.RetentionDays.FOUR_MONTHS, bucket=cloudtrail_bucket, kms_key=cloudtrail_kms) cloudtrail_kms.grant(iam.ServicePrincipal('cloudtrail.amazonaws.com'), 'kms:DescribeKey') cloudtrail_kms.grant( iam.ServicePrincipal( 'cloudtrail.amazonaws.com', conditions={ 'StringLike': { 'kms:EncryptionContext:aws:cloudtrail:arn': 'arn:aws:cloudtrail:*:' + core.Stack.of(self).account + ':trail/*' } }), 'kms:GenerateDataKey*') cloudtrail_kms.add_to_resource_policy( iam.PolicyStatement( actions=["kms:Decrypt", "kms:ReEncryptFrom"], conditions={ 'StringEquals': { 'kms:CallerAccount': core.Stack.of(self).account }, 'StringLike': { 'kms:EncryptionContext:aws:cloudtrail:arn': 'arn:aws:cloudtrail:*:' + core.Stack.of(self).account + ':trail/*' } }, effect=iam.Effect.ALLOW, principals=[iam.AnyPrincipal()], resources=['*'])) cloudtrail_kms.add_to_resource_policy( iam.PolicyStatement(actions=["kms:CreateAlias"], conditions={ 'StringEquals': { 'kms:CallerAccount': core.Stack.of(self).account, 'kms:ViaService': 'ec2.' 
+ core.Stack.of(self).region + '.amazonaws.com' } }, effect=iam.Effect.ALLOW, principals=[iam.AnyPrincipal()], resources=['*'])) cloudtrail_kms.add_to_resource_policy( iam.PolicyStatement( actions=["kms:Decrypt", "kms:ReEncryptFrom"], conditions={ 'StringEquals': { 'kms:CallerAccount': core.Stack.of(self).account }, 'StringLike': { 'kms:EncryptionContext:aws:cloudtrail:arn': 'arn:aws:cloudtrail:*:' + core.Stack.of(self).account + ':trail/*' } }, effect=iam.Effect.ALLOW, principals=[iam.AnyPrincipal()], resources=['*'])) config_role = iam.CfnServiceLinkedRole( self, id='ServiceLinkedRoleConfig', aws_service_name='config.amazonaws.com') global_config = config.CfnConfigurationRecorder(self, 'ConfigRecorder', name='default', # role_arn=config_role.role_arn, role_arn="arn:aws:iam::" + \ core.Stack.of( self).account+":role/aws-service-role/config.amazonaws.com/AWSServiceRoleForConfig", # role_arn=config_role.get_att( # attribute_name='resource.arn').to_string(), recording_group=config.CfnConfigurationRecorder.RecordingGroupProperty( all_supported=True, include_global_resource_types=True ) ) config_bucket = s3.Bucket( self, "ConfigS3", block_public_access=s3.BlockPublicAccess.BLOCK_ALL, encryption=s3.BucketEncryption.S3_MANAGED, removal_policy=core.RemovalPolicy.RETAIN, ) config_bucket.add_to_resource_policy( iam.PolicyStatement( actions=['s3:GetBucketAcl'], effect=iam.Effect.ALLOW, principals=[iam.ServicePrincipal('config.amazonaws.com')], resources=[config_bucket.bucket_arn])) config_bucket.add_to_resource_policy( iam.PolicyStatement( actions=['s3:PutObject'], effect=iam.Effect.ALLOW, principals=[iam.ServicePrincipal('config.amazonaws.com')], resources=[ config_bucket.arn_for_objects('AWSLogs/' + core.Stack.of(self).account + '/Config/*') ], conditions={ "StringEquals": { 's3:x-amz-acl': 'bucket-owner-full-control', } })) config_delivery_stream = config.CfnDeliveryChannel( self, "ConfigDeliveryChannel", s3_bucket_name=config_bucket.bucket_name) # Config Aggregator in Organizations account # config_aggregator = config.CfnConfigurationAggregator(self, 'ConfigAggregator', # configuration_aggregator_name='ConfigAggregator', # organization_aggregation_source=config.CfnConfigurationAggregator.OrganizationAggregationSourceProperty( # role_arn=iam.Role(self, "AWSConfigRoleForOrganizations", # assumed_by=iam.ServicePrincipal( # 'config.amazonaws.com'), # managed_policies=[iam.ManagedPolicy.from_aws_managed_policy_name( # 'service-role/AWSConfigRoleForOrganizations')] # ).role_arn, # all_aws_regions=True # ) # ) # 2.9 – Ensure VPC flow logging is enabled in all VPCs # vpc = ec2.Vpc.from_lookup(self, "VPC", # is_default=True, # ) # S3 for VPC flow logs # vpc_flow_logs_bucket = s3.Bucket(self, "VPCFlowLogsBucket", # block_public_access=s3.BlockPublicAccess.BLOCK_ALL, # encryption=s3.BucketEncryption.S3_MANAGED, # removal_policy=core.RemovalPolicy.RETAIN # ) # Ensure a log metric filter and alarm exist for 3.1 – 3.14 security_notifications_topic = sns.Topic(self, 'CIS_Topic', display_name='CIS_Topic', topic_name='CIS_Topic') sns.Subscription(self, 'CIS_Subscription', topic=security_notifications_topic, protocol=sns.SubscriptionProtocol.EMAIL, endpoint=security_distribution_list_email) cloudwatch_actions_cis = cloudwatch_actions.SnsAction( security_notifications_topic) cis_metricfilter_alarms = { 'CIS-3.1-UnauthorizedAPICalls': '($.errorCode="*UnauthorizedOperation") || ($.errorCode="AccessDenied*")', 'CIS-3.2-ConsoleSigninWithoutMFA': '($.eventName="ConsoleLogin") && ($.additionalEventData.MFAUsed 
!="Yes")', 'RootAccountUsageAlarm': '$.userIdentity.type="Root" && $.userIdentity.invokedBy NOT EXISTS && $.eventType !="AwsServiceEvent"', 'CIS-3.4-IAMPolicyChanges': '($.eventName=DeleteGroupPolicy) || ($.eventName=DeleteRolePolicy) || ($.eventName=DeleteUserPolicy) || ($.eventName=PutGroupPolicy) || ($.eventName=PutRolePolicy) || ($.eventName=PutUserPolicy) || ($.eventName=CreatePolicy) || ($.eventName=DeletePolicy) || ($.eventName=CreatePolicyVersion) || ($.eventName=DeletePolicyVersion) || ($.eventName=AttachRolePolicy) || ($.eventName=DetachRolePolicy) || ($.eventName=AttachUserPolicy) || ($.eventName=DetachUserPolicy) || ($.eventName=AttachGroupPolicy) || ($.eventName=DetachGroupPolicy)', 'CIS-3.5-CloudTrailChanges': '($.eventName=CreateTrail) || ($.eventName=UpdateTrail) || ($.eventName=DeleteTrail) || ($.eventName=StartLogging) || ($.eventName=StopLogging)', 'CIS-3.6-ConsoleAuthenticationFailure': '($.eventName=ConsoleLogin) && ($.errorMessage="Failed authentication")', 'CIS-3.7-DisableOrDeleteCMK': '($.eventSource=kms.amazonaws.com) && (($.eventName=DisableKey) || ($.eventName=ScheduleKeyDeletion))', 'CIS-3.8-S3BucketPolicyChanges': '($.eventSource=s3.amazonaws.com) && (($.eventName=PutBucketAcl) || ($.eventName=PutBucketPolicy) || ($.eventName=PutBucketCors) || ($.eventName=PutBucketLifecycle) || ($.eventName=PutBucketReplication) || ($.eventName=DeleteBucketPolicy) || ($.eventName=DeleteBucketCors) || ($.eventName=DeleteBucketLifecycle) || ($.eventName=DeleteBucketReplication))', 'CIS-3.9-AWSConfigChanges': '($.eventSource=config.amazonaws.com) && (($.eventName=StopConfigurationRecorder) || ($.eventName=DeleteDeliveryChannel) || ($.eventName=PutDeliveryChannel) || ($.eventName=PutConfigurationRecorder))', 'CIS-3.10-SecurityGroupChanges': '($.eventName=AuthorizeSecurityGroupIngress) || ($.eventName=AuthorizeSecurityGroupEgress) || ($.eventName=RevokeSecurityGroupIngress) || ($.eventName=RevokeSecurityGroupEgress) || ($.eventName=CreateSecurityGroup) || ($.eventName=DeleteSecurityGroup)', 'CIS-3.11-NetworkACLChanges': '($.eventName=CreateNetworkAcl) || ($.eventName=CreateNetworkAclEntry) || ($.eventName=DeleteNetworkAcl) || ($.eventName=DeleteNetworkAclEntry) || ($.eventName=ReplaceNetworkAclEntry) || ($.eventName=ReplaceNetworkAclAssociation)', 'CIS-3.12-NetworkGatewayChanges': '($.eventName=CreateCustomerGateway) || ($.eventName=DeleteCustomerGateway) || ($.eventName=AttachInternetGateway) || ($.eventName=CreateInternetGateway) || ($.eventName=DeleteInternetGateway) || ($.eventName=DetachInternetGateway)', 'CIS-3.13-RouteTableChanges': '($.eventName=CreateRoute) || ($.eventName=CreateRouteTable) || ($.eventName=ReplaceRoute) || ($.eventName=ReplaceRouteTableAssociation) || ($.eventName=DeleteRouteTable) || ($.eventName=DeleteRoute) || ($.eventName=DisassociateRouteTable)', 'CIS-3.14-VPCChanges': '($.eventName=CreateVpc) || ($.eventName=DeleteVpc) || ($.eventName=ModifyVpcAttribute) || ($.eventName=AcceptVpcPeeringConnection) || ($.eventName=CreateVpcPeeringConnection) || ($.eventName=DeleteVpcPeeringConnection) || ($.eventName=RejectVpcPeeringConnection) || ($.eventName=AttachClassicLinkVpc) || ($.eventName=DetachClassicLinkVpc) || ($.eventName=DisableVpcClassicLink) || ($.eventName=EnableVpcClassicLink)', } for x, y in cis_metricfilter_alarms.items(): str_x = str(x) str_y = str(y) logs.MetricFilter( self, "MetricFilter_" + str_x, log_group=trail.log_group, filter_pattern=logs.JsonPattern(json_pattern_string=str_y), metric_name=str_x, metric_namespace="LogMetrics", 
metric_value='1') cloudwatch.Alarm( self, "Alarm_" + str_x, alarm_name=str_x, alarm_description=str_x, statistic='Sum', period=core.Duration.minutes(5), comparison_operator=cloudwatch.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD, evaluation_periods=1, threshold=1, metric=cloudwatch.Metric(metric_name=str_x, namespace="LogMetrics"), ).add_alarm_action(cloudwatch_actions_cis) # IAM Password Policy custom resource CIS 1.5 - 1.11 cfn_template = cfn_inc.CfnInclude( self, "includeTemplate", template_file="account-password-policy.yaml", parameters={ "MaxPasswordAge": 90, "MinimumPasswordLength": 14, "PasswordReusePrevention": 24, "RequireLowercaseCharacters": True, "RequireNumbers": True, "RequireSymbols": True, "RequireUppercaseCharacters": True, }) # CIS 1.20 support_role = iam.Role( self, "SupportRole", assumed_by=iam.AccountPrincipal( account_id=core.Stack.of(self).account), managed_policies=[ iam.ManagedPolicy.from_aws_managed_policy_name( 'AWSSupportAccess') ], role_name='AWSSupportAccess') guardduty_detector = guardduty.CfnDetector(self, 'GuardDutyDetector', enable=True) guardduty_event = events.Rule( self, 'GuardDutyEvent', rule_name='guardduty-notification', description='GuardDuty Notification', event_pattern=events.EventPattern( source=['aws.guardduty'], detail_type=['GuardDuty Finding']), targets=[events_targets.SnsTopic(security_notifications_topic)])
def __init__(self, scope: core.Construct, id: str, **kwargs): super().__init__(scope, id, **kwargs) # The code that defines your stack goes here this_dir = path.dirname(__file__) handler = lmb.Function(self, 'Handler', runtime=lmb.Runtime.PYTHON_3_7, handler='handler.handler', code=lmb.Code.from_asset( path.join(this_dir, 'lambda'))) alias = lmb.Alias(self, "HandlerAlias", alias_name="Current", version=handler.current_version) gw = apigw.LambdaRestApi( self, 'Gateway', description='Endpoint for a singple Lambda-powered web service', handler=alias, endpoint_types=[EndpointType.REGIONAL]) failure_alarm = cloudwatch.Alarm( self, "FailureAlarm", alarm_name=self.stack_name + '-' + '500Alarm', metric=cloudwatch.Metric(metric_name="5XXError", namespace="AWS/ApiGateway", dimensions={ "ApiName": "Gateway", }, statistic="Sum", period=core.Duration.minutes(1)), threshold=1, evaluation_periods=1) alarm500topic = sns.Topic(self, "Alarm500Topic", topic_name=self.stack_name + '-' + 'Alarm500TopicSNS') alarm500topic.add_subscription( subscriptions.EmailSubscription("*****@*****.**")) failure_alarm.add_alarm_action(cw_actions.SnsAction(alarm500topic)) codedeploy.LambdaDeploymentGroup( self, "DeploymentGroup", alias=alias, deployment_config=codedeploy.LambdaDeploymentConfig. CANARY_10_PERCENT_10_MINUTES, alarms=[failure_alarm]) # Create a dynamodb table table_name = self.stack_name + '-' + 'HelloCdkTable' table = dynamodb.Table(self, "TestTable", table_name=table_name, partition_key=Attribute( name="id", type=dynamodb.AttributeType.STRING)) table_name_id = cr.PhysicalResourceId.of(table.table_name) on_create_action = AwsSdkCall( action='putItem', service='DynamoDB', physical_resource_id=table_name_id, parameters={ 'Item': { 'id': { 'S': 'HOLA_CREATE' }, 'date': { 'S': datetime.today().strftime('%Y-%m-%d') }, 'epoch': { 'N': str(int(time.time())) } }, 'TableName': table_name }) on_update_action = AwsSdkCall( action='putItem', service='DynamoDB', physical_resource_id=table_name_id, parameters={ 'Item': { 'id': { 'S': 'HOLA_UPDATE' }, 'date': { 'S': datetime.today().strftime('%Y-%m-%d') }, 'epoch': { 'N': str(int(time.time())) } }, 'TableName': table_name }) cr.AwsCustomResource( self, "TestTableCustomResource", on_create=on_create_action, on_update=on_update_action, policy=cr.AwsCustomResourcePolicy.from_sdk_calls( resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE)) # OUTPUT self.url_output = core.CfnOutput(self, 'Url', value=gw.url)
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) # ----------------------------------------------------------------------------------------------------------- # The Simple Webservice Logic - This is what we will be monitoring # # API GW HTTP API, Lambda Fn and DynamoDB # https://github.com/cdk-patterns/serverless/tree/master/the-simple-webservice # ----------------------------------------------------------------------------------------------------------- # DynamoDB Table table = dynamo_db.Table( self, "Hits", partition_key=dynamo_db.Attribute( name="path", type=dynamo_db.AttributeType.STRING), billing_mode=dynamo_db.BillingMode.PAY_PER_REQUEST) # defines an AWS Lambda resource dynamo_lambda = _lambda.Function( self, "DynamoLambdaHandler", runtime=_lambda.Runtime.NODEJS_12_X, # execution environment handler="lambda.handler", # file is "lambda", function is "handler" code=_lambda.Code.from_asset( "lambda_fns"), # Code loaded from the lambda dir environment={'HITS_TABLE_NAME': table.table_name}) # grant the lambda role read/write permissions to our table' table.grant_read_write_data(dynamo_lambda) # defines an API Gateway Http API resource backed by our "dynamoLambda" function. api = api_gw.HttpApi( self, 'HttpAPI', default_integration=integrations.LambdaProxyIntegration( handler=dynamo_lambda)) core.CfnOutput(self, 'HTTP API Url', value=api.url) # ----------------------------------------------------------------------------------------------------------- # Monitoring Logic Starts Here # # This is everything we need to understand the state of our system: # - custom metrics # - cloudwatch alarms # - custom cloudwatch dashboard # ----------------------------------------------------------------------------------------------------------- # SNS Topic so we can hook things into our alerts e.g. email error_topic = sns.Topic(self, 'theBigFanTopic') ### # Custom Metrics ### api_gw_4xx_error_percentage = cloud_watch.MathExpression( expression="m1/m2*100", label="% API Gateway 4xx Errors", using_metrics={ "m1": self.metric_for_api_gw(api.http_api_id, '4XXError', '4XX Errors', 'sum'), "m2": self.metric_for_api_gw(api.http_api_id, 'Count', '# Requests', 'sum'), }, period=core.Duration.minutes(5)) # Gather the % of lambda invocations that error in past 5 mins lambda_error_perc = cloud_watch.MathExpression( expression="e / i * 100", label="% of invocations that errored, last 5 mins", using_metrics={ "i": dynamo_lambda.metric(metric_name="Invocations", statistic="sum"), "e": dynamo_lambda.metric(metric_name="Errors", statistic="sum"), }, period=core.Duration.minutes(5)) # note: throttled requests are not counted in total num of invocations lambda_throttled_perc = cloud_watch.MathExpression( expression="t / (i + t) * 100", label="% of throttled requests, last 30 mins", using_metrics={ "i": dynamo_lambda.metric(metric_name="Invocations", statistic="sum"), "t": dynamo_lambda.metric(metric_name="Throttles", statistic="sum"), }, period=core.Duration.minutes(5)) # I think usererrors are at an account level rather than a table level so merging # these two metrics until I can get a definitive answer. 
I think usererrors # will always show as 0 when scoped to a table so this is still effectively # a system errors count dynamo_db_total_errors = cloud_watch.MathExpression( expression="m1 + m2", label="DynamoDB Errors", using_metrics={ "m1": table.metric_user_errors(), "m2": table.metric_system_errors_for_operations(), }, period=core.Duration.minutes(5)) # Rather than have 2 alerts, let's create one aggregate metric dynamo_db_throttles = cloud_watch.MathExpression( expression="m1 + m2", label="DynamoDB Throttles", using_metrics={ "m1": table.metric(metric_name="ReadThrottleEvents", statistic="sum"), "m2": table.metric(metric_name="WriteThrottleEvents", statistic="sum"), }, period=core.Duration.minutes(5)) ### # Alarms ### # Api Gateway # 4xx are user errors so a large volume indicates a problem cloud_watch.Alarm(self, id="API Gateway 4XX Errors > 1%", metric=api_gw_4xx_error_percentage, threshold=1, evaluation_periods=6, datapoints_to_alarm=1, treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \ .add_alarm_action(actions.SnsAction(error_topic)) # 5xx are internal server errors so we want 0 of these cloud_watch.Alarm(self, id="API Gateway 5XX Errors > 0", metric=self.metric_for_api_gw(api_id=api.http_api_id, metric_name="5XXError", label="5XX Errors", stat="p99"), threshold=0, period=core.Duration.minutes(5), evaluation_periods=6, datapoints_to_alarm=1, treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \ .add_alarm_action(actions.SnsAction(error_topic)) cloud_watch.Alarm(self, id="API p99 latency alarm >= 1s", metric=self.metric_for_api_gw(api_id=api.http_api_id, metric_name="Latency", label="API GW Latency", stat="p99"), threshold=1000, period=core.Duration.minutes(5), evaluation_periods=6, datapoints_to_alarm=1, treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \ .add_alarm_action(actions.SnsAction(error_topic)) # Lambda # 2% of Dynamo Lambda invocations erroring cloud_watch.Alarm(self, id="Dynamo Lambda 2% Error", metric=lambda_error_perc, threshold=2, evaluation_periods=6, datapoints_to_alarm=1, treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \ .add_alarm_action(actions.SnsAction(error_topic)) # 1% of Lambda invocations taking longer than 1 second cloud_watch.Alarm(self, id="Dynamo Lambda p99 Long Duration (>1s)", metric=dynamo_lambda.metric_duration(), period=core.Duration.minutes(5), threshold=1000, evaluation_periods=6, datapoints_to_alarm=1, statistic="p99", treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \ .add_alarm_action(actions.SnsAction(error_topic)) # 2% of our lambda invocations are throttled cloud_watch.Alarm(self, id="Dynamo Lambda 2% Throttled", metric=lambda_throttled_perc, threshold=2, evaluation_periods=6, datapoints_to_alarm=1, treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \ .add_alarm_action(actions.SnsAction(error_topic)) # DynamoDB # DynamoDB Interactions are throttled - indicated poorly provisioned cloud_watch.Alarm(self, id="DynamoDB Table Reads/Writes Throttled", metric=dynamo_db_throttles, threshold=1, evaluation_periods=6, datapoints_to_alarm=1, treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \ .add_alarm_action(actions.SnsAction(error_topic)) # There should be 0 DynamoDB errors cloud_watch.Alarm(self, id="DynamoDB Errors > 0", metric=dynamo_db_total_errors, threshold=0, evaluation_periods=6, datapoints_to_alarm=1, treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \ .add_alarm_action(actions.SnsAction(error_topic)) dashboard = 
cloud_watch.Dashboard(self, id="CloudWatchDashBoard") dashboard.add_widgets( cloud_watch.GraphWidget(title="Requests", width=8, left=[ self.metric_for_api_gw( api_id=api.http_api_id, metric_name="Count", label="# Requests", stat="sum") ]), cloud_watch.GraphWidget( title="API GW Latency", width=8, stacked=True, left=[ self.metric_for_api_gw(api_id=api.http_api_id, metric_name="Latency", label="API Latency p50", stat="p50"), self.metric_for_api_gw(api_id=api.http_api_id, metric_name="Latency", label="API Latency p90", stat="p90"), self.metric_for_api_gw(api_id=api.http_api_id, metric_name="Latency", label="API Latency p99", stat="p99") ]), cloud_watch.GraphWidget( title="API GW Errors", width=8, stacked=True, left=[ self.metric_for_api_gw(api_id=api.http_api_id, metric_name="4XXError", label="4XX Errors", stat="sum"), self.metric_for_api_gw(api_id=api.http_api_id, metric_name="5XXError", label="5XX Errors", stat="sum") ]), cloud_watch.GraphWidget(title="Dynamo Lambda Error %", width=8, left=[lambda_error_perc]), cloud_watch.GraphWidget( title="Dynamo Lambda Duration", width=8, stacked=True, left=[ dynamo_lambda.metric_duration(statistic="p50"), dynamo_lambda.metric_duration(statistic="p90"), dynamo_lambda.metric_duration(statistic="p99") ]), cloud_watch.GraphWidget(title="Dynamo Lambda Throttle %", width=8, left=[lambda_throttled_perc]), cloud_watch.GraphWidget( title="DynamoDB Latency", width=8, stacked=True, left=[ table.metric_successful_request_latency( dimensions={ "TableName": table.table_name, "Operation": "GetItem" }), table.metric_successful_request_latency( dimensions={ "TableName": table.table_name, "Operation": "UpdateItem" }), table.metric_successful_request_latency( dimensions={ "TableName": table.table_name, "Operation": "PutItem" }), table.metric_successful_request_latency( dimensions={ "TableName": table.table_name, "Operation": "DeleteItem" }), table.metric_successful_request_latency( dimensions={ "TableName": table.table_name, "Operation": "Query" }), ]), cloud_watch.GraphWidget( title="DynamoDB Consumed Read/Write Units", width=8, stacked=False, left=[ table.metric(metric_name="ConsumedReadCapacityUnits"), table.metric(metric_name="ConsumedWriteCapacityUnits") ]), cloud_watch.GraphWidget( title="DynamoDB Throttles", width=8, stacked=True, left=[ table.metric(metric_name="ReadThrottleEvents", statistic="sum"), table.metric(metric_name="WriteThrottleEvents", statistic="sum") ]), )
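# --- Hedged sketch (assumption) of the metric_for_api_gw() helper -------------
# The alarms and dashboard widgets above call self.metric_for_api_gw(), which is
# not shown in this fragment. HTTP API metrics live in the "AWS/ApiGateway"
# namespace under the ApiId dimension, so a helper consistent with the calls
# above could look like this:
def metric_for_api_gw(self, api_id: str, metric_name: str, label: str,
                      stat: str = "avg") -> cloud_watch.Metric:
    # Build a CloudWatch metric scoped to this HTTP API.
    return cloud_watch.Metric(namespace="AWS/ApiGateway",
                              metric_name=metric_name,
                              dimensions={"ApiId": api_id},
                              label=label,
                              statistic=stat,
                              period=core.Duration.minutes(5))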
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) env = kwargs['env'] work_dir = pathlib.Path(__file__).parents[1] # These below steps allows to reuse ecs cluster which is aleady creatd by shared stack # Get cluster name from ssm parameter cluster_name = ssm.StringParameter.from_string_parameter_name( self, "GetClusterName", string_parameter_name="/dev/compute/container/ecs-cluster-name" ).string_value vpc_az = ssm.StringListParameter.from_string_list_parameter_name( self, "GetVpcAz", string_list_parameter_name="/dev/network/vpc/vpc-az" ).string_list_value # using string instead of stringlist because of subnets parsing issue vpc_public_subnets_1 = ssm.StringParameter.from_string_parameter_name( self, "GetVpcPublicSubnets1", string_parameter_name="/dev/network/vpc/vpc-public-subnets-1" ).string_value vpc_public_subnets_2 = ssm.StringParameter.from_string_parameter_name( self, "GetVpcPublicSubnets2", string_parameter_name="/dev/network/vpc/vpc-public-subnets-2" ).string_value vpc_id = ssm.StringParameter.from_string_parameter_name( self, "GetVpcId", string_parameter_name="/dev/network/vpc/vpc-id").string_value ec2_vpc = ec2.Vpc.from_vpc_attributes( self, "GetVpc", availability_zones=vpc_az, vpc_id=vpc_id, public_subnet_ids=[vpc_public_subnets_1, vpc_public_subnets_2]) # Get security group id from ssm parameter security_group_id = ssm.StringParameter.from_string_parameter_name( self, "GetSgId", string_parameter_name="/dev/network/vpc/security-group-id" ).string_value # Get security group from lookup ec2_sgp = ec2.SecurityGroup.from_security_group_id( self, "GetSgp", security_group_id=security_group_id) # myDateTimeFunction lambda function my_datetime_lambda = _lambda.Function( self, "my-datetime", runtime=_lambda.Runtime.NODEJS_12_X, handler="myDateTimeFunction.handler", code=_lambda.Code.asset("./lambda"), current_version_options=_lambda.VersionOptions( removal_policy=core.RemovalPolicy.RETAIN, retry_attempts=1)) my_datetime_lambda.add_to_role_policy( iam.PolicyStatement(effect=iam.Effect.ALLOW, actions=["lambda:InvokeFunction"], resources=["*"])) # beforeAllowTraffic lambda function pre_traffic_lambda = _lambda.Function( self, "pre-traffic", runtime=_lambda.Runtime.NODEJS_12_X, handler="beforeAllowTraffic.handler", code=_lambda.Code.asset("./lambda"), environment=dict( NewVersion=my_datetime_lambda.current_version.function_arn)) pre_traffic_lambda.add_to_role_policy( iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=["codedeploy:PutLifecycleEventHookExecutionStatus"], resources=["*"])) pre_traffic_lambda.add_to_role_policy( iam.PolicyStatement(effect=iam.Effect.ALLOW, actions=["lambda:InvokeFunction"], resources=["*"])) # afterAllowTraffic lambda function post_traffic_lambda = _lambda.Function( self, "post-traffic", runtime=_lambda.Runtime.NODEJS_12_X, handler="afterAllowTraffic.handler", code=_lambda.Code.asset("./lambda"), environment=dict( NewVersion=my_datetime_lambda.current_version.function_arn)) post_traffic_lambda.add_to_role_policy( iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=["codedeploy:PutLifecycleEventHookExecutionStatus"], resources=["*"])) post_traffic_lambda.add_to_role_policy( iam.PolicyStatement(effect=iam.Effect.ALLOW, actions=["lambda:InvokeFunction"], resources=["*"])) # create a cloudwatch event rule rule = events.Rule( self, "CanaryRule", schedule=events.Schedule.expression("rate(10 minutes)"), targets=[ events_targets.LambdaFunction( my_datetime_lambda.current_version) ], ) # create a cloudwatch 
alarm based on the lambda erros metrics alarm = cloudwatch.Alarm( self, "CanaryAlarm", metric=my_datetime_lambda.current_version.metric_invocations(), threshold=0, evaluation_periods=2, datapoints_to_alarm=2, treat_missing_data=cloudwatch.TreatMissingData.IGNORE, period=core.Duration.minutes(5), alarm_name="CanaryAlarm") lambda_deployment_group = codedeploy.LambdaDeploymentGroup( self, "datetime-lambda-deployment", alias=my_datetime_lambda.current_version.add_alias("live"), deployment_config=codedeploy.LambdaDeploymentConfig.ALL_AT_ONCE, alarms=[alarm], auto_rollback=codedeploy.AutoRollbackConfig( deployment_in_alarm=True), pre_hook=pre_traffic_lambda, post_hook=post_traffic_lambda) # Pass vpc, sgp and ecs cluster name to get ecs cluster info ecs_cluster = ecs.Cluster.from_cluster_attributes( self, "GetEcsCluster", cluster_name=cluster_name, vpc=ec2_vpc, security_groups=[ec2_sgp]) # Fargate Service task_definition = ecs.FargateTaskDefinition( self, "TaskDef", memory_limit_mib=512, cpu=256, ) container = task_definition.add_container( "web", image=ecs.ContainerImage.from_asset( os.path.join(work_dir, "container")), # Built custom health check for your application specific # and add them here. Ex: Pingcheck, Database etc. health_check=ecs.HealthCheck(command=["CMD-SHELL", "echo"]), # environment=dict(name="latest") ) port_mapping = ecs.PortMapping(container_port=8000, protocol=ecs.Protocol.TCP) container.add_port_mappings(port_mapping) # Create Fargate Service # Current limitation: Blue/Green deployment # https://github.com/aws/aws-cdk/issues/1559 service = ecs.FargateService( self, "Service", cluster=ecs_cluster, task_definition=task_definition, assign_public_ip=True, deployment_controller=ecs.DeploymentController( type=ecs.DeploymentControllerType.ECS), desired_count=2, min_healthy_percent=50) # Create Application LoadBalancer lb = elbv2.ApplicationLoadBalancer(self, "LB", vpc=ec2_vpc, internet_facing=True) # Add listener to the LB listener = lb.add_listener("Listener", port=80, open=True) # Default to Lambda listener.add_targets( "Lambda", targets=[elb_targets.LambdaTarget(my_datetime_lambda)]) # Additionally route to container listener.add_targets("Fargate", port=8000, path_pattern="/container", priority=10, targets=[service]) # add an output with a well-known name to read it from the integ tests self.load_balancer_dns_name = lb.load_balancer_dns_name
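# --- Hedged sketch (assumption) of a CodeDeploy lifecycle hook ----------------
# The pre/post traffic hooks above are Node.js assets that are not shown here.
# Sketched in Python for consistency with the rest of this document, a hook's
# job is to validate the new Lambda version (passed via the NewVersion
# environment variable) and report the outcome back to CodeDeploy, which is why
# codedeploy:PutLifecycleEventHookExecutionStatus is granted to the hook roles.
import boto3

codedeploy_client = boto3.client("codedeploy")

def handler(event, context):
    # CodeDeploy supplies these ids with every lifecycle hook invocation.
    deployment_id = event["DeploymentId"]
    execution_id = event["LifecycleEventHookExecutionId"]
    status = "Succeeded"  # run version-specific validation here; set "Failed" on error
    codedeploy_client.put_lifecycle_event_hook_execution_status(
        deploymentId=deployment_id,
        lifecycleEventHookExecutionId=execution_id,
        status=status)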
def __init__(self, scope: core.Construct, id: str,
             vpc: aws_ec2.Vpc,
             ecs_cluster: aws_ecs.Cluster,
             alb: elbv2.ApplicationLoadBalancer,
             albTestListener: elbv2.ApplicationListener,
             albProdListener: elbv2.ApplicationListener,
             FlaskBlueGroup: elbv2.ApplicationTargetGroup,
             FlaskGreenGroup: elbv2.ApplicationTargetGroup,
             **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    ECS_APP_NAME = "Flask-app"
    ECS_DEPLOYMENT_GROUP_NAME = "FlaskAppECSBlueGreen"
    ECS_DEPLOYMENT_CONFIG_NAME = "CodeDeployDefault.ECSAllAtOnce"
    ECS_TASKSET_TERMINATION_WAIT_TIME = 10
    ECS_TASK_FAMILY_NAME = "Flask-microservice"
    ECS_APP_NAME = "Flask-microservice"
    ECS_APP_LOG_GROUP_NAME = "/ecs/Flask-microservice"
    DUMMY_TASK_FAMILY_NAME = "sample-Nginx-microservice"
    DUMMY_APP_NAME = "sample-Nginx-microservice"
    DUMMY_APP_LOG_GROUP_NAME = "/ecs/sample-Nginx-microservice"
    DUMMY_CONTAINER_IMAGE = "smuralee/nginx"

    # =============================================================================
    # ECR and CodeCommit repositories for the Blue/Green deployment
    # =============================================================================
    # ECR repository for the docker images
    FlaskecrRepo = aws_ecr.Repository(
        self, "FlaskRepo",
        image_scan_on_push=True)

    # CodeCommit repository for the application source
    FlaskCodeCommitrepo = aws_codecommit.Repository(
        self, "FlaskRepository",
        repository_name=ECS_APP_NAME,
        description="Oussama Flask application")

    # =============================================================================
    # CODE BUILD and ECS TASK ROLES for the Blue/Green deployment
    # =============================================================================
    # IAM role for the CodeBuild project
    FlaskcodeBuildServiceRole = aws_iam.Role(
        self, "FlaskcodeBuildServiceRole",
        assumed_by=aws_iam.ServicePrincipal('codebuild.amazonaws.com'))
    inlinePolicyForCodeBuild = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.ALLOW,
        actions=[
            "ecr:GetAuthorizationToken",
            "ecr:BatchCheckLayerAvailability",
            "ecr:InitiateLayerUpload",
            "ecr:UploadLayerPart",
            "ecr:CompleteLayerUpload",
            "ecr:PutImage"
        ],
        resources=["*"])
    FlaskcodeBuildServiceRole.add_to_policy(inlinePolicyForCodeBuild)

    # ECS task role
    FlaskecsTaskRole = aws_iam.Role(
        self, "FlaskecsTaskRole",
        assumed_by=aws_iam.ServicePrincipal('ecs-tasks.amazonaws.com'))
    FlaskecsTaskRole.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name(
            "service-role/AmazonECSTaskExecutionRolePolicy"))

    # =============================================================================
    # CODE BUILD PROJECT for the Blue/Green deployment
    # =============================================================================
    # Creating the CodeBuild project
    FlaskAppcodebuild = aws_codebuild.Project(
        self, "FlaskAppCodeBuild",
        role=FlaskcodeBuildServiceRole,
        environment=aws_codebuild.BuildEnvironment(
            build_image=aws_codebuild.LinuxBuildImage.STANDARD_4_0,
            compute_type=aws_codebuild.ComputeType.SMALL,
            privileged=True,
            environment_variables={
                'REPOSITORY_URI': {
                    'value': FlaskecrRepo.repository_uri,
                    'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT
                },
                'TASK_EXECUTION_ARN': {
                    'value': FlaskecsTaskRole.role_arn,
                    'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT
                },
                'TASK_FAMILY': {
                    'value': ECS_TASK_FAMILY_NAME,
                    'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT
                }
            }),
        source=aws_codebuild.Source.code_commit(repository=FlaskCodeCommitrepo))

    # =============================================================================
    # CODE DEPLOY APPLICATION for the Blue/Green deployment
    # =============================================================================
    # Creating the CodeDeploy application
    FlaskcodeDeployApplication = codedeploy.EcsApplication(self, "FlaskAppCodeDeploy")

    # Creating the CodeDeploy service role
    FlaskcodeDeployServiceRole = aws_iam.Role(
        self, "FlaskcodeDeployServiceRole",
        assumed_by=aws_iam.ServicePrincipal('codedeploy.amazonaws.com'))
    FlaskcodeDeployServiceRole.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name("AWSCodeDeployRoleForECS"))

    # IAM role for the custom resource lambda function
    FlaskcustomLambdaServiceRole = aws_iam.Role(
        self, "FlaskcodeDeployCustomLambda",
        assumed_by=aws_iam.ServicePrincipal('lambda.amazonaws.com'))
    inlinePolicyForLambda = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.ALLOW,
        actions=[
            "iam:PassRole",
            "sts:AssumeRole",
            "codedeploy:List*",
            "codedeploy:Get*",
            "codedeploy:UpdateDeploymentGroup",
            "codedeploy:CreateDeploymentGroup",
            "codedeploy:DeleteDeploymentGroup"
        ],
        resources=["*"])
    FlaskcustomLambdaServiceRole.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name(
            'service-role/AWSLambdaBasicExecutionRole'))
    FlaskcustomLambdaServiceRole.add_to_policy(inlinePolicyForLambda)

    # Custom resource to create the deployment group
    createFlaskDeploymentGroupLambda = aws_lambda.Function(
        self, 'createFlaskDeploymentGroupLambda',
        code=aws_lambda.Code.from_asset("custom_resources"),
        runtime=aws_lambda.Runtime.PYTHON_3_8,
        handler='create_deployment_group.handler',
        role=FlaskcustomLambdaServiceRole,
        description="Custom resource to create deployment group",
        memory_size=128,
        timeout=core.Duration.seconds(60))

    # ================================================================================================
    # CloudWatch Alarms for 4XX errors
    Flaskblue4xxMetric = aws_cloudwatch.Metric(
        namespace='AWS/ApplicationELB',
        metric_name='HTTPCode_Target_4XX_Count',
        dimensions={
            "TargetGroup": FlaskBlueGroup.target_group_full_name,
            "LoadBalancer": alb.load_balancer_full_name
        },
        statistic="sum",
        period=core.Duration.minutes(1))
    FlaskblueGroupAlarm = aws_cloudwatch.Alarm(
        self, "Flaskblue4xxErrors",
        alarm_name="FlaskBlue_4xx_Alarm",
        alarm_description="CloudWatch Alarm for the 4xx errors of Blue target group",
        metric=Flaskblue4xxMetric,
        threshold=1,
        evaluation_periods=1)

    Flaskgreen4xxMetric = aws_cloudwatch.Metric(
        namespace='AWS/ApplicationELB',
        metric_name='HTTPCode_Target_4XX_Count',
        dimensions={
            "TargetGroup": FlaskGreenGroup.target_group_full_name,
            "LoadBalancer": alb.load_balancer_full_name
        },
        statistic="sum",
        period=core.Duration.minutes(1))
    FlaskgreenGroupAlarm = aws_cloudwatch.Alarm(
        self, "Flaskgreen4xxErrors",
        alarm_name="FlaskGreen_4xx_Alarm",
        alarm_description="CloudWatch Alarm for the 4xx errors of Green target group",
        metric=Flaskgreen4xxMetric,
        threshold=1,
        evaluation_periods=1)

    # ================================================================================================
    # ECS task definition using ECR image
    # Will be used by CodeDeploy for the deployment
    # ================================================================================================
    FlaskTaskDefinition = aws_ecs.FargateTaskDefinition(
        self, "FlaskappTaskDefn",
        family=ECS_TASK_FAMILY_NAME,
        cpu=256,
        memory_limit_mib=1024,
        task_role=FlaskecsTaskRole,
        execution_role=FlaskecsTaskRole)
    FlaskcontainerDefinition = FlaskTaskDefinition.add_container(
        "FlaskAppContainer",
        image=aws_ecs.ContainerImage.from_ecr_repository(FlaskecrRepo, "latest"),
        logging=aws_ecs.AwsLogDriver(
            log_group=aws_logs.LogGroup(
                self, "FlaskAppLogGroup",
                log_group_name=ECS_APP_LOG_GROUP_NAME,
                removal_policy=core.RemovalPolicy.DESTROY),
            stream_prefix=ECS_APP_NAME),
        docker_labels={"name": ECS_APP_NAME})
    port_mapping = aws_ecs.PortMapping(
        container_port=80,
        protocol=aws_ecs.Protocol.TCP)
    FlaskcontainerDefinition.add_port_mappings(port_mapping)

    # =============================================================================
    # ECS SERVICE for the Blue/Green deployment
    # =============================================================================
    FlaskAppService = aws_ecs.FargateService(
        self, "FlaskAppService",
        cluster=ecs_cluster,
        task_definition=FlaskTaskDefinition,
        health_check_grace_period=core.Duration.seconds(10),
        desired_count=3,
        deployment_controller={
            "type": aws_ecs.DeploymentControllerType.CODE_DEPLOY
        },
        service_name=ECS_APP_NAME)
    FlaskAppService.connections.allow_from(alb, aws_ec2.Port.tcp(80))
    FlaskAppService.connections.allow_from(alb, aws_ec2.Port.tcp(8080))
    FlaskAppService.attach_to_application_target_group(FlaskBlueGroup)

    # =============================================================================
    # CODE DEPLOY - Deployment Group CUSTOM RESOURCE for the Application deployment
    # =============================================================================
    core.CustomResource(
        self, 'FlaskcustomEcsDeploymentGroup',
        service_token=createFlaskDeploymentGroupLambda.function_arn,
        properties={
            "ApplicationName": FlaskcodeDeployApplication.application_name,
            "DeploymentGroupName": ECS_DEPLOYMENT_GROUP_NAME,
            "DeploymentConfigName": ECS_DEPLOYMENT_CONFIG_NAME,
            "ServiceRoleArn": FlaskcodeDeployServiceRole.role_arn,
            "BlueTargetGroup": FlaskBlueGroup.target_group_name,
            "GreenTargetGroup": FlaskGreenGroup.target_group_name,
            "ProdListenerArn": albProdListener.listener_arn,
            "TestListenerArn": albTestListener.listener_arn,
            "EcsClusterName": ecs_cluster.cluster_name,
            "EcsServiceName": FlaskAppService.service_name,
            "TerminationWaitTime": ECS_TASKSET_TERMINATION_WAIT_TIME,
            "BlueGroupAlarm": FlaskblueGroupAlarm.alarm_name,
            "GreenGroupAlarm": FlaskgreenGroupAlarm.alarm_name,
        })

    FlaskecsDeploymentGroup = codedeploy.EcsDeploymentGroup.from_ecs_deployment_group_attributes(
        self, "FlaskecsDeploymentGroup",
        application=FlaskcodeDeployApplication,
        deployment_group_name=ECS_DEPLOYMENT_GROUP_NAME,
        deployment_config=codedeploy.EcsDeploymentConfig.from_ecs_deployment_config_name(
            self, "FlaskecsDeploymentConfig", ECS_DEPLOYMENT_CONFIG_NAME))

    # =============================================================================
    # CODE PIPELINE for ECS deployment
    # =============================================================================
    FlaskcodePipelineServiceRole = aws_iam.Role(
        self, "FlaskcodePipelineServiceRole",
        assumed_by=aws_iam.ServicePrincipal('codepipeline.amazonaws.com'))
    inlinePolicyForCodePipeline = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.ALLOW,
        actions=[
            "iam:PassRole",
            "sts:AssumeRole",
            "codecommit:Get*",
            "codecommit:List*",
            "codecommit:GitPull",
            "codecommit:UploadArchive",
            "codecommit:CancelUploadArchive",
            "codebuild:BatchGetBuilds",
            "codebuild:StartBuild",
            "codedeploy:CreateDeployment",
            "codedeploy:Get*",
            "codedeploy:RegisterApplicationRevision",
            "s3:Get*",
            "s3:List*",
            "s3:PutObject"
        ],
        resources=["*"])
    FlaskcodePipelineServiceRole.add_to_policy(inlinePolicyForCodePipeline)

    sourceArtifact = codepipeline.Artifact('sourceArtifact')
    buildArtifact = codepipeline.Artifact('buildArtifact')

    # S3 bucket for storing the code pipeline artifacts
    FlaskAppArtifactsBucket = s3.Bucket(
        self, "FlaskAppArtifactsBucket",
        encryption=s3.BucketEncryption.S3_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL)

    # S3 bucket policy for the code pipeline artifacts
    FlaskBucketdenyUnEncryptedObjectUploads = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.DENY,
        actions=["s3:PutObject"],
        principals=[aws_iam.AnyPrincipal()],
        resources=[FlaskAppArtifactsBucket.bucket_arn + "/*"],
        conditions={
            "StringNotEquals": {
                "s3:x-amz-server-side-encryption": "aws:kms"
            }
        })
    FlaskBucketdenyInsecureConnections = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.DENY,
        actions=["s3:*"],
        principals=[aws_iam.AnyPrincipal()],
        resources=[FlaskAppArtifactsBucket.bucket_arn + "/*"],
        conditions={
            "Bool": {
                "aws:SecureTransport": "false"
            }
        })
    FlaskAppArtifactsBucket.add_to_resource_policy(FlaskBucketdenyUnEncryptedObjectUploads)
    FlaskAppArtifactsBucket.add_to_resource_policy(FlaskBucketdenyInsecureConnections)

    # Code Pipeline - CloudWatch trigger event is created by CDK
    codepipeline.Pipeline(
        self, "FlaskECSPipeline",
        role=FlaskcodePipelineServiceRole,
        artifact_bucket=FlaskAppArtifactsBucket,
        stages=[
            codepipeline.StageProps(
                stage_name='Source',
                actions=[
                    aws_codepipeline_actions.CodeCommitSourceAction(
                        action_name='Source',
                        repository=FlaskCodeCommitrepo,
                        output=sourceArtifact,
                    )
                ]),
            codepipeline.StageProps(
                stage_name='Build',
                actions=[
                    aws_codepipeline_actions.CodeBuildAction(
                        action_name='Build',
                        project=FlaskAppcodebuild,
                        input=sourceArtifact,
                        outputs=[buildArtifact])
                ]),
            codepipeline.StageProps(
                stage_name='Deploy',
                actions=[
                    aws_codepipeline_actions.CodeDeployEcsDeployAction(
                        action_name='Deploy',
                        deployment_group=FlaskecsDeploymentGroup,
                        app_spec_template_input=buildArtifact,
                        task_definition_template_input=buildArtifact,
                    )
                ])
        ])

    # =============================================================================
    # Export the outputs
    # =============================================================================
    core.CfnOutput(
        self, "FlaskECSCodeRepo",
        description="Flask code commit repository",
        export_name="FlaskAppRepo",
        value=FlaskCodeCommitrepo.repository_clone_url_http)
    core.CfnOutput(
        self, "FlaskLBDns",
        description="Load balancer DNS",
        export_name="FlaskLBDns",
        value=alb.load_balancer_dns_name)
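
# --------------------------------------------------------------------------- #
# Illustrative sketch (assumption): the "create_deployment_group.handler" code
# packaged under custom_resources/ is not shown in this document. Given the
# properties passed to core.CustomResource above, its core logic is presumably
# a CodeDeploy create_deployment_group call configured for ECS blue/green,
# roughly as below. Signalling the result back to CloudFormation (the custom
# resource ResponseURL protocol) is omitted here for brevity.
import boto3

codedeploy = boto3.client("codedeploy")


def handler(event, context):
    props = event["ResourceProperties"]
    if event["RequestType"] == "Create":
        codedeploy.create_deployment_group(
            applicationName=props["ApplicationName"],
            deploymentGroupName=props["DeploymentGroupName"],
            deploymentConfigName=props["DeploymentConfigName"],
            serviceRoleArn=props["ServiceRoleArn"],
            deploymentStyle={
                "deploymentType": "BLUE_GREEN",
                "deploymentOption": "WITH_TRAFFIC_CONTROL",
            },
            blueGreenDeploymentConfiguration={
                "terminateBlueInstancesOnDeploymentSuccess": {
                    "action": "TERMINATE",
                    # CloudFormation passes custom-resource properties as strings
                    "terminationWaitTimeInMinutes": int(props["TerminationWaitTime"]),
                },
                "deploymentReadyOption": {"actionOnTimeout": "CONTINUE_DEPLOYMENT"},
            },
            ecsServices=[{
                "serviceName": props["EcsServiceName"],
                "clusterName": props["EcsClusterName"],
            }],
            loadBalancerInfo={
                "targetGroupPairInfoList": [{
                    "targetGroups": [
                        {"name": props["BlueTargetGroup"]},
                        {"name": props["GreenTargetGroup"]},
                    ],
                    "prodTrafficRoute": {"listenerArns": [props["ProdListenerArn"]]},
                    "testTrafficRoute": {"listenerArns": [props["TestListenerArn"]]},
                }],
            },
            alarmConfiguration={
                "enabled": True,
                "alarms": [
                    {"name": props["BlueGroupAlarm"]},
                    {"name": props["GreenGroupAlarm"]},
                ],
            },
            autoRollbackConfiguration={
                "enabled": True,
                "events": ["DEPLOYMENT_FAILURE", "DEPLOYMENT_STOP_ON_ALARM"],
            })
    elif event["RequestType"] == "Delete":
        codedeploy.delete_deployment_group(
            applicationName=props["ApplicationName"],
            deploymentGroupName=props["DeploymentGroupName"])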
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Work queue: one message per account to be scanned by Prowler
    queue = sqs.Queue(self, "StartProwlerScan",
                      receive_message_wait_time=core.Duration.seconds(20),
                      visibility_timeout=core.Duration.seconds(7200))

    # Daily Lambda that pushes every active account onto the work queue
    push_all_active_accounts_onto_queue_lambda_function = lambda_.Function(
        self, "PushAllActiveAccountsOntoQueue",
        runtime=lambda_.Runtime.PYTHON_3_8,
        code=lambda_.Code.asset("lambda/pushAllActiveActivesOntoQueue"),
        handler="lambda_function.lambda_handler",
        environment={"SQS_QUEUE_URL": queue.queue_url})
    event_lambda_target = events_targets.LambdaFunction(
        handler=push_all_active_accounts_onto_queue_lambda_function)
    queue.grant_send_messages(
        push_all_active_accounts_onto_queue_lambda_function)
    schedule = events.Schedule.rate(core.Duration.days(1))
    events.Rule(self, "DailyTrigger",
                schedule=schedule,
                targets=[event_lambda_target])

    # Networking, ECS cluster, logging and results bucket for the Prowler tasks
    vpc = ec2.Vpc(self, "Vpc")
    cluster = ecs.Cluster(self, "Cluster", vpc=vpc)
    logging = ecs.AwsLogDriver(stream_prefix="ProwlerTask",
                               log_retention=logs.RetentionDays.ONE_DAY)
    results_bucket = s3.Bucket(self, "ResultsBucket")

    # Build the Prowler container image from the local docker/ directory
    dockerfile_directory = path.join(path.dirname(path.realpath(__file__)),
                                     "docker")
    image = ecr_assets.DockerImageAsset(self, "ProwlerImageBuild",
                                        directory=dockerfile_directory)

    # Fargate task definition running the Prowler container
    prowler_task = ecs.FargateTaskDefinition(self, "ProwlerTaskDefinition",
                                             cpu=256,
                                             memory_limit_mib=512)
    prowler_task.add_container(
        "Prowler_image",
        image=ecs.ContainerImage.from_docker_image_asset(image),
        logging=logging,
        environment={
            "RESULTS_BUCKET": results_bucket.bucket_name,
            "SQS_QUEUE_URL": queue.queue_url
        })

    # Permissions for the task: read-only audit access, queue consumption,
    # writing results and assuming roles in the scanned accounts
    task_role = prowler_task.task_role
    task_role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("ReadOnlyAccess"))
    queue.grant(task_role, "sqs:DeleteMessage")
    results_bucket.grant_put(task_role)
    task_role.attach_inline_policy(
        iam.Policy(self, "AssumeRolePermissions",
                   statements=[
                       iam.PolicyStatement(actions=["sts:AssumeRole"],
                                           effect=iam.Effect.ALLOW,
                                           resources=["*"])
                   ]))

    # Lambda that starts Fargate tasks when the queue-depth alarm fires
    run_fargate_task_lambda_function = lambda_.Function(
        self, "RunFargateTask",
        runtime=lambda_.Runtime.PYTHON_3_8,
        code=lambda_.Code.asset("lambda/runFargateTask"),
        handler="lambda_function.lambda_handler",
        environment={
            "CLUSTER_ARN": cluster.cluster_arn,
            "SUBNET_IDS": json.dumps(
                [subnet.subnet_id for subnet in vpc.private_subnets]),
            "QUEUE_URL": queue.queue_url,
            "TASK_DEFINITION_ARN": prowler_task.task_definition_arn
        })
    queue.grant(run_fargate_task_lambda_function, "sqs:GetQueueAttributes")

    # Alarm topic feeds an SQS queue that triggers the RunFargateTask Lambda
    sqs_alarm_topic = sns.Topic(self, "SqsAlarmTopic")
    sqs_alarm_topic.grant_publish(run_fargate_task_lambda_function)
    sqs_alarm_queue = sqs.Queue(
        self, "SqsAlarmQueue",
        retention_period=core.Duration.days(14),
        visibility_timeout=core.Duration.minutes(3))
    sqs_alarm_topic.add_subscription(
        sns_subscriptions.SqsSubscription(sqs_alarm_queue))
    run_fargate_task_lambda_function.add_event_source(
        lambda_event_sources.SqsEventSource(sqs_alarm_queue))
    run_fargate_task_lambda_function.add_to_role_policy(
        iam.PolicyStatement(actions=["ecs:RunTask"],
                            effect=iam.Effect.ALLOW,
                            resources=[prowler_task.task_definition_arn]))
    run_fargate_task_lambda_function.add_to_role_policy(
        iam.PolicyStatement(actions=["iam:PassRole"],
                            effect=iam.Effect.ALLOW,
                            resources=[
                                prowler_task.execution_role.role_arn,
                                prowler_task.task_role.role_arn
                            ]))

    # OK topic drains the alarm queue once the work queue is empty again
    sqs_ok_topic = sns.Topic(self, "SqsOkTopic")
    clear_alarm_queue = lambda_.Function(
        self, "ClearAlarmQueue",
        runtime=lambda_.Runtime.PYTHON_3_8,
        code=lambda_.Code.asset("lambda/clearAlarmQueue"),
        handler="lambda_function.lambda_handler",
        environment={"QUEUE_URL": sqs_alarm_queue.queue_url})
    clear_alarm_queue.add_event_source(
        lambda_event_sources.SnsEventSource(sqs_ok_topic))
    sqs_alarm_queue.grant(clear_alarm_queue, "sqs:DeleteMessage")

    # Alarm on the work-queue depth: ALARM publishes to the alarm topic,
    # OK publishes to the OK topic
    alarm = cloudwatch.Alarm(
        self, "FargateTaskTrigger",
        metric=queue.metric_approximate_number_of_messages_visible(
            period=core.Duration.seconds(60), statistic="max"),
        evaluation_periods=1,
        threshold=1,
        alarm_description="Run a fargate task when there "
                          "are messages in the queue",
        treat_missing_data=cloudwatch.TreatMissingData.IGNORE)
    alarm.add_alarm_action(cloudwatch_actions.SnsAction(sqs_alarm_topic))
    alarm.add_ok_action(cloudwatch_actions.SnsAction(sqs_ok_topic))