def _generate_graph_widget(self, title, metric_list):
    """Build a GraphWidget at the current layout coordinate, then advance the cursor.

    :param title: widget title shown on the dashboard.
    :param metric_list: metrics rendered on the left Y axis.
    :return: the positioned ``cloudwatch.GraphWidget``.
    """
    width, height = self.graph_width, self.graph_height
    graph = cloudwatch.GraphWidget(
        title=title,
        left=metric_list,
        region=self._stack_region,
        width=width,
        height=height,
    )
    # Place the widget at the tracked coordinate, then move the cursor past it
    # so the next widget does not overlap.
    graph.position(x=self.coord.x_value, y=self.coord.y_value)
    self._update_coord(width, height)
    return graph
def get_dashboard(self, params):
    """Create a CloudWatch dashboard from the widget layout JSON named in *params*.

    Reads ``params['dashboard_file']``, stores the parsed widget specs back into
    ``params['dashboard_widgets']``, and builds one GraphWidget per spec using
    the first metric triple ``[namespace, metric_name, dimension_key]``.
    """
    with open(params['dashboard_file']) as fh:
        params['dashboard_widgets'] = json.load(fh)

    widgets = []
    for spec in params['dashboard_widgets']:
        # Only the first metric row of each widget spec is used.
        first_metric = spec['properties']['metrics'][0]
        graph = cloudwatch.GraphWidget(
            height=spec['height'],
            width=spec['width'],
            left=[
                cloudwatch.Metric(
                    namespace=first_metric[0],
                    metric_name=first_metric[1],
                    dimensions={first_metric[2]: params['name']},
                )
            ],
        )
        graph.position(spec['x'], spec['y'])
        widgets.append(graph)

    # Dashboard takes rows of widgets; all widgets go into a single row here.
    return cloudwatch.Dashboard(
        self,
        f"{params['name']}Dashboard",
        dashboard_name=params['name'],
        widgets=[widgets],
    )
def __init__(self, app: core.App, id: str) -> None:
    """Define the bgtools stack: a static-asset site, a Flask app on Lambda
    behind API Gateway, two CloudFront distributions, and a CloudWatch
    dashboard for API Gateway counts and latencies.
    """
    super().__init__(app, id)

    # Stack configuration is read from a local config.json at synth time.
    with open("config.json") as f:
        self.config = json.load(f)
    # NOTE(review): assert is stripped under `python -O`; raising would be safer.
    assert (
        "SECRET_KEY" in self.config), "Need random SECRET_KEY specified in config.json"
    assert (
        "CERTIFICATE_ARN" in self.config), "Need CERTIFICATE_ARN specified in config.json"
    self.lambda_dir = "assets/lambda"
    os.makedirs(os.path.join(self.lambda_dir, "templates", "generated"),
                exist_ok=True)

    # Fetch the project's release history from GitHub at synth time
    # (requires network access during `cdk synth`).
    r = requests.get(
        "https://api.github.com/repos/sumpfork/dominiontabs/releases")
    changelog = r.json()
    changelog = [{
        "url": ch["html_url"],
        "date": dt.datetime.strptime(ch["published_at"][:10], "%Y-%m-%d").date(),
        "name": ch["name"],
        "tag": ch["tag_name"],
        "description": ch["body"],
    } for ch in changelog]

    # Render the changelog page into the Lambda asset dir so it ships with the app.
    env = Environment(loader=FileSystemLoader("templates"),
                      autoescape=select_autoescape(["html"]))
    t = env.get_template("changelog.html.j2")
    generated_template_path = os.path.join(self.lambda_dir, "templates", "generated")
    # Recreate the generated dir from scratch so stale files are dropped.
    shutil.rmtree(generated_template_path)
    os.mkdir(generated_template_path)
    with open(
            os.path.join(generated_template_path, "changelog.html"),
            "w",
    ) as f:
        f.write(t.render(changelog=changelog))

    # Static assets: S3 bucket fronted by its own CloudFront distribution.
    static_website_bucket = s3.Bucket(
        self,
        "Dominion Divider Generator Site",
    )
    cf_static_dist = cloudfront.Distribution(
        self,
        "StaticCloudfrontDist",
        default_behavior=cloudfront.BehaviorOptions(
            origin=cloudfront_origins.S3Origin(static_website_bucket)),
    )
    s3_deployment.BucketDeployment(
        self,
        "Static Files Deployment",
        sources=[s3_deployment.Source.asset("./static")],
        destination_bucket=static_website_bucket,
        destination_key_prefix="static",
    )

    # Flask app packaged as a Lambda function (WSGI via apig_wsgi).
    flask_app = lambda_python.PythonFunction(
        self,
        "DominionDividersFlaskApp",
        entry=self.lambda_dir,
        index="lambda-handlers.py",
        handler="apig_wsgi_handler",
        environment={
            "STATIC_WEB_URL": f"https://{cf_static_dist.domain_name}",
            "FLASK_SECRET_KEY": self.config["SECRET_KEY"],
            "GA_CONFIG": self.config.get("GA_CONFIG", ""),
        },
        timeout=core.Duration.seconds(60),
        memory_size=512,
        runtime=lambda_.Runtime.PYTHON_3_8,
    )
    api = apig.LambdaRestApi(
        self,
        "bgtools-api",
        handler=flask_app,
        binary_media_types=["*/*"],
        # 10e4 == 100000.0 bytes (100 kB) — NOTE(review): a float where an int
        # is conventional; confirm jsii accepts it.
        minimum_compression_size=10e4,
        deploy_options={
            "method_options": {
                # throttle every method on every resource of the stage
                "/*/*": apig.MethodDeploymentOptions(throttling_rate_limit=10,
                                                     throttling_burst_limit=20)
            }
        },
    )

    # CloudFront in front of the REST API; Fn.select/split peel the domain
    # (index 2) and the stage path (index 3) out of the generated API URL.
    cloudfront.Distribution(
        self,
        "BGToolsCloudfrontDist",
        default_behavior=cloudfront.BehaviorOptions(
            origin=cloudfront_origins.HttpOrigin(
                core.Fn.select(2, core.Fn.split("/", api.url)),
                origin_path=core.Fn.join(
                    "", ["/", core.Fn.select(3, core.Fn.split("/", api.url))]),
            ),
            origin_request_policy=cloudfront.OriginRequestPolicy(
                self,
                "OriginRequestPolicy",
                # forward all cookies so the Flask session works behind CloudFront
                cookie_behavior=cloudfront.OriginRequestCookieBehavior.all(
                ),
            ),
            allowed_methods=cloudfront.AllowedMethods.ALLOW_ALL,
        ),
        domain_names=["domdiv.bgtools.net"],
        certificate=acm.Certificate.from_certificate_arn(
            self,
            "cert",
            self.config["CERTIFICATE_ARN"],
        ),
    )

    # Dashboard showing the last day of API Gateway traffic.
    dashboard = aws_cloudwatch.Dashboard(
        self,
        f"bgtools-dashboard",
        dashboard_name=f"bgtools-prod",
        start="-P1D",
        period_override=aws_cloudwatch.PeriodOverride.INHERIT,
    )
    dashboard.add_widgets(
        aws_cloudwatch.GraphWidget(
            title="API Gateway Counts",
            width=6,
            height=6,
            left=[
                aws_cloudwatch.Metric(
                    namespace="AWS/ApiGateway",
                    metric_name="5XXError",
                    dimensions={
                        "ApiName": "bgtools-api",
                        "Stage": api.deployment_stage.stage_name,
                    },
                    period=core.Duration.minutes(amount=30),
                    statistic="Sum",
                    color="#d62728",
                ),
                aws_cloudwatch.Metric(
                    namespace="AWS/ApiGateway",
                    metric_name="4XXError",
                    dimensions={
                        "ApiName": "bgtools-api",
                        "Stage": api.deployment_stage.stage_name,
                    },
                    period=core.Duration.minutes(amount=30),
                    statistic="Sum",
                    color="#8c564b",
                ),
                aws_cloudwatch.Metric(
                    namespace="AWS/ApiGateway",
                    metric_name="Count",
                    dimensions={
                        "ApiName": "bgtools-api",
                        "Stage": api.deployment_stage.stage_name,
                    },
                    period=core.Duration.minutes(amount=30),
                    statistic="Sum",
                    color="#2ca02c",
                ),
            ],
        ),
        aws_cloudwatch.GraphWidget(
            title="API Gateway Latencies",
            width=6,
            height=6,
            left=[
                aws_cloudwatch.Metric(
                    namespace="AWS/ApiGateway",
                    metric_name="Latency",
                    dimensions={
                        "ApiName": "bgtools-api",
                        "Stage": api.deployment_stage.stage_name,
                    },
                    period=core.Duration.minutes(amount=30),
                    statistic="Average",
                ),
                aws_cloudwatch.Metric(
                    namespace="AWS/ApiGateway",
                    metric_name="IntegrationLatency",
                    dimensions={
                        "ApiName": "bgtools-api",
                        "Stage": api.deployment_stage.stage_name,
                    },
                    period=core.Duration.minutes(amount=30),
                    statistic="Average",
                ),
            ],
        ),
    )
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Simple webservice (HTTP API + Lambda + DynamoDB) plus full observability:
    custom math-expression metrics, CloudWatch alarms wired to an SNS topic,
    and a CloudWatch dashboard.
    """
    super().__init__(scope, id, **kwargs)

    # -----------------------------------------------------------------------------------------------------------
    # The Simple Webservice Logic - This is what we will be monitoring
    #
    # API GW HTTP API, Lambda Fn and DynamoDB
    # https://github.com/cdk-patterns/serverless/tree/master/the-simple-webservice
    # -----------------------------------------------------------------------------------------------------------

    # DynamoDB Table
    table = dynamo_db.Table(
        self,
        "Hits",
        partition_key=dynamo_db.Attribute(
            name="path", type=dynamo_db.AttributeType.STRING),
        billing_mode=dynamo_db.BillingMode.PAY_PER_REQUEST)

    # defines an AWS Lambda resource
    dynamo_lambda = _lambda.Function(
        self,
        "DynamoLambdaHandler",
        runtime=_lambda.Runtime.NODEJS_12_X,  # execution environment
        handler="lambda.handler",  # file is "lambda", function is "handler"
        code=_lambda.Code.from_asset(
            "lambda_fns"),  # Code loaded from the lambda dir
        environment={'HITS_TABLE_NAME': table.table_name})

    # grant the lambda role read/write permissions to our table
    table.grant_read_write_data(dynamo_lambda)

    # defines an API Gateway Http API resource backed by our "dynamoLambda" function.
    api = api_gw.HttpApi(self,
                         'HttpAPI',
                         default_integration=api_gw.LambdaProxyIntegration(
                             handler=dynamo_lambda))

    core.CfnOutput(self, 'HTTP API Url', value=api.url)

    # -----------------------------------------------------------------------------------------------------------
    # Monitoring Logic Starts Here
    #
    # This is everything we need to understand the state of our system:
    # - custom metrics
    # - cloudwatch alarms
    # - custom cloudwatch dashboard
    # -----------------------------------------------------------------------------------------------------------

    # SNS Topic so we can hook things into our alerts e.g. email
    error_topic = sns.Topic(self, 'theBigFanTopic')

    ###
    # Custom Metrics
    ###
    # % of requests that returned a 4xx, last 5 mins.
    api_gw_4xx_error_percentage = cloud_watch.MathExpression(
        expression="m1/m2*100",
        label="% API Gateway 4xx Errors",
        using_metrics={
            "m1":
            self.metric_for_api_gw(api.http_api_id, '4XXError', '4XX Errors',
                                   'sum'),
            "m2":
            self.metric_for_api_gw(api.http_api_id, 'Count', '# Requests',
                                   'sum'),
        },
        period=core.Duration.minutes(5))

    # Gather the % of lambda invocations that error in past 5 mins
    lambda_error_perc = cloud_watch.MathExpression(
        expression="e / i * 100",
        label="% of invocations that errored, last 5 mins",
        using_metrics={
            "i": dynamo_lambda.metric(metric_name="Invocations",
                                      statistic="sum"),
            "e": dynamo_lambda.metric(metric_name="Errors", statistic="sum"),
        },
        period=core.Duration.minutes(5))

    # note: throttled requests are not counted in total num of invocations,
    # hence the (i + t) denominator.
    lambda_throttled_perc = cloud_watch.MathExpression(
        expression="t / (i + t) * 100",
        label="% of throttled requests, last 30 mins",
        using_metrics={
            "i": dynamo_lambda.metric(metric_name="Invocations",
                                      statistic="sum"),
            "t": dynamo_lambda.metric(metric_name="Throttles",
                                      statistic="sum"),
        },
        period=core.Duration.minutes(5))

    # I think usererrors are at an account level rather than a table level so
    # merging these two metrics until I can get a definitive answer. I think
    # usererrors will always show as 0 when scoped to a table so this is still
    # effectively a system errors count.
    dynamo_db_total_errors = cloud_watch.MathExpression(
        expression="m1 + m2",
        label="DynamoDB Errors",
        using_metrics={
            "m1": table.metric_user_errors(),
            "m2": table.metric_system_errors(),
        },
        period=core.Duration.minutes(5))

    # Rather than have 2 alerts, let's create one aggregate metric
    dynamo_db_throttles = cloud_watch.MathExpression(
        expression="m1 + m2",
        label="DynamoDB Throttles",
        using_metrics={
            "m1": table.metric(metric_name="ReadThrottleEvents",
                               statistic="sum"),
            "m2": table.metric(metric_name="WriteThrottleEvents",
                               statistic="sum"),
        },
        period=core.Duration.minutes(5))

    ###
    # Alarms
    ###
    # Api Gateway
    # 4xx are user errors so a large volume indicates a problem
    cloud_watch.Alarm(self,
                      id="API Gateway 4XX Errors > 1%",
                      metric=api_gw_4xx_error_percentage,
                      threshold=1,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # 5xx are internal server errors so we want 0 of these
    # NOTE(review): `period`/`statistic` on Alarm are deprecated CDK v1 props;
    # confirm they still take effect in the deployed CDK version.
    cloud_watch.Alarm(self,
                      id="API Gateway 5XX Errors > 0",
                      metric=self.metric_for_api_gw(api_id=api.http_api_id,
                                                    metric_name="5XXError",
                                                    label="5XX Errors",
                                                    stat="p99"),
                      threshold=0,
                      period=core.Duration.minutes(5),
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    cloud_watch.Alarm(self,
                      id="API p99 latency alarm >= 1s",
                      metric=self.metric_for_api_gw(api_id=api.http_api_id,
                                                    metric_name="Latency",
                                                    label="API GW Latency",
                                                    stat="p99"),
                      threshold=1000,
                      period=core.Duration.minutes(5),
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # Lambda
    # 2% of Dynamo Lambda invocations erroring
    cloud_watch.Alarm(self,
                      id="Dynamo Lambda 2% Error",
                      metric=lambda_error_perc,
                      threshold=2,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # 1% of Lambda invocations taking longer than 1 second
    cloud_watch.Alarm(self,
                      id="Dynamo Lambda p99 Long Duration (>1s)",
                      metric=dynamo_lambda.metric_duration(),
                      period=core.Duration.minutes(5),
                      threshold=1000,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      statistic="p99",
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # 2% of our lambda invocations are throttled
    cloud_watch.Alarm(self,
                      id="Dynamo Lambda 2% Throttled",
                      metric=lambda_throttled_perc,
                      threshold=2,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # DynamoDB
    # DynamoDB Interactions are throttled - indicated poorly provisioned
    cloud_watch.Alarm(self,
                      id="DynamoDB Table Reads/Writes Throttled",
                      metric=dynamo_db_throttles,
                      threshold=1,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # There should be 0 DynamoDB errors
    cloud_watch.Alarm(self,
                      id="DynamoDB Errors > 0",
                      metric=dynamo_db_total_errors,
                      threshold=0,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    ###
    # Dashboard: one row per concern (requests, latency, errors, lambda, dynamo)
    ###
    dashboard = cloud_watch.Dashboard(self, id="CloudWatchDashBoard")
    dashboard.add_widgets(
        cloud_watch.GraphWidget(title="Requests",
                                width=8,
                                left=[
                                    self.metric_for_api_gw(
                                        api_id=api.http_api_id,
                                        metric_name="Count",
                                        label="# Requests",
                                        stat="sum")
                                ]),
        cloud_watch.GraphWidget(
            title="API GW Latency",
            width=8,
            stacked=True,
            left=[
                self.metric_for_api_gw(api_id=api.http_api_id,
                                       metric_name="Latency",
                                       label="API Latency p50",
                                       stat="p50"),
                self.metric_for_api_gw(api_id=api.http_api_id,
                                       metric_name="Latency",
                                       label="API Latency p90",
                                       stat="p90"),
                self.metric_for_api_gw(api_id=api.http_api_id,
                                       metric_name="Latency",
                                       label="API Latency p99",
                                       stat="p99")
            ]),
        cloud_watch.GraphWidget(
            title="API GW Errors",
            width=8,
            stacked=True,
            left=[
                self.metric_for_api_gw(api_id=api.http_api_id,
                                       metric_name="4XXError",
                                       label="4XX Errors",
                                       stat="sum"),
                self.metric_for_api_gw(api_id=api.http_api_id,
                                       metric_name="5XXError",
                                       label="5XX Errors",
                                       stat="sum")
            ]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Error %",
                                width=8,
                                left=[lambda_error_perc]),
        cloud_watch.GraphWidget(
            title="Dynamo Lambda Duration",
            width=8,
            stacked=True,
            left=[
                dynamo_lambda.metric_duration(statistic="p50"),
                dynamo_lambda.metric_duration(statistic="p90"),
                dynamo_lambda.metric_duration(statistic="p99")
            ]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Throttle %",
                                width=8,
                                left=[lambda_throttled_perc]),
        cloud_watch.GraphWidget(
            title="DynamoDB Latency",
            width=8,
            stacked=True,
            left=[
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "GetItem"
                    }),
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "UpdateItem"
                    }),
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "PutItem"
                    }),
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "DeleteItem"
                    }),
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "Query"
                    }),
            ]),
        cloud_watch.GraphWidget(
            title="DynamoDB Consumed Read/Write Units",
            width=8,
            stacked=False,
            left=[
                table.metric(metric_name="ConsumedReadCapacityUnits"),
                table.metric(metric_name="ConsumedWriteCapacityUnits")
            ]),
        cloud_watch.GraphWidget(
            title="DynamoDB Throttles",
            width=8,
            stacked=True,
            left=[
                table.metric(metric_name="ReadThrottleEvents",
                             statistic="sum"),
                table.metric(metric_name="WriteThrottleEvents",
                             statistic="sum")
            ]),
    )
def __init__(self, scope: core.Construct, id: str, vpc, **kwargs) -> None:
    """Load-generation stack: an ECS cluster running Locust containers
    (standalone, or distributed master + workers) sized from the instance
    type's ENI capacity, plus a CloudWatch dashboard.
    """
    super().__init__(scope, id, **kwargs)
    self.get_cdk_context()
    self.vpc = vpc

    # ECS cluster for the loadgen
    self.loadgen_cluster = ecs.Cluster(self, "Loadgen-Cluster", vpc=self.vpc)

    # Just using base ENI count, not caring about having ENI trunking turned on.
    # NOTE(review): this boto3 call runs at synth time and needs live AWS
    # credentials/region — synth fails offline.
    client = boto3.client('ec2')
    response = client.describe_instance_types(
        InstanceTypes=[self.ecs_instance_type])
    eni_per_instance = response['InstanceTypes'][0]['NetworkInfo'][
        'MaximumNetworkInterfaces']
    # One task per spare ENI (one ENI is reserved for the instance itself);
    # capacity must fit all workers plus the master (+1).
    number_of_instances = math.ceil(
        (self.number_of_workers + 1) / (eni_per_instance - 1))
    self.loadgen_cluster.add_capacity("AsgSpot",
                                      max_capacity=number_of_instances * 2,
                                      min_capacity=number_of_instances,
                                      instance_type=ec2.InstanceType(
                                          self.ecs_instance_type),
                                      spot_price="0.07",
                                      spot_instance_draining=True)
    # cloudmap for service discovery so workers can look up the master via dns
    self.loadgen_cluster.add_default_cloud_map_namespace(
        name=self.cloudmap_namespace)

    # Create a graph widget to track reservation metrics for our cluster
    ecs_widget = cw.GraphWidget(
        left=[self.loadgen_cluster.metric_cpu_reservation()],
        right=[self.loadgen_cluster.metric_memory_reservation()],
        title="ECS - CPU and Memory Reservation",
    )

    # CloudWatch dashboard to monitor our stuff
    self.dashboard = cw.Dashboard(self, "Locustdashboard")
    self.dashboard.add_widgets(ecs_widget)

    if not self.distributed_locust:
        # Single-node mode: one standalone Locust container.
        role = "standalone"
        locustContainer(self, "locust" + role, self.vpc,
                        self.loadgen_cluster, role, self.target_url)
    else:
        # Distributed mode: one master, N workers; workers depend on the master.
        role = "master"
        master_construct = locustContainer(self, "locust" + role, self.vpc,
                                           self.loadgen_cluster, role,
                                           self.target_url)

        lb_widget = cw.GraphWidget(
            left=[
                master_construct.lb.metric_active_connection_count(),
                master_construct.lb.metric_target_response_time()
            ],
            right=[master_construct.lb.metric_request_count()],
            title="Load Balancer")
        self.dashboard.add_widgets(lb_widget)

        role = "worker"
        worker_construct = locustContainer(self, "locust" + role, self.vpc,
                                           self.loadgen_cluster, role,
                                           self.target_url,
                                           self.number_of_workers)
        # Workers must come up after the master so they can register with it.
        worker_construct.node.add_dependency(master_construct)
def __init__(
        self,
        scope: core.Construct,
        _id: str,
        vpc,
        bucket_para,
        # key_name,
        ddb_file_list,
        sqs_queue,
        sqs_queue_DLQ,
        ssm_bucket_para,
        ssm_credential_para,
        s3bucket,
        s3_deploy,
        **kwargs) -> None:
    """EC2 cluster for S3 migration: a single jobsender instance plus a
    worker autoscaling group, IAM grants for DynamoDB/SQS/SSM/S3, a
    CloudWatch dashboard, queue-driven autoscaling, and alarms (queue
    empty -> scale to 1; DLQ non-empty -> email).
    """
    super().__init__(scope, _id, **kwargs)

    # Create environment variables injected into userdata for the running shell...
    env_var = f'export table_queue_name={ddb_file_list.table_name}\n' \
              f'export sqs_queue_name={sqs_queue.queue_name}\n' \
              f'export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\n'
    # ...and persisted across reboots via /etc/rc.local.
    env_var_st = f'echo \"export table_queue_name={ddb_file_list.table_name}\" >> /etc/rc.local\n' \
                 f'echo \"export sqs_queue_name={sqs_queue.queue_name}\" >> /etc/rc.local\n' \
                 f'echo \"export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\" >> /etc/rc.local\n'

    # Create log group and put group name into the CloudWatch-agent userdata config.
    s3_migrate_log = logs.LogGroup(self, "applog")
    cw_agent_config['logs']['logs_collected']['files']['collect_list'][0][
        'log_group_name'] = s3_migrate_log.log_group_name
    cw_agent_config['logs']['logs_collected']['files']['collect_list'][1][
        'log_group_name'] = s3_migrate_log.log_group_name
    # Double-escaped ${...} so the literal placeholder survives into the
    # agent config after the json.dumps/replace below.
    cw_agent_config['metrics']['append_dimensions'][
        'AutoScalingGroupName'] = "\\${aws:AutoScalingGroupName}"
    cw_agent_config['metrics']['append_dimensions'][
        'InstanceId'] = "\\${aws:InstanceId}"
    cw_agent_config_str = json.dumps(cw_agent_config,
                                     indent=4).replace("\\\\", "\\")
    # Assemble userdata: agent config + code download from s3_deploy + env vars.
    userdata_head = user_data_part1 + cw_agent_config_str + user_data_part2 + \
        s3_deploy.bucket_name + " .\n" + env_var + env_var_st
    jobsender_userdata = userdata_head + user_data_jobsender_p
    worker_userdata = userdata_head + user_data_worker_p

    # Create jobsender ec2 node — an ASG of exactly one instance so it self-heals.
    jobsender = autoscaling.AutoScalingGroup(
        self,
        "jobsender",
        instance_type=ec2.InstanceType(
            instance_type_identifier=jobsender_type),
        machine_image=linux_ami,
        # key_name=key_name,
        user_data=ec2.UserData.custom(jobsender_userdata),
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        desired_capacity=1,
        min_capacity=0,
        max_capacity=1)
    # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # Don't need SSH since we use Session Manager

    # Assign EC2 Policy to use SSM and CWAgent
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "CloudWatchAgentServerPolicy"))

    # jobsender.role.add_managed_policy(
    #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
    # Don't give full access s3 to ec2, violate security rule

    # Create Autoscaling Group with fixed 2*EC2 hosts
    worker_asg = autoscaling.AutoScalingGroup(
        self,
        "worker-asg",
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        instance_type=ec2.InstanceType(
            instance_type_identifier=worker_type),
        machine_image=linux_ami,
        # key_name=key_name,  # Optional if use SSM-SessionManager
        user_data=ec2.UserData.custom(worker_userdata),
        desired_capacity=2,
        min_capacity=2,
        max_capacity=10,
        spot_price="0.5")

    # TODO: There is no MetricsCollection in CDK autoscaling group high level API yet.
    # You need to enable "Group Metrics Collection" in EC2 Console Autoscaling Group - Monitoring tab for metric:
    # GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances and etc.

    # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # Don't need SSH since we use Session Manager

    # Assign EC2 Policy to use SSM and CWAgent
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "CloudWatchAgentServerPolicy"))

    # Allow EC2 access new DynamoDB Table
    ddb_file_list.grant_full_access(jobsender)
    ddb_file_list.grant_full_access(worker_asg)

    # Allow EC2 access new sqs and its DLQ
    sqs_queue.grant_consume_messages(jobsender)
    sqs_queue.grant_send_messages(jobsender)
    sqs_queue.grant_consume_messages(worker_asg)
    sqs_queue_DLQ.grant_consume_messages(jobsender)

    # Allow EC2 access SSM Parameter Store, get bucket info and get credential
    ssm_bucket_para.grant_read(jobsender)
    ssm_credential_para.grant_read(jobsender)
    ssm_credential_para.grant_read(worker_asg)

    # Allow EC2 access source code on s3_deploy bucket
    s3_deploy.grant_read(jobsender)
    s3_deploy.grant_read(worker_asg)

    # Allow EC2 access new s3 bucket
    s3bucket.grant_read(jobsender)
    s3bucket.grant_read(worker_asg)

    # Allow EC2 access exist s3 bucket for PUT mode: readonly access the source buckets
    bucket_name = ''
    for b in bucket_para:
        if bucket_name != b['src_bucket']:  # skip when the same bucket is listed repeatedly
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(
                self,
                bucket_name,  # bucket name doubles as the construct id
                bucket_name=bucket_name)
            s3exist_bucket.grant_read(jobsender)
            s3exist_bucket.grant_read(worker_asg)

    # Allow EC2 access exist s3 bucket for GET mode: read and write access the destination buckets
    # bucket_name = ''
    # for b in bucket_para:
    #     if bucket_name != b['des_bucket']:  # skip when the same bucket is listed repeatedly
    #         bucket_name = b['des_bucket']
    #         s3exist_bucket = s3.Bucket.from_bucket_name(self,
    #                                                     bucket_name,  # bucket name doubles as the construct id
    #                                                     bucket_name=bucket_name)
    #         s3exist_bucket.grant_read_write(jobsender)
    #         s3exist_bucket.grant_read_write(worker_asg)

    # Dashboard to monitor SQS and EC2
    board = cw.Dashboard(self, "s3_migrate")

    ec2_metric_cpu_avg = cw.Metric(namespace="AWS/EC2",
                                   metric_name="CPUUtilization",
                                   dimensions={
                                       "AutoScalingGroupName":
                                       worker_asg.auto_scaling_group_name
                                   },
                                   period=core.Duration.minutes(1))
    # SEARCH expression: per-instance NetworkOut without fixed dimensions.
    ec2_metric_net_out = cw.MathExpression(
        expression=
        "SEARCH('{AWS/EC2, InstanceId} NetworkOut', 'Average', 60)",
        label="EC2-NetworkOut",
        using_metrics={})

    autoscaling_GroupDesiredCapacity = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupDesiredCapacity",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupInServiceInstances = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupInServiceInstances",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupMinSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMinSize",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupMaxSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMaxSize",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))

    # CWAgent collected metric
    cwagent_mem_avg = cw.MathExpression(
        expression=
        "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" +
        worker_asg.auto_scaling_group_name +
        " AND MetricName=mem_used_percent)', 'Average', 60)",
        label="mem_avg",
        using_metrics={})
    cwagent_disk_avg = cw.MathExpression(
        expression=
        "SEARCH('{CWAgent, path, InstanceId, AutoScalingGroupName, device, fstype} "
        "(AutoScalingGroupName=" + worker_asg.auto_scaling_group_name +
        " AND MetricName=disk_used_percent AND path=\"/\")', 'Average', 60)",
        label="disk_avg",
        using_metrics={})
    cwagent_net_tcp = cw.MathExpression(
        expression=
        "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" +
        worker_asg.auto_scaling_group_name +
        " AND MetricName=tcp_established)', 'Average', 60)",
        label="tcp_conn",
        using_metrics={})

    # CWAgent collected application logs - filter metric
    # Each filter parses the app log format: [date, time, info, hs, prefix, bytes, key]
    s3_migrate_log.add_metric_filter(
        "Completed-bytes",
        metric_name="Completed-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[date, time, info, hs, p="--->Complete", bytes, key]'))
    s3_migrate_log.add_metric_filter(
        "Uploading-bytes",
        metric_name="Uploading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[date, time, info, hs, p="--->Uploading", bytes, key]'))
    s3_migrate_log.add_metric_filter(
        "Downloading-bytes",
        metric_name="Downloading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[date, time, info, hs, p="--->Downloading", bytes, key]'))
    traffic_metric_Complete = cw.Metric(namespace="s3_migrate",
                                        metric_name="Completed-bytes",
                                        statistic="Sum",
                                        period=core.Duration.minutes(1))
    traffic_metric_Upload = cw.Metric(namespace="s3_migrate",
                                      metric_name="Uploading-bytes",
                                      statistic="Sum",
                                      period=core.Duration.minutes(1))
    traffic_metric_Download = cw.Metric(namespace="s3_migrate",
                                        metric_name="Downloading-bytes",
                                        statistic="Sum",
                                        period=core.Duration.minutes(1))
    s3_migrate_log.add_metric_filter(
        "ERROR",
        metric_name="ERROR-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"ERROR"'))
    s3_migrate_log.add_metric_filter(
        "WARNING",
        metric_name="WARNING-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"WARNING"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    # Row 1: traffic, logs, and queue state.
    board.add_widgets(
        cw.GraphWidget(title="S3-MIGRATION-TOTAL-TRAFFIC",
                       left=[
                           traffic_metric_Complete, traffic_metric_Upload,
                           traffic_metric_Download
                       ],
                       left_y_axis=cw.YAxisProps(label="Bytes/min",
                                                 show_units=False)),
        cw.GraphWidget(title="ERROR/WARNING LOGS",
                       left=[log_metric_ERROR],
                       left_y_axis=cw.YAxisProps(label="Count",
                                                 show_units=False),
                       right=[log_metric_WARNING],
                       right_y_axis=cw.YAxisProps(label="Count",
                                                  show_units=False)),
        cw.GraphWidget(
            title="SQS-JOBS",
            left=[
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.
                metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.SingleValueWidget(
            title="RUNNING, WAITING & DEATH JOBS",
            metrics=[
                sqs_queue.
                metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.
                metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.
                metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1))
            ],
            height=6))
    # Row 2: worker ASG health.
    board.add_widgets(
        cw.GraphWidget(title="EC2-AutoscalingGroup-TCP",
                       left=[cwagent_net_tcp],
                       left_y_axis=cw.YAxisProps(label="Count",
                                                 show_units=False)),
        cw.GraphWidget(title="EC2-AutoscalingGroup-CPU/MEMORY",
                       left=[ec2_metric_cpu_avg, cwagent_mem_avg],
                       left_y_axis=cw.YAxisProps(max=100,
                                                 min=0,
                                                 label="%",
                                                 show_units=False)),
        cw.GraphWidget(title="EC2-AutoscalingGroup-DISK",
                       left=[cwagent_disk_avg],
                       left_y_axis=cw.YAxisProps(max=100,
                                                 min=0,
                                                 label="%",
                                                 show_units=False)),
        cw.SingleValueWidget(title="EC2-AutoscalingGroup-CAPACITY",
                             metrics=[
                                 autoscaling_GroupDesiredCapacity,
                                 autoscaling_GroupInServiceInstances,
                                 autoscaling_GroupMinSize,
                                 autoscaling_GroupMaxSize
                             ],
                             height=6))
    # Row 3: network throughput.
    board.add_widgets(
        cw.GraphWidget(title="EC2-NetworkOut",
                       left=[ec2_metric_net_out],
                       left_y_axis=cw.YAxisProps(label="Bytes/min",
                                                 show_units=False)))

    # Autoscaling up when visible message > 100 in 5 mins
    worker_asg.scale_on_metric(
        "scaleup",
        metric=sqs_queue.metric_approximate_number_of_messages_visible(),
        scaling_steps=[
            autoscaling.ScalingInterval(change=1, lower=100, upper=500),
            autoscaling.ScalingInterval(change=2, lower=500),
            autoscaling.ScalingInterval(change=0, upper=100, lower=0)
        ],
        adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

    # Alarm for queue empty and ec2 > 1
    # Alarm when the queue is empty (no visible or in-flight messages) while
    # more than one EC2 instance is in service, then scale the ASG down to 1.
    # Adjust to your scenario: if the jobsender also transfers data, the ASG
    # could instead be set to 0 when there is no work.
    metric_all_message = cw.MathExpression(
        expression="IF(((a+b) == 0) AND (c >1), 0, 1)",  # yields 0 (alarm) when a+b==0 and c>1
        label="empty_queue_expression",
        using_metrics={
            "a": sqs_queue.metric_approximate_number_of_messages_visible(),
            "b": sqs_queue.metric_approximate_number_of_messages_not_visible(),
            "c": autoscaling_GroupInServiceInstances
        })
    alarm_0 = cw.Alarm(
        self,
        "SQSempty",
        alarm_name=
        "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
        metric=metric_all_message,
        threshold=0,
        comparison_operator=cw.ComparisonOperator.
        LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
    alarm_topic_empty = sns.Topic(
        self, "SQS queue empty and ec2 more than 1 in Cluster")
    # This alarm doubles as a one-shot "batch transfer finished" notification:
    # it fires once instead of repeating.
    alarm_topic_empty.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

    # If queue empty, set autoscale down to 1 EC2
    action_shutdown = autoscaling.StepScalingAction(
        self,
        "shutdown",
        auto_scaling_group=worker_asg,
        adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
    action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
    alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

    # While message in SQS-DLQ, alarm to sns
    alarm_DLQ = cw.Alarm(
        self,
        "SQS_DLQ",
        alarm_name=
        "s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
        ),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.IGNORE)
    alarm_topic_DLQ = sns.Topic(self,
                                "SQS DLQ more than 1 message-Cluster")
    alarm_topic_DLQ.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

    # Output
    core.CfnOutput(self, "LogGroup", value=s3_migrate_log.log_group_name)
    core.CfnOutput(self,
                   "Dashboard",
                   value="CloudWatch Dashboard name s3_migrate_cluster")
    core.CfnOutput(self,
                   "Alarm",
                   value="CloudWatch SQS queue empty Alarm for cluster: " +
                   alarm_email)
def __init__(self, scope: core.Construct, id: str, stage: str,
             api: _api_gw.IRestApi, fn: _lambda.IFunction,
             table: _ddb.ITable, **kwargs) -> None:
    """Build the "Serverlesslens" CloudWatch dashboard.

    Charts API Gateway traffic/latency/errors, Lambda error/throttle
    percentages and duration percentiles, and DynamoDB latency, consumed
    capacity and throttle metrics for one API + function + table.

    :param stage: API Gateway stage name used as a metric dimension.
    :param api: REST API whose metrics are charted (the API *name* is read
        from the "gateway" CDK context entry, key "gw_name").
    :param fn: Lambda function backing the API.
    :param table: DynamoDB table used by the function.
    """
    super().__init__(scope, id, **kwargs)

    # The "gateway" context entry must provide the API name under "gw_name".
    gw = dict(self.node.try_get_context("gateway"))

    ###
    # Custom Metrics
    ###

    # Percentage of Lambda invocations that errored in the past 5 mins.
    lambda_error_perc = cloud_watch.MathExpression(
        expression="e / i * 100",
        label="% of invocations that errored, last 5 mins",
        using_metrics={
            "i": fn.metric(metric_name="Invocations", statistic="sum"),
            "e": fn.metric(metric_name="Errors", statistic="sum"),
        },
        period=core.Duration.minutes(5))

    # note: throttled requests are not counted in total num of invocations,
    # hence the (i + t) denominator.
    # Fix: the label previously claimed "last 30 mins" although the metric
    # period is 5 minutes; it now matches the configured period.
    lambda_throttled_perc = cloud_watch.MathExpression(
        expression="t / (i + t) * 100",
        label="% of throttled requests, last 5 mins",
        using_metrics={
            "i": fn.metric(metric_name="Invocations", statistic="sum"),
            "t": fn.metric(metric_name="Throttles", statistic="sum"),
        },
        period=core.Duration.minutes(5))

    dashboard = cloud_watch.Dashboard(self,
                                      id="CloudWatchDashBoard",
                                      dashboard_name="Serverlesslens")

    dashboard.add_widgets(
        cloud_watch.GraphWidget(title="Requests",
                                width=8,
                                left=[
                                    self.metric_for_api_gw(
                                        api_name=gw["gw_name"],
                                        stage=stage,
                                        metric_name="Count",
                                        label="# Requests",
                                        stat="sum")
                                ]),
        cloud_watch.GraphWidget(
            title="API GW Latency",
            width=8,
            stacked=True,
            left=[
                self.metric_for_api_gw(api_name=gw["gw_name"],
                                       stage=stage,
                                       metric_name="Latency",
                                       label="API Latency p50",
                                       stat="p50"),
                self.metric_for_api_gw(api_name=gw["gw_name"],
                                       stage=stage,
                                       metric_name="Latency",
                                       label="API Latency p90",
                                       stat="p90"),
                self.metric_for_api_gw(api_name=gw["gw_name"],
                                       stage=stage,
                                       metric_name="Latency",
                                       label="API Latency p99",
                                       stat="p99")
            ]),
        cloud_watch.GraphWidget(
            title="API GW Errors",
            width=8,
            stacked=True,
            left=[
                self.metric_for_api_gw(api_name=gw["gw_name"],
                                       stage=stage,
                                       metric_name="4XXError",
                                       label="4XX Errors",
                                       stat="sum"),
                self.metric_for_api_gw(api_name=gw["gw_name"],
                                       stage=stage,
                                       metric_name="5XXError",
                                       label="5XX Errors",
                                       stat="sum")
            ]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Error %",
                                width=8,
                                left=[lambda_error_perc]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Duration",
                                width=8,
                                stacked=True,
                                left=[
                                    fn.metric_duration(statistic="p50"),
                                    fn.metric_duration(statistic="p90"),
                                    fn.metric_duration(statistic="p99")
                                ]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Throttle %",
                                width=8,
                                left=[lambda_throttled_perc]),
        cloud_watch.GraphWidget(
            title="DynamoDB Latency",
            width=8,
            stacked=True,
            left=[
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "GetItem"
                    }),
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "UpdateItem"
                    }),
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "PutItem"
                    }),
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "DeleteItem"
                    }),
                table.metric_successful_request_latency(
                    dimensions={
                        "TableName": table.table_name,
                        "Operation": "Query"
                    }),
            ]),
        cloud_watch.GraphWidget(
            title="DynamoDB Consumed Read/Write Units",
            width=8,
            stacked=False,
            left=[
                table.metric(metric_name="ConsumedReadCapacityUnits"),
                table.metric(metric_name="ConsumedWriteCapacityUnits")
            ]),
        cloud_watch.GraphWidget(
            title="DynamoDB Throttles",
            width=8,
            stacked=True,
            left=[
                table.metric(metric_name="ReadThrottleEvents",
                             statistic="sum"),
                table.metric(metric_name="WriteThrottleEvents",
                             statistic="sum")
            ]),
    )
def __init__(
        self,
        scope: core.Construct,
        _id: str,
        vpc,
        bucket_para,
        # key_name,
        ddb_file_list,
        sqs_queue,
        sqs_queue_DLQ,
        ssm_bucket_para,
        ssm_credential_para,
        # s3bucket,
        **kwargs) -> None:
    """Provision the cluster-mode S3 migration fleet and its monitoring.

    Creates one jobsender EC2 instance and an autoscaling worker group,
    grants them access to the job DynamoDB table, the SQS queues, SSM
    parameters and the source S3 buckets, then builds a CloudWatch
    dashboard, a queue-depth scaling policy and alarms (queue-empty
    scale-down, DLQ notification).

    :param vpc: VPC the instances are launched into (public subnets).
    :param bucket_para: list of dicts; each 'src_bucket' name is granted
        read access.
    :param ddb_file_list: DynamoDB table tracking per-file job state.
    :param sqs_queue: main job queue; ``sqs_queue_DLQ`` is its DLQ.
    :param ssm_bucket_para: SSM parameter with bucket configuration.
    :param ssm_credential_para: SSM parameter with destination credentials.

    NOTE(review): relies on module-level settings (jobsender_type,
    worker_type, linux_ami, user_data_*, alarm_email) defined elsewhere
    in this file — confirm they are in scope before reuse.
    """
    super().__init__(scope, _id, **kwargs)

    # Create jobsender ec2 node
    jobsender = ec2.Instance(
        self,
        "jobsender",
        instance_name="s3_migrate_cluster_jobsender",
        instance_type=ec2.InstanceType(
            instance_type_identifier=jobsender_type),
        machine_image=linux_ami,
        # key_name=key_name,
        user_data=ec2.UserData.custom(user_data_jobsender),
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC))
    # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # Don't need SSH since we use Session Manager

    # Assign EC2 Policy to use SSM and CWAgent
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "CloudWatchAgentServerPolicy"))

    # jobsender.role.add_managed_policy(
    #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
    # Don't give full access s3 to ec2, violate security rule

    # Create Autoscaling Group with fixed 2*EC2 hosts
    worker_asg = autoscaling.AutoScalingGroup(
        self,
        "worker-asg",
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        instance_type=ec2.InstanceType(
            instance_type_identifier=worker_type),
        machine_image=linux_ami,
        # key_name=key_name,  # Optional if use SSM-SessionManager
        user_data=ec2.UserData.custom(user_data_worker),
        desired_capacity=1,
        min_capacity=1,
        max_capacity=10,
        spot_price="0.5")
    # TODO: There is no MetricsCollection in CDK autoscaling group high level API yet.
    # You need to enable "Group Metrics Collection" in EC2 Console Autoscaling Group - Monitoring tab for metric:
    # GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances and etc.

    # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # Don't need SSH since we use Session Manager

    # Assign EC2 Policy to use SSM and CWAgent
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "CloudWatchAgentServerPolicy"))

    # Allow EC2 access new DynamoDB Table
    ddb_file_list.grant_full_access(jobsender)
    ddb_file_list.grant_full_access(worker_asg)

    # Allow EC2 access new sqs and its DLQ
    sqs_queue.grant_consume_messages(jobsender)
    sqs_queue.grant_send_messages(jobsender)
    sqs_queue.grant_consume_messages(worker_asg)
    sqs_queue_DLQ.grant_consume_messages(jobsender)

    # Allow EC2 access SSM Parameter Store, get bucket infor and get credential
    ssm_bucket_para.grant_read(jobsender)
    ssm_credential_para.grant_read(jobsender)
    ssm_credential_para.grant_read(worker_asg)

    # Allow EC2 access new s3 bucket
    # s3bucket.grant_read(jobsender)
    # s3bucket.grant_read(worker_asg)

    # Allow EC2 access exist s3 bucket
    bucket_name = ''
    for b in bucket_para:
        # Skip duplicates when the same bucket is listed more than once.
        if bucket_name != b['src_bucket']:
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(
                self,
                bucket_name,  # use the bucket name as the construct id
                bucket_name=bucket_name)
            s3exist_bucket.grant_read(jobsender)
            s3exist_bucket.grant_read(worker_asg)

    # Dashboard to monitor SQS and EC2
    board = cw.Dashboard(self,
                         "s3_migrate",
                         dashboard_name="s3_migrate_cluster")

    # Account-wide EC2 metrics (ASG dimension intentionally commented out).
    ec2_metric_net = cw.Metric(
        namespace="AWS/EC2",
        metric_name="NetworkOut",
        # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1),
        statistic="Sum")
    ec2_metric_cpu_max = cw.Metric(
        namespace="AWS/EC2",
        metric_name="CPUUtilization",
        # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1),
        statistic="Maximum")
    ec2_metric_cpu_avg = cw.Metric(
        namespace="AWS/EC2",
        metric_name="CPUUtilization",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))

    # Autoscaling-group capacity metrics (require Group Metrics Collection,
    # see the TODO above).
    autoscaling_GroupDesiredCapacity = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupDesiredCapacity",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupInServiceInstances = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupInServiceInstances",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupMinSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMinSize",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupMaxSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMaxSize",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))

    # CWAgent collected metric
    cwagent_mem_avg = cw.Metric(namespace="CWAgent",
                                metric_name="mem_used_percent",
                                dimensions={
                                    "AutoScalingGroupName":
                                    worker_asg.auto_scaling_group_name
                                },
                                statistic="Average",
                                period=core.Duration.minutes(1))
    cwagent_mem_max = cw.Metric(namespace="CWAgent",
                                metric_name="mem_used_percent",
                                dimensions={
                                    "AutoScalingGroupName":
                                    worker_asg.auto_scaling_group_name
                                },
                                statistic="Maximum",
                                period=core.Duration.minutes(1))

    # CWAgent collected application logs - filter metric
    s3_migrate_log = logs.LogGroup(self,
                                   "applog",
                                   log_group_name="s3_migration_log")
    s3_migrate_log.add_metric_filter(
        "ERROR",
        metric_name="ERROR-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"ERROR"'))
    s3_migrate_log.add_metric_filter(
        "WARNING",
        metric_name="WARNING-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"WARNING"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    board.add_widgets(
        cw.GraphWidget(title="EC2-ALL-NETWORK", left=[ec2_metric_net]),
        cw.GraphWidget(title="EC2-ALL-CPU",
                       left=[ec2_metric_cpu_avg, ec2_metric_cpu_max]),
        cw.GraphWidget(title="EC2-AutoscalingGroup-MEMORY",
                       left=[cwagent_mem_max, cwagent_mem_avg]),
        cw.SingleValueWidget(title="EC2-AutoscalingGroup-Capacity",
                             metrics=[
                                 autoscaling_GroupDesiredCapacity,
                                 autoscaling_GroupInServiceInstances,
                                 autoscaling_GroupMinSize,
                                 autoscaling_GroupMaxSize
                             ],
                             height=6),
    )
    board.add_widgets(
        cw.GraphWidget(
            title="SQS-Jobs",
            left=[
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(
            title="SQS-DeadLetterQueue",
            left=[
                sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.
                metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(title="ERROR/WARNING Logs",
                       left=[log_metric_ERROR],
                       right=[log_metric_WARNING],
                       height=6),
        cw.SingleValueWidget(
            title="Running/Waiting and Death Jobs",
            metrics=[
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.
                metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1))
            ],
            height=6))

    # Autoscaling up when visible message > 100 every 3 of 3 x 5 mins
    worker_asg.scale_on_metric(
        "scaleup",
        metric=sqs_queue.metric_approximate_number_of_messages_visible(),
        scaling_steps=[
            autoscaling.ScalingInterval(change=1, lower=100, upper=500),
            autoscaling.ScalingInterval(change=2, lower=500),
            autoscaling.ScalingInterval(change=0, upper=100, lower=0)
        ],
        adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

    # Alarm for queue empty and ec2 > 1
    # Alarm when the queue is empty (no visible + invisible messages) AND
    # more than one EC2 is in service; the alarm then scales down to 1.
    # Tune per scenario: if the jobsender also transfers data, the ASG
    # could be set to 0 here when there is no work.
    metric_all_message = cw.MathExpression(
        # Expression is 0 (breaching) when a+b == 0 and c > 1.
        expression="IF(((a+b) == 0) AND (c >1), 0, 1)",
        label="empty_queue_expression",
        using_metrics={
            "a": sqs_queue.metric_approximate_number_of_messages_visible(),
            "b":
            sqs_queue.metric_approximate_number_of_messages_not_visible(),
            "c": autoscaling_GroupInServiceInstances
        })
    alarm_0 = cw.Alarm(
        self,
        "SQSempty",
        alarm_name=
        "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
        metric=metric_all_message,
        threshold=0,
        comparison_operator=cw.ComparisonOperator.
        LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
    alarm_topic_empty = sns.Topic(
        self, "SQS queue empty and ec2 more than 1 in Cluster")
    # This alarm doubles as a one-time "batch transfer finished"
    # notification, instead of notifying repeatedly.
    alarm_topic_empty.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

    # If queue empty, set autoscale down to 1 EC2
    action_shutdown = autoscaling.StepScalingAction(
        self,
        "shutdown",
        auto_scaling_group=worker_asg,
        adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
    action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
    alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

    # While message in SQS-DLQ, alarm to sns
    alarm_DLQ = cw.Alarm(
        self,
        "SQS_DLQ",
        alarm_name=
        "s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
        ),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.IGNORE)
    alarm_topic_DLQ = sns.Topic(self,
                                "SQS DLQ more than 1 message-Cluster")
    alarm_topic_DLQ.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

    # Output
    core.CfnOutput(self, "JobSenderEC2", value=jobsender.instance_id)
    core.CfnOutput(self,
                   "WorkerEC2AutoscalingGroup",
                   value=worker_asg.auto_scaling_group_name)
    core.CfnOutput(self,
                   "Dashboard",
                   value="CloudWatch Dashboard name s3_migrate_cluster")
    core.CfnOutput(self,
                   "Alarm",
                   value="CloudWatch SQS queue empty Alarm for cluster: " +
                   alarm_email)
def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None:
    """Provision the serverless S3 migration pipeline and its monitoring.

    Creates the job DynamoDB table, the SQS job queue with a DLQ, the
    worker Lambda wired to the queue, a new source S3 bucket whose
    object-created events feed the queue, log-derived traffic metrics,
    a CloudWatch dashboard and a DLQ alarm.

    NOTE(review): relies on module-level settings (Des_bucket_default,
    Des_prefix_default, StorageClass, aws_access_key_*, alarm_email)
    defined elsewhere in this file — confirm they are in scope.
    """
    super().__init__(scope, _id, **kwargs)

    # Job-state table keyed by the S3 object key.
    ddb_file_list = ddb.Table(self, "ddb",
                              partition_key=ddb.Attribute(name="Key",
                                                          type=ddb.AttributeType.STRING),
                              billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

    sqs_queue_DLQ = sqs.Queue(self, "sqs_DLQ",
                              visibility_timeout=core.Duration.minutes(15),
                              retention_period=core.Duration.days(14)
                              )
    sqs_queue = sqs.Queue(self, "sqs_queue",
                          visibility_timeout=core.Duration.minutes(15),
                          retention_period=core.Duration.days(14),
                          dead_letter_queue=sqs.DeadLetterQueue(
                              max_receive_count=100,
                              queue=sqs_queue_DLQ
                          )
                          )

    # Worker Lambda: transfers one object per SQS message; destination
    # credentials are passed via environment variables.
    handler = lam.Function(self, "lambdaFunction",
                           code=lam.Code.asset("./lambda"),
                           handler="lambda_function.lambda_handler",
                           runtime=lam.Runtime.PYTHON_3_8,
                           memory_size=1024,
                           timeout=core.Duration.minutes(15),
                           tracing=lam.Tracing.ACTIVE,
                           environment={
                               'table_queue_name': ddb_file_list.table_name,
                               'Des_bucket_default': Des_bucket_default,
                               'Des_prefix_default': Des_prefix_default,
                               'StorageClass': StorageClass,
                               'aws_access_key_id': aws_access_key_id,
                               'aws_secret_access_key': aws_secret_access_key,
                               'aws_access_key_region': aws_access_key_region
                           })

    ddb_file_list.grant_read_write_data(handler)
    handler.add_event_source(SqsEventSource(sqs_queue))

    # New source bucket: object-created events are pushed onto the queue.
    s3bucket = s3.Bucket(self, "s3bucket")
    s3bucket.grant_read(handler)
    s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED,
                                    s3n.SqsDestination(sqs_queue))

    # You can import an existing bucket and grant access to lambda
    # exist_s3bucket = s3.Bucket.from_bucket_name(self, "import_bucket",
    #                                             bucket_name="you_bucket_name")
    # exist_s3bucket.grant_read(handler)
    # But You have to add sqs as imported bucket event notification manually, it doesn't support by CloudFormation
    # An work around is to add on_cloud_trail_event for the bucket, but will trigger could_trail first
    # Because the bucket is imported, the bucket-event-to-SQS trigger and
    # the SQS permission allowing that bucket to send must be set up manually.

    core.CfnOutput(self,
                   "DynamoDB_Table",
                   value=ddb_file_list.table_name)
    core.CfnOutput(self,
                   "SQS_Job_Queue",
                   value=sqs_queue.queue_name)
    core.CfnOutput(self,
                   "SQS_Job_Queue_DLQ",
                   value=sqs_queue_DLQ.queue_name)
    core.CfnOutput(self,
                   "Worker_Lambda_Function",
                   value=handler.function_name)
    core.CfnOutput(self, "New_S3_Bucket", value=s3bucket.bucket_name)

    # Create Lambda logs filter to create network traffic metric
    handler.log_group.add_metric_filter("Complete-bytes",
                                        metric_name="Complete-bytes",
                                        metric_namespace="s3_migrate",
                                        metric_value="$bytes",
                                        filter_pattern=logs.FilterPattern.literal(
                                            '[info, date, sn, p="--->Complete", bytes, key]'))
    handler.log_group.add_metric_filter("Uploading-bytes",
                                        metric_name="Uploading-bytes",
                                        metric_namespace="s3_migrate",
                                        metric_value="$bytes",
                                        filter_pattern=logs.FilterPattern.literal(
                                            '[info, date, sn, p="--->Uploading", bytes, key]'))
    handler.log_group.add_metric_filter("Downloading-bytes",
                                        metric_name="Downloading-bytes",
                                        metric_namespace="s3_migrate",
                                        metric_value="$bytes",
                                        filter_pattern=logs.FilterPattern.literal(
                                            '[info, date, sn, p="--->Downloading", bytes, key]'))
    lambda_metric_Complete = cw.Metric(namespace="s3_migrate",
                                       metric_name="Complete-bytes",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
    lambda_metric_Upload = cw.Metric(namespace="s3_migrate",
                                     metric_name="Uploading-bytes",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
    lambda_metric_Download = cw.Metric(namespace="s3_migrate",
                                       metric_name="Downloading-bytes",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
    handler.log_group.add_metric_filter("ERROR",
                                        metric_name="ERROR-Logs",
                                        metric_namespace="s3_migrate",
                                        metric_value="1",
                                        filter_pattern=logs.FilterPattern.literal(
                                            '"ERROR"'))
    handler.log_group.add_metric_filter("WARNING",
                                        metric_name="WARNING-Logs",
                                        metric_namespace="s3_migrate",
                                        metric_value="1",
                                        filter_pattern=logs.FilterPattern.literal(
                                            '"WARNING"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    # Dashboard to monitor SQS and Lambda
    board = cw.Dashboard(self, "s3_migrate", dashboard_name="s3_migrate_serverless")

    board.add_widgets(cw.GraphWidget(title="Lambda-NETWORK",
                                     left=[lambda_metric_Download, lambda_metric_Upload, lambda_metric_Complete]),
                      # TODO: here monitor all lambda concurrency not just the working one. Limitation from CDK
                      # Lambda now supports monitor single lambda concurrency, will change this after CDK support
                      cw.GraphWidget(title="Lambda-all-concurrent",
                                     left=[handler.metric_all_concurrent_executions(period=core.Duration.minutes(1))]),

                      cw.GraphWidget(title="Lambda-invocations/errors/throttles",
                                     left=[handler.metric_invocations(period=core.Duration.minutes(1)),
                                           handler.metric_errors(period=core.Duration.minutes(1)),
                                           handler.metric_throttles(period=core.Duration.minutes(1))]),
                      cw.GraphWidget(title="Lambda-duration",
                                     left=[handler.metric_duration(period=core.Duration.minutes(1))]),
                      )

    board.add_widgets(cw.GraphWidget(title="SQS-Jobs",
                                     left=[sqs_queue.metric_approximate_number_of_messages_visible(
                                         period=core.Duration.minutes(1)
                                     ),
                                         sqs_queue.metric_approximate_number_of_messages_not_visible(
                                             period=core.Duration.minutes(1)
                                         )]),
                      cw.GraphWidget(title="SQS-DeadLetterQueue",
                                     left=[sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                                         period=core.Duration.minutes(1)
                                     ),
                                         sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                                             period=core.Duration.minutes(1)
                                         )]),
                      cw.GraphWidget(title="ERROR/WARNING Logs",
                                     left=[log_metric_ERROR],
                                     right=[log_metric_WARNING]),
                      cw.SingleValueWidget(title="Running/Waiting and Dead Jobs",
                                           metrics=[sqs_queue.metric_approximate_number_of_messages_not_visible(
                                               period=core.Duration.minutes(1)
                                           ),
                                               sqs_queue.metric_approximate_number_of_messages_visible(
                                                   period=core.Duration.minutes(1)
                                               ),
                                               sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                                                   period=core.Duration.minutes(1)
                                               ),
                                               sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                                                   period=core.Duration.minutes(1)
                                               )],
                                           height=6)
                      )

    # Alarm for queue - DLQ
    alarm_DLQ = cw.Alarm(self, "SQS_DLQ",
                         alarm_name="s3-migration-serverless-SQS Dead Letter Queue",
                         metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(),
                         threshold=0,
                         comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
                         evaluation_periods=1,
                         datapoints_to_alarm=1)
    alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter")
    alarm_topic.add_subscription(subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic))

    # Alarm for queue empty, i.e. no visible message and no in-visible message
    # metric_all_message = cw.MathExpression(
    #     expression="a + b",
    #     label="empty_queue_expression",
    #     using_metrics={
    #         "a": sqs_queue.metric_approximate_number_of_messages_visible(),
    #         "b": sqs_queue.metric_approximate_number_of_messages_not_visible()
    #     }
    # )
    # alarm_0 = cw.Alarm(self, "SQSempty",
    #                    alarm_name="SQS queue empty-Serverless",
    #                    metric=metric_all_message,
    #                    threshold=0,
    #                    comparison_operator=cw.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD,
    #                    evaluation_periods=3,
    #                    datapoints_to_alarm=3,
    #                    treat_missing_data=cw.TreatMissingData.IGNORE
    #                    )
    # alarm_topic = sns.Topic(self, "SQS queue empty-Serverless")
    # alarm_topic.add_subscription(subscription=sub.EmailSubscription(alarm_email))
    # alarm_0.add_alarm_action(action.SnsAction(alarm_topic))

    # core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for Serverless: " + alarm_email)
    core.CfnOutput(self,
                   "Dashboard",
                   value="CloudWatch Dashboard name s3_migrate_serverless")
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    """S3 monitoring demo stack.

    Creates a bucket whose daily storage size raises an SNS e-mail alarm
    past 1000 bytes, records all S3 data events for it via CloudTrail,
    and renders the size metric on a CloudWatch dashboard.
    """
    super().__init__(scope, construct_id, **kwargs)

    # Bucket under observation; deleted (with contents) on stack removal.
    monitored_bucket = s3.Bucket(self,
                                 'bucket-monitored',
                                 bucket_name='devassoc-monitored',
                                 removal_policy=core.RemovalPolicy.DESTROY,
                                 auto_delete_objects=True)
    core.CfnOutput(self,
                   'monitored-bucket',
                   value=monitored_bucket.bucket_name)

    # Daily standard-storage byte count for the bucket.
    storage_metric = cw.Metric(namespace='AWS/S3',
                               metric_name='BucketSizeBytes',
                               dimensions={
                                   'BucketName':
                                   monitored_bucket.bucket_name,
                                   'StorageType': 'StandardStorage'
                               },
                               period=core.Duration.days(1))

    # Alarm as soon as one daily datapoint reaches the 1000-byte threshold.
    storage_alarm = storage_metric.create_alarm(
        self,
        'bucket-alarm',
        alarm_name='S3 Storage Alarm',
        comparison_operator=cw.ComparisonOperator.
        GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=1,
        period=core.Duration.days(1),
        threshold=1000,
        actions_enabled=True)

    # E-mail fan-out; the address is read from SSM Parameter Store.
    alarm_topic = sns.Topic(self,
                            'size-topic',
                            display_name='My S3 Alarm List')
    notification_email = ssm.StringParameter.from_string_parameter_name(
        self, 'email-param', 'notification-email')
    sns.Subscription(self,
                     'size-topic-sub',
                     topic=alarm_topic,
                     protocol=sns.SubscriptionProtocol.EMAIL,
                     endpoint=notification_email.string_value)
    storage_alarm.add_alarm_action(cwa.SnsAction(alarm_topic))

    # CloudTrail: log every S3 data event of the monitored bucket.
    trail_bucket = s3.Bucket(self,
                             'bucket-s3-logs',
                             bucket_name='devassoc-s3-logs',
                             removal_policy=core.RemovalPolicy.DESTROY,
                             auto_delete_objects=True)
    s3_trail = ct.Trail(self,
                        'bucket-trail',
                        bucket=trail_bucket,
                        trail_name='s3_logs')
    s3_trail.add_s3_event_selector(
        [ct.S3EventSelector(bucket=monitored_bucket)])
    s3_trail.log_all_s3_data_events()

    # Dashboard: current size alongside its history.
    cw.Dashboard(self,
                 'cloudwatch-dashboard',
                 dashboard_name='S3Dashboard',
                 widgets=[[
                     cw.SingleValueWidget(metrics=[storage_metric]),
                     cw.GraphWidget(left=[storage_metric])
                 ]])
def __init__(self, scope: core.Construct, id: str, stream_producer_lg,
             stream_pipe, py_stream_record_processor_fn,
             node_stream_record_processor_fn, **kwargs) -> None:
    """Build the "Stream-Processor" monitoring dashboard.

    Defines Kinesis shard-level metrics (ingestion, throttles, put/get
    latency and success), log-filter metrics for records produced and
    records processed by the Python and Node consumers, and assembles
    them into a CloudWatch dashboard.

    :param stream_producer_lg: log group of the record producer.
    :param stream_pipe: the Kinesis stream being monitored.
    :param py_stream_record_processor_fn: Python consumer Lambda.
    :param node_stream_record_processor_fn: Node consumer Lambda.
    """
    super().__init__(scope, id, **kwargs)

    ##### MONITORING ######

    ##################################################
    ##########        STREAM  METRICS        #########
    ##################################################

    # Shows you the ingestion rate into the shard.
    stream_in_bytes_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="IncomingBytes",
        dimensions={
            "StreamName": f"{stream_pipe.stream_name}"
        },
        label="IncomingBytes",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )
    stream_in_records_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="IncomingRecords",
        dimensions={
            "StreamName": f"{stream_pipe.stream_name}"
        },
        label="IncomingRecords",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )
    # Write/read throttling indicates provisioned throughput is exceeded.
    stream_w_throttle_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="WriteProvisionedThroughputExceeded",
        dimensions={
            "StreamName": f"{stream_pipe.stream_name}"
        },
        label="WriteProvisionedThroughputExceeded",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )
    stream_r_throttle_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="ReadProvisionedThroughputExceeded",
        dimensions={
            "StreamName": f"{stream_pipe.stream_name}"
        },
        label="ReadProvisionedThroughputExceeded",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )
    # Fix: label was the garbled "PutRecords.LatSuccessency"; it now
    # matches the metric name.
    stream_put_success_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="PutRecords.Success",
        dimensions={
            "StreamName": f"{stream_pipe.stream_name}"
        },
        label="PutRecords.Success",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )
    stream_put_latency_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="PutRecords.Latency",
        dimensions={
            "StreamName": f"{stream_pipe.stream_name}"
        },
        label="PutRecords.Latency",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )
    stream_get_latency_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="GetRecords.Latency",
        dimensions={
            "StreamName": f"{stream_pipe.stream_name}"
        },
        label="GetRecords.Latency",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )

    ##################################################
    ##########    STREAM PRODUCER METRICS    #########
    ##################################################
    # JSON Metric Filter - https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/FilterAndPatternSyntax.html
    records_produced_metric = _cloudwatch.Metric(
        namespace=f"{global_args.OWNER}-stream-data-processor",
        metric_name="recordsProducedCount",
        label="Total No. Of Records Produced",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )

    # Counts producer log lines that carry a "records_produced" field.
    records_produced_metric_filter = _logs.MetricFilter(self,
                                                        "recordsProducedCountFilter",
                                                        filter_pattern=_logs.FilterPattern.exists(
                                                            "$.records_produced"),
                                                        log_group=stream_producer_lg,
                                                        metric_namespace=records_produced_metric.namespace,
                                                        metric_name=records_produced_metric.metric_name,
                                                        default_value=0,
                                                        metric_value="$.records_produced",
                                                        )

    ##################################################
    ##########    STREAM CONSUMER METRICS    #########
    ##################################################
    py_records_processed_metric = _cloudwatch.Metric(
        namespace=f"{global_args.OWNER}-stream-data-processor",
        # dimensions={
        #     "RecordsProcessed": "py_processor"
        # },
        metric_name="pyRecordsProcessedCount",
        label="Total No. Of Records Processed",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )
    py_stream_record_processor = _logs.MetricFilter(self,
                                                    "processedRecordCountFilter01",
                                                    filter_pattern=_logs.FilterPattern.exists(
                                                        "$.records_processed"),
                                                    log_group=py_stream_record_processor_fn.log_group,
                                                    metric_namespace=py_records_processed_metric.namespace,
                                                    metric_name=py_records_processed_metric.metric_name,
                                                    default_value=0,
                                                    metric_value="$.records_processed",
                                                    )
    node_records_processed_metric = _cloudwatch.Metric(
        namespace=f"{global_args.OWNER}-stream-data-processor",
        metric_name="nodeRecordsProcessedCount",
        label="Total No. Of Records Processed",
        period=core.Duration.minutes(30),
        statistic="Sum"
    )
    node_stream_record_processor = _logs.MetricFilter(self,
                                                      "processedRecordCountFilter02",
                                                      filter_pattern=_logs.FilterPattern.exists(
                                                          "$.records_processed"),
                                                      log_group=node_stream_record_processor_fn.log_group,
                                                      metric_namespace=node_records_processed_metric.namespace,
                                                      metric_name=node_records_processed_metric.metric_name,
                                                      default_value=0,
                                                      metric_value="$.records_processed",
                                                      )

    # Create CloudWatch Dashboard for Streams
    stream_processor_dashboard = _cloudwatch.Dashboard(self,
                                                       id="streamProcessorDashboard",
                                                       dashboard_name="Stream-Processor"
                                                       )

    # Row 1: produced/processed record totals.
    stream_processor_dashboard.add_widgets(
        _cloudwatch.SingleValueWidget(
            title="TotalRecordsProduced",
            metrics=[records_produced_metric]
        ),
        _cloudwatch.SingleValueWidget(
            title="RecordsProcessed-by-Python-Consumer",
            metrics=[py_records_processed_metric]
        ),
        _cloudwatch.SingleValueWidget(
            title="RecordsProcessed-by-Node-Consumer",
            metrics=[node_records_processed_metric]
        )
    )

    # Row 2: shard ingestion and throttle graphs.
    stream_processor_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.GraphWidget(
                title="Shard Ingestion Metrics",
                left=[stream_in_bytes_metric],
                right=[stream_in_records_metric]
            ),
            _cloudwatch.GraphWidget(
                title="Shard Throttle Metrics",
                left=[stream_w_throttle_metric],
                right=[stream_r_throttle_metric]
            )
        )
    )

    # Row 3: put/get latency and put success.
    stream_processor_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.GraphWidget(
                title="Stream Put Latency",
                left=[stream_put_latency_metric]
            ),
            _cloudwatch.GraphWidget(
                title="Stream Get Latency",
                left=[stream_get_latency_metric]
            ),
            _cloudwatch.GraphWidget(
                title="Stream Put Success",
                left=[stream_put_success_metric]
            )
        )
    )

    ###########################################
    ################# OUTPUTS #################
    ###########################################
    # NOTE(review): "SecuirtyAutomationFrom" is misspelled, but it is the
    # CloudFormation logical ID of this output — renaming it would replace
    # the output on deployed stacks, so it is kept as-is.
    output_0 = core.CfnOutput(self,
                              "SecuirtyAutomationFrom",
                              value=f"{global_args.SOURCE_INFO}",
                              description="To know more about this automation stack, check out our github page."
                              )
def __init__(self, scope: core.Construct, id: str, distributed_locust: bool,
             target_url: str, **kwargs) -> None:
    """Locust load-generation stack.

    Provisions a dedicated VPC, an ECS cluster on spot capacity, a
    CloudWatch dashboard, and either a standalone Locust container or a
    master plus a slave fleet, depending on ``distributed_locust``.

    :param distributed_locust: run master + slaves when True, a single
        standalone container otherwise.
    :param target_url: URL the Locust containers will load-test.
    """
    super().__init__(scope, id, **kwargs)

    slave_count = 3

    # Dedicated VPC with one public and one private subnet tier.
    vpc = ec2.Vpc(self,
                  "loadgenvpc",
                  cidr="10.51.0.0/16",
                  subnet_configuration=[
                      {
                          "cidrMask": 24,
                          "name": "ecsvpc",
                          "subnetType": ec2.SubnetType.PUBLIC
                      },
                      {
                          "cidrMask": 24,
                          "name": "ecsprivatevpc",
                          "subnetType": ec2.SubnetType.PRIVATE,
                      },
                  ])

    # ECS cluster hosting the load generator on spot instances.
    cluster = ecs.Cluster(self, "Loadgen-Cluster", vpc=vpc)
    cluster.add_capacity(
        "AsgSpot",
        max_capacity=2,
        min_capacity=2,
        desired_capacity=2,
        instance_type=ec2.InstanceType("c5.large"),
        spot_price="0.07",
        # Enable the Automated Spot Draining support for Amazon ECS
        spot_instance_draining=True)

    # Service-discovery namespace so slaves can resolve the master via DNS.
    cluster.add_default_cloud_map_namespace(name="loadgen")

    # Dashboard tracking CPU/memory reservation of the cluster.
    reservation_widget = cw.GraphWidget(
        left=[cluster.metric_cpu_reservation()],
        right=[cluster.metric_memory_reservation()],
        title="ECS - CPU and Memory Reservation",
    )
    dashboard = cw.Dashboard(self, "Locustdashboard")
    dashboard.add_widgets(reservation_widget)

    if distributed_locust:
        # Master first; the slave fleet depends on it.
        master_construct = locustContainer(self, "locustmaster", vpc,
                                           cluster, "master", target_url)

        dashboard.add_widgets(
            cw.GraphWidget(
                left=[
                    master_construct.lb.metric_active_connection_count(),
                    master_construct.lb.metric_target_response_time()
                ],
                right=[master_construct.lb.metric_request_count()],
                title="Load Balancer"))

        slave_construct = locustContainer(self, "locustslave", vpc,
                                          cluster, "slave", target_url,
                                          slave_count)
        slave_construct.node.add_dependency(master_construct)
    else:
        locustContainer(self, "locuststandalone", vpc, cluster,
                        "standalone", target_url)
def graph_widget(title, *metrics):
    """Return a standard 8x6 CloudWatch graph widget plotting *metrics* on the left axis."""
    return aws_cloudwatch.GraphWidget(
        title=title,
        left=[*metrics],
        height=6,
        width=8,
    )
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
    """Deploy a sample Lambda function with a CloudWatch monitoring dashboard.

    The dashboard shows the function's invocation, error, duration and
    throttle metrics plus the last 20 log entries, and the stack exports
    the dashboard URL and function name as CloudFormation outputs.
    """
    super().__init__(scope, construct_id, **kwargs)
    example_dashboard_name = "ExampleLambdaDashboard"

    # Create Example Lambda function
    lambda_function = aws_lambda.Function(
        self,
        "lambda_function",
        runtime=aws_lambda.Runtime.PYTHON_3_7,
        handler="lambda-handler.main",
        code=aws_lambda.Code.from_asset("./lambda"))
    # Basic execution role policy so the function can write its own logs.
    lambda_function.role.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name(
            "service-role/AWSLambdaBasicExecutionRole"))

    # Create CloudWatch Dashboard to view Lambda Function Metrics
    cw_dashboard = aws_cloudwatch.Dashboard(
        self, "Lambda Dashboard", dashboard_name=example_dashboard_name)

    # CloudWatch Dashboard Title
    title_widget = aws_cloudwatch.TextWidget(
        markdown=f"# Dashboard: {lambda_function.function_name}",
        height=1,
        width=24)

    # Create Widgets for CloudWatch Dashboard based on the Lambda
    # Function's CloudWatch Metrics (one full-width row per metric).
    invocations_widget = aws_cloudwatch.GraphWidget(
        title="Invocations",
        left=[lambda_function.metric_invocations()],
        width=24)
    errors_widget = aws_cloudwatch.GraphWidget(
        title="Errors", left=[lambda_function.metric_errors()], width=24)
    duration_widget = aws_cloudwatch.GraphWidget(
        title="Duration", left=[lambda_function.metric_duration()], width=24)
    throttles_widget = aws_cloudwatch.GraphWidget(
        title="Throttles", left=[lambda_function.metric_throttles()], width=24)

    # Create Widget to show last 20 Log Entries
    log_widget = aws_cloudwatch.LogQueryWidget(
        log_group_names=[lambda_function.log_group.log_group_name],
        query_lines=[
            "fields @timestamp, @message", "sort @timestamp desc", "limit 20"
        ],
        width=24)

    # Add Widgets to CloudWatch Dashboard
    cw_dashboard.add_widgets(title_widget, invocations_widget, errors_widget,
                             duration_widget, throttles_widget, log_widget)

    # Output Dashboard URL (fix: local name was misspelled
    # "cloudwatch_dasboard_url"; also built with an f-string now)
    cloudwatch_dashboard_url = (
        f"https://{Aws.REGION}.console.aws.amazon.com/cloudwatch/home"
        f"?region={Aws.REGION}#dashboards:name={example_dashboard_name}")

    CfnOutput(self,
              "DashboardOutput",
              value=cloudwatch_dashboard_url,
              description="URL of Sample CloudWatch Dashboard",
              export_name="SampleCloudWatchDashboardURL")
    CfnOutput(self,
              "LambdaName",
              value=lambda_function.function_name,
              description="Name of the sample Lambda Function",
              export_name="LambdaName")
def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None:
    """Serverless S3 migration stack.

    Wires together SSM parameters (credentials / bucket list / ignore list),
    a DynamoDB job table, SQS job queue + DLQ, a worker and a jobsender
    Lambda, log-derived traffic metrics, a CloudWatch dashboard, and a DLQ
    alarm with email notification.

    Relies on module-level configuration globals such as
    ``ssm_parameter_credentials``, ``bucket_para``, ``ignore_list``,
    ``JobType``, ``alarm_email`` etc. (defined elsewhere in this module).
    """
    super().__init__(scope, _id, **kwargs)

    # Setup SSM parameter of credentials, bucket_para, ignore_list
    ssm_credential_para = ssm.StringParameter.from_secure_string_parameter_attributes(
        self,
        "ssm_parameter_credentials",
        parameter_name=ssm_parameter_credentials,
        version=1)

    ssm_bucket_para = ssm.StringParameter(self,
                                          "s3bucket_serverless",
                                          string_value=json.dumps(
                                              bucket_para, indent=4))

    ssm_parameter_ignore_list = ssm.StringParameter(
        self, "s3_migrate_ignore_list", string_value=ignore_list)

    # Setup DynamoDB: per-object transfer state, keyed by object Key
    ddb_file_list = ddb.Table(self,
                              "s3migrate_serverless",
                              partition_key=ddb.Attribute(
                                  name="Key", type=ddb.AttributeType.STRING),
                              billing_mode=ddb.BillingMode.PAY_PER_REQUEST)
    # GSI so the jobsender can query state per destination bucket
    ddb_file_list.add_global_secondary_index(
        partition_key=ddb.Attribute(name="desBucket",
                                    type=ddb.AttributeType.STRING),
        index_name="desBucket-index",
        projection_type=ddb.ProjectionType.INCLUDE,
        non_key_attributes=["desKey", "versionId"])

    # Setup SQS: job queue with DLQ after 60 failed receives
    sqs_queue_DLQ = sqs.Queue(self,
                              "s3migrate_serverless_Q_DLQ",
                              visibility_timeout=core.Duration.minutes(15),
                              retention_period=core.Duration.days(14))
    sqs_queue = sqs.Queue(self,
                          "s3migrate_serverless_Q",
                          visibility_timeout=core.Duration.minutes(15),
                          retention_period=core.Duration.days(14),
                          dead_letter_queue=sqs.DeadLetterQueue(
                              max_receive_count=60, queue=sqs_queue_DLQ))

    # Setup API for Lambda to get IP address (for debug networking routing purpose)
    checkip = api.RestApi(
        self,
        "lambda-checkip-api",
        cloud_watch_role=True,
        deploy=True,
        description="For Lambda get IP address",
        default_integration=api.MockIntegration(
            integration_responses=[
                api.IntegrationResponse(status_code="200",
                                        response_templates={
                                            "application/json":
                                            "$context.identity.sourceIp"
                                        })
            ],
            request_templates={"application/json": '{"statusCode": 200}'}),
        endpoint_types=[api.EndpointType.REGIONAL])
    checkip.root.add_method("GET",
                            method_responses=[
                                api.MethodResponse(
                                    status_code="200",
                                    response_models={
                                        "application/json":
                                        api.Model.EMPTY_MODEL
                                    })
                            ])

    # Setup Lambda functions: worker transfers objects pulled from SQS
    handler = lam.Function(self,
                           "s3-migrate-worker",
                           code=lam.Code.asset("./lambda"),
                           handler="lambda_function_worker.lambda_handler",
                           runtime=lam.Runtime.PYTHON_3_8,
                           memory_size=1024,
                           timeout=core.Duration.minutes(15),
                           tracing=lam.Tracing.ACTIVE,
                           environment={
                               'table_queue_name': ddb_file_list.table_name,
                               'Des_bucket_default': Des_bucket_default,
                               'Des_prefix_default': Des_prefix_default,
                               'StorageClass': StorageClass,
                               'checkip_url': checkip.url,
                               'ssm_parameter_credentials':
                               ssm_parameter_credentials,
                               'JobType': JobType,
                               'MaxRetry': MaxRetry,
                               'MaxThread': MaxThread,
                               'MaxParallelFile': MaxParallelFile,
                               'JobTimeout': JobTimeout,
                               'UpdateVersionId': UpdateVersionId,
                               'GetObjectWithVersionId': GetObjectWithVersionId
                           })

    # Jobsender compares source/destination buckets and enqueues jobs
    handler_jobsender = lam.Function(
        self,
        "s3-migrate-jobsender",
        code=lam.Code.asset("./lambda"),
        handler="lambda_function_jobsender.lambda_handler",
        runtime=lam.Runtime.PYTHON_3_8,
        memory_size=1024,
        timeout=core.Duration.minutes(15),
        tracing=lam.Tracing.ACTIVE,
        environment={
            'table_queue_name': ddb_file_list.table_name,
            'StorageClass': StorageClass,
            'checkip_url': checkip.url,
            'sqs_queue': sqs_queue.queue_name,
            'ssm_parameter_credentials': ssm_parameter_credentials,
            'ssm_parameter_ignore_list':
            ssm_parameter_ignore_list.parameter_name,
            'ssm_parameter_bucket': ssm_bucket_para.parameter_name,
            'JobType': JobType,
            'MaxRetry': MaxRetry,
            'JobsenderCompareVersionId': JobsenderCompareVersionId
        })

    # Allow lambda read/write DDB, SQS
    ddb_file_list.grant_read_write_data(handler)
    ddb_file_list.grant_read_write_data(handler_jobsender)
    sqs_queue.grant_send_messages(handler_jobsender)

    # SQS trigger Lambda worker, one job message at a time
    handler.add_event_source(SqsEventSource(sqs_queue, batch_size=1))

    # Option1: Create S3 Bucket, all new objects in this bucket will be
    # transmitted by Lambda Worker
    s3bucket = s3.Bucket(self, "s3_new_migrate")
    s3bucket.grant_read(handler)
    s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED,
                                    s3n.SqsDestination(sqs_queue))

    # Option2: Allow Exist S3 Buckets to be read by Lambda functions.
    # Lambda Jobsender will scan and compare these buckets and trigger
    # Lambda Workers to transmit
    bucket_name = ''
    for b in bucket_para:
        if bucket_name != b['src_bucket']:  # skip consecutive duplicate bucket entries
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(
                self,
                bucket_name,  # use the bucket name as the construct id
                bucket_name=bucket_name)
            if JobType == 'PUT':
                s3exist_bucket.grant_read(handler_jobsender)
                s3exist_bucket.grant_read(handler)
            else:  # 'GET' mode
                s3exist_bucket.grant_read_write(handler_jobsender)
                s3exist_bucket.grant_read_write(handler)

    # Allow Lambda read ssm parameters
    ssm_bucket_para.grant_read(handler_jobsender)
    ssm_credential_para.grant_read(handler)
    ssm_credential_para.grant_read(handler_jobsender)
    ssm_parameter_ignore_list.grant_read(handler_jobsender)

    # Schedule cron event to trigger Lambda Jobsender per hour:
    event.Rule(self,
               'cron_trigger_jobsender',
               schedule=event.Schedule.rate(core.Duration.hours(1)),
               targets=[target.LambdaFunction(handler_jobsender)])

    # TODO: Trigger event immediately, add custom resource lambda to invoke
    # handler_jobsender

    # Create Lambda logs filters to derive network traffic metrics from the
    # worker's structured log lines
    handler.log_group.add_metric_filter(
        "Completed-bytes",
        metric_name="Completed-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Complete", bytes, key]'))
    handler.log_group.add_metric_filter(
        "Uploading-bytes",
        metric_name="Uploading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Uploading", bytes, key]'))
    handler.log_group.add_metric_filter(
        "Downloading-bytes",
        metric_name="Downloading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Downloading", bytes, key]'))
    # Parse max memory used out of the Lambda REPORT line
    handler.log_group.add_metric_filter(
        "MaxMemoryUsed",
        metric_name="MaxMemoryUsed",
        metric_namespace="s3_migrate",
        metric_value="$memory",
        filter_pattern=logs.FilterPattern.literal(
            '[head="REPORT", a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, '
            'a13, a14, a15, a16, memory, MB="MB", rest]'))
    lambda_metric_Complete = cw.Metric(namespace="s3_migrate",
                                       metric_name="Completed-bytes",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
    lambda_metric_Upload = cw.Metric(namespace="s3_migrate",
                                     metric_name="Uploading-bytes",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
    lambda_metric_Download = cw.Metric(namespace="s3_migrate",
                                       metric_name="Downloading-bytes",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
    lambda_metric_MaxMemoryUsed = cw.Metric(
        namespace="s3_migrate",
        metric_name="MaxMemoryUsed",
        statistic="Maximum",
        period=core.Duration.minutes(1))

    # Count ERROR / WARNING / timeout lines in the worker logs
    handler.log_group.add_metric_filter(
        "ERROR",
        metric_name="ERROR-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"ERROR"'))
    handler.log_group.add_metric_filter(
        "WARNING",
        metric_name="WARNING-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"WARNING"'))
    # Task timed out
    handler.log_group.add_metric_filter(
        "TIMEOUT",
        metric_name="TIMEOUT-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"Task timed out"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))
    log_metric_TIMEOUT = cw.Metric(namespace="s3_migrate",
                                   metric_name="TIMEOUT-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    # Dashboard to monitor SQS and Lambda
    board = cw.Dashboard(self, "s3_migrate_serverless")
    board.add_widgets(
        cw.GraphWidget(title="Lambda-NETWORK",
                       left=[
                           lambda_metric_Download, lambda_metric_Upload,
                           lambda_metric_Complete
                       ]),
        cw.GraphWidget(title="Lambda-concurrent",
                       left=[
                           handler.metric(
                               metric_name="ConcurrentExecutions",
                               period=core.Duration.minutes(1))
                       ]),
        cw.GraphWidget(
            title="Lambda-invocations/errors/throttles",
            left=[
                handler.metric_invocations(
                    period=core.Duration.minutes(1)),
                handler.metric_errors(period=core.Duration.minutes(1)),
                handler.metric_throttles(period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(
            title="Lambda-duration",
            left=[
                handler.metric_duration(period=core.Duration.minutes(1))
            ]),
    )
    board.add_widgets(
        cw.GraphWidget(title="Lambda_MaxMemoryUsed(MB)",
                       left=[lambda_metric_MaxMemoryUsed]),
        cw.GraphWidget(title="ERROR/WARNING Logs",
                       left=[log_metric_ERROR],
                       right=[log_metric_WARNING, log_metric_TIMEOUT]),
        cw.GraphWidget(
            title="SQS-Jobs",
            left=[
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.
                metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.SingleValueWidget(
            title="Running/Waiting and Dead Jobs",
            metrics=[
                sqs_queue.
                metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.
                metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.
                metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1))
            ],
            height=6))

    # Alarm for queue - DLQ: fire as soon as any message lands in the DLQ
    alarm_DLQ = cw.Alarm(
        self,
        "SQS_DLQ",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
        ),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=1,
        datapoints_to_alarm=1)
    alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter")
    alarm_topic.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic))

    core.CfnOutput(self,
                   "Dashboard",
                   value="CloudWatch Dashboard name s3_migrate_serverless")
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    """Deploy a Lambda function with a log-derived custom error metric.

    Reads the function source from disk, deploys it inline, attaches a
    short-retention log group, extracts a custom error metric from the
    logs via a metric filter, alarms on it, and publishes the function
    and custom metrics to a CloudWatch dashboard.
    """
    super().__init__(scope, construct_id, **kwargs)

    # Import function code from the local source tree.
    # fix: previously the OSError was only printed and execution continued,
    # which later failed with an unrelated NameError on `function_body`;
    # re-raise so the deploy fails with the real cause.
    try:
        with open("serverless_stack/functions/metric_logs_generator.py",
                  mode="r") as file:
            function_body = file.read()
    except OSError:
        print('File can not read')
        raise

    # Lambda function deployed from the inline source read above
    function_01 = aws_lambda.Function(
        self,
        "lambdafunction01",
        function_name="LambdaTestCustomMEtric",
        runtime=aws_lambda.Runtime.PYTHON_3_6,
        handler="index.lambda_handler",
        code=aws_lambda.InlineCode(function_body),
        timeout=core.Duration.seconds(5),
        reserved_concurrent_executions=1,
        environment={
            'LOG_LEVEL': 'INFO',
            'PERCENTAGE_ERRORS': '75'
        })

    # Attached CloudWatch log group (explicit so retention/removal can be
    # controlled; name must match the Lambda's implicit log group)
    custom_metric_log_group01 = aws_logs.LogGroup(
        self,
        "cloudwatchlog01",
        log_group_name=f"/aws/lambda/{function_01.function_name}",
        removal_policy=core.RemovalPolicy.DESTROY,
        retention=aws_logs.RetentionDays.ONE_DAY)

    # Custom metric namespace/name for API errors
    # (fix: dropped extraneous f-prefix on a placeholder-free string)
    custom_metric_namespace01 = aws_cw.Metric(
        namespace="custom-error-metric",
        metric_name="custom-error-metric",
        label="Amount of Custom API errors",
        period=core.Duration.minutes(1),
        statistic="Sum")

    # Metric filter: emit 1 whenever a log event has custom_api_error == true
    custom_metric_filter01 = aws_logs.MetricFilter(
        self,
        "customMetricFilter",
        filter_pattern=aws_logs.FilterPattern.boolean_value(
            "$.custom_api_error", True),
        log_group=custom_metric_log_group01,
        metric_namespace=custom_metric_namespace01.namespace,
        metric_name=custom_metric_namespace01.metric_name,
        default_value=0,
        metric_value="1")

    # Alarm: >= 2 errors in 2 evaluation periods (1 datapoint to alarm)
    custom_metric_alarm01 = aws_cw.Alarm(
        self,
        "customMetricAlarm",
        alarm_description="Custom API errors",
        alarm_name="Custom-API-alarm",
        metric=custom_metric_namespace01,
        comparison_operator=aws_cw.ComparisonOperator.
        GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        threshold=2,
        evaluation_periods=2,
        datapoints_to_alarm=1,
        period=core.Duration.minutes(1),
        treat_missing_data=aws_cw.TreatMissingData.NOT_BREACHING)

    # CloudWatch dashboard
    custom_dashboard01 = aws_cw.Dashboard(
        self, id="CustomDashBoard", dashboard_name="CDK-custom-DashBoard")

    # Lambda metrics row
    custom_dashboard01.add_widgets(
        aws_cw.Row(
            aws_cw.GraphWidget(title="Lambda-invoke",
                               left=[
                                   function_01.metric_invocations(
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
                               ]),
            aws_cw.GraphWidget(title="Lambda-errors",
                               left=[
                                   function_01.metric_errors(
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
                               ])))

    # Custom API errors row
    custom_dashboard01.add_widgets(
        aws_cw.Row(
            aws_cw.SingleValueWidget(title="Custom-API-errors",
                                     metrics=[custom_metric_namespace01])))
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Deploy the Konstone app monitoring stack.

    Deploys an inline Lambda function, an explicit short-retention log
    group, a log-derived "third party API error" custom metric with an
    alarm, and a CloudWatch dashboard showing function and custom metrics.
    """
    super().__init__(scope, id, **kwargs)

    # Read Lambda code from the local source tree.
    # NOTE(review): on OSError this only prints; the subsequent
    # InlineCode(...) call would then fail with NameError — confirm intent.
    try:
        with open(
                "serverless_stacks/lambda_src/konstone_custom_metric_log_generator.py",
                mode="r") as f:
            konstone_custom_metric_fn_code = f.read()
    except OSError:
        print("Unable to read Lambda Function Code")

    konstone_custom_metric_fn = _lambda.Function(
        self,
        "konstoneFunction",
        function_name="konstone_custom_metric_fn",
        runtime=_lambda.Runtime.PYTHON_3_7,
        handler="index.lambda_handler",
        code=_lambda.InlineCode(konstone_custom_metric_fn_code),
        timeout=core.Duration.seconds(3),
        reserved_concurrent_executions=1,
        environment={
            "LOG_LEVEL": "INFO",
            "PERCENTAGE_ERRORS": "75"
        })

    # Create Custom Loggroup for the function
    # (name must match the Lambda's implicit group: /aws/lambda/function-name)
    konstone_custom_metric_lg = _logs.LogGroup(
        self,
        "konstoneLoggroup",
        log_group_name=f"/aws/lambda/{konstone_custom_metric_fn.function_name}",
        removal_policy=core.RemovalPolicy.DESTROY,
        retention=_logs.RetentionDays.ONE_DAY,
    )

    # Create Custom Metric Namespace for third-party API errors
    third_party_error_metric = _cloudwatch.Metric(
        namespace=f"third-party-error-metric",
        metric_name="third_party_error_metric",
        label="Total No. of Third Party API Errors",
        period=core.Duration.minutes(1),
        statistic="Sum")

    # Create Custom Metric Log Filter: emit 1 whenever a log event has
    # third_party_api_error == true
    third_party_error_metric_filter = _logs.MetricFilter(
        self,
        "thirdPartyApiErrorMetricFilter",
        filter_pattern=_logs.FilterPattern.boolean_value(
            "$.third_party_api_error", True),
        log_group=konstone_custom_metric_lg,
        metric_namespace=third_party_error_metric.namespace,
        metric_name=third_party_error_metric.metric_name,
        default_value=0,
        metric_value="1")

    # Create Third Party Error Alarm (>= 2 errors, 2 periods, 1 datapoint)
    third_party_error_alarm = _cloudwatch.Alarm(
        self,
        "thirdPartyApiErrorAlarm",
        alarm_description=
        "Alert if 3rd party API has more than 2 errors in the last two minutes",
        alarm_name="third-party-api-alarm",
        metric=third_party_error_metric,
        comparison_operator=_cloudwatch.ComparisonOperator.
        GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        threshold=2,
        evaluation_periods=2,
        datapoints_to_alarm=1,
        period=core.Duration.minutes(1),
        treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING)

    # Create CloudWatch Dashboard
    konstone_dashboard = _cloudwatch.Dashboard(
        self,
        id="konstoneDashboard",
        dashboard_name="Konstone-App-Live-Dashboard")

    # Add Lambda Function Metrics to Dashboard
    konstone_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.GraphWidget(title="Backend-Invocations",
                                    left=[
                                        konstone_custom_metric_fn.
                                        metric_invocations(
                                            statistic="Sum",
                                            period=core.Duration.minutes(1))
                                    ]),
            _cloudwatch.GraphWidget(title="Backend-Errors",
                                    left=[
                                        konstone_custom_metric_fn.
                                        metric_errors(
                                            statistic="Sum",
                                            period=core.Duration.minutes(1))
                                    ])))

    # Add 3rd Party API Error to Dashboard
    konstone_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.SingleValueWidget(
                title="Third Party API Errors",
                metrics=[third_party_error_metric])))
def __init__(self, scope: core.Construct, id: str, wiki_api_endpoint,
             **kwargs) -> None:
    """Polyglot strangler-fig service stack with X-Ray tracing.

    Deploys a traced Lambda (with X-Ray and requests layers), a DynamoDB
    query cache table, an API Gateway front-end, an error alarm wired to
    an SNS support topic, and a CloudWatch dashboard for the service team.

    :param wiki_api_endpoint: upstream wiki API URL passed to the Lambda
        via the WIKI_API_ENDPOINT environment variable.
    """
    super().__init__(scope, id, **kwargs)

    # DynamodDB Table (TODO: Create re-usable data model):
    queries_table = _dynamodb.Table(
        self,
        "queriesDataTable",
        partition_key=_dynamodb.Attribute(
            name="_id", type=_dynamodb.AttributeType.STRING))

    # Create AWS XRay Layer
    aws_xray_layer = _lambda.LayerVersion(
        self,
        'awsXrayLayer',
        code=_lambda.Code.from_asset(
            'lambda_src/layer_code/aws_xray_python_37.zip'),
        compatible_runtimes=[
            _lambda.Runtime.PYTHON_3_7, _lambda.Runtime.PYTHON_3_8
        ],
        license=
        f'Mystique LambdaLayer of AWS XRay, Refer to AWS for license.',
        description='Layer to trace AWS Lamba Calls')

    # Create Requests Layer
    requests_layer = _lambda.LayerVersion(
        self,
        'requestsLayer',
        code=_lambda.Code.from_asset(
            'lambda_src/layer_code/requests_python_37.zip'),
        compatible_runtimes=[
            _lambda.Runtime.PYTHON_3_7, _lambda.Runtime.PYTHON_3_8
        ],
        description='Python requests Layer to make HTTP calls')

    # Defines an AWS Lambda resource from local inline source
    with open("lambda_src/polyglot_strangler_fig_svc.py",
              encoding="utf8") as fp:
        polyglot_svc_fn_handler_code = fp.read()

    polyglot_svc_fn = _lambda.Function(
        self,
        id='polyglotStranglerFigService',
        function_name="polyglot_svc_fn",
        runtime=_lambda.Runtime.PYTHON_3_7,
        code=_lambda.InlineCode(polyglot_svc_fn_handler_code),
        handler='index.lambda_handler',
        timeout=core.Duration.seconds(59),
        environment={
            'LD_LIBRARY_PATH': '/opt/python',
            'WIKI_API_ENDPOINT': wiki_api_endpoint,
            'DDB_TABLE_NAME': queries_table.table_name,
            'TRIGGER_RANDOM_FAILURES': 'True'
        },
        layers=[aws_xray_layer, requests_layer],
        tracing=_lambda.Tracing.ACTIVE)

    # Grant Lambda permissions to write to Dynamodb
    queries_table.grant_read_write_data(polyglot_svc_fn)

    ##### PUBLISH TO API GW ######
    # Enable AWS XRay Tracing at API GW
    polyglot_svc_api_stage_options = _apigw.StageOptions(
        stage_name="myst",
        logging_level=_apigw.MethodLoggingLevel.INFO,
        data_trace_enabled=True,
        metrics_enabled=True,
        tracing_enabled=True)

    # Create API Gateway
    api_01 = _apigw.RestApi(self,
                            'polglotApiEndpoint',
                            rest_api_name='mystique-xray-tracer-api',
                            deploy_options=polyglot_svc_api_stage_options)

    v1 = api_01.root.add_resource("polyglot_svc")

    # Add resource for HTTP Endpoint: API Hosted on EC2
    polyglot_svc_api_resource_00 = v1.add_resource('wiki')
    # /polyglot_svc/wiki/{query} — path parameter forwarded to the Lambda
    self.polyglot_svc_api_resource_01 = polyglot_svc_api_resource_00.add_resource(
        '{query}')

    polyglot_svc_api_lambda_integration = _apigw.LambdaIntegration(
        handler=polyglot_svc_fn,
        proxy=True,
        integration_responses=[{
            "statusCode": "200"
        }],
        request_parameters={
            "integration.request.path.query": "method.request.path.query"
        })

    self.polyglot_svc_api_resource_01.add_method(
        http_method="GET",
        integration=polyglot_svc_api_lambda_integration,
        method_responses=[{
            "statusCode": "200"
        }],
        request_parameters={
            'method.request.header.Content-Type': False,
            'method.request.path.query': True
        })

    ##### MONITORING ######
    # Now let us create alarms for our Lambda Function
    # metric_errors(): How many invocations of this Lambda fail.
    # Alarm fires when errors exceed the threshold (10) in 2 of the
    # evaluation periods (1-minute periods).
    # https://docs.aws.amazon.com/cdk/api/latest/python/aws_cdk.aws_lambda/Function.html
    polyglot_svc_fn_error_alarm = polyglot_svc_fn.metric_errors(
    ).create_alarm(
        self,
        "polglotSvcAlarm",
        alarm_name="polyglot_svc_fn_error_alarm",
        threshold=10,
        evaluation_periods=2,
        comparison_operator=_cloudwatch.ComparisonOperator.
        GREATER_THAN_THRESHOLD,
        period=core.Duration.minutes(1),
        treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING)

    # SNS For Alerts for Polyglot Service
    polyglot_svc_support_topic = _sns.Topic(
        self,
        "polyglotSvcTopic",
        display_name="PolyglotSvc",
        topic_name="polyglotSvcSupportTopic")

    # Subscribe Polyglot Service Team Email to topic
    for email in global_args.POLYGLOT_SUPPORT_EMAIL:
        polyglot_svc_support_topic.add_subscription(
            _subs.EmailSubscription(email_address=email))
    # polyglot_svc_support_topic.add_subscription(
    #     _subs.EmailSubscription(global_args.POLYGLOT_SUPPORT_EMAIL))

    # Add the topic to the Alarm
    polyglot_svc_fn_error_alarm.add_alarm_action(
        _cw_actions.SnsAction(polyglot_svc_support_topic))

    # Create CloudWatch Dashboard for Polyglot Service Team
    polyglot_svc_dashboard = _cloudwatch.Dashboard(
        self, id="polyglotSvcDashboard", dashboard_name="Polyglot-Svc")

    polyglot_svc_fn_invocation_metric = polyglot_svc_fn.metric_invocations(
        label="Invocations",
        period=core.Duration.minutes(1),
        statistic="Sum")

    polyglot_svc_dashboard.add_widgets(
        _cloudwatch.AlarmWidget(title="Lambda-Errors",
                                alarm=polyglot_svc_fn_error_alarm))

    polyglot_svc_dashboard.add_widgets(
        # Lambda Metrics
        # TODO: this monitors concurrency of all Lambdas, not just this one —
        # a limitation of CDK at the time of writing; switch to per-function
        # concurrency once CDK supports it.
        _cloudwatch.GraphWidget(
            title="Lambda-all-concurrent",
            left=[
                polyglot_svc_fn.metric_all_concurrent_executions(
                    statistic="Sum",
                    period=core.Duration.minutes(1),
                    color=_cloudwatch.Color.GREEN)
            ]),
        _cloudwatch.GraphWidget(
            title="Lambda-invocations/errors/throttles",
            left=[
                polyglot_svc_fn.metric_invocations(
                    statistic="Sum", period=core.Duration.minutes(1)),
                polyglot_svc_fn.metric_errors(
                    statistic="Sum",
                    period=core.Duration.minutes(1),
                    color=_cloudwatch.Color.RED),
                polyglot_svc_fn.metric_throttles(
                    statistic="Sum",
                    period=core.Duration.minutes(1),
                    color=_cloudwatch.Color.ORANGE)
            ]),
        _cloudwatch.GraphWidget(title="Lambda-duration",
                                left=[
                                    polyglot_svc_fn.metric_duration(
                                        statistic="Average",
                                        period=core.Duration.minutes(1))
                                ]),
        # _cloudwatch.Row(_cloudwatch.TextWidget(markdown="# XRay Profiler KPI")),
        # _cloudwatch.Row(_cloudwatch.Spacer()),
        # DynamoDB Metrics
        _cloudwatch.Row(
            _cloudwatch.GraphWidget(
                title="DynamoDB-Write-Capacity-Units",
                left=[
                    queries_table.metric_consumed_write_capacity_units(
                        statistic="Sum", period=core.Duration.minutes(1))
                ]),
            _cloudwatch.GraphWidget(
                title="DynamoDB-Read-Capacity-Units",
                left=[
                    queries_table.metric_consumed_read_capacity_units(
                        statistic="Sum", period=core.Duration.minutes(1))
                ])),
    )

    ###########################################
    ################# OUTPUTS #################
    ###########################################
    output_0 = core.CfnOutput(
        self,
        "AutomationFrom",
        value=f"{global_args.SOURCE_INFO}",
        description=
        "To know more about this automation stack, check out our github page."
    )
    output_1 = core.CfnOutput(
        self,
        'PolyglotServiceApiUrl',
        value=f'{self.polyglot_svc_api_resource_01.url}',
        description=
        f'Call the polyglot API, replace <query> with your search term')