def delete_alarms_for_campaign(campaign_arn):
    ''' Deletes the CloudWatch alarms created by this project for a single campaign '''
    cw = get_client(service_name='cloudwatch', region_name=extract_region(campaign_arn))

    alarm_names_to_delete = set()

    alarms_paginator = cw.get_paginator('describe_alarms')
    for alarms_page in alarms_paginator.paginate(AlarmNamePrefix=ALARM_NAME_PREFIX, AlarmTypes=['MetricAlarm']):
        for alarm in alarms_page['MetricAlarms']:
            for dim in alarm['Dimensions']:
                if dim['Name'] == 'CampaignArn' and dim['Value'] == campaign_arn:
                    # Only delete alarms created by this project, as marked by the 'CreatedBy' tag.
                    tags_response = cw.list_tags_for_resource(ResourceARN=alarm['AlarmArn'])
                    for tag in tags_response['Tags']:
                        if tag['Key'] == 'CreatedBy' and tag['Value'] == PROJECT_NAME:
                            alarm_names_to_delete.add(alarm['AlarmName'])
                            break

    if alarm_names_to_delete:
        # FUTURE: max check of 100 (DeleteAlarms accepts at most 100 alarm names per call)
        logger.info('Deleting CloudWatch alarms for campaign %s: %s', campaign_arn, alarm_names_to_delete)
        cw.delete_alarms(AlarmNames=list(alarm_names_to_delete))
    else:
        logger.info('No CloudWatch alarms to delete for campaign %s', campaign_arn)
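# `extract_region` is defined elsewhere in this project. A minimal sketch, assuming
# standard ARN formatting ('arn:partition:service:region:account-id:resource'), might
# look like the following (illustrative only, not the project's actual implementation):
def extract_region(arn):
    ''' Returns the region portion of an ARN, or None if it cannot be parsed '''
    parts = arn.split(':')
    # A well-formed ARN has at least 6 colon-delimited fields; region is the 4th.
    return parts[3] if len(parts) >= 6 and parts[3] else None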
def get_campaign_recipe_arn(campaign):
    ''' Resolves the recipe ARN for a campaign via its solution version '''
    recipe_arn = campaign.get('recipeArn')
    if not recipe_arn:
        campaign_region = extract_region(campaign['campaignArn'])
        personalize = get_client('personalize', campaign_region)

        response = personalize.describe_solution_version(solutionVersionArn=campaign['solutionVersionArn'])
        recipe_arn = response['solutionVersion']['recipeArn']
        # Cache on the campaign dict so subsequent lookups skip the describe call.
        campaign['recipeArn'] = recipe_arn

    return recipe_arn
def get_campaign_sum_requests_datapoints(campaign, start_time, end_time, period):
    campaign_region = extract_region(campaign['campaignArn'])
    cw = get_client(service_name='cloudwatch', region_name=campaign_region)

    metric_name = get_campaign_inference_metric_name(campaign)

    response = cw.get_metric_data(
        MetricDataQueries=[
            {
                'Id': 'm1',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/Personalize',
                        'MetricName': metric_name,
                        'Dimensions': [
                            {
                                'Name': 'CampaignArn',
                                'Value': campaign['campaignArn']
                            }
                        ]
                    },
                    'Period': period,
                    'Stat': 'Sum'
                },
                'ReturnData': True
            }
        ],
        StartTime=start_time,
        EndTime=end_time,
        ScanBy='TimestampDescending'
    )

    datapoints = []

    if response.get('MetricDataResults'):
        results = response['MetricDataResults'][0]
        for idx, ts in enumerate(results['Timestamps']):
            datapoints.append({
                'Timestamp': ts,
                'Value': results['Values'][idx]
            })

    return datapoints
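# Illustrative only: one way the helper above might be called to total a campaign's
# requests over a 24-hour window, mirroring the idle check in the monitor handler.
# A single 24-hour period yields at most one datapoint to sum. Assumes this module's
# existing `datetime` import.
def example_total_requests_last_24_hours(campaign):
    end_time = datetime.datetime.now(datetime.timezone.utc)
    start_time = end_time - datetime.timedelta(hours=24)
    datapoints = get_campaign_sum_requests_datapoints(campaign, start_time, end_time, period=24 * 60 * 60)
    return sum(dp['Value'] for dp in datapoints)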
def lambda_handler(event, context):
    ''' Initiates the delete of a Personalize campaign '''
    if event.get('detail'):
        # Invoked via an EventBridge rule.
        campaign_arn = event['detail']['CampaignARN']
        reason = event['detail']['Reason']
    else:
        # Invoked directly.
        campaign_arn = event['CampaignARN']
        reason = event.get('Reason')

    region = extract_region(campaign_arn)
    if not region:
        raise Exception('Region could not be extracted from campaign_arn')

    personalize = get_client(service_name='personalize', region_name=region)

    response = personalize.delete_campaign(campaignArn=campaign_arn)

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(response, indent=2, default=str))

    if not reason:
        reason = f'Amazon Personalize campaign {campaign_arn} deletion initiated (reason unspecified)'

    put_event(
        detail_type='PersonalizeCampaignDeleted',
        detail=json.dumps({
            'CampaignARN': campaign_arn,
            'Reason': reason
        }),
        resources=[campaign_arn]
    )

    put_event(
        detail_type='BuildPersonalizeMonitorDashboard',
        detail=json.dumps({
            'CampaignARN': campaign_arn,
            'Reason': reason
        }),
        resources=[campaign_arn]
    )

    logger.info({'campaignArn': campaign_arn})

    delete_alarms_for_campaign(campaign_arn)

    return f'Successfully initiated delete of campaign {campaign_arn}'
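# The handler above accepts two event shapes (field names taken from the code);
# the ARN value here is illustrative:
#
#   EventBridge:    {"detail": {"CampaignARN": "arn:aws:personalize:us-east-1:123456789012:campaign/my-campaign",
#                               "Reason": "Idle for 24 hours"}}
#   Direct invoke:  {"CampaignARN": "arn:aws:personalize:us-east-1:123456789012:campaign/my-campaign"}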
def delete_resource(event, _):
    campaign_arns = determine_campaign_arns(event.get('ResourceProperties'))
    logger.debug('Campaigns to check for resources to delete: %s', campaign_arns)

    regions = set()
    for campaign_arn in campaign_arns:
        regions.add(extract_region(campaign_arn))

    logger.debug('Regions to check for resources to delete: %s', regions)

    alarms_deleted = 0

    for region in regions:
        cw = get_client(service_name='cloudwatch', region_name=region)

        alarm_names_to_delete = set()

        alarms_paginator = cw.get_paginator('describe_alarms')
        for alarms_page in alarms_paginator.paginate(AlarmNamePrefix=ALARM_NAME_PREFIX, AlarmTypes=['MetricAlarm']):
            for alarm in alarms_page['MetricAlarms']:
                # Unlike the per-campaign delete, this teardown removes every alarm in the
                # region created by this project, regardless of campaign.
                tags_response = cw.list_tags_for_resource(ResourceARN=alarm['AlarmArn'])
                for tag in tags_response['Tags']:
                    if tag['Key'] == 'CreatedBy' and tag['Value'] == PROJECT_NAME:
                        alarm_names_to_delete.add(alarm['AlarmName'])
                        break

        if alarm_names_to_delete:
            # FUTURE: max check of 100 (DeleteAlarms accepts at most 100 alarm names per call)
            logger.info('Deleting CloudWatch alarms in %s for campaigns %s: %s', region, campaign_arns, alarm_names_to_delete)
            cw.delete_alarms(AlarmNames=list(alarm_names_to_delete))
            alarms_deleted += len(alarm_names_to_delete)

    logger.info('Deleted %d alarms', alarms_deleted)
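# `delete_resource` has the shape of a CloudFormation custom resource handler, so a
# Delete event would carry the stack's configuration in ResourceProperties. The key
# name below is hypothetical (the real key depends on `determine_campaign_arns`,
# defined elsewhere); RequestType and ResourceProperties are standard custom
# resource fields:
#
#   {"RequestType": "Delete",
#    "ResourceProperties": {"CampaignARNs": "arn:aws:personalize:us-east-1:123456789012:campaign/my-campaign"}}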
def lambda_handler(event, context):
    ''' Updates the minProvisionedTPS value for an existing Personalize campaign '''
    if event.get('detail'):
        # Invoked via an EventBridge rule.
        campaign_arn = event['detail']['CampaignARN']
        min_tps = event['detail']['MinProvisionedTPS']
        reason = event['detail']['Reason']
    else:
        # Invoked directly.
        campaign_arn = event['CampaignARN']
        min_tps = event['MinProvisionedTPS']
        reason = event.get('Reason')

    if min_tps < 1:
        raise ValueError('"MinProvisionedTPS" must be >= 1')

    region = extract_region(campaign_arn)
    if not region:
        raise Exception('Region could not be extracted from campaign_arn')

    personalize = get_client(service_name='personalize', region_name=region)

    response = personalize.update_campaign(campaignArn=campaign_arn, minProvisionedTPS=min_tps)

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(response, indent=2, default=str))

    if not reason:
        reason = f'Amazon Personalize campaign {campaign_arn} minProvisionedTPS update initiated (reason unspecified)'

    put_event(
        detail_type='PersonalizeCampaignMinProvisionedTPSUpdated',
        detail=json.dumps({
            'CampaignARN': campaign_arn,
            'NewMinProvisionedTPS': min_tps,
            'Reason': reason
        }),
        resources=[campaign_arn]
    )

    logger.info({'campaignArn': campaign_arn, 'minProvisionedTPS': min_tps})

    return f'Successfully initiated update of minProvisionedTPS to {min_tps} for campaign {campaign_arn}'
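# A direct-invocation event for the handler above might look like (field names
# from the code; the ARN and TPS values are illustrative):
#
#   {"CampaignARN": "arn:aws:personalize:us-east-1:123456789012:campaign/my-campaign",
#    "MinProvisionedTPS": 5,
#    "Reason": "Scaling down after peak traffic"}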
def lambda_handler(event, context):
    auto_create_utilization_alarms = event.get('AutoCreateCampaignUtilizationAlarms')
    if not auto_create_utilization_alarms:
        auto_create_utilization_alarms = os.environ.get('AutoCreateCampaignUtilizationAlarms', 'yes').lower() in ['true', 'yes', '1']

    utilization_threshold_lower_bound = event.get('CampaignThresholdAlarmLowerBound')
    if not utilization_threshold_lower_bound:
        utilization_threshold_lower_bound = float(os.environ.get('CampaignThresholdAlarmLowerBound', '100.0'))

    auto_create_idle_alarms = event.get('AutoCreateIdleCampaignAlarms')
    if not auto_create_idle_alarms:
        auto_create_idle_alarms = os.environ.get('AutoCreateIdleCampaignAlarms', 'yes').lower() in ['true', 'yes', '1']

    auto_delete_idle_campaigns = event.get('AutoDeleteIdleCampaigns')
    if not auto_delete_idle_campaigns:
        auto_delete_idle_campaigns = os.environ.get('AutoDeleteIdleCampaigns', 'false').lower() in ['true', 'yes', '1']

    idle_campaign_threshold_hours = event.get('IdleCampaignThresholdHours')
    if not idle_campaign_threshold_hours:
        idle_campaign_threshold_hours = int(os.environ.get('IdleCampaignThresholdHours', '24'))

    if idle_campaign_threshold_hours < MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS:
        raise ValueError(f'"IdleCampaignThresholdHours" must be >= {MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS} hours')

    auto_adjust_campaign_tps = event.get('AutoAdjustCampaignMinProvisionedTPS')
    if not auto_adjust_campaign_tps:
        auto_adjust_campaign_tps = os.environ.get('AutoAdjustCampaignMinProvisionedTPS', 'yes').lower() in ['true', 'yes', '1']

    campaigns = get_configured_active_campaigns(event)

    logger.info('Retrieving minProvisionedTPS for %d active campaigns', len(campaigns))

    current_region = os.environ['AWS_REGION']

    metric_datas_by_region = {}

    append_metric(metric_datas_by_region, current_region, {
        'MetricName': 'monitoredCampaignCount',
        'Value': len(campaigns),
        'Unit': 'Count'
    })

    campaign_metrics_written = 0
    all_metrics_written = 0
    alarms_created = 0

    # Define our 5 minute window, ensuring it's on prior 5 minute boundary.
    end_time = datetime.datetime.now(datetime.timezone.utc)
    end_time = end_time.replace(microsecond=0, second=0, minute=end_time.minute - end_time.minute % 5)
    start_time = end_time - datetime.timedelta(minutes=5)

    for campaign in campaigns:
        campaign_arn = campaign['campaignArn']
        campaign_region = extract_region(campaign_arn)
        min_provisioned_tps = campaign['minProvisionedTPS']

        append_metric(metric_datas_by_region, campaign_region, {
            'MetricName': 'minProvisionedTPS',
            'Dimensions': [{'Name': 'CampaignArn', 'Value': campaign_arn}],
            'Value': min_provisioned_tps,
            'Unit': 'Count/Second'
        })

        tps = get_campaign_average_tps(campaign, start_time, end_time)

        utilization = 0

        if tps:
            append_metric(metric_datas_by_region, campaign_region, {
                'MetricName': 'averageTPS',
                'Dimensions': [{'Name': 'CampaignArn', 'Value': campaign_arn}],
                'Value': tps,
                'Unit': 'Count/Second'
            })

            utilization = tps / min_provisioned_tps * 100

            append_metric(metric_datas_by_region, campaign_region, {
                'MetricName': 'campaignUtilization',
                'Dimensions': [{'Name': 'CampaignArn', 'Value': campaign_arn}],
                'Value': utilization,
                'Unit': 'Percent'
            })

            logger.debug(
                'Campaign %s has current minProvisionedTPS of %d and actual TPS of %s yielding %.2f%% utilization',
                campaign_arn, min_provisioned_tps, tps, utilization
            )

        campaign_metrics_written += 1

        # Only do idle campaign and minProvisionedTPS adjustment checks once per hour for each campaign.
        perform_hourly_checks_this_run = perform_hourly_checks(campaign_arn)

        # Determine how old the campaign is and time since last update.
        campaign_age_hours = get_campaign_age_hours(campaign)
        campaign_update_age_hours = get_campaign_last_update_age_hours(campaign)

        campaign_delete_event_fired = False

        if utilization == 0 and perform_hourly_checks_this_run and auto_delete_idle_campaigns:
            # Campaign is currently idle. Let's see if it's old enough and not being updated recently.
            logger.info(
                'Performing idle delete check for campaign %s; campaign is %d hours old; last updated %s hours ago',
                campaign_arn, campaign_age_hours, campaign_update_age_hours
            )

            if campaign_age_hours >= idle_campaign_threshold_hours:
                # Campaign has been around long enough. Let's see how long it's been idle.
                end_time_idle_check = datetime.datetime.now(datetime.timezone.utc)
                start_time_idle_check = end_time_idle_check - datetime.timedelta(hours=idle_campaign_threshold_hours)
                period_idle_check = idle_campaign_threshold_hours * 60 * 60

                total_requests = get_campaign_total_requests(campaign, start_time_idle_check, end_time_idle_check, period_idle_check)

                if total_requests == 0:
                    if is_campaign_updatable(campaign):
                        reason = f'Campaign {campaign_arn} has been idle for at least {idle_campaign_threshold_hours} hours so initiating delete according to configuration.'
                        logger.info(reason)

                        put_event(
                            detail_type='DeletePersonalizeCampaign',
                            detail=json.dumps({
                                'CampaignARN': campaign_arn,
                                'CampaignUtilization': utilization,
                                'CampaignAgeHours': campaign_age_hours,
                                'IdleCampaignThresholdHours': idle_campaign_threshold_hours,
                                'TotalRequestsDuringIdleThresholdHours': total_requests,
                                'Reason': reason
                            }),
                            resources=[campaign_arn]
                        )

                        campaign_delete_event_fired = True
                    else:
                        logger.warning(
                            'Campaign %s has been idle for at least %d hours but its status will not allow it to be deleted on this run',
                            campaign_arn, idle_campaign_threshold_hours
                        )
                else:
                    logger.warning(
                        'Campaign %s is currently idle but has had %d requests within the last %d hours so does not meet idle criteria for auto-deletion',
                        campaign_arn, total_requests, idle_campaign_threshold_hours
                    )
            else:
                logger.info(
                    'Campaign %s is only %d hours old and last update %s hours old; too new to consider for auto-deletion',
                    campaign_arn, campaign_age_hours, campaign_update_age_hours
                )

        if (not campaign_delete_event_fired and perform_hourly_checks_this_run and
                auto_adjust_campaign_tps and min_provisioned_tps > 1):
            days_back = 14
            end_time_tps_check = datetime.datetime.now(datetime.timezone.utc).replace(minute=0, second=0, microsecond=0)
            start_time_tps_check = end_time_tps_check - datetime.timedelta(days=days_back)

            datapoints = get_campaign_sum_requests_by_hour(campaign, start_time_tps_check, end_time_tps_check)

            min_reqs = sys.maxsize
            max_reqs = total_reqs = total_avg_tps = min_avg_tps = max_avg_tps = 0

            for datapoint in datapoints:
                total_reqs += datapoint['Value']
                min_reqs = min(min_reqs, datapoint['Value'])
                max_reqs = max(max_reqs, datapoint['Value'])

            if len(datapoints) > 0:
                total_avg_tps = int(total_reqs / (len(datapoints) * 3600))
                min_avg_tps = int(min_reqs / 3600)
                max_avg_tps = int(max_reqs / 3600)

            logger.info(
                'Performing minProvisionedTPS adjustment check for campaign %s; min/max/avg hourly TPS over last %d days for %d datapoints: %d/%d/%.2f',
                campaign_arn, days_back, len(datapoints), min_avg_tps, max_avg_tps, total_avg_tps
            )

            min_age_to_update_hours = 24
            age_eligible = True
            if campaign_age_hours < min_age_to_update_hours:
                logger.info(
                    'Campaign %s is less than %d hours old so not eligible for minProvisionedTPS adjustment yet',
                    campaign_arn, min_age_to_update_hours
                )
                age_eligible = False

            if age_eligible and min_avg_tps < min_provisioned_tps:
                # Incrementally drop minProvisionedTPS.
                new_min_tps = max(1, int(math.floor(min_provisioned_tps * .75)))

                if is_campaign_updatable(campaign):
                    reason = f'Step down adjustment of minProvisionedTPS for campaign {campaign_arn} down from {min_provisioned_tps} to {new_min_tps} based on average hourly TPS low watermark of {min_avg_tps} over last {days_back} days'
                    logger.info(reason)

                    put_event(
                        detail_type='UpdatePersonalizeCampaignMinProvisionedTPS',
                        detail=json.dumps({
                            'CampaignARN': campaign_arn,
                            'CampaignUtilization': utilization,
                            'CampaignAgeHours': campaign_age_hours,
                            'CurrentProvisionedTPS': min_provisioned_tps,
                            'MinProvisionedTPS': new_min_tps,
                            'MinAverageTPS': min_avg_tps,
                            'MaxAverageTPS': max_avg_tps,
                            'Datapoints': datapoints,
                            'Reason': reason
                        }, default=str),
                        resources=[campaign_arn]
                    )
                else:
                    logger.warning(
                        'Campaign %s could have its minProvisionedTPS adjusted down from %d to %d based on average hourly TPS low watermark over last %d days but its status will not allow it to be updated on this run',
                        campaign_arn, min_provisioned_tps, new_min_tps, days_back
                    )

        if not campaign_delete_event_fired:
            if auto_create_utilization_alarms:
                if create_utilization_alarm(campaign_region, campaign, utilization_threshold_lower_bound):
                    alarms_created += 1

            if auto_create_idle_alarms:
                if create_idle_campaign_alarm(campaign_region, campaign, idle_campaign_threshold_hours):
                    alarms_created += 1

    for region, metric_datas in metric_datas_by_region.items():
        cw = get_client(service_name='cloudwatch', region_name=region)
        # CloudWatch PutMetricData limits how many metrics can be sent per call, so send in chunks.
        metric_datas_chunks = divide_chunks(metric_datas, MAX_METRICS_PER_CALL)
        for metric_datas_chunk in metric_datas_chunks:
            put_metrics(cw, metric_datas_chunk)
            all_metrics_written += len(metric_datas_chunk)

    outcome = f'Logged {all_metrics_written} TPS utilization metrics for {campaign_metrics_written} active campaigns; {alarms_created} alarms created'
    logger.info(outcome)

    if alarms_created > 0:
        # At least one new alarm was created so that likely means new campaigns were created too.
        # Let's trigger the dashboard to be rebuilt.
        logger.info('Triggering rebuild of the CloudWatch dashboard since %d new alarm(s) were created', alarms_created)
        put_event(
            detail_type='BuildPersonalizeMonitorDashboard',
            detail=json.dumps({
                'Reason': f'Triggered rebuild due to {alarms_created} new alarm(s) being created'
            })
        )

    return outcome
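# `divide_chunks` and `put_metrics` are helpers defined elsewhere in this project.
# A minimal sketch of a chunking helper like the one assumed above:
def divide_chunks(items, chunk_size):
    ''' Yields successive chunk_size-sized slices from items '''
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]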
def build_dashboard(event):
    # Will hold the data used to render the template.
    template_data = {}
    template_data['namespace'] = 'PersonalizeMonitor'
    template_data['current_region'] = os.environ['AWS_REGION']

    logger.debug('Loading active campaigns')
    campaigns = get_configured_active_campaigns(event)
    template_data['active_campaign_count'] = len(campaigns)

    # Group campaigns by dataset group so we can create DSG specific widgets in rows.
    campaigns_by_dsg_arn = {}
    # Holds DSG info so we only have to describe once per DSG.
    dsgs_by_arn = {}

    for campaign in campaigns:
        logger.info('Campaign %s will be added to the dashboard', campaign['campaignArn'])

        campaign_region = extract_region(campaign['campaignArn'])
        personalize = get_client('personalize', campaign_region)

        response = personalize.describe_solution_version(solutionVersionArn=campaign['solutionVersionArn'])
        dsg_arn = response['solutionVersion']['datasetGroupArn']
        recipe_arn = response['solutionVersion']['recipeArn']

        dsg = dsgs_by_arn.get(dsg_arn)
        if not dsg:
            response = personalize.describe_dataset_group(datasetGroupArn=dsg_arn)
            dsg = response['datasetGroup']
            dsgs_by_arn[dsg_arn] = dsg

        campaign_datas = campaigns_by_dsg_arn.get(dsg_arn)
        if not campaign_datas:
            campaign_datas = []
            campaigns_by_dsg_arn[dsg_arn] = campaign_datas

        campaign_data = {
            'name': campaign['name'],
            'campaign_arn': campaign['campaignArn'],
            'region': campaign_region
        }

        if recipe_arn == 'arn:aws:personalize:::recipe/aws-personalized-ranking':
            campaign_data['campaign_latency_metric_name'] = 'GetPersonalizedRankingLatency'
        else:
            campaign_data['campaign_latency_metric_name'] = 'GetRecommendationsLatency'

        campaign_datas.append(campaign_data)

    dsgs_for_template = []

    for dsg_arn, campaign_datas in campaigns_by_dsg_arn.items():
        dsg = dsgs_by_arn[dsg_arn]
        # Minor hack to know when we're on the last item in list when iterating in template.
        campaign_datas[-1]['last_campaign'] = True

        dsgs_for_template.append({
            'name': dsg['name'],
            'region': extract_region(dsg_arn),
            'account_id': extract_account_id(dsg_arn),
            'campaigns': campaign_datas
        })

    template_data['dataset_groups'] = dsgs_for_template

    # Render template and use as dashboard body.
    with open('dashboard-template.mustache', 'r') as f:
        dashboard = chevron.render(f, template_data)

    logger.debug(json.dumps(dashboard, indent=2, default=str))

    logger.info('Adding/updating dashboard')
    cloudwatch.put_dashboard(DashboardName=DASHBOARD_NAME, DashboardBody=dashboard)
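# The 'last_campaign' flag exists because Mustache templates cannot detect the final
# iteration of a list on their own. A hypothetical fragment of dashboard-template.mustache
# (illustrative; not the project's actual template) could use an inverted section to
# omit the trailing comma in the dashboard's JSON body:
#
#   {{#campaigns}}
#     { "value": "{{campaign_arn}}" }{{^last_campaign}},{{/last_campaign}}
#   {{/campaigns}}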