Example #1
def delete_alarms_for_campaign(campaign_arn):
    cw = get_client(service_name='cloudwatch',
                    region_name=extract_region(campaign_arn))

    alarm_names_to_delete = set()

    alarms_paginator = cw.get_paginator('describe_alarms')
    for alarms_page in alarms_paginator.paginate(
            AlarmNamePrefix=ALARM_NAME_PREFIX, AlarmTypes=['MetricAlarm']):
        for alarm in alarms_page['MetricAlarms']:
            for dim in alarm['Dimensions']:
                if dim['Name'] == 'CampaignArn' and dim['Value'] == campaign_arn:
                    tags_response = cw.list_tags_for_resource(
                        ResourceARN=alarm['AlarmArn'])

                    for tag in tags_response['Tags']:
                        if tag['Key'] == 'CreatedBy' and tag['Value'] == PROJECT_NAME:
                            alarm_names_to_delete.add(alarm['AlarmName'])
                            break

    if alarm_names_to_delete:
        # FUTURE: delete_alarms takes at most 100 alarm names per call; add chunking.
        logger.info('Deleting CloudWatch alarms for campaign %s: %s',
                    campaign_arn, alarm_names_to_delete)
        cw.delete_alarms(AlarmNames=list(alarm_names_to_delete))
    else:
        logger.info('No CloudWatch alarms to delete for campaign %s',
                    campaign_arn)
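
A hedged sketch of the chunked delete that the FUTURE notes in these examples
allude to; the 100-name batch size is taken from those comments, not verified here:

def delete_alarms_chunked(cw, alarm_names, batch_size=100):
    # delete_alarms caps how many alarm names a single call accepts,
    # so issue one call per batch of at most batch_size names.
    names = list(alarm_names)
    for i in range(0, len(names), batch_size):
        cw.delete_alarms(AlarmNames=names[i:i + batch_size])
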
def get_campaign_recipe_arn(campaign):
    recipe_arn = campaign.get('recipeArn')
    if not recipe_arn:
        campaign_region = extract_region(campaign['campaignArn'])
        personalize = get_client('personalize', campaign_region)

        response = personalize.describe_solution_version(
            solutionVersionArn=campaign['solutionVersionArn'])

        recipe_arn = response['solutionVersion']['recipeArn']
        campaign['recipeArn'] = recipe_arn

    return recipe_arn
def get_campaign_sum_requests_datapoints(campaign, start_time, end_time, period):
    campaign_region = extract_region(campaign['campaignArn'])
    cw = get_client(service_name='cloudwatch', region_name=campaign_region)

    metric_name = get_campaign_inference_metric_name(campaign)

    response = cw.get_metric_data(
        MetricDataQueries=[
            {
                'Id': 'm1',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/Personalize',
                        'MetricName': metric_name,
                        'Dimensions': [
                            {
                                'Name': 'CampaignArn',
                                'Value': campaign['campaignArn']
                            }
                        ]
                    },
                    'Period': period,
                    'Stat': 'Sum'
                },
                'ReturnData': True
            }
        ],
        StartTime=start_time,
        EndTime=end_time,
        ScanBy='TimestampDescending'
    )

    datapoints = []

    if response.get('MetricDataResults'):
        results = response['MetricDataResults'][0]

        for ts, value in zip(results['Timestamps'], results['Values']):
            datapoints.append({
                'Timestamp': ts,
                'Value': value
            })

    return datapoints
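
A minimal usage sketch: assuming a campaign dict shaped like the ones used
elsewhere in these examples, sum requests over the last 24 hours in hourly buckets.

import datetime

# 'campaign' is assumed to be a dict with a valid 'campaignArn' key.
end_time = datetime.datetime.now(datetime.timezone.utc).replace(
    minute=0, second=0, microsecond=0)
start_time = end_time - datetime.timedelta(hours=24)

# One datapoint per hour; 'Value' is the summed request count for that hour.
datapoints = get_campaign_sum_requests_datapoints(
    campaign, start_time, end_time, period=3600)
total_requests = sum(dp['Value'] for dp in datapoints)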
Example #4
def lambda_handler(event, context):
    ''' Initiates the delete of a Personalize campaign '''
    if event.get('detail'):
        campaign_arn = event['detail']['CampaignARN']
        reason = event['detail']['Reason']
    else:
        campaign_arn = event['CampaignARN']
        reason = event.get('Reason')

    region = extract_region(campaign_arn)
    if not region:
        raise Exception('Region could not be extracted from campaign_arn')

    personalize = get_client(service_name='personalize', region_name=region)

    response = personalize.delete_campaign(campaignArn=campaign_arn)

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(response, indent=2, default=str))

    if not reason:
        reason = f'Amazon Personalize campaign {campaign_arn} deletion initiated (reason unspecified)'

    put_event(detail_type='PersonalizeCampaignDeleted',
              detail=json.dumps({
                  'CampaignARN': campaign_arn,
                  'Reason': reason
              }),
              resources=[campaign_arn])

    put_event(detail_type='BuildPersonalizeMonitorDashboard',
              detail=json.dumps({
                  'CampaignARN': campaign_arn,
                  'Reason': reason
              }),
              resources=[campaign_arn])

    logger.info({'campaignArn': campaign_arn})

    delete_alarms_for_campaign(campaign_arn)

    return f'Successfully initiated delete of campaign {campaign_arn}'
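
For reference, hedged examples of the two event shapes this handler accepts
(field names come from the code above; ARN and reason values are illustrative):

# 1. EventBridge-style event with a 'detail' envelope:
event_from_eventbridge = {
    'detail': {
        'CampaignARN': 'arn:aws:personalize:us-east-1:111111111111:campaign/my-campaign',
        'Reason': 'Campaign idle beyond threshold'
    }
}

# 2. Direct invocation with top-level fields ('Reason' is optional here):
event_direct = {
    'CampaignARN': 'arn:aws:personalize:us-east-1:111111111111:campaign/my-campaign'
}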
def delete_resource(event, _):
    campaign_arns = determine_campaign_arns(event.get('ResourceProperties'))

    logger.debug('Campaigns to check for resources to delete: %s',
                 campaign_arns)

    regions = set()

    for campaign_arn in campaign_arns:
        regions.add(extract_region(campaign_arn))

    logger.debug('Regions to check for resources to delete: %s', regions)

    alarms_deleted = 0

    for region in regions:
        cw = get_client(service_name='cloudwatch', region_name=region)

        alarm_names_to_delete = set()

        alarms_paginator = cw.get_paginator('describe_alarms')
        for alarms_page in alarms_paginator.paginate(
                AlarmNamePrefix=ALARM_NAME_PREFIX, AlarmTypes=['MetricAlarm']):
            for alarm in alarms_page['MetricAlarms']:
                tags_response = cw.list_tags_for_resource(
                    ResourceARN=alarm['AlarmArn'])

                for tag in tags_response['Tags']:
                    if tag['Key'] == 'CreatedBy' and tag['Value'] == PROJECT_NAME:
                        alarm_names_to_delete.add(alarm['AlarmName'])
                        break

        if alarm_names_to_delete:
            # FUTURE: delete_alarms takes at most 100 alarm names per call; add chunking.
            logger.info(
                'Deleting CloudWatch alarms in %s for campaigns %s: %s',
                region, campaign_arns, alarm_names_to_delete)
            cw.delete_alarms(AlarmNames=list(alarm_names_to_delete))
            alarms_deleted += len(alarm_names_to_delete)

    logger.info('Deleted %d alarms', alarms_deleted)
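
put_event is called throughout these examples but never shown; here is a
minimal sketch over the EventBridge put_events API, where the
'personalize.monitor' Source value is an assumption, not taken from this code:

import boto3

events = boto3.client('events')

def put_event(detail_type, detail, resources=None):
    # 'Source' here is a hypothetical value; the real helper may also set
    # EventBusName or batch multiple entries per call.
    events.put_events(Entries=[{
        'Source': 'personalize.monitor',
        'DetailType': detail_type,
        'Detail': detail,
        'Resources': resources or []
    }])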
Example #6
def lambda_handler(event, context):
    ''' Updates the minProvisionedTPS value for an existing Personalize campaign '''
    if event.get('detail'):
        campaign_arn = event['detail']['CampaignARN']
        min_tps = event['detail']['MinProvisionedTPS']
        reason = event['detail']['Reason']
    else:
        campaign_arn = event['CampaignARN']
        min_tps = event['MinProvisionedTPS']
        reason = event.get('Reason')

    if min_tps < 1:
        raise ValueError('"MinProvisionedTPS" must be >= 1')

    region = extract_region(campaign_arn)
    if not region:
        raise Exception('Region could not be extracted from campaign_arn')

    personalize = get_client(service_name='personalize', region_name=region)

    response = personalize.update_campaign(campaignArn=campaign_arn,
                                           minProvisionedTPS=min_tps)

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(response, indent=2, default=str))

    if not reason:
        reason = f'Amazon Personalize campaign {campaign_arn} minProvisionedTPS update to {min_tps} initiated (reason unspecified)'

    put_event(detail_type='PersonalizeCampaignMinProvisionedTPSUpdated',
              detail=json.dumps({
                  'CampaignARN': campaign_arn,
                  'NewMinProvisionedTPS': min_tps,
                  'Reason': reason
              }),
              resources=[campaign_arn])

    logger.info({'campaignArn': campaign_arn, 'minProvisionedTPS': min_tps})

    return f'Successfully initiated update of minProvisionedTPS to {min_tps} for campaign {campaign_arn}'
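
A hedged example of a direct-invocation event for this handler (values are
illustrative; EventBridge invocations wrap the same fields in 'detail'):

event = {
    'CampaignARN': 'arn:aws:personalize:us-east-1:111111111111:campaign/my-campaign',
    'MinProvisionedTPS': 5,
    'Reason': 'Manual step down after sustained low traffic'
}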
def lambda_handler(event, context):
    # Flag values may arrive in the event (as strings or booleans) or in the
    # environment (as strings); normalize both through the same string check
    # so that event values like 'false' or 'no' are not treated as truthy.
    def parse_flag(name, default):
        value = event.get(name)
        if value is None:
            value = os.environ.get(name, default)
        return str(value).lower() in ['true', 'yes', '1']

    auto_create_utilization_alarms = parse_flag('AutoCreateCampaignUtilizationAlarms', 'yes')

    utilization_threshold_lower_bound = float(
        event.get('CampaignThresholdAlarmLowerBound')
        or os.environ.get('CampaignThresholdAlarmLowerBound', '100.0'))

    auto_create_idle_alarms = parse_flag('AutoCreateIdleCampaignAlarms', 'yes')

    auto_delete_idle_campaigns = parse_flag('AutoDeleteIdleCampaigns', 'false')

    idle_campaign_threshold_hours = int(
        event.get('IdleCampaignThresholdHours')
        or os.environ.get('IdleCampaignThresholdHours', '24'))

    if idle_campaign_threshold_hours < MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS:
        raise ValueError(f'"IdleCampaignThresholdHours" must be >= {MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS} hours')

    auto_adjust_campaign_tps = parse_flag('AutoAdjustCampaignMinProvisionedTPS', 'yes')

    campaigns = get_configured_active_campaigns(event)
    
    logger.info('Retrieving minProvisionedTPS for %d active campaigns', len(campaigns))

    current_region = os.environ['AWS_REGION']
    
    metric_datas_by_region = {}

    append_metric(metric_datas_by_region, current_region, {
        'MetricName': 'monitoredCampaignCount',
        'Value': len(campaigns),
        'Unit': 'Count'
    })
    
    campaign_metrics_written = 0
    all_metrics_written = 0
    alarms_created = 0

    # Define our 5 minute window, ensuring it's on prior 5 minute boundary.
    end_time = datetime.datetime.now(datetime.timezone.utc)
    end_time = end_time.replace(microsecond=0, second=0, minute=end_time.minute - end_time.minute % 5)
    start_time = end_time - datetime.timedelta(minutes=5)
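    # Example: a run at 12:37:45 UTC yields end_time = 12:35:00 and
    # start_time = 12:30:00, i.e. the most recently completed 5 minute bucket.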

    for campaign in campaigns:
        campaign_arn = campaign['campaignArn']
        campaign_region = extract_region(campaign_arn)

        min_provisioned_tps = campaign['minProvisionedTPS']
        
        append_metric(metric_datas_by_region, campaign_region, {
            'MetricName': 'minProvisionedTPS',
            'Dimensions': [
                {
                    'Name': 'CampaignArn',
                    'Value': campaign_arn
                }
            ],
            'Value': min_provisioned_tps,
            'Unit': 'Count/Second'
        })
        
        tps = get_campaign_average_tps(campaign, start_time, end_time)
        utilization = 0

        if tps:
            append_metric(metric_datas_by_region, campaign_region, {
                'MetricName': 'averageTPS',
                'Dimensions': [
                    {
                        'Name': 'CampaignArn',
                        'Value': campaign_arn
                    }
                ],
                'Value': tps,
                'Unit': 'Count/Second'
            })
            
            utilization = tps / min_provisioned_tps * 100
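            # e.g. averageTPS 2.5 against minProvisionedTPS 10 -> 25% utilization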

        append_metric(metric_datas_by_region, campaign_region, {
            'MetricName': 'campaignUtilization',
            'Dimensions': [
                {
                    'Name': 'CampaignArn',
                    'Value': campaign_arn
                }
            ],
            'Value': utilization,
            'Unit': 'Percent'
        })
            
        logger.debug(
            'Campaign %s has current minProvisionedTPS of %d and actual TPS of %s yielding %.2f%% utilization', 
            campaign_arn, min_provisioned_tps, tps, utilization
        )
        campaign_metrics_written += 1

        # Only do idle campaign and minProvisionedTPS adjustment checks once per hour for each campaign.
        perform_hourly_checks_this_run = perform_hourly_checks(campaign_arn)

        # Determine how old the campaign is and time since last update.
        campaign_age_hours = get_campaign_age_hours(campaign)
        campaign_update_age_hours = get_campaign_last_update_age_hours(campaign)

        campaign_delete_event_fired = False

        if utilization == 0 and perform_hourly_checks_this_run and auto_delete_idle_campaigns:
            # Campaign is currently idle. Let's see if it's old enough and not being updated recently.
            logger.info(
                'Performing idle delete check for campaign %s; campaign is %d hours old; last updated %s hours ago', 
                campaign_arn, campaign_age_hours, campaign_update_age_hours
            )

            if (campaign_age_hours >= idle_campaign_threshold_hours):

                # Campaign has been around long enough. Let's see how long it's been idle.
                end_time_idle_check = datetime.datetime.now(datetime.timezone.utc)
                start_time_idle_check = end_time_idle_check - datetime.timedelta(hours = idle_campaign_threshold_hours)
                period_idle_check = idle_campaign_threshold_hours * 60 * 60

                total_requests = get_campaign_total_requests(campaign, start_time_idle_check, end_time_idle_check, period_idle_check)

                if total_requests == 0:
                    if is_campaign_updatable(campaign):
                        reason = f'Campaign {campaign_arn} has been idle for at least {idle_campaign_threshold_hours} hours so initiating delete according to configuration.'

                        logger.info(reason)

                        put_event(
                            detail_type = 'DeletePersonalizeCampaign',
                            detail = json.dumps({
                                'CampaignARN': campaign_arn,
                                'CampaignUtilization': utilization,
                                'CampaignAgeHours': campaign_age_hours,
                                'IdleCampaignThresholdHours': idle_campaign_threshold_hours,
                                'TotalRequestsDuringIdleThresholdHours': total_requests,
                                'Reason': reason
                            }),
                            resources = [ campaign_arn ]
                        )

                        campaign_delete_event_fired = True
                    else:
                        logger.warning(
                            'Campaign %s has been idle for at least %d hours but its status will not allow it to be deleted on this run', 
                            campaign_arn, idle_campaign_threshold_hours
                        )
                else:
                    logger.warning(
                        'Campaign %s is currently idle but has had %d requests within the last %d hours so does not meet idle criteria for auto-deletion', 
                        campaign_arn, total_requests, idle_campaign_threshold_hours
                    )
            else:
                logger.info(
                    'Campaign %s is only %d hours old and last update %s hours old; too new to consider for auto-deletion', 
                    campaign_arn, campaign_age_hours, campaign_update_age_hours
                )

        if (not campaign_delete_event_fired and 
                perform_hourly_checks_this_run and 
                auto_adjust_campaign_tps and 
                min_provisioned_tps > 1):

            days_back = 14
            end_time_tps_check = datetime.datetime.now(datetime.timezone.utc).replace(minute=0, second=0, microsecond=0)
            start_time_tps_check = end_time_tps_check - datetime.timedelta(days = days_back)

            datapoints = get_campaign_sum_requests_by_hour(campaign, start_time_tps_check, end_time_tps_check)
            min_reqs = sys.maxsize
            max_reqs = total_reqs = total_avg_tps = min_avg_tps = max_avg_tps = 0

            for datapoint in datapoints:
                total_reqs += datapoint['Value']
                min_reqs = min(min_reqs, datapoint['Value'])
                max_reqs = max(max_reqs, datapoint['Value'])

            if len(datapoints) > 0:
                total_avg_tps = total_reqs / (len(datapoints) * 3600)
                min_avg_tps = int(min_reqs / 3600)
                max_avg_tps = int(max_reqs / 3600)
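                # e.g. 7200 requests in one hourly datapoint -> 7200 / 3600 = 2 TPS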

            logger.info(
                'Performing minProvisionedTPS adjustment check for campaign %s; min/max/avg hourly TPS over last %d days for %d datapoints: %d/%d/%.2f', 
                campaign_arn, days_back, len(datapoints), min_avg_tps, max_avg_tps, total_avg_tps
            )

            min_age_to_update_hours = 24

            age_eligible = True

            if campaign_age_hours < min_age_to_update_hours:
                logger.info(
                    'Campaign %s is less than %d hours old so not eligible for minProvisionedTPS adjustment yet', 
                    campaign_arn, min_age_to_update_hours
                )
                age_eligible = False

            if age_eligible and min_avg_tps < min_provisioned_tps:
                # Incrementally step minProvisionedTPS down by 25%, never below 1.
                new_min_tps = max(1, int(math.floor(min_provisioned_tps * 0.75)))
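                # e.g. successive eligible runs step 10 -> 7 -> 5 -> 3 -> 2 -> 1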

                if is_campaign_updatable(campaign):
                    reason = f'Step down adjustment of minProvisionedTPS for campaign {campaign_arn} from {min_provisioned_tps} to {new_min_tps} based on average hourly TPS low watermark of {min_avg_tps} over last {days_back} days'
                    logger.info(reason)

                    put_event(
                        detail_type = 'UpdatePersonalizeCampaignMinProvisionedTPS',
                        detail = json.dumps({
                            'CampaignARN': campaign_arn,
                            'CampaignUtilization': utilization,
                            'CampaignAgeHours': campaign_age_hours,
                            'CurrentProvisionedTPS': min_provisioned_tps,
                            'MinProvisionedTPS': new_min_tps,
                            'MinAverageTPS': min_avg_tps,
                            'MaxAverageTPS': max_avg_tps,
                            'Datapoints': datapoints,
                            'Reason': reason
                        }, default = str),
                        resources = [ campaign_arn ]
                    )
                else:
                    logger.warning(
                        'Campaign %s could have its minProvisionedTPS adjusted down from %d to %d based on average hourly TPS low watermark over last %d days but its status will not allow it to be updated on this run', 
                        campaign_arn, min_provisioned_tps, new_min_tps, days_back
                    )

        if not campaign_delete_event_fired:
            if auto_create_utilization_alarms:
                if create_utilization_alarm(campaign_region, campaign, utilization_threshold_lower_bound):
                    alarms_created += 1

            if auto_create_idle_alarms:
                if create_idle_campaign_alarm(campaign_region, campaign, idle_campaign_threshold_hours):
                    alarms_created += 1

    for region, metric_datas in metric_datas_by_region.items():
        cw = get_client(service_name = 'cloudwatch', region_name = region)

        metric_datas_chunks = divide_chunks(metric_datas, MAX_METRICS_PER_CALL)

        for chunk in metric_datas_chunks:
            put_metrics(cw, chunk)
            all_metrics_written += len(chunk)

    outcome = f'Logged {all_metrics_written} TPS utilization metrics for {campaign_metrics_written} active campaigns; {alarms_created} alarms created'
    logger.info(outcome)

    if alarms_created > 0:
        # At least one new alarm was created so that likely means new campaigns were created too. Let's trigger the dashboard to be rebuilt.
        logger.info('Triggering rebuild of the CloudWatch dashboard since %d new alarm(s) were created', alarms_created)
        put_event(
            detail_type = 'BuildPersonalizeMonitorDashboard',
            detail = json.dumps({
                'Reason': f'Triggered rebuild due to {alarms_created} new alarm(s) being created'
            })
        )

    return outcome
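
divide_chunks, append_metric, and put_metrics are referenced above but not
shown in these examples; below are minimal sketches whose names and behavior
are inferred from the call sites, so treat the details as assumptions:

def divide_chunks(items, chunk_size):
    # Yield successive chunk_size-sized slices of items.
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]

def append_metric(metric_datas_by_region, region, metric_data):
    # Group MetricData entries by region so each region's metrics can be
    # written through a single regional CloudWatch client.
    metric_datas_by_region.setdefault(region, []).append(metric_data)

def put_metrics(cw, metric_datas):
    # put_metric_data accepts a bounded number of metrics per call, which is
    # why callers chunk by MAX_METRICS_PER_CALL first. The 'PersonalizeMonitor'
    # namespace matches the dashboard code below.
    cw.put_metric_data(Namespace='PersonalizeMonitor', MetricData=metric_datas)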
def build_dashboard(event):
    # Will hold the data used to render the template.
    template_data = {}

    template_data['namespace'] = 'PersonalizeMonitor'
    template_data['current_region'] = os.environ['AWS_REGION']

    logger.debug('Loading active campaigns')

    campaigns = get_configured_active_campaigns(event)
    template_data['active_campaign_count'] = len(campaigns)

    # Group campaigns by dataset group so we can create DSG specific widgets in rows
    campaigns_by_dsg_arn = {}
    # Holds DSG info so we only have describe once per DSG
    dsgs_by_arn = {}

    for campaign in campaigns:
        logger.info('Campaign %s will be added to the dashboard',
                    campaign['campaignArn'])

        campaign_region = extract_region(campaign['campaignArn'])

        personalize = get_client('personalize', campaign_region)

        response = personalize.describe_solution_version(
            solutionVersionArn=campaign['solutionVersionArn'])

        dsg_arn = response['solutionVersion']['datasetGroupArn']
        recipe_arn = response['solutionVersion']['recipeArn']

        dsg = dsgs_by_arn.get(dsg_arn)
        if not dsg:
            response = personalize.describe_dataset_group(
                datasetGroupArn=dsg_arn)
            dsg = response['datasetGroup']
            dsgs_by_arn[dsg_arn] = dsg

        campaign_datas = campaigns_by_dsg_arn.get(dsg_arn)
        if not campaign_datas:
            campaign_datas = []
            campaigns_by_dsg_arn[dsg_arn] = campaign_datas

        campaign_data = {
            'name': campaign['name'],
            'campaign_arn': campaign['campaignArn'],
            'region': campaign_region
        }

        if recipe_arn == 'arn:aws:personalize:::recipe/aws-personalized-ranking':
            campaign_data['campaign_latency_metric_name'] = 'GetPersonalizedRankingLatency'
        else:
            campaign_data['campaign_latency_metric_name'] = 'GetRecommendationsLatency'

        campaign_datas.append(campaign_data)

    dsgs_for_template = []

    for dsg_arn, campaign_datas in campaigns_by_dsg_arn.items():
        dsg = dsgs_by_arn[dsg_arn]

        # Minor hack so the template knows when it is on the last item in the list.
        campaign_datas[-1]['last_campaign'] = True

        dsgs_for_template.append({
            'name': dsg['name'],
            'region': extract_region(dsg_arn),
            'account_id': extract_account_id(dsg_arn),
            'campaigns': campaign_datas
        })

    template_data['dataset_groups'] = dsgs_for_template

    # Render template and use as dashboard body.
    with open('dashboard-template.mustache', 'r') as f:
        dashboard = chevron.render(f, template_data)

    # chevron.render already returns the dashboard body as a JSON string,
    # so log it as-is rather than re-encoding it with json.dumps.
    logger.debug(dashboard)

    logger.info('Adding/updating dashboard')

    cloudwatch.put_dashboard(DashboardName=DASHBOARD_NAME,
                             DashboardBody=dashboard)
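
Finally, the extract_region and extract_account_id helpers used throughout
these examples are not shown either; since standard ARNs have the form
arn:partition:service:region:account-id:resource, a minimal sketch is:

def extract_region(arn):
    # The region is the fourth colon-delimited field of an ARN.
    parts = arn.split(':')
    return parts[3] if len(parts) > 3 else None

def extract_account_id(arn):
    # The account id is the fifth colon-delimited field of an ARN.
    parts = arn.split(':')
    return parts[4] if len(parts) > 4 else None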