Пример #1
0
    def configure_dashboard(self):
        """Create/refresh or delete the default CloudWatch dashboard.

        The CloudWatch dashboard API is billed per call, so this method is
        rate limited: it does nothing unless either the
        'cloudwatch.dashboard.use_default' setting changed since the previous
        run, or at least 'cloudwatch.dashboard.update_interval' seconds
        elapsed since the last recorded dashboard action.

        When 'cloudwatch.dashboard.use_default' is 1, the dashboard body
        returned by `self.load_dashboard()` is pushed with `put_dashboard`;
        for any other value the dashboard is deleted (best effort).

        Returns:
            None
        """
        client = self.context["cloudwatch.client"]
        now = self.context["now"]

        # Remember the current setting so that the next run can detect a change.
        # The previous value is read *before* it gets overwritten.
        dashboard_state = Cfg.get_int("cloudwatch.dashboard.use_default")
        dashboard_last_state = self.ec2.get_state(
            "cloudwatch.dashboard.use_default.last_state")
        self.ec2.set_state("cloudwatch.dashboard.use_default.last_state",
                           dashboard_state,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

        last_dashboard_action = self.ec2.get_state_date(
            "cloudwatch.dashboard.last_action", default=misc.epoch())
        dashboard_update_interval = Cfg.get_duration_secs(
            "cloudwatch.dashboard.update_interval")
        # The stored state is compared as a string against the integer setting.
        state_unchanged = str(dashboard_state) == dashboard_last_state
        elapsed_secs = (now - last_dashboard_action).total_seconds()
        if state_unchanged and elapsed_secs < dashboard_update_interval:
            log.debug("Not yet the time to manage the dashboard.")
            return

        if dashboard_state != 1:
            try:
                client.delete_dashboards(
                    DashboardNames=[self._get_dashboard_name()])
            except Exception as e:
                # Deletion is best effort (the dashboard may simply not
                # exist): never fail the run, but keep a trace of the cause.
                log.debug("Failed to delete CloudWatch dashboard: %s" % e)
        else:
            content = self.load_dashboard()
            dashboard_name = self._get_dashboard_name()
            log.log(log.NOTICE,
                    "Configuring CloudWatch dashboard '%s'..." % dashboard_name)
            client.put_dashboard(DashboardName=dashboard_name,
                                 DashboardBody=content)

        # Record the action date to enforce the update interval on next runs.
        self.ec2.set_state("cloudwatch.dashboard.last_action",
                           now,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
Пример #2
0
    def get_prerequisites(self):
        """Load alarm definitions, existing alarms and their metrics.

        Performed steps, in order:
          1. Read every 'cloudwatch.alarmNN.configuration_url' setting and
             store the result in `self.alarm_definitions` (keyed by the
             two-digit index). Definitions whose URL starts with 'alarmname:'
             reference an alarm by name; others have their content downloaded.
          2. List existing CloudWatch alarms and keep in `self.alarms` the ones
             known to `self.get_alarm_configuration_by_name()`.
          3. Warn about 'alarmname:' definitions with no matching alarm.
          4. Build and send GetMetricData queries for a subset of alarms and
             for burstable instances (CPUCreditBalance).
          5. Merge fresh results with the previous metric cache, persist the
             cache and the query counter, and attach metrics to alarm data
             and burstable instance records.

        Returns:
            None
        """
        now = self.context["now"]
        client = self.context["cloudwatch.client"]

        # Read all CloudWatch alarm templates into memory
        alarm_definitions = {}
        for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
            key = "cloudwatch.alarm%02d.configuration_url" % (i)
            r = Cfg.get_extended(key)
            if not r["Success"] or r["Value"] == "":
                continue

            d = misc.parse_line_as_list_of_dict(r["Value"])
            url = d[0]["_"]
            meta = d[0]

            index = "%02d" % i
            alarm_defs = {
                "Index": index,
                "Key": key,
                "Url": url,
                "Definition": r,
                "Metadata": meta
            }

            prefix = "alarmname:"
            if url.startswith(prefix):
                # Reference to an alarm by its name: nothing to download
                alarm_defs["AlarmName"] = url[len(prefix):]
            else:
                log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
                try:
                    resp = misc.get_url(url.format(**self.context))
                    if resp is None:
                        raise Exception("URL content = <None>")
                    alarm_defs["Content"] = str(resp, "utf-8")
                except Exception as e:
                    # '%s' (not '%e', the float exponent conversion) must be
                    # used to render the exception; otherwise the formatting
                    # itself raises TypeError and aborts the whole method.
                    log.exception("Failed to load Alarm definition '%s' : %s" %
                                  (r["Value"], e))
                    continue
            alarm_definitions[index] = alarm_defs

        self.alarm_definitions = alarm_definitions

        # Read all existing CloudWatch alarms (paginated)
        alarms = []
        response = None
        while (response is None or "NextToken" in response):
            response = client.describe_alarms(MaxRecords=Cfg.get_int(
                "cloudwatch.describe_alarms.max_results"),
                                              NextToken=response["NextToken"]
                                              if response is not None else "")
            for alarm in response["MetricAlarms"]:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is not None:
                    # This alarm is known to the alarm configuration: keep it
                    alarms.append(alarm)
        self.alarms = alarms

        # Sanity check: each 'alarmname:' definition should match an existing alarm
        for alarm_def in self.alarm_definitions.values():
            if "AlarmName" not in alarm_def:
                continue
            alarm = next(
                filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"],
                       self.alarms), None)
            if alarm is None:
                log.warning(
                    "Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!"
                    % (alarm_def["Definition"]["Key"],
                       alarm_def["Definition"]["Value"],
                       alarm_def["Definition"]["Status"]))

        # Read all metrics associated with alarms

        # CloudWatch intense polling can be expensive: This algorithm links CW metric polling rate to the
        #    scale rate => Under intense scale up condition, polling is aggressive. If not, it falls down
        #    to a lower polling rate.
        # TODO(@jcjorel): Avoid this kind of direct references to an upper level module!!
        integration_period = Cfg.get_duration_secs(
            "ec2.schedule.horizontalscale.integration_period")
        instance_scale_score = self.ec2.get_integrated_float_state(
            "ec2.schedule.scaleout.instance_scale_score", integration_period)

        self.metric_cache = self.get_metric_cache()

        query = {"IdMapping": {}, "Queries": []}

        # Build query for Alarm metrics
        if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
            # Sort by oldest alarms first in cache
            cached_metric_names = {m["_MetricId"] for m in self.metric_cache}
            valid_alarms = []
            for a in alarms:
                alarm_name = a["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None or alarm_def["AlarmDefinition"][
                        "Url"].startswith("alarmname:"):
                    continue
                if alarm_name in cached_metric_names:
                    a["_SamplingTime"] = self.get_metric_by_id(
                        alarm_name)["_SamplingTime"]
                else:
                    # Never sampled: treat as the oldest possible sample
                    a["_SamplingTime"] = str(misc.epoch())
                valid_alarms.append(a)
            sorted_alarms = sorted(
                valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

            # We poll from the oldest to the newest and depending on the instance_scale_score to limit
            # CloudWatch GetMetricData costs
            time_for_full_metric_refresh = max(
                Cfg.get_duration_secs(
                    "cloudwatch.metrics.time_for_full_metric_refresh"), 1)
            app_run_period = Cfg.get_duration_secs("app.run_period")
            minimum_polled_alarms_per_run = Cfg.get_int(
                "cloudwatch.metrics.minimum_polled_alarms_per_run")
            # Fraction (capped to 1.0) of the alarms to poll during this run
            maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
            maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run,
                                                1.0)
            weight = min(instance_scale_score, maximum_polled_alarms_per_run)
            max_alarms_for_this_run = max(
                minimum_polled_alarms_per_run,
                int(min(weight, 1.0) * len(sorted_alarms)))
            for alarm in sorted_alarms[:max_alarms_for_this_run]:
                alarm_name = alarm["AlarmName"]
                CloudWatch._format_query(query, alarm_name, alarm)

            # We always poll user supplied alarms
            for alarm in alarms:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None:
                    continue  # Unknown alarm name
                if not alarm_def["AlarmDefinition"]["Url"].startswith(
                        "alarmname:"):
                    continue
                CloudWatch._format_query(query, alarm_name, alarm)

        # Query Metric for Burstable instances (at most once per minute)
        burstable_instances = self.ec2.get_burstable_instances(
            ScalingState="-error")
        last_collect_date = self.ec2.get_state_date(
            "cloudwatch.metrics.last_burstable_metric_collect_date")
        if last_collect_date is None or (now - last_collect_date) > timedelta(
                minutes=1):
            for i in burstable_instances:
                instance_id = i["InstanceId"]
                if not self.ec2.is_static_subfleet_instance(
                        instance_id) and self.ec2.get_scaling_state(
                            instance_id) == "excluded":
                    continue
                CloudWatch._format_query(
                    query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                        "MetricName":
                        "CPUCreditBalance",
                        "Namespace":
                        "AWS/EC2",
                        "Dimensions": [{
                            "Name": "InstanceId",
                            "Value": instance_id
                        }],
                        "Period":
                        300,
                        "Statistic":
                        "Average"
                    })
            self.ec2.set_state(
                "cloudwatch.metrics.last_burstable_metric_collect_date",
                now,
                TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

        # Make request to CloudWatch
        query_counter = self.ec2.get_state_int(
            "cloudwatch.metric.query_counter", default=0)
        queries = query["Queries"]
        metric_results = []
        metric_ids = set()  # Ids for which fresh data points were received
        no_metric_ids = []  # Ids for which no data point was returned
        while queries:
            # Queries are sent by batches of 500 items
            q = queries[:500]
            queries = queries[500:]
            results = []
            response = None
            while response is None or "NextToken" in response:
                args = {
                    "MetricDataQueries":
                    q,
                    "StartTime":
                    now - timedelta(seconds=Cfg.get_duration_secs(
                        "cloudwatch.metrics.data_period")),
                    "EndTime":
                    now
                }
                if response is not None:
                    args["NextToken"] = response["NextToken"]
                response = client.get_metric_data(**args)
                results.extend(response["MetricDataResults"])
                query_counter += len(q)

            for r in results:
                if r["StatusCode"] != "Complete":
                    log.error("Failed to retrieve metrics: %s" % q)
                    continue
                metric_id = query["IdMapping"][r["Id"]]
                if len(r["Timestamps"]) == 0:
                    if metric_id not in no_metric_ids:
                        no_metric_ids.append(metric_id)
                    continue
                metric_ids.add(metric_id)
                r["_MetricId"] = metric_id
                r["_SamplingTime"] = str(now)
                log.debug(r)
                metric_results.append(r)
        if no_metric_ids:
            log.info("No metrics returned for alarm '%s'" % no_metric_ids)

        # Merge with existing cache metric: fresh results take precedence and
        # older cached entries are kept only while within the retention period
        metric_cache = self.metric_cache
        self.metric_cache = metric_results
        max_retention_period = Cfg.get_duration_secs(
            "cloudwatch.metrics.cache.max_retention_period")
        for m in metric_cache:
            if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
                continue
            if (now - misc.str2utc(m["_SamplingTime"])
                ).total_seconds() < max_retention_period:
                self.metric_cache.append(m)

        self.ec2.set_state("cloudwatch.metric.query_counter",
                           query_counter,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.ec2.set_state_json(
            "cloudwatch.metrics.cache",
            self.metric_cache,
            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.set_metric("Cloudwatch.GetMetricData", query_counter)

        # Augment Alarm definitions and Instances with associated metrics
        for metric in self.metric_cache:
            metric_id = metric["_MetricId"]

            alarm_data = self.get_alarm_data_by_name(metric_id)
            if alarm_data is not None:
                alarm_data["MetricDetails"] = metric
                continue

            instance = next(
                filter(
                    lambda i: "CPUCreditBalance/%s" % i["InstanceId"] ==
                    metric_id, burstable_instances), None)
            if instance is not None:
                instance["_Metrics"] = {"CPUCreditBalance": metric}
Пример #3
0
 def instance_last_stop_date(self, instance_id, default=misc.epoch()):
     """Look up the state date recorded under the instance 'last_stop_date' key.

     Args:
         instance_id: Identifier appended to the
             'ec2.schedule.instance.last_stop_date.' state key prefix.
         default: Value forwarded as `default` to `self.get_state_date()`.
             NOTE: the `misc.epoch()` default is evaluated once, when the
             function is defined.

     Returns:
         Whatever `self.get_state_date()` returns for that key.
     """
     state_key = "ec2.schedule.instance.last_stop_date.%s" % instance_id
     return self.get_state_date(state_key, default=default)
Пример #4
0
def main_handler_entrypoint(event, context):
    """Entrypoint of the 'Main' function: run one full processing pass.

    Parameters
    ----------
    event: dict, required
        Invocation payload; it is handed over to `sqs.process_sqs_records()`.

    context: object, required
        Lambda Context runtime methods and attributes (not read by this function).

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------
    None
        The function returns early (without the main processing) when the
        application is disabled or when it is called too early.
    """
    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Main"

    init()

    app_disabled = Cfg.get_int("app.disable") != 0
    if app_disabled and not misc.is_sam_local():
        log.warning("Application disabled due to 'app.disable' key")
        return

    # Spot interruptions are handled first, as fast as possible. When at least
    # one was managed, the run is forced (disregarding `app.run_period`) since a
    # Spot instance has to be removed from target groups immediately.
    spot_interruption_managed = bool(
        sqs.process_sqs_records(
            event,
            function=ec2_schedule.manage_spot_notification,
            function_arg=ctx))
    if spot_interruption_managed:
        log.info("Managed Spot Interruption SQS record!")

    # Retrieve the last call date with a direct read to the KVTable, to spare
    # initialization time when the Lambda is called too early.
    last_call_date = ctx["o_ec2"].get_state("main.last_call_date", direct=True)
    if last_call_date is None or last_call_date == "":
        last_call_date = str(misc.epoch())
    ctx["main.last_call_date"] = last_call_date

    if not spot_interruption_managed and is_called_too_early():
        log.log(log.NOTICE, "Called too early by: %s" % event)
        notify.do_not_notify = True
        sqs.process_sqs_records(event)
        sqs.call_me_back_send()
        return

    log.debug("Load prerequisites.")
    required_modules = [
        "o_state",
        "o_notify",
        "o_ec2",
        "o_cloudwatch",
        "o_targetgroup",
        "o_ec2_schedule",
        "o_scheduler",
        "o_rds",
    ]
    load_prerequisites(required_modules)

    # Persist 'now' as the last execution date
    ctx["o_ec2"].set_state("main.last_call_date",
                           value=ctx["now"],
                           TTL=Cfg.get_duration_secs("app.default_ttl"))

    Cfg.dump()

    # Main processing: the call order below is significant
    log.debug("Main processing.")
    targetgroup = ctx["o_targetgroup"]
    targetgroup.manage_targetgroup()
    ctx["o_ec2_schedule"].schedule_instances()
    ctx["o_ec2_schedule"].stop_drained_instances()
    ctx["o_cloudwatch"].configure_alarms()
    ctx["o_rds"].manage_subfleet_rds()
    ctx["o_ec2_schedule"].prepare_metrics()

    ctx["o_cloudwatch"].send_metrics()
    ctx["o_cloudwatch"].configure_dashboard()

    # Acknowledge now the message(s) that may have woken us up
    sqs.process_sqs_records(event)

    ctx["o_notify"].notify_user_arn_resources()

    # Request a call back if needed
    sqs.call_me_back_send()
def seconds_since_last_call():
    """Return the number of seconds elapsed since 'main.last_call_date'.

    Returns 0 when `ctx` holds no 'main.last_call_date' entry. Otherwise the
    stored value is parsed with `misc.str2utc()` (with `misc.epoch()` passed
    as its `default`) and subtracted from the current UTC time.
    """
    if "main.last_call_date" in ctx:
        current_time = misc.utc_now()
        last_call = misc.str2utc(ctx["main.last_call_date"],
                                 default=misc.epoch())
        elapsed = current_time - last_call
        return elapsed.total_seconds()
    return 0