def configure_dashboard(self):
    """Create, update or delete the CloneSquad CloudWatch dashboard.

    CloudWatch bills dashboard API calls, so this method rate-limits
    itself: it only acts when the `cloudwatch.dashboard.use_default`
    setting changed since the last run, or when
    `cloudwatch.dashboard.update_interval` seconds elapsed since the
    last dashboard action.
    """
    client = self.context["cloudwatch.client"]
    now = self.context["now"]
    dashboard_state = Cfg.get_int("cloudwatch.dashboard.use_default")
    dashboard_last_state = self.ec2.get_state(
        "cloudwatch.dashboard.use_default.last_state")
    self.ec2.set_state("cloudwatch.dashboard.use_default.last_state",
                       dashboard_state,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    last_dashboard_action = self.ec2.get_state_date(
        "cloudwatch.dashboard.last_action", default=misc.epoch())
    dashboard_update_interval = Cfg.get_duration_secs(
        "cloudwatch.dashboard.update_interval")
    # Skip when the toggle is unchanged AND the update interval has not expired.
    if (str(dashboard_state) == dashboard_last_state) and (
            now - last_dashboard_action).total_seconds() < dashboard_update_interval:
        log.debug("Not yet the time to manage the dashboard.")
        return

    if dashboard_state != 1:
        # Dashboard disabled: best-effort delete. We deliberately do not
        # fail the run if the dashboard is already absent or the call errors,
        # but we no longer swallow the error silently (was a bare `except: pass`).
        try:
            client.delete_dashboards(
                DashboardNames=[self._get_dashboard_name()])
        except Exception as e:
            log.debug("Failed to delete dashboard '%s' : %s" %
                      (self._get_dashboard_name(), e))
    else:
        content = self.load_dashboard()
        log.log(
            log.NOTICE, "Configuring CloudWatch dashboard '%s'..." %
            self._get_dashboard_name())
        response = client.put_dashboard(
            DashboardName=self._get_dashboard_name(),
            DashboardBody=content)
    # Remember when we last acted so the rate limiter above works next run.
    self.ec2.set_state("cloudwatch.dashboard.last_action",
                       now,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
def get_prerequisites(self):
    """Load alarm definitions, existing CloudWatch alarms and their metrics.

    Side effects: populates `self.alarm_definitions`, `self.alarms` and
    `self.metric_cache`, persists polling counters/cache in the state
    table, and attaches metric details to alarm definitions and
    burstable instance records.
    """
    now = self.context["now"]
    client = self.context["cloudwatch.client"]

    # Read all CloudWatch alarm templates into memory
    alarm_definitions = {}
    for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
        key = "cloudwatch.alarm%02d.configuration_url" % (i)
        r = Cfg.get_extended(key)
        if not r["Success"] or r["Value"] == "":
            continue
        d = misc.parse_line_as_list_of_dict(r["Value"])
        url = d[0]["_"]
        meta = d[0]
        index = "%02d" % i
        alarm_defs = {
            "Index": index,
            "Key": key,
            "Url": url,
            "Definition": r,
            "Metadata": meta
        }
        prefix = "alarmname:"
        if url.startswith(prefix):
            # 'alarmname:' URLs reference a pre-existing, user-supplied alarm.
            alarm_defs["AlarmName"] = url[len(prefix):]
        else:
            log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
            try:
                resp = misc.get_url(url.format(**self.context))
                if resp is None:
                    raise Exception("URL content = <None>")
                alarm_defs["Content"] = str(resp, "utf-8")
            except Exception as e:
                # BUGFIX: was '%e' (a float conversion specifier) which raised
                # TypeError on the Exception argument; '%s' is intended.
                log.exception("Failed to load Alarm definition '%s' : %s" %
                              (r["Value"], e))
                continue
        alarm_definitions[index] = alarm_defs
    self.alarm_definitions = alarm_definitions

    # Read all existing CloudWatch alarms (paginated with NextToken)
    alarms = []
    response = None
    while (response is None or "NextToken" in response):
        response = client.describe_alarms(
            MaxRecords=Cfg.get_int("cloudwatch.describe_alarms.max_results"),
            NextToken=response["NextToken"] if response is not None else "")
        #log.debug(Dbg.pprint(response))
        for alarm in response["MetricAlarms"]:
            alarm_name = alarm["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is not None:
                # This is an alarm that belongs to this CloneSquad instance
                alarms.append(alarm)
    #log.debug(Dbg.pprint(alarms))
    self.alarms = alarms

    # Sanity check: every 'alarmname:' definition must match a real alarm
    for index in self.alarm_definitions.keys():
        alarm_def = self.alarm_definitions[index]
        if "AlarmName" not in alarm_def:
            continue
        alarm = next(
            filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"],
                   self.alarms), None)
        if alarm is None:
            log.warning(
                "Alarm definition [%s](%s => %s) doesn't match an existing "
                "CloudWatch alarm!" %
                (alarm_def["Definition"]["Key"],
                 alarm_def["Definition"]["Value"],
                 alarm_def["Definition"]["Status"]))

    # Read all metrics associated with alarms
    # CloudWatch intense polling can be expensive: This algorithm links CW metric polling rate to the
    # scale rate => Under intense scale up condition, polling is aggressive. If not, it falls down
    # to one polling every 'cloudwatch.metrics.low_rate_polling_interval' seconds
    # TODO(@jcjorel): Avoid this kind of direct references to an upper level module!!
    integration_period = Cfg.get_duration_secs(
        "ec2.schedule.horizontalscale.integration_period")
    instance_scale_score = self.ec2.get_integrated_float_state(
        "ec2.schedule.scaleout.instance_scale_score", integration_period)

    self.metric_cache = self.get_metric_cache()

    query = {"IdMapping": {}, "Queries": []}

    # Build query for Alarm metrics
    if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
        # Sort by oldest alarms first in cache
        cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
        valid_alarms = []
        for a in alarms:
            alarm_name = a["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is None or alarm_def["AlarmDefinition"][
                    "Url"].startswith("alarmname:"):
                continue
            a["_SamplingTime"] = self.get_metric_by_id(
                alarm_name)["_SamplingTime"] if alarm_name in cached_metric_names else str(
                    misc.epoch())
            valid_alarms.append(a)
        sorted_alarms = sorted(
            valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

        # We poll from the oldest to the newest and depending on the instance_scale_score
        # to limit CloudWatch GetMetricData costs
        time_for_full_metric_refresh = max(
            Cfg.get_duration_secs(
                "cloudwatch.metrics.time_for_full_metric_refresh"), 1)
        app_run_period = Cfg.get_duration_secs("app.run_period")
        minimum_polled_alarms_per_run = Cfg.get_int(
            "cloudwatch.metrics.minimum_polled_alarms_per_run")
        maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
        maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run, 1.0)
        weight = min(instance_scale_score, maximum_polled_alarms_per_run)
        max_alarms_for_this_run = max(
            minimum_polled_alarms_per_run,
            int(min(weight, 1.0) * len(sorted_alarms)))
        for alarm in sorted_alarms[:max_alarms_for_this_run]:
            alarm_name = alarm["AlarmName"]
            CloudWatch._format_query(query, alarm_name, alarm)

    # We always poll user supplied alarms
    for alarm in alarms:
        alarm_name = alarm["AlarmName"]
        alarm_def = self.get_alarm_configuration_by_name(alarm_name)
        if alarm_def is None:
            continue  # Unknown alarm name
        if not alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
            continue
        CloudWatch._format_query(query, alarm_name, alarm)

    # Query Metric for Burstable instances (at most once per minute)
    burstable_instances = self.ec2.get_burstable_instances(
        ScalingState="-error")
    last_collect_date = self.ec2.get_state_date(
        "cloudwatch.metrics.last_burstable_metric_collect_date")
    if last_collect_date is None or (now - last_collect_date) > timedelta(
            minutes=1):
        for i in burstable_instances:
            instance_id = i["InstanceId"]
            if not self.ec2.is_static_subfleet_instance(
                    instance_id) and self.ec2.get_scaling_state(
                        instance_id) == "excluded":
                continue
            CloudWatch._format_query(
                query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                    "MetricName": "CPUCreditBalance",
                    "Namespace": "AWS/EC2",
                    "Dimensions": [{
                        "Name": "InstanceId",
                        "Value": instance_id
                    }],
                    "Period": 300,
                    "Statistic": "Average"
                })
        self.ec2.set_state(
            "cloudwatch.metrics.last_burstable_metric_collect_date",
            now,
            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

    # Make request to CloudWatch (GetMetricData accepts max 500 queries per call)
    query_counter = self.ec2.get_state_int(
        "cloudwatch.metric.query_counter", default=0)
    queries = query["Queries"]
    metric_results = []
    metric_ids = []
    no_metric_ids = []
    while len(queries) > 0:
        q = queries[:500]
        queries = queries[500:]
        results = []
        response = None
        while response is None or "NextToken" in response:
            args = {
                "MetricDataQueries":
                q,
                "StartTime":
                now - timedelta(seconds=Cfg.get_duration_secs(
                    "cloudwatch.metrics.data_period")),
                "EndTime":
                now
            }
            if response is not None:
                args["NextToken"] = response["NextToken"]
            response = client.get_metric_data(**args)
            results.extend(response["MetricDataResults"])
            query_counter += len(q)
        for r in results:
            if r["StatusCode"] != "Complete":
                log.error("Failed to retrieve metrics: %s" % q)
                continue
            metric_id = query["IdMapping"][r["Id"]]
            if len(r["Timestamps"]) == 0:
                if metric_id not in no_metric_ids:
                    no_metric_ids.append(metric_id)
                continue
            if metric_id not in metric_ids:
                metric_ids.append(metric_id)
            r["_MetricId"] = metric_id
            r["_SamplingTime"] = str(now)
            log.debug(r)
            metric_results.append(r)
    if len(no_metric_ids):
        log.info("No metrics returned for alarm '%s'" % no_metric_ids)

    # Merge with existing cache metric: keep stale-but-recent entries that
    # were not refreshed in this run
    metric_cache = self.metric_cache
    self.metric_cache = metric_results
    for m in metric_cache:
        max_retention_period = Cfg.get_duration_secs(
            "cloudwatch.metrics.cache.max_retention_period")
        if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
            continue
        if (now - misc.str2utc(m["_SamplingTime"])
            ).total_seconds() < max_retention_period:
            self.metric_cache.append(m)
    self.ec2.set_state("cloudwatch.metric.query_counter",
                       query_counter,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.ec2.set_state_json(
        "cloudwatch.metrics.cache",
        self.metric_cache,
        TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.set_metric("Cloudwatch.GetMetricData", query_counter)

    # Augment Alarm definitions and Instances with associated metrics
    for metric in self.metric_cache:
        metric_id = metric["_MetricId"]
        alarm_data = self.get_alarm_data_by_name(metric_id)
        if alarm_data is not None:
            alarm_data["MetricDetails"] = metric
            continue
        instance = next(
            filter(
                lambda i: "CPUCreditBalance/%s" % i["InstanceId"] == metric_id,
                burstable_instances), None)
        if instance is not None:
            instance["_Metrics"] = {}
            instance["_Metrics"]["CPUCreditBalance"] = metric
            continue
def instance_last_stop_date(self, instance_id, default=misc.epoch()):
    """Return the recorded last-stop datetime for *instance_id*.

    Falls back to *default* (Unix epoch unless overridden by the caller)
    when no stop date exists in the state table.
    """
    state_key = f"ec2.schedule.instance.last_stop_date.{instance_id}"
    return self.get_state_date(state_key, default=default)
def main_handler_entrypoint(event, context):
    """Main Lambda entrypoint: schedule instances and manage related resources.

    Parameters
    ----------
    event: dict, required
        Lambda invocation event (SQS/SNS records, scheduled trigger, ...).

    context: object, required
        Lambda Context runtime methods and attributes.
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------
    None. All effects are side effects (state table updates, EC2/RDS/
    CloudWatch API calls, SQS acknowledgements).
    """
    #print(Dbg.pprint(event))

    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Main"

    init()

    # Kill switch: 'app.disable' short-circuits everything (except under SAM local).
    if Cfg.get_int("app.disable") != 0 and not misc.is_sam_local():
        log.warning("Application disabled due to 'app.disable' key")
        return

    no_is_called_too_early = False

    # Manage Spot interruption as fast as we can
    if sqs.process_sqs_records(event, function=ec2_schedule.manage_spot_notification,
            function_arg=ctx):
        log.info("Managed Spot Interruption SQS record!")
        # Force to run now disregarding `app.run_period` as we have at least one Spot
        # instance to remove from target groups immediately
        no_is_called_too_early = True

    # Check that we are not called too early
    # Note: We perform a direct read to the KVTable to spare initialization time when the
    # Lambda is called too early
    ctx["main.last_call_date"] = ctx["o_ec2"].get_state("main.last_call_date",
            direct=True)
    if ctx["main.last_call_date"] is None or ctx["main.last_call_date"] == "":
        ctx["main.last_call_date"] = str(misc.epoch())

    if not no_is_called_too_early and is_called_too_early():
        log.log(log.NOTICE, "Called too early by: %s" % event)
        # Too-early runs stay silent: ack the SQS records and re-arm the callback only.
        notify.do_not_notify = True
        sqs.process_sqs_records(event)
        sqs.call_me_back_send()
        return

    log.debug("Load prerequisites.")
    load_prerequisites(["o_state", "o_notify", "o_ec2", "o_cloudwatch",
        "o_targetgroup", "o_ec2_schedule", "o_scheduler", "o_rds"])

    # Remember 'now' as the last execution date
    ctx["o_ec2"].set_state("main.last_call_date", value=ctx["now"],
            TTL=Cfg.get_duration_secs("app.default_ttl"))

    Cfg.dump()

    # Perform actions (order matters: target groups first, then scheduling,
    # then alarms/metrics/dashboard reporting):
    log.debug("Main processing.")
    ctx["o_targetgroup"].manage_targetgroup()
    ctx["o_ec2_schedule"].schedule_instances()
    ctx["o_ec2_schedule"].stop_drained_instances()
    ctx["o_cloudwatch"].configure_alarms()
    ctx["o_rds"].manage_subfleet_rds()
    ctx["o_ec2_schedule"].prepare_metrics()

    ctx["o_cloudwatch"].send_metrics()
    ctx["o_cloudwatch"].configure_dashboard()

    # If we got woke up by SNS, acknowledge the message(s) now
    sqs.process_sqs_records(event)

    ctx["o_notify"].notify_user_arn_resources()

    # Call me back if needed
    sqs.call_me_back_send()
def seconds_since_last_call():
    """Return the number of seconds elapsed since the last recorded main call.

    Returns 0 when no 'main.last_call_date' entry exists in the context yet.
    """
    if "main.last_call_date" not in ctx:
        return 0
    # Parse the stored timestamp, defaulting to the Unix epoch on bad input.
    last_call = misc.str2utc(ctx["main.last_call_date"], default=misc.epoch())
    elapsed = misc.utc_now() - last_call
    return elapsed.total_seconds()