def set_scaling_state(self, instance_id, value, ttl=None, meta=None, default_date=None):
    """Persist the scaling state of an instance in the KV state store.

    Also records two timestamps: the generic 'last_action_date' and a
    per-state 'last_<value>_date'. When the instance is already in the
    requested state, the previously recorded action date is kept (read back
    through `meta`) instead of `default_date`, so the original transition
    time is preserved across repeated calls.

    :param instance_id:  EC2 instance id the state applies to.
    :param value:        New scaling state string (e.g. "draining", "").
    :param ttl:          State TTL in seconds; defaults to Cfg key
                         'ec2.state.default_ttl'.
    :param meta:         Optional dict filled by get_scaling_state() with
                         'last_action_date'.
    :param default_date: Date to record on an actual state transition;
                         defaults to the current scheduling time.
    :return: Result of the final set_state() call.
    """
    # Removed: dead commented-out pdb.set_trace() debug line referencing
    # hard-coded instance ids.
    if ttl is None:
        ttl = Cfg.get_duration_secs("ec2.state.default_ttl")
    if default_date is None:
        default_date = self.context["now"]
    meta = {} if meta is None else meta
    previous_value = self.get_scaling_state(instance_id, meta=meta, do_not_return_excluded=True)
    # Keep the original transition date when the state is unchanged.
    date = meta["last_action_date"] if previous_value == value else default_date
    self.set_state("ec2.instance.scaling.last_action_date.%s" % instance_id, date, ttl)
    self.set_state("ec2.instance.scaling.last_%s_date.%s" % (value, instance_id), date, ttl)
    previous_value = self.get_scaling_state(instance_id, meta=meta)
    return self.set_state("ec2.instance.scaling.state.%s" % instance_id, value, ttl)
def next_call_delay():
    """Compute the number of seconds to wait before the next scheduled run.

    Based on the configured 'app.run_period' and the time elapsed since the
    previous call. A negative remainder (previous call older than one full
    period) yields the full period again.
    """
    global ctx
    expected_delay = Cfg.get_duration_secs("app.run_period")
    remaining = expected_delay - seconds_since_last_call()
    return expected_delay if remaining < 0 else max(int(remaining), 0)
def set_instance_state(self, instance_id, targetgroup_name, value):
    """Record the targetgroup health state of an instance in the KV store.

    `targetgroup_name` is expected to be a full targetgroup ARN; the short
    targetgroup name (second-to-last path component) is extracted from it.
    Colons are replaced in the key since they are not valid there.

    :param instance_id:       EC2 instance id.
    :param targetgroup_name:  Targetgroup ARN.
    :param value:             TargetHealth state string to store.
    """
    # Raw string for the regex: "\w" in a plain string literal is an invalid
    # escape sequence (SyntaxWarning on recent CPython).
    m = re.search(r"(.*)/([^/]+)/\w+$", targetgroup_name)
    targetgroup_name = m.group(2)
    key = "targetgroup.status.%s.%s" % (targetgroup_name, instance_id)
    self.ec2.set_state(key.replace(":", "_"), value,
                       TTL=Cfg.get_duration_secs("targetgroup.default_state_ttl"))
def update_pending_command_statuses(self):
    """Poll SSM for the results of previously sent RunCommand invocations.

    Reads the pending command list from the 'ssm.events.run_commands' state
    record, queries SSM 'list_command_invocations' for each incomplete
    command, parses the CLONESQUAD-SSM-AGENT-* lines out of the command
    output, and stores the per-instance result in 'FormerResults'. A command
    is marked Complete once all its target instances have reported.
    """
    client = self.context["ssm.client"]
    self.run_cmd_states = self.o_state.get_state_json("ssm.events.run_commands", default={
        "Commands": [],
        "FormerResults": {}
    })
    former_results = self.run_cmd_states["FormerResults"]
    cmds = self.run_cmd_states["Commands"]
    for cmd in cmds:
        command = cmd["Command"]
        args = cmd["CommandArgs"]
        if "Complete" not in cmd:
            cmd_id = cmd["Id"]
            paginator = client.get_paginator('list_command_invocations')
            response_iterator = paginator.paginate(CommandId=cmd_id, Details=True, MaxResults=50)
            for response in response_iterator:
                for invoc in response["CommandInvocations"]:
                    instance_id = invoc["InstanceId"]
                    status = invoc["Status"]
                    # Only terminal invocation statuses are processed; in-flight
                    # invocations are skipped until a later poll.
                    if (status not in ["Success", "Cancelled", "Failed", "TimedOut",
                                       "Undeliverable", "Terminated", "Delivery Timed Out",
                                       "Execution Timed Out"]):
                        continue
                    # Keep only the agent-tagged lines of the command output.
                    stdout = [s.rstrip() for s in io.StringIO(invoc["CommandPlugins"][0]["Output"]).readlines()
                              if s.startswith("CLONESQUAD-SSM-AGENT-")]
                    # The BIE marker signals a complete (non-truncated) reply.
                    bie_msg = next(filter(lambda s: s.startswith("CLONESQUAD-SSM-AGENT-BIE:"), stdout), None)
                    if not bie_msg:
                        log.log(log.NOTICE, f"Truncated reply from SSM Command Invocation ({cmd_id}/{instance_id}). "
                                "*Cause: SSM exec error? started shell command too verbose? (please limit to 24kBytes max!)")
                    agent_status = "CLONESQUAD-SSM-AGENT-STATUS:"
                    status_msg = next(filter(lambda s: s.startswith(agent_status), stdout), None)
                    if status_msg is None:
                        status_msg = "ERROR"
                    else:
                        status_msg = status_msg[len(agent_status):]
                    details_msg = list(filter(lambda s: s.startswith("CLONESQUAD-SSM-AGENT-DETAILS:"), stdout))
                    warning_msg = list(filter(lambda s: ":WARNING:" in s, stdout))
                    if len(warning_msg):
                        log.warning(f"Got warning while retrieving SSM RunCommand output for {cmd_id}/{instance_id}/{command}: "
                                    f"{warning_msg}/{details_msg}")
                    result = {
                        "SSMInvocationStatus": status,
                        "Status": status_msg,
                        "Truncated": bie_msg is None,
                        "Expiration": misc.seconds_from_epoch_utc() + Cfg.get_duration_secs("ssm.state.command.result.default_ttl")
                    }
                    # Keep track in the former result list
                    if instance_id not in former_results:
                        former_results[instance_id] = {}
                    former_results[instance_id][f"{command};{args}"] = result
                    if instance_id not in cmd["ReceivedInstanceIds"]:
                        cmd["ReceivedInstanceIds"].append(instance_id)
            if set(cmd["ReceivedInstanceIds"]) & set(cmd["InstanceIds"]) == set(cmd["InstanceIds"]):
                # All invocation results received
                cmd["Complete"] = True
    self.commands_to_send = []
def is_maintenance_time(self, fleet=None, meta=None):
    """Return True when 'now' falls inside an SSM maintenance window for the fleet.

    Windows are considered active from 'start_ahead' seconds (minus a
    deterministic per-fleet jitter) before their NextExecutionTime until
    NextExecutionTime + Duration hours. Because SSM does not always return
    NextExecutionTime (or returns the *next* iteration's one), the last seen
    value is persisted in the state store and restored when missing.

    :param fleet: Subfleet name, or None for the Main fleet.
    :param meta:  Optional dict filled with details about the matching or
                  next upcoming window.
    """
    if not self.is_feature_enabled("maintenance_window"):
        return False
    now = self.context["now"]
    sa = max(Cfg.get_duration_secs("ssm.feature.maintenance_window.start_ahead"), 30)
    # We compute a predictive jitter to avoid all subfleets starting exactly at the same time
    group_name = self.context["GroupName"]
    jitter_salt = int(misc.sha256(f"{group_name}:{fleet}")[:3], 16) / (16 * 16 * 16) * sa
    jitter = Cfg.get_abs_or_percent("ssm.feature.maintenance_window.start_ahead.max_jitter", 0, jitter_salt)
    start_ahead = timedelta(seconds=(sa - jitter))
    windows = copy.deepcopy(self._get_maintenance_windows_for_fleet(fleet=fleet))
    for w in windows:
        window_id = w["WindowId"]
        if "NextExecutionTime" in w:
            end_time = w["NextExecutionTime"] + timedelta(hours=int(w["Duration"]))
            if now >= (w["NextExecutionTime"] - start_ahead) and now < end_time:
                # We are entering a new maintenance window period. Remember it...
                self.o_state.set_state(f"ssm.events.maintenance_window.last_next_execution_time.{window_id}",
                                       w["NextExecutionTime"], TTL=self.ttl)
                self.o_state.set_state(f"ssm.events.maintenance_window.last_next_execution_duration.{window_id}",
                                       w["Duration"], TTL=self.ttl)
            w["_FutureNextExecutionTime"] = w["NextExecutionTime"]
        # SSM maintenance windows do not always have a NextExecutionTime field -=OR=- it contains the future
        # NextExecutionTime of the next iteration. In both cases, we restore it from a backed-up one.
        next_execution_time = self.o_state.get_state_date(
            f"ssm.events.maintenance_window.last_next_execution_time.{window_id}", TTL=self.ttl)
        if next_execution_time is not None:
            w["NextExecutionTime"] = next_execution_time
        if "Duration" not in w:
            next_execution_duration = self.o_state.get_state(
                f"ssm.events.maintenance_window.last_next_execution_duration.{window_id}", TTL=self.ttl)
            if next_execution_duration is not None:
                w["Duration"] = next_execution_duration
    valid_windows = [w for w in windows if "NextExecutionTime" in w and "Duration" in w]
    fleetname = "Main" if fleet is None else fleet
    next_window = None
    for w in sorted(valid_windows, key=lambda w: w["NextExecutionTime"]):
        end_time = w["NextExecutionTime"] + timedelta(hours=int(w["Duration"]))
        start_time = w["NextExecutionTime"] - start_ahead
        if now >= start_time and now < end_time:
            if meta is not None:
                meta["MatchingWindow"] = w
                meta["MatchingWindowMessage"] = f"Found ACTIVE matching window for fleet {fleetname} : {w}"
                meta["StartTime"] = w["NextExecutionTime"]
                meta["EndTime"] = end_time
            return True
        # Track the earliest window still in the future.
        if ("_FutureNextExecutionTime" in w and w["_FutureNextExecutionTime"] > now
                and (next_window is None or w["_FutureNextExecutionTime"] < next_window["_FutureNextExecutionTime"])):
            next_window = w
    if next_window is not None and meta is not None:
        # BUGFIX: the message previously referenced 'w' (the last window of the
        # sorted loop) instead of 'next_window' (the earliest upcoming window),
        # reporting the wrong window name/time.
        meta["NextWindowMessage"] = (f"Next SSM Maintenance Window for {fleetname} fleet is '%s/%s in %s "
                                     f"(Fleet will start ahead at %s)." % (
                                         next_window["WindowId"], next_window["Name"],
                                         (next_window["_FutureNextExecutionTime"] - now),
                                         next_window["_FutureNextExecutionTime"] - start_ahead))
    return False
def is_called_too_early():
    """Return True when the Lambda is invoked before 'app.run_period' has elapsed.

    Always returns False under SAM local execution so local debugging is not
    throttled. -1 from seconds_since_last_call() means "no previous call".
    """
    global ctx
    delay = Cfg.get_duration_secs("app.run_period")
    delta = sqs.seconds_since_last_call()
    if delta == -1 or delta >= delay:
        return False
    if misc.is_sam_local():
        log.warning("is_called_too_early disabled because running in SAM!")
        return False
    log.log(log.NOTICE, "Called too early (now=%s, delay=%s => delta_seconds=%s)..." % (ctx["now"], delay, delta))
    return True
def configure_dashboard(self):
    """Create, update or delete the CloneSquad CloudWatch dashboard.

    Honors 'cloudwatch.dashboard.use_default': when not 1, the dashboard is
    deleted (best-effort); otherwise it is (re)published. Calls are rate
    limited by 'cloudwatch.dashboard.update_interval' unless the setting
    changed since the last run.
    """
    client = self.context["cloudwatch.client"]
    # Cloudwatch service is billing calls to dashboard API. We make sure that we do not call it too often
    now = self.context["now"]
    dashboard_state = Cfg.get_int("cloudwatch.dashboard.use_default")
    dashboard_last_state = self.ec2.get_state("cloudwatch.dashboard.use_default.last_state")
    self.ec2.set_state("cloudwatch.dashboard.use_default.last_state", dashboard_state,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    last_dashboad_action = self.ec2.get_state_date("cloudwatch.dashboard.last_action", default=misc.epoch())
    dashboad_update_interval = Cfg.get_duration_secs("cloudwatch.dashboard.update_interval")
    if (str(dashboard_state) == dashboard_last_state) and (now - last_dashboad_action).total_seconds() < dashboad_update_interval:
        log.debug("Not yet the time to manage the dashboard.")
        return
    if Cfg.get_int("cloudwatch.dashboard.use_default") != 1:
        try:
            client.delete_dashboards(DashboardNames=[self._get_dashboard_name()])
        except Exception:
            # Best-effort delete (dashboard may not exist). Narrowed from a bare
            # 'except:' so KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
    else:
        content = self.load_dashboard()
        log.log(log.NOTICE, "Configuring CloudWatch dashboard '%s'..." % self._get_dashboard_name())
        response = client.put_dashboard(DashboardName=self._get_dashboard_name(), DashboardBody=content)
    self.ec2.set_state("cloudwatch.dashboard.last_action", now,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
def __init__(self, context, state, ec2, targetgroup, cloudwatch):
    """Initialize the notification manager.

    Registers the 'notify.*' configuration keys and the state aggregate used
    to persist notification records, then publishes this instance through the
    module-level 'notify_mgr' global.

    :param context:     Shared execution context dict.
    :param state:       State manager (provides register_aggregates()).
    :param ec2:         EC2 manager.
    :param targetgroup: Targetgroup manager.
    :param cloudwatch:  CloudWatch manager.
    """
    # Reset the module-level notification kill-switch on (re)construction.
    global do_not_notify
    do_not_notify = False
    self.context = context
    self.ec2 = ec2
    self.targetgroup = targetgroup
    self.cloudwatch = cloudwatch
    self.state = state
    self.table_name = None
    Cfg.register({
        "notify.event.default_ttl": "minutes=5",
        "notify.event.longterm.max_records,Stable": {
            "DefaultValue": 50,
            "Format": "Integer",
            "Description": """Maximum records to hold in the Event-LongTerm DynamodDB table

Setting this value to 0, disable logging to the LongTerm event table.
"""
        },
        "notify.event.longterm.ttl,Stable": {
            "DefaultValue": "days=5",
            "Format": "Duration",
            "Description": """Retention time for Long-Term DynamoDB entries.

This table is used to deep-dive analysis of noticeable events encountered by a CloneSquad deployment. It is mainly used to improve CloneSquad over time by allowing easy sharing of essential data for remote debugging.
"""
        },
        "notify.event.keep_acked_records": "0",
        "notify.debug.obfuscate_s3_reports": "1",
        "notify.debug.send_s3_reports": "1"
    })
    # Compress and expire persisted notify.* records with the long-term TTL.
    self.state.register_aggregates([{
        "Prefix": "notify.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("notify.event.longterm.ttl"),
        "Exclude": []
    }])
    # Expose this manager to module-level helpers.
    global notify_mgr
    notify_mgr = self
def __init__(self, context, ec2):
    """Initialize the targetgroup manager.

    Registers the 'targetgroup.*' configuration keys and the state aggregate
    holding per-instance targetgroup status records.
    """
    self.context = context
    self.ec2 = ec2
    self.state_changed = False
    self.prereqs_done = False
    config_defaults = {
        "targetgroup.debug.inject_fault_status": "",
        "targetgroup.default_state_ttl": "minutes=30",
        "targetgroup.slow_deregister_timeout": "minutes=2"
    }
    Cfg.register(config_defaults)
    status_aggregate = {
        "Prefix": "targetgroup.status.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("targetgroup.default_state_ttl")
    }
    self.ec2.register_state_aggregates([status_aggregate])
def get_prerequisites(self):
    """Load the KV state table and discover all resources tagged for this group.

    Creates/reads the state table with the configured cache age, registers
    the pending aggregates, then lists every AWS resource carrying the
    'clonesquad:group-name' tag matching this deployment.
    """
    ctx = self.context
    self.table = kvtable.KVTable.create(
        self.context, self.context["StateTable"],
        cache_max_age=Cfg.get_duration_secs("statemanager.cache.max_age"))
    for aggregate in self.table_aggregates:
        self.table.register_aggregates(aggregate)
    self.table.reread_table()

    # Retrieve all CloneSquad resources
    misc.initialize_clients(["resourcegroupstaggingapi"], self.context)
    tagging_client = self.context["resourcegroupstaggingapi.client"]
    paginator = tagging_client.get_paginator('get_resources')
    pages = paginator.paginate(TagFilters=[{
        'Key': 'clonesquad:group-name',
        'Values': [self.context["GroupName"]]
    }])
    resources = []
    for page in pages:
        resources.extend(page['ResourceTagMappingList'])
    self.clonesquad_resources = resources
def stop_instances(self, instance_ids_to_stop):
    """Stop the supplied EC2 instances and record their stop metadata.

    For every instance actually reported as stopping by the API, the scaling
    state is cleared and a 'last_stop_date' state record is written. Failures
    are logged per-instance and do not abort the loop.

    :param instance_ids_to_stop: Iterable of EC2 instance ids.
    """
    now = self.context["now"]
    client = self.context["ec2.client"]
    for instance_id in instance_ids_to_stop:
        try:
            # R() is a retry wrapper: the lambda validates the API response.
            response = R(lambda args, kwargs, r: r["ResponseMetadata"]["HTTPStatusCode"] == 200,
                         client.stop_instances, InstanceIds=[instance_id])
            if response is not None and "StoppingInstances" in response:
                # NOTE: 'instance_id' is deliberately rebound to the ids the
                # API confirms as stopping.
                for i in response["StoppingInstances"]:
                    instance_id = i["InstanceId"]
                    self.set_scaling_state(instance_id, "")
                    self.set_state("ec2.schedule.instance.last_stop_date.%s" % instance_id, now,
                                   TTL=Cfg.get_duration_secs("ec2.state.status_ttl"))
            log.debug(response)
        except Exception as e:
            log.warning("Failed to stop_instance '%s' : %s" % (instance_id, e))
def configure_alarms(self):
    """ Configure Cloudwatch Alarms for each instance.

    The algorithm needs to manage missing alarm as well updating existing alarms
    """
    now = self.context["now"]
    client = self.context["cloudwatch.client"]
    valid_alarms = []
    nb_of_updated_alarms = 0
    # Throttle the number of alarm create/update/delete API calls per run.
    max_update_per_batch = Cfg.get_int("cloudwatch.metrics.max_update_per_batch")

    log.log(log.NOTICE, "Found following Alarm definition key(s) in configuration: %s" %
            [d for d in self.alarm_definitions])

    # Step 1) Create or Update CloudWatch Alarms for running instances
    for instance in self.ec2.get_instances(State="pending,running",
                                           ScalingState="-error,draining,excluded"):
        instance_id = instance["InstanceId"]
        # Give freshly launched instances time to emit metrics before alarming.
        age_secs = (now - instance["LaunchTime"]).total_seconds()
        min_instance_age = Cfg.get_duration_secs("cloudwatch.alarms.min_instance_age")
        if age_secs < min_instance_age:
            log.log(log.NOTICE, "Instance '%s' too young. Wait %d seconds before to set an alarm..." %
                    (instance_id, min_instance_age - age_secs))
            continue

        # Update alarms for this instance
        for alarm_definition in self.alarm_definitions:
            # First, check if an alarm already exists
            alarm_name = self._get_alarm_name(self.context["GroupName"], instance["InstanceId"],
                                              int(alarm_definition))
            existing_alarms = list(filter(lambda x: x['AlarmName'] == alarm_name, self.alarms))

            # Load alarm template, rendered with the instance id substituted in.
            try:
                if "Content" not in self.alarm_definitions[alarm_definition]:
                    continue
                kwargs = self.context.copy()
                kwargs["InstanceId"] = instance_id
                alarm_template = self.alarm_definitions[alarm_definition]["Content"].format(**kwargs)
                alarm = yaml.safe_load(alarm_template)
            except Exception as e:
                log.exception("[ERROR] Failed to read YAML alarm file '%s' : %s" % (alarm_template, e))
                continue
            alarm["AlarmName"] = alarm_name
            valid_alarms.append(alarm_name)

            # Check if an alarm already exist
            existing_alarm = None
            if len(existing_alarms) > 0:
                existing_alarm = existing_alarms[0]

                # Check if alarm definition will be the same
                a = {**existing_alarm, **alarm}
                # 2020/07/20: CloudWatch Alarm API does not return Tags. Have to deal with
                # that while comparing the configurations.
                if "Tags" in a and "Tags" not in existing_alarm:
                    del a["Tags"]
                if a == existing_alarm:
                    #log.debug("Not updating alarm '%s' as configuration is already ok" % alarm_name)
                    continue

                # Check if we updated this alarm very recently
                delta = datetime.now(timezone.utc) - existing_alarm["AlarmConfigurationUpdatedTimestamp"]
                if delta < timedelta(minutes=1):
                    log.debug("Alarm '%s' updated to soon" % alarm_name)
                    continue

            nb_of_updated_alarms += 1
            if nb_of_updated_alarms > max_update_per_batch:
                break
            log.log(log.NOTICE, "Updating/creating CloudWatch Alarm '%s' : %s" % (alarm_name, alarm))
            resp = client.put_metric_alarm(**alarm)
            log.debug(Dbg.pprint(resp))

    # Step 2) Destroy CloudWatch Alarms for non existing instances (Garbage Collection)
    for existing_alarm in self.alarms:
        alarm_name = existing_alarm["AlarmName"]
        if not alarm_name.startswith("CloneSquad-%s-i-" % (self.context["GroupName"])):
            continue
        if alarm_name not in valid_alarms:
            nb_of_updated_alarms += 1
            if nb_of_updated_alarms > max_update_per_batch:
                break
            log.debug("Garbage collection orphan Cloudwatch Alarm '%s'" % alarm_name)
            resp = client.delete_alarms(AlarmNames=[alarm_name])
            log.debug(resp)
            # NOTE(review): the batch counter is incremented twice per deleted
            # alarm (once above, once here) — confirm this double-counting is
            # intended and not a copy/paste artifact.
            nb_of_updated_alarms += 1
            if nb_of_updated_alarms > max_update_per_batch:
                break
def __init__(self, context, ec2):
    """Initialize the CloudWatch manager.

    Registers all 'cloudwatch.*' configuration keys (dashboard, metric and
    alarm settings, plus one 'alarmNN.configuration_url' key per allowed
    alarm slot), declares the GetMetricData usage metric and the dashboard
    state aggregate.

    :param context: Shared execution context dict.
    :param ec2:     EC2 manager (provides register_state_aggregates()).
    """
    self.context = context
    self.ec2 = ec2
    self.alarms = None
    self.metrics = []
    Cfg.register({
        "cloudwatch.describe_alarms.max_results": "50",
        "cloudwatch.default_ttl": "days=1",
        "cloudwatch.alarms.max_per_instance": "6",
        "cloudwatch.alarms.min_instance_age": "minutes=3",
        "cloudwatch.configure.max_alarms_deleted_batch_size": "5",
        "cloudwatch.metrics.namespace": "CloneSquad",
        "cloudwatch.metrics.subnamespace": "",
        "cloudwatch.metrics.excluded,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """List of metric pattern names to not send to Cloudwatch

This configuration key is used to do Cost optimization by filtering which CloneSquad Metrics are sent to Cloudwatch. It support regex patterns.

> Ex: StaticFleet.*;NbOfBouncedInstances
"""
        },
        "cloudwatch.metrics.data_period": "minutes=2",
        "cloudwatch.metrics.max_update_per_batch": "20",
        "cloudwatch.metrics.cache.max_retention_period": "minutes=10",
        "cloudwatch.metrics.minimum_polled_alarms_per_run": "1",
        "cloudwatch.metrics.time_for_full_metric_refresh,Stable": {
            "DefaultValue": "minutes=1,seconds=30",
            "Format": "Duration",
            "Description": """The total period for a complete refresh of EC2 Instance metrics

This parameter is a way to reduce Cloudwatch cost induced by GetMetricData API calls. It defines indirectly how many alarm metrics will be polled in a single Main Lambda execution. A dedicated algorithm is used to extrapolate missing data based on previous GetMetricData API calls.

Reducing this value increase the accuracy of the scaling criteria and so, the reactivity of CloneSquad to a sudden burst of activity load but at the expense of Cloudwatch.GetMetricData API cost.

This parameter does not influence the polling of user supplied alarms that are always polled at each run.
"""
        },
        "cloudwatch.dashboard.use_default,Stable": {
            "DefaultValue": 1,
            "Format": "Bool",
            "Description": """Enable or disable the Cloudwatch dashboard for CloneSquad.

The dashboard is enabled by default.
"""
        },
        "cloudwatch.dashboard.update_interval": "hours=1",
        "cloudwatch.dashboard.snapshot_width": 1000,
        "cloudwatch.dashboard.snapshot_height": 400
    })
    Cfg.register({
        "cloudwatch.alarm00.configuration_url,Stable": {
            "DefaultValue": "",
            "Format": "MetaString",
            "Description": """Alarm specification to track for scaling decisions.

Ex: internal:ec2.scaleup.alarm-cpu-gt-75pc.yaml,Points=1001,BaselineThreshold=30.0

See [Alarm specification documentation](ALARMS_REFERENCE.md) for more details.
"""
        }
    })
    # One additional configuration slot per extra alarm allowed per instance.
    for i in range(1, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
        Cfg.register({
            "cloudwatch.alarm%02d.configuration_url,Stable" % i: {
                "DefaultValue": "",
                "Format": "MetaString",
                "Description": """See `cloudwatch.alarm00.configuration_url`.
"""
            }
        })
    self.register_metric([{
        "MetricName": "Cloudwatch.GetMetricData",
        "Unit": "Count",
        "StorageResolution": 60
    }])
    self.ec2.register_state_aggregates([{
        "Prefix": "cloudwatch.dashboard.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("cloudwatch.default_ttl"),
        "Exclude": []
    }])
def get_prerequisites(self):
    """Load alarm definitions, poll CloudWatch alarms and collect metric data.

    Performs, in order:
      1. Read all configured alarm templates ('cloudwatch.alarmNN.configuration_url').
      2. List existing CloudWatch alarms belonging to this CloneSquad group.
      3. Sanity-check 'alarmname:' definitions against existing alarms.
      4. Build a GetMetricData query for alarm metrics (rate-limited by the
         full-refresh period and the instance scale score) and for burstable
         instance CPUCreditBalance metrics.
      5. Execute the query in batches of 500, merge results with the metric
         cache, persist counters/cache, and attach metrics to alarm data and
         burstable instances.
    """
    now = self.context["now"]
    client = self.context["cloudwatch.client"]

    # Read all CloudWatch alarm templates into memory
    alarm_definitions = {}
    for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
        key = "cloudwatch.alarm%02d.configuration_url" % (i)
        r = Cfg.get_extended(key)
        if not r["Success"] or r["Value"] == "":
            continue
        d = misc.parse_line_as_list_of_dict(r["Value"])
        url = d[0]["_"]
        meta = d[0]
        index = "%02d" % i
        alarm_defs = {
            "Index": index,
            "Key": key,
            "Url": url,
            "Definition": r,
            "Metadata": meta
        }
        prefix = "alarmname:"
        if url.startswith(prefix):
            alarm_defs["AlarmName"] = url[len(prefix):]
        else:
            log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
            try:
                resp = misc.get_url(url.format(**self.context))
                if resp is None:
                    raise Exception("URL content = <None>")
                alarm_defs["Content"] = str(resp, "utf-8")
            except Exception as e:
                # BUGFIX: was '%e' which is a floating point conversion and
                # raises TypeError when applied to an exception object.
                log.exception("Failed to load Alarm definition '%s' : %s" % (r["Value"], e))
                continue
        alarm_definitions[index] = alarm_defs
    self.alarm_definitions = alarm_definitions

    # Read all existing CloudWatch alarms
    alarms = []
    response = None
    while (response is None or "NextToken" in response):
        response = client.describe_alarms(
            MaxRecords=Cfg.get_int("cloudwatch.describe_alarms.max_results"),
            NextToken=response["NextToken"] if response is not None else "")
        #log.debug(Dbg.pprint(response))
        for alarm in response["MetricAlarms"]:
            alarm_name = alarm["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is not None:
                # This is an alarm that belongs to this CloneSquad instance
                alarms.append(alarm)
    #log.debug(Dbg.pprint(alarms))
    self.alarms = alarms

    # Sanity check: every 'alarmname:' definition must reference a real alarm.
    for index in self.alarm_definitions.keys():
        alarm_def = self.alarm_definitions[index]
        if "AlarmName" not in alarm_def:
            continue
        alarm = next(filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"], self.alarms), None)
        if alarm is None:
            log.warning("Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!" %
                        (alarm_def["Definition"]["Key"], alarm_def["Definition"]["Value"],
                         alarm_def["Definition"]["Status"]))

    # Read all metrics associated with alarms
    # CloudWatch intense polling can be expensive: This algorithm links CW metric polling rate to the
    # scale rate => Under intense scale up condition, polling is aggresive. If not, it falls down
    # to one polling every 'cloudwatch.metrics.low_rate_polling_interval' seconds
    # TODO(@jcjorel): Avoid this kind of direct references to an upper level module!!
    integration_period = Cfg.get_duration_secs("ec2.schedule.horizontalscale.integration_period")
    instance_scale_score = self.ec2.get_integrated_float_state(
        "ec2.schedule.scaleout.instance_scale_score", integration_period)

    self.metric_cache = self.get_metric_cache()

    query = {"IdMapping": {}, "Queries": []}

    # Build query for Alarm metrics
    if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
        # Sort by oldest alarms first in cache
        cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
        valid_alarms = []
        for a in alarms:
            alarm_name = a["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is None or alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
                continue
            a["_SamplingTime"] = (self.get_metric_by_id(alarm_name)["_SamplingTime"]
                                  if alarm_name in cached_metric_names else str(misc.epoch()))
            valid_alarms.append(a)
        sorted_alarms = sorted(valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

        # We poll from the oldest to the newest and depending on the instance_scale_score
        # to limit CloudWatch GetMetricData costs
        time_for_full_metric_refresh = max(
            Cfg.get_duration_secs("cloudwatch.metrics.time_for_full_metric_refresh"), 1)
        app_run_period = Cfg.get_duration_secs("app.run_period")
        minimum_polled_alarms_per_run = Cfg.get_int("cloudwatch.metrics.minimum_polled_alarms_per_run")
        maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
        maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run, 1.0)
        weight = min(instance_scale_score, maximum_polled_alarms_per_run)
        max_alarms_for_this_run = max(minimum_polled_alarms_per_run,
                                      int(min(weight, 1.0) * len(sorted_alarms)))
        for alarm in sorted_alarms[:max_alarms_for_this_run]:
            alarm_name = alarm["AlarmName"]
            CloudWatch._format_query(query, alarm_name, alarm)

        # We always poll user supplied alarms
        for alarm in alarms:
            alarm_name = alarm["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is None:
                continue  # Unknown alarm name
            if not alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
                continue
            CloudWatch._format_query(query, alarm_name, alarm)

    # Query Metric for Burstable instances
    burstable_instances = self.ec2.get_burstable_instances(ScalingState="-error")
    last_collect_date = self.ec2.get_state_date("cloudwatch.metrics.last_burstable_metric_collect_date")
    if last_collect_date is None or (now - last_collect_date) > timedelta(minutes=1):
        for i in burstable_instances:
            instance_id = i["InstanceId"]
            if not self.ec2.is_static_subfleet_instance(instance_id) and \
                    self.ec2.get_scaling_state(instance_id) == "excluded":
                continue
            CloudWatch._format_query(query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                "MetricName": "CPUCreditBalance",
                "Namespace": "AWS/EC2",
                "Dimensions": [{
                    "Name": "InstanceId",
                    "Value": instance_id
                }],
                "Period": 300,
                "Statistic": "Average"
            })
        self.ec2.set_state("cloudwatch.metrics.last_burstable_metric_collect_date", now,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

    # Make request to CloudWatch (batched by the 500-query API limit)
    query_counter = self.ec2.get_state_int("cloudwatch.metric.query_counter", default=0)
    queries = query["Queries"]
    metric_results = []
    metric_ids = []
    no_metric_ids = []
    while len(queries) > 0:
        q = queries[:500]
        queries = queries[500:]
        results = []
        response = None
        while response is None or "NextToken" in response:
            args = {
                "MetricDataQueries": q,
                "StartTime": now - timedelta(seconds=Cfg.get_duration_secs("cloudwatch.metrics.data_period")),
                "EndTime": now
            }
            if response is not None:
                args["NextToken"] = response["NextToken"]
            response = client.get_metric_data(**args)
            results.extend(response["MetricDataResults"])
            query_counter += len(q)
        for r in results:
            if r["StatusCode"] != "Complete":
                log.error("Failed to retrieve metrics: %s" % q)
                continue
            metric_id = query["IdMapping"][r["Id"]]
            if len(r["Timestamps"]) == 0:
                if metric_id not in no_metric_ids:
                    no_metric_ids.append(metric_id)
                continue
            if metric_id not in metric_ids:
                metric_ids.append(metric_id)
            r["_MetricId"] = metric_id
            r["_SamplingTime"] = str(now)
            log.debug(r)
            metric_results.append(r)
    if len(no_metric_ids):
        log.info("No metrics returned for alarm '%s'" % no_metric_ids)

    # Merge with existing cache metric (keep stale entries younger than the
    # retention period that were not refreshed this run).
    metric_cache = self.metric_cache
    self.metric_cache = metric_results
    for m in metric_cache:
        max_retention_period = Cfg.get_duration_secs("cloudwatch.metrics.cache.max_retention_period")
        if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
            continue
        if (now - misc.str2utc(m["_SamplingTime"])).total_seconds() < max_retention_period:
            self.metric_cache.append(m)
    self.ec2.set_state("cloudwatch.metric.query_counter", query_counter,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.ec2.set_state_json("cloudwatch.metrics.cache", self.metric_cache,
                            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.set_metric("Cloudwatch.GetMetricData", query_counter)

    # Augment Alarm definitions and Instances with associated metrics
    for metric in self.metric_cache:
        metric_id = metric["_MetricId"]
        alarm_data = self.get_alarm_data_by_name(metric_id)
        if alarm_data is not None:
            alarm_data["MetricDetails"] = metric
            continue
        instance = next(filter(lambda i: "CPUCreditBalance/%s" % i["InstanceId"] == metric_id,
                               burstable_instances), None)
        if instance is not None:
            instance["_Metrics"] = {}
            instance["_Metrics"]["CPUCreditBalance"] = metric
            continue
def handler(self, event, context):
    """Lambda entrypoint for CloudWatch alarm SNS notifications.

    Decodes each SNS record, extracts the alarm transition details, persists
    them per alarm name in the AlarmStateEC2 DynamoDB table (one column set
    per NewStateValue), then wakes up the Main Lambda through SQS when at
    least one record was processed.
    """
    # Protect from bad data and keep only SNS messages
    if "Records" not in event:
        log.error("Not a valid SNS event")
        return
    sns_records = []
    for sns_msg in event["Records"]:
        if "EventSource" in sns_msg and sns_msg["EventSource"] == "aws:sns":
            try:
                sns_msg["_decoded_message"] = json.loads(sns_msg["Sns"]["Message"])
                sns_records.append(sns_msg)
            except Exception as e:
                log.exception("Failed to decode message %s : %s" % (sns_msg, e))
    log.debug(Dbg.pprint(sns_records))

    need_main_update = False

    # For each SNS records, we keep track of important data in
    # a DynamoDB table
    for sns_msg in sns_records:
        message = sns_msg["_decoded_message"]
        # Alarm payload timestamps carry a '+0000' suffix that fromisoformat()
        # (pre-3.11) cannot parse; strip it and force UTC.
        timestamp = datetime.fromisoformat(
            message["StateChangeTime"].replace("+0000", "")).replace(tzinfo=timezone.utc)
        alarm_name = message["AlarmName"]
        new_state_reason = message["NewStateReason"]
        new_state_value = message["NewStateValue"]
        namespace = message["Trigger"]["Namespace"]
        metric_name = message["Trigger"]["MetricName"]
        dimensions = message["Trigger"]["Dimensions"]
        instance_id = "None"
        try:
            # SNS alarm payloads use lowercase 'name'/'value' dimension keys.
            instance_id = next(filter(lambda dimension: dimension['name'] == 'InstanceId',
                                      message["Trigger"]["Dimensions"]))["value"]
        except Exception as e:
            log.exception("Failed to get InstanceId from dimension %s : %s" %
                          (message["Trigger"]["Dimensions"], e))
            continue
        now = misc.seconds_from_epoch_utc()
        # Column names are prefixed with the new state (e.g. ALARM_/OK_) so a
        # single item keeps the latest data for each state.
        response = self.context["dynamodb.client"].update_item(
            Key={"AlarmName": {
                'S': alarm_name
            }},
            UpdateExpression=
            "set InstanceId=:instanceid, %s_LastAlarmTimeStamp=:timestamp, %s_LastNewStateReason=:lastnewstatereason,"
            "%s_LastMetricName=:lastmetricname, %s_LastMetricNamespace=:lastmetricnamespace, "
            "%s_Event=:event,"
            "ExpirationTime=:expirationtime,"
            "LastRecordUpdateTime=:lastrecordupdatetime" %
            (new_state_value, new_state_value, new_state_value, new_state_value, new_state_value),
            ExpressionAttributeValues={
                ':instanceid': {
                    'S': instance_id
                },
                ':timestamp': {
                    'S': str(timestamp)
                },
                ':lastnewstatereason': {
                    'S': new_state_reason
                },
                ':lastmetricname': {
                    'S': metric_name
                },
                ':lastmetricnamespace': {
                    'S': namespace
                },
                ':event': {
                    'S': json.dumps(message)
                },
                ':expirationtime': {
                    'N': str(now + Cfg.get_duration_secs("snsmgr.record_expiration_delay"))
                },
                ':lastrecordupdatetime': {
                    'N': str(now)
                }
            },
            ReturnConsumedCapacity='TOTAL',
            TableName=self.context["AlarmStateEC2Table"],
        )
        need_main_update = True

    if need_main_update:
        # Send a message to wakeup the Main Lambda function that is in
        # charge to take appropriate decision
        sqs.call_me_back_send(self.ec2)
        log.debug("Sent SQS message to Main lambda queue: %s" % self.context["MainSQSQueue"])
def main_handler_entrypoint(event, context):
    """
    Main Lambda entrypoint: orchestrates one full CloneSquad scheduling run.

    Parameters
    ----------
    event: dict, required
    context: object, required Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------
    """
    #print(Dbg.pprint(event))

    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Main"

    init()

    # Global kill-switch: 'app.disable' stops all processing (except under SAM local).
    if Cfg.get_int("app.disable") != 0 and not misc.is_sam_local():
        log.warning("Application disabled due to 'app.disable' key")
        return

    no_is_called_too_early = False
    # Manage Spot interruption as fast as we can
    if sqs.process_sqs_records(event, function=ec2_schedule.manage_spot_notification, function_arg=ctx):
        log.info("Managed Spot Interruption SQS record!")
        # Force to run now disregarding `app.run_period` as we have at least one Spot instance to
        # remove from target groups immediately
        no_is_called_too_early = True

    # Check that we are not called too early
    # Note: We perform a direct read to the KVTable to spare initialization time when the
    # Lambda is called too early
    ctx["main.last_call_date"] = ctx["o_ec2"].get_state("main.last_call_date", direct=True)
    if ctx["main.last_call_date"] is None or ctx["main.last_call_date"] == "":
        ctx["main.last_call_date"] = str(misc.epoch())
    if not no_is_called_too_early and is_called_too_early():
        log.log(log.NOTICE, "Called too early by: %s" % event)
        notify.do_not_notify = True
        sqs.process_sqs_records(event)
        sqs.call_me_back_send()
        return

    log.debug("Load prerequisites.")
    load_prerequisites(["o_state", "o_notify", "o_ec2", "o_cloudwatch", "o_targetgroup",
                        "o_ec2_schedule", "o_scheduler", "o_rds"])

    # Remember 'now' as the last execution date
    ctx["o_ec2"].set_state("main.last_call_date", value=ctx["now"],
                           TTL=Cfg.get_duration_secs("app.default_ttl"))

    Cfg.dump()

    # Perform actions (order matters: targetgroups before scheduling, metrics
    # prepared before they are sent, dashboard last):
    log.debug("Main processing.")
    ctx["o_targetgroup"].manage_targetgroup()
    ctx["o_ec2_schedule"].schedule_instances()
    ctx["o_ec2_schedule"].stop_drained_instances()
    ctx["o_cloudwatch"].configure_alarms()
    ctx["o_rds"].manage_subfleet_rds()
    ctx["o_ec2_schedule"].prepare_metrics()

    ctx["o_cloudwatch"].send_metrics()
    ctx["o_cloudwatch"].configure_dashboard()

    # If we got woke up by SNS, acknowledge the message(s) now
    sqs.process_sqs_records(event)

    ctx["o_notify"].notify_user_arn_resources()

    # Call me back if needed
    sqs.call_me_back_send()
def _manage_targetgroup(self, targetgroup, running_instances, transitions):
    """Reconcile one ELBv2 targetgroup with the current fleet state.

    Emits state-transition events, registers running instances missing from the
    targetgroup, and deregisters instances that are no longer serving (with a
    jittered 'slow deregister' grace period while 'initial' instances exist).

    :param targetgroup:       targetgroup ARN to manage (string).
    :param running_instances: EC2 instance descriptions expected to serve traffic.
    :param transitions:       list mutated in-place; appended with dicts
                              {InstanceId, TargetGroupArn, PreviousState, NewState}.
    """
    now = self.context["now"]
    registered_targets = self.get_registered_targets(targetgroup)[0]

    # Generate events on instance state transition
    for instance in self.ec2.get_instances(ScalingState="-excluded"):
        instance_id = instance["InstanceId"]
        previous_state = self.get_instance_state(instance_id, targetgroup)
        if previous_state is None: previous_state = "None"
        target_instance = self.is_instance_registered(targetgroup, instance_id)
        current_state = target_instance['TargetHealth']["State"] if target_instance is not None else "None"
        if current_state != previous_state:
            transitions.append({
                "InstanceId": instance_id,
                "TargetGroupArn": targetgroup,
                "PreviousState" : previous_state,
                "NewState": current_state
            })
            self.set_instance_state(instance_id, targetgroup, current_state)

    # List instances that are running and not yet in the TargetGroup
    instance_ids_to_add = []
    for instance in running_instances:
        instance_id = instance["InstanceId"]
        if self.ec2.get_scaling_state(instance_id) in ["draining", "bounced", "error"]:
            continue
        target_instance = self.is_instance_registered(targetgroup, instance_id)
        if target_instance is None:
            instance_ids_to_add.append({'Id':instance_id})
            self.set_instance_state(instance_id, targetgroup, "None")
    if len(instance_ids_to_add) > 0:
        log.debug("Registering instance(s) in TargetGroup: %s" % instance_ids_to_add)
        for instance_id in instance_ids_to_add:
            try:
                response = R(lambda args, kwargs, r: r["ResponseMetadata"]["HTTPStatusCode"] == 200,
                        self.client_elbv2.register_targets,
                        TargetGroupArn=targetgroup,
                        Targets=[instance_id]
                )
            except Exception as e:
                # FIX: 'targetgroup' is the ARN string itself (see TargetGroupArn=targetgroup
                # above); the previous code subscripted it with ["TargetGroupArn"], raising a
                # TypeError inside this handler and masking the real failure. Also removed a
                # stray quote from the message.
                log.exception("Failed to register target '%s' in targetgroup '%s' : %s" %
                        (instance_id, targetgroup, e))
        self.state_changed = True
    if self.state_changed:
        return

    # When there are instances in initial state, we have to react slower to
    # misbehavior if 'initial' instances fail their health checks.
    slow_deregister = len(self.get_registered_instance_ids(state="initial")) != 0

    # List instances that are no more running but still in the TargetGroup
    delayed_deregister_instance_ids = []
    instance_ids_to_delete = []
    draining_instances = self.ec2.get_instances(ScalingState="excluded,draining,bounced,error")
    slow_deregister_timeout = int(Cfg.get_duration_secs("targetgroup.slow_deregister_timeout"))
    for instance in registered_targets:
        instance_id = instance["Target"]["Id"]
        instance = self.ec2.get_instance_by_id(instance_id)
        if self.is_instance_registered(targetgroup, instance_id, fail_if_draining=True) is None:
            continue
        if instance is None or instance["State"]["Name"] not in ["pending","running"] or instance_id in self.ec2.get_instance_ids(draining_instances):
            meta = {}
            self.ec2.get_scaling_state(instance_id, meta=meta)
            if meta["last_action_date"] is not None and slow_deregister:
                gap_secs = (now - meta["last_action_date"]).total_seconds()
                # Jittered grace period: postpone deregistration while within the
                # randomized fraction of the slow-deregister timeout.
                if gap_secs < (slow_deregister_timeout * random.random()):
                    if instance_id not in [ i["InstanceId"] for i in delayed_deregister_instance_ids]:
                        delayed_deregister_instance_ids.append({
                            "InstanceId": instance_id,
                            "Gap": gap_secs
                        })
                    continue
            instance_ids_to_delete.append({'Id':instance_id})
    for i in delayed_deregister_instance_ids:
        log.info("Slow deregister mode: Instance '%s' is waiting deregister for %d seconds... (targetgroup.slow_deregister_timeout=%s + jitter...)" %
                (i["InstanceId"], i["Gap"], slow_deregister_timeout))
    if len(instance_ids_to_delete) > 0:
        log.debug("Deregistering instance(s) in TargetGroup: %s" % instance_ids_to_delete)
        response = R(lambda args, kwargs, r: r["ResponseMetadata"]["HTTPStatusCode"] == 200,
                self.client_elbv2.deregister_targets,
                TargetGroupArn=targetgroup,
                Targets=instance_ids_to_delete
        )
        self.state_changed = True
def __init__(self, context, o_state):
    """EC2 module constructor: registers EC2-related configuration keys and
    the 'ec2.instance.' state aggregate.

    :param context: shared application context dict.
    :param o_state: state module instance used for KV-table aggregates.
    """
    self.context = context
    # Caches lazily filled by later calls (None until prerequisites are gathered).
    self.instances = None
    self.instance_ids = None
    self.instance_statuses = None
    self.prereqs_done = False
    self.o_state = o_state
    self.state_table = None
    # Register configuration keys; ',Stable' suffixed keys are part of the
    # user-facing documented API and their description text is published as-is.
    Cfg.register({
        "ec2.describe_instances.max_results": "250",
        "ec2.describe_instance_types.enabled": "0",
        "ec2.az.statusmgt.disable": 0,
        "ec2.az.unavailable_list,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """List of Availability Zone names (ex: *eu-west-3c*) or AZ Ids (ex: *euw3-az1*).

Typical usage is to force a fleet to consider one or more AZs as unavailable (AZ eviction). The autoscaler will then refuse to schedule new instances on these AZs. Existing instances in those AZs are left unchanged but on scalein condition will be shutdown in priority (see [`ec2.az.evict_instances_when_az_faulty`](#ec2azinstance_faulty_when_az_faulty) to change this behavior).

This setting can be used during an AWS LSE (Large Scale Event) to manually define that an AZ is unavailable.

> Note: CloneSquad also uses the EC2.describe_availability_zones() API to discover dynamically LSE events. So, setting directly this key should not be needed in most cases.

Please notice that, once an AZ is enabled again (either manually or automatically), instance fleet WON'T be rebalanced automatically:
* If Instance bouncing is enabled, the fleet will be progressively rebalanced (convergence time will depend on the instance bouncing setting)
* If instance bouncing is not configured, user can force a rebalancing by switching temporarily the fleet to `100%` during few minutes (with [`ec2.schedule.desired_instance_count`](#ec2scheduledesired_instance_count) sets temporarily to `100%`) and switch back to the original value.
"""
        },
        "ec2.az.evict_instances_when_az_faulty,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Defines if instances running in a AZ with issues must be considered 'unavailable'

By Default, instances running in an AZ reported with issues are left untouched and these instances will only be evicted if their invidual healthchecks fail or on scalein events.

Settting this parameter to 1 will force Clonesquad to consider all the instances running in faulty AZ as 'unavailable' and so forcing their immediate replacement in healthy AZs in the region.
"""
        },
        "ec2.state.default_ttl": "days=1",
        "ec2.state.error_ttl": "minutes=5",
        "ec2.state.status_ttl": "days=40",
        "ec2.state.error_instance_ids": "",
        "ec2.state.excluded_instance_ids": {
            "DefaultValue": "",
            "Format": "List of String",
            "Description": """List of instance ids to consider as excluded.

One of the 2 ways to exclude existant instances to be managed by CloneSquad, this key is a list of instance ids (ex: i-077b2ae6988f33de4;i-0564c45bfa5bb6aa5). The other way to exclude instances, is to tag instances with "clonesquad:excluded" key with value 'True'.
"""
        },
        "ec2.debug.availability_zones_impaired": "",
    })
    # Aggregate all 'ec2.instance.*' keys (compressed) except the scaling state
    # keys which need to stay individually addressable.
    self.o_state.register_aggregates([{
        "Prefix": "ec2.instance.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("ec2.state.default_ttl"),
        "Exclude": ["ec2.instance.scaling.state."]
    }])
def start_instances(self, instance_ids_to_start, max_started_instances=-1):
    """Start the supplied EC2 instances, optionally capped by a maximum count.

    Records 'last_start_attempt_date' per instance (used to detect instances
    with issues by placing them at end of get_instances() generated list),
    records 'last_start_date' on success, and places failing instances in the
    'error' scaling state for a while.

    :param instance_ids_to_start: list of instance ids (None or empty => no-op).
    :param max_started_instances: maximum instances to actually start;
        -1 (default) means unlimited. Decremented on each successful start.
    """
    if instance_ids_to_start is None or len(instance_ids_to_start) == 0:
        return
    now = self.context["now"]
    client = self.context["ec2.client"]
    for i in instance_ids_to_start:
        if max_started_instances == 0:
            break
        # Remember when we tried to start this instance. Used to detect instances with issues
        # by placing them at end of get_instances() generated list.
        self.set_state("ec2.instance.last_start_attempt_date.%s" % i, now,
                TTL=Cfg.get_duration_secs("ec2.schedule.state_ttl"))
        log.info("Starting instance %s..." % i)
        response = None
        try:
            response = R(lambda args, kwargs, r: r["ResponseMetadata"]["HTTPStatusCode"] == 200,
                    client.start_instances, InstanceIds=[i])
        except Exception as e:
            log.exception("Got Exception while trying to start instance '%s' : %s" % (i, e))
            # Mark the instance in error only if the status is not 'running':
            #   With Spot instances, from time-to-time, we catch an 'InsufficientCapacityError'
            #   even when the instance succeeded to start. We issue a describe_instances to
            #   check the real state of this instance to confirm/infirm the status.
            # FIX: the original confirmation was dead code — it was guarded by
            #   'if response is not None' (response is always None here since the assignment
            #   raised) and its retry predicate closed over the stale outer 'response'
            #   variable with wrong indexing instead of inspecting the reply 'r'.
            try:
                desc = R(lambda args, kwargs, r: "Reservations" in r and len(r["Reservations"][0]["Instances"]),
                        client.describe_instances, InstanceIds=[i])
            except Exception:
                desc = None
            if (desc is None or "Reservations" not in desc
                    or len(desc["Reservations"][0]["Instances"]) == 0
                    or desc["Reservations"][0]["Instances"][0]["State"]["Name"] not in ["pending", "running"]):
                self.set_scaling_state(i, "error", ttl=Cfg.get_duration_secs("ec2.state.error_ttl"))
            else:
                # The instance did start despite the exception: account for it as started.
                self.set_state("ec2.instance.last_start_date.%s" % i, now,
                        TTL=Cfg.get_duration_secs("ec2.state.status_ttl"))
                max_started_instances -= 1
            continue
        if response is not None:
            log.debug(Dbg.pprint(response))
            # Remember when we started these instances
            metadata = response["ResponseMetadata"]
            if metadata["HTTPStatusCode"] == 200:
                s = response["StartingInstances"]
                for r in s:
                    instance_id = r["InstanceId"]
                    previous_state = r["PreviousState"]
                    current_state = r["CurrentState"]
                    if current_state["Name"] in ["pending", "running"]:
                        self.set_state("ec2.instance.last_start_date.%s" % instance_id, now,
                                TTL=Cfg.get_duration_secs("ec2.state.status_ttl"))
                        max_started_instances -= 1
                    else:
                        log.error("Failed to start instance '%s'! Blacklist it for a while... (pre/current status=%s/%s)" %
                                (instance_id, previous_state["Name"], current_state["Name"]))
                        self.set_scaling_state(instance_id, "error",
                                ttl=Cfg.get_duration_secs("ec2.state.error_ttl"))
                        R(None, self.instance_in_error, Operation="start", InstanceId=instance_id,
                                PreviousState=previous_state["Name"], CurrentState=current_state["Name"])
            else:
                log.error("Failed to call start_instances: %s" % i)
def get_prerequisites(self):
    """Scheduler module prerequisites: resolve timezone, load the scheduler
    KV table, then reconcile CloudWatch Events rules ('CS-Cron-<group>-*')
    with the expected event definitions (create missing, garbage-collect
    obsolete), bounded per run by 'cron.max_rules_per_batch'.
    """
    if Cfg.get_int("cron.disable"):
        return
    # Get Timezone related info: explicit 'TimeZone' env var wins, then the
    # per-region mapping from region-timezones.yaml, then UTC.
    self.timezones = yaml.safe_load(misc.get_url("internal:region-timezones.yaml"))
    self.tz = os.getenv("TimeZone")
    self.tz = self.timezones.get(self.context["AWS_DEFAULT_REGION"]) if (self.tz is None or self.tz == "") else self.tz
    self.tz = self.tz if self.tz else "UTC"
    self.local_now = arrow.now(self.tz) # Get local time (with local timezone)
    self.utc_offset = self.local_now.utcoffset()
    self.dst_offset = self.local_now.dst()
    log.log(log.NOTICE, "Current timezone offset to UTC: %s, DST: %s, TimeZone: %s" %
        (self.utc_offset, self.dst_offset, self.tz))
    # Load scheduler KV table
    self.scheduler_table = kvtable.KVTable.create(self.context, self.context["SchedulerTable"],
            cache_max_age=Cfg.get_duration_secs("scheduler.cache.max_age"))
    # Compute event names
    self.load_event_definitions()
    # Read all existing event rules
    client = self.context["events.client"]
    params = {
        "NamePrefix": "CS-Cron-%s-" % (self.context["GroupName"]),
        "Limit": 10
    }
    self.rules = []
    paginator = client.get_paginator('list_rules')
    response_iterator = paginator.paginate(**params)
    for response in response_iterator:
        if "Rules" in response:
            self.rules.extend(response["Rules"])

    max_rules_per_batch = Cfg.get_int("cron.max_rules_per_batch")
    # Create missing rules
    expected_rule_names = [r["Name"] for r in self.event_names]
    existing_rule_names = [r["Name"] for r in self.rules]
    for r in expected_rule_names:
        if r not in existing_rule_names:
            # NOTE(review): the counter is decremented before the check, so at most
            # (max_rules_per_batch - 1) rules are created per pass — confirm intended.
            max_rules_per_batch -= 1
            if max_rules_per_batch <= 0:
                break
            rule_def = self.get_ruledef_by_name(r)
            schedule_spec = rule_def["Data"][0]["schedule"]
            schedule_expression = self.process_cron_expression(schedule_spec)
            log.log(log.NOTICE, f"Creating {r} {schedule_spec} => {schedule_expression}...")
            # In order to remove burden on user, we perform a sanity check about a wellknown
            # limitation of Cloudwatch.
            if schedule_expression.startswith("cron("):
                expr = [i for i in schedule_expression.replace("(", " ").replace(")", " ").split(" ") if i != ""]
                if len(expr) != 7:
                    log.warn("Schedule rule '%s' has an invalid cron expression '%s' (too short cron syntax)! Ignore it..." %
                        (rule_def["EventName"], schedule_expression))
                    continue
                # CloudWatch cron forbids '*'/values in both Day-of-month and Day-of-week.
                if (expr[5] != '?' and not expr[3] == '?') or (expr[3] != '?' and not expr[5] == '?'):
                    log.warn("Schedule rule '%s' has an invalid cron expression '%s'. "
                        "You can't specify the Day-of-month and Day-of-week fields in the same cron expression. If you specify a value (or a *) in one of the fields, you must use a ? (question mark) in the other. "
                        "" % (rule_def["EventName"], schedule_expression))
                    continue
            # Update Cloudwatch rule
            try:
                response = client.put_rule(
                    Name=r,
                    Description="Schedule Event '%s': %s" % (rule_def["EventName"], rule_def["Event"]),
                    RoleArn=self.context["CloudWatchEventRoleArn"],
                    ScheduleExpression=schedule_expression,
                    State='ENABLED')
                log.debug("put_rule: %s" % response)
            except Exception as e:
                log.exception("Failed to create scheduler event '%s' (%s) : %s" %
                    (r, schedule_expression, e))
            try:
                response = client.put_targets(
                    Rule=r,
                    Targets=[{
                        'Arn': self.context["InteractLambdaArn"],
                        'Id': "id%s" % r,
                    }])
                log.debug("put_targets: %s" % response)
            except Exception as e:
                log.exception("Failed to set targets for event rule '%s' : %s" % (r, e))

    # Garbage collect obsolete rules
    for r in existing_rule_names:
        if r not in expected_rule_names:
            max_rules_per_batch -= 1
            if max_rules_per_batch <= 0:
                break
            try:
                client.remove_targets(Rule=r, Ids=["id%s" % r])
                client.delete_rule(Name=r)
            except Exception as e:
                log.exception("Failed to delete rule '%s' : %s" % (r, e))
def get_prerequisites(self):
    """RDS module prerequisites: discover DB instances/clusters tagged with
    'clonesquad:group-name' for this group, register RDS config keys, state
    aggregates and CloudWatch metrics, and dynamically register per-subfleet
    'staticfleet.<name>.state' keys.
    """
    rds_client = self.context["rds.client"]
    tagging_client = self.context["resourcegroupstaggingapi.client"]
    self.databases = {"db": [], "cluster": []}
    # Iterate over a snapshot of keys: the loop adds '<type>.tags' entries to the dict.
    for db_type in list(self.databases.keys()):
        # Discover group-tagged RDS resources of this type through the tagging API.
        paginator = tagging_client.get_paginator('get_resources')
        tag_mappings = itertools.chain.from_iterable(
            page['ResourceTagMappingList']
            for page in paginator.paginate(
                ResourceTypeFilters=["rds:%s" % db_type],
                TagFilters=[{
                    'Key': 'clonesquad:group-name',
                    'Values': [self.context["GroupName"]]
                }]))
        self.databases["%s.tags" % db_type] = list(tag_mappings)
        if len(self.databases["%s.tags" % db_type]) == 0:
            continue
        if db_type == "cluster":
            func = rds_client.describe_db_clusters
            filter_key = "db-cluster-id"
            response_index = "DBClusters"
        if db_type == "db":
            func = rds_client.describe_db_instances
            filter_key = "db-instance-id"
            response_index = "DBInstances"
        try:
            # Describe all discovered resources of this type in a single call,
            # filtering by the ARNs returned by the tagging API.
            self.databases[db_type].extend(
                func(Filters=[{
                    'Name': filter_key,
                    'Values': [t["ResourceARN"] for t in self.databases["%s.tags" % db_type]]
                }])[response_index])
        except Exception as e:
            log.exception("Failed to describe RDS database type '%s'" % (db_type))
    #log.debug(Dbg.pprint(self.databases))
    Cfg.register({
        "rds.state.default_ttl": "hours=2",
        "rds.metrics.time_resolution": "60",
    })
    self.state_table = self.o_state.get_state_table()
    self.state_table.register_aggregates([{
        "Prefix": "rds.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("rds.state.default_ttl"),
        "Exclude": []
    }])
    metric_time_resolution = Cfg.get_int("rds.metrics.time_resolution")
    if metric_time_resolution < 60:
        metric_time_resolution = 1 # Switch to highest resolution
    self.cloudwatch.register_metric([
        {"MetricName": "StaticFleet.RDS.Size", "Unit": "Count", "StorageResolution": metric_time_resolution},
        {"MetricName": "StaticFleet.RDS.AvailableDBs", "Unit": "Count", "StorageResolution": metric_time_resolution},
        {"MetricName": "StaticFleet.RDS.StoppingDBs", "Unit": "Count", "StorageResolution": metric_time_resolution},
        {"MetricName": "StaticFleet.RDS.StartingDBs", "Unit": "Count", "StorageResolution": metric_time_resolution},
    ])
    # We need to register dynamically static subfleet configuration keys to avoid a 'key unknown'
    # warning when the user is going to set it
    static_subfleet_names = self.get_rds_subfleet_names()
    for static_fleet in static_subfleet_names:
        key = "staticfleet.%s.state" % static_fleet
        if not Cfg.is_builtin_key_exist(key):
            Cfg.register({key: ""})
    log.log(log.NOTICE, "Detected following static subfleet names across RDS resources: %s" %
        static_subfleet_names)
def get_prerequisites(self):
    """ Gather instance status by calling SSM APIs.

    Resolves the configured SSM Maintenance Window names (global, main fleet,
    per-subfleet), fetches matching enabled windows (describe in batches of 20
    names), filters out windows missing the 'clonesquad:group-name' tag, then
    refreshes pending SSM command results and maintenance-window housekeeping.
    """
    if not Cfg.get_int("ssm.enable"):
        log.log(log.NOTICE, "SSM support is currently disabled. Set ssm.enable to 1 to enabled it.")
        return
    now = self.context["now"]
    self.ttl = Cfg.get_duration_secs("ssm.state.default_ttl")
    GroupName = self.context["GroupName"]
    misc.initialize_clients(["ssm"], self.context)
    client = self.context["ssm.client"]
    # Retrive all SSM maintenace windows applicable to this CloneSquad deployment
    mw_names = {
        "__globaldefault__": {},
        "__default__": {},
        "__main__": {},
        "__all__": {}
    }
    fmt = self.context.copy()
    mw_names["__globaldefault__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.global_defaults", fmt=fmt)
    mw_names["__default__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.defaults", fmt=fmt)
    mw_names["__main__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.mainfleet.defaults", fmt=fmt)
    mw_names["__all__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.subfleet.__all__.defaults", fmt=fmt)
    # NOTE(review): 'all_mw_names' aliases (not copies) the '__globaldefault__' list,
    # so the extend() calls below also grow mw_names["__globaldefault__"]["Names"]
    # — confirm this is intended (it affects the tag-exemption test further down).
    all_mw_names = mw_names["__globaldefault__"]["Names"]
    all_mw_names.extend([n for n in mw_names["__default__"]["Names"] if n not in all_mw_names])
    all_mw_names.extend([n for n in mw_names["__main__"]["Names"] if n not in all_mw_names])
    all_mw_names.extend([n for n in mw_names["__all__"]["Names"] if n not in all_mw_names])
    Cfg.register({
        f"ssm.feature.maintenance_window.subfleet.__all__.force_running":
            Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
        f"ssm.feature.events.ec2.scaling_state_changes.draining.__main__.connection_refused_tcp_ports":
            Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
    })
    # Register per-subfleet specializations of the generic '{SubfleetName}' keys.
    for SubfleetName in self.o_ec2.get_subfleet_names():
        fmt["SubfleetName"] = SubfleetName
        mw_names[f"Subfleet.{SubfleetName}"] = {}
        Cfg.register({
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults"),
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count"),
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
            f"ssm.feature.events.ec2.scaling_state_changes.draining.{SubfleetName}.connection_refused_tcp_ports":
                Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
        })
        mw_names[f"Subfleet.{SubfleetName}"]["Names"] = Cfg.get_list(f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults", fmt=fmt)
        all_mw_names.extend([n for n in mw_names[f"Subfleet.{SubfleetName}"]["Names"] if n not in all_mw_names])

    names = all_mw_names
    mws = []
    # describe_maintenance_windows Name filter values are sent in chunks of 20.
    while len(names):
        paginator = client.get_paginator('describe_maintenance_windows')
        response_iterator = paginator.paginate(
            Filters=[
                {
                    'Key': 'Name',
                    'Values': names[:20]
                },
            ])
        for r in response_iterator:
            for wi in r["WindowIdentities"]:
                if not wi["Enabled"]:
                    log.log(log.NOTICE, f"SSM Maintenance Window '%s' not enabled. Ignored..." % wi["Name"])
                    continue
                if "NextExecutionTime" not in wi:
                    log.log(log.NOTICE, f"/!\ SSM Maintenance Window '%s' without 'NextExecutionTime'." % wi["Name"])
                if wi not in mws:
                    mws.append(wi)
        names = names[20:]
    # Make string dates as object dates
    for d in mws:
        if "NextExecutionTime" in d:
            d["NextExecutionTime"] = misc.str2utc(d["NextExecutionTime"])

    # Retrieve Maintenace Window tags with the resourcegroup API
    tagged_mws = self.context["o_state"].get_resources(service="ssm", resource_name="maintenancewindow")
    for tmw in tagged_mws:
        mw_id = tmw["ResourceARN"].split("/")[1]
        mw = next(filter(lambda w: w["WindowId"] == mw_id, mws), None)
        if mw:
            mw["Tags"] = tmw["Tags"]
    valid_mws = []
    for mw in mws:
        mw_id=mw["WindowId"]
        # Fallback: fetch tags directly when the resourcegroup API did not return them.
        if "Tags" not in mw:
            try:
                response = client.list_tags_for_resource(ResourceType='MaintenanceWindow', ResourceId=mw_id)
                mw["Tags"] = response['TagList'] if 'TagList' in response else []
            except Exception as e:
                log.error(f"Failed to fetch Tags for MaintenanceWindow '{mw_id}'")
        # Untagged windows are only accepted when they belong to the global default list.
        if ("Tags" not in mw or not len(mw["Tags"])) and mw["Name"] not in mw_names["__globaldefault__"]["Names"]:
            log.warning(f"Please tag SSM Maintenance Window '%s/%s' with 'clonesquad:group-name': '%s'!" %
                (mw["Name"], mw["WindowId"], self.context["GroupName"]))
            continue
        valid_mws.append(mw)

    self.maintenance_windows = {
        "Names": mw_names,
        "Windows": valid_mws
    }

    # Update asynchronous results from previously launched commands
    self.update_pending_command_statuses()
    # Perform maintenance window house keeping
    self.manage_maintenance_windows()
    if len(mws):
        log.log(log.NOTICE, f"Found matching SSM maintenance windows: %s" % self.maintenance_windows["Windows"])

    # Hard dependency toward EC2 module. We update the SSM instance initializing states
    self.o_ec2.update_ssm_initializing_states()
def __init__(self, context):
    """SSM module constructor: registers all 'ssm.*' configuration keys
    (feature toggles, maintenance-window defaults, state TTLs) and the
    'ssm.events' state aggregate.

    :param context: shared application context dict (provides o_state and o_ec2).
    """
    self.context = context
    self.o_state = self.context["o_state"]
    self.maintenance_windows = {}
    self.o_ec2 = self.context["o_ec2"]
    GroupName = self.context["GroupName"]
    # ',Stable' suffixed keys are part of the user-facing documented API; their
    # Description markdown is published as-is.
    Cfg.register({
        "ssm.enable,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Enable globally support for AWS System Manager by CloneSquad.

CloneSquad can leverage AWS SSM to take into account Maintenance Windows and use SSM RunCommand to execute status probe scripts located in managed instances.
"""
        },
        "ssm.feature.events.ec2.maintenance_window_period,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Enable/Disable sending Enter/Exit Maintenance Window period events to instances.

This enables event notification support of instances when they enter or exit a SSM Maintenance Window. When set to 1, CloneSquad sends a SSM RunCommand to run the script `/etc/cs-ssm/(enter|exit)-maintenance-window-period` script located in each instances. The event is repeasted until the script returns a zero-code. If the script doesn't exist on an instance, the event is sent only once.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
"""
        },
        "ssm.feature.events.ec2.instance_ready_for_shutdown,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Ensure instance shutdown readiness with /etc/cs-ssm/instance-ready-for-shutdown script on SSM managed instances."

This enables support for direct sensing of instance shutdown readiness based on the return code of a script located in each EC2 instances. When set to 1, CloneSquad sends a SSM RunCommand to a managed instance candidate prior to shutdown:
* If `/etc/cs-ssm/instance-ready-for-shutdown` is present, it is executed with the SSM agent daemon user rights: If the script returns a NON-zero code, Clonesquad will postpone the instance shutdown and will call this script again after 2 * [ `app.run_period`](#apprun_period) seconds...
* If `/etc/cs-ssm/instance-ready-for-shutdown` is NOT present, immediate shutdown readyness is assumed.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
"""
        },
        "ssm.feature.events.ec2.instance_ready_for_shutdown.max_shutdown_delay,Stable": {
            "DefaultValue": "hours=1",
            "Format": "Duration",
            "Description": """ Maximum time to spend waiting for SSM based ready-for-shutdown status.

When SSM support is enabled with [`ssm.feature.events.ec2.instance_ready_for_operation`](#ssmfeatureec2instance_ready_for_operation), instances may notify CloneSquad when they are ready for shutdown. This setting defines the maximum time spent by CloneSquad to receive this signal before to forcibly shutdown the instance.
"""
        },
        "ssm.feature.events.ec2.instance_ready_for_operation,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Ensure an instance go out from 'initializing' state based on an instance script returns code.

This enables support for direct sensing of instance **serving** readiness based on the return code of a script located in each EC2 instances. CloneSquad never stops an instance in the 'initializing' state. This state is normally automatically left after [`ec2.schedule.start.warmup_delay`](#ec2schedulestartwarmup_delay) seconds: When this setting is set, an SSM command is sent to each instance and call a script to get a direct ack that an instance can left the 'initializing' state.

* If `/etc/cs-ssm/instance-ready-for-operation` is present, it is executed with the SSM agent daemon user rights: If the script returns a NON-zero code, Clonesquad will postpone the instance go-out from 'initializing' state and will call this script again after 2 * [ `app.run_period`](#apprun_period) seconds...
* If `/etc/cs-ssm/instance-ready-for-operation` is NOT present, the instance leaves the 'initializing' state immediatly after 'warmup delay'..

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
"""
        },
        "ssm.feature.events.ec2.instance_ready_for_operation.max_initializing_time,Stable": {
            "DefaultValue": "hours=1",
            "Format": "Duration",
            "Description": """Max time that an instance can spend in 'initializing' state.

When [`ssm.feature.events.ec2.instance_ready_for_operation`](#ssmfeatureec2instance_ready_for_operation) is set, this setting defines the maximum duration that CloneSquas will attempt to get a status 'ready-for-operation' for a specific instance through SSM RunCommand calls and execution of the `/etc/cs-ssm/instance-ready-for-operation` script.
"""
        },
        "ssm.feature.events.ec2.scaling_state_changes,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Call a script in instance when the instance scaling state changes.

When this toggle set, the script `/etc/cs-ssm/instance-scaling-state-change` located into managed instances, is called to notify about a scaling status change. Currently, only `draining` and `bounced` events are sent (`bounced`is sent only if the instance bouncing feature is activated). For example, if an instance enters the `draining` state because CloneSquad wants to shutdown it, this event is called.

* If the script doesn't exists, the event is sent only once,
* If the script returns a non-zero code, the event will be repeated.

> Note: This event differs from [`ssm.feature.events.ec2.instance_ready_for_shutdown`](#ssmfeatureeventsec2instance_ready_for_shutdown) one as it is only meant to inform the instance about a status change. The [`ssm.feature.events.ec2.instance_ready_for_shutdown`](#ssmfeatureeventsec2instance_ready_for_shutdown) event is a request toward the instance asking for an approval to shutdown.
"""
        },
        "ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """On `draining` state, specified ports are blocked and so forbid new TCP connections (i.e. *Connection refused* message).

This features installs, **on `draining` time**, temporary iptables chain and rules denying new TCP connections to the specified port list. This is useful, for example, to break a healthcheck life line as soon as an instance enters the `draining` state: It is especially useful when non-ELB LoadBalancers are used and CloneSquad does not know how to tell these loadbalancers that no more traffic needs to be sent to a drained instance. As it blocks only new TCP connections, currently active connections can terminate gracefully during the draining period.

> When instances are served only by CloneSquad managed ELB(s), there is no need to use this feature as CloneSquad will unregister the targets as soon as placed in `draining`state.

By default, no blocked port list is specified, so no iptables call is performed on the instance.
"""
        },
        # NOTE(review): '{SubfleeName}' below is misspelled compared to every other
        # per-subfleet key pattern ('{SubfleetName}') — looks like a typo; left as-is
        # because the key string is user-facing and may be referenced elsewhere with
        # the same spelling. Confirm before fixing.
        "ssm.feature.events.ec2.scaling_state_changes.draining.{SubfleeName}.connection_refused_tcp_ports,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """Defines the blocked TCP port list for the specified fleet.

This setting overrides the value defined in [`ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports`](#ssmfeatureeventsec2scaling_state_changesdrainingconnection_refused_tcp_ports) for the specified fleet.

> Use `__main__` to designate the main fleet."""
        },
        "ssm.feature.events.ec2.instance_healthcheck": "0",
        "ssm.feature.maintenance_window,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Defines if SSM maintenance window support is activated.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
"""
        },
        "ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running,Stable": {
            "DefaultValue": "1",
            "Format": "Bool",
            "Description": """Defines if a subfleet is forcibly set to 'running' when a maintenance window is actice.

By default, all the subfleets is woken up by a maintenance window ([`subfleet.{SubfleetName}.state`](#subfleetsubfleetnamestate) is temprarily forced to `running`).
""",
        },
        "ssm.state.default_ttl": "hours=1",
        "ssm.state.command.default_ttl": "minutes=10",
        "ssm.state.command.result.default_ttl": "minutes=5",
        "ssm.feature.maintenance_window.start_ahead,Stable": {
            "DefaultValue": "minutes=15",
            "Format": "Duration",
            "Description": """Start instances this specified time ahead of the next Maintenance Window.

In order to ensure that instances are up and ready when a SSM Maintenance Window starts, they are started in advance of the 'NextExecutionTime' defined in the SSM maintenance window object.
"""
        },
        "ssm.feature.maintenance_window.start_ahead.max_jitter": "66%",
        "ssm.feature.maintenance_window.global_defaults": "CS-GlobalDefaultMaintenanceWindow",
        "ssm.feature.maintenance_window.defaults": "CS-{GroupName}",
        "ssm.feature.maintenance_window.mainfleet.defaults": "CS-{GroupName}-Mainfleet",
        "ssm.feature.maintenance_window.mainfleet.ec2.schedule.min_instance_count": {
            "DefaultValue": "100%",
            "Format": "IntegerOrPercentage",
            "Description": """Minimum number of instances serving in the fleet when the Maintenance Window occurs.

> Note: If this value is set to the special value '100%', the setting [`ec2.schedule.desired_instance_count`](#ec2scheduledesired_instance_count) is also forced to '100%'. This implies that any LightHouse instances will also be started and full fleet stability ensured during the Maintenance Window.
"""
        },
        "ssm.feature.maintenance_window.subfleet.__all__.defaults": "CS-{GroupName}-Subfleet.__all__",
        "ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults": "CS-{GroupName}-Subfleet.{SubfleetName}",
        "ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count": {
            "DefaultValue": "100%",
            "Format": "IntegerOrPercentage",
            "Description": """Minimum number of instances serving in the fleet when the Maintenance Window occurs.

> Note: If this value is set to the special value '100%', the setting [`subfleet.{subfleet}.ec2.schedule.desired_instance_count`](#subfleetsubfleetec2scheduledesired_instance_count) is also forced to '100%' ensuring full subfleet stability.
"""
        },
    })
    # All 'ssm.events*' keys are stored compressed in a single aggregate record.
    self.o_state.register_aggregates([
        {
            "Prefix": "ssm.events",
            "Compress": True,
            "DefaultTTL": Cfg.get_duration_secs("ssm.state.default_ttl"),
            "Exclude" : []
        },
    ])
def send_commands(self):
    """Send pending SSM RunCommand requests to all targeted, online instances.

    Workflow (all state kept in self.run_cmd_states, persisted at the end):
      1. Drop completed or expired commands from the pending list.
      2. Drop expired entries from the per-instance "FormerResults" cache.
      3. For each command in self.commands_to_send, group target instances by
         platform and send the helper shell script (with parameter
         substitutions) in batches of at most 50 instances (SSM API limit).

    No-op when `ssm.enable` is 0. Send failures are logged and skipped
    (best-effort); the remaining batches are still attempted.
    """
    if not Cfg.get_int("ssm.enable"):
        return
    client = self.context["ssm.client"]
    # Per-platform execution profile: SSM document to run, the helper shell
    # script template fetched from the internal resource, and the list of
    # instance ids accumulated for that platform.
    # NOTE(review): only "Linux" is supported here; other platforms are
    # rejected below with a warning.
    refs = {
        "Linux": {
            "document": "AWS-RunShellScript",
            "shell": [s.rstrip() for s in io.StringIO(str(misc.get_url("internal:cs-ssm-agent.sh"), "utf-8")).readlines()],
            "ids": [],
        }
    }
    # Purge already replied results
    valid_cmds = []
    for cmd in self.run_cmd_states["Commands"]:
        # Keep only commands that are neither complete nor past expiration.
        if cmd.get("Complete") or cmd["Expiration"] < misc.seconds_from_epoch_utc():
            continue
        valid_cmds.append(cmd)
    self.run_cmd_states["Commands"] = valid_cmds
    # Purge outdated former results
    former_results = self.run_cmd_states["FormerResults"]
    # Iterate over copies of the key lists since we delete entries in place.
    for i in list(former_results.keys()):
        for cmd in list(former_results[i].keys()):
            if former_results[i][cmd]["Expiration"] < misc.seconds_from_epoch_utc():
                del former_results[i][cmd]
        # Remove the instance entry entirely once it holds no results.
        if len(former_results[i].keys()) == 0:
            del former_results[i]

    # Send commands
    for cmd in self.commands_to_send:
        # Group target instances by platform type, skipping offline instances
        # and platforms we have no execution profile for.
        platforms = {}
        for i in cmd["InstanceIds"]:
            info = self.is_instance_online(i)
            if info is None:
                continue
            platform_type = info["PlatformType"]
            pltf = refs.get(platform_type)
            if pltf is None:
                log.warning("Can't run a command on an unsupported platform : %s" % info["PlatformType"])
                continue # Unsupported platform
            if platform_type not in platforms:
                # Deep copy so the shared template's "ids" list is not mutated.
                platforms[platform_type] = copy.deepcopy(pltf)
            if i not in platforms[platform_type]["ids"]:
                platforms[platform_type]["ids"].append(i)

        command = cmd["Command"]
        args = cmd["CommandArgs"]
        for p in platforms:
            pltf = platforms[p]
            instance_ids = pltf["ids"]
            if not len(instance_ids):
                continue
            document = pltf["document"]
            shell = pltf["shell"]
            i_ids = instance_ids
            # Perform string parameter substitutions in the helper script
            shell_input = [l.replace("##Cmd##", command) for l in shell]
            shell_input = [l.replace("##ApiGwUrl##", self.context["InteractAPIGWUrl"]) for l in shell_input]
            if isinstance(args, str):
                # Simple string argument: substitute it verbatim.
                shell_input = [l.replace("##Args##", args) for l in shell_input]
            else:
                # Dict arguments: "Args" (if any) fills ##Args##, then every
                # key K is substituted for its ##K## placeholder.
                shell_input = [l.replace("##Args##", args["Args"] if "Args" in args
                                         else "") for l in shell_input]
                for s in args:
                    shell_input = [l.replace(f"##{s}##", str(args[s])) for l in shell_input]

            # SSM SendCommand accepts at most 50 instance ids per call:
            # walk the id list in slices of 50.
            while len(i_ids):
                log.log(log.NOTICE, f"SSM SendCommand({p}): {command}({args}) to %s." % i_ids[:50])
                try:
                    response = client.send_command(
                        InstanceIds=i_ids[:50],
                        DocumentName=document,
                        TimeoutSeconds=cmd["Timeout"],
                        Comment=cmd["Comment"],
                        Parameters={
                            'commands': shell_input,
                            'executionTimeout': [str(cmd["Timeout"])]
                        },
                        MaxConcurrency='100%',
                        MaxErrors='100%',
                        CloudWatchOutputConfig={
                            'CloudWatchLogGroupName': self.context["SSMLogGroup"],
                            'CloudWatchOutputEnabled': True
                        }
                    )
                    # Track the in-flight command so its invocations can be
                    # polled later (see update_pending_command_statuses).
                    self.run_cmd_states["Commands"].append({
                        "Id": response["Command"]["CommandId"],
                        "InstanceIds": i_ids[:50],
                        "ReceivedInstanceIds": [],
                        "Command": command,
                        "CommandArgs": args,
                        "Results": {},
                        "Expiration": misc.seconds_from_epoch_utc() + Cfg.get_duration_secs("ssm.state.command.default_ttl")
                    })
                    log.log(log.NOTICE, f"SSM RunCommand (Id:%s) : {command}({args})" % response["Command"]["CommandId"])
                except Exception as e:
                    # Under rare circumstance, we can receive an Exception while trying to send
                    log.log(log.NOTICE, f"Failed to do SSM SendCommand : {e}, %s" % i_ids[:50])
                i_ids = i_ids[50:]
    # Persist the (compressed) pending-command state.
    self.o_state.set_state_json("ssm.events.run_commands", self.run_cmd_states, compress=True, TTL=self.ttl)
def _record_call(need_shortterm_record, is_success_func, f, *args, **kwargs):
    """Invoke f(*args, **kwargs) and record the call as a notification event.

    The wrapped call is traced with X-Ray, its input/output/exception captured,
    enriched with EC2/TargetGroup metadata, and written to the short-term
    and/or long-term DynamoDB event tables. Long-term records may also embed
    CloudWatch dashboard snapshots and trigger a debug report export to S3.

    Args:
        need_shortterm_record: Write the event to the short-term event table.
        is_success_func:       Optional callable(args, kwargs, result) -> bool;
                               a falsy verdict (or a raised exception in f)
                               promotes the event to the long-term table.
        f:                     The function to call and record.

    Returns:
        Whatever f returns.

    Raises:
        Re-raises any exception raised by f, after recording it.
    """
    global records
    global notify_mgr
    record = {}
    record["EventType"] = f.__name__
    record["Input"] = {"*args": list(args), "**kwargs": dict(kwargs)}
    managed_exception = None

    xray_recorder.begin_subsegment("notifycall-call:%s" % f.__name__)
    try:
        r = f(*args, **kwargs)
        record["Output"] = json.dumps(r, default=str)
    except Exception as e:
        managed_exception = e
        record["Except"] = {
            "Exception": traceback.format_exc(),
            # NOTE: key typo ("Stackstrace") kept as-is: consumers may rely on it.
            "Stackstrace": traceback.extract_stack(),
            "Reason": json.dumps(e, default=str)
        }
    xray_recorder.end_subsegment()

    if managed_exception is not None:
        # Persist now all aggregated data to not lose them
        xray_recorder.begin_subsegment("notifycall-persist_aggregates:%s" % f.__name__)
        try:
            KVTable.persist_aggregates()
        except Exception as e:
            log.exception("Failed to persist aggregated data!")
        xray_recorder.end_subsegment()

    if notify_mgr is None or do_not_notify:
        # Event recording disabled (or not yet initialized): just propagate.
        log.debug(
            "Do not write Event in event table: notify_mgr=%s, do_not_notify=%s" %
            (notify_mgr, do_not_notify))
        if managed_exception is not None:
            raise managed_exception
        return r

    ctx = notify_mgr.context

    # Decide whether this event deserves a long-term record: either the call
    # threw, or the supplied success predicate judged the call a failure.
    # (Without a predicate, only the short-term table is considered.)
    try:
        if is_success_func is not None:
            need_longterm_record = managed_exception is not None or not is_success_func(args, kwargs, r)
        else:
            need_longterm_record = False
    except Exception as e:
        log.exception(
            "Got an exception while assessing long term event management : %s" % e)
        need_longterm_record = True

    # Try to catch the maximum available metadata to ease later diagnosis
    # Protect against exceptions to ensure proper logging
    record["Metadata"] = {}
    xray_recorder.begin_subsegment("notifycall-build_metadata:%s" % f.__name__)
    try:
        notify_mgr.ec2.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["EC2"] = {
            "AllInstanceDetails": notify_mgr.ec2.get_instances(),
            "AllInstanceStatuses": notify_mgr.ec2.get_instance_statuses(),
            "DrainingInstances": [
                i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="draining")
            ],
            "BouncedInstances": [
                i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="bounced")
            ],
            "ExcludedInstances": [
                i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="excluded")
            ],
            "ErrorInstances": [
                i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="error")
            ],
            "ScalingStates": notify_mgr.ec2.get_all_scaling_states()
        }
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["EC2"] : %s' % e)
    xray_recorder.end_subsegment()

    xray_recorder.begin_subsegment("notifycall-build_metadata_targetgroup:%s" % f.__name__)
    try:
        notify_mgr.targetgroup.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["TargetGroups"] = notify_mgr.targetgroup.get_targetgroups_info()
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["TargetGroups"] : %s' % e)
    xray_recorder.end_subsegment()

    # Metadata can be large: gzip + base64 it before storing in DynamoDB.
    for key in ["Metadata"]:
        zipped_bytes = gzip.compress(
            bytes(json.dumps(record[key], default=str), "utf-8"))
        record[key] = str(base64.b64encode(zipped_bytes), "utf-8")

    now = misc.utc_now()
    now_seconds = misc.seconds_from_epoch_utc()
    max_longterm_records = Cfg.get_int("notify.event.longterm.max_records")
    if max_longterm_records <= 0:
        # Long-term recording disabled by configuration.
        need_longterm_record = False
    tables = [
        {
            "Name": ctx["EventTable"],
            "NeedWrite": need_shortterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.default_ttl"),
            "DBImages": False,
            "DebugReport": False
        },
        {
            "Name": ctx["LongTermEventTable"],
            "NeedWrite": need_longterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.longterm.ttl"),
            "DBImages": True,
            "DebugReport": True
        },
    ]
    xray_recorder.begin_subsegment("notifycall-update_tables:%s" % f.__name__)
    for table in tables:
        if not table["NeedWrite"]:
            continue
        UpdateExpression = "set EventSource=:entrypoint, EventType=:eventtype, InputData=:input, OutputData=:output, HandledException=:exception, "
        UpdateExpression += "Metadata=:metadata, ExpirationTime=:expirationtime"
        ExpressionAttributeValues = {
            ':entrypoint': {
                'S': ctx["FunctionName"]
            },
            ':eventtype': {
                'S': record["EventType"]
            },
            ':input': {
                'S': json.dumps(record["Input"], default=str)
            },
            ':output': {
                'S': json.dumps(record["Output"] if "Output" in record else {}, default=str)
            },
            ':exception': {
                'S': json.dumps(record["Except"] if "Except" in record else "", default=str)
            },
            ':metadata': {
                'S': json.dumps(record["Metadata"], default=str)
            },
            ':expirationtime': {
                'N': str(now_seconds + table["TTL"])
            }
        }
        if table["DBImages"]:
            # Insert snapshots of the CloudWatch dashboard
            try:
                log.log(log.NOTICE, "Generating snapshots for Dashboard graphs...")
                images = notify_mgr.cloudwatch.get_dashboard_images()
                for i in images:
                    compressed_name = i.replace(" ", "")
                    UpdateExpression += ", Graph_%s_PNG=:graph%s" % (
                        compressed_name, compressed_name)
                    ExpressionAttributeValues[":graph%s" % compressed_name] = {
                        'S': images[i]
                    }
                # FIX: escape the backslash ("/!\\") — a bare "\ " is an
                # invalid escape sequence and warns on modern Python.
                log.info(
                    "/!\\ Generated CloudWatch dashboard PNG snapshots in DynamoDb table '%s' for further event analysis!"
                    % table["Name"])
            except Exception as e:
                log.exception(
                    "Failed to retrieve CloudWatch snapshot images! : %s" % e)
        response = ctx["dynamodb.client"].update_item(
            Key={"EventDate": {
                'S': str(now)
            }},
            UpdateExpression=UpdateExpression,
            ExpressionAttributeValues=ExpressionAttributeValues,
            ReturnConsumedCapacity='TOTAL',
            TableName=table["Name"],
        )
        log.debug(Dbg.pprint(response))
        log.log(
            log.NOTICE, "Written event '[%s] %s' to table '%s'."
            % (str(now), record["EventType"], table["Name"]))

        # Keep under control the number of LongTerm items stored in DynamoDB table
        if need_longterm_record:
            longterm_item_eventdates = [
                m["_"] for m in notify_mgr.state.get_metastring_list(
                    "notify.longterm.itemlist", default=[])
            ]
            log.log(log.NOTICE,
                    "Guessed number of records in LongTerm Event table : %d",
                    len(longterm_item_eventdates))
            longterm_item_eventdates.append(str(now))
            # Delete the oldest records beyond the configured cap.
            nb_records_to_delete = max(
                len(longterm_item_eventdates) - max_longterm_records, 0)
            for eventdate in longterm_item_eventdates[:nb_records_to_delete]:
                try:
                    response = ctx["dynamodb.client"].delete_item(
                        Key={'EventDate': {
                            'S': eventdate
                        }},
                        TableName=ctx["LongTermEventTable"])
                    log.debug(response)
                    log.log(
                        log.NOTICE,
                        "Purged LongTerm Event record '%s' as too many are already stored (notify.event.longterm.max_records=%d)"
                        % (eventdate, max_longterm_records))
                except Exception as e:
                    # FIX: '%e' is a float conversion specifier; formatting an
                    # Exception with it raised TypeError and masked the error.
                    log.exception(
                        "Got exception while deleting LongTerm record '%s' : %s"
                        % (eventdate, e))
            notify_mgr.state.set_state(
                "notify.longterm.itemlist",
                ";".join(longterm_item_eventdates[nb_records_to_delete:]),
                TTL=Cfg.get_duration_secs("notify.event.longterm.ttl"))

        try:
            KVTable.persist_aggregates()
        except Exception as e:
            log.exception("Got exception while persisting KVTables : %s" % e)

        # Manage Debug report export to S3
        url = ctx["LoggingS3Path"]
        if url != "" and table["DebugReport"] and Cfg.get_int(
                "notify.debug.send_s3_reports"):
            xray_recorder.begin_subsegment(
                "notifycall-publish_all_reports:%s" % f.__name__)
            if ctx["FunctionName"] == "Interact":
                # Avoid recursion if throwing from InteractFunction
                log.info("Publishing Debug reports synchronously...")
                debug.publish_all_reports(ctx, url, "notifymgr_report")
            else:
                client = ctx["sqs.client"]
                log.info(
                    "Notifying Interact SQS Queue '%s' for asynchronous debug report generation..."
                    % ctx["InteractSQSUrl"])
                response = client.send_message(QueueUrl=ctx["InteractSQSUrl"],
                                               MessageBody=json.dumps({
                                                   "OpType": "Debug/PublishReportNow",
                                                   "Events": {
                                                       "Timestamp": str(ctx["now"])
                                                   }
                                               }))
                log.debug(response)
            xray_recorder.end_subsegment()
    xray_recorder.end_subsegment()

    if managed_exception is not None:
        raise managed_exception
    return r
def save_cached_data(self, data): d = misc.encode_json(data, compress=True) self.context["o_state"].set_state( "interact.precomputed", d, TTL=max(Cfg.get_duration_secs("app.run_period") * 2, 240))