def get_duration_secs(key, fmt=None):
    try:
        return misc.str2duration_seconds(get(key, fmt=fmt))
    except Exception as e:
        raise Exception("[ERROR] Failed to parse config key '%s' as a duration! : %s" % (key, e))
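# Usage sketch (illustrative key and values; assumes misc.str2duration_seconds() accepts
# the "unit=value" duration strings seen elsewhere in this code base, e.g. "minutes=1",
# "hours=2"):
#
#   Cfg.set("app.run_period", "seconds=20")
#   Cfg.get_duration_secs("app.run_period")   # -> 20
#
# A value that cannot be parsed (e.g. "twenty") raises the wrapped Exception above.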
def scheduler(self, context, event, response, cacheddata):
    m = re.search("scheduler/(.*)$", event["OpType"])
    if m is None:
        response["statusCode"] = 400
        response["body"] = "Missing config key path."
        return False
    config_key = m.group(1)
    if "httpMethod" in event and event["httpMethod"] == "POST":
        value = event["body"].partition('\n')[0]
        ttl = misc.str2duration_seconds(event.get("ttl"), no_exception=True, default=None)
        log.info(f"Scheduler configuration write for key '{config_key}' = '{value}' (ttl={ttl}).")
        kvtable.KVTable.set_kv_direct(config_key, value, self.context["SchedulerTable"],
                                      context=self.context, TTL=ttl)
        response["statusCode"] = 200
        response["body"] = value
    else:
        value = kvtable.KVTable.get_kv_direct(config_key, self.context["SchedulerTable"],
                                              context=self.context)
        if value is None:
            response["statusCode"] = 400
            response["body"] = "Unknown configuration key!"
            return False
        else:
            response["statusCode"] = 200
            response["body"] = value
    return True
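# Illustrative client usage (hypothetical API Gateway URL, auth and value format; only the
# 'scheduler/<key>' OpType shape is taken from the regex above). The first line of the POST
# body becomes the stored value; an optional 'ttl' parameter in misc.str2duration_seconds()
# format expires the key:
#
#   import requests
#   url = "https://<api-id>.execute-api.<region>.amazonaws.com/v1/scheduler/my-rule"
#   requests.post(url, params={"ttl": "hours=1"},
#                 data="cron(0 6 * * ? *),ec2.schedule.min_instance_count=3")
#   print(requests.get(url).text)   # -> the value just written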
def manage_rule_event(self, event):
    if Cfg.get_int("cron.disable"):
        return
    if "source" in event and event["source"] == "aws.events" and event["detail-type"] == "Scheduled Event":
        # Triggered by an AWS CloudWatch Scheduled event. We look for a ParameterSet
        # request based on the ARN.
        misc.initialize_clients(["events"], self.context)
        misc.load_prerequisites(self.context, ["o_scheduler"])
        for r in event["resources"]:
            log.debug("Processing Scheduled event '%s'..." % r)
            m = re.search("^arn:aws:events:[a-z0-9-]+:[0-9]+:rule/CS-Cron-%s-(.*)" % self.context["GroupName"], r)
            if m is not None and len(m.groups()) == 1:
                rule_num = m.group(1)
                log.info("Got event rule '%s'" % rule_num)
                self.load_event_definitions()
                rule_def = self.get_ruledef_by_name("CS-Cron-%s-%s" % (self.context["GroupName"], rule_num))
                log.debug(rule_def)
                ttl = None
                try:
                    ttl = misc.str2duration_seconds(rule_def["TTL"]) if rule_def is not None and "TTL" in rule_def else None
                except Exception as e:
                    log.exception("[WARNING] Failed to parse 'TTL' value '%s'! : %s" % (rule_def["TTL"], e))
                params = dict(rule_def["Data"][0])
                for k in params:
                    if k in ["TTL", "schedule"]:
                        continue
                    Cfg.set(k, params[k], ttl=ttl)
                return True
    return False
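# Shape of the payload this handler matches (field names are the standard CloudWatch
# "Scheduled Event" schema; account, region and rule name are illustrative):
#
#   event = {
#       "source": "aws.events",
#       "detail-type": "Scheduled Event",
#       "resources": ["arn:aws:events:eu-west-1:123456789012:rule/CS-Cron-mygroup-rule0"]
#   }
#
# Every key of the matched rule definition's Data[0] dict, except 'TTL' and 'schedule',
# is then written to the configuration with the parsed TTL.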
def configuration(self, context, event, response, cacheddata):
    m = re.search("configuration/(.*)$", event["OpType"])
    if m is None:
        response["statusCode"] = 400
        response["body"] = "Missing config key path."
        return False
    config_key = m.group(1)
    with_maintenance_window = "with_maintenance_window" in event and event["with_maintenance_window"].lower() == "true"
    if with_maintenance_window:
        # Load the EC2 and SSM modules to inherit their override parameters if an SSM Maintenance Window is active.
        misc.load_prerequisites(self.context, ["o_ec2", "o_ssm"])
    if "httpMethod" in event and event["httpMethod"] == "POST":
        value = event["body"].partition('\n')[0]
        log.info("TTL=%s" % event.get("ttl"))
        ttl = misc.str2duration_seconds(event.get("ttl"), no_exception=True, default=None)
        log.info(f"Configuration write for key '{config_key}' = '{value}' (ttl={ttl}).")
        kvtable.KVTable.set_kv_direct(config_key, value, self.context["ConfigurationTable"],
                                      context=self.context, TTL=ttl)
        response["statusCode"] = 200
        response["body"] = value
    else:
        value = Cfg.get(config_key, none_on_failure=True)
        if value is None:
            response["statusCode"] = 400
            response["body"] = "Unknown configuration key '%s'!" % config_key
            return False
        else:
            response["statusCode"] = 200
            response["body"] = value
    return True
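# Illustrative request shapes (hypothetical endpoint; same conventions as the scheduler
# handler above, plus the 'with_maintenance_window' flag):
#
#   GET  .../configuration/ec2.schedule.min_instance_count
#        -> current value, or HTTP 400 "Unknown configuration key ..." if unset
#   GET  .../configuration/ec2.schedule.min_instance_count?with_maintenance_window=true
#        -> value after EC2/SSM maintenance-window overrides are applied
#   POST .../configuration/ec2.schedule.min_instance_count?ttl=hours=2   (body: "3")
#        -> writes "3" with a 2-hour TTL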
parser.add_argument('loadbalancer_url', help="LoadBalancer URL", type=str, nargs=1)
parser.add_argument('--period', help="Duration", type=str, default="hours=2")
parser.add_argument('--max-concurrency', help="Connection concurrency to load balancer", type=int, default=30)
args = parser.parse_args()
args_dict = {}
for a in args._get_kwargs():
    args_dict[a[0]] = a[1]

period = misc.str2duration_seconds(args.period)
time_offset = misc.seconds_from_epoch_utc()
max_concurrency = args.max_concurrency
while True:
    now = misc.seconds_from_epoch_utc()
    seconds = now - time_offset
    concurrency = 1 + int((max_concurrency - 1) *
                          ((1 - math.cos(2 * math.pi * (seconds % period) / period)) / 2.0))
    cmd = "ab -c %(concurrency)s -n %(concurrency)s %(loadbalancer_url)s" % {
        "concurrency": concurrency,
        "loadbalancer_url": args.loadbalancer_url[0]
    }
    print(cmd)
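# The cosine term sweeps concurrency smoothly between 1 and max_concurrency over one
# 'period'. With the defaults (max_concurrency=30, period=hours=2), for example:
#
#   seconds % period = 0        -> cos =  1 -> concurrency = 1            (trough)
#   seconds % period = period/4 -> cos =  0 -> concurrency = 1 + 14 = 15  (midpoint)
#   seconds % period = period/2 -> cos = -1 -> concurrency = 30           (peak)
#
# i.e. concurrency = 1 + (max_concurrency - 1) * (1 - cos(2*pi*t/period)) / 2, generating
# one 'ab' (Apache Bench) command per iteration against the load balancer.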
def get_prerequisites(self):
    now = self.context["now"]
    client = self.context["cloudwatch.client"]

    # Read all CloudWatch alarm templates into memory.
    alarm_definitions = {}
    for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
        key = "cloudwatch.alarm%02d.configuration_url" % (i)
        r = Cfg.get_extended(key)
        if not r["Success"] or r["Value"] == "":
            continue
        d = misc.parse_line_as_list_of_dict(r["Value"])
        url = d[0]["_"]
        meta = d[0]
        index = "%02d" % i
        alarm_defs = {
            "Index": index,
            "Key": key,
            "Url": url,
            "Definition": r,
            "Metadata": meta
        }
        prefix = "alarmname:"
        if url.startswith(prefix):
            alarm_defs["AlarmName"] = url[len(prefix):]
        elif url.startswith("ignore:"):
            continue  # This entry needs to be ignored.
        else:
            log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
            try:
                resp = misc.get_url(url.format(**self.context))
                if resp is None:
                    raise Exception("URL content = <None>")
                alarm_defs["Content"] = str(resp, "utf-8")
            except Exception as e:
                log.exception("Failed to load Alarm definition '%s' : %s" % (r["Value"], e))
                continue
        alarm_definitions[index] = alarm_defs
    self.alarm_definitions = alarm_definitions

    # Read all existing CloudWatch alarms.
    alarms = []
    paginator = client.get_paginator('describe_alarms')
    response_iterator = paginator.paginate(MaxRecords=Cfg.get_int("cloudwatch.describe_alarms.max_results"))
    for response in response_iterator:
        #log.debug(Dbg.pprint(response))
        for alarm in response["MetricAlarms"]:
            alarm_name = alarm["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is not None:
                # This is an alarm that belongs to this CloneSquad instance.
                alarms.append(alarm)
    #log.debug(Dbg.pprint(alarms))
    self.alarms = alarms

    # Sanity check.
    for index in self.alarm_definitions.keys():
        alarm_def = self.alarm_definitions[index]
        if "AlarmName" not in alarm_def:
            continue
        alarm = next(filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"], self.alarms), None)
        if alarm is None:
            log.warning("Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!" %
                        (alarm_def["Definition"]["Key"], alarm_def["Definition"]["Value"],
                         alarm_def["Definition"]["Status"]))

    # Read all metrics associated with alarms.
    # Intense CloudWatch polling can be expensive: this algorithm links the CW metric polling rate to the
    # scale rate => under intense scale-up conditions, polling is aggressive. If not, it falls back
    # to one poll every 'cloudwatch.metrics.low_rate_polling_interval' seconds.
    # TODO(@jcjorel): Avoid this kind of direct reference to an upper-level module!!
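    # For reference, a 'cloudwatch.alarmXX.configuration_url' entry parsed above is a
    # comma-separated line whose first (anonymous, '_') field is the URL. Illustrative
    # values only; the supported URL schemes depend on misc.get_url():
    #
    #   internal:ec2.scaleup.alarm-cpu-gt-75pc.yaml,Points=1000,BaselineThreshold=30.0
    #   alarmname:MyPreExistingUserAlarm
    #   ignore:
    #
    # 'alarmname:' entries reference a pre-existing user alarm; 'ignore:' disables the slot.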
    integration_period = Cfg.get_duration_secs("ec2.schedule.horizontalscale.integration_period")
    instance_scale_score = self.ec2.get_integrated_float_state("ec2.schedule.scaleout.instance_scale_score",
                                                               integration_period)

    self.metric_cache = self.get_metric_cache()

    query = {"IdMapping": {}, "Queries": []}

    # Build the query for Alarm metrics.
    if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
        # Sort by oldest alarms first in cache.
        cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
        valid_alarms = []
        for a in alarms:
            alarm_name = a["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is None or alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
                continue
            a["_SamplingTime"] = (self.get_metric_by_id(alarm_name)["_SamplingTime"]
                                  if alarm_name in cached_metric_names else str(misc.epoch()))
            valid_alarms.append(a)
        sorted_alarms = sorted(valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

        # We poll from the oldest to the newest, throttled by instance_scale_score, to limit
        # CloudWatch GetMetricData costs.
        time_for_full_metric_refresh = max(Cfg.get_duration_secs("cloudwatch.metrics.time_for_full_metric_refresh"), 1)
        app_run_period = Cfg.get_duration_secs("app.run_period")
        minimum_polled_alarms_per_run = Cfg.get_int("cloudwatch.metrics.minimum_polled_alarms_per_run")
        maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
        maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run, 1.0)
        weight = min(instance_scale_score, maximum_polled_alarms_per_run)
        max_alarms_for_this_run = max(minimum_polled_alarms_per_run, int(min(weight, 1.0) * len(sorted_alarms)))
        for alarm in sorted_alarms[:max_alarms_for_this_run]:
            alarm_name = alarm["AlarmName"]
            CloudWatch._format_query(query, alarm_name, alarm)

    # We always poll user-supplied alarms.
    for alarm in alarms:
        alarm_name = alarm["AlarmName"]
        alarm_def = self.get_alarm_configuration_by_name(alarm_name)
        if alarm_def is None:
            continue  # Unknown alarm name.
        if not alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
            continue
        CloudWatch._format_query(query, alarm_name, alarm)

    max_retention_period = Cfg.get_duration_secs("cloudwatch.metrics.cache.max_retention_period")

    # Query metrics for burstable instances.
    instance_minimum_age_for_cpu_credit_polling = Cfg.get_duration_secs(
        "cloudwatch.metrics.instance_minimum_age_for_cpu_credit_polling")
    burstable_instances = self.ec2.get_burstable_instances(State="running", ScalingState="-error")
    cpu_credit_polling = 0
    for i in burstable_instances:
        instance_id = i["InstanceId"]
        if (now - i["LaunchTime"]).total_seconds() < instance_minimum_age_for_cpu_credit_polling:
            continue
        cached_metric = self.get_metric_by_id(f"CPUCreditBalance/{instance_id}")
        if cached_metric is not None:
            # Note: Polling the CPU Credit Balance is a bit tricky as this API takes a long time
            # to update and sometimes fails to send back results. So we need to try multiple times...
            if ("_LastSamplingAttempt" in cached_metric and
                    (now - misc.str2utc(cached_metric["_LastSamplingAttempt"])).total_seconds()
                    < misc.str2duration_seconds("minutes=1")):
                continue  # We do not want to poll more than once per minute.
            if (now - misc.str2utc(cached_metric["_SamplingTime"])).total_seconds() < max_retention_period * 0.8:
                continue  # Current data point is not yet expired; keep it.
            cached_metric["_LastSamplingAttempt"] = now
        cpu_credit_polling += 1
        CloudWatch._format_query(query, "%s/%s" % ("CPUCreditBalance", instance_id), {
            "MetricName": "CPUCreditBalance",
            "Namespace": "AWS/EC2",
            "Dimensions": [{
                "Name": "InstanceId",
                "Value": instance_id
            }],
            "Period": 300,
            "Statistic": "Average"
        })
    log.log(log.NOTICE, f"Will poll {cpu_credit_polling} instances for CPU Credit balance.")

    # Make the request to CloudWatch.
    query_counter = self.ec2.get_state_int("cloudwatch.metric.query_counter", default=0)
    queries = query["Queries"]
    metric_results = []
    metric_ids = []
    no_metric_ids = []
    while len(queries) > 0:
        # GetMetricData accepts at most 500 queries per call.
        q = queries[:500]
        queries = queries[500:]
        results = []
        args = {
            "MetricDataQueries": q,
            "StartTime": now - timedelta(seconds=Cfg.get_duration_secs("cloudwatch.metrics.data_period")),
            "EndTime": now
        }
        paginator = client.get_paginator('get_metric_data')
        response_iterator = paginator.paginate(**args)
        for response in response_iterator:
            results.extend(response["MetricDataResults"])
        query_counter += len(q)

        for r in results:
            if r["StatusCode"] != "Complete":
                log.error(f"Failed to retrieve metrics: {q}")
                continue
            metric_id = query["IdMapping"][r["Id"]]
            if len(r["Timestamps"]) == 0:
                if metric_id not in no_metric_ids:
                    no_metric_ids.append(metric_id)
                continue
            if metric_id not in metric_ids:
                metric_ids.append(metric_id)
            r["_MetricId"] = metric_id
            r["_SamplingTime"] = str(now)
            log.debug(r)
            metric_results.append(r)
    if len(no_metric_ids):
        log.info(f"No metrics returned for alarms '{no_metric_ids}'")

    # Merge with the existing metric cache.
    metric_cache = self.metric_cache
    self.metric_cache = metric_results
    for m in metric_cache:
        if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
            continue
        if (now - misc.str2utc(m["_SamplingTime"])).total_seconds() < max_retention_period:
            self.metric_cache.append(m)
    self.ec2.set_state("cloudwatch.metric.query_counter", query_counter,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.ec2.set_state_json("cloudwatch.metrics.cache", self.metric_cache,
                            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.set_metric("Cloudwatch.GetMetricData", query_counter)

    # Augment Alarm definitions and Instances with associated metrics.
    for metric in self.metric_cache:
        metric_id = metric["_MetricId"]
        alarm_data = self.get_alarm_data_by_name(metric_id)
        if alarm_data is not None:
            alarm_data["MetricDetails"] = metric
            continue
        instance = next(filter(lambda i: "CPUCreditBalance/%s" % i["InstanceId"] == metric_id,
                               burstable_instances), None)
        if instance is not None:
            instance["_Metrics"] = {}
            instance["_Metrics"]["CPUCreditBalance"] = metric
            continue
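# Worked example of the adaptive alarm-polling budget computed in get_prerequisites()
# (all numbers hypothetical): with app.run_period=20s, time_for_full_metric_refresh=300s,
# 50 eligible alarms and minimum_polled_alarms_per_run=1:
#
#   maximum_polled_alarms_per_run = min(20 / 300, 1.0) ~ 0.067
#   idle (instance_scale_score=0.0):   weight = 0.0   -> max(1, int(0.0   * 50)) = 1 alarm/run
#   scale-out (score >= 0.067):        weight = 0.067 -> max(1, int(0.067 * 50)) = 3 alarms/run
#
# Polling thus degrades gracefully when idle and tops out at one full refresh every
# time_for_full_metric_refresh seconds, oldest samples first.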