def get_duration_secs(key, fmt=None):
    """Return the value of config key 'key' parsed as a duration in seconds.

    Args:
        key: configuration key to look up.
        fmt: optional format dict forwarded to get() for key expansion.

    Raises:
        Exception: when the key is missing or its value cannot be parsed as
            a duration. The original error is chained as __cause__ so the
            root cause remains visible in tracebacks.
    """
    try:
        return misc.str2duration_seconds(get(key, fmt=fmt))
    except Exception as e:
        # Chain the underlying exception instead of discarding it.
        raise Exception(
            "[ERROR] Failed to parse config key '%s' as a duration! : %s" %
            (key, e)) from e
示例#2
0
 def scheduler(self, context, event, response, cacheddata):
     """Read or write one key of the scheduler table (API entry point).

     A POST stores the first line of the request body under the key
     (honoring an optional 'ttl' duration); any other method reads the
     key back. 'response' is filled in place; returns True on success,
     False otherwise.
     """
     match = re.search("scheduler/(.*)$", event["OpType"])
     if match is None:
         response["statusCode"] = 400
         response["body"] = "Missing config key path."
         return False
     config_key = match.group(1)

     if event.get("httpMethod") == "POST":
         # Only the first line of the body is meaningful as a value.
         value = event["body"].partition('\n')[0]
         ttl = misc.str2duration_seconds(event.get("ttl"),
                                         no_exception=True,
                                         default=None)
         log.info(
             f"Scheduler configuration write for key '{config_key}' = '{value}' (ttl={ttl})."
         )
         kvtable.KVTable.set_kv_direct(config_key,
                                       value,
                                       self.context["SchedulerTable"],
                                       context=self.context,
                                       TTL=ttl)
     else:
         value = kvtable.KVTable.get_kv_direct(
             config_key,
             self.context["SchedulerTable"],
             context=self.context)
         if value is None:
             response["statusCode"] = 400
             response["body"] = "Unknown configuration key!"
             return False
     # Shared success epilogue for both the read and write paths.
     response["statusCode"] = 200
     response["body"] = value
     return True
    def manage_rule_event(self, event):
        """Process an AWS CloudWatch 'Scheduled Event' rule notification.

        When the event matches one of the CS-Cron-<GroupName>-* rules, the
        ParameterSet attached to the rule definition is applied: every key
        of Data[0] except the 'TTL' and 'schedule' meta keys is written to
        the configuration (with an optional TTL).

        Returns True when the event was a Scheduled Event, False otherwise
        (and None when cron management is disabled via 'cron.disable').
        """
        if Cfg.get_int("cron.disable"):
            return
        if "source" in event and event["source"] == "aws.events" and event[
                "detail-type"] == "Scheduled Event":
            # Triggered by an AWS CloudWatch Scheduled event. We look for a ParameterSet
            #   request based on the ARN
            misc.initialize_clients(["events"], self.context)
            misc.load_prerequisites(self.context, ["o_scheduler"])
            for r in event["resources"]:
                log.debug("Processing Scheduled event '%s'..." % r)
                m = re.search(
                    "^arn:aws:events:[a-z-0-9]+:[0-9]+:rule/CS-Cron-%s-(.*)" %
                    self.context["GroupName"], r)
                if m is not None and len(m.groups()) == 1:
                    rule_num = m.group(1)
                    log.info("Got event rule '%s'" % rule_num)
                    self.load_event_definitions()
                    rule_def = self.get_ruledef_by_name(
                        "CS-Cron-%s-%s" %
                        (self.context["GroupName"], rule_num))
                    log.debug(rule_def)

                    ttl = None
                    try:
                        ttl = misc.str2duration_seconds(
                            rule_def["TTL"]
                        ) if rule_def is not None and "TTL" in rule_def else None
                    except Exception as e:
                        # Bug fix: this previously referenced an undefined
                        # name 'TTL', turning the warning log itself into a
                        # NameError. Log the offending rule value instead.
                        log.exception(
                            "[WARNING] Failed to read 'TTL' value '%s'!" %
                            (rule_def["TTL"]))

                    params = dict(rule_def["Data"][0])
                    for k in params:
                        # 'TTL' and 'schedule' are meta keys, not config values.
                        if k in ["TTL", "schedule"]: continue
                        Cfg.set(k, params[k], ttl=ttl)
            return True
        return False
示例#4
0
 def configuration(self, context, event, response, cacheddata):
     """Read or write one key of the configuration table (API entry point).

     A POST stores the first line of the request body under the key
     (honoring an optional 'ttl' duration); any other method reads the
     key back, optionally inheriting SSM Maintenance Window overrides
     when 'with_maintenance_window' is "true". 'response' is filled in
     place; returns True on success, False otherwise.
     """
     m = re.search("configuration/(.*)$", event["OpType"])
     if m is None:
         response["statusCode"] = 400
         response["body"] = "Missing config key path."
         return False
     config_key = m.group(1)
     with_maintenance_window = "with_maintenance_window" in event and event[
         "with_maintenance_window"].lower() == "true"
     if with_maintenance_window:
         # We load the EC2 and SSM modules to inherit their override parameters if a SSM Maintenance Window is active
         misc.load_prerequisites(self.context, ["o_ec2", "o_ssm"])
     if "httpMethod" in event and event["httpMethod"] == "POST":
         value = event["body"].partition('\n')[0]
         # Bug fix: this was 'f"TTL=%s" % ...' — a stray f-string prefix
         # mixed with %-formatting; plain %-formatting is what is intended.
         log.info("TTL=%s" % event.get("ttl"))
         ttl = misc.str2duration_seconds(event.get("ttl"),
                                         no_exception=True,
                                         default=None)
         log.info(
             f"Configuration write for key '{config_key}' = '{value}' (ttl={ttl})."
         )
         kvtable.KVTable.set_kv_direct(config_key,
                                       value,
                                       self.context["ConfigurationTable"],
                                       context=self.context,
                                       TTL=ttl)
         response["statusCode"] = 200
         response["body"] = value
     else:
         value = Cfg.get(config_key, none_on_failure=True)
         if value is None:
             response["statusCode"] = 400
             response[
                 "body"] = "Unknown configuration key '%s'!" % config_key
             return False
         else:
             response["statusCode"] = 200
             response["body"] = value
     return True
示例#5
0
# CLI arguments: the target URL plus the knobs shaping the load curve.
# NOTE(review): 'parser' is created above this chunk — presumably an
# argparse.ArgumentParser; confirm against the full file.
parser.add_argument('loadbalancer_url',
                    help="LoadBalancer URL",
                    type=str,
                    nargs=1)
parser.add_argument('--period', help="Duration", type=str, default="hours=2")
parser.add_argument('--max-concurrency',
                    help="Connection concurrency to load balancer",
                    type=int,
                    default=30)

args = parser.parse_args()
# Flatten the argparse namespace into a plain {name: value} dict.
args_dict = {}
for a in args._get_kwargs():
    args_dict[a[0]] = a[1]

# Period of the load curve converted to seconds (e.g. "hours=2").
period = misc.str2duration_seconds(args.period)
# Reference timestamp so 'seconds' below counts from script start.
time_offset = misc.seconds_from_epoch_utc()
max_concurrency = args.max_concurrency

# Endless loop: print one 'ab' (ApacheBench) command per iteration with a
# concurrency following a cosine curve between 1 and max_concurrency.
while True:
    now = misc.seconds_from_epoch_utc()
    seconds = now - time_offset
    # (1 - cos(2*pi*t/period))/2 sweeps 0 -> 1 -> 0 over one period;
    # scaled and offset into the range [1, max_concurrency].
    concurrency = 1 + int(
        (max_concurrency - 1) *
        ((1 - math.cos(2 * math.pi * (seconds % period) / period)) / 2.0))

    cmd = "ab -c %(concurrency)s -n %(concurrency)s %(loadbalancer_url)s" % {
        "concurrency": concurrency,
        "loadbalancer_url": args.loadbalancer_url[0]
    }
    print(cmd)
示例#6
0
    def get_prerequisites(self):
        now = self.context["now"]
        client = self.context["cloudwatch.client"]

        # Read all CloudWatch alarm templates into memory
        alarm_definitions = {}
        for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
            key = "cloudwatch.alarm%02d.configuration_url" % (i)
            r = Cfg.get_extended(key)
            if not r["Success"] or r["Value"] == "":
                continue

            d = misc.parse_line_as_list_of_dict(r["Value"])
            url = d[0]["_"]
            meta = d[0]

            index = "%02d" % i
            alarm_defs = {
                "Index": index,
                "Key": key,
                "Url": url,
                "Definition": r,
                "Metadata": meta
            }

            prefix = "alarmname:"
            if url.startswith(prefix):
                alarm_defs["AlarmName"] = url[len(prefix):]
            elif url.startswith("ignore:"):
                continue  # This entry need to be ignored.
            else:
                log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
                try:
                    resp = misc.get_url(url.format(**self.context))
                    if resp is None:
                        raise Exception("URL content = <None>")
                    alarm_defs["Content"] = str(resp, "utf-8")
                except Exception as e:
                    log.exception(
                        f"Failed to load Alarm definition '%s' : {e}" %
                        r["Value"])
                    continue
            alarm_definitions[index] = alarm_defs

        self.alarm_definitions = alarm_definitions

        # Read all existing CloudWatch alarms
        alarms = []
        paginator = client.get_paginator('describe_alarms')
        response_iterator = paginator.paginate(
            MaxRecords=Cfg.get_int("cloudwatch.describe_alarms.max_results"))
        for response in response_iterator:
            #log.debug(Dbg.pprint(response))
            for alarm in response["MetricAlarms"]:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is not None:
                    # This is an alarm thats belong to this CloneSquad instance
                    alarms.append(alarm)
        #log.debug(Dbg.pprint(alarms))
        self.alarms = alarms

        # Sanity check
        for index in self.alarm_definitions.keys():
            alarm_def = self.alarm_definitions[index]
            if "AlarmName" not in alarm_def:
                continue
            alarm = next(
                filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"],
                       self.alarms), None)
            if alarm is None:
                log.warning(
                    "Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!"
                    % (alarm_def["Definition"]["Key"],
                       alarm_def["Definition"]["Value"],
                       alarm_def["Definition"]["Status"]))

        # Read all metrics associated with alarms

        # CloudWatch intense polling can be expensive: This algorithm links CW metric polling rate to the
        #    scale rate => Under intense scale up condition, polling is aggresive. If not, it falls down
        #    to one polling every 'cloudwatch.metrics.low_rate_polling_interval' seconds
        # TODO(@jcjorel): Avoid this kind of direct references to an upper level module!!
        integration_period = Cfg.get_duration_secs(
            "ec2.schedule.horizontalscale.integration_period")
        instance_scale_score = self.ec2.get_integrated_float_state(
            "ec2.schedule.scaleout.instance_scale_score", integration_period)

        self.metric_cache = self.get_metric_cache()

        query = {"IdMapping": {}, "Queries": []}

        # Build query for Alarm metrics
        if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
            # Sort by oldest alarms first in cache
            cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
            valid_alarms = []
            for a in alarms:
                alarm_name = a["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None or alarm_def["AlarmDefinition"][
                        "Url"].startswith("alarmname:"):
                    continue
                a["_SamplingTime"] = self.get_metric_by_id(
                    alarm_name
                )["_SamplingTime"] if alarm_name in cached_metric_names else str(
                    misc.epoch())
                valid_alarms.append(a)
            sorted_alarms = sorted(
                valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

            # We poll from the oldest to the newest and depending on the instance_scale_score to limit CloudWacth GetMetricData costs
            time_for_full_metric_refresh = max(
                Cfg.get_duration_secs(
                    "cloudwatch.metrics.time_for_full_metric_refresh"), 1)
            app_run_period = Cfg.get_duration_secs("app.run_period")
            minimum_polled_alarms_per_run = Cfg.get_int(
                "cloudwatch.metrics.minimum_polled_alarms_per_run")
            maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
            maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run,
                                                1.0)
            weight = min(instance_scale_score, maximum_polled_alarms_per_run)
            max_alarms_for_this_run = max(
                minimum_polled_alarms_per_run,
                int(min(weight, 1.0) * len(sorted_alarms)))
            for alarm in sorted_alarms[:max_alarms_for_this_run]:
                alarm_name = alarm["AlarmName"]
                CloudWatch._format_query(query, alarm_name, alarm)

            # We always poll user supplied alarms
            for alarm in alarms:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None:
                    continue  # Unknown alarm name
                if not alarm_def["AlarmDefinition"]["Url"].startswith(
                        "alarmname:"):
                    continue
                CloudWatch._format_query(query, alarm_name, alarm)

        max_retention_period = Cfg.get_duration_secs(
            "cloudwatch.metrics.cache.max_retention_period")

        # Query Metric for Burstable instances
        instance_minimum_age_for_cpu_credit_polling = Cfg.get_duration_secs(
            "cloudwatch.metrics.instance_minimum_age_for_cpu_credit_polling")
        burstable_instances = self.ec2.get_burstable_instances(
            State="running", ScalingState="-error")
        cpu_credit_polling = 0
        for i in burstable_instances:
            instance_id = i["InstanceId"]
            if (now - i["LaunchTime"]).total_seconds(
            ) < instance_minimum_age_for_cpu_credit_polling:
                continue
            cached_metric = self.get_metric_by_id(
                f"CPUCreditBalance/{instance_id}")
            if cached_metric is not None:
                # Note: Polling of CPU Credit Balance is a bit tricky as this API takes a lot of time to update and sometime
                #   do send back results from time to time. So we need to try multiple times...
                if ("_LastSamplingAttempt" in cached_metric and
                    (now - misc.str2utc(cached_metric["_LastSamplingAttempt"])
                     ).total_seconds() <
                        misc.str2duration_seconds("minutes=1")):
                    continue  # We do not want to poll more than one per minute
                if (now - misc.str2utc(cached_metric["_SamplingTime"])
                    ).total_seconds() < max_retention_period * 0.8:
                    # Current data point is not yet expired. Keep of this attempt
                    continue
                cached_metric["_LastSamplingAttempt"] = now
            cpu_credit_polling += 1
            CloudWatch._format_query(
                query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                    "MetricName": "CPUCreditBalance",
                    "Namespace": "AWS/EC2",
                    "Dimensions": [{
                        "Name": "InstanceId",
                        "Value": instance_id
                    }],
                    "Period": 300,
                    "Statistic": "Average"
                })
        log.log(
            log.NOTICE,
            f"Will poll {cpu_credit_polling} instances for CPU Credit balance."
        )

        # Make request to CloudWatch
        query_counter = self.ec2.get_state_int(
            "cloudwatch.metric.query_counter", default=0)
        queries = query["Queries"]
        metric_results = []
        metric_ids = []
        no_metric_ids = []
        while len(queries) > 0:
            q = queries[:500]
            queries = queries[500:]
            results = []
            args = {
                "MetricDataQueries":
                q,
                "StartTime":
                now - timedelta(seconds=Cfg.get_duration_secs(
                    "cloudwatch.metrics.data_period")),
                "EndTime":
                now
            }
            paginator = client.get_paginator('get_metric_data')
            response_iterator = paginator.paginate(**args)
            for response in response_iterator:
                results.extend(response["MetricDataResults"])
                query_counter += len(q)

            for r in results:
                if r["StatusCode"] != "Complete":
                    log.error(f"Failed to retrieve metrics: {q}")
                    continue
                metric_id = query["IdMapping"][r["Id"]]
                if len(r["Timestamps"]) == 0:
                    if metric_id not in no_metric_ids:
                        no_metric_ids.append(metric_id)
                    continue
                if metric_id not in metric_ids: metric_ids.append(metric_id)
                r["_MetricId"] = metric_id
                r["_SamplingTime"] = str(now)
                log.debug(r)
                metric_results.append(r)
        if len(no_metric_ids):
            log.info(f"No metrics returned for alarm '{no_metric_ids}'")

        # Merge with existing cache metric
        metric_cache = self.metric_cache
        self.metric_cache = metric_results
        for m in metric_cache:
            if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
                continue
            if (now - misc.str2utc(m["_SamplingTime"])
                ).total_seconds() < max_retention_period:
                self.metric_cache.append(m)

        self.ec2.set_state("cloudwatch.metric.query_counter",
                           query_counter,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.ec2.set_state_json(
            "cloudwatch.metrics.cache",
            self.metric_cache,
            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.set_metric("Cloudwatch.GetMetricData", query_counter)

        # Augment Alarm definitions and Instances with associated metrics
        for metric in self.metric_cache:
            metric_id = metric["_MetricId"]

            alarm_data = self.get_alarm_data_by_name(metric_id)
            if alarm_data is not None:
                alarm_data["MetricDetails"] = metric
                continue

            instance = next(
                filter(
                    lambda i: "CPUCreditBalance/%s" % i["InstanceId"] ==
                    metric_id, burstable_instances), None)
            if instance is not None:
                instance["_Metrics"] = {}
                instance["_Metrics"]["CPUCreditBalance"] = metric
                continue