def __init__(self, context, ec2, cloudwatch):
    """Remember collaborator objects and declare the cron configuration keys."""
    # Collaborators injected by the application bootstrap.
    self.context = context
    self.ec2 = ec2
    self.cloudwatch = cloudwatch
    # Scheduling state, populated lazily by get_prerequisites()/later calls.
    self.scheduler_table = None
    self.event_names = []
    self.rules = []
    Cfg.register({
        "cron.max_rules_per_batch": "10",
        "cron.disable": "0",
    })
def __init__(self, context):
    """Initialize the state manager; DynamoDB table handles are resolved lazily."""
    self.context = context
    # Lazily-bound table handle and registered aggregate/resource lists.
    self.table = None
    self.table_aggregates = []
    self.clonesquad_resources = []
    # ignore_double_definition: this module may be initialized more than once.
    Cfg.register(
        {
            "statemanager.cache.max_age": "minutes=5",
        },
        ignore_double_definition=True)
def __init__(self, context=None, ec2=None, cloudwatch=None):
    """Remember collaborators and declare scheduler/cron configuration keys.

    All collaborators default to None so the object can be built in contexts
    (e.g. tests) where they are not needed.
    """
    self.context = context
    self.ec2 = ec2
    self.cloudwatch = cloudwatch
    # Scheduling state, filled in lazily.
    self.scheduler_table = None
    self.event_names = []
    self.rules = []
    self.local_now = None
    Cfg.register({
        "cron.max_rules_per_batch": "10",
        "scheduler.cache.max_age": "seconds=60",
        "cron.disable": "0",
    })
def get_prerequisites(self):
    """Discover TransferFamily servers and declare their metrics/config keys.

    Does nothing when the state manager tracks no 'transfer' resources.
    """
    if "transfer" not in self.context["o_state"].get_resource_services():
        return
    self.resources = self.o_state.get_resources(service="transfer")
    self.servers = []
    client = self.context["transfer.client"]
    pages = client.get_paginator('list_servers').paginate()
    # Flatten the paginated 'Servers' lists into a single list.
    self.servers = [srv for page in pages for srv in page['Servers']]

    metric_time_resolution = Cfg.get_int(
        "transferfamily.metrics.time_resolution")
    # CloudWatch supports only 1s (high) or >=60s (standard) resolutions.
    if metric_time_resolution < 60:
        metric_time_resolution = 1  # Switch to highest resolution
    self.cloudwatch.register_metric([
        {"MetricName": "Subfleet.TransferFamily.Size",
         "Unit": "Count",
         "StorageResolution": metric_time_resolution},
        {"MetricName": "Subfleet.TransferFamily.RunningServers",
         "Unit": "Count",
         "StorageResolution": metric_time_resolution},
    ])

    # Register dynamically subfleet configuration keys to avoid a
    # 'key unknown' warning when the user sets them.
    subfleet_names = self.get_subfleet_names()
    for subfleet in subfleet_names:
        key = "subfleet.%s.state" % subfleet
        if not Cfg.is_builtin_key_exist(key):
            Cfg.register({key: ""})
    log.log(log.NOTICE, "Detected TransferFamily subfleets '%s'." % subfleet_names)
def __init__(self, context, state, ec2, targetgroup, cloudwatch):
    """Initialize the notification manager.

    Side effects: resets the module-level `do_not_notify` flag and publishes
    this instance through the module-level `notify_mgr` global so other code
    in this module can reach it.
    """
    global do_not_notify
    do_not_notify = False
    self.context = context
    self.ec2 = ec2
    self.targetgroup = targetgroup
    self.cloudwatch = cloudwatch
    self.state = state
    # Event table name; resolved later (lazy).
    self.table_name = None
    Cfg.register({
        "notify.event.default_ttl": "minutes=5",
        "notify.event.longterm.max_records,Stable": {
            "DefaultValue": 50,
            "Format": "Integer",
            "Description": """Maximum records to hold in the Event-LongTerm DynamodDB table

Setting this value to 0, disable logging to the LongTerm event table.
"""
        },
        "notify.event.longterm.ttl,Stable": {
            "DefaultValue": "days=5",
            "Format": "Duration",
            "Description": """Retention time for Long-Term DynamoDB entries.

This table is used to deep-dive analysis of noticeable events encountered by a CloneSquad deployment. It is mainly used
to improve CloneSquad over time by allowing easy sharing of essential data for remote debugging.
"""
        },
        "notify.event.keep_acked_records": "0",
        "notify.debug.obfuscate_s3_reports": "1",
        "notify.debug.send_s3_reports": "1"
    })
    # Long-term notify records are compressed and expire with the configured TTL.
    self.state.register_aggregates([{
        "Prefix": "notify.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("notify.event.longterm.ttl"),
        "Exclude": []
    }])
    global notify_mgr
    notify_mgr = self
def init(with_kvtable=True, with_predefined_configuration=True):
    """Initialize global configuration and build all management objects.

    Each object is stored into the module-level `ctx` dict immediately after
    construction: later constructors receive `ctx` and may read the entries
    created by earlier ones, so the creation order below matters.
    """
    config.init(ctx, with_kvtable=with_kvtable, with_predefined_configuration=with_predefined_configuration)
    Cfg.register({
        "app.run_period,Stable" : {
            "DefaultValue": "seconds=20",
            "Format" : "Duration",
            "Description" : """Period when the Main scheduling Lambda function is run.

The smaller, the more accurate and reactive is CloneSquad. The bigger, the cheaper is CloneSquad to run itself (Lambda executions, Cloudwatch GetMetricData, DynamoDB queries...)
"""
        },
        "app.default_ttl" : "300",
        "app.disable,Stable": {
            "DefaultValue": 0,
            "Format": "Bool",
            "Description": """Flag to disable Main Lambda function responsible to start/stop EC2 instances.

It disables completly CloneSquad. While disabled, the Lambda will continue to be started every minute to test
if this flag changed its status and allow normal operation again."""
        },
        "app.archive_interact_events": "0"
    })
    log.debug("Setup management objects.")
    log.debug("o_state setup...")
    ctx["o_state"] = state.StateManager(ctx)
    log.debug("o_ec2 setup...")
    ctx["o_ec2"] = ec2.EC2(ctx, ctx["o_state"])
    log.debug("o_ssm setup...")
    ctx["o_ssm"] = ssm.SSM(ctx)
    log.debug("o_targetgroup setup...")
    ctx["o_targetgroup"] = targetgroup.ManagedTargetGroup(ctx, ctx["o_ec2"])
    log.debug("o_cloudwatch setup...")
    ctx["o_cloudwatch"] = cloudwatch.CloudWatch(ctx, ctx["o_ec2"])
    log.debug("o_notify setup...")
    ctx["o_notify"] = notify.NotifyMgr(ctx, ctx["o_state"], ctx["o_ec2"], ctx["o_targetgroup"], ctx["o_cloudwatch"])
    log.debug("o_ec2_schedule setup...")
    ctx["o_ec2_schedule"] = ec2_schedule.EC2_Schedule(ctx, ctx["o_ec2"], ctx["o_targetgroup"], ctx["o_cloudwatch"])
    log.debug("o_scheduler setup...")
    ctx["o_scheduler"] = scheduler.Scheduler(ctx, ctx["o_ec2"], ctx["o_cloudwatch"])
    log.debug("o_interact setup...")
    ctx["o_interact"] = interact.Interact(ctx)
    log.debug("o_rds setup...")
    ctx["o_rds"] = rds.RDS(ctx, ctx["o_state"], ctx["o_cloudwatch"])
    log.debug("o_transferfamily setup...")
    ctx["o_transferfamily"] = transferfamily.TransferFamily(ctx, ctx["o_state"], ctx["o_cloudwatch"])
def init(with_kvtable=True, with_predefined_configuration=True):
    """Initialize global configuration and build all management objects.

    Older bootstrap variant: objects are created into locals first, then
    published into `ctx` in one batch at the end. Construction order still
    matters because each constructor receives previously-built objects.
    """
    log.debug("Init.")
    config.init(ctx, with_kvtable=with_kvtable, with_predefined_configuration=with_predefined_configuration)
    Cfg.register({
        "app.run_period,Stable" : {
            "DefaultValue": "seconds=20",
            "Format" : "Duration",
            "Description" : """Period when the Main scheduling Lambda function is run.

The smaller, the more accurate and reactive is CloneSquad. The bigger, the cheaper is CloneSquad to run itself (Lambda executions, Cloudwatch GetMetricData, DynamoDB queries...)
"""
        },
        "app.default_ttl" : "300",
        "app.disable,Stable": {
            "DefaultValue": 0,
            "Format": "Bool",
            "Description": """Flag to disable Main Lambda function responsible to start/stop EC2 instances.

It disables completly CloneSquad. While disabled, the Lambda will continue to be started every minute to test
if this flag changed its status and allow normal operation again."""
        }
    })
    log.debug("Setup management objects.")
    o_state = state.StateManager(ctx)
    o_ec2 = ec2.EC2(ctx, o_state)
    o_targetgroup = targetgroup.ManagedTargetGroup(ctx, o_ec2)
    o_cloudwatch = cloudwatch.CloudWatch(ctx, o_ec2)
    o_notify = notify.NotifyMgr(ctx, o_state, o_ec2, o_targetgroup, o_cloudwatch)
    o_ec2_schedule = ec2_schedule.EC2_Schedule(ctx, o_ec2, o_targetgroup, o_cloudwatch)
    o_scheduler = scheduler.Scheduler(ctx, o_ec2, o_cloudwatch)
    o_interact = interact.Interact(ctx)
    o_rds = rds.RDS(ctx, o_state, o_cloudwatch)
    # Publish all management objects in the shared context.
    ctx.update({
        "o_state" : o_state,
        "o_ec2" : o_ec2,
        "o_targetgroup" : o_targetgroup,
        "o_cloudwatch" : o_cloudwatch,
        "o_notify" : o_notify,
        "o_ec2_schedule" : o_ec2_schedule,
        "o_scheduler" : o_scheduler,
        "o_interact" : o_interact,
        "o_rds" : o_rds
    })
def __init__(self, context, ec2):
    """Remember collaborators, declare targetgroup config keys and aggregates."""
    self.context = context
    self.ec2 = ec2
    # Flags tracked across the run.
    self.state_changed = False
    self.prereqs_done = False
    targetgroup_keys = {
        "targetgroup.debug.inject_fault_status": "",
        "targetgroup.default_state_ttl": "minutes=30",
        "targetgroup.slow_deregister_timeout": "minutes=2",
    }
    Cfg.register(targetgroup_keys)
    # Targetgroup status entries are compressed and expire with the default TTL.
    status_aggregate = {
        "Prefix": "targetgroup.status.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("targetgroup.default_state_ttl"),
    }
    self.ec2.register_state_aggregates([status_aggregate])
def __init__(self, context, o_state, o_cloudwatch):
    """Remember collaborators and declare the RDS configuration keys."""
    self.context = context
    self.o_state = o_state
    self.cloudwatch = o_cloudwatch
    rds_keys = {
        "rds.enable,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Enable management of RDS databases.

DEPRECATED. (Now automatic detection of RDS resources is implemented.)
"""
        },
        "rds.state.default_ttl": "hours=2",
        "rds.metrics.time_resolution": "60",
    }
    Cfg.register(rds_keys)
def __init__(self, context, o_state, o_cloudwatch):
    """Remember collaborators and declare the TransferFamily configuration keys."""
    self.context = context
    self.o_state = o_state
    self.cloudwatch = o_cloudwatch
    # Discovered TransferFamily servers (filled by get_prerequisites()).
    self.servers = []
    transferfamily_keys = {
        "transferfamily.enable,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Enable management of TransferFamily services.

DEPRECATED. (Now automatic detection of TransferFamilty services is implemented).
"""
        },
        "transferfamily.state.default_ttl": "hours=2",
        "transferfamily.metrics.time_resolution": "60",
    }
    Cfg.register(transferfamily_keys)
def __init__(self, context):
    """Initialize SSM support: collaborators, configuration keys, aggregates.

    Declares every `ssm.*` configuration key (maintenance windows, instance
    event scripts, TTLs) and registers the `ssm.events` state aggregate.
    """
    self.context = context
    self.o_state = self.context["o_state"]
    # Maintenance window cache, filled by get_prerequisites().
    self.maintenance_windows = {}
    self.o_ec2 = self.context["o_ec2"]
    # NOTE(review): GroupName appears unused in this constructor — confirm.
    GroupName = self.context["GroupName"]
    Cfg.register({
        "ssm.enable,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Enable globally support for AWS System Manager by CloneSquad.

CloneSquad can leverage AWS SSM to take into account Maintenance Windows and use SSM RunCommand to execute status probe scripts located in managed instances.
"""
        },
        "ssm.feature.events.ec2.maintenance_window_period,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Enable/Disable sending Enter/Exit Maintenance Window period events to instances.

This enables event notification support of instances when they enter or exit a SSM Maintenance Window. When set to 1, CloneSquad sends a SSM RunCommand to run the script `/etc/cs-ssm/(enter|exit)-maintenance-window-period` script located in each instances. The event is repeasted until the script returns a zero-code. If the script doesn't exist on an instance, the event is sent only once.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
"""
        },
        "ssm.feature.events.ec2.instance_ready_for_shutdown,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Ensure instance shutdown readiness with /etc/cs-ssm/instance-ready-for-shutdown script on SSM managed instances."

This enables support for direct sensing of instance shutdown readiness based on the return code of a script located in each EC2 instances. When set to 1, CloneSquad sends a SSM RunCommand to a managed instance candidate prior to shutdown:
* If `/etc/cs-ssm/instance-ready-for-shutdown` is present, it is executed with the SSM agent daemon user rights: If the script returns a NON-zero code, Clonesquad will postpone the instance shutdown and will call this script again after 2 * [ `app.run_period`](#apprun_period) seconds...
* If `/etc/cs-ssm/instance-ready-for-shutdown` is NOT present, immediate shutdown readyness is assumed.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
"""
        },
        "ssm.feature.events.ec2.instance_ready_for_shutdown.max_shutdown_delay,Stable": {
            "DefaultValue": "hours=1",
            "Format": "Duration",
            "Description": """ Maximum time to spend waiting for SSM based ready-for-shutdown status.

When SSM support is enabled with [`ssm.feature.events.ec2.instance_ready_for_operation`](#ssmfeatureec2instance_ready_for_operation), instances may notify CloneSquad when they are ready for shutdown. This setting defines the maximum time spent by CloneSquad to receive this signal before to forcibly shutdown the instance.
"""
        },
        "ssm.feature.events.ec2.instance_ready_for_operation,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Ensure an instance go out from 'initializing' state based on an instance script returns code.

This enables support for direct sensing of instance **serving** readiness based on the return code of a script located in each EC2 instances. CloneSquad never stops an instance in the 'initializing' state. This state is normally automatically left after [`ec2.schedule.start.warmup_delay`](#ec2schedulestartwarmup_delay) seconds: When this setting is set, an SSM command is sent to each instance and call a script to get a direct ack that an instance can left the 'initializing' state.
* If `/etc/cs-ssm/instance-ready-for-operation` is present, it is executed with the SSM agent daemon user rights: If the script returns a NON-zero code, Clonesquad will postpone the instance go-out from 'initializing' state and will call this script again after 2 * [ `app.run_period`](#apprun_period) seconds...
* If `/etc/cs-ssm/instance-ready-for-operation` is NOT present, the instance leaves the 'initializing' state immediatly after 'warmup delay'..

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
"""
        },
        "ssm.feature.events.ec2.instance_ready_for_operation.max_initializing_time,Stable": {
            "DefaultValue": "hours=1",
            "Format": "Duration",
            "Description": """Max time that an instance can spend in 'initializing' state.

When [`ssm.feature.events.ec2.instance_ready_for_operation`](#ssmfeatureec2instance_ready_for_operation) is set, this setting defines the maximum duration that CloneSquas will attempt to get a status 'ready-for-operation' for a specific instance through SSM RunCommand calls and execution of the `/etc/cs-ssm/instance-ready-for-operation` script.
"""
        },
        "ssm.feature.events.ec2.scaling_state_changes,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Call a script in instance when the instance scaling state changes.

When this toggle set, the script `/etc/cs-ssm/instance-scaling-state-change` located into managed instances, is called to notify about a scaling status change. Currently, only `draining` and `bounced` events are sent (`bounced`is sent only if the instance bouncing feature is activated). For example, if an instance enters the `draining` state because CloneSquad wants to shutdown it, this event is called.
* If the script doesn't exists, the event is sent only once,
* If the script returns a non-zero code, the event will be repeated.

> Note: This event differs from [`ssm.feature.events.ec2.instance_ready_for_shutdown`](#ssmfeatureeventsec2instance_ready_for_shutdown) one as it is only meant to inform the instance about a status change. The [`ssm.feature.events.ec2.instance_ready_for_shutdown`](#ssmfeatureeventsec2instance_ready_for_shutdown) event is a request toward the instance asking for an approval to shutdown.
"""
        },
        "ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """On `draining` state, specified ports are blocked and so forbid new TCP connections (i.e. *Connection refused* message).

This features installs, **on `draining` time**, temporary iptables chain and rules denying new TCP connections to the specified port list. This is useful, for example, to break a healthcheck life line as soon as an instance enters the `draining` state: It is especially useful when non-ELB LoadBalancers are used and CloneSquad does not know how to tell these loadbalancers that no more traffic needs to be sent to a drained instance. As it blocks only new TCP connections, currently active connections can terminate gracefully during the draining period.

> When instances are served only by CloneSquad managed ELB(s), there is no need to use this feature as CloneSquad will unregister the targets as soon as placed in `draining`state.

By default, no blocked port list is specified, so no iptables call is performed on the instance.
"""
        },
        "ssm.feature.events.ec2.scaling_state_changes.draining.{SubfleeName}.connection_refused_tcp_ports,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """Defines the blocked TCP port list for the specified fleet.

This setting overrides the value defined in [`ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports`](#ssmfeatureeventsec2scaling_state_changesdrainingconnection_refused_tcp_ports) for the specified fleet.

> Use `__main__` to designate the main fleet."""
        },
        "ssm.feature.events.ec2.instance_healthcheck": "0",
        "ssm.feature.maintenance_window,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Defines if SSM maintenance window support is activated.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
"""
        },
        "ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running,Stable": {
            "DefaultValue": "1",
            "Format": "Bool",
            "Description": """Defines if a subfleet is forcibly set to 'running' when a maintenance window is actice.

By default, all the subfleets is woken up by a maintenance window ([`subfleet.{SubfleetName}.state`](#subfleetsubfleetnamestate) is temprarily forced to `running`).
""",
        },
        "ssm.state.default_ttl": "hours=1",
        "ssm.state.command.default_ttl": "minutes=10",
        "ssm.state.command.result.default_ttl": "minutes=5",
        "ssm.feature.maintenance_window.start_ahead,Stable": {
            "DefaultValue": "minutes=15",
            "Format": "Duration",
            "Description": """Start instances this specified time ahead of the next Maintenance Window.

In order to ensure that instances are up and ready when a SSM Maintenance Window starts, they are started in advance of the 'NextExecutionTime' defined in the SSM maintenance window object.
"""
        },
        "ssm.feature.maintenance_window.start_ahead.max_jitter": "66%",
        "ssm.feature.maintenance_window.global_defaults": "CS-GlobalDefaultMaintenanceWindow",
        "ssm.feature.maintenance_window.defaults": "CS-{GroupName}",
        "ssm.feature.maintenance_window.mainfleet.defaults": "CS-{GroupName}-Mainfleet",
        "ssm.feature.maintenance_window.mainfleet.ec2.schedule.min_instance_count": {
            "DefaultValue": "100%",
            "Format": "IntegerOrPercentage",
            "Description": """Minimum number of instances serving in the fleet when the Maintenance Window occurs.

> Note: If this value is set to the special value '100%', the setting [`ec2.schedule.desired_instance_count`](#ec2scheduledesired_instance_count) is also forced to '100%'. This implies that any LightHouse instances will also be started and full fleet stability ensured during the Maintenance Window.
"""
        },
        "ssm.feature.maintenance_window.subfleet.__all__.defaults": "CS-{GroupName}-Subfleet.__all__",
        "ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults": "CS-{GroupName}-Subfleet.{SubfleetName}",
        "ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count": {
            "DefaultValue": "100%",
            "Format": "IntegerOrPercentage",
            "Description": """Minimum number of instances serving in the fleet when the Maintenance Window occurs.

> Note: If this value is set to the special value '100%', the setting [`subfleet.{subfleet}.ec2.schedule.desired_instance_count`](#subfleetsubfleetec2scheduledesired_instance_count) is also forced to '100%' ensuring full subfleet stability.
"""
        },
    })
    # SSM event entries are compressed and expire with the default SSM TTL.
    self.o_state.register_aggregates([
        {
            "Prefix": "ssm.events",
            "Compress": True,
            "DefaultTTL": Cfg.get_duration_secs("ssm.state.default_ttl"),
            "Exclude" : []
        },
    ])
def __init__(self, context, o_state):
    """Initialize the EC2 manager: collaborators, config keys, aggregates.

    Fleet data (instances, statuses) is populated later by
    get_prerequisites(); attributes start as None/False placeholders.
    """
    self.context = context
    self.instances = None
    self.instance_ids = None
    self.instance_statuses = None
    self.prereqs_done = False
    self.o_state = o_state
    self.state_table = None
    Cfg.register({
        "ec2.describe_instances.max_results": "250",
        "ec2.describe_instance_types.enabled": "0",
        "ec2.az.statusmgt.disable": 0,
        "ec2.az.unavailable_list,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """List of Availability Zone names (ex: *eu-west-3c*) or AZ Ids (ex: *euw3-az1*).

Typical usage is to force a fleet to consider one or more AZs as unavailable (AZ eviction). The autoscaler will then refuse to schedule
new instances on these AZs. Existing instances in those AZs are left unchanged but on scalein condition will be shutdown in priority (see [`ec2.az.evict_instances_when_az_faulty`](#ec2azinstance_faulty_when_az_faulty) to change this behavior).

This setting can be used during an AWS LSE (Large Scale Event) to manually define that an AZ is unavailable.

> Note: CloneSquad also uses the EC2.describe_availability_zones() API to discover dynamically LSE events. So, setting directly this key
should not be needed in most cases.

Please notice that, once an AZ is enabled again (either manually or automatically), instance fleet WON'T be rebalanced automatically:
* If Instance bouncing is enabled, the fleet will be progressively rebalanced (convergence time will depend on the instance bouncing setting)
* If instance bouncing is not configured, user can force a rebalancing by switching temporarily the fleet to `100%` during few minutes
(with [`ec2.schedule.desired_instance_count`](#ec2scheduledesired_instance_count) sets temporarily to `100%`) and switch back to the original value.
"""
        },
        "ec2.az.evict_instances_when_az_faulty,Stable": {
            "DefaultValue": "0",
            "Format": "Bool",
            "Description": """Defines if instances running in a AZ with issues must be considered 'unavailable'

By Default, instances running in an AZ reported with issues are left untouched and these instances will only be evicted if their invidual healthchecks fail or on scalein events.

Settting this parameter to 1 will force Clonesquad to consider all the instances running in faulty AZ as 'unavailable' and so forcing their immediate replacement in healthy AZs in the region.
"""
        },
        "ec2.state.default_ttl": "days=1",
        "ec2.state.error_ttl": "minutes=5",
        "ec2.state.status_ttl": "days=40",
        "ec2.state.error_instance_ids": "",
        "ec2.state.excluded_instance_ids": {
            "DefaultValue": "",
            "Format": "List of String",
            "Description": """List of instance ids to consider as excluded.

One of the 2 ways to exclude existant instances to be managed by CloneSquad, this key is a list of instance ids (ex:
i-077b2ae6988f33de4;i-0564c45bfa5bb6aa5). The other way to exclude instances, is to tag instances with "clonesquad:excluded" key
with value 'True'.
"""
        },
        "ec2.debug.availability_zones_impaired": "",
    })
    # Per-instance state entries are compressed; scaling state is excluded
    # from the aggregate (kept as individual records).
    self.o_state.register_aggregates([{
        "Prefix": "ec2.instance.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("ec2.state.default_ttl"),
        "Exclude": ["ec2.instance.scaling.state."]
    }])
def get_prerequisites(self, only_if_not_already_done=False):
    """Collect the EC2 fleet state: instances, statuses and AZ health.

    Populates self.instances / self.instance_ids / self.instance_statuses /
    self.availability_zones / self.az_with_issues and registers dynamic
    static-subfleet configuration keys.

    :param only_if_not_already_done: when True, return immediately if a
        previous call already completed.
    """
    if only_if_not_already_done and self.prereqs_done:
        return

    self.state_table = self.o_state.get_state_table()
    client = self.context["ec2.client"]

    # Retrieve list of instances with appropriate tag.
    Filters = [{
        'Name': 'tag:clonesquad:group-name',
        'Values': [self.context["GroupName"]]
    }]
    instances = []
    response = None
    while response is None or "NextToken" in response:
        # BUGFIX: do not pass NextToken="" on the first call — boto3
        # rejects empty pagination tokens; only forward a real token.
        kwargs = {
            "Filters": Filters,
            "MaxResults": Cfg.get_int("ec2.describe_instances.max_results")
        }
        if response is not None:
            kwargs["NextToken"] = response["NextToken"]
        response = client.describe_instances(**kwargs)
        for reservation in response["Reservations"]:
            instances.extend(reservation["Instances"])

    # Filter out instances with inappropriate state.
    self.instances = [i for i in instances
                      if i["State"]["Name"] not in ["shutting-down", "terminated"]]
    self.instance_ids = [i["InstanceId"] for i in self.instances]

    # Enrich describe_instances output with instance type details.
    if Cfg.get_int("ec2.describe_instance_types.enabled"):
        self.instance_types = []
        for i in self.instances:
            if i["InstanceType"] not in self.instance_types:
                self.instance_types.append(i["InstanceType"])
        if len(self.instance_types):
            response = client.describe_instance_types(InstanceTypes=self.instance_types)
            self.instance_type_details = response["InstanceTypes"]
            for i in self.instances:
                i["_InstanceType"] = next(
                    filter(lambda it: it["InstanceType"] == i["InstanceType"],
                           self.instance_type_details), None)

    # Get instances status.
    # BUGFIX: guard against an empty fleet — describe_instance_status with
    # an empty InstanceIds filter would return statuses of ALL instances
    # in the region instead of none.
    instance_statuses = []
    if self.instance_ids:
        response = None
        while response is None or "NextToken" in response:
            q = {"InstanceIds": self.instance_ids}
            if response is not None and "NextToken" in response:
                q["NextToken"] = response["NextToken"]
            response = client.describe_instance_status(**q)
            instance_statuses.extend(response["InstanceStatuses"])
    self.instance_statuses = instance_statuses

    # Get AZ status.
    response = client.describe_availability_zones()
    self.availability_zones = response["AvailabilityZones"]
    if len(self.availability_zones) == 0:
        raise Exception("Can't have a region with no AZ...")

    self.az_with_issues = []
    if not Cfg.get_int("ec2.az.statusmgt.disable"):
        for az in self.availability_zones:
            if az["State"] in ["impaired", "unavailable"]:
                self.az_with_issues.append(az)
            if az["State"] != "available":
                # BUGFIX: this log used zone_name/zone_id/zone_state before
                # they were defined (NameError); read them from 'az' instead.
                log.warning(
                    "AZ %s(%s) is marked with status '%s' by EC2.describe_availability_zones() API!"
                    % (az["ZoneName"], az["ZoneId"], az["State"]))
    else:
        log.warning(
            "Automatic AZ issues detection through describe_availability_zones() is DISABLED (ec2.az.statusmgt.disable != 0)..."
        )

    # Use these config keys to simulate an AWS Large Scale Event.
    all_az_names = [az["ZoneName"] for az in self.availability_zones]
    all_az_ids = [az["ZoneId"] for az in self.availability_zones]
    for a in Cfg.get_list("ec2.debug.availability_zones_impaired", default=[]):
        if a not in all_az_names and a not in all_az_ids:
            log.warning(
                "ec2.debug.availability_zones_impaired do not match local AZs! '%s'" % a)
    for a in Cfg.get_list("ec2.az.unavailable_list", default=[]):
        if a not in all_az_names and a not in all_az_ids:
            log.warning(
                "ec2.az.unavailable_list do not match local AZs! '%s'" % a)

    # Apply simulated/forced AZ states from configuration.
    for az in self.availability_zones:
        zone_name = az["ZoneName"]
        zone_id = az["ZoneId"]
        zone_state = az["State"]
        if zone_name in Cfg.get_list("ec2.debug.availability_zones_impaired", default=[]):
            zone_state = "impaired"
        if zone_id in Cfg.get_list("ec2.debug.availability_zones_impaired", default=[]):
            zone_state = "impaired"
        if zone_name in Cfg.get_list("ec2.az.unavailable_list", default=[]):
            zone_state = "unavailable"
        if zone_id in Cfg.get_list("ec2.az.unavailable_list", default=[]):
            zone_state = "unavailable"
        if zone_state != az["State"] and zone_state in [
            "impaired", "unavailable"
        ] and az not in self.az_with_issues:
            self.az_with_issues.append(az)
        az["State"] = zone_state
        if zone_state != "available":
            log.warning(
                "AZ %s(%s) is marked with status '%s' by configuration keys!"
                % (zone_name, zone_id, zone_state))

    # We need to register dynamically static subfleet configuration keys
    # to avoid a 'key unknown' warning when the user is going to set it.
    static_subfleet_names = self.get_static_subfleet_names()
    for static_fleet in static_subfleet_names:
        key = "staticfleet.%s.state" % static_fleet
        if not Cfg.is_builtin_key_exist(key):
            Cfg.register({key: ""})
    log.log(
        log.NOTICE,
        "Detected following static subfleet names across EC2 resources: %s"
        % static_subfleet_names)
    self.prereqs_done = True
def get_prerequisites(self):
    """Discover RDS databases/clusters tagged for this group; declare metrics.

    Does nothing when the state manager tracks no 'rds' resources.
    """
    if "rds" not in self.context["o_state"].get_resource_services():
        return
    rds_client = self.context["rds.client"]
    tagging_client = self.context["resourcegroupstaggingapi.client"]
    # Per-resource-kind dispatch: (describe call, filter key, response key).
    describe_map = {
        "db": (rds_client.describe_db_instances, "db-instance-id", "DBInstances"),
        "cluster": (rds_client.describe_db_clusters, "db-cluster-id", "DBClusters"),
    }
    self.databases = {"db": [], "cluster": []}
    # Iterate over a copy of the keys: '<type>.tags' entries are added below.
    for db_type in list(self.databases.keys()):
        pages = tagging_client.get_paginator('get_resources').paginate(
            ResourceTypeFilters=["rds:%s" % db_type],
            TagFilters=[{
                'Key': 'clonesquad:group-name',
                'Values': [self.context["GroupName"]]
            }])
        tag_mappings = [m for page in pages for m in page['ResourceTagMappingList']]
        self.databases["%s.tags" % db_type] = tag_mappings
        if not tag_mappings:
            continue
        func, filter_key, response_index = describe_map[db_type]
        try:
            arns = [t["ResourceARN"] for t in tag_mappings]
            described = func(Filters=[{'Name': filter_key, 'Values': arns}])
            self.databases[db_type].extend(described[response_index])
        except Exception:
            log.exception("Failed to describe RDS database type '%s'" % (db_type))

    metric_time_resolution = Cfg.get_int("rds.metrics.time_resolution")
    # CloudWatch supports only 1s (high) or >=60s (standard) resolutions.
    if metric_time_resolution < 60:
        metric_time_resolution = 1  # Switch to highest resolution
    self.cloudwatch.register_metric([
        {"MetricName": "Subfleet.RDS.Size",
         "Unit": "Count", "StorageResolution": metric_time_resolution},
        {"MetricName": "Subfleet.RDS.AvailableDBs",
         "Unit": "Count", "StorageResolution": metric_time_resolution},
        {"MetricName": "Subfleet.RDS.StoppingDBs",
         "Unit": "Count", "StorageResolution": metric_time_resolution},
        {"MetricName": "Subfleet.RDS.StartingDBs",
         "Unit": "Count", "StorageResolution": metric_time_resolution},
    ])

    # Register dynamically subfleet configuration keys to avoid a
    # 'key unknown' warning when the user sets them.
    subfleet_names = self.get_rds_subfleet_names()
    for subfleet in subfleet_names:
        key = "subfleet.%s.state" % subfleet
        if not Cfg.is_builtin_key_exist(key):
            Cfg.register({key: ""})
    log.log(
        log.NOTICE,
        "Detected following subfleet names across RDS resources: %s" % subfleet_names)
def __init__(self, context, ec2):
    """Remember collaborators and declare the SNS manager configuration key."""
    self.context = context
    self.ec2 = ec2
    Cfg.register({
        "snsmgr.record_expiration_delay": "hours=1",
    })
def __init__(self, context, ec2):
    """Initialize the CloudWatch manager.

    Declares all `cloudwatch.*` configuration keys (alarm handling, metric
    publication, dashboard), one `cloudwatch.alarmXX.configuration_url` key
    per allowed alarm slot, the GetMetricData self-metric, and the dashboard
    state aggregate.
    """
    self.context = context
    self.ec2 = ec2
    # Alarm/metric caches, populated later.
    self.alarms = None
    self.metrics = []
    Cfg.register({
        "cloudwatch.describe_alarms.max_results": "50",
        "cloudwatch.default_ttl": "days=1",
        "cloudwatch.alarms.max_per_instance": "6",
        "cloudwatch.alarms.min_instance_age": "minutes=3",
        "cloudwatch.configure.max_alarms_deleted_batch_size": "5",
        "cloudwatch.metrics.namespace": "CloneSquad",
        "cloudwatch.metrics.subnamespace": "",
        "cloudwatch.metrics.excluded,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """List of metric pattern names to not send to Cloudwatch

This configuration key is used to do Cost optimization by filtering which CloneSquad Metrics are sent to Cloudwatch.
It support regex patterns.

> Ex: StaticFleet.*;NbOfBouncedInstances
"""
        },
        "cloudwatch.metrics.data_period": "minutes=2",
        "cloudwatch.metrics.max_update_per_batch": "20",
        "cloudwatch.metrics.cache.max_retention_period": "minutes=10",
        "cloudwatch.metrics.minimum_polled_alarms_per_run": "1",
        "cloudwatch.metrics.time_for_full_metric_refresh,Stable": {
            "DefaultValue": "minutes=1,seconds=30",
            "Format": "Duration",
            "Description": """The total period for a complete refresh of EC2 Instance metrics

This parameter is a way to reduce Cloudwatch cost induced by GetMetricData API calls. It defines indirectly how many alarm metrics
will be polled in a single Main Lambda execution. A dedicated algorithm is used to extrapolate missing data based
on previous GetMetricData API calls.

Reducing this value increase the accuracy of the scaling criteria and so, the reactivity of CloneSquad to a sudden burst of activity load but at the expense of Cloudwatch.GetMetricData API cost.

This parameter does not influence the polling of user supplied alarms that are always polled at each run.
"""
        },
        "cloudwatch.dashboard.use_default,Stable": {
            "DefaultValue": 1,
            "Format": "Bool",
            "Description": """Enable or disable the Cloudwatch dashboard for CloneSquad.

The dashboard is enabled by default.
"""
        },
        "cloudwatch.dashboard.update_interval": "hours=1",
        "cloudwatch.dashboard.snapshot_width": 1000,
        "cloudwatch.dashboard.snapshot_height": 400
    })
    Cfg.register({
        "cloudwatch.alarm00.configuration_url,Stable": {
            "DefaultValue": "",
            "Format": "MetaString",
            "Description": """Alarm specification to track for scaling decisions.

Ex: internal:ec2.scaleup.alarm-cpu-gt-75pc.yaml,Points=1001,BaselineThreshold=30.0

See [Alarm specification documentation](ALARMS_REFERENCE.md)  for more details.
"""
        }
    })
    # Register the remaining alarm slots (alarm01..alarmNN) dynamically,
    # up to the configured per-instance maximum.
    for i in range(1, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
        Cfg.register({
            "cloudwatch.alarm%02d.configuration_url,Stable" % i: {
                "DefaultValue": "",
                "Format": "MetaString",
                "Description": """See `cloudwatch.alarm00.configuration_url`.
"""
            }
        })
    # Self-metric tracking the volume of GetMetricData calls (cost watch).
    self.register_metric([{
        "MetricName": "Cloudwatch.GetMetricData",
        "Unit": "Count",
        "StorageResolution": 60
    }])
    # Dashboard state entries are compressed and expire with the default TTL.
    self.ec2.register_state_aggregates([{
        "Prefix": "cloudwatch.dashboard.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("cloudwatch.default_ttl"),
        "Exclude": []
    }])
def initialize(context):
    """Zope 2 product initializer hook.

    Performs the package start-up wiring: plugin-role registration for
    collective.teamwork, then the access-logging monkey patch.
    """
    # Register plugin roles for collective.teamwork.
    config.register()
    # Apply the access-logging patch.
    patch_access_logging()
def get_prerequisites(self):
    """ Gather instance status by calling SSM APIs.

    Builds the list of SSM Maintenance Windows applicable to this
    CloneSquad deployment (global defaults, main fleet defaults and
    per-subfleet defaults), resolves their tags, then stores the result in
    `self.maintenance_windows` before triggering command status updates and
    maintenance-window housekeeping.
    """
    if not Cfg.get_int("ssm.enable"):
        log.log(log.NOTICE, "SSM support is currently disabled. Set ssm.enable to 1 to enabled it.")
        return
    now = self.context["now"]
    self.ttl = Cfg.get_duration_secs("ssm.state.default_ttl")
    GroupName = self.context["GroupName"]
    misc.initialize_clients(["ssm"], self.context)
    client = self.context["ssm.client"]

    # Retrieve all SSM maintenance window names applicable to this CloneSquad
    # deployment, grouped by scope.
    mw_names = {
        "__globaldefault__": {},
        "__default__": {},
        "__main__": {},
        "__all__": {}
    }
    fmt = self.context.copy()
    mw_names["__globaldefault__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.global_defaults", fmt=fmt)
    mw_names["__default__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.defaults", fmt=fmt)
    mw_names["__main__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.mainfleet.defaults", fmt=fmt)
    mw_names["__all__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.subfleet.__all__.defaults", fmt=fmt)
    # Deduplicated union of all window names, preserving scope order.
    all_mw_names = mw_names["__globaldefault__"]["Names"]
    all_mw_names.extend([ n for n in mw_names["__default__"]["Names"] if n not in all_mw_names])
    all_mw_names.extend([ n for n in mw_names["__main__"]["Names"] if n not in all_mw_names])
    all_mw_names.extend([ n for n in mw_names["__all__"]["Names"] if n not in all_mw_names])
    # NOTE(review): the keys below are registered with a default value read
    # from the '{SubfleetName}' template keys (literal placeholder — the
    # f-prefixes here expand nothing); presumably those template keys are
    # registered elsewhere — confirm against the full file.
    Cfg.register({
        f"ssm.feature.maintenance_window.subfleet.__all__.force_running":
            Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
        f"ssm.feature.events.ec2.scaling_state_changes.draining.__main__.connection_refused_tcp_ports":
            Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
    })
    # Register per-subfleet keys dynamically (one set per discovered subfleet)
    # and accumulate their maintenance-window names.
    for SubfleetName in self.o_ec2.get_subfleet_names():
        fmt["SubfleetName"] = SubfleetName
        mw_names[f"Subfleet.{SubfleetName}"] = {}
        Cfg.register({
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults"),
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count"),
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
            f"ssm.feature.events.ec2.scaling_state_changes.draining.{SubfleetName}.connection_refused_tcp_ports":
                Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
        })
        mw_names[f"Subfleet.{SubfleetName}"]["Names"] = Cfg.get_list(f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults", fmt=fmt)
        all_mw_names.extend([ n for n in mw_names[f"Subfleet.{SubfleetName}"]["Names"] if n not in all_mw_names])

    # Describe the windows, 20 names per Filter at a time (API filter value limit).
    names = all_mw_names
    mws = []
    while len(names):
        paginator = client.get_paginator('describe_maintenance_windows')
        response_iterator = paginator.paginate(
            Filters=[
                {
                    'Key': 'Name',
                    'Values': names[:20]
                },
            ])
        for r in response_iterator:
            for wi in r["WindowIdentities"]:
                # Disabled windows are ignored entirely.
                if not wi["Enabled"]:
                    log.log(log.NOTICE, f"SSM Maintenance Window '%s' not enabled. Ignored..." % wi["Name"])
                    continue
                # A window without 'NextExecutionTime' is kept but flagged.
                if "NextExecutionTime" not in wi:
                    log.log(log.NOTICE, f"/!\ SSM Maintenance Window '%s' without 'NextExecutionTime'." % wi["Name"])
                if wi not in mws:
                    mws.append(wi)
        names = names[20:]
    # Convert string dates into datetime objects.
    for d in mws:
        if "NextExecutionTime" in d:
            d["NextExecutionTime"] = misc.str2utc(d["NextExecutionTime"])

    # Retrieve Maintenance Window tags with the resourcegroup API first.
    tagged_mws = self.context["o_state"].get_resources(service="ssm", resource_name="maintenancewindow")
    for tmw in tagged_mws:
        mw_id = tmw["ResourceARN"].split("/")[1]
        mw = next(filter(lambda w: w["WindowId"] == mw_id, mws), None)
        if mw:
            mw["Tags"] = tmw["Tags"]
    # Fall back to a direct list_tags_for_resource call for windows that the
    # resourcegroup API did not report, then keep only windows that are either
    # tagged or part of the global defaults.
    valid_mws = []
    for mw in mws:
        mw_id=mw["WindowId"]
        if "Tags" not in mw:
            try:
                response = client.list_tags_for_resource(ResourceType='MaintenanceWindow', ResourceId=mw_id)
                mw["Tags"] = response['TagList'] if 'TagList' in response else []
            except Exception as e:
                # Best effort: a tag-fetch failure only logs an error.
                log.error(f"Failed to fetch Tags for MaintenanceWindow '{mw_id}'")
        if ("Tags" not in mw or not len(mw["Tags"])) and mw["Name"] not in mw_names["__globaldefault__"]["Names"]:
            log.warning(f"Please tag SSM Maintenance Window '%s/%s' with 'clonesquad:group-name': '%s'!" % (mw["Name"], mw["WindowId"], self.context["GroupName"]))
            continue
        valid_mws.append(mw)
    self.maintenance_windows = {
        "Names": mw_names,
        "Windows": valid_mws
    }
    # Update asynchronous results from previously launched commands
    self.update_pending_command_statuses()
    # Perform maintenance window house keeping
    self.manage_maintenance_windows()
    if len(mws):
        log.log(log.NOTICE, f"Found matching SSM maintenance windows: %s" % self.maintenance_windows["Windows"])
    # Hard dependency toward EC2 module. We update the SSM instance initializing states
    self.o_ec2.update_ssm_initializing_states()
import discord
import random
import config
import wisdom

# Rotating "Playing ..." statuses, one per line of statuses.txt (read at import time).
with open("./statuses.txt") as f:
    STATUSES = f.read().splitlines()
SAVE = []
LOG_CHANNEL_ID = None          # channel id used for logging; overridable via config
messages_since_startup = 0     # counter reset at each bot startup
messages_total = 0             # counter registered for persistence via config below
config.register(__name__, 'LOG_CHANNEL_ID')
config.register(__name__, 'messages_total')


async def game_status_per_message(client, message):
    """Per-message hook: log private messages, feed wisdom stats, and refresh
    the bot's presence whenever the startup counter is a multiple of 200.

    NOTE(review): uses the pre-1.0 discord.py API (`channel.is_private`,
    `change_presence(game=...)`).
    """
    if message.channel.is_private:
        # Direct message: log a sanitized copy and do nothing else.
        sanitized = await client.sanitize(message.content)
        client.log(f'Recieved message {sanitized} from {message.author}')
        return
    wisdom.add_message(message.author.id)
    global messages_since_startup, messages_total
    # NOTE(review): the counters declared 'global' are never incremented in the
    # code visible here — the increments presumably follow in the full file;
    # confirm before relying on the 200-message cadence.
    if messages_since_startup % 200 == 0:
        await client.change_presence(game=discord.Game(
            name=random.choice(STATUSES)))
def initialize(context):
    """Zope 2 product initializer hook.

    Wires up the package at start-up: plugin-role registration for
    collective.teamwork, followed by the runtime monkey patches this
    package ships (access logging, TinyMCE settings).
    """
    # Register plugin roles for collective.teamwork.
    config.register()
    # Apply runtime monkey patches.
    patch_access_logging()
    patch_tinymce_settings()
def manage_maintenance_windows(self):
    """ Read SSM Maintenance Window information and apply temporary configuration during maintenance period.

    For the main fleet and each subfleet, when a matching SSM Maintenance
    Window is active, push configuration overrides in a dedicated layer:
    min/desired instance counts, optional subfleet 'running' state, and any
    'clonesquad:config:<key>' tag found on the window. Also sends
    Enter/Exit maintenance-period events to the fleet instances.
    """
    config_tag = "clonesquad:config:"

    def _set_tag(fleet, config, mw):
        # Translate 'clonesquad:config:*' tags of maintenance window 'mw' into
        # configuration overrides. Returns the min_instance_count carried by
        # the tags, or None when the tags do not specify one.
        min_instance_count = None
        if "Tags" in mw:
            tags = {}
            for t in mw["Tags"]:
                if t["Key"].startswith(config_tag):
                    tags[t["Key"][len(config_tag):]] = t["Value"]
            if fleet is None:
                # Main fleet: direct min_instance_count tag.
                if "ec2.schedule.min_instance_count" in tags:
                    min_instance_count = tags["ec2.schedule.min_instance_count"]
            else:
                # Subfleet: the 'subfleet.__all__.' tag (checked second) takes
                # precedence over the subfleet-specific one.
                tag = f"subfleet.{fleet}.ec2.schedule.min_instance_count"
                if tag in tags:
                    min_instance_count = tags[tag]
                    del tags[tag]
                tag = f"subfleet.__all__.ec2.schedule.min_instance_count"
                if tag in tags:
                    min_instance_count = tags[tag]
                    del tags[tag]
            # Every remaining tag becomes a configuration override, provided
            # it refers to an existing built-in key.
            for t in tags:
                if not Cfg.is_builtin_key_exist(t):
                    # BUGFIX: message said 'objection' and rendered a spurious
                    # '.' between the tag prefix and the key name.
                    log.warning(f"On SSM MaintenanceWindow object %s/%s, tag '{config_tag}{t}' does not refer "
                                "to an existing configuration key!!" % (mw["WindowId"], mw["Name"]))
                    continue
                config[f"override:{t}"] = tags[t]
        return min_instance_count

    config = {}
    meta = {}
    is_maintenance_time = self.is_maintenance_time(meta=meta)
    self._record_last_maintenance_window_time(is_maintenance_time)

    # Send events with SSM and notify users (main fleet).
    instances = self.o_ec2.get_instances(State="pending,running", main_fleet_only=True)
    instance_ids = [i["InstanceId"] for i in instances]
    event_name = "ENTER_MAINTENANCE_WINDOW_PERIOD" if is_maintenance_time else "EXIT_MAINTENANCE_WINDOW_PERIOD"
    pretty_event_name = "EnterMaintenanceWindowPeriod" if is_maintenance_time else "ExitMaintenanceWindowPeriod"
    self.send_events(instance_ids, "maintenance_window.state_change", event_name, {
        }, notification_handler=self.ssm_maintenance_window_event, pretty_event_name=pretty_event_name)

    # Main fleet Maintenance window management
    if not is_maintenance_time:
        if "NextWindowMessage" in meta:
            log.log(log.NOTICE, meta["NextWindowMessage"])
    else:
        log.log(log.NOTICE, "Main fleet under Active Maintenance Window until %s : %s" %
                (meta["EndTime"], meta["MatchingWindow"]))
        min_instance_count = _set_tag(None, config, meta["MatchingWindow"])
        if min_instance_count is None:
            min_instance_count = Cfg.get("ssm.feature.maintenance_window.mainfleet.ec2.schedule.min_instance_count")
        config["override:ec2.schedule.min_instance_count"] = min_instance_count
        if min_instance_count == "100%":
            # '100%' also forces the desired count so ALL instances stay up.
            config["override:ec2.schedule.desired_instance_count"] = "100%"

    # Subfleet Maintenance window management
    for subfleet in self.o_ec2.get_subfleet_names():
        meta = {}
        is_maintenance_time = self.is_maintenance_time(fleet=subfleet, meta=meta)
        self._record_last_maintenance_window_time(is_maintenance_time, fleet=subfleet)
        # Send events with SSM and notify users (this subfleet).
        instances = self.o_ec2.get_instances(State="running", instances=self.o_ec2.get_subfleet_instances(subfleet_name=subfleet))
        instance_ids = [i["InstanceId"] for i in instances]
        event_name = "ENTER_MAINTENANCE_WINDOW_PERIOD" if is_maintenance_time else "EXIT_MAINTENANCE_WINDOW_PERIOD"
        # BUGFIX: 'pretty_event_name' was left over from the main-fleet section
        # and could disagree with this subfleet's 'event_name'.
        pretty_event_name = "EnterMaintenanceWindowPeriod" if is_maintenance_time else "ExitMaintenanceWindowPeriod"
        self.send_events(instance_ids, "maintenance_window.state_change", event_name, {
            }, notification_handler=self.ssm_maintenance_window_event, pretty_event_name=pretty_event_name)
        if not is_maintenance_time:
            if "NextWindowMessage" in meta:
                log.log(log.NOTICE, meta["NextWindowMessage"])
        else:
            log.log(log.NOTICE, f"Subfleet '{subfleet}' under Active Maintenance Window until %s : %s" %
                    (meta["EndTime"], meta["MatchingWindow"]))
            min_instance_count = _set_tag(subfleet, config, meta["MatchingWindow"])
            if min_instance_count is None:
                min_instance_count = Cfg.get(f"ssm.feature.maintenance_window.subfleet.{subfleet}.ec2.schedule.min_instance_count")
            config[f"override:subfleet.{subfleet}.ec2.schedule.min_instance_count"] = min_instance_count
            if min_instance_count == "100%":
                config[f"override:subfleet.{subfleet}.ec2.schedule.desired_instance_count"] = "100%"
            # BUGFIX: query the subfleet-specific key. The previous code read
            # the literal '{SubfleetName}' template key, ignoring the
            # per-subfleet overrides registered in get_prerequisites().
            if Cfg.get_int(f"ssm.feature.maintenance_window.subfleet.{subfleet}.force_running"):
                config[f"override:subfleet.{subfleet}.state"] = "running"

    # Register SSM Maintenance Window configuration override
    Cfg.register(config, layer="SSM Maintenance window override", create_layer_when_needed=True)
import config import discord import re # unicode fe57 = 'small exclamation mark' # unicode ff01 = 'fullwidth exclamation mark' PREFIX = ['!', '\ufe57', '\uff01'] config.register(__name__, 'PREFIX') if isinstance(PREFIX, str): PREFIX = [PREFIX] OFF_TOPIC_ID = [] config.register(__name__, 'OFF_TOPIC_ID') if isinstance(OFF_TOPIC_ID, str): OFF_TOPIC_ID = [OFF_TOPIC_ID] listed_commands = [] admin_commands = [] commands_help = [] def register(name, command, leisure=True, admin=False, delete=True, help=''): admin_commands.append(name) commands_help.append((name, help)) if not admin: listed_commands.append(name) def check(message): if message.channel.is_private and message.content.startswith(name): return "" if len(message.content) < 1:
# Timed moderation (mutes/bans) module for a discord.py (pre-1.0) bot.
import re
import asyncio
from karma import get_mentions
import time
import config
from discord import Forbidden, NotFound

# Duration/reason parser: an optional '<h>:' part, a mandatory number, with
# optional 'for <reason>' clauses and mentions (<...>) allowed around them.
# NOTE(review): exact unit semantics (hours:minutes vs minutes:seconds) are not
# visible here — confirm against the caller in the full file.
time_reg = (r'^(?:\s|(?:<[^>]*>)|(?:(?<= )for ([\w\s]+)(?<![0-9\s])(?! for)))*'
            r'(?:([0-9]+):)?([0-9]+)'
            r'(?:\s|(?:<[^>]*>)|(?: for ([\w\s]+)(?! for)))*$')

# MUTES: user.id -> (expiry_epoch, server_id, mute_role_id)
# BANS:  user.id -> (expiry_epoch, server_id)
# Both are registered with config for persistence/override.
MUTES, BANS = {}, {}
config.register(__name__, 'MUTES')
config.register(__name__, 'BANS')


async def add_mute(client, user, duration, server_id, mute_role):
    """Apply `mute_role` to `user` and record its expiry (`duration` seconds
    from now) in MUTES so it can be lifted later."""
    await client.add_roles(user, mute_role)
    MUTES[user.id] = (time.time() + duration, server_id, mute_role.id)


async def add_ban(client, user, duration, server_id):
    """Record a timed ban for `user` and ban them; a banned user needs no
    mute entry so any existing one is dropped."""
    BANS[user.id] = (time.time() + duration, server_id)
    # Second argument is discord.py's delete-message-days parameter —
    # presumably 0 to keep message history; confirm against discord.py 0.16 docs.
    await client.ban(user, 0)
    if user.id in MUTES:
        del MUTES[user.id]


async def check_all(client):
    # NOTE(review): body appears truncated in this view — presumably it walks
    # MUTES/BANS and lifts the expired entries; confirm against the full file.
    current_time = time.time()