def get_container_resources_dict():
    """Build a dict mapping container name -> container document enriched with live resources.

    Fetches the container structures from the remote database, collects the
    Node Scaler endpoint of every distinct host, asks each host for the
    containers it runs (via fill_container_dict) and attaches the reported
    resources to each container document. Containers whose info could not be
    retrieved are logged and skipped. Returns None when no containers exist.
    """
    containers = get_structures(db_handler, debug, subtype="container")  # Remote database operation
    if not containers:
        return

    # Gather the rescaler endpoint (ip/port) of every distinct host
    hosts_info = dict()
    for c in containers:
        h = c["host"]
        if h not in hosts_info:
            hosts_info[h] = {
                "host_rescaler_ip": c["host_rescaler_ip"],
                "host_rescaler_port": c["host_rescaler_port"],
            }

    # For each host, retrieve its containers and keep the ones we look for
    container_info = fill_container_dict(hosts_info, containers)

    result = dict()
    for c in containers:
        name = c["name"]
        if name not in container_info:
            log_warning(
                "Container info for {0} not found, check that it is really living in its supposed host '{1}', and that "
                "the host is alive and with the Node Scaler service running".format(name, c["host"]), debug)
            continue
        c["resources"] = container_info[name]
        result[name] = c
    return result
def match_usages_and_limits(self, structure_name, rules, usages, limits, resources):
    """Assemble per-resource limit/structure/usage data and evaluate every rule against it.

    Args:
        structure_name: Name of the structure being checked.
        rules: Iterable of rule documents, each carrying at least a "resource" key.
        usages: Mapping of dotted metric keys (e.g. "structure.mem.usage") to values.
        limits: Mapping resource -> limit document.
        resources: Mapping resource -> current resource document.

    Returns:
        List of events generated by the rules that were triggered.
    """
    # Distinct resources watched by at least one rule, preserving first-seen order.
    # FIX: replaced the 'if x in list: pass / else: append' no-op-branch anti-pattern.
    resources_with_rules = list()
    for rule in rules:
        if rule["resource"] not in resources_with_rules:
            resources_with_rules.append(rule["resource"])

    # A guardable resource is only useful if some rule actually watches it
    useful_resources = list()
    for resource in self.guardable_resources:
        if resource not in resources_with_rules:
            log_warning(
                "Resource {0} has no rules applied to it".format(resource), self.debug)
        else:
            useful_resources.append(resource)

    # Base data: per-resource limits and current structure values
    data = dict()
    for resource in useful_resources:
        if resource in resources:
            data[resource] = {
                "limits": {resource: limits[resource]},
                "structure": {resource: resources[resource]}
            }

    # Fold the usage metrics into the same per-resource data
    for usage_metric in usages:
        keys = usage_metric.split(".")
        struct_type, usage_resource = keys[0], keys[1]
        # Split the key from the retrieved data, e.g., structure.mem.usages, where mem is the resource
        if usage_resource in useful_resources:
            # NOTE(review): raises KeyError if 'usage_resource' is useful but missing
            # from 'resources' (no data entry created above) — confirm upstream guarantees
            data[usage_resource][struct_type][usage_resource][keys[2]] = usages[usage_metric]

    events = []
    for rule in rules:
        try:
            # Check that the rule is active, the resource to watch is guarded and that the rule is activated
            if self.rule_triggers_event(rule, data, resources):
                event_name = generate_event_name(rule["action"]["events"], rule["resource"])
                event = self.generate_event(event_name, structure_name, rule["resource"], rule["action"])
                events.append(event)
        except KeyError as e:
            log_warning(
                "rule: {0} is missing a parameter {1} {2}".format(
                    rule["name"], str(e), str(traceback.format_exc())), self.debug)
    return events
def get_data(funct):
    """Return the documents produced by the retrieval function registered under *funct*.

    Looks *funct* up in funct_map and calls it; on HTTP, key or value errors
    (e.g. a database that was just created/updated) the error is logged and an
    empty list is returned instead of raising.
    """
    retrieved = list()
    try:
        retrieved.extend(funct_map[funct]())
    except (requests.exceptions.HTTPError, KeyError, ValueError) as e:
        # An error might have been thrown because database was recently updated or created
        log_warning("Couldn't retrieve {0} info, error {1}.".format(funct, str(e)), debug)
    return retrieved
def refeed(self):
    """Main service loop of the Refeeder.

    Every epoch: fetch the service document, send a heartbeat, reload the
    config (DEBUG, WINDOW_TIMELAPSE, WINDOW_DELAY, ACTIVE), and if the
    service is active and container structures exist, run refeed_thread in a
    background thread, waiting for it before closing the epoch. Runs forever.
    """
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)
    while True:
        # Get service info
        service = get_service(self.couchdb_handler, SERVICE_NAME)

        # Heartbeat
        beat(self.couchdb_handler, SERVICE_NAME)

        # CONFIG
        myConfig.set_config(service["config"])
        self.debug = myConfig.get_value("DEBUG")
        debug = self.debug
        self.window_difference = myConfig.get_value("WINDOW_TIMELAPSE")
        self.window_delay = myConfig.get_value("WINDOW_DELAY")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        t0 = start_epoch(self.debug)

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Time window lapse -> {0}".format(self.window_difference), debug)
        log_info("Delay -> {0}".format(self.window_delay), debug)
        log_info(".............................................", debug)

        thread = None
        if SERVICE_IS_ACTIVATED:
            # Remote database operation
            # FIX: removed unused local 'host_info_cache' (assigned but never read)
            containers = get_structures(self.couchdb_handler, debug, subtype="container")
            if not containers:
                # As no container info is available, no application information will be able to be generated
                log_info("No structures to process", debug)
                time.sleep(self.window_difference)
                end_epoch(self.debug, self.window_difference, t0)
                continue
            else:
                thread = Thread(target=self.refeed_thread, args=())
                thread.start()
        else:
            log_warning("Refeeder is not activated", debug)

        time.sleep(self.window_difference)

        wait_operation_thread(thread, debug)
        log_info("Refeed processed", debug)

        end_epoch(self.debug, self.window_difference, t0)
def generate_application_metrics(self, application):
    """Aggregate the usage metrics of an application's containers into the application document.

    Merges the per-container usages obtained via get_container_usages, then
    stores each aggregated value under application["resources"][res]["usage"].
    Resources reported by containers but unknown to the application are only
    logged. Returns the (mutated) application document.
    """
    aggregated = dict()
    for container in application["containers"]:
        container_usages = self.get_container_usages(container)
        aggregated = self.merge(aggregated, container_usages)

    app_resources = application["resources"]
    for res, value in aggregated.items():
        if res in app_resources:
            app_resources[res]["usage"] = value
        else:
            log_warning("No resource {0} info for application {1}".format(
                res, application["name"]), debug=True)
    return application
def get_configs():
    """Turn persisted services' configuration parameters into timeseries documents.

    For every service listed in PERSIST_CONFIG_SERVICES_NAMES, each
    (database key, metric name) pair declared in PERSIST_CONFIG_SERVICES_DOCS
    becomes one timeseries dict (metric/value/timestamp/tags). Missing config
    keys are logged and skipped.
    """
    docs = list()
    services = db_handler.get_services()  # Remote database operation
    for service in services:
        if service["name"] not in PERSIST_CONFIG_SERVICES_NAMES:
            continue
        config = service["config"]
        for database_key_name, timeseries_metric_name in PERSIST_CONFIG_SERVICES_DOCS[service["name"]]:
            if database_key_name in config:
                docs.append({
                    "metric": timeseries_metric_name,
                    "value": config[database_key_name],
                    "timestamp": int(time.time()),
                    "tags": {"service": service["name"]},
                })
            else:
                log_warning(
                    "Missing config key '{0}' in service '{1}'".format(
                        database_key_name, service["name"]), debug)
    return docs
def get_container_usages(self, container_name):
    """Retrieve the usage timeseries of a single container from OpenTSDB.

    Warns about every generated metric that came back with the no-data
    default value; connection errors are logged and re-raised.

    Returns the metric -> value dict reported by the OpenTSDB handler.
    """
    try:
        container_info = self.opentsdb_handler.get_structure_timeseries(
            {"host": container_name}, self.window_difference, self.window_delay,
            BDWATCHDOG_METRICS, REFEEDER_APPLICATION_METRICS)

        for metric in REFEEDER_APPLICATION_METRICS:
            # Only the metrics this service generates are worth reporting on
            if (metric in CONFIG_DEFAULT_VALUES["GENERATED_METRICS"]
                    and container_info[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE):
                log_warning("No metric info for {0} in container {1}".format(
                    metric, container_name), debug=True)
    except requests.ConnectionError as e:
        log_error("Connection error: {0} {1}".format(
            str(e), str(traceback.format_exc())), debug=True)
        raise e
    return container_info
def __get_container_usages(self, container):
    """Fetch the usage metrics of one container from OpenTSDB.

    Returns the usages dict, or None when every metric is unavailable or the
    remote query fails (both situations are logged).
    """
    lapse = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "WINDOW_TIMELAPSE")
    delay = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "WINDOW_DELAY")
    try:
        # Remote database operation
        usages = self.__opentsdb_handler.get_structure_timeseries(
            {"host": container["name"]}, lapse, delay,
            BDWATCHDOG_CONTAINER_METRICS, GUARDIAN_CONTAINER_METRICS)

        # Skip this structure if all the usage metrics are unavailable
        if all(usages[m] == self.__NO_METRIC_DATA_DEFAULT_VALUE for m in usages):
            log_warning("container: {0} has no usage data".format(container["name"]), self.__debug)
            return None
        return usages
    except Exception as e:
        log_error("error with structure: {0} {1} {2}".format(
            container["name"], str(e), str(traceback.format_exc())), self.__debug)
        return None
def persist():
    """Main service loop of the Database Snapshoter.

    Every epoch: fetches the service document, sends a heartbeat, reloads the
    config (POLLING_FREQUENCY, DEBUG, DOCUMENTS_PERSISTED, ACTIVE), validates
    it, and if the service is active persists every configured document type.
    Runs forever, sleeping polling_frequency seconds between epochs.
    """
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)
    global debug  # 'debug' is a module-level flag also read by helper functions
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info
        service = get_service(db_handler, SERVICE_NAME)  # Remote database operation

        # Heartbeat
        beat(db_handler, SERVICE_NAME)  # Remote database operation

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        documents_persisted = myConfig.get_value("DOCUMENTS_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info(
            "Documents to be persisted are -> {0}".format(documents_persisted),
            debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            # NOTE(review): this path sleeps twice (here and below) before
            # retrying — confirm the double wait is intended
            time.sleep(polling_frequency)
            if polling_frequency < 4:
                # Too-low frequency would hammer the database; fall back to the default
                log_error(
                    "Polling frequency is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]
            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        if SERVICE_IS_ACTIVATED:
            # Persist each configured document type in turn
            for docType in documents_persisted:
                persist_docs(docType)
        else:
            log_warning(
                "Database snapshoter is not activated, will not do anything",
                debug)

        t1 = time.time()
        log_info("Epoch processed in {0} seconds ".format("%.2f" % (t1 - t0)),
                 debug)
        log_info("----------------------\n", debug)

        time.sleep(polling_frequency)
def guard(self):
    """Main service loop of the Guardian.

    Every epoch: fetches the service document, sends a heartbeat, reloads the
    config, validates it, and if the service is active launches
    guard_structures over the guarded structures in a background thread,
    waiting for it before closing the epoch. Runs forever.
    """
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)
    while True:
        # Get service info
        service = get_service(self.couchdb_handler, SERVICE_NAME)

        # Heartbeat
        beat(self.couchdb_handler, SERVICE_NAME)

        # CONFIG
        myConfig.set_config(service["config"])
        self.debug = myConfig.get_value("DEBUG")
        debug = self.debug
        self.guardable_resources = myConfig.get_value("GUARDABLE_RESOURCES")
        self.cpu_shares_per_watt = myConfig.get_value("CPU_SHARES_PER_WATT")
        self.window_difference = myConfig.get_value("WINDOW_TIMELAPSE")
        self.window_delay = myConfig.get_value("WINDOW_DELAY")
        self.structure_guarded = myConfig.get_value("STRUCTURE_GUARDED")
        self.event_timeout = myConfig.get_value("EVENT_TIMEOUT")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        t0 = start_epoch(self.debug)

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Time window lapse -> {0}".format(self.window_difference), debug)
        log_info("Delay -> {0}".format(self.window_delay), debug)
        log_info("Event timeout -> {0}".format(self.event_timeout), debug)
        log_info("Resources guarded are -> {0}".format(self.guardable_resources), debug)
        log_info("Structure type guarded is -> {0}".format(self.structure_guarded), debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        invalid, message = self.invalid_conf()
        if invalid:
            log_error(message, debug)
            if self.window_difference < 5:
                # Too-low time windows would cause too frequent polling; fall back to default
                log_error(
                    "Window difference is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["WINDOW_TIMELAPSE"]), self.debug)
                self.window_difference = CONFIG_DEFAULT_VALUES["WINDOW_TIMELAPSE"]
            time.sleep(self.window_difference)
            end_epoch(self.debug, self.window_difference, t0)
            continue

        thread = None
        if SERVICE_IS_ACTIVATED:
            # Remote database operation
            structures = get_structures(self.couchdb_handler, debug,
                                        subtype=self.structure_guarded)
            if structures:
                log_info(
                    "{0} Structures to process, launching threads".format(
                        len(structures)), debug)
                thread = Thread(name="guard_structures",
                                target=self.guard_structures,
                                args=(structures, ))
                thread.start()
            else:
                log_info("No structures to process", debug)
        else:
            log_warning("Guardian is not activated", debug)

        time.sleep(self.window_difference)

        wait_operation_thread(thread, debug)

        # FIX: was end_epoch(t0, self.window_difference, t0) — first argument
        # must be the debug flag, matching every other end_epoch call site
        end_epoch(self.debug, self.window_difference, t0)
def serverless(self, structure, rules): structure_subtype = structure["subtype"] # Check if structure is guarded if "guard" not in structure or not structure["guard"]: log_warning( "structure: {0} is set to leave alone, skipping".format( structure["name"]), self.debug) return # Check if the structure has any resource set to guarded struct_guarded_resources = list() for res in self.guardable_resources: if res in structure["resources"] and "guard" in structure[ "resources"][res] and structure["resources"][res]["guard"]: struct_guarded_resources.append(res) if not struct_guarded_resources: log_warning( "Structure {0} is set to guarded but has no resource marked to guard" .format(structure["name"]), self.debug) return # Check if structure is being monitored, otherwise, ignore if structure_subtype not in BDWATCHDOG_METRICS or structure_subtype not in GUARDIAN_METRICS or structure_subtype not in TAGS: log_error( "Unknown structure subtype '{0}'".format(structure_subtype), self.debug) return try: metrics_to_retrieve = list() metrics_to_generate = dict() for res in struct_guarded_resources: metrics_to_retrieve += BDWATCHDOG_METRICS[structure_subtype][ res] metrics_to_generate[generate_structure_usage_metric( res)] = GUARDIAN_METRICS[structure_subtype][ generate_structure_usage_metric(res)] tag = TAGS[structure_subtype] # Remote database operation usages = self.opentsdb_handler.get_structure_timeseries( {tag: structure["name"]}, self.window_difference, self.window_delay, metrics_to_retrieve, metrics_to_generate) for metric in usages: if usages[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE: log_warning( "structure: {0} has no usage data for {1}".format( structure["name"], metric), self.debug) # Skip this structure if all the usage metrics are unavailable if all([ usages[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE for metric in usages ]): log_warning( "structure: {0} has no usage data for any metric, skipping" .format(structure["name"]), self.debug) return resources = 
structure["resources"] # Remote database operation limits = self.couchdb_handler.get_limits(structure) limits_resources = limits["resources"] if not limits_resources: log_warning( "structure: {0} has no limits".format(structure["name"]), self.debug) return # Adjust the structure limits according to the current value limits["resources"] = self.adjust_container_state( resources, limits_resources, self.guardable_resources) # Remote database operation self.couchdb_handler.update_limit(limits) self.process_serverless_structure(structure, usages, limits_resources, rules) except Exception as e: log_error( "Error with structure {0}: {1}".format(structure["name"], str(e)), self.debug)
def match_rules_and_events(self, structure, rules, events, limits, usages):
    """Evaluate request-generating rules against a structure's events and produce rescale requests.

    For every rule that is active, generates requests, has pending events for
    its resource and whose jsonLogic condition holds, an amount is computed
    according to the rule's rescale type/policy, trimmed to the structure's
    limits when applicable, and a request is generated.

    Args:
        structure: Structure document with "name" and "resources".
        rules: Rule documents to evaluate.
        events: Mapping resource -> aggregated events document.
        limits: Mapping resource -> limit document ("upper", "boundary", ...).
        usages: Mapping of usage metric keys (via translator_dict) to values.

    Returns:
        Tuple (generated_requests, events_to_remove) where events_to_remove
        maps event name -> count of events consumed by the triggered rules.
    """
    generated_requests = list()
    events_to_remove = dict()
    for rule in rules:
        # Check that the rule has the required parameters
        rule_invalid = False
        for key in [
                "active",
                "resource",
                "generates",
                "name",
        ]:
            if key not in rule:
                log_warning(
                    "Rule: {0} is missing a key parameter '{1}', skipping it"
                    .format(rule["name"], key), self.debug)
                rule_invalid = True
        if rule_invalid:
            continue

        # Request-generating rules additionally need policy/type (and amount for scale-ups)
        if rule["generates"] == "requests":
            if "rescale_policy" not in rule or "rescale_type" not in rule:
                log_warning(
                    "Rule: {0} is missing the 'rescale_type' or the 'rescale_policy' parameter, skipping it"
                    .format(rule["name"]), self.debug)
                continue
            if rule["rescale_type"] == "up" and "amount" not in rule:
                log_warning(
                    "Rule: {0} is missing a the amount parameter, skipping it"
                    .format(rule["name"]), self.debug)
                continue

        resource_label = rule["resource"]

        # The rule fires only if active, request-generating, with pending
        # events for its resource, and its jsonLogic condition satisfied
        rule_activated = rule["active"] and \
                         rule["generates"] == "requests" and \
                         resource_label in events and \
                         jsonLogic(rule["rule"], events[resource_label])

        if not rule_activated:
            continue

        # RULE HAS BEEN ACTIVATED

        # If rescaling a container, check that the current resource value exists, otherwise there is nothing to rescale
        if structure_is_container(
                structure
        ) and "current" not in structure["resources"][resource_label]:
            log_warning(
                "No current value for container' {0}' and "
                "resource '{1}', can't rescale".format(
                    structure["name"], resource_label), self.debug)
            continue

        # Get the amount to be applied from the policy set
        if rule["rescale_type"] == "up":
            if rule["rescale_policy"] == "amount":
                # Fixed amount scale-up
                amount = rule["amount"]
            elif rule["rescale_policy"] == "proportional":
                # Scale the configured amount by how far usage exceeds the
                # upper limit, capped at the full amount (ratio <= 1)
                amount = rule["amount"]
                current_resource_limit = structure["resources"][
                    resource_label]["current"]
                upper_limit = limits[resource_label]["upper"]
                usage = usages[translator_dict[resource_label]]
                ratio = min((usage - upper_limit) /
                            (current_resource_limit - upper_limit), 1)
                amount = int(ratio * amount)
                log_warning(
                    "PROP -> cur : {0} | upp : {1} | usa: {2} | ratio {3} | amount {4}"
                    .format(current_resource_limit, upper_limit, usage,
                            ratio, amount), self.debug)
            else:
                log_warning(
                    "Invalid rescale policy '{0} for Rule {1}, skipping it"
                    .format(rule["rescale_policy"], rule["name"]),
                    self.debug)
                continue
        elif rule["rescale_type"] == "down":
            if rule["rescale_policy"] == "amount":
                # Fixed amount scale-down
                amount = rule["amount"]
            elif rule["rescale_policy"] == "fit_to_usage":
                # Shrink the limit down towards the observed usage
                current_resource_limit = structure["resources"][
                    resource_label]["current"]
                boundary = limits[resource_label]["boundary"]
                usage = usages[translator_dict[resource_label]]
                amount = self.get_amount_from_fit_reduction(
                    current_resource_limit, boundary, usage)
            elif rule["rescale_policy"] == "proportional" and rule[
                    "resource"] == "energy":
                # Energy-specific proportional scale-down
                amount = self.get_amount_from_proportional_energy_rescaling(
                    structure, resource_label)
            else:
                log_warning(
                    "Invalid rescale policy '{0} for Rule {1}, skipping it"
                    .format(rule["rescale_policy"], rule["name"]),
                    self.debug)
                continue
        else:
            log_warning(
                "Invalid rescale type '{0} for Rule {1}, skipping it".
                format(rule["rescale_type"], rule["name"]), self.debug)
            continue

        # Ensure that amount is an integer, either by converting float -> int, or string -> int
        amount = int(amount)

        # If it is 0, because there was a previous floating value between -1 and 1, set it to 0 so that it does not generate any Request
        # NOTE(review): despite the comment above, a zero amount is only
        # logged here and the request IS still generated below — confirm
        # whether a 'continue' was intended
        if amount == 0:
            log_warning(
                "Amount generated for structure {0} with rule {1} is 0".
                format(structure["name"], rule["name"]), self.debug)

        # If the resource is susceptible to check, ensure that it does not surpass any limit
        new_amount = amount
        if resource_label not in NON_ADJUSTABLE_RESOURCES:
            structure_resources = structure["resources"][resource_label]
            structure_limits = limits[resource_label]
            new_amount = self.adjust_amount(amount, structure_resources,
                                            structure_limits)
            if new_amount != amount:
                log_warning(
                    "Amount generated for structure {0} with rule {1} has been trimmed from {2} to {3}"
                    .format(structure["name"], rule["name"], amount,
                            new_amount), self.debug)

        # Generate the request and append it
        request = self.generate_request(structure, new_amount,
                                        resource_label)
        generated_requests.append(request)

        # Remove the events that triggered the request
        event_name = generate_event_name(events[resource_label]["events"],
                                         resource_label)
        if event_name not in events_to_remove:
            events_to_remove[event_name] = 0
        events_to_remove[event_name] += rule["events_to_remove"]

    return generated_requests, events_to_remove
def persist():
    """Main service loop of the Structure Snapshoter.

    Every epoch: fetches the service document, sends a heartbeat, reloads the
    config (POLLING_FREQUENCY, DEBUG, RESOURCES_PERSISTED, ACTIVE), validates
    it, and if the service is active runs persist_thread in a background
    thread, waiting for it before closing the epoch. Runs forever.
    """
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)
    # Module-level state shared with the worker thread / helper functions
    global resources_persisted
    global debug
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info
        service = get_service(db_handler, SERVICE_NAME)  # Remote database operation

        # Heartbeat
        beat(db_handler, SERVICE_NAME)  # Remote database operation

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        resources_persisted = myConfig.get_value("RESOURCES_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")
        log_info(
            "Going to snapshot resources: {0}".format(resources_persisted),
            debug)

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info(
            "Resources to be snapshoter are -> {0}".format(
                resources_persisted), debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            # NOTE(review): this path sleeps twice (here and below) before
            # retrying — confirm the double wait is intended
            time.sleep(polling_frequency)
            if polling_frequency < 3:
                # Too-low frequency would hammer the hosts; fall back to the default
                log_error(
                    "Polling frequency is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]
            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        thread = None
        if SERVICE_IS_ACTIVATED:
            thread = Thread(target=persist_thread, args=())
            thread.start()
        else:
            log_warning(
                "Structure snapshoter is not activated, will not do anything",
                debug)

        time.sleep(polling_frequency)

        wait_operation_thread(thread, debug)

        t1 = time.time()
        # Processing time excludes the fixed sleep spent this epoch
        time_proc = "%.2f" % (t1 - t0 - polling_frequency)
        time_total = "%.2f" % (t1 - t0)
        log_info(
            "Epoch processed in {0} seconds ({1} processing and {2} sleeping)".
            format(time_total, time_proc, str(polling_frequency)), debug)
        log_info("----------------------\n", debug)