def rebalance_containers(self, config):
    self.__config = config
    self.__debug = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "DEBUG")

    log_info("_______________", self.__debug)
    log_info("Performing CONTAINER CPU Balancing", self.__debug)

    # Get the containers and applications
    try:
        applications = get_structures(self.__couchdb_handler, self.__debug, subtype="application")
        containers = get_structures(self.__couchdb_handler, self.__debug, subtype="container")
    except requests.exceptions.HTTPError as e:
        log_error("Couldn't get applications", self.__debug)
        log_error(str(e), self.__debug)
        return

    # Filter out the applications that do not accept rebalancing or that do not need any internal rebalancing
    rebalanceable_apps = list()
    for app in applications:
        # TODO Improve this management
        if "rebalance" not in app or app["rebalance"] == True:
            pass
        else:
            continue
        if len(app["containers"]) <= 1:
            continue
        if self.__app_containers_can_be_rebalanced(app):
            rebalanceable_apps.append(app)

    # Group the containers by the application they belong to
    app_containers = dict()
    for app in rebalanceable_apps:
        app_name = app["name"]
        app_containers[app_name] = list()
        app_containers_names = app["containers"]
        for container in containers:
            if container["name"] in app_containers_names:
                app_containers[app_name].append(container)
        # Get the container usages
        app_containers[app_name] = self.__fill_containers_with_usage_info(app_containers[app_name])

    # Rebalance applications
    for app in rebalanceable_apps:
        app_name = app["name"]
        log_info("Going to rebalance {0} now".format(app_name), self.__debug)
        self.__rebalance_containers_by_pair_swapping(app_containers[app_name], app_name)

    log_info("_______________", self.__debug)
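# A minimal sketch of the per-application grouping performed above, using made-up
# document shapes (the real CouchDB documents carry more fields): "rebalance" is an
# optional flag and "containers" lists the names of the application's containers.
def _example_group_containers_by_app():
    app = {"name": "app1", "rebalance": True, "containers": ["node0", "node1"]}
    containers = [{"name": "node0"}, {"name": "node1"}, {"name": "node2"}]
    app_containers = {app["name"]: [c for c in containers if c["name"] in app["containers"]]}
    return app_containers  # {'app1': [{'name': 'node0'}, {'name': 'node1'}]}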
def refeed_user_used_energy(self, applications, users, db_handler, debug):
    for user in users:
        if "cpu" not in user:
            user["cpu"] = {}
        if "energy" not in user:
            user["energy"] = {}
        total_user = {"cpu": 0, "energy": 0}
        total_user_current_cpu = 0
        user_apps = get_user_apps(applications, user)
        for app in user_apps:
            for resource in ["energy", "cpu"]:
                if "usage" in app["resources"][resource] and app["resources"][resource]["usage"]:
                    total_user[resource] += app["resources"][resource]["usage"]
                else:
                    log_error("Application {0} of user {1} has no used {2} field or value".format(
                        app["name"], user["name"], resource), debug)

            if "current" in app["resources"]["cpu"] and app["resources"]["cpu"]["current"]:
                total_user_current_cpu += app["resources"]["cpu"]["current"]
            else:
                log_error("Application {0} of user {1} has no current cpu field or value".format(
                    app["name"], user["name"]), debug)

        user["energy"]["used"] = total_user["energy"]
        user["cpu"]["usage"] = total_user["cpu"]
        user["cpu"]["current"] = total_user_current_cpu

        db_handler.update_user(user)
        log_info("Updated energy consumed by user {0}".format(user["name"]), debug)
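# Illustrative only: made-up application documents showing the per-user totals that
# refeed_user_used_energy accumulates (the "usage" and "current" keys as assumed above).
def _example_user_totals():
    user_apps = [
        {"name": "app1", "resources": {"cpu": {"usage": 150, "current": 200}, "energy": {"usage": 40}}},
        {"name": "app2", "resources": {"cpu": {"usage": 50, "current": 100}, "energy": {"usage": 10}}},
    ]
    cpu_usage = sum(a["resources"]["cpu"]["usage"] for a in user_apps)       # 200
    energy_used = sum(a["resources"]["energy"]["usage"] for a in user_apps)  # 50
    cpu_current = sum(a["resources"]["cpu"]["current"] for a in user_apps)   # 300
    return cpu_usage, energy_used, cpu_current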
def print_structure_info(self, container, usages, limits, triggered_events, triggered_requests):
    resources = container["resources"]

    container_name_str = "@" + container["name"]
    resources_str = "| "
    for resource in self.guardable_resources:
        if container["resources"][resource]["guard"]:
            resources_str += resource + "({0})".format(
                self.get_resource_summary(resource, resources, limits, usages)) + " | "

    ev, req = list(), list()
    for event in triggered_events:
        ev.append(event["name"])
    for request in triggered_requests:
        req.append(request["action"])
    triggered_requests_and_events = "#TRIGGERED EVENTS {0} AND TRIGGERED REQUESTS {1}".format(str(ev), str(req))

    log_info(" ".join([container_name_str, resources_str, triggered_requests_and_events]), self.debug)
def persist_thread():
    t0 = time.time()
    container_resources_dict = get_container_resources_dict()
    t1 = time.time()
    persist_applications(container_resources_dict)
    t2 = time.time()
    persist_containers(container_resources_dict)
    t3 = time.time()

    log_info("It took {0} seconds to get container info".format(str("%.2f" % (t1 - t0))), debug)
    log_info("It took {0} seconds to snapshot applications".format(str("%.2f" % (t2 - t1))), debug)
    log_info("It took {0} seconds to snapshot containers".format(str("%.2f" % (t3 - t2))), debug)
def persist_docs(funct):
    t0 = time.time()
    docs = get_data(funct)
    t1 = time.time()
    if docs:
        log_info("It took {0} seconds to get {1} info".format(str("%.2f" % (t1 - t0)), funct), debug)
        num_docs = send_data(docs)
        t2 = time.time()
        if num_docs > 0:
            log_info("It took {0} seconds to send {1} info".format(str("%.2f" % (t2 - t1)), funct), debug)
            log_info("Post was done with {0} documents of '{1}'".format(str(num_docs), funct), debug)
def persist():
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    global debug
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)

    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info
        service = get_service(db_handler, SERVICE_NAME)  # Remote database operation

        # Heartbeat
        beat(db_handler, SERVICE_NAME)  # Remote database operation

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        documents_persisted = myConfig.get_value("DOCUMENTS_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info("Documents to be persisted are -> {0}".format(documents_persisted), debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            time.sleep(polling_frequency)
            if polling_frequency < 4:
                log_error("Polling frequency is too short, replacing with DEFAULT value '{0}'".format(
                    CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]
            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        if SERVICE_IS_ACTIVATED:
            for docType in documents_persisted:
                persist_docs(docType)
        else:
            log_warning("Database snapshoter is not activated, will not do anything", debug)

        t1 = time.time()
        log_info("Epoch processed in {0} seconds ".format("%.2f" % (t1 - t0)), debug)
        log_info("----------------------\n", debug)

        time.sleep(polling_frequency)
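# The TODO above notes that this config-validation block is duplicated across the
# snapshoters; a hedged sketch of one way to factor it out (the helper name and the
# 'minimum' parameter are illustrative, not part of the codebase).
def check_polling_frequency(polling_frequency, minimum, debug):
    """Return a safe polling frequency, falling back to the default when it is too short."""
    if polling_frequency < minimum:
        log_error("Polling frequency is too short, replacing with DEFAULT value '{0}'".format(
            CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
        return CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]
    return polling_frequency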
def guard(self):
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    while True:
        # Get service info
        service = get_service(self.couchdb_handler, SERVICE_NAME)

        # Heartbeat
        beat(self.couchdb_handler, SERVICE_NAME)

        # CONFIG
        myConfig.set_config(service["config"])
        self.debug = myConfig.get_value("DEBUG")
        debug = self.debug
        self.guardable_resources = myConfig.get_value("GUARDABLE_RESOURCES")
        self.cpu_shares_per_watt = myConfig.get_value("CPU_SHARES_PER_WATT")
        self.window_difference = myConfig.get_value("WINDOW_TIMELAPSE")
        self.window_delay = myConfig.get_value("WINDOW_DELAY")
        self.structure_guarded = myConfig.get_value("STRUCTURE_GUARDED")
        self.event_timeout = myConfig.get_value("EVENT_TIMEOUT")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        t0 = start_epoch(self.debug)

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Time window lapse -> {0}".format(self.window_difference), debug)
        log_info("Delay -> {0}".format(self.window_delay), debug)
        log_info("Event timeout -> {0}".format(self.event_timeout), debug)
        log_info("Resources guarded are -> {0}".format(self.guardable_resources), debug)
        log_info("Structure type guarded is -> {0}".format(self.structure_guarded), debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        invalid, message = self.invalid_conf()
        if invalid:
            log_error(message, debug)
            if self.window_difference < 5:
                log_error("Window difference is too short, replacing with DEFAULT value '{0}'".format(
                    CONFIG_DEFAULT_VALUES["WINDOW_TIMELAPSE"]), self.debug)
                self.window_difference = CONFIG_DEFAULT_VALUES["WINDOW_TIMELAPSE"]
            time.sleep(self.window_difference)
            end_epoch(self.debug, self.window_difference, t0)
            continue

        thread = None
        if SERVICE_IS_ACTIVATED:
            # Remote database operation
            structures = get_structures(self.couchdb_handler, debug, subtype=self.structure_guarded)
            if structures:
                log_info("{0} Structures to process, launching threads".format(len(structures)), debug)
                thread = Thread(name="guard_structures", target=self.guard_structures, args=(structures,))
                thread.start()
            else:
                log_info("No structures to process", debug)
        else:
            log_warning("Guardian is not activated", debug)

        time.sleep(self.window_difference)

        wait_operation_thread(thread, debug)

        end_epoch(self.debug, self.window_difference, t0)
def __rebalance_containers_by_pair_swapping(self, containers, app_name):
    # Filter the containers between donors and receivers, according to usage and rules
    donors = self.__get_container_donors(containers)
    receivers = self.__get_container_receivers(containers)

    log_info("Nodes that will give: {0}".format(str([c["name"] for c in donors])), self.__debug)
    log_info("Nodes that will receive: {0}".format(str([c["name"] for c in receivers])), self.__debug)

    if not receivers:
        log_info("No containers in need of rebalancing for {0}".format(app_name), self.__debug)
        return
    else:
        # Order the containers from lower to upper current CPU limit
        receivers = sorted(receivers, key=lambda c: c["resources"]["cpu"]["current"])

    # Steal resources from the low-usage containers (donors), creating 'slices' of resources
    donor_slices = list()
    id = 0
    for container in donors:
        # Ensure that this request will be successfully processed, otherwise we are 'giving' away extra resources
        current_value = container["resources"]["cpu"]["current"]
        min_value = container["resources"]["cpu"]["min"]
        usage_value = container["resources"]["cpu"]["usage"]
        stolen_amount = 0.5 * (current_value - max(min_value, usage_value))

        slice_amount = 25
        acum = 0
        while acum + slice_amount < stolen_amount:
            donor_slices.append((container, slice_amount, id))
            acum += slice_amount
            id += 1
        # Remaining
        if acum < stolen_amount:
            donor_slices.append((container, int(stolen_amount - acum), id))
            acum += slice_amount
            id += 1

    donor_slices = sorted(donor_slices, key=lambda c: c[1])

    print("Donor slices are")
    for c in donor_slices:
        print(c[0]["name"], c[1])

    # Remove those donors that are of no use (there are no possible receivers for them)
    viable_donors = list()
    for c in donor_slices:
        viable = False
        for r in receivers:
            if r["host"] == c[0]["host"]:
                viable = True
                break
        if viable:
            viable_donors.append(c)

    print("VIABLE donor slices are")
    for c in viable_donors:
        print(c[0]["name"], c[1], c[2])

    donor_slices = viable_donors

    # Give the resources to the bottlenecked containers
    requests = dict()
    while donor_slices:
        print("Donor slices are")
        for c in donor_slices:
            print(c[0]["name"], c[1], c[2])

        for receiver in receivers:
            # Look for a donor container on the same host
            amount_to_scale, donor, id = None, None, None
            for c, amount, i in donor_slices:
                if c["host"] == receiver["host"]:
                    amount_to_scale = amount
                    donor = c
                    id = i
                    break

            if not amount_to_scale:
                log_info("No more donors on its host, container {0} left out".format(receiver["name"]), self.__debug)
                continue

            # Remove this slice from the list
            donor_slices = list(filter(lambda x: x[2] != id, donor_slices))

            max_receiver_amount = receiver["resources"]["cpu"]["max"] - receiver["resources"]["cpu"]["current"]
            # If this container can't be scaled anymore, skip
            if max_receiver_amount == 0:
                continue

            # Trim the amount to scale if needed
            if amount_to_scale > max_receiver_amount:
                amount_to_scale = max_receiver_amount

            # Create the pair of scaling requests
            # TODO This should use Guardian's method to generate requests
            request = dict(
                type="request",
                resource="cpu",
                amount=int(amount_to_scale),
                structure=receiver["name"],
                action="CpuRescaleUp",
                timestamp=int(time.time()),
                structure_type="container",
                host=receiver["host"],
                host_rescaler_ip=receiver["host_rescaler_ip"],
                host_rescaler_port=receiver["host_rescaler_port"]
            )
            if receiver["name"] not in requests:
                requests[receiver["name"]] = list()
            requests[receiver["name"]].append(request)

            # TODO This should use Guardian's method to generate requests
            request = dict(
                type="request",
                resource="cpu",
                amount=int(-amount_to_scale),
                structure=donor["name"],
                action="CpuRescaleDown",
                timestamp=int(time.time()),
                structure_type="container",
                host=donor["host"],
                host_rescaler_ip=donor["host_rescaler_ip"],
                host_rescaler_port=donor["host_rescaler_port"]
            )
            if donor["name"] not in requests:
                requests[donor["name"]] = list()
            requests[donor["name"]].append(request)

            log_info("Resource swap between {0} (donor) and {1} (receiver)".format(
                donor["name"], receiver["name"]), self.__debug)

    log_info("No more donors", self.__debug)

    # Flatten the per-container lists of requests into a single request per container
    final_requests = list()
    for container in requests:
        # Copy the first request as the base request
        flat_request = dict(requests[container][0])
        flat_request["amount"] = 0
        for request in requests[container]:
            flat_request["amount"] += request["amount"]
        final_requests.append(flat_request)

    log_info("REQUESTS ARE:", self.__debug)
    for c in requests.values():
        for r in c:
            print(r)

    # TODO
    # Adjust requests amounts according to the maximums (trim), otherwise the scaling down
    # will be performed but not the scaling up, and shares will be lost

    log_info("FINAL REQUESTS ARE:", self.__debug)
    for r in final_requests:
        print(r)
        self.__couchdb_handler.add_request(r)
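# Standalone sketch of the donor-slicing step above: each donor gives away half of the
# gap between its current CPU allocation and the larger of its minimum and its usage,
# carved into fixed 25-share slices plus a remainder. Example values are made up.
def _example_make_donor_slices(donors, slice_amount=25):
    slices, slice_id = [], 0
    for container in donors:
        cpu = container["resources"]["cpu"]
        stolen = 0.5 * (cpu["current"] - max(cpu["min"], cpu["usage"]))
        acum = 0
        while acum + slice_amount < stolen:
            slices.append((container["name"], slice_amount, slice_id))
            acum += slice_amount
            slice_id += 1
        if acum < stolen:  # remainder
            slices.append((container["name"], int(stolen - acum), slice_id))
            slice_id += 1
    return sorted(slices, key=lambda s: s[1])

# A donor at 300 shares with a minimum of 50 and a usage of 100 has 100 spare shares,
# yielding three 25-share slices plus a 25-share remainder:
# _example_make_donor_slices([{"name": "node0", "resources": {"cpu": {"current": 300, "min": 50, "usage": 100}}}])
# -> [('node0', 25, 0), ('node0', 25, 1), ('node0', 25, 2), ('node0', 25, 3)]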
def refeed(self):
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    while True:
        # Get service info
        service = get_service(self.couchdb_handler, SERVICE_NAME)

        # Heartbeat
        beat(self.couchdb_handler, SERVICE_NAME)

        # CONFIG
        myConfig.set_config(service["config"])
        self.debug = myConfig.get_value("DEBUG")
        debug = self.debug
        self.window_difference = myConfig.get_value("WINDOW_TIMELAPSE")
        self.window_delay = myConfig.get_value("WINDOW_DELAY")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        t0 = start_epoch(self.debug)

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Time window lapse -> {0}".format(self.window_difference), debug)
        log_info("Delay -> {0}".format(self.window_delay), debug)
        log_info(".............................................", debug)

        thread = None
        if SERVICE_IS_ACTIVATED:
            # Remote database operation
            host_info_cache = dict()
            containers = get_structures(self.couchdb_handler, debug, subtype="container")
            if not containers:
                # As no container info is available, no application information can be generated
                log_info("No structures to process", debug)
                time.sleep(self.window_difference)
                end_epoch(self.debug, self.window_difference, t0)
                continue
            else:
                thread = Thread(target=self.refeed_thread, args=())
                thread.start()
        else:
            log_warning("Refeeder is not activated", debug)

        time.sleep(self.window_difference)

        wait_operation_thread(thread, debug)
        log_info("Refeed processed", debug)

        end_epoch(self.debug, self.window_difference, t0)
def persist():
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    global resources_persisted
    global debug

    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)

    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info
        service = get_service(db_handler, SERVICE_NAME)  # Remote database operation

        # Heartbeat
        beat(db_handler, SERVICE_NAME)  # Remote database operation

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        resources_persisted = myConfig.get_value("RESOURCES_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        log_info("Going to snapshot resources: {0}".format(resources_persisted), debug)
        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info("Resources to be snapshoted are -> {0}".format(resources_persisted), debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            time.sleep(polling_frequency)
            if polling_frequency < 3:
                log_error("Polling frequency is too short, replacing with DEFAULT value '{0}'".format(
                    CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]
            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        thread = None
        if SERVICE_IS_ACTIVATED:
            thread = Thread(target=persist_thread, args=())
            thread.start()
        else:
            log_warning("Structure snapshoter is not activated, will not do anything", debug)

        time.sleep(polling_frequency)

        wait_operation_thread(thread, debug)

        t1 = time.time()
        time_proc = "%.2f" % (t1 - t0 - polling_frequency)
        time_total = "%.2f" % (t1 - t0)
        log_info("Epoch processed in {0} seconds ({1} processing and {2} sleeping)".format(
            time_total, time_proc, str(polling_frequency)), debug)
        log_info("----------------------\n", debug)