def get_container_resources_dict():
    """Build a map of container name -> container document, with the live
    resource information of each container attached under "resources".

    Returns None when no containers are registered in the database.
    """
    # Remote database operation
    containers = get_structures(db_handler, debug, subtype="container")
    if not containers:
        return

    # Collect the distinct hosts the containers live on, keeping the Node
    # Scaler endpoint (ip/port) of each host
    hosts_info = dict()
    for cont in containers:
        host_name = cont["host"]
        if host_name in hosts_info:
            continue
        hosts_info[host_name] = {
            "host_rescaler_ip": cont["host_rescaler_ip"],
            "host_rescaler_port": cont["host_rescaler_port"],
        }

    # For each host, retrieve its containers and persist the ones we look for
    container_info = fill_container_dict(hosts_info, containers)

    container_resources_dict = dict()
    for cont in containers:
        name = cont["name"]
        if name not in container_info:
            log_warning(
                "Container info for {0} not found, check that it is really living in its supposed host '{1}', and that "
                "the host is alive and with the Node Scaler service running".
                format(name, cont["host"]), debug)
            continue
        # Attach the retrieved resource info to the container document itself
        cont["resources"] = container_info[name]
        container_resources_dict[name] = cont

    return container_resources_dict
# Example #2 (score: 0)
    def match_usages_and_limits(self, structure_name, rules, usages, limits,
                                resources):
        """Combine usage, limit and structure resource values into the layout
        expected by the rules engine, evaluate every rule and return the
        generated events.

        :param structure_name: name of the structure being evaluated
        :param rules: iterable of rule documents
        :param usages: dict of "type.resource.measure" keys -> values
        :param limits: dict of resource name -> limit values
        :param resources: dict of resource name -> structure resource values
        :return: list of generated event documents
        """
        # Resources that have at least one rule defined for them
        resources_with_rules = list()
        for rule in rules:
            if rule["resource"] not in resources_with_rules:
                resources_with_rules.append(rule["resource"])

        # Keep only the guardable resources that actually have rules
        useful_resources = list()
        for resource in self.guardable_resources:
            if resource not in resources_with_rules:
                log_warning(
                    "Resource {0} has no rules applied to it".format(resource),
                    self.debug)
            else:
                useful_resources.append(resource)

        # Seed the per-resource data with the limit and structure values;
        # note only resources present in 'resources' get an entry
        data = dict()
        for resource in useful_resources:
            if resource in resources:
                data[resource] = {
                    "limits": {
                        resource: limits[resource]
                    },
                    "structure": {
                        resource: resources[resource]
                    }
                }

        for usage_metric in usages:
            keys = usage_metric.split(".")
            struct_type, usage_resource = keys[0], keys[1]
            # Split the key from the retrieved data, e.g., structure.mem.usages, where mem is the resource
            # BUGFIX: guard on 'data' instead of 'useful_resources' — a useful
            # resource that is missing from 'resources' has no entry in 'data'
            # and would previously raise a KeyError here
            if usage_resource in data:
                data[usage_resource][struct_type][usage_resource][
                    keys[2]] = usages[usage_metric]

        events = []
        for rule in rules:
            try:
                # Check that the rule is active, the resource to watch is guarded and that the rule is activated
                if self.rule_triggers_event(rule, data, resources):
                    event_name = generate_event_name(rule["action"]["events"],
                                                     rule["resource"])
                    event = self.generate_event(event_name, structure_name,
                                                rule["resource"],
                                                rule["action"])
                    events.append(event)

            except KeyError as e:
                log_warning(
                    "rule: {0} is missing a parameter {1} {2}".format(
                        rule["name"], str(e), str(traceback.format_exc())),
                    self.debug)

        return events
def get_data(funct):
    """Look up *funct* in the module-level funct_map, call it, and return the
    documents it produced; on a known retrieval failure, log a warning and
    return whatever was collected (an empty list)."""
    docs = list()
    try:
        docs.extend(funct_map[funct]())
    except (requests.exceptions.HTTPError, KeyError, ValueError) as e:
        # An error might have been thrown because database was recently updated or created
        log_warning(
            "Couldn't retrieve {0} info, error {1}.".format(funct, str(e)),
            debug)
    return docs
# Example #4 (score: 0)
    def refeed(self, ):
        """Main loop of the Refeeder service.

        Forever: refresh the service config from the database, send a
        heartbeat, and — when the service is active and containers exist —
        launch a thread that refeeds application information, then wait for
        the time window to elapse.
        """
        myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
        logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

        while True:
            # Get service info
            service = get_service(self.couchdb_handler, SERVICE_NAME)

            # Heartbeat
            beat(self.couchdb_handler, SERVICE_NAME)

            # CONFIG
            myConfig.set_config(service["config"])
            self.debug = myConfig.get_value("DEBUG")
            debug = self.debug
            self.window_difference = myConfig.get_value("WINDOW_TIMELAPSE")
            self.window_delay = myConfig.get_value("WINDOW_DELAY")
            SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

            t0 = start_epoch(self.debug)

            log_info("Config is as follows:", debug)
            log_info(".............................................", debug)
            log_info("Time window lapse -> {0}".format(self.window_difference),
                     debug)
            log_info("Delay -> {0}".format(self.window_delay), debug)
            log_info(".............................................", debug)

            thread = None
            if SERVICE_IS_ACTIVATED:
                # Remote database operation
                # FIX: removed unused local 'host_info_cache', it was assigned
                # and never read in this loop
                containers = get_structures(self.couchdb_handler,
                                            debug,
                                            subtype="container")
                if not containers:
                    # As no container info is available, no application information will be able to be generated
                    log_info("No structures to process", debug)
                    time.sleep(self.window_difference)
                    end_epoch(self.debug, self.window_difference, t0)
                    continue
                else:
                    thread = Thread(target=self.refeed_thread, args=())
                    thread.start()
            else:
                log_warning("Refeeder is not activated", debug)

            time.sleep(self.window_difference)

            wait_operation_thread(thread, debug)
            log_info("Refeed processed", debug)

            end_epoch(self.debug, self.window_difference, t0)
# Example #5 (score: 0)
    def generate_application_metrics(self, application):
        """Aggregate the usage metrics of all of the application's containers
        and store each aggregated value under the matching application
        resource; unknown resources are logged and skipped.

        Returns the (mutated) application document.
        """
        aggregated = dict()
        for container_name in application["containers"]:
            usages = self.get_container_usages(container_name)
            aggregated = self.merge(aggregated, usages)

        for resource, usage_value in aggregated.items():
            if resource in application["resources"]:
                application["resources"][resource]["usage"] = usage_value
            else:
                log_warning("No resource {0} info for application {1}".format(
                    resource, application["name"]),
                            debug=True)

        return application
def get_configs():
    """Snapshot the configuration of the monitored services, producing one
    timeseries document per configured (database key, metric name) pair."""
    docs = list()
    services = db_handler.get_services()  # Remote database operation
    for service in services:
        service_name = service["name"]
        # Only services listed for config persistence are snapshotted
        if service_name not in PERSIST_CONFIG_SERVICES_NAMES:
            continue
        for database_key_name, timeseries_metric_name in PERSIST_CONFIG_SERVICES_DOCS[service_name]:
            if database_key_name not in service["config"]:
                log_warning(
                    "Missing config key '{0}' in service '{1}'".format(
                        database_key_name, service_name), debug)
                continue
            docs.append(dict(metric=timeseries_metric_name,
                             value=service["config"][database_key_name],
                             timestamp=int(time.time()),
                             tags={"service": service_name}))
    return docs
# Example #7 (score: 0)
    def get_container_usages(self, container_name):
        """Retrieve the usage timeseries of one container from OpenTSDB,
        warning about every generated metric that came back with no data.

        Re-raises requests.ConnectionError after logging it.
        """
        try:
            # Remote database operation
            container_info = self.opentsdb_handler.get_structure_timeseries(
                {"host": container_name}, self.window_difference,
                self.window_delay, BDWATCHDOG_METRICS,
                REFEEDER_APPLICATION_METRICS)

            generated_metrics = CONFIG_DEFAULT_VALUES["GENERATED_METRICS"]
            for metric in REFEEDER_APPLICATION_METRICS:
                # Only warn for metrics the service is configured to generate
                if metric not in generated_metrics:
                    continue
                if container_info[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE:
                    log_warning(
                        "No metric info for {0} in container {1}".format(
                            metric, container_name),
                        debug=True)

        except requests.ConnectionError as e:
            log_error("Connection error: {0} {1}".format(
                str(e), str(traceback.format_exc())),
                      debug=True)
            raise e
        return container_info
    def __get_container_usages(self, container):
        """Fetch the usage timeseries of a single container.

        Returns the usages dict, or None when every metric is unavailable or
        the remote query fails (failures are logged, never raised).
        """
        window_difference = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "WINDOW_TIMELAPSE")
        window_delay = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "WINDOW_DELAY")

        try:
            # Remote database operation
            usages = self.__opentsdb_handler.get_structure_timeseries(
                {"host": container["name"]}, window_difference, window_delay,
                BDWATCHDOG_CONTAINER_METRICS, GUARDIAN_CONTAINER_METRICS)

            # Skip this structure if all the usage metrics are unavailable
            no_data = self.__NO_METRIC_DATA_DEFAULT_VALUE
            if all(usages[metric] == no_data for metric in usages):
                log_warning("container: {0} has no usage data".format(container["name"]), self.__debug)
                return None

            return usages
        except Exception as e:
            # Best-effort: a single failing container must not stop the caller
            log_error("error with structure: {0} {1} {2}".format(container["name"], str(e), str(traceback.format_exc())),
                      self.__debug)
            return None
def persist():
    """Main loop of the database snapshoter service.

    Forever: refresh the service config from the database, send a heartbeat,
    validate the config and, when the service is active, persist every
    configured document type, then sleep for the polling period.
    """
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    global debug

    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)

    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info
        service = get_service(db_handler,
                              SERVICE_NAME)  # Remote database operation

        # Heartbeat
        beat(db_handler, SERVICE_NAME)  # Remote database operation

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        documents_persisted = myConfig.get_value("DOCUMENTS_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info(
            "Documents to be persisted are -> {0}".format(documents_persisted),
            debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            # NOTE(review): the loop sleeps here AND again before 'continue',
            # so an invalid config stalls for two polling periods — confirm
            # this double sleep is intended
            time.sleep(polling_frequency)
            if polling_frequency < 4:
                log_error(
                    "Polling frequency is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]

            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        if SERVICE_IS_ACTIVATED:
            # Persist each configured document type in turn
            for docType in documents_persisted:
                persist_docs(docType)
        else:
            log_warning(
                "Database snapshoter is not activated, will not do anything",
                debug)

        t1 = time.time()
        log_info("Epoch processed in {0} seconds ".format("%.2f" % (t1 - t0)),
                 debug)
        log_info("----------------------\n", debug)

        time.sleep(polling_frequency)
# Example #10 (score: 0)
    def guard(self, ):
        """Main loop of the Guardian service.

        Forever: refresh the service config from the database, send a
        heartbeat, validate the config and, when active, launch a thread that
        guards the configured structure type, then wait for the time window
        to elapse before ending the epoch.
        """
        myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
        logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

        while True:
            # Get service info
            service = get_service(self.couchdb_handler, SERVICE_NAME)

            # Heartbeat
            beat(self.couchdb_handler, SERVICE_NAME)

            # CONFIG
            myConfig.set_config(service["config"])
            self.debug = myConfig.get_value("DEBUG")
            debug = self.debug
            self.guardable_resources = myConfig.get_value(
                "GUARDABLE_RESOURCES")
            self.cpu_shares_per_watt = myConfig.get_value(
                "CPU_SHARES_PER_WATT")
            self.window_difference = myConfig.get_value("WINDOW_TIMELAPSE")
            self.window_delay = myConfig.get_value("WINDOW_DELAY")
            self.structure_guarded = myConfig.get_value("STRUCTURE_GUARDED")
            self.event_timeout = myConfig.get_value("EVENT_TIMEOUT")
            SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

            t0 = start_epoch(self.debug)

            log_info("Config is as follows:", debug)
            log_info(".............................................", debug)
            log_info("Time window lapse -> {0}".format(self.window_difference),
                     debug)
            log_info("Delay -> {0}".format(self.window_delay), debug)
            log_info("Event timeout -> {0}".format(self.event_timeout), debug)
            log_info(
                "Resources guarded are -> {0}".format(
                    self.guardable_resources), debug)
            log_info(
                "Structure type guarded is -> {0}".format(
                    self.structure_guarded), debug)
            log_info(".............................................", debug)

            ## CHECK INVALID CONFIG ##
            invalid, message = self.invalid_conf()
            if invalid:
                log_error(message, debug)
                if self.window_difference < 5:
                    # Clamp a too-short window to the default to avoid busy-looping
                    log_error(
                        "Window difference is too short, replacing with DEFAULT value '{0}'"
                        .format(CONFIG_DEFAULT_VALUES["WINDOW_TIMELAPSE"]),
                        self.debug)
                    self.window_difference = CONFIG_DEFAULT_VALUES[
                        "WINDOW_TIMELAPSE"]
                time.sleep(self.window_difference)
                end_epoch(self.debug, self.window_difference, t0)
                continue

            thread = None
            if SERVICE_IS_ACTIVATED:
                # Remote database operation
                structures = get_structures(self.couchdb_handler,
                                            debug,
                                            subtype=self.structure_guarded)
                if structures:
                    log_info(
                        "{0} Structures to process, launching threads".format(
                            len(structures)), debug)
                    thread = Thread(name="guard_structures",
                                    target=self.guard_structures,
                                    args=(structures, ))
                    thread.start()
                else:
                    log_info("No structures to process", debug)
            else:
                log_warning("Guardian is not activated", debug)

            time.sleep(self.window_difference)

            wait_operation_thread(thread, debug)

            # BUGFIX: was end_epoch(t0, ...): the first argument must be the
            # debug flag, matching every other end_epoch call site
            end_epoch(self.debug, self.window_difference, t0)
# Example #11 (score: 0)
    def serverless(self, structure, rules):
        """Guard a single structure in serverless mode.

        Checks that the structure (and at least one of its resources) is
        marked as guarded and that its subtype is known, retrieves its usage
        timeseries and limits, adjusts the limits to the current state and
        finally hands the structure to process_serverless_structure().
        Any failure inside the retrieval/processing phase is logged and the
        structure is skipped.
        """
        structure_subtype = structure["subtype"]

        # Check if structure is guarded
        if "guard" not in structure or not structure["guard"]:
            log_warning(
                "structure: {0} is set to leave alone, skipping".format(
                    structure["name"]), self.debug)
            return

        # Check if the structure has any resource set to guarded
        struct_guarded_resources = list()
        for res in self.guardable_resources:
            if res in structure["resources"] and "guard" in structure[
                    "resources"][res] and structure["resources"][res]["guard"]:
                struct_guarded_resources.append(res)
        if not struct_guarded_resources:
            log_warning(
                "Structure {0} is set to guarded but has no resource marked to guard"
                .format(structure["name"]), self.debug)
            return

        # Check if structure is being monitored, otherwise, ignore
        if structure_subtype not in BDWATCHDOG_METRICS or structure_subtype not in GUARDIAN_METRICS or structure_subtype not in TAGS:
            log_error(
                "Unknown structure subtype '{0}'".format(structure_subtype),
                self.debug)
            return

        try:
            # Collect, per guarded resource, the raw metrics to retrieve and
            # the usage metrics to generate from them
            metrics_to_retrieve = list()
            metrics_to_generate = dict()
            for res in struct_guarded_resources:
                metrics_to_retrieve += BDWATCHDOG_METRICS[structure_subtype][
                    res]
                metrics_to_generate[generate_structure_usage_metric(
                    res)] = GUARDIAN_METRICS[structure_subtype][
                        generate_structure_usage_metric(res)]
            tag = TAGS[structure_subtype]

            # Remote database operation
            usages = self.opentsdb_handler.get_structure_timeseries(
                {tag: structure["name"]}, self.window_difference,
                self.window_delay, metrics_to_retrieve, metrics_to_generate)

            # Warn about every individual metric that came back empty
            for metric in usages:
                if usages[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE:
                    log_warning(
                        "structure: {0} has no usage data for {1}".format(
                            structure["name"], metric), self.debug)

            # Skip this structure if all the usage metrics are unavailable
            if all([
                    usages[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE
                    for metric in usages
            ]):
                log_warning(
                    "structure: {0} has no usage data for any metric, skipping"
                    .format(structure["name"]), self.debug)
                return

            resources = structure["resources"]

            # Remote database operation
            limits = self.couchdb_handler.get_limits(structure)
            limits_resources = limits["resources"]

            if not limits_resources:
                log_warning(
                    "structure: {0} has no limits".format(structure["name"]),
                    self.debug)
                return

            # Adjust the structure limits according to the current value
            limits["resources"] = self.adjust_container_state(
                resources, limits_resources, self.guardable_resources)

            # Remote database operation
            self.couchdb_handler.update_limit(limits)

            self.process_serverless_structure(structure, usages,
                                              limits_resources, rules)

        except Exception as e:
            # Broad catch on purpose: one faulty structure must not kill the
            # guard loop that iterates over all structures
            log_error(
                "Error with structure {0}: {1}".format(structure["name"],
                                                       str(e)), self.debug)
# Example #12 (score: 0)
    def match_rules_and_events(self, structure, rules, events, limits, usages):
        """Match the active request-generating rules against the aggregated
        events of a structure and produce the corresponding rescaling
        requests.

        Returns a tuple (generated_requests, events_to_remove) where
        events_to_remove maps event names to the number of events that were
        consumed by the activated rules.
        """
        generated_requests = list()
        events_to_remove = dict()

        for rule in rules:
            # Check that the rule has the required parameters
            # NOTE(review): the "rule" key (used by jsonLogic below) and the
            # "events_to_remove" key (used at the end) are not validated here,
            # and the warning itself reads rule["name"] even when "name" is
            # the missing key — a malformed rule could still raise KeyError;
            # confirm upstream guarantees these keys
            rule_invalid = False
            for key in [
                    "active",
                    "resource",
                    "generates",
                    "name",
            ]:
                if key not in rule:
                    log_warning(
                        "Rule: {0} is missing a key parameter '{1}', skipping it"
                        .format(rule["name"], key), self.debug)
                    rule_invalid = True
            if rule_invalid:
                continue

            if rule["generates"] == "requests":
                if "rescale_policy" not in rule or "rescale_type" not in rule:
                    log_warning(
                        "Rule: {0} is missing the 'rescale_type' or the 'rescale_policy' parameter, skipping it"
                        .format(rule["name"]), self.debug)
                    continue

                if rule["rescale_type"] == "up" and "amount" not in rule:
                    log_warning(
                        "Rule: {0} is missing a the amount parameter, skipping it"
                        .format(rule["name"]), self.debug)
                    continue

            resource_label = rule["resource"]

            # A rule fires only if it is active, generates requests, there are
            # events for its resource, and its jsonLogic predicate holds
            rule_activated = rule["active"] and \
                             rule["generates"] == "requests" and \
                             resource_label in events and \
                             jsonLogic(rule["rule"], events[resource_label])

            if not rule_activated:
                continue

            # RULE HAS BEEN ACTIVATED

            # If rescaling a container, check that the current resource value exists, otherwise there is nothing to rescale
            if structure_is_container(
                    structure
            ) and "current" not in structure["resources"][resource_label]:
                log_warning(
                    "No current value for container' {0}' and "
                    "resource '{1}', can't rescale".format(
                        structure["name"], resource_label), self.debug)
                continue

            # Get the amount to be applied from the policy set
            if rule["rescale_type"] == "up":
                if rule["rescale_policy"] == "amount":
                    amount = rule["amount"]
                elif rule["rescale_policy"] == "proportional":
                    # Scale the configured amount by how close the usage is to
                    # the current limit (ratio capped at 1)
                    amount = rule["amount"]
                    current_resource_limit = structure["resources"][
                        resource_label]["current"]
                    upper_limit = limits[resource_label]["upper"]
                    usage = usages[translator_dict[resource_label]]
                    ratio = min((usage - upper_limit) /
                                (current_resource_limit - upper_limit), 1)
                    amount = int(ratio * amount)
                    log_warning(
                        "PROP -> cur : {0} | upp : {1} | usa: {2} | ratio {3} | amount {4}"
                        .format(current_resource_limit, upper_limit, usage,
                                ratio, amount), self.debug)
                else:
                    log_warning(
                        "Invalid rescale policy '{0} for Rule {1}, skipping it"
                        .format(rule["rescale_policy"],
                                rule["name"]), self.debug)
                    continue
            elif rule["rescale_type"] == "down":
                if rule["rescale_policy"] == "amount":
                    amount = rule["amount"]
                elif rule["rescale_policy"] == "fit_to_usage":
                    # Shrink the limit down towards the observed usage plus a boundary
                    current_resource_limit = structure["resources"][
                        resource_label]["current"]
                    boundary = limits[resource_label]["boundary"]
                    usage = usages[translator_dict[resource_label]]
                    amount = self.get_amount_from_fit_reduction(
                        current_resource_limit, boundary, usage)
                elif rule["rescale_policy"] == "proportional" and rule[
                        "resource"] == "energy":
                    amount = self.get_amount_from_proportional_energy_rescaling(
                        structure, resource_label)
                else:
                    log_warning(
                        "Invalid rescale policy '{0} for Rule {1}, skipping it"
                        .format(rule["rescale_policy"],
                                rule["name"]), self.debug)
                    continue
            else:
                log_warning(
                    "Invalid rescale type '{0} for Rule {1}, skipping it".
                    format(rule["rescale_type"], rule["name"]), self.debug)
                continue

            # Ensure that amount is an integer, either by converting float -> int, or string -> int
            amount = int(amount)

            # If it is 0, because there was a previous floating value between -1 and 1, set it to 0 so that it does not generate any Request
            if amount == 0:
                log_warning(
                    "Amount generated for structure {0} with rule {1} is 0".
                    format(structure["name"], rule["name"]), self.debug)

            # If the resource is susceptible to check, ensure that it does not surpass any limit
            new_amount = amount
            if resource_label not in NON_ADJUSTABLE_RESOURCES:
                structure_resources = structure["resources"][resource_label]
                structure_limits = limits[resource_label]
                new_amount = self.adjust_amount(amount, structure_resources,
                                                structure_limits)
                if new_amount != amount:
                    log_warning(
                        "Amount generated for structure {0} with rule {1} has been trimmed from {2} to {3}"
                        .format(structure["name"], rule["name"], amount,
                                new_amount), self.debug)

            # Generate the request and append it
            request = self.generate_request(structure, new_amount,
                                            resource_label)
            generated_requests.append(request)

            # Remove the events that triggered the request
            event_name = generate_event_name(events[resource_label]["events"],
                                             resource_label)
            if event_name not in events_to_remove:
                events_to_remove[event_name] = 0
            events_to_remove[event_name] += rule["events_to_remove"]

        return generated_requests, events_to_remove
def persist():
    """Main loop of the structure snapshoter service.

    Forever: refresh the service config from the database, send a heartbeat,
    validate the config and, when the service is active, launch a thread that
    snapshots the configured resources, then sleep for the polling period.
    """
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    global resources_persisted
    global debug

    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)

    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info
        service = get_service(db_handler,
                              SERVICE_NAME)  # Remote database operation

        # Heartbeat
        beat(db_handler, SERVICE_NAME)  # Remote database operation

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        resources_persisted = myConfig.get_value("RESOURCES_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")
        log_info(
            "Going to snapshot resources: {0}".format(resources_persisted),
            debug)

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info(
            "Resources to be snapshoter are -> {0}".format(
                resources_persisted), debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        # NOTE(review): near-duplicate of the database snapshoter loop, but
        # with a polling-frequency threshold of 3 instead of 4 and the same
        # sleep-twice pattern on invalid config — confirm both divergences
        # are intended
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            time.sleep(polling_frequency)
            if polling_frequency < 3:
                log_error(
                    "Polling frequency is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]

            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        thread = None
        if SERVICE_IS_ACTIVATED:
            # Do the actual snapshot work in a background thread
            thread = Thread(target=persist_thread, args=())
            thread.start()
        else:
            log_warning(
                "Structure snapshoter is not activated, will not do anything",
                debug)

        time.sleep(polling_frequency)

        wait_operation_thread(thread, debug)

        t1 = time.time()
        # Processing time excludes the sleep; total includes it
        time_proc = "%.2f" % (t1 - t0 - polling_frequency)
        time_total = "%.2f" % (t1 - t0)
        log_info(
            "Epoch processed in {0} seconds ({1} processing and {2} sleeping)".
            format(time_total, time_proc, str(polling_frequency)), debug)
        log_info("----------------------\n", debug)