class RedisMetricSource(MetricSource):
    """Metric source that pops application measurements from a Redis list."""

    def __init__(self, parameters):
        # Connect to the Redis instance named in the plugin parameters.
        self.rds = redis.StrictRedis(host=parameters['redis_ip'],
                                     port=parameters['redis_port'])
        self.LOG = Log('redis_log', 'redis.log')
        # Cache of the newest measurement seen so far; returned whenever
        # no fresher value is available from Redis.
        self.last_metric = 0.0
        self.last_timestamp = datetime.datetime.now()

    def get_most_recent_value(self, app_id):
        """Pop the newest measurement for app_id; fall back to the cache
        when the list is empty or the popped value is stale."""
        raw = self.rds.rpop("%s:metrics" % app_id)
        self.LOG.log("\n%s\n%s\n\n" % (raw, app_id))
        if raw is None:
            return self.last_timestamp, self.last_metric
        parsed = ast.literal_eval(str(raw, 'utf-8'))
        # The stored timestamp is in milliseconds.
        timestamp = datetime.datetime.fromtimestamp(
            parsed['timestamp'] / 1000)
        value = float(parsed['value'])
        if timestamp <= self.last_timestamp:
            return self.last_timestamp, self.last_metric
        self.last_timestamp, self.last_metric = timestamp, value
        return timestamp, value
class K8sActuator:
    """Actuator that changes a Kubernetes Job's parallelism."""

    def __init__(self, app_id, k8s_manifest):
        # Load cluster credentials from the given manifest file.
        try:
            config.load_kube_config(k8s_manifest)
        except Exception:
            raise Exception("Couldn't load kube config")
        self.k8s_api = client.BatchV1Api()
        self.app_id = app_id
        self.logger = Log("basic.controller.log", "controller.log")

    # TODO: validation
    def adjust_resources(self, replicas, namespace="default"):
        """Patch this application's job to the requested replica count.

        Failures are logged rather than raised."""
        body = {"spec": {"parallelism": replicas}}
        try:
            self.k8s_api.patch_namespaced_job(self.app_id, namespace, body)
        except Exception as e:
            self.logger.log(str(e))

    # TODO: validation
    def get_number_of_replicas(self, namespace="default"):
        """Return the parallelism of this application's job, or None when
        no job with that name exists in the namespace."""
        listing = self.k8s_api.list_namespaced_job(namespace)
        return next((job.spec.parallelism for job in listing.items
                     if job.metadata.name == self.app_id), None)
class OpenstackGenericMetricSource(MetricSource):
    """Metric source that tails an application log over SSH and derives a
    progress-error metric from '[Progress]' lines."""

    def __init__(self, parameters):
        self.keypair_path = parameters['keypair_path']
        self.host_ip = parameters['host_ip']
        self.log_path = parameters['log_path']
        self.start_time = parameters['start_time']
        self.expected_time = parameters['reference_value']
        self.host_username = '******'
        self.last_checked = ''
        # FIX: "running" used to be assigned only inside
        # _extract_metric_from_log, so reading it before an '[END]' line
        # appeared raised AttributeError. Initialize it up front.
        self.running = True
        self.logger = Log("metrics.log", "metrics.log")
        configure_logging()

    def _get_metric_value_from_log_line(self, log):
        """Return the float between the first '#' (at index >= 1) and the
        line's last character, or None when no '#' is present.

        Equivalent to the previous reverse scan, whose final assignment
        always came from the first '#' at index >= 1."""
        marker = log.find('#', 1)
        if marker == -1:
            return None
        return float(log[marker + 1:-1])

    def _get_elapsed_time(self):
        # Wall-clock seconds since the application started.
        delay = time.time() - self.start_time
        return delay

    # This is an auxiliary function to prepare and publish the metric.
    # The point is to keep _monitoring_application as simple as possible.
    def _extract_metric_from_log(self, last_log):
        # Check if this log line contains a new metric measurement.
        if '[Progress]' in last_log and self.last_checked != last_log:
            self.last_checked = last_log
            # Fraction of the expected runtime already consumed.
            ref_value = self._get_elapsed_time() / self.expected_time
            measurement_value = self._get_metric_value_from_log_line(last_log)
            error = measurement_value - ref_value
            self.logger.log("ref-value:%f|measurement-value:%f|error:%f"
                            % (ref_value, measurement_value, error))
            return 100 * error
        # Flag that checks if the log capture is ended
        elif '[END]' in last_log:
            self.running = False

    def _monitoring_application(self):
        """Fetch the log's last line via SSH and turn it into a
        (timestamp, metric) pair; re-raises on failure."""
        try:
            result = SSHUtils().run_and_get_result(
                "sudo tail -1 %s" % self.log_path, self.host_username,
                self.host_ip, self.keypair_path)
            timestamp = datetime.datetime.fromtimestamp(time.time())
            return timestamp, self._extract_metric_from_log(result)
        except Exception as ex:
            # FIX: Python 3 — print is a function and exceptions have no
            # ".message" attribute (the old code was Python 2 syntax).
            print("Monitoring is not possible. \nError: %s" % str(ex))
            raise ex

    def get_most_recent_value(self, metric_name, options):
        return self._monitoring_application()
class SingleApplicationController(Controller):
    """Controller that watches and scales one application until stopped."""

    def __init__(self, application_id, plugin_info):
        self.logger = Log("single.controller.log", "controller.log")
        configure_logging()

        info = plugin_info["plugin_info"]
        self.application_id = application_id
        # Copy the scaling configuration out of the plugin description.
        for key in ("instances", "check_interval", "trigger_down",
                    "trigger_up", "min_cap", "max_cap", "actuation_size",
                    "metric_rounding"):
            setattr(self, key, info[key])
        self.actuator_type = info["actuator"]
        self.metric_source_type = info["metric_source"]

        self.running = True
        self.running_lock = threading.RLock()

        # Build the configured metric source and actuator plugins.
        metric_source = MetricSourceBuilder().get_metric_source(
            self.metric_source_type, info)
        actuator = ActuatorBuilder().get_actuator(self.actuator_type, info)

        # The alarm encapsulates the scale up/down decision logic.
        self.alarm = BasicAlarm(actuator, metric_source, self.trigger_down,
                                self.trigger_up, self.min_cap, self.max_cap,
                                self.actuation_size, self.metric_rounding)

    def start_application_scaling(self):
        """Loop: check the application's state, sleep, repeat until
        stop_application_scaling is called."""
        keep_running = True
        while keep_running:
            self.logger.log("Monitoring application: %s"
                            % (self.application_id))
            self.alarm.check_application_state(self.application_id,
                                               self.instances)
            time.sleep(float(self.check_interval))
            with self.running_lock:
                keep_running = self.running

    def stop_application_scaling(self):
        """Ask the scaling loop to stop after its current iteration."""
        with self.running_lock:
            self.running = False

    def status(self):
        return ""
class _BasicControllerThread():
    """Background loop that periodically checks every registered
    application's state through the shared alarm."""

    def __init__(self, applications, applications_lock, alarm,
                 check_interval):
        self.logger = Log("basic.controller_thread.log", "controller.log")
        configure_logging()
        # Shared application map and the lock guarding it.
        self.applications = applications
        self.applications_lock = applications_lock
        self.alarm = alarm
        self.check_interval = check_interval
        self.running = True

    def start(self):
        """Run until something clears self.running."""
        self.logger.log("Starting controller thread")
        while self.running:
            # Hold the lock while walking the shared application map.
            with self.applications_lock:
                self.logger.log("Monitoring applications: %s"
                                % (str(self.applications.keys())))
                for app_id in self.applications.keys():
                    instances = self.applications[app_id]["instances"]
                    self.logger.log("Checking application:%s|instances:%s"
                                    % (app_id, instances))
                    self.alarm.check_application_state(app_id, instances)
            time.sleep(float(self.check_interval))
class BasicController(Controller):
    """Controller that scales many applications from one background thread."""

    def __init__(self, metric_source, actuator, plugin_info):
        # Set up logging
        self.logger = Log("basic.controller.log", "controller.log")
        configure_logging()

        check_interval = plugin_info["check_interval"]
        trigger_down = plugin_info["trigger_down"]
        trigger_up = plugin_info["trigger_up"]
        min_cap = plugin_info["min_cap"]
        max_cap = plugin_info["max_cap"]
        actuation_size = plugin_info["actuation_size"]
        metric_rounding = plugin_info["metric_rounding"]

        # The alarm decides when to scale up or down.
        self.alarm = BasicAlarm(actuator, metric_source, trigger_down,
                                trigger_up, min_cap, max_cap,
                                actuation_size, metric_rounding)

        # Shared application map, guarded by a reentrant lock and
        # consumed by the controller thread started below.
        self.applications_lock = threading.RLock()
        self.applications = {}
        self.controller = _BasicControllerThread(self.applications,
                                                 self.applications_lock,
                                                 self.alarm, check_interval)
        self.controller_thread = threading.Thread(
            target=self.controller.start)
        self.controller_thread.start()

    def start_application_scaling(self, app_id, plugin_info):
        """Register an application so the controller thread scales it."""
        self.logger.log("Adding application id: %s" % (app_id))
        with self.applications_lock:
            self.applications[app_id] = plugin_info

    def stop_application_scaling(self, app_id):
        """Unregister an application; logs when it is unknown."""
        with self.applications_lock:
            if app_id not in self.applications:
                self.logger.log("Application %s not found" % (app_id))
            else:
                self.logger.log("Removing application id: %s" % (app_id))
                self.applications.pop(app_id)

    def stop_controller(self):
        # The thread's loop observes this flag and exits.
        self.controller.running = False

    def status(self):
        return ""
class TendencyAwareProportionalController(Controller):
    """Controller whose alarm also considers the tendency (rising or
    falling) of the progress error when deciding how to scale."""

    def __init__(self, application_id, plugin_info):
        self.logger = Log("tendency.proportional.controller.log",
                          "controller.log")
        configure_logging()

        info = plugin_info["plugin_info"]
        self.application_id = application_id
        self.instances = info["instances"]
        self.check_interval = info["check_interval"]
        self.trigger_down = info["trigger_down"]
        self.trigger_up = info["trigger_up"]
        self.min_cap = info["min_cap"]
        self.max_cap = info["max_cap"]
        self.metric_rounding = info["metric_rounding"]
        self.actuation_size = info["actuation_size"]
        self.actuator_type = info["actuator"]
        self.metric_source_type = info["metric_source"]
        self.running = True
        self.running_lock = threading.RLock()

        # Plugin instances resolved from their registered names.
        metric_source = MetricSourceBuilder().get_metric_source(
            self.metric_source_type, info)
        actuator = ActuatorBuilder().get_actuator(self.actuator_type, info)

        # The alarm decides whether to scale up, scale down or do nothing.
        self.alarm = TendencyAwareProportionalAlarm(
            actuator, metric_source, self.trigger_down, self.trigger_up,
            self.min_cap, self.max_cap, self.actuation_size,
            self.metric_rounding)

    def start_application_scaling(self):
        """Periodically ask the alarm to check the application, until
        stop_application_scaling is called."""
        active = True
        while active:
            self.logger.log("Monitoring application: %s"
                            % (self.application_id))
            try:
                self.alarm.check_application_state(self.application_id,
                                                   self.instances)
            except MetricNotFoundException:
                self.logger.log("No metrics available")
            except Exception as e:
                self.logger.log(str(e))
            time.sleep(float(self.check_interval))
            with self.running_lock:
                active = self.running

    def stop_application_scaling(self):
        """Ask the scaling loop to stop after its current iteration."""
        with self.running_lock:
            self.running = False

    def status(self):
        return self.alarm.status()
class BasicAlarm:
    """Compares job progress against elapsed-time progress and asks the
    actuator to add or remove resources when they diverge too much."""

    # TODO: Think about these constants placements
    PROGRESS_METRIC_NAME = "spark.job_progress"
    ELAPSED_TIME_METRIC_NAME = 'spark.elapsed_time'

    def __init__(self, actuator, metric_source, trigger_down, trigger_up,
                 min_cap, max_cap, actuation_size, metric_rounding):
        # TODO: Check parameters
        self.metric_source = metric_source
        self.actuator = actuator
        self.trigger_down = trigger_down
        self.trigger_up = trigger_up
        self.min_cap = min_cap
        self.max_cap = max_cap
        self.actuation_size = actuation_size
        self.metric_rounding = metric_rounding

        self.logger = Log("basic.alarm.log", "controller.log")
        configure_logging()

        # Timestamps of the newest measurements processed so far; start
        # at a very old date so the first real measurement is accepted.
        self.last_time_progress_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_job_progress_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')

    def get_job_progress(self, application_id):
        """Return (timestamp, rounded job progress) for the application.

        FIX: referenced "Basic_Alarm.PROGRESS_METRIC_NAME" while the class
        is named BasicAlarm — a NameError on every call, silently swallowed
        by the broad except in check_application_state."""
        job_progress_measurement = self.metric_source.get_most_recent_value(
            BasicAlarm.PROGRESS_METRIC_NAME,
            {"application_id": application_id})
        job_progress_timestamp = job_progress_measurement[0]
        job_progress = round(job_progress_measurement[1],
                             self.metric_rounding)
        return job_progress_timestamp, job_progress

    def get_time_progress(self, application_id):
        """Return (timestamp, rounded time progress) for the application.

        FIX: same "Basic_Alarm" NameError as get_job_progress."""
        time_progress_measurement = self.metric_source.get_most_recent_value(
            BasicAlarm.ELAPSED_TIME_METRIC_NAME,
            {"application_id": application_id})
        time_progress_timestamp = time_progress_measurement[0]
        time_progress = round(time_progress_measurement[1],
                              self.metric_rounding)
        return time_progress_timestamp, time_progress

    def check_measurements_are_new(self, job_progress_timestamp,
                                   time_progress_timestamp):
        # Both measurements must be strictly newer than the last ones.
        return (self.last_job_progress_timestamp < job_progress_timestamp
                and self.last_time_progress_timestamp
                < time_progress_timestamp)

    def check_application_state(self, application_id, instances):
        """Fetch both progress metrics and, when they are new, scale the
        instances according to their difference."""
        # TODO: Check parameters
        try:
            self.logger.log("Getting progress")
            job_progress_timestamp, job_progress = self.get_job_progress(
                application_id)
            self.logger.log("Getting time progress")
            time_progress_timestamp, time_progress = self.get_time_progress(
                application_id)
            self.logger.log("Progress-[%s]-%f|Time progress-[%s]-%f"
                            % (str(job_progress_timestamp), job_progress,
                               str(time_progress_timestamp), time_progress))

            if self.check_measurements_are_new(job_progress_timestamp,
                                               time_progress_timestamp):
                # Positive diff: ahead of schedule; negative: behind.
                diff = job_progress - time_progress
                self.scale_down(diff, instances)
                self.scale_up(diff, instances)
                self.last_job_progress_timestamp = job_progress_timestamp
                self.last_time_progress_timestamp = time_progress_timestamp
            else:
                self.logger.log("Could not acquire more recent metrics")
        except Exception as e:
            # TODO: Check exception type
            # Log the actual error as well — the old code hid it, which
            # masked the Basic_Alarm NameError for a long time.
            self.logger.log("Could not get metrics")
            self.logger.log(str(e))
            return

    def scale_down(self, diff, instances):
        """Reduce the cap when the job is ahead by at least trigger_down."""
        if diff > 0 and diff >= self.trigger_down:
            self.logger.log("Scaling down")
            cap = self.actuator.get_allocated_resources(instances[0])
            new_cap = max(cap - self.actuation_size, self.min_cap)
            # Currently, the same cap is applied to every instance.
            cap_instances = {instance: new_cap for instance in instances}
            self.actuator.adjust_resources(cap_instances)

    def scale_up(self, diff, instances):
        """Raise the cap when the job is behind by at least trigger_up."""
        if diff < 0 and abs(diff) >= self.trigger_up:
            self.logger.log("Scaling up")
            cap = self.actuator.get_allocated_resources(instances[0])
            new_cap = min(cap + self.actuation_size, self.max_cap)
            cap_instances = {instance: new_cap for instance in instances}
            self.actuator.adjust_resources(cap_instances)
""" Validate if really exists a section to listed plugins """ for plugin in actuator_plugins: if plugin != '' and plugin not in config.sections(): raise Exception("plugin '%s' section missing" % plugin) for plugin in metric_source_plugins: if plugin != '' and plugin not in config.sections(): raise Exception("plugin '%s' section missing" % plugin) if 'monasca' in metric_source_plugins: monasca_endpoint = config.get('monasca', 'monasca_endpoint') monasca_username = config.get('monasca', 'username') monasca_password = config.get('monasca', 'password') monasca_auth_url = config.get('monasca', 'auth_url') monasca_project_name = config.get('monasca', 'project_name') monasca_api_version = config.get('monasca', 'api_version') if 'k8s_replicas' in actuator_plugins: # Setting default value k8s_manifest = "./data/conf" # If explicitly stated in the cfg file, overwrite the variable if (config.has_section('k8s_replicas')): if (config.has_option('k8s_replicas', 'k8s_manifest')): k8s_manifest = config.get("k8s_replicas", "k8s_manifest") except Exception as e: LOG.log("Error: %s" % e) quit()
class MonascaClient:
    """Thin wrapper around the Monasca metrics API."""

    def __init__(self):
        self.monasca_username = api.monasca_username
        self.monasca_password = api.monasca_password
        self.monasca_auth_url = api.monasca_auth_url
        self.monasca_project_name = api.monasca_project_name
        self.monasca_api_version = api.monasca_api_version
        self._get_monasca_client()
        self.LOG = Log('monasca_client_log', 'monasca_client.log')

    def get_measurements(self, metric_name, dimensions,
                         start_time='2014-01-01T00:00:00Z'):
        """Return the measurement list of the first matching series, or
        None when the query fails or yields nothing; errors are logged."""
        measurements = []
        try:
            monasca_client = self._get_monasca_client()
            # Only the application_id dimension is used for filtering.
            dimensions = {'application_id': dimensions['application_id']}
            measurements = monasca_client.metrics.list_measurements(
                name=metric_name, dimensions=dimensions,
                start_time=start_time, debug=False)
        except exc.HTTPException as httpex:
            self.LOG.log(httpex)
        except Exception as ex:
            self.LOG.log(ex)
        if len(measurements) > 0:
            return measurements[0]['measurements']
        else:
            return None

    def first_measurement(self, name, dimensions):
        """Return the oldest measurement, or [None, None, None] when the
        metric is absent.

        FIX: query Monasca once — the old code called get_measurements
        twice (once for the None check, once for the result), doubling
        round-trips and risking inconsistent answers between calls."""
        measurements = self.get_measurements(name, dimensions)
        if measurements is None:
            return [None, None, None]
        return measurements[0]

    def last_measurement(self, name, dimensions):
        """Return the newest measurement; raise MetricNotFoundException
        when the metric is absent."""
        measurements = self.get_measurements(name, dimensions)
        if measurements is None:
            raise MetricNotFoundException()
        else:
            return measurements[-1]

    def _get_monasca_client(self):
        """Authenticate against Keystone and build a Monasca client."""
        ks = ksclient.KSClient(
            auth_url=self.monasca_auth_url,
            username=self.monasca_username,
            password=self.monasca_password,
            project_name=self.monasca_project_name,
            debug=False
        )
        monasca_client = monclient.Client(self.monasca_api_version,
                                          ks.monasca_url,
                                          token=ks.token,
                                          debug=False)
        return monasca_client

    def send_metrics(self, measurements):
        """Publish a batch of measurements; errors are logged, not raised."""
        batch_metrics = {'jsonbody': measurements}
        try:
            monasca_client = self._get_monasca_client()
            monasca_client.metrics.create(**batch_metrics)
        except exc.HTTPException as httpex:
            self.LOG.log(httpex)
        except Exception as ex:
            self.LOG.log(ex)
class TendencyAwareProportionalAlarm:
    """Alarm that scales on large progress errors and, for small errors,
    follows the tendency (rising/falling) of the error."""

    ERROR_METRIC_NAME = "application-progress.error"

    def __init__(self, actuator, metric_source, trigger_down, trigger_up,
                 min_cap, max_cap, actuation_size, metric_rounding):
        self.metric_source = metric_source
        self.actuator = actuator
        self.trigger_down = trigger_down
        self.trigger_up = trigger_up
        self.min_cap = min_cap
        self.max_cap = max_cap
        self.metric_rounding = metric_rounding
        self.actuation_size = actuation_size
        self.logger = Log("proportional.alarm.log", "controller.log")
        self.cap_logger = Log("cap.log", "cap.log")
        configure_logging()

        # Newest processed measurement; start very old so the first real
        # measurement is always accepted.
        self.last_progress_error_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_progress_error = None
        self.cap = -1
        self.last_action = ""

    def check_application_state(self, application_id, instances):
        """
        Checks the application progress by getting progress metrics from
        a metric source, checks if the metrics are new and tries to
        modify the amount of allocated resources if necessary.
        """
        self.logger.log("Getting progress error")
        self.last_action = "getting progress error"

        # Get the progress error value and timestamp
        progress_error_timestamp, progress_error = self._get_progress_error(
            application_id)

        self.logger.log("Progress error-[%s]-%f"
                        % (str(progress_error_timestamp), progress_error))
        self.last_action = "Progress error-[%s]-%f" % (
            str(progress_error_timestamp), progress_error)

        # Only act on a measurement strictly newer than the last one.
        if self._check_measurements_are_new(progress_error_timestamp):
            self._scale(progress_error, instances)
            if self.cap != -1:
                self.cap_logger.log("%.0f|%s|%s" % (
                    time.time(), str(application_id), str(self.cap)))
            self.last_progress_error = progress_error
            self.last_progress_error_timestamp = progress_error_timestamp
        else:
            self.logger.log("Could not acquire more recent metrics")

    def _scale(self, progress_error, instances):
        # If error is positive and its absolute value is too high, scale down
        if progress_error > 0 and progress_error >= self.trigger_down:
            self._scale_down(instances)
        # If the error is negative and its absolute value is too high, scale up
        elif progress_error < 0 and abs(progress_error) >= self.trigger_up:
            self._scale_up(instances)
        else:
            self._tendency_scale(progress_error, instances)

    def _scale_down(self, instances):
        """Decrease the cluster cap by actuation_size (bounded by min_cap)."""
        self.logger.log("Scaling down")
        self.last_action = "Getting allocated resources"
        self._change_cap(instances, increase=False)

    def _scale_up(self, instances):
        """Increase the cluster cap by actuation_size (bounded by max_cap)."""
        self.logger.log("Scaling up")
        self.last_action = "Getting allocated resources"
        self._change_cap(instances, increase=True)

    def _tendency_scale(self, progress_error, instances):
        """For small errors, follow the error's tendency: a falling error
        means resources should grow, a rising one means they can shrink."""
        if self.last_progress_error is not None:
            difference = progress_error - self.last_progress_error
        else:
            difference = 0.0

        if difference < 0.0:
            self._change_cap(instances, increase=True)
        elif difference > 0.0:
            self._change_cap(instances, increase=False)

    def _change_cap(self, instances, increase):
        """Shared cap-adjustment step (previously duplicated four times
        across _scale_down, _scale_up and _tendency_scale): read the
        current cluster cap, move it by actuation_size within
        [min_cap, max_cap], and apply the same cap to every instance."""
        cap = self.actuator.get_allocated_resources_to_cluster(instances)
        if increase:
            new_cap = min(cap + self.actuation_size, self.max_cap)
        else:
            new_cap = max(cap - self.actuation_size, self.min_cap)
        self.logger.log("Scaling from %d to %d" % (cap, new_cap))
        self.last_action = "Scaling from %d to %d" % (cap, new_cap)
        # Currently, we use the same cap for all the vms
        cap_instances = {instance: new_cap for instance in instances}
        self.actuator.adjust_resources(cap_instances)
        self.cap = new_cap

    def _get_progress_error(self, application_id):
        """Return (timestamp, rounded progress error) from the source."""
        progress_error_measurement = \
            self.metric_source.get_most_recent_value(
                TendencyAwareProportionalAlarm.ERROR_METRIC_NAME,
                {"application_id": application_id}
            )
        progress_error_timestamp = progress_error_measurement[0]
        progress_error = round(progress_error_measurement[1],
                               self.metric_rounding)
        return progress_error_timestamp, progress_error

    def _check_measurements_are_new(self, progress_error_timestamp):
        # Strictly newer than the last processed timestamp.
        return self.last_progress_error_timestamp < progress_error_timestamp