def __init__(self, config_parser): try: self.slo = 700 self.weight_slow_violation = WEIGHT_SLO_VIOLATION self.web_monitoring_data = {} self.backend_monitoring_data = {} self.proxy_monitoring_data = {} self.last_change_time = 0 self.last_scaling_operation = 0 self.calculate_scaling_error = False self.predictor = Prediction_Models(logger) self.trigger_prediction = 0 ## FIXME: Size is 5 due to an excessive number of items to be predicted, please repair this part. ##, as we want to store the monitoring data during 60min, considering 5min between iterations self.predictorScaler_cpu_usage_1h = Queue( [] , 5) self.predictorScaler_req_rate_1h = Queue( [] , 5) self.forecast_model_selected = 0 self.forecast_resp_predicted = 0 self.forecast_list = {} self.pool_predictors = ThreadPool(processes=5) self.killed_backends = [] self.trigger_weight_balancing = False self.autoscaling_running = True self.iaas_driver = config_parser.get('iaas', 'DRIVER').upper() self.cost_controller = Cost_Controller(logger, self.iaas_driver) ## Parameters to establish a preference for selecting the most appropriate resource. self.optimal_scaling = Strategy_Finder(logger, self.iaas_driver, self.cost_controller, 'low', True, self.weight_slow_violation) self.stat_utils = StatUtils() self.monitoring = Monitoring_Controller( logger, self.cost_controller, config_parser, '/root/config.cfg', MANAGER_HOST, MANAGER_PORT, PS_RUNNING, self.ganglia_rrd_dir) self.dyc_load_balancer = Dynamic_Load_Balancer(logger, MANAGER_HOST, MANAGER_PORT, client) self.profiler = Profiler(logger, self.slo, self.cost_controller, MAX_CPU_USAGE, MIN_CPU_USAGE, UPPER_THRS_SLO, LOWER_THRS_SLO ) except Exception as e: logger.critical('Scaler: Error when initializing the ProvisioningManager in scaler.py \n' + str(e))
class ProvisioningManager: """ The ProvisioningManager takes decisions about adding and removing nodes from the service. """ ## FIXME: CHANGED TO BE ADAPTED TO QCOW2 BOOTING TIME. ### time_between_changes = TIME_BTW_SCALING_ACTIONS time_between_scaling_predictions = TIME_BTW_SCALING_PREDICTIONS ########################################################## ganglia_rrd_dir = '/var/lib/ganglia/rrds/conpaas/' def __init__(self, config_parser): try: self.slo = 700 self.weight_slow_violation = WEIGHT_SLO_VIOLATION self.web_monitoring_data = {} self.backend_monitoring_data = {} self.proxy_monitoring_data = {} self.last_change_time = 0 self.last_scaling_operation = 0 self.calculate_scaling_error = False self.predictor = Prediction_Models(logger) self.trigger_prediction = 0 ## FIXME: Size is 5 due to an excessive number of items to be predicted, please repair this part. ##, as we want to store the monitoring data during 60min, considering 5min between iterations self.predictorScaler_cpu_usage_1h = Queue( [] , 5) self.predictorScaler_req_rate_1h = Queue( [] , 5) self.forecast_model_selected = 0 self.forecast_resp_predicted = 0 self.forecast_list = {} self.pool_predictors = ThreadPool(processes=5) self.killed_backends = [] self.trigger_weight_balancing = False self.autoscaling_running = True self.iaas_driver = config_parser.get('iaas', 'DRIVER').upper() self.cost_controller = Cost_Controller(logger, self.iaas_driver) ## Parameters to establish a preference for selecting the most appropriate resource. self.optimal_scaling = Strategy_Finder(logger, self.iaas_driver, self.cost_controller, 'low', True, self.weight_slow_violation) self.stat_utils = StatUtils() self.monitoring = Monitoring_Controller( logger, self.cost_controller, config_parser, '/root/config.cfg', MANAGER_HOST, MANAGER_PORT, PS_RUNNING, self.ganglia_rrd_dir) self.dyc_load_balancer = Dynamic_Load_Balancer(logger, MANAGER_HOST, MANAGER_PORT, client) self.profiler = Profiler(logger, self.slo, self.cost_controller, MAX_CPU_USAGE, MIN_CPU_USAGE, UPPER_THRS_SLO, LOWER_THRS_SLO ) except Exception as e: logger.critical('Scaler: Error when initializing the ProvisioningManager in scaler.py \n' + str(e)) def log_monitoring_data(self): logger.debug('**** Web monitoring data: *****\n' + str(self.web_monitoring_data)) logger.debug('**** Backend monitoring data: *****\n' + str(self.backend_monitoring_data)) logger.debug('**** Proxy monitoring data: *****\n' + str(self.proxy_monitoring_data)) def prediction_evaluation_proxy(self, proxy_ip, php_resp_data): try: forecast_list_aux = {} php_resp_filtered = [] for i in php_resp_data: if i > 0: php_resp_filtered.append(i) logger.debug("PhP response time list data: "+str(php_resp_filtered)) async_result_ar = self.pool_predictors.apply_async(self.predictor.auto_regression, (php_resp_filtered,30)) async_result_lr = self.pool_predictors.apply_async(self.predictor.linear_regression, (php_resp_filtered,30)) async_result_exp_smoothing = self.pool_predictors.apply_async(self.predictor.exponential_smoothing, (php_resp_filtered,12)) async_result_var = self.pool_predictors.apply_async(self.predictor.vector_auto_regression, (php_resp_filtered, php_resp_filtered, 30)) # async_result_arma = self.pool_predictors.apply_async(self.predictor.arma, (php_resp_filtered,30)) forecast_list_aux[1] = async_result_lr.get() forecast_list_aux[2] = async_result_exp_smoothing.get() forecast_list_aux[3] = async_result_var.get() forecast_list_aux[0] = async_result_ar.get() # forecast_list_aux[0] = async_result_arma.get() try: logger.debug("Getting the forecast response time for the best model in the previous iteration "+str(self.forecast_model_selected)) weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[self.forecast_model_selected]) if weight_avg_predictions > 0: self.forecast_resp_predicted = weight_avg_predictions logger.debug("Prediction value for model "+str(self.forecast_model_selected)+"-- Prediction php_resp_time: "+str(self.forecast_resp_predicted)) except Exception as e: logger.warning("Warning trying to predict a future value for the model." + str(e)) self.forecast_list[proxy_ip] = forecast_list_aux except Exception as e: logger.error("Error trying to predict the future response_time values. "+ str(e))\ def store_predictorScaler_workload(self, cpu_usage, req_rate): list_cpu = [] list_req_rate = [] for cpu, req_rate in itertools.izip(cpu_usage, req_rate): if cpu > 10 and req_rate > 0: list_cpu.append(cpu) list_req_rate.append(req_rate) logger.debug("store_predictorScaler_workload: Filtered cpu "+str(list_cpu)) logger.debug("store_predictorScaler_workload: Filtered req_rate "+str(list_req_rate)) self.predictorScaler_cpu_usage_1h.push(list_cpu) self.predictorScaler_req_rate_1h.push(list_req_rate) ## logger.debug("store_proxy_workload: Proxy historic data "+ str(self.historic_proxy_1h.q)) def calculate_error_prediction(self, php_resp_data, ip): forecast_list_aux = {} min_error_prediction = 1000000 forecast_model = 0 #forecast_resp = 0 try: logger.debug("calculate_error_prediction: with ip: "+str(ip)) # php_resp_data = self.proxy_monitoring_data[ip]['php_response_time_lb'] #php_resp_data = [x for x in php_resp_data[0:30]] forecast_list_aux = self.forecast_list[ip] logger.debug("calculate_error_prediction: once obtained the forecast_list. ") weight_avg_current = self.stat_utils.compute_weight_average_response(php_resp_data, self.slo, self.weight_slow_violation) # try: # weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[0]) # prediction_error = math.fabs( weight_avg_current - weight_avg_predictions ) # if min_error_prediction > prediction_error and prediction_error > 0: # forecast_model = 0 # min_error_prediction = prediction_error # forecast_resp = weight_avg_predictions # logger.debug("Prediction error ARMA with php_resp_time: "+str(weight_avg_current)+" -- Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error)) #except Exception as e: # logger.warning("Warning trying to predict the error estimate for ARMA." + str(e)) try: weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[0]) prediction_error = math.fabs( weight_avg_current - weight_avg_predictions ) if min_error_prediction > prediction_error and prediction_error > 0: forecast_model = 0 min_error_prediction = prediction_error #forecast_resp = weight_avg_predictions logger.debug("Prediction error AR with php_resp_time: "+str(weight_avg_current)+" -- Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error)) except Exception as e: logger.warning("Warning trying to predict the error estimate for AR." + str(e)) try: weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[1]) prediction_error = math.fabs( weight_avg_current - weight_avg_predictions ) if min_error_prediction > prediction_error and prediction_error > 0: forecast_model = 1 min_error_prediction = prediction_error #forecast_resp = weight_avg_predictions logger.debug("Prediction error LR with php_resp_time: "+str(weight_avg_current)+" -- Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error)) except Exception as e: logger.warning("Warning trying to predict the error estimate for LR." + str(e)) try: # php_resp_data = [x for x in php_resp_data[0:12]] weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[2]) prediction_error = math.fabs( weight_avg_current - weight_avg_predictions ) if min_error_prediction > prediction_error and prediction_error > 0: forecast_model = 2 min_error_prediction = prediction_error #forecast_resp = weight_avg_predictions logger.debug("Prediction error EXP. SMOOTHING with php_resp_time: "+str(weight_avg_current)+" -- Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error)) except Exception as e: logger.warning("Warning trying to predict the error estimate for EXP. SMOOTHING." + str(e)) try: # php_resp_data = [x for x in php_resp_data[0:12]] weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[3]) prediction_error = math.fabs( weight_avg_current - weight_avg_predictions ) if min_error_prediction > prediction_error and prediction_error > 0: forecast_model = 3 min_error_prediction = prediction_error #forecast_resp = weight_avg_predictions logger.debug("Prediction error VAR with php_resp_time: "+str(weight_avg_current)+" -- Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error)) except Exception as e: logger.warning("Warning trying to predict the error estimate for VAR." + str(e)) self.forecast_model_selected = forecast_model except Exception as ex: logger.error("Error trying to predict the error estimate for the different models. " + str(ex)) def obtain_prediciton_decision(self, greater, slo): logger.info("Obtain_prediciton_decision model: " + str(self.forecast_model_selected) + " Php resp: "+str(self.forecast_resp_predicted) + " SLO: "+str(slo)) if self.forecast_resp_predicted == 0: logger.critical("Forecast_resp_predicted: ERROR all the prediction models failed predicting a future value. ") return True if (greater and self.forecast_resp_predicted > slo): logger.info("Forecast_resp_predicted is greater than slo. ") return True elif not greater and self.forecast_resp_predicted < slo: logger.info("Forecast_resp_predicted is lower than slo. ") return True else: logger.info("Forecast_resp_predicted don't do anything. ") return False def calculate_strategy(self, avg_cpu_backends_now, backend_nodes, req_rate_backends, cpu_usage_backends): logger.info("calculate_strategy: req_rate_backends "+str(req_rate_backends)+" cpu: "+str(cpu_usage_backends)) max_performance_throughtput = MAX_CPU_USAGE min_performance_throughtput = MIN_CPU_USAGE ## In cloud infrastructures, VMes perform close to the SL O violation with cpu usage quite lower than 75. if avg_cpu_backends_now < MAX_CPU_USAGE: max_performance_throughtput = avg_cpu_backends_now capacity_inst_type = {} combination_machines = [] ## Initialize the maximum capacity for each type of instance based on the monitoring data ### for name, cost in self.cost_controller.get_instance_prices()[self.iaas_driver].iteritems(): capacity_max_inst_type = self.profiler.calculate_ideal_throughput(name) logger.info("Calculate max capacity instance "+str(name)+" : "+str(capacity_max_inst_type)) capacity_inst_type[name] = capacity_max_inst_type for backend_node in backend_nodes: inst_type = self.cost_controller.get_type_instance(backend_node.ip) combination_machines.append(inst_type) list_req_rate_data = [] for value in self.predictorScaler_req_rate_1h.q: list_req_rate_data.extend(value) list_cpu_data = [] for value in self.predictorScaler_cpu_usage_1h.q: list_cpu_data.extend(value) logger.info("calculate_strategy: list_cpu_data "+str(list_cpu_data)) strategy = self.optimal_scaling.calculate_adaptive_scaling(backend_nodes, combination_machines, cpu_usage_backends, req_rate_backends, max_performance_throughtput, min_performance_throughtput, capacity_inst_type, list_cpu_data, list_req_rate_data) logger.info("calculate_strategy: Final strategy: "+str(strategy)) return strategy def consolidate_vmes(self, nodes): logger.info("consolidate_vmes: Initializing consolidate ") ## Check if the rest of nodes can support the req_rate of another VM, then we release it. for node in nodes: req_rate = self.stat_utils.compute_weight_average(self.stat_utils.filter_cpu_data(self.backend_monitoring_data[node.ip]['php_request_rate'])) inst_type = self.cost_controller.get_type_instance(node.ip) try: compute_units = self.optimal_scaling.get_compute_units(inst_type) ## Verify if we can remove the machine if self.cost_controller.cost_shutdown_constraint(node.ip): for node_aux in nodes: if not node_aux.ip in node.ip: inst_type_check = self.cost_controller.get_type_instance(node_aux.ip) try: (cpu_inst, req_rate_inst) = self.profiler.get_vm_type_max_throughput(inst_type_check) if req_rate_inst > 0: req_rate_check = self.stat_utils.compute_weight_average(self.stat_utils.filter_cpu_data(self.backend_monitoring_data[node_aux.ip]['php_request_rate'])) if ( (req_rate_inst - req_rate_check) >= req_rate): return node.ip ## No maximum data so lets check other possibilities else: if self.optimal_scaling.get_compute_units(inst_type_check) > compute_units: return node.ip except: if self.optimal_scaling.get_compute_units(inst_type_check) > compute_units: return node.ip except Exception: logger.critical("consolidate_vmes: ERROR when trying to remove a vm with ip: "+str(node.ip)) ## There is not any possible vm to be released... return '' def decide_actions(self): n_web_to_add = n_web_to_remove = 0 n_backend_to_add = n_backend_to_remove = 0 avg_web_req_rate_lb = avg_web_resp_time_lb = 0 avg_backend_req_rate_lb = avg_backend_resp_time_lb = 0 avg_cpu_user_backend = avg_cpu_web = 0 #backends_req_rate = 0 ret = {'add_web_nodes': 0, 'remove_web_nodes': 0, 'add_backend_nodes': 0, 'remove_backend_nodes': 0, 'vm_backend_instance': 'small', 'vm_web_instance': 'small', 'node_ip_remove':''} perf_info = self.monitoring._performance_info_get() web_nodes = perf_info.getWebServiceNodes() backend_nodes = perf_info.getBackendServiceNodes() proxy_nodes = perf_info.getProxyServiceNodes() self.profiler.store_instance_workload(backend_nodes, self.backend_monitoring_data) current_time = time() if (current_time - self.last_change_time < self.time_between_changes): self.trigger_weight_balancing = True logger.info('Configuration was recently updated, not making any decisions for now...') return ret # For the moment we assume only 1 proxy node for proxy_node in proxy_nodes: if ( self.trigger_prediction == 1): self.calculate_error_prediction(self.proxy_monitoring_data[proxy_node.ip]['php_response_time_lb'],proxy_node.ip) self.trigger_prediction = 0 avg_web_resp_time_lb = self.stat_utils.compute_weight_average_response(self.proxy_monitoring_data[proxy_node.ip]['web_response_time_lb'], self.slo, self.weight_slow_violation) avg_web_req_rate_lb = self.stat_utils.compute_weight_average(self.proxy_monitoring_data[proxy_node.ip]['web_request_rate_lb']) logger.debug('Found average value for proxy web request rate: %s web response time: %s' \ % (str(avg_web_req_rate_lb), str(avg_web_resp_time_lb))) if avg_web_resp_time_lb > UPPER_THRS_SLO * self.slo: n_web_to_add = 1 if avg_web_resp_time_lb < LOWER_THRS_SLO * self.slo: n_web_to_remove = 1 avg_backend_req_rate_lb = self.stat_utils.compute_weight_average(self.proxy_monitoring_data[proxy_node.ip]['php_request_rate_lb']) #proxy_backends_filtered_data = self.stat_utils.filter_response_data(self.proxy_monitoring_data[proxy_node.ip]['php_response_time_lb']) proxy_backends_filtered_data = self.proxy_monitoring_data[proxy_node.ip]['php_response_time_lb'] avg_backend_resp_time_lb = self.stat_utils.compute_weight_average_response(proxy_backends_filtered_data, self.slo, self.weight_slow_violation ) logger.debug('Found average value for proxy backend request rate: %s backend response time: %s' \ % (str(avg_backend_req_rate_lb), str(avg_backend_resp_time_lb))) ##### TRIGGER PREDICTION EVALUATION ####### if (avg_backend_resp_time_lb > UPPER_THRS_PREDICTION * self.slo ): self.trigger_prediction = 1 self.prediction_evaluation_proxy(proxy_node.ip, proxy_backends_filtered_data) if (avg_backend_resp_time_lb < LOWER_THRS_PREDICTION * self.slo ): self.trigger_prediction = 1 self.prediction_evaluation_proxy(proxy_node.ip, proxy_backends_filtered_data) ############################################ if avg_backend_resp_time_lb > UPPER_THRS_SLO * self.slo and self.obtain_prediciton_decision( True, UPPER_THRS_SLO * self.slo): n_backend_to_add = 1 if avg_backend_resp_time_lb < 10 or (avg_backend_resp_time_lb < LOWER_THRS_SLO * self.slo and self.obtain_prediciton_decision( False, LOWER_THRS_SLO * self.slo)): n_backend_to_remove = 1 #### CPU AVERAGE VERIFICATION ##### for web_node in web_nodes: avg_cpu_web_node = self.stat_utils.compute_weight_average(self.web_monitoring_data[web_node.ip]['cpu_user']) logger.info('Average CPU usage per Web with IP '+web_node.ip+' '+str(avg_cpu_web_node)) avg_cpu_web += avg_cpu_web_node avg_cpu_web = float(avg_cpu_web) / len(web_nodes) req_rate_backends_list = [] cpu_backends_list = [] sum_php_req_rate_backends = 0 for backend_node in backend_nodes: cpu_backends = self.stat_utils.filter_cpu_data(self.backend_monitoring_data[backend_node.ip]['cpu_user']) avg_cpu_user_node = self.stat_utils.compute_weight_average(cpu_backends) req_rate_backends = self.stat_utils.filter_cpu_data(self.backend_monitoring_data[backend_node.ip]['php_request_rate']) php_req_rate_node = self.stat_utils.compute_weight_average(req_rate_backends) sum_php_req_rate_backends += php_req_rate_node logger.info('Average CPU usage per Backend with IP '+backend_node.ip+' '+str(avg_cpu_user_node)+ ' Req_rate:' +str(php_req_rate_node)) if avg_cpu_user_node > MAX_CPU_USAGE: self.trigger_weight_balancing = True avg_cpu_user_backend += avg_cpu_user_node if len(req_rate_backends_list) ==0: req_rate_backends_list = req_rate_backends else: req_rate_backends_list = [(req_rate_list_a + req_rate_list_b) for req_rate_list_a, req_rate_list_b in itertools.izip_longest(req_rate_backends_list, req_rate_backends, fillvalue=0)] if len(cpu_backends_list) == 0: cpu_backends_list = cpu_backends else: cpu_backends_list = [(cpu_list_a + cpu_list_b) / 2 for cpu_list_a, cpu_list_b in itertools.izip_longest(cpu_backends_list, cpu_backends, fillvalue=0)] avg_cpu_user_backend = float(avg_cpu_user_backend) / len(backend_nodes) ### Check the cpu usage to add or remove backends... As shown in the plots, there is a correlation between cpu usage and sla violations. ### if (avg_cpu_user_backend > MAX_CPU_USAGE): n_backend_to_add = 1 n_backend_to_remove = 0 elif (len(backend_nodes) > 1) and (avg_cpu_user_backend < 35) and (n_backend_to_add == 0): n_backend_to_remove = 1 n_backend_to_add = 0 if (avg_cpu_web > MAX_CPU_USAGE) and (len(web_nodes) < MIN_NUM_BACKENDS + 1): n_web_to_add = 1 n_web_to_remove = 0 elif (len(web_nodes) > 1) and (avg_cpu_web < MIN_CPU_USAGE) and (n_web_to_add == 0): n_backend_to_remove = 1 n_backend_to_add = 0 logger.info('Total average CPU usage (user) backend: %f' % avg_cpu_user_backend) logger.info('Total average CPU usage (user) web: %f' % avg_cpu_web) self.store_predictorScaler_workload(cpu_backends_list, req_rate_backends_list) ####################################################################### if (current_time - self.last_scaling_operation >= self.time_between_scaling_predictions and self.calculate_scaling_error): logger.info('ProvisioningV3_proxy: Calculating the prediction error of our last scaling action...') list_data_cpu = list_data_req_rate = [] for value in self.predictorScaler_cpu_usage_1h.q: list_data_cpu.extend(value) for value in self.predictorScaler_req_rate_1h.q: list_data_req_rate.extend(value) self.optimal_scaling.calculate_error_prediction_cpu(list_data_cpu) self.optimal_scaling.calculate_error_prediction_req_rate(list_data_req_rate) self.calculate_scaling_error = False logger.info("ProvisioningV3_proxy: Prediction cpu model: "+str(self.optimal_scaling.get_cpu_prediction_model())+" and Cpu prediction: "+str(self.optimal_scaling.get_cpu_prediction())) logger.info("ProvisioningV3_proxy: Prediction req_rate model: "+str(self.optimal_scaling.get_req_rate_prediction_model())+" and Req_rate prediction: "+str(self.optimal_scaling.get_req_rate_prediction())) ### CONDITIONS TO ABORT A WEB OR BACKEND REMOVAL OPERATION ### abort_backend_removal = 0 abort_web_removal = avg_cpu_after_removal =0 consolidate_vm = '' if len(backend_nodes) > MIN_NUM_BACKENDS and n_backend_to_remove > 0: avg_cpu_after_removal = ( float(avg_cpu_user_backend) / (len(backend_nodes) - 1)) + avg_cpu_user_backend logger.info("ProvisioningV3: Prediction avg_cpu_after_removal: "+str(avg_cpu_after_removal)) if len(backend_nodes) > MIN_NUM_BACKENDS and n_backend_to_remove > 0 and avg_cpu_user_backend > 35: if len(backend_nodes) == (MIN_NUM_BACKENDS + 1): inst_type_1 = self.cost_controller.get_type_instance(backend_nodes[0].ip) inst_type_2 = self.cost_controller.get_type_instance(backend_nodes[1].ip) if not inst_type_1 in inst_type_2: consolidate_vm = self.consolidate_vmes(backend_nodes) else: consolidate_vm = self.consolidate_vmes(backend_nodes) logger.info("ProvisioningV3: consolidate_vm "+str(consolidate_vm)) # if( (avg_backend_req_rate_lb / len(backend_nodes) > 1.3 and avg_cpu_user_backend > 40 ) # or (avg_cpu_user_backend > 40 and self.obtain_prediciton_decision( True, 0.5 * self.slo) ) # or (avg_backend_req_rate_lb / len(backend_nodes) > 1.3 and avg_backend_resp_time_lb > 0.5 * self.slo) ): # abort_backend_removal = 1 if( ( avg_cpu_after_removal > MAX_CPU_USAGE and len(consolidate_vm) == 0) or (avg_cpu_user_backend > 40 and self.obtain_prediciton_decision( True, 0.5 * self.slo) ) or (avg_backend_req_rate_lb / len(backend_nodes) > 1.3 and avg_backend_resp_time_lb > 0.5 * self.slo) ): abort_backend_removal = 1 if( (avg_web_req_rate_lb >= 4.0 and avg_cpu_web > 40) or (avg_web_req_rate_lb >= 4.0 and avg_web_resp_time_lb > 0.5 * self.slo) ): abort_web_removal = 1 if (len(backend_nodes) == MIN_NUM_BACKENDS or n_backend_to_add != 0 or abort_backend_removal == 1): n_backend_to_remove = 0 if (len(web_nodes) == MIN_NUM_WEBS or n_web_to_add != 0 or abort_web_removal == 1): n_web_to_remove = 0 ################################################################## ret['add_web_nodes'] = n_web_to_add ret['remove_web_nodes'] = n_web_to_remove ret['add_backend_nodes'] = n_backend_to_add ret['remove_backend_nodes'] = n_backend_to_remove ##### DECIDE VM CANDIDATE OR INSTANCE TPYE TO ADD OR REMOVE ##### if n_web_to_add > 0: ret['vm_web_instance'] = self.optimal_scaling.get_vm_inst_types()[0] if n_web_to_remove > 0: ret['node_ip_remove'] = web_nodes[0].ip if n_backend_to_remove > 0: self.trigger_prediction = 0 if len(consolidate_vm) == 0: ret['node_ip_remove'] = self.optimal_scaling.remove_backend_vm_candidate(backend_nodes, self.backend_monitoring_data) else: ret['node_ip_remove'] = consolidate_vm if n_backend_to_add > 0: self.trigger_prediction = 0 strategy = self.calculate_strategy(avg_cpu_user_backend, backend_nodes, sum_php_req_rate_backends, avg_cpu_user_backend) self.calculate_scaling_error = True self.last_scaling_operation = time() ret['vm_backend_instance'] = strategy self.cost_controller.print_vm_cost() logger.info('Provisioning decisions: %s' % str(ret)) return ret def execute_actions(self, actions): n_backend_to_add = actions['add_backend_nodes'] n_backend_to_remove = actions['remove_backend_nodes'] n_web_to_add = actions['add_web_nodes'] n_web_to_remove = actions['remove_web_nodes'] vm_web_type = actions['vm_web_instance'] ip=actions['node_ip_remove'] strategy = [] strategy = actions['vm_backend_instance'] if ( n_backend_to_add> 0 and len(strategy) > 0 or n_web_to_add > 0): logger.info('Adding nodes: %d , backend strategy: %s ' % (n_web_to_add, str(strategy) )) if n_backend_to_add > 0: concurrent_ops = False perf_info = self.monitoring._performance_info_get() backend_nodes = perf_info.getBackendServiceNodes() for op, (vm_type, num) in sorted(strategy): if 'add' in op: if not concurrent_ops: concurrent_ops = True num_retries = NUM_RETRIES_SCALING_ACTION added_node = False while not added_node and num_retries > 0: try: logger.info('Adding backend nodes, quantity: %s , vm_type: %s ' % (str(num), str(vm_type) )) client.add_nodes(MANAGER_HOST, MANAGER_PORT, web=0, backend=num, cloud='default', vm_backend_instance=vm_type, vm_web_instance=vm_web_type) added_node = True except Exception as ex: logger.warning('Error when trying to add a node: '+str(ex)) num_retries = num_retries - 1 logger.warning('Node cannot be added at this time, retrying in 1min. Number of additional retries: '+str(num_retries)) added_node = False sleep(100) if 'remove' in op: logger.info('Removing backend nodes, quantity: %s , vm_type: %s ' % (str(num), str(vm_type) )) vmes_ip = [] vmes_ip = self.optimal_scaling.remove_vmes_type_candidate(backend_nodes, self.backend_monitoring_data, vm_type, num) for vm_ip in vmes_ip: if concurrent_ops: ## Before I used 60, but it seems it is not enough for the system to recognize the changes... sleep(100) num_retries = NUM_RETRIES_SCALING_ACTION removed_node = False while not removed_node and num_retries > 0: try: client.remove_nodes(MANAGER_HOST, MANAGER_PORT, web=0, backend=1, node_ip=vm_ip) #remove_node = True server_id = self.dyc_load_balancer.get_updated_backend_weights_id(vm_ip) self.killed_backends.append(server_id) self.dyc_load_balancer.remove_updated_backend_weights(server_id) self.dyc_load_balancer.remove_updated_backend_weights_id(vm_ip) except Exception as ex: logger.warning('Error when trying to remove a node: '+str(ex)) num_retries = num_retries - 1 logger.warning('Node cannot be removed at this time, retrying in 1min. Number of additional retries: '+str(num_retries)) removed_node = False sleep(100) self.last_change_time = time() if n_web_to_add > 0: num_retries = NUM_RETRIES_SCALING_ACTION added_node = False while not added_node and num_retries > 0: try: logger.info('Adding a web node: %d , inst type: %s ' % (n_web_to_add, str(vm_web_type) )) vm_backend_type=self.optimal_scaling.get_vm_inst_types()[0] client.add_nodes(MANAGER_HOST, MANAGER_PORT, web=n_web_to_add, backend=0, cloud='default', vm_backend_instance=vm_backend_type, vm_web_instance=vm_web_type) added_node = True except Exception as ex: logger.warning('Error when trying to add a web node: '+str(ex)) num_retries = num_retries - 1 logger.warning('Web node cannot be added at this time, retrying in 1min. Number of additional retries: '+str(num_retries)) added_node = False sleep(100) self.last_change_time = time() if ((n_backend_to_remove > 0 or n_web_to_remove > 0) and len(ip) > 0): logger.info('Removing web nodes: %d , backend nodes: %d ' % (n_web_to_remove, n_backend_to_remove)) client.remove_nodes(MANAGER_HOST, MANAGER_PORT, web=n_web_to_remove, backend=n_backend_to_remove, node_ip=ip) if n_backend_to_remove > 0: try: server_id = self.dyc_load_balancer.get_updated_backend_weights_id(ip) self.killed_backends.append(server_id) self.dyc_load_balancer.remove_updated_backend_weights(server_id) self.dyc_load_balancer.remove_updated_backend_weights_id(ip) except: logger.warning("Backend weight cannot be deleted for the backend with ip "+str(ip)) self.last_change_time = time() logger.info('After triggering the remove operation web nodes: %d , backend nodes: %d ' % (n_web_to_remove, n_backend_to_remove)) self.last_change_time = time() def collect_monitoring_data(self): self.monitoring.init_collect_monitoring_data() self.web_monitoring_data = self.monitoring.collect_monitoring_data_web() self.backend_monitoring_data = self.monitoring.collect_monitoring_data_backend() self.proxy_monitoring_data = self.monitoring.collect_monitoring_data_proxy() if len(self.proxy_monitoring_data) == 0 or len(self.backend_monitoring_data) == 0 or len(self.web_monitoring_data) == 0: return False return True def stop_provisioning(self): self.autoscaling_running = False #try: # os.remove(PATH_LOG_FILE) #except OSError as e: # logger.critical('stop_provisioning: Error when removing the autoscaling log '+str(e)) def do_provisioning(self,slo,cooldown_time, slo_fulfillment_degree): step_no = 0 self.slo = slo self.time_between_changes = cooldown_time*60 self.autoscaling_running = True self.optimal_scaling.set_slo_fulfillment_degree(slo_fulfillment_degree) logger.info('Autoscaling: Starting with QoS autoscaling: '+str(slo_fulfillment_degree)) try: while self.autoscaling_running: step_no += 1 tstart = datetime.now() print 'Synchronizing node info with manager...' self.monitoring.nodes_info_update(self.killed_backends) print "Collecting monitoring data..." ret = self.collect_monitoring_data() if not ret: logger.warning('Monitoring data was not properly retrieved, will retry later...') sleep(60) continue else: self.log_monitoring_data() actions = self.decide_actions() self.execute_actions(actions) n_backend_to_add = actions['add_backend_nodes'] n_backend_to_remove = actions['remove_backend_nodes'] n_web_to_add = actions['add_web_nodes'] n_web_to_remove = actions['remove_web_nodes'] # Adjust node weights every 3 steps if ( (step_no % 2 == 0 and n_backend_to_add == 0 and n_backend_to_remove == 0 \ and n_web_to_add == 0 and n_web_to_remove == 0) or self.trigger_weight_balancing ): logger.info('Calling adjust_node_weights ...') Thread(target=self.dyc_load_balancer.adjust_node_weights(self.monitoring, self.backend_monitoring_data)).start() self.trigger_weight_balancing = False tend = datetime.now() logger.info('--> EXECUTION TIME SCALING DECISION: %s ' % str(tend-tstart)) sleep(300) logger.info('Autoscaling: Terminated.') except Exception as ex: logger.critical('Autoscaling: Error in the autoscaling system '+str(ex))