Example #1
    def __init__(self, logger, manager_host, manager_port, client):
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.updated_backend_weights = {}
        self.updated_backend_weights_id = {}

        self.stat_utils = StatUtils()

        self.client_manager = client
        self.logger = logger
Example #2
    def __init__(self, logger, cost_controller, config_parser, config_file_path, manager_host, manager_port, process_state, ganglia_rrd_dir):
        self.cost_controller = cost_controller
        self.config_parser = config_parser
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.logger = logger
        self.process_state = process_state
        self.ganglia_rrd_dir = ganglia_rrd_dir
        self.last_collect_time = time()

        self.stat_utils = StatUtils()

        try:
            self.config_parser.read(config_file_path)
        except:
            print >>sys.stderr, 'Failed to read configuration file'
            sys.exit(1)

        self.perf_info = ServicePerformance()
        self._performance_info_set(self.perf_info)

        self.monitoring_metrics_web = ['web_request_rate', 'web_response_time', 'cpu_user', 'boottime']
        self.monitoring_metrics_backend = ['php_request_rate', 'php_response_time', 'cpu_user', 'cpu_system', 'cpu_num', 'mem_total', 'boottime']
        self.monitoring_metrics_proxy = ['web_request_rate_lb', 'web_response_time_lb',
                                         'php_request_rate_lb', 'php_response_time_lb', 'cpu_user', 'boottime']
Example #3
    def __init__(self, logger, manager_host, manager_port, client):
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.updated_backend_weights = {}
        self.updated_backend_weights_id = {}

        self.stat_utils = StatUtils()

        self.client_manager = client
        self.logger = logger
Example #4
    def __init__(self, logger, cost_controller, config_parser,
                 config_file_path, manager_host, manager_port, process_state,
                 ganglia_rrd_dir):
        self.cost_controller = cost_controller
        self.config_parser = config_parser
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.logger = logger
        self.process_state = process_state
        self.ganglia_rrd_dir = ganglia_rrd_dir
        self.last_collect_time = time()

        self.stat_utils = StatUtils()

        try:
            self.config_parser.read(config_file_path)
        except:
            print >> sys.stderr, 'Failed to read configuration file'
            sys.exit(1)

        #initialize a memcache client
        memcache_addr = config_parser.get('manager', 'MEMCACHE_ADDR')

        if memcache_addr == '':
            print >> sys.stderr, 'Failed to find memcache address in the config file'
            sys.exit(1)

        self.memcache = memcache.Client([memcache_addr])
        self.perf_info = ServicePerformance()
        self._performance_info_set(self.perf_info)

        self.monitoring_metrics_web = [
            'web_request_rate', 'web_response_time', 'cpu_user', 'boottime'
        ]
        self.monitoring_metrics_backend = [
            'php_request_rate', 'php_response_time', 'cpu_user', 'cpu_system',
            'cpu_num', 'mem_total', 'boottime'
        ]
        self.monitoring_metrics_proxy = [
            'web_request_rate_lb', 'web_response_time_lb', 'php_request_rate_lb',
            'php_response_time_lb', 'cpu_user', 'boottime'
        ]
Example #5
    def __init__(self, config_parser):
        try:
            self.slo = 700

            self.weight_slow_violation = WEIGHT_SLO_VIOLATION

            self.web_monitoring_data = {}
            self.backend_monitoring_data = {}
            self.proxy_monitoring_data = {}

            self.last_change_time = 0
            self.last_scaling_operation = 0
            self.calculate_scaling_error = False

            self.predictor = Prediction_Models(logger)
            self.trigger_prediction = 0

            ## FIXME: Size is 5 due to an excessive number of items to be predicted; please fix this part,
            ## as we want to store the monitoring data for 60 min, with 5 min between iterations.
            self.predictorScaler_cpu_usage_1h = Queue([], 5)
            self.predictorScaler_req_rate_1h = Queue([], 5)

            self.forecast_model_selected = 0
            self.forecast_resp_predicted = 0
            self.forecast_list = {}

            self.pool_predictors = ThreadPool(processes=5)

            self.killed_backends = []

            self.trigger_weight_balancing = False
            self.autoscaling_running = True

            self.iaas_driver = config_parser.get('iaas', 'DRIVER').upper()

            self.cost_controller = Cost_Controller(logger, self.iaas_driver)

            ## Parameters that establish a preference for selecting the most appropriate resource.
            self.optimal_scaling = Strategy_Finder(logger, self.iaas_driver, self.cost_controller, 'low', True, self.weight_slow_violation)

            self.stat_utils = StatUtils()

            self.monitoring = Monitoring_Controller(logger, self.cost_controller, config_parser, '/root/config.cfg', MANAGER_HOST, MANAGER_PORT, PS_RUNNING, self.ganglia_rrd_dir)

            self.dyc_load_balancer = Dynamic_Load_Balancer(logger, MANAGER_HOST, MANAGER_PORT, client)

            self.profiler = Profiler(logger, self.slo, self.cost_controller, MAX_CPU_USAGE, MIN_CPU_USAGE, UPPER_THRS_SLO, LOWER_THRS_SLO)

        except Exception as e:
            logger.critical('Scaler: Error when initializing the ProvisioningManager in scaler.py \n' + str(e))
Example #6
    def __init__(
        self, logger_autoscaling, slo, cost_controller, max_cpu_usage, min_cpu_usage, upper_thr_slo, lower_thr_slo
    ):
        self.max_cpu_usage = max_cpu_usage
        self.min_cpu_usage = min_cpu_usage
        self.upper_thr_slo = upper_thr_slo
        self.lower_thr_slo = lower_thr_slo
        self.logger = logger_autoscaling

        self.machines = {}
        self.slo = slo

        self.vm_type_ideal_throughput = {}
        self.vm_type_max_throughput = {}
        self.max_iterations = 6

        self.cost_controller = cost_controller

        self.stat_utils = StatUtils()
Example #7
class Dynamic_Load_Balancer:
    def __init__(self, logger, manager_host, manager_port, client):
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.updated_backend_weights = {}
        self.updated_backend_weights_id = {}

        self.stat_utils = StatUtils()

        self.client_manager = client
        self.logger = logger

    def get_updated_backend_weights_id(self, vm_ip):
        return self.updated_backend_weights_id[vm_ip]

    def remove_updated_backend_weights(self, server_id):
        del self.updated_backend_weights[server_id]

    def remove_updated_backend_weights_id(self, vm_ip):
        del self.updated_backend_weights_id[vm_ip]

    """ 
 TODO: Perhaps we may want to calculate the percent_factor using cpu_usage and response time
 """

    def adjust_node_weights(self, monitoring, backend_monitoring_data):
        """
    Adjusts the weights of the servers based on the latest monitoring data.
    """
        self.logger.info('Adjusting node weights...')
        try:

            perf_info = monitoring._performance_info_get()
            backend_nodes = perf_info.getBackendServiceNodes()

            # Compute the average cpu usage of each backend node, and the maximum among all the nodes
            avg_nodes_usage = {}
            max_node_usage = 0
            for backend_node in backend_nodes:

                node_usage = self.stat_utils.compute_weight_average(
                    backend_monitoring_data[backend_node.ip]['cpu_user'])
                #node_usage =  self.stat_utils.compute_weight_average_response(self.backend_monitoring_data[backend_node.ip]['php_response_time'], self.slo, self.weight_slow_violation)

                self.logger.debug(
                    'Current weight for node %s is: %s and avg cpu usage: %s' %
                    (backend_node.vmid, str(
                        backend_node.weightBackend), str(node_usage)))
                avg_nodes_usage[backend_node.vmid] = node_usage
                if node_usage > max_node_usage:
                    max_node_usage = node_usage

            self.logger.debug('Weight calculation: maximum cpu usage is %s' %
                              str(max_node_usage))
            weight_changes = False

            ## If any node exceeds MAX_WEIGHT_VALUE, reset every node's weight to STANDARD_WEIGHT_VALUE to keep the values bounded:
            for vmid in self.updated_backend_weights:
                if self.updated_backend_weights[vmid] > MAX_WEIGHT_VALUE:
                    for vmid_aux in self.updated_backend_weights:
                        self.updated_backend_weights[
                            vmid_aux] = STANDARD_WEIGHT_VALUE

            # Compute the weight adjustment for each node
            for backend_node in backend_nodes:
                node_usage = avg_nodes_usage[backend_node.vmid]
                # When a node has just been added, the monitoring data might be 0

                if max_node_usage > MIN_CPU_USAGE_WEIGHTS:

                    if node_usage == 0:
                        node_usage = max_node_usage

                    # How much faster is this node compared with the slowest node
                    percent_factor = (max_node_usage / node_usage) * 100

                    # We ignore differences of less than 30% (PERCENTAGE_PERFORMANCE_DIFF).
                    # For each additional 30% difference we increase the weight of the faster server by 15%.
                    percent_multiplier = int(
                        (percent_factor - 100) / PERCENTAGE_PERFORMANCE_DIFF)
                    weight_value = 0
                    if (node_usage > MAX_CPU_USAGE_WEIGHTS
                            and percent_multiplier < 1):
                        self.logger.debug('Node with cpu usage > 77 ')
                        try:
                            if self.updated_backend_weights[backend_node.vmid]:
                                weight_value = int(
                                    self.updated_backend_weights[
                                        backend_node.vmid] - 10)
                        except:
                            weight_value = int(backend_node.weightBackend - 10)
                    else:
                        self.logger.debug(
                            'Percent multiplier for node %s is: %s' %
                            (backend_node.vmid, str(percent_multiplier)))
                        ## Verify if there is any change in the weights, otherwise we won't call the update_nodes_weight function
                        try:
                            if self.updated_backend_weights[backend_node.vmid]:
                                weight_value = int(self.updated_backend_weights[backend_node.vmid] * \
                                                   (100 + 15 * percent_multiplier) / 100)
                        except:
                            weight_value = int(backend_node.weightBackend * \
                                (100 + 15 * percent_multiplier) / 100)

                        ## Added to avoid request rejections from Nginx when weight values are low without a reason for it.
                        if percent_multiplier == 0:
                            try:
                                if self.updated_backend_weights[
                                        backend_node.
                                        vmid] and self.updated_backend_weights[
                                            backend_node.vmid] < 500:
                                    weight_value = int(
                                        self.updated_backend_weights[
                                            backend_node.vmid] + 10)
                            except:
                                if backend_node.weightBackend < 500:
                                    weight_value = int(
                                        backend_node.weightBackend + 10)

                    try:

                        if self.updated_backend_weights[
                                backend_node.
                                vmid] and self.updated_backend_weights[
                                    backend_node.vmid] == weight_value:
                            self.logger.debug(
                                'Same weight as previous iteration for node: %s to: %s'
                                % (backend_node.vmid,
                                   str(self.updated_backend_weights[
                                       backend_node.vmid])))
                        else:
                            weight_changes = True
                            self.logger.debug(
                                'Updating weight for node: %s to: %s' %
                                (backend_node.vmid, str(weight_value)))
                            self.updated_backend_weights[
                                backend_node.vmid] = weight_value
                            self.updated_backend_weights_id[
                                backend_node.ip] = backend_node.vmid
                    except Exception:
                        weight_changes = True
                        self.logger.debug('Adding weight for node: %s ' %
                                          (backend_node.vmid))
                        self.updated_backend_weights[
                            backend_node.vmid] = weight_value
                        self.updated_backend_weights_id[
                            backend_node.ip] = backend_node.vmid

                    #updated_backend_weights[backend_node.vmid] = backend_node.compute_node_weight('backend', self.slo * 0.8 )
            if weight_changes:
                self.logger.debug('Updating weights for the nodes: %s ' %
                                  (str(self.updated_backend_weights)))

                try:
                    self.client_manager.update_nodes_weight(
                        self.manager_host,
                        self.manager_port,
                        web={},
                        backend=self.updated_backend_weights)
                except Exception:
                    tb = traceback.format_exc()
                    self.logger.error(
                        'Could not update node weight, exception stack trace: %s'
                        % tb)

        except Exception as ex:
            self.logger.error(
                "Could not update node weight, exception stack trace: " +
                str(ex))

        self.logger.info('Adjusting node weights finished.')
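
To make the weight arithmetic above concrete, here is a minimal standalone sketch of the update rule, assuming PERCENTAGE_PERFORMANCE_DIFF = 30 (as suggested by the comments) and a starting weight of 100; the numbers are illustrative only.

# Illustrative sketch of the weight update in adjust_node_weights().
# Assumption: PERCENTAGE_PERFORMANCE_DIFF = 30, not taken from the original source.
PERCENTAGE_PERFORMANCE_DIFF = 30

def new_weight(current_weight, node_usage, max_node_usage):
    # How much more loaded the busiest node is compared with this node.
    percent_factor = (float(max_node_usage) / node_usage) * 100
    # One step per 30% difference; differences below 30% are ignored.
    percent_multiplier = int((percent_factor - 100) / PERCENTAGE_PERFORMANCE_DIFF)
    # Each step raises the weight of the faster node by 15%.
    return int(current_weight * (100 + 15 * percent_multiplier) / 100)

# A node at 40% CPU while the busiest node is at 80% gets percent_factor 200,
# percent_multiplier 3, and its weight goes from 100 to 145.
print(new_weight(100, 40.0, 80.0))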
Example #8
class Monitoring_Controller:

    def __init__(self, logger, cost_controller, config_parser, config_file_path, manager_host, manager_port, process_state, ganglia_rrd_dir):
        self.cost_controller = cost_controller
        self.config_parser = config_parser
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.logger = logger
        self.process_state = process_state
        self.ganglia_rrd_dir = ganglia_rrd_dir
        self.last_collect_time = time()

        self.stat_utils = StatUtils()

        try:
            self.config_parser.read(config_file_path)
        except:
            print >>sys.stderr, 'Failed to read configuration file'
            sys.exit(1)

        self.perf_info = ServicePerformance()
        self._performance_info_set(self.perf_info)

        self.monitoring_metrics_web = ['web_request_rate', 'web_response_time', 'cpu_user', 'boottime']
        self.monitoring_metrics_backend = ['php_request_rate', 'php_response_time', 'cpu_user', 'cpu_system', 'cpu_num', 'mem_total', 'boottime']
        self.monitoring_metrics_proxy = ['web_request_rate_lb', 'web_response_time_lb',
                                         'php_request_rate_lb', 'php_response_time_lb', 'cpu_user', 'boottime']

    def _performance_info_get(self):
        return self.performance_info

    def _performance_info_set(self, perf_info):
        self.performance_info = perf_info

    def nodes_info_update(self, killed_backends):
        #conpaas_init_ssl_ctx(self.certs_dir, 'manager')
        print('MANAGER %s' % self.manager_host)
        print('PORT %s' % self.manager_port)

        nodes = client.list_nodes(self.manager_host, self.manager_port)
        self.logger.debug('Got update info from manager')

        perf_info = self._performance_info_get()

        perf_info.reset_role_info()

        self.logger.debug('Updating nodes...')
        for node_id in nodes['proxy']:
            node = perf_info.serviceNodes.get(node_id)
            if node is not None:
                node.registered_with_manager = True
                node.isRunningProxy = True
            else:
                perf_info.serviceNodes[node_id] = ServiceNodePerf(node_id, '', True, False, False, self.process_state)

        for node_id in nodes['web']:
            node = perf_info.serviceNodes.get(node_id)
            if node is not None:
                node.registered_with_manager = True
                node.isRunningWeb = True
            else:
                perf_info.serviceNodes[node_id] = ServiceNodePerf(node_id, '', False, True, False, self.process_state)

        for node_id in nodes['backend']:
            node = perf_info.serviceNodes.get(node_id)
            if node is not None:
                node.registered_with_manager = True
                node.isRunningBackend = True
            else:
                perf_info.serviceNodes[node_id] = ServiceNodePerf(node_id, '', False, False, True, self.process_state)

        self.logger.info('Filtering backend nodes killed_backends : ' + str(killed_backends) + ' ' + str(perf_info.serviceNodes))
        for id, node in perf_info.serviceNodes.items():
            if node.ip == '':
                response = client.get_node_info(self.manager_host, self.manager_port, id)
                node.ip = response['serviceNode']['ip']
            if not node.registered_with_manager:
                del perf_info.serviceNodes[id]
            # FIXME TO FILTER REMOVE OF BACKENDS #####
            if id in killed_backends:
                self.logger.info('Filtered backend  with id: ' + str(id))
                try:
                    del perf_info.serviceNodes[id]
                except:
                    self.logger.warning('Backend already removed or not contained in serviceNodes: ' + str(id))
            #
        self.logger.info('Filtered backend nodes killed_backends : ' + str(killed_backends) + ' ' + str(perf_info.serviceNodes))

        self._performance_info_set(perf_info)

        self.logger.info('Updating nodes information from ConPaaS manager...')
        self.logger.info('Updated service nodes: %s' % str(perf_info.serviceNodes))

    def collect_monitoring_metric(self, node_ip, metric_name):
        timestamps = []
        param_values = []
        # Added this for EC2, where the RRD directory names in Ganglia are hosts and not IPs:
        ganglia_dir_name = ''

        if node_ip.find('amazonaws') > 0:  # EC2 public hostname: it already matches the Ganglia directory name
            ganglia_dir_name = node_ip
        else:  # plain IP address: resolve it to the hostname used as the Ganglia directory name
            for ganglia_host in listdir(self.ganglia_rrd_dir):
                #self.logger.error('collect from ganglia host: ' + str(ganglia_host))
                if ganglia_host.find('Summary') > 0:
                    continue
                try:
                    hostname, array, array_ip = socket.gethostbyaddr(node_ip)
                except Exception as ex:
                    self.logger.warning('Found private ip when trying to get the hostname for ip %s: %s. ' % (str(node_ip), ex))
                    ganglia_dir_name = node_ip
                    break
                #self.logger.error('gethostbyaddr: ' + hostname)
                if ganglia_host == hostname:
                    ganglia_dir_name = ganglia_host
                    break

        rrd_file_name = self.ganglia_rrd_dir + ganglia_dir_name + '/' + metric_name + '.rrd'
#        self.logger.debug('rrd_file_name: ' + str(rrd_file_name))
#    logger.info('Searching in RRD file:' + rrd_file_name)
        if (not path.isfile(rrd_file_name)):
            self.logger.error('RRD file not found: ' + rrd_file_name)
            return []

        #logger.info('Getting monitoring info for node %s, parameter %s ...' % (node_ip, metric_name))
#    logger.info('last collect time: ' + str(int(self.last_collect_time)))
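        # Look back twice the elapsed interval: the start point is as far before
        # last_collect_time as 'now' is after it, so consecutive fetch windows overlap.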
        collect_from = self.last_collect_time - (time() - self.last_collect_time)
        #collect_from = self.last_collect_time
        fetch_cmd = ['rrdtool', 'fetch', '-s', str(int(collect_from)), '-r', '15',
                     str(rrd_file_name), 'AVERAGE']
        self.logger.debug("Fetching data with command: %s" % ' '.join(fetch_cmd))
        proc = Popen(fetch_cmd, stdout=PIPE, stderr=PIPE, close_fds=True)
        stdout_req, stderr_req = proc.communicate()

        lines = stdout_req.splitlines()
        for line in lines:
            # logger.debug(line)
            tokens = line.split()
            if (line.find('sum') >= 0 or len(tokens) < 2):
                continue

            timestamps.append(int(tokens[0].replace(':', '')))

            if (tokens[1].find('nan') < 0):
                param_values.append(float(tokens[1]))
            else:
                param_values.append(-1)

        # Cleaning the memory allocated by subprocess.Popen()
        try:
            proc.terminate()
        except OSError:
            #  logger.critical("Cannot kill the subprocess.popen rrdtool")
            # can't kill a dead proc
            pass

        #logger.debug('timestamps: ' + str(timestamps))
        #logger.debug('param values: ' + str(param_values))
        return [timestamps, param_values]

    def init_collect_monitoring_data(self):
        self.perf_info = self._performance_info_get()

    # FIXME: dead code?
    def collect_monitoring_data(self):

        web_monitoring_data = {}
        backend_monitoring_data = {}
        proxy_monitoring_data = {}

        for web_node in self.perf_info.getWebServiceNodes():
            self.logger.info('Getting web monitoring info for %s ...' % web_node.ip)
            # if web_node.ip not in web_monitoring_data:
            web_monitoring_data[web_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            self.logger.info('Getting web monitoring info 1')
            for it in range(len(self.monitoring_metrics_web)):
                self.logger.info('Getting web monitoring info 2')
                ret = self.collect_monitoring_metric(web_node.ip, self.monitoring_metrics_web[it])
                self.logger.info('Getting web monitoring info 3')
                if len(ret) == 0:  # monitoring data was not found
                    self.logger.warning("Could not retrieve data for metric %s: failed to collect all data for web node." % self.monitoring_metrics_web[it])
                    self.logger.info('Getting web monitoring info 4')
                    return False
                if 'timestamps' not in web_monitoring_data[web_node.ip]:
                    web_monitoring_data[web_node.ip]['timestamps'] = ret[0]
                web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]] = ret[1]

                if self.monitoring_metrics_web[it] == 'cpu_num' and web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0] != -1:
                    cpu_num = web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0]

                if self.monitoring_metrics_web[it] == 'mem_total' and web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0] != -1:
                    mem_total = str(web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0])

                if self.monitoring_metrics_web[it] == 'boottime' and web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0] != -1:
                    self.cost_controller.update_vm_usage(web_node.ip, web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0], self.cost_controller.instance_type_detector(cpu_num, mem_total))

        for backend_node in self.perf_info.getBackendServiceNodes():
            self.logger.info('Getting backend monitoring info for %s ...' % backend_node.ip)
            backend_monitoring_data[backend_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            """
            It iterates over the array to get the metrics in the same order, they defined added.
            It allows to detect the type of instance by analyzing the cpu, mem_total.
            """
            for it in range(len(self.monitoring_metrics_backend)):
                ret = self.collect_monitoring_metric(backend_node.ip, self.monitoring_metrics_backend[it])
                if len(ret) == 0:  # monitoring data was not found
                    self.logger.warning("Could not retrieve data for metric %s: failed to collect all data for backend node." % self.monitoring_metrics_backend[it])
                    return False
                if 'timestamps' not in backend_monitoring_data[backend_node.ip]:
                    backend_monitoring_data[backend_node.ip]['timestamps'] = ret[0]
                backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]] = ret[1]

                self.logger.info('There is a metric name: ' + str(self.monitoring_metrics_backend[it]))
                if self.monitoring_metrics_backend[it] == 'cpu_num':
                    if backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info('There is a metric cpu_num with content: ' + str(backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]))
                        cpu_num = str(backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0])
                    else:
                        # This is done to clean the negative and wrong values from the monitoring data
                        self.logger.info('There is a metric cpu_num with content less than or equal to zero')
                        for value in backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info('There is a metric cpu_num with content: ' + str(value))
                                cpu_num = value
                                break

                if self.monitoring_metrics_backend[it] == 'mem_total':
                    if backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info('There is a metric mem_total with content: ' + str(backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]))
                        mem_total = str(backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0])
                    else:
                        # This is done to clean the negative and wrong values from the monitoring data
                        self.logger.info('There is a metric mem_total with content less than or equal to zero')
                        for value in backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info('There is a metric mem_total with content: ' + str(value))
                                mem_total = value
                                break

                if self.monitoring_metrics_backend[it] == 'boottime':
                    if backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info('There is a metric boottime with content: ' + str(backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]))
                        self.cost_controller.update_vm_usage(backend_node.ip, backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0], self.cost_controller.instance_type_detector(cpu_num, mem_total))
                    else:
                        # This is done to clean the negative and wrong values from the monitoring data
                        boottime = backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]
                        self.logger.info('There is a metric boottime with content less than or equal to zero')
                        for value in backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info('There is a metric boottime with content: ' + str(value))
                                self.cost_controller.update_vm_usage(backend_node.ip, float(boottime), self.cost_controller.instance_type_detector(cpu_num, mem_total))
                                boottime = value
                                break

        for proxy_node in self.perf_info.getProxyServiceNodes():
            self.logger.info('Getting proxy monitoring info for %s ...' % proxy_node.ip)
            proxy_monitoring_data[proxy_node.ip] = {}
            for it in range(len(self.monitoring_metrics_proxy)):
                ret = self.collect_monitoring_metric(proxy_node.ip, self.monitoring_metrics_proxy[it])
                if len(ret) == 0:  # monitoring data was not found
                    self.logger.warning("Could not retrieve data for metric %s: failed to collect all data for proxy node." % self.monitoring_metrics_proxy[it])
                    return False
                if 'timestamps' not in proxy_monitoring_data[proxy_node.ip]:
                    proxy_monitoring_data[proxy_node.ip]['timestamps'] = ret[0]
                proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]] = ret[1]

                if self.monitoring_metrics_proxy[it] == 'cpu_num' and proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0] != -1:
                    cpu_num = proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0]

                if self.monitoring_metrics_proxy[it] == 'mem_total' and proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0] != -1:
                    mem_total = str(proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0])

                if self.monitoring_metrics_proxy[it] == 'boottime' and proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0] != -1:
                    self.cost_controller.update_vm_usage(proxy_node.ip, proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0], self.cost_controller.instance_type_detector(cpu_num, mem_total))

                proxy_monitoring_data[proxy_node.ip] = self.stat_utils.filter_monitoring_data(proxy_monitoring_data[proxy_node.ip], self.monitoring_metrics_proxy)

        print proxy_monitoring_data
        print web_monitoring_data
        print backend_monitoring_data
        self.last_collect_time = time()
        print "Done getting monitoring data..."
        return True

    def collect_monitoring_data_web(self):

        web_monitoring_data = {}

        for web_node in self.perf_info.getWebServiceNodes():
            self.logger.info('Getting web monitoring info for %s ...' % web_node.ip)
            # if web_node.ip not in web_monitoring_data:
            web_monitoring_data[web_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            for it in range(len(self.monitoring_metrics_web)):
                ret = self.collect_monitoring_metric(web_node.ip, self.monitoring_metrics_web[it])
                if len(ret) == 0:  # monitoring data was not found
                    self.logger.warning("Could not retrieve data for metric %s: failed to collect all data for web node." % self.monitoring_metrics_web[it])
                    return {}

                if 'timestamps' not in web_monitoring_data[web_node.ip]:
                    web_monitoring_data[web_node.ip]['timestamps'] = ret[0]
                web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]] = ret[1]

                if self.monitoring_metrics_web[it] == 'cpu_num' and web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0] != -1:
                    cpu_num = web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0]

                if self.monitoring_metrics_web[it] == 'mem_total' and web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0] != -1:
                    mem_total = str(web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0])

                if self.monitoring_metrics_web[it] == 'boottime':
                    self.cost_controller.update_vm_usage(web_node.ip, web_monitoring_data[web_node.ip][self.monitoring_metrics_web[it]][0], self.cost_controller.instance_type_detector(cpu_num, mem_total))

        return web_monitoring_data

    def collect_monitoring_data_backend(self):
        backend_monitoring_data = {}

        for backend_node in self.perf_info.getBackendServiceNodes():
            self.logger.info('Getting backend monitoring info for %s ...' % backend_node.ip)
            backend_monitoring_data[backend_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            """
              It iterates over the array to get the metrics in the same order, they defined added.
              It allows to detect the type of instance by analyzing the cpu, mem_total.
            """
            for it in range(len(self.monitoring_metrics_backend)):
                ret = self.collect_monitoring_metric(backend_node.ip, self.monitoring_metrics_backend[it])
                if len(ret) == 0:  # monitoring data was not found
                    self.logger.warning("Could not retrieve data for metric %s: failed to collect all data for backend node." % self.monitoring_metrics_backend[it])
                    return {}
                if 'timestamps' not in backend_monitoring_data[backend_node.ip]:
                    backend_monitoring_data[backend_node.ip]['timestamps'] = ret[0]
                backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]] = ret[1]

                #self.logger.info('There is a metric name: '+str(self.monitoring_metrics_backend[it]))
                if self.monitoring_metrics_backend[it] == 'cpu_num':
                    if backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info('There is a metric cpu_num with content: ' + str(backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]))
                        cpu_num = backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]
                    else:
                        # This is done to clean the negative and wrong values from the monitoring data
                        self.logger.info('There is a metric cpu_num with content less than or equal to zero')
                        for value in backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info('There is a metric cpu_num with content: ' + str(value))
                                cpu_num = value
                                break

                if self.monitoring_metrics_backend[it] == 'mem_total':
                    if backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info('There is a metric mem_total with content: ' + str(backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]))
                        mem_total = backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]
                    else:
                        # This is done to clean the negative and wrong values from the monitoring data
                        self.logger.info('There is a metric mem_total with content less than or equal to zero')
                        for value in backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info('There is a metric mem_total with content: ' + str(value))
                                mem_total = value
                                break

                if self.monitoring_metrics_backend[it] == 'boottime':
                    if backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info('There is a metric boottime with content: ' + str(backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]))
                        self.cost_controller.update_vm_usage(backend_node.ip, backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0], self.cost_controller.instance_type_detector(cpu_num, mem_total))
                    else:
                        # This is done to clean the negative and wrong values from the monitoring data
                        boottime = backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]][0]
                        self.logger.info('There is a metric boottime with content less than or equal to zero')
                        for value in backend_monitoring_data[backend_node.ip][self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info('There is a metric boottime with content: ' + str(value))
                                boottime = value
                                break
                        self.cost_controller.update_vm_usage(backend_node.ip, float(boottime), self.cost_controller.instance_type_detector(cpu_num, mem_total))

        return backend_monitoring_data

    def collect_monitoring_data_proxy(self):
        self.perf_info = self._performance_info_get()
        proxy_monitoring_data = {}

        for proxy_node in self.perf_info.getProxyServiceNodes():
            self.logger.info('Getting proxy monitoring info for %s ...' % proxy_node.ip)
            proxy_monitoring_data[proxy_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            for it in range(len(self.monitoring_metrics_proxy)):
                ret = self.collect_monitoring_metric(proxy_node.ip, self.monitoring_metrics_proxy[it])
                if len(ret) == 0:  # monitoring data was not found
                    self.logger.warning("Could not retrieve data for metric %s: failed to collect all data for proxy node." % self.monitoring_metrics_proxy[it])
                    return {}
                if 'timestamps' not in proxy_monitoring_data[proxy_node.ip]:
                    proxy_monitoring_data[proxy_node.ip]['timestamps'] = ret[0]
                proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]] = ret[1]

                if self.monitoring_metrics_proxy[it] == 'cpu_num' and proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0] != -1:
                    cpu_num = proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0]

                if self.monitoring_metrics_proxy[it] == 'mem_total' and proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0] != -1:
                    mem_total = str(proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0])

                if self.monitoring_metrics_proxy[it] == 'boottime':
                    self.cost_controller.update_vm_usage(proxy_node.ip, proxy_monitoring_data[proxy_node.ip][self.monitoring_metrics_proxy[it]][0], self.cost_controller.instance_type_detector(cpu_num, mem_total))

            proxy_monitoring_data[proxy_node.ip] = self.stat_utils.filter_monitoring_data(proxy_monitoring_data[proxy_node.ip], self.monitoring_metrics_proxy)

        self.last_collect_time = time()

        return proxy_monitoring_data
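
For reference, a minimal sketch of how collect_monitoring_metric() above turns the output of 'rrdtool fetch ... AVERAGE' into a [timestamps, values] pair; the sample lines are illustrative, not taken from a real RRD file.

# Illustrative only: 'rrdtool fetch' typically prints a header line with the
# data-source name (e.g. 'sum'), a blank line, and 'timestamp: value' rows,
# where missing samples appear as 'nan'.
sample_output = """                            sum

1500000000: 1.2345678901e+01
1500000015: nan
"""

timestamps, param_values = [], []
for line in sample_output.splitlines():
    tokens = line.split()
    if line.find('sum') >= 0 or len(tokens) < 2:
        continue
    timestamps.append(int(tokens[0].replace(':', '')))
    if tokens[1].find('nan') < 0:
        param_values.append(float(tokens[1]))
    else:
        param_values.append(-1)

# timestamps -> [1500000000, 1500000015], param_values -> [12.345678901, -1]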
Example #9
class Dynamic_Load_Balancer:

    def __init__(self, logger, manager_host, manager_port, client):
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.updated_backend_weights = {}
        self.updated_backend_weights_id = {}

        self.stat_utils = StatUtils()

        self.client_manager = client
        self.logger = logger

    def get_updated_backend_weights_id(self, vm_ip):
        return self.updated_backend_weights_id[vm_ip]

    def remove_updated_backend_weights(self, server_id):
        del self.updated_backend_weights[server_id]

    def remove_updated_backend_weights_id(self, vm_ip):
        del self.updated_backend_weights_id[vm_ip]

    """
    TODO: Perhaps we may want to calculate the percent_factor using cpu_usage and response time
    """

    def adjust_node_weights(self, monitoring, backend_monitoring_data):
        """
        Adjusts the weights of the servers based on the latest monitoring data.
        """
        self.logger.info('Adjusting node weights...')
        try:

            perf_info = monitoring._performance_info_get()
            backend_nodes = perf_info.getBackendServiceNodes()

            # Compute the average cpu usage of each backend node, and the maximum among all the nodes
            avg_nodes_usage = {}
            max_node_usage = 0
            for backend_node in backend_nodes:

                node_usage = self.stat_utils.compute_weight_average(backend_monitoring_data[backend_node.ip]['cpu_user'])
                #node_usage =  self.stat_utils.compute_weight_average_response(self.backend_monitoring_data[backend_node.ip]['php_response_time'], self.slo, self.weight_slow_violation)

                self.logger.debug('Current weight for node %s is: %s and avg cpu usage: %s' % (backend_node.vmid, str(backend_node.weightBackend), str(node_usage)))
                avg_nodes_usage[backend_node.vmid] = node_usage
                if node_usage > max_node_usage:
                    max_node_usage = node_usage

            self.logger.debug('Weight calculation: maximum cpu usage is %s' % str(max_node_usage))
            weight_changes = False

            # If any node exceeds MAX_WEIGHT_VALUE, reset every node's weight to STANDARD_WEIGHT_VALUE to keep the values bounded:
            for vmid in self.updated_backend_weights:
                if self.updated_backend_weights[vmid] > MAX_WEIGHT_VALUE:
                    for vmid_aux in self.updated_backend_weights:
                        self.updated_backend_weights[vmid_aux] = STANDARD_WEIGHT_VALUE

            # Compute the weight adjustment for each node
            for backend_node in backend_nodes:
                node_usage = avg_nodes_usage[backend_node.vmid]
                # When a node has just been added, the monitoring data might be 0

                if max_node_usage > MIN_CPU_USAGE_WEIGHTS:

                    if node_usage == 0:
                        node_usage = max_node_usage

                    # How much faster is this node compared with the slowest node
                    percent_factor = (max_node_usage / node_usage) * 100

                    # We ignore differences of less than 30% (PERCENTAGE_PERFORMANCE_DIFF).
                    # For each additional 30% difference we increase the weight of the faster server by 15%.
                    percent_multiplier = int((percent_factor - 100) / PERCENTAGE_PERFORMANCE_DIFF)
                    weight_value = 0
                    if(node_usage > MAX_CPU_USAGE_WEIGHTS and percent_multiplier < 1):
                        self.logger.debug('Node with cpu usage > 77 ')
                        try:
                            if self.updated_backend_weights[backend_node.vmid]:
                                weight_value = int(self.updated_backend_weights[backend_node.vmid] - 10)
                        except:
                            weight_value = int(backend_node.weightBackend - 10)
                    else:
                        self.logger.debug('Percent multiplier for node %s is: %s' % (backend_node.vmid, str(percent_multiplier)))
                        # Verify if there is any change in the weights, otherwise we won't call the update_nodes_weight function
                        try:
                            if self.updated_backend_weights[backend_node.vmid]:
                                weight_value = int(self.updated_backend_weights[backend_node.vmid] *
                                                   (100 + 15 * percent_multiplier) / 100)
                        except:
                            weight_value = int(backend_node.weightBackend *
                                               (100 + 15 * percent_multiplier) / 100)

                        # Added to avoid request rejections from Nginx when weight values are low without a reason for it.
                        if percent_multiplier == 0:
                            try:
                                if self.updated_backend_weights[backend_node.vmid] and self.updated_backend_weights[backend_node.vmid] < 500:
                                    weight_value = int(self.updated_backend_weights[backend_node.vmid] + 10)
                            except:
                                if backend_node.weightBackend < 500:
                                    weight_value = int(backend_node.weightBackend + 10)

                    try:

                        if self.updated_backend_weights[backend_node.vmid] and self.updated_backend_weights[backend_node.vmid] == weight_value:
                            self.logger.debug('Same weight as previous iteration for node: %s to: %s' % (backend_node.vmid, str(self.updated_backend_weights[backend_node.vmid])))
                        else:
                            weight_changes = True
                            self.logger.debug('Updating weight for node: %s to: %s' % (backend_node.vmid, str(weight_value)))
                            self.updated_backend_weights[backend_node.vmid] = weight_value
                            self.updated_backend_weights_id[backend_node.ip] = backend_node.vmid
                    except Exception:
                        weight_changes = True
                        self.logger.debug('Adding weight for node: %s ' % (backend_node.vmid))
                        self.updated_backend_weights[backend_node.vmid] = weight_value
                        self.updated_backend_weights_id[backend_node.ip] = backend_node.vmid

                    #updated_backend_weights[backend_node.vmid] = backend_node.compute_node_weight('backend', self.slo * 0.8 )
            if weight_changes:
                self.logger.debug('Updating weights for the nodes: %s ' % (str(self.updated_backend_weights)))

                try:
                    self.client_manager.update_nodes_weight(self.manager_host, self.manager_port, web={}, backend=self.updated_backend_weights)
                except Exception:
                    tb = traceback.format_exc()
                    self.logger.error('Could not update node weight, exception stack trace: %s' % tb)

        except Exception as ex:
            self.logger.error("Could not update node weight, exception stack trace: " + str(ex))

        self.logger.info('Adjusting node weights finished.')
Example #10
class Profiler:
    def __init__(
        self, logger_autoscaling, slo, cost_controller, max_cpu_usage, min_cpu_usage, upper_thr_slo, lower_thr_slo
    ):
        self.max_cpu_usage = max_cpu_usage
        self.min_cpu_usage = min_cpu_usage
        self.upper_thr_slo = upper_thr_slo
        self.lower_thr_slo = lower_thr_slo
        self.logger = logger_autoscaling

        self.machines = {}
        self.slo = slo

        self.vm_type_ideal_throughput = {}
        self.vm_type_max_throughput = {}
        self.max_iterations = 6

        self.cost_controller = cost_controller

        self.stat_utils = StatUtils()

    def get_vm_type_ideal_throughput(self, inst_type):
        return self.vm_type_ideal_throughput[inst_type]

    def calculate_ideal_throughput(self, inst_type):
        self.logger.debug("calculate_ideal_throughput: starting the computation for instance " + str(inst_type))
        try:
            if self.machines[inst_type]:
                resp_times_filtered = []
                cpu_user_values_filtered = []
                req_rates_filtered = []
                array_monitoring_data = self.machines[inst_type]

                self.logger.info("calculate_ideal_throughput: Adapting to the  threshold values.")
                for it in range(array_monitoring_data[4][0]):
                    for resp_time, cpu_usage, req_rate in itertools.izip(
                        array_monitoring_data[0][it], array_monitoring_data[1][it], array_monitoring_data[2][it]
                    ):
                        # Filter on cpu_usage as well: we found samples with 0.10 cpu_usage, and 80% cpu at 500 ms with a request rate above 0.5
                        # FIXME: Changed the resp_time bound, as lower values are observed on large and highcpu-medium instances
                        # if resp_time > (0.4 * self.slo) and resp_time <= (0.75 * self.slo) and cpu_usage < 75 and cpu_usage > 25 and req_rate > 0.5:
                        if (
                            resp_time > (0.4 * self.slo)
                            and resp_time <= (self.upper_thr_slo * self.slo)
                            and cpu_usage < self.max_cpu_usage
                            and cpu_usage > self.min_cpu_usage
                            and req_rate > 0.5
                        ):
                            resp_times_filtered.append(resp_time)
                            cpu_user_values_filtered.append(cpu_usage)
                            req_rates_filtered.append(req_rate)

                # Get the average of Resp times between 50% and 75% of the SLO
                self.logger.info(
                    "calculate_ideal_throughput: Calculating the average of response time:" + str(resp_times_filtered)
                )
                self.logger.info(
                    "calculate_ideal_throughput: Calculating the average of cpu :" + str(cpu_user_values_filtered)
                )
                self.logger.info(
                    "calculate_ideal_throughput: Calculating the average of req_rate :" + str(req_rates_filtered)
                )

                cpu_user_avg = 0
                # cpu_sys_avg = 0
                req_rate_avg = 0
                # resp_time_avg = 0
                if len(resp_times_filtered) == 0 or len(cpu_user_values_filtered) == 0 or len(req_rates_filtered) == 0:

                    try:
                        if self.vm_type_ideal_throughput[inst_type]:
                            max_throughput = self.vm_type_ideal_throughput[inst_type]
                    except Exception:
                        self.logger.error(
                            "calculate_ideal_throughput: ERROR cannot find inst_type in "
                            + str(self.vm_type_ideal_throughput)
                        )
                        max_throughput = 0

                else:
                    # Get the average of CPU user and Req. rate between 50% and 75% of the SLO
                    #  resp_time_avg = self.stat_utils.compute_average(resp_times_filtered)
                    cpu_user_avg = self.stat_utils.compute_average(cpu_user_values_filtered)
                    req_rate_avg = self.stat_utils.compute_average(req_rates_filtered)
                    max_throughput = float(cpu_user_avg) / req_rate_avg
                    self.vm_type_ideal_throughput[inst_type] = max_throughput

                self.logger.info(
                    "Ideal throughput of inst_type: "
                    + inst_type
                    + "  Cpu speed: "
                    + str(max_throughput)
                    + " cpu_avg: "
                    + str(cpu_user_avg)
                    + " req_avg: "
                    + str(req_rate_avg)
                )
                return max_throughput
        except Exception as e:
            self.logger.error("calculate_ideal_throughput: ERROR calculating ideal throughput " + str(e))
            return 0

    def get_vm_type_max_throughput(self, inst_type):
        return self.vm_type_max_throughput[inst_type]

    def calculate_max_instance_throughput(self, inst_type, array_monitoring_data):
        self.logger.debug("calculate_max_instance_throughput: starting the computation for instance " + str(inst_type))
        try:
            max_resp_time = max_cpu = max_req_rate = 0

            self.logger.info("calculate_max_instance_throughput: Adapting to the  threshold values.")
            for it in range(array_monitoring_data[4][0]):
                for resp_time, cpu_usage, req_rate in itertools.izip(
                    array_monitoring_data[0][it], array_monitoring_data[1][it], array_monitoring_data[2][it]
                ):
                    if (
                        resp_time > (0.5 * self.slo)
                        and resp_time <= (self.upper_thr_slo * self.slo)
                        and cpu_usage < 80
                        and cpu_usage > self.min_cpu_usage
                        and req_rate > 0.5
                    ):
                        if max_resp_time < resp_time:
                            max_resp_time = resp_time
                            max_cpu = cpu_usage
                            max_req_rate = req_rate

            # if max_resp_time == 0 or max_cpu or max_req_rate==0:

            #   try:
            #      if self.vm_type_max_throughput[inst_type]:
            #        max_throughput = self.vm_type_max_throughput[inst_type]
            #   except Exception:
            #        self.logger.error("calculate_max_instance_throughput: ERROR cannot find inst_type in "+ str(self.vm_type_max_throughput))
            #        max_throughput = 0

            else:
                try:

                    if self.vm_type_max_throughput[inst_type]:
                        (cpu_inst, req_inst) = self.vm_type_max_throughput[inst_type]
                        if req_inst <= max_req_rate:
                            self.vm_type_max_throughput[inst_type] = (max_cpu, max_req_rate)

                except Exception as e:
                    self.vm_type_max_throughput[inst_type] = (max_cpu, max_req_rate)

            self.logger.info(
                "Max throughput of inst_type: "
                + inst_type
                + "  Cpu: "
                + str(max_cpu)
                + " Resp time: "
                + str(max_resp_time)
                + " Req: "
                + str(max_req_rate)
            )

        except Exception as e:
            self.logger.error("calculate_max_instance_throughput: ERROR calculating max throughput " + str(e))

    def store_workload(self, inst_type, ip, backend_monitoring_data):
        array_monitoring_data = [[0 for i in range(6)] for j in range(5)]
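        # Layout: rows 0-3 hold, per iteration slot, the lists of php_response_time,
        # cpu_user, php_request_rate and cpu_system; slot [4][0] is the write cursor,
        # which wraps around at self.max_iterations.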

        if len(self.machines) == 0:
            num_iterations = array_monitoring_data[4][0]
            array_monitoring_data[0][num_iterations] = backend_monitoring_data[ip]["php_response_time"]
            array_monitoring_data[1][num_iterations] = backend_monitoring_data[ip]["cpu_user"]
            array_monitoring_data[2][num_iterations] = backend_monitoring_data[ip]["php_request_rate"]
            array_monitoring_data[3][num_iterations] = backend_monitoring_data[ip]["cpu_system"]
            num_iterations = num_iterations + 1
            array_monitoring_data[4][0] = num_iterations
            self.logger.debug(
                "store_machine_workload: Data to add instance " + str(inst_type) + "  --> " + str(array_monitoring_data)
            )
            self.machines[inst_type] = array_monitoring_data

            # Gather information about the maximum throughput of one instance type.
            self.calculate_max_instance_throughput(inst_type, array_monitoring_data)
        else:
            try:
                if self.machines[inst_type]:
                    array_monitoring_data = self.machines[inst_type]
                    num_iterations = array_monitoring_data[4][0]
                    if array_monitoring_data[4][0] == self.max_iterations:
                        num_iterations = 0

                    array_monitoring_data[0][num_iterations] = backend_monitoring_data[ip]["php_response_time"]
                    array_monitoring_data[1][num_iterations] = backend_monitoring_data[ip]["cpu_user"]
                    array_monitoring_data[2][num_iterations] = backend_monitoring_data[ip]["php_request_rate"]
                    array_monitoring_data[3][num_iterations] = backend_monitoring_data[ip]["cpu_system"]
                    num_iterations = num_iterations + 1
                    array_monitoring_data[4][0] = num_iterations

                    self.machines[inst_type] = array_monitoring_data

                    # Gather information about the maximum throughput of one instance type.
                    self.calculate_max_instance_throughput(inst_type, array_monitoring_data)
                    self.logger.debug(
                        "store_machine_workload: Data to add instance "
                        + str(inst_type)
                        + "  --> "
                        + str(array_monitoring_data)
                    )

            except:
                num_iterations = array_monitoring_data[4][0]
                array_monitoring_data[0][num_iterations] = backend_monitoring_data[ip]["php_response_time"]
                array_monitoring_data[1][num_iterations] = backend_monitoring_data[ip]["cpu_user"]
                array_monitoring_data[2][num_iterations] = backend_monitoring_data[ip]["php_request_rate"]
                array_monitoring_data[3][num_iterations] = backend_monitoring_data[ip]["cpu_system"]
                num_iterations = num_iterations + 1
                array_monitoring_data[4][0] = num_iterations

                self.machines[inst_type] = array_monitoring_data

                # Gather information about the maximum throughput of one instance type.
                self.calculate_max_instance_throughput(inst_type, array_monitoring_data)
                self.logger.debug(
                    "store_machine_workload: Data to add instance "
                    + str(inst_type)
                    + "  --> "
                    + str(array_monitoring_data)
                )

    def store_instance_workload(self, backend_nodes, backend_monitoring_data):
        inst_type_ip = {}

        for backend_node in sorted(backend_nodes):
            if len(backend_monitoring_data[backend_node.ip]["php_response_time"]) > 20:
                name = self.cost_controller.get_type_instance(backend_node.ip)
                inst_type_ip[name] = backend_node.ip

        for type, ip in inst_type_ip.iteritems():
            self.store_workload(type, ip, backend_monitoring_data)
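
# --- Illustrative sketch, not part of the original ConPaaS code. ---
# store_workload() above keeps one 5x6 buffer per instance type: rows 0-3 hold
# php_response_time, cpu_user, php_request_rate and cpu_system samples, and
# row 4, column 0 holds the next write index. The helper below mimics that
# update in isolation; max_iterations=6 and the sample values are assumptions
# made only for this demonstration.
def _demo_store_sample(buf, resp_time, cpu_user, req_rate, cpu_system,
                       max_iterations=6):
    idx = buf[4][0]
    if idx == max_iterations:  # wrap around once the buffer is full
        idx = 0
    buf[0][idx] = resp_time
    buf[1][idx] = cpu_user
    buf[2][idx] = req_rate
    buf[3][idx] = cpu_system
    buf[4][0] = idx + 1
    return buf

_demo_buf = [[0 for i in range(6)] for j in range(5)]
_demo_store_sample(_demo_buf, 250.0, 42.0, 3.1, 5.0)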
Exemplo n.º 11
0
class Monitoring_Controller:
    def __init__(self, logger, cost_controller, config_parser,
                 config_file_path, manager_host, manager_port, process_state,
                 ganglia_rrd_dir):
        self.cost_controller = cost_controller
        self.config_parser = config_parser
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.logger = logger
        self.process_state = process_state
        self.ganglia_rrd_dir = ganglia_rrd_dir
        self.last_collect_time = time()

        self.stat_utils = StatUtils()

        try:
            self.config_parser.read(config_file_path)
        except:
            print >> sys.stderr, 'Failed to read configuration file'
            sys.exit(1)

        #initialize a memcache client
        memcache_addr = config_parser.get('manager', 'MEMCACHE_ADDR')

        if memcache_addr == '':
            print >> sys.stderr, 'Failed to find memcache address in the config file'
            sys.exit(1)

        self.memcache = memcache.Client([memcache_addr])
        self.perf_info = ServicePerformance()
        self._performance_info_set(self.perf_info)

        self.monitoring_metrics_web = [
            'web_request_rate', 'web_response_time', 'cpu_user', 'boottime'
        ]
        self.monitoring_metrics_backend = [
            'php_request_rate', 'php_response_time', 'cpu_user', 'cpu_system',
            'cpu_num', 'mem_total', 'boottime'
        ]
        self.monitoring_metrics_proxy = ['web_request_rate_lb', 'web_response_time_lb', \
                                       'php_request_rate_lb', 'php_response_time_lb', 'cpu_user', 'boottime']

    def _performance_info_get(self):
        return self.memcache.get('performance_info')

    def _performance_info_set(self, perf_info):
        self.memcache.set('performance_info', perf_info)

    def nodes_info_update(self, killed_backends):
        #conpaas_init_ssl_ctx(self.certs_dir, 'manager')
        print('MANAGER %s' % self.manager_host)
        print('PORT %s' % self.manager_port)

        nodes = client.list_nodes(self.manager_host, self.manager_port)
        self.logger.debug('Got update info from manager')

        perf_info = self._performance_info_get()

        perf_info.reset_role_info()

        self.logger.debug('Updating nodes...')
        for node_id in nodes['proxy']:
            node = perf_info.serviceNodes.get(node_id)
            if node != None:
                node.registered_with_manager = True
                node.isRunningProxy = True
            else:
                perf_info.serviceNodes[node_id] = ServiceNodePerf(
                    node_id, '', True, False, False, self.process_state)

        for node_id in nodes['web']:
            node = perf_info.serviceNodes.get(node_id)
            if node != None:
                node.registered_with_manager = True
                node.isRunningWeb = True
            else:
                perf_info.serviceNodes[node_id] = ServiceNodePerf(
                    node_id, '', False, True, False, self.process_state)

        for node_id in nodes['backend']:
            node = perf_info.serviceNodes.get(node_id)
            if node != None:
                node.registered_with_manager = True
                node.isRunningBackend = True
            else:
                perf_info.serviceNodes[node_id] = ServiceNodePerf(
                    node_id, '', False, False, True, self.process_state)

        self.logger.info('Filtering backend nodes killed_backends : ' +
                         str(killed_backends) + ' ' +
                         str(perf_info.serviceNodes))
        for id, node in perf_info.serviceNodes.items():
            if node.ip == '':
                response = client.get_node_info(self.manager_host,
                                                self.manager_port, id)
                node.ip = response['serviceNode']['ip']
            if node.registered_with_manager == False:
                del perf_info.serviceNodes[id]
            ####FIXME TO FILTER REMOVE OF BACKENDS #####
            if id in killed_backends:
                self.logger.info('Filtered backend  with id: ' + str(id))
                try:
                    del perf_info.serviceNodes[id]
                except:
                    self.logger.warning(
                        'Backend already removed or not present in serviceNodes: '
                        + str(id))
            ###########################################
        self.logger.info('Filtered backend nodes killed_backends : ' +
                         str(killed_backends) + ' ' +
                         str(perf_info.serviceNodes))

        self._performance_info_set(perf_info)

        self.logger.info('Updating nodes information from ConPaaS manager...')
        self.logger.info('Updated service nodes: %s' %
                         str(perf_info.serviceNodes))

    def collect_monitoring_metric(self, node_ip, metric_name):
        timestamps = []
        param_values = []
        # Added this for EC2, where the RRD directory names in Ganglia are hosts and not IPs:
        ganglia_dir_name = ''

        if node_ip.find('amazonaws') > 0:  # already an EC2 public DNS name, use it directly
            ganglia_dir_name = node_ip
        else:  # plain IP address: try to resolve it to the Ganglia host name
            for ganglia_host in listdir(self.ganglia_rrd_dir):
                #self.logger.error('collect from ganglia host: ' + str(ganglia_host))
                if ganglia_host.find('Summary') > 0:
                    continue
                try:
                    hostname, array, array_ip = socket.gethostbyaddr(node_ip)
                except:
                    self.logger.warning(
                        'Could not resolve a hostname (possibly a private IP) for ip: '
                        + str(node_ip))
                    ganglia_dir_name = node_ip
                    break
                #self.logger.error('gethostbyaddr: ' + hostname)
                if ganglia_host == hostname:
                    ganglia_dir_name = ganglia_host
                    break

        rrd_file_name = self.ganglia_rrd_dir + ganglia_dir_name + '/' + metric_name + '.rrd'
        self.logger.debug('rrd_file_name: ' + str(rrd_file_name))
        #    logger.info('Searching in RRD file:' + rrd_file_name)
        if (not path.isfile(rrd_file_name)):
            self.logger.error('RRD file not found: ' + rrd_file_name)
            return []

        #logger.info('Getting monitoring info for node %s, parameter %s ...' % (node_ip, metric_name))


#    logger.info('last collect time: ' + str(int(self.last_collect_time)))
        collect_from = self.last_collect_time - (time() -
                                                 self.last_collect_time)
        #collect_from = self.last_collect_time
        proc = Popen(['rrdtool', 'fetch', '-s', str(int(collect_from)), '-r', '15', \
                      str(rrd_file_name), 'AVERAGE'], stdout=PIPE, stderr=PIPE, close_fds=True)
        stdout_req, stderr_req = proc.communicate()

        lines = stdout_req.splitlines()
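        # Each data line produced by 'rrdtool fetch' has the form
        # "<timestamp>: <value>"; the header row (which carries the DS name,
        # e.g. 'sum') and short lines are skipped, and 'nan' samples are
        # recorded as -1 so that gaps in the data stay visible downstream.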
        for line in lines:
            #logger.debug(line)
            tokens = line.split()
            if (line.find('sum') >= 0 or len(tokens) < 2):
                continue

            timestamps.append(int(tokens[0].replace(':', '')))

            if (tokens[1].find('nan') < 0):
                param_values.append(float(tokens[1]))
            else:
                param_values.append(-1)

        ## Cleaning the memory allocated by subprocess.Popen()
        try:
            proc.terminate()
        except OSError:
            #  logger.critical("Cannot kill the subprocess.popen rrdtool")
            # can't kill a dead proc
            pass

        #logger.debug('timestamps: ' + str(timestamps))
        #logger.debug('param values: ' + str(param_values))
        return [timestamps, param_values]

    def init_collect_monitoring_data(self):
        self.perf_info = self._performance_info_get()

    def collect_monitoring_data(self):

        web_monitoring_data = {}
        backend_monitoring_data = {}
        proxy_monitoring_data = {}

        for web_node in self.perf_info.getWebServiceNodes():
            self.logger.info('Getting web monitoring info for %s ...' %
                             web_node.ip)
            #if web_node.ip not in web_monitoring_data:
            web_monitoring_data[web_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            self.logger.info('Getting web monitoring info 1')
            for it in range(len(self.monitoring_metrics_web)):
                self.logger.info('Getting web monitoring info 2')
                ret = self.collect_monitoring_metric(
                    web_node.ip, self.monitoring_metrics_web[it])
                self.logger.info('Getting web monitoring info 3')
                if len(ret) == 0:  # monitoring data was not found
                    self.logger.info('Getting web monitoring info 4')
                    return False
                if 'timestamps' not in web_monitoring_data[web_node.ip]:
                    web_monitoring_data[web_node.ip]['timestamps'] = ret[0]
                web_monitoring_data[web_node.ip][
                    self.monitoring_metrics_web[it]] = ret[1]

                if self.monitoring_metrics_web[
                        it] == 'cpu_num' and web_monitoring_data[web_node.ip][
                            self.monitoring_metrics_web[it]][0] != -1:
                    cpu_num = web_monitoring_data[web_node.ip][
                        self.monitoring_metrics_web[it]][0]

                if self.monitoring_metrics_web[
                        it] == 'mem_total' and web_monitoring_data[
                            web_node.ip][
                                self.monitoring_metrics_web[it]][0] != -1:
                    mem_total = str(web_monitoring_data[web_node.ip][
                        self.monitoring_metrics_web[it]][0])

                if self.monitoring_metrics_web[
                        it] == 'boottime' and web_monitoring_data[web_node.ip][
                            self.monitoring_metrics_web[it]][0] != -1:
                    self.cost_controller.update_vm_usage(
                        web_node.ip, web_monitoring_data[web_node.ip][
                            self.monitoring_metrics_web[it]][0],
                        self.cost_controller.instance_type_detector(
                            cpu_num, mem_total))

        for backend_node in self.perf_info.getBackendServiceNodes():
            self.logger.info('Getting backend monitoring info for %s ...' %
                             backend_node.ip)
            backend_monitoring_data[backend_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            """ 
          It iterates over the array to get the metrics in the same order, they defined added. 
          It allows to detect the type of instance by analyzing the cpu, mem_total.
      """
            for it in range(len(self.monitoring_metrics_backend)):
                ret = self.collect_monitoring_metric(
                    backend_node.ip, self.monitoring_metrics_backend[it])
                if len(ret) == 0:  # monitoring data was not found
                    return False
                if 'timestamps' not in backend_monitoring_data[
                        backend_node.ip]:
                    backend_monitoring_data[
                        backend_node.ip]['timestamps'] = ret[0]
                backend_monitoring_data[backend_node.ip][
                    self.monitoring_metrics_backend[it]] = ret[1]

                self.logger.info('There is a metric name: ' +
                                 str(self.monitoring_metrics_backend[it]))
                if self.monitoring_metrics_backend[it] == 'cpu_num':
                    if backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info(
                            'There is a metric cpu_num with content: ' +
                            str(backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0]))
                        cpu_num = str(backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0])
                    else:
                        ## Clean negative and wrong values from the monitoring data
                        self.logger.info(
                            'The cpu_num metric has a value less than or equal to zero'
                        )
                        for value in backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info(
                                    'There is a metric cpu_num with content: '
                                    + str(value))
                                cpu_num = value
                                break

                if self.monitoring_metrics_backend[it] == 'mem_total':
                    if backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info(
                            'There is a metric mem_total with content: ' +
                            str(backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0]))
                        mem_total = str(
                            backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0])
                    else:
                        ## Clean negative and wrong values from the monitoring data
                        self.logger.info(
                            'The mem_total metric has a value less than or equal to zero'
                        )
                        for value in backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info(
                                    'There is a metric mem_total with content: '
                                    + str(value))
                                mem_total = value
                                break

                if self.monitoring_metrics_backend[it] == 'boottime':
                    if backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info(
                            'There is a metric boottime with content: ' +
                            str(backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0]))
                        self.cost_controller.update_vm_usage(
                            backend_node.ip,
                            backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0],
                            self.cost_controller.instance_type_detector(
                                cpu_num, mem_total))
                    else:
                        ## Clean negative and wrong values from the monitoring data
                        boottime = backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0]
                        self.logger.info(
                            'The boottime metric has a value less than or equal to zero'
                        )
                        for value in backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info(
                                    'There is a metric boottime with content: '
                                    + str(value))
                                # Update with the first positive boottime sample,
                                # not the initial non-positive one.
                                boottime = value
                                self.cost_controller.update_vm_usage(
                                    backend_node.ip, float(boottime),
                                    self.cost_controller.
                                    instance_type_detector(cpu_num, mem_total))
                                break

        for proxy_node in self.perf_info.getProxyServiceNodes():
            self.logger.info('Getting proxy monitoring info for %s ...' %
                             proxy_node.ip)
            proxy_monitoring_data[proxy_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            for it in range(len(self.monitoring_metrics_proxy)):
                ret = self.collect_monitoring_metric(
                    proxy_node.ip, self.monitoring_metrics_proxy[it])
                if len(ret) == 0:  # monitoring data was not found
                    return False
                if 'timestamps' not in proxy_monitoring_data[proxy_node.ip]:
                    proxy_monitoring_data[proxy_node.ip]['timestamps'] = ret[0]
                proxy_monitoring_data[proxy_node.ip][
                    self.monitoring_metrics_proxy[it]] = ret[1]

                if self.monitoring_metrics_proxy[
                        it] == 'cpu_num' and proxy_monitoring_data[
                            proxy_node.ip][
                                self.monitoring_metrics_proxy[it]][0] != -1:
                    cpu_num = proxy_monitoring_data[proxy_node.ip][
                        self.monitoring_metrics_proxy[it]][0]

                if self.monitoring_metrics_proxy[
                        it] == 'mem_total' and proxy_monitoring_data[
                            proxy_node.ip][
                                self.monitoring_metrics_proxy[it]][0] != -1:
                    mem_total = str(proxy_monitoring_data[proxy_node.ip][
                        self.monitoring_metrics_proxy[it]][0])

                if self.monitoring_metrics_proxy[
                        it] == 'boottime' and proxy_monitoring_data[
                            proxy_node.ip][
                                self.monitoring_metrics_proxy[it]][0] != -1:
                    self.cost_controller.update_vm_usage(
                        proxy_node.ip, proxy_monitoring_data[proxy_node.ip][
                            self.monitoring_metrics_proxy[it]][0],
                        self.cost_controller.instance_type_detector(
                            cpu_num, mem_total))

            # Filter the collected proxy metrics once per node, after all
            # metrics have been fetched (as in collect_monitoring_data_proxy).
            proxy_monitoring_data[
                proxy_node.ip] = self.stat_utils.filter_monitoring_data(
                    proxy_monitoring_data[proxy_node.ip],
                    self.monitoring_metrics_proxy)

        print proxy_monitoring_data
        print web_monitoring_data
        print backend_monitoring_data
        self.last_collect_time = time()
        print "Done getting monitoring data..."
        return True

    def collect_monitoring_data_web(self):

        web_monitoring_data = {}

        for web_node in self.perf_info.getWebServiceNodes():
            self.logger.info('Getting web monitoring info for %s ...' %
                             web_node.ip)
            #if web_node.ip not in web_monitoring_data:
            web_monitoring_data[web_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            for it in range(len(self.monitoring_metrics_web)):
                ret = self.collect_monitoring_metric(
                    web_node.ip, self.monitoring_metrics_web[it])
                if len(ret) == 0:  # monitoring data was not found
                    return {}

                if 'timestamps' not in web_monitoring_data[web_node.ip]:
                    web_monitoring_data[web_node.ip]['timestamps'] = ret[0]
                web_monitoring_data[web_node.ip][
                    self.monitoring_metrics_web[it]] = ret[1]

                if self.monitoring_metrics_web[
                        it] == 'cpu_num' and web_monitoring_data[web_node.ip][
                            self.monitoring_metrics_web[it]][0] != -1:
                    cpu_num = web_monitoring_data[web_node.ip][
                        self.monitoring_metrics_web[it]][0]

                if self.monitoring_metrics_web[
                        it] == 'mem_total' and web_monitoring_data[
                            web_node.ip][
                                self.monitoring_metrics_web[it]][0] != -1:
                    mem_total = str(web_monitoring_data[web_node.ip][
                        self.monitoring_metrics_web[it]][0])

                if self.monitoring_metrics_web[it] == 'boottime':
                    self.cost_controller.update_vm_usage(
                        web_node.ip, web_monitoring_data[web_node.ip][
                            self.monitoring_metrics_web[it]][0],
                        self.cost_controller.instance_type_detector(
                            cpu_num, mem_total))

        return web_monitoring_data

    def collect_monitoring_data_backend(self):
        backend_monitoring_data = {}

        for backend_node in self.perf_info.getBackendServiceNodes():
            self.logger.info('Getting backend monitoring info for %s ...' %
                             backend_node.ip)
            backend_monitoring_data[backend_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            """ 
          It iterates over the array to get the metrics in the same order, they defined added. 
          It allows to detect the type of instance by analyzing the cpu, mem_total.
      """
            for it in range(len(self.monitoring_metrics_backend)):
                ret = self.collect_monitoring_metric(
                    backend_node.ip, self.monitoring_metrics_backend[it])
                if len(ret) == 0:  # monitoring data was not found
                    return {}
                if 'timestamps' not in backend_monitoring_data[
                        backend_node.ip]:
                    backend_monitoring_data[
                        backend_node.ip]['timestamps'] = ret[0]
                backend_monitoring_data[backend_node.ip][
                    self.monitoring_metrics_backend[it]] = ret[1]

                #self.logger.info('There is a metric name: '+str(self.monitoring_metrics_backend[it]))
                if self.monitoring_metrics_backend[it] == 'cpu_num':
                    if backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info(
                            'There is a metric cpu_num with content: ' +
                            str(backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0]))
                        cpu_num = backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0]
                    else:
                        ## Clean negative and wrong values from the monitoring data
                        self.logger.info(
                            'The cpu_num metric has a value less than or equal to zero'
                        )
                        for value in backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info(
                                    'There is a metric cpu_num with content: '
                                    + str(value))
                                cpu_num = value
                                break

                if self.monitoring_metrics_backend[it] == 'mem_total':
                    if backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info(
                            'There is a metric mem_total with content: ' +
                            str(backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0]))
                        mem_total = backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0]
                    else:
                        ## Clean negative and wrong values from the monitoring data
                        self.logger.info(
                            'The mem_total metric has a value less than or equal to zero'
                        )
                        for value in backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info(
                                    'There is a metric mem_total with content: '
                                    + str(value))
                                mem_total = value
                                break

                if self.monitoring_metrics_backend[it] == 'boottime':
                    if backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0] > 0:
                        self.logger.info(
                            'There is a metric boottime with content: ' +
                            str(backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0]))
                        self.cost_controller.update_vm_usage(
                            backend_node.ip,
                            backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]][0],
                            self.cost_controller.instance_type_detector(
                                cpu_num, mem_total))
                    else:
                        ## Clean negative and wrong values from the monitoring data
                        boottime = backend_monitoring_data[backend_node.ip][
                            self.monitoring_metrics_backend[it]][0]
                        self.logger.info(
                            'The boottime metric has a value less than or equal to zero'
                        )
                        for value in backend_monitoring_data[backend_node.ip][
                                self.monitoring_metrics_backend[it]]:
                            if value > 0:
                                self.logger.info(
                                    'There is a metric boottime with content: '
                                    + str(value))
                                boottime = value
                                break
                        self.cost_controller.update_vm_usage(
                            backend_node.ip, float(boottime),
                            self.cost_controller.instance_type_detector(
                                cpu_num, mem_total))

        return backend_monitoring_data

    def collect_monitoring_data_proxy(self):
        self.perf_info = self._performance_info_get()
        proxy_monitoring_data = {}

        for proxy_node in self.perf_info.getProxyServiceNodes():
            self.logger.info('Getting proxy monitoring info for %s ...' %
                             proxy_node.ip)
            proxy_monitoring_data[proxy_node.ip] = {}
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            for it in range(len(self.monitoring_metrics_proxy)):
                ret = self.collect_monitoring_metric(
                    proxy_node.ip, self.monitoring_metrics_proxy[it])
                if len(ret) == 0:  # monitoring data was not found
                    return {}
                if 'timestamps' not in proxy_monitoring_data[proxy_node.ip]:
                    proxy_monitoring_data[proxy_node.ip]['timestamps'] = ret[0]
                proxy_monitoring_data[proxy_node.ip][
                    self.monitoring_metrics_proxy[it]] = ret[1]

                if self.monitoring_metrics_proxy[
                        it] == 'cpu_num' and proxy_monitoring_data[
                            proxy_node.ip][
                                self.monitoring_metrics_proxy[it]][0] != -1:
                    cpu_num = proxy_monitoring_data[proxy_node.ip][
                        self.monitoring_metrics_proxy[it]][0]

                if self.monitoring_metrics_proxy[
                        it] == 'mem_total' and proxy_monitoring_data[
                            proxy_node.ip][
                                self.monitoring_metrics_proxy[it]][0] != -1:
                    mem_total = str(proxy_monitoring_data[proxy_node.ip][
                        self.monitoring_metrics_proxy[it]][0])

                if self.monitoring_metrics_proxy[it] == 'boottime':
                    self.cost_controller.update_vm_usage(
                        proxy_node.ip, proxy_monitoring_data[proxy_node.ip][
                            self.monitoring_metrics_proxy[it]][0],
                        self.cost_controller.instance_type_detector(
                            cpu_num, mem_total))

            proxy_monitoring_data[
                proxy_node.ip] = self.stat_utils.filter_monitoring_data(
                    proxy_monitoring_data[proxy_node.ip],
                    self.monitoring_metrics_proxy)

        self.last_collect_time = time()

        return proxy_monitoring_data
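
# --- Illustrative sketch, not part of the original ConPaaS code. ---
# The collect_monitoring_data_*() methods above return, per node IP, a dict
# with a 'timestamps' list plus one value list per metric, all of the same
# length; missing rrdtool samples appear as -1. The IP and values below are
# invented purely to show the shape and one naive way to consume it.
_demo_backend_data = {
    '10.0.0.5': {
        'timestamps': [1000, 1015, 1030],
        'php_response_time': [220.0, -1, 310.0],
        'php_request_rate': [2.5, 2.8, -1],
    },
}

def _demo_average(values):
    # Ignore the -1 placeholders that mark missing samples.
    valid = [v for v in values if v >= 0]
    return sum(valid) / len(valid) if valid else 0

print(_demo_average(_demo_backend_data['10.0.0.5']['php_response_time']))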
Exemplo n.º 12
0
class ProvisioningManager:
  """
  The ProvisioningManager takes decisions about adding and removing nodes from the service.
  """
  
## FIXME: CHANGED TO BE ADAPTED TO QCOW2 BOOTING TIME. ###
  time_between_changes = TIME_BTW_SCALING_ACTIONS
  time_between_scaling_predictions = TIME_BTW_SCALING_PREDICTIONS
##########################################################
  

  ganglia_rrd_dir = '/var/lib/ganglia/rrds/conpaas/'
  
  def __init__(self, config_parser):
   try: 
    self.slo = 700
    
    self.weight_slow_violation = WEIGHT_SLO_VIOLATION
    
    self.web_monitoring_data = {}
    self.backend_monitoring_data = {}
    self.proxy_monitoring_data = {}
    
    
    self.last_change_time = 0
    self.last_scaling_operation = 0
    self.calculate_scaling_error = False
    
    self.predictor = Prediction_Models(logger)
    self.trigger_prediction = 0
    
    ## FIXME: Size is 5 due to an excessive number of items to be predicted, please repair this part.
    ##, as we want to store the monitoring data during 60min, considering 5min between iterations
    self.predictorScaler_cpu_usage_1h = Queue( [] , 5)
    self.predictorScaler_req_rate_1h = Queue( [] , 5)
    
    self.forecast_model_selected = 0 
    self.forecast_resp_predicted = 0
    self.forecast_list = {}
    
    self.pool_predictors = ThreadPool(processes=5)
    
    self.killed_backends = []

    self.trigger_weight_balancing = False
    self.autoscaling_running = True
    
    self.iaas_driver = config_parser.get('iaas', 'DRIVER').upper()
    
    self.cost_controller = Cost_Controller(logger, self.iaas_driver)
    
    ## Parameters to establish a preference for selecting the most appropriate resource.
    self.optimal_scaling = Strategy_Finder(logger, self.iaas_driver, self.cost_controller, 'low', True, self.weight_slow_violation) 
  
    self.stat_utils = StatUtils()
    
    self.monitoring =  Monitoring_Controller( logger, self.cost_controller, config_parser, '/root/config.cfg', MANAGER_HOST, MANAGER_PORT, PS_RUNNING, self.ganglia_rrd_dir)

    self.dyc_load_balancer = Dynamic_Load_Balancer(logger, MANAGER_HOST, MANAGER_PORT, client)
    
    self.profiler = Profiler(logger, self.slo, self.cost_controller, MAX_CPU_USAGE, MIN_CPU_USAGE, UPPER_THRS_SLO, LOWER_THRS_SLO )
 
   except Exception as e:
      logger.critical('Scaler: Error when initializing the ProvisioningManager in scaler.py \n' + str(e))
   

  def log_monitoring_data(self):
    logger.debug('**** Web monitoring data: *****\n' + str(self.web_monitoring_data))
    logger.debug('**** Backend monitoring data: *****\n' + str(self.backend_monitoring_data))
    logger.debug('**** Proxy monitoring data: *****\n' + str(self.proxy_monitoring_data))
              
     
    
  def prediction_evaluation_proxy(self, proxy_ip, php_resp_data):
    
    try:        
            forecast_list_aux = {}
            php_resp_filtered = []
            for i in php_resp_data:
                if i > 0:
                    php_resp_filtered.append(i)
            
            logger.debug("PhP response time list data: "+str(php_resp_filtered))           
           
            async_result_ar =  self.pool_predictors.apply_async(self.predictor.auto_regression, (php_resp_filtered,30))
            async_result_lr =  self.pool_predictors.apply_async(self.predictor.linear_regression, (php_resp_filtered,30))
            async_result_exp_smoothing =  self.pool_predictors.apply_async(self.predictor.exponential_smoothing, (php_resp_filtered,12))
            async_result_var =  self.pool_predictors.apply_async(self.predictor.vector_auto_regression, (php_resp_filtered, php_resp_filtered, 30))
          #  async_result_arma =  self.pool_predictors.apply_async(self.predictor.arma, (php_resp_filtered,30))
                                           
            forecast_list_aux[1] = async_result_lr.get()
            forecast_list_aux[2] = async_result_exp_smoothing.get()
            forecast_list_aux[3] = async_result_var.get()
            forecast_list_aux[0] = async_result_ar.get()
           # forecast_list_aux[0] = async_result_arma.get()
            
            try:
               logger.debug("Getting the forecast response time for the best model in the previous iteration "+str(self.forecast_model_selected)) 
               weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[self.forecast_model_selected])
               
               if weight_avg_predictions > 0:
                   self.forecast_resp_predicted  = weight_avg_predictions      
               logger.debug("Prediction value for model "+str(self.forecast_model_selected)+"--  Prediction php_resp_time: "+str(self.forecast_resp_predicted))
      
            except Exception as e:  
                logger.warning("Warning trying to predict a future value for the model." + str(e))
               
            self.forecast_list[proxy_ip] = forecast_list_aux
    except Exception as e:
        logger.error("Error trying to predict the future response_time values. "+ str(e))\
        

        
  def store_predictorScaler_workload(self, cpu_usage, req_rate):
      list_cpu = []
      list_req_rate = []
      for cpu, req in itertools.izip(cpu_usage, req_rate):
          if cpu > 10 and req > 0:
              list_cpu.append(cpu)
              list_req_rate.append(req)
      logger.debug("store_predictorScaler_workload: Filtered cpu "+str(list_cpu))
      logger.debug("store_predictorScaler_workload: Filtered req_rate "+str(list_req_rate))        
      self.predictorScaler_cpu_usage_1h.push(list_cpu)
      self.predictorScaler_req_rate_1h.push(list_req_rate)
     ## logger.debug("store_proxy_workload: Proxy historic data "+ str(self.historic_proxy_1h.q))
    
  def calculate_error_prediction(self, php_resp_data, ip):
    forecast_list_aux = {}
    min_error_prediction = 1000000
    forecast_model = 0
    #forecast_resp = 0
    try:    
             
           logger.debug("calculate_error_prediction: with ip: "+str(ip))
        #   php_resp_data = self.proxy_monitoring_data[ip]['php_response_time_lb']
           #php_resp_data = [x for x in php_resp_data[0:30]]         
           forecast_list_aux = self.forecast_list[ip] 
           logger.debug("calculate_error_prediction: once obtained the forecast_list. ")
           weight_avg_current = self.stat_utils.compute_weight_average_response(php_resp_data, self.slo, self.weight_slow_violation)
       #    try:
        #       weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[0])
         #      prediction_error = math.fabs( weight_avg_current - weight_avg_predictions )
           
          #     if min_error_prediction > prediction_error and prediction_error > 0:
           #        forecast_model = 0
            #       min_error_prediction = prediction_error
                  # forecast_resp  = weight_avg_predictions      
             #  logger.debug("Prediction error ARMA with php_resp_time: "+str(weight_avg_current)+" --  Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error))
      
           #except Exception as e:  
            #    logger.warning("Warning trying to predict the error estimate for ARMA." + str(e))
           try:
                  
               weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[0])
               prediction_error = math.fabs( weight_avg_current - weight_avg_predictions )
           
               if min_error_prediction > prediction_error and prediction_error > 0:
                    forecast_model = 0
                    min_error_prediction = prediction_error
                    #forecast_resp  = weight_avg_predictions
               logger.debug("Prediction error AR with php_resp_time: "+str(weight_avg_current)+" --  Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error))
           
           except Exception as e:  
                logger.warning("Warning trying to predict the error estimate for AR." + str(e))
       
           try: 
               weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[1])
               prediction_error = math.fabs( weight_avg_current - weight_avg_predictions )
           
               if min_error_prediction > prediction_error and prediction_error > 0:
                   forecast_model = 1
                   min_error_prediction = prediction_error
                   #forecast_resp  = weight_avg_predictions
               logger.debug("Prediction error LR with php_resp_time: "+str(weight_avg_current)+" --  Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error))
           
           except Exception as e:  
                logger.warning("Warning trying to predict the error estimate for LR." + str(e))
       
           try:
               
              # php_resp_data = [x for x in php_resp_data[0:12]]
               weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[2])
               prediction_error = math.fabs( weight_avg_current - weight_avg_predictions )
               
               if min_error_prediction > prediction_error and prediction_error > 0:
                   forecast_model = 2
                   min_error_prediction = prediction_error
                   #forecast_resp  = weight_avg_predictions
               logger.debug("Prediction error EXP. SMOOTHING with php_resp_time: "+str(weight_avg_current)+" --  Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error))
           except Exception as e:  
                logger.warning("Warning trying to predict the error estimate for EXP. SMOOTHING." + str(e))
                
           try:
               
              # php_resp_data = [x for x in php_resp_data[0:12]]
               weight_avg_predictions = self.stat_utils.compute_weight_average(forecast_list_aux[3])
               prediction_error = math.fabs( weight_avg_current - weight_avg_predictions )
               
               if min_error_prediction > prediction_error and prediction_error > 0:
                   forecast_model = 3
                   min_error_prediction = prediction_error
                   #forecast_resp  = weight_avg_predictions
               logger.debug("Prediction error VAR with php_resp_time: "+str(weight_avg_current)+" --  Prediction php_resp_time: "+str(weight_avg_predictions)+" Error: "+str(prediction_error))
           except Exception as e:  
                logger.warning("Warning trying to predict the error estimate for VAR." + str(e))
       
           self.forecast_model_selected = forecast_model
    
    except Exception as ex:
        logger.error("Error trying to predict the error estimate for the different models. " + str(ex))

  def obtain_prediciton_decision(self, greater, slo):
    logger.info("Obtain_prediciton_decision model: " + str(self.forecast_model_selected) + " Php resp: "+str(self.forecast_resp_predicted) + " SLO: "+str(slo))
    if self.forecast_resp_predicted == 0:
        logger.critical("Forecast_resp_predicted: ERROR all the prediction models failed predicting a future value. ")
        return True
    
    if (greater and  self.forecast_resp_predicted > slo):
        logger.info("Forecast_resp_predicted is greater than slo. ")
        return True
    elif not greater and self.forecast_resp_predicted < slo:
        logger.info("Forecast_resp_predicted is lower than slo. ")
        return True
    else:
        logger.info("Forecast_resp_predicted don't do anything. ")
        return False
    
  def calculate_strategy(self, avg_cpu_backends_now, backend_nodes, req_rate_backends, cpu_usage_backends):
      logger.info("calculate_strategy: req_rate_backends  "+str(req_rate_backends)+" cpu: "+str(cpu_usage_backends))
      max_performance_throughtput = MAX_CPU_USAGE
      min_performance_throughtput = MIN_CPU_USAGE
      
      ## In cloud infrastructures, VMs get close to an SLO violation with a CPU usage well below 75%.
      if avg_cpu_backends_now < MAX_CPU_USAGE:
          max_performance_throughtput = avg_cpu_backends_now
      
      capacity_inst_type = {}
      combination_machines = []

      ## Initialize the maximum capacity for each type of instance based on the monitoring data ###
      for name, cost in self.cost_controller.get_instance_prices()[self.iaas_driver].iteritems():
          capacity_max_inst_type = self.profiler.calculate_ideal_throughput(name)
          logger.info("Calculate max capacity instance "+str(name)+" : "+str(capacity_max_inst_type))
          capacity_inst_type[name] = capacity_max_inst_type
      
      for backend_node in backend_nodes:
          inst_type =  self.cost_controller.get_type_instance(backend_node.ip) 
          combination_machines.append(inst_type)

      list_req_rate_data = []
      for value in self.predictorScaler_req_rate_1h.q:
            list_req_rate_data.extend(value)
      list_cpu_data = []
      for value in self.predictorScaler_cpu_usage_1h.q:
            list_cpu_data.extend(value)        
       
      logger.info("calculate_strategy: list_cpu_data  "+str(list_cpu_data))
      strategy = self.optimal_scaling.calculate_adaptive_scaling(backend_nodes, combination_machines, cpu_usage_backends, req_rate_backends, max_performance_throughtput, min_performance_throughtput, capacity_inst_type, list_cpu_data, list_req_rate_data)
             
      logger.info("calculate_strategy: Final strategy: "+str(strategy))
       
      return strategy        
   
  def consolidate_vmes(self, nodes):
       logger.info("consolidate_vmes: Initializing consolidate ")
        
       ## Check whether the remaining nodes can absorb this VM's request rate; if so, release it.
       for node in nodes:
           req_rate = self.stat_utils.compute_weight_average(self.stat_utils.filter_cpu_data(self.backend_monitoring_data[node.ip]['php_request_rate']))
           inst_type = self.cost_controller.get_type_instance(node.ip)
           try:
             compute_units = self.optimal_scaling.get_compute_units(inst_type)
           
             ## Verify if we can remove the machine
             if self.cost_controller.cost_shutdown_constraint(node.ip):
               
               for node_aux in nodes:
                    if node_aux.ip != node.ip:
                       inst_type_check = self.cost_controller.get_type_instance(node_aux.ip)
                       try:
                           (cpu_inst, req_rate_inst) = self.profiler.get_vm_type_max_throughput(inst_type_check)
                           if req_rate_inst > 0:
                                req_rate_check = self.stat_utils.compute_weight_average(self.stat_utils.filter_cpu_data(self.backend_monitoring_data[node_aux.ip]['php_request_rate']))
                                if ( (req_rate_inst - req_rate_check) >= req_rate):
                                    return node.ip
                            ## No maximum data, so let's check other possibilities
                           else:
                                if self.optimal_scaling.get_compute_units(inst_type_check) > compute_units:
                                    return node.ip
                       except:
                                if self.optimal_scaling.get_compute_units(inst_type_check) > compute_units:
                                    return node.ip
       
           except Exception:
                logger.critical("consolidate_vmes: ERROR when trying to remove a vm with ip: "+str(node.ip))
      
       ## There is not any possible vm to be released...
       return ''
 
    
  def decide_actions(self):
    n_web_to_add = n_web_to_remove = 0
    n_backend_to_add = n_backend_to_remove = 0
    
    avg_web_req_rate_lb = avg_web_resp_time_lb = 0
    avg_backend_req_rate_lb = avg_backend_resp_time_lb = 0
    avg_cpu_user_backend = avg_cpu_web = 0
    #backends_req_rate = 0
    
    ret = {'add_web_nodes': 0, 'remove_web_nodes': 0, 'add_backend_nodes': 0, 'remove_backend_nodes': 0, 'vm_backend_instance': 'small', 'vm_web_instance': 'small', 'node_ip_remove':''}
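    # 'ret' is the scaling plan: how many web/backend nodes to add or remove,
    # which instance type to use when adding, and which node IP to remove.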
    
    perf_info = self.monitoring._performance_info_get()
    web_nodes = perf_info.getWebServiceNodes()
    backend_nodes = perf_info.getBackendServiceNodes()
    proxy_nodes = perf_info.getProxyServiceNodes()
    
    self.profiler.store_instance_workload(backend_nodes, self.backend_monitoring_data)
    
    current_time = time()
    if (current_time - self.last_change_time < self.time_between_changes):
      self.trigger_weight_balancing = True  
      logger.info('Configuration was recently updated, not making any decisions for now...')
      return ret    
    
    # For the moment we assume only 1 proxy node
    for proxy_node in proxy_nodes:
      if ( self.trigger_prediction == 1):
          self.calculate_error_prediction(self.proxy_monitoring_data[proxy_node.ip]['php_response_time_lb'],proxy_node.ip)
          self.trigger_prediction = 0
      
        
      avg_web_resp_time_lb =  self.stat_utils.compute_weight_average_response(self.proxy_monitoring_data[proxy_node.ip]['web_response_time_lb'], self.slo, self.weight_slow_violation)
      avg_web_req_rate_lb =  self.stat_utils.compute_weight_average(self.proxy_monitoring_data[proxy_node.ip]['web_request_rate_lb'])
      logger.debug('Found average value for proxy web request rate: %s web response time: %s' \
                   % (str(avg_web_req_rate_lb), str(avg_web_resp_time_lb)))
      
      if avg_web_resp_time_lb > UPPER_THRS_SLO * self.slo:
        n_web_to_add = 1
        
      if avg_web_resp_time_lb < LOWER_THRS_SLO * self.slo:
        n_web_to_remove = 1
     
      
      avg_backend_req_rate_lb = self.stat_utils.compute_weight_average(self.proxy_monitoring_data[proxy_node.ip]['php_request_rate_lb'])
      #proxy_backends_filtered_data = self.stat_utils.filter_response_data(self.proxy_monitoring_data[proxy_node.ip]['php_response_time_lb'])
      proxy_backends_filtered_data = self.proxy_monitoring_data[proxy_node.ip]['php_response_time_lb']
      avg_backend_resp_time_lb = self.stat_utils.compute_weight_average_response(proxy_backends_filtered_data, self.slo, self.weight_slow_violation ) 
      logger.debug('Found average value for proxy backend request rate: %s backend response time: %s' \
                    % (str(avg_backend_req_rate_lb), str(avg_backend_resp_time_lb)))
      
      
      ##### TRIGGER PREDICTION EVALUATION #######
      if (avg_backend_resp_time_lb > UPPER_THRS_PREDICTION * self.slo ):
          self.trigger_prediction = 1
          self.prediction_evaluation_proxy(proxy_node.ip, proxy_backends_filtered_data)
          
      if (avg_backend_resp_time_lb < LOWER_THRS_PREDICTION * self.slo ):
          self.trigger_prediction = 1
          self.prediction_evaluation_proxy(proxy_node.ip, proxy_backends_filtered_data)
      ############################################
      
      if avg_backend_resp_time_lb > UPPER_THRS_SLO * self.slo and self.obtain_prediciton_decision( True, UPPER_THRS_SLO * self.slo):
        n_backend_to_add = 1
           
      if avg_backend_resp_time_lb < 10 or (avg_backend_resp_time_lb < LOWER_THRS_SLO * self.slo and self.obtain_prediciton_decision( False, LOWER_THRS_SLO * self.slo)):
        n_backend_to_remove = 1
        
    #### CPU AVERAGE VERIFICATION #####         
    for web_node in web_nodes:
      avg_cpu_web_node = self.stat_utils.compute_weight_average(self.web_monitoring_data[web_node.ip]['cpu_user'])
      logger.info('Average CPU usage per Web with IP '+web_node.ip+' '+str(avg_cpu_web_node))
      avg_cpu_web += avg_cpu_web_node
    
    avg_cpu_web = float(avg_cpu_web) / len(web_nodes)
    
    req_rate_backends_list = []
    cpu_backends_list = []
    sum_php_req_rate_backends = 0
    
    for backend_node in backend_nodes:
      cpu_backends = self.stat_utils.filter_cpu_data(self.backend_monitoring_data[backend_node.ip]['cpu_user'])
      avg_cpu_user_node = self.stat_utils.compute_weight_average(cpu_backends)
      req_rate_backends = self.stat_utils.filter_cpu_data(self.backend_monitoring_data[backend_node.ip]['php_request_rate'])
      php_req_rate_node = self.stat_utils.compute_weight_average(req_rate_backends)
      sum_php_req_rate_backends += php_req_rate_node
      
      logger.info('Average CPU usage per Backend with IP '+backend_node.ip+' '+str(avg_cpu_user_node)+ ' Req_rate:' +str(php_req_rate_node))
      if avg_cpu_user_node > MAX_CPU_USAGE:
          self.trigger_weight_balancing = True
      avg_cpu_user_backend += avg_cpu_user_node  
      
      if len(req_rate_backends_list) ==0:
          req_rate_backends_list = req_rate_backends
      else: 
          req_rate_backends_list =  [(req_rate_list_a + req_rate_list_b) for req_rate_list_a, req_rate_list_b in itertools.izip_longest(req_rate_backends_list, req_rate_backends, fillvalue=0)]
      
      if len(cpu_backends_list) == 0:
          cpu_backends_list = cpu_backends
      else: 
          cpu_backends_list =  [(cpu_list_a + cpu_list_b) / 2 for cpu_list_a, cpu_list_b in itertools.izip_longest(cpu_backends_list, cpu_backends, fillvalue=0)]    
       
    avg_cpu_user_backend = float(avg_cpu_user_backend) / len(backend_nodes)
    
    ### Check the cpu usage to add or remove backends... As shown in the plots, there is a correlation between cpu usage and sla violations. ###
    if (avg_cpu_user_backend > MAX_CPU_USAGE):
        n_backend_to_add = 1
        n_backend_to_remove = 0    
    elif (len(backend_nodes) > 1)  and (avg_cpu_user_backend < 35) and (n_backend_to_add == 0):
        n_backend_to_remove = 1    
        n_backend_to_add = 0
    
    if (avg_cpu_web > MAX_CPU_USAGE) and (len(web_nodes) < MIN_NUM_BACKENDS + 1):
       n_web_to_add = 1
       n_web_to_remove = 0
    elif (len(web_nodes) > 1) and (avg_cpu_web < MIN_CPU_USAGE) and (n_web_to_add == 0):
        n_web_to_remove = 1
        n_web_to_add = 0
    
    logger.info('Total average CPU usage (user) backend: %f' % avg_cpu_user_backend)
    logger.info('Total average CPU usage (user) web: %f' % avg_cpu_web)
    
    self.store_predictorScaler_workload(cpu_backends_list, req_rate_backends_list)
    
    #######################################################################   
    
    if (current_time - self.last_scaling_operation >= self.time_between_scaling_predictions and self.calculate_scaling_error):
      logger.info('ProvisioningV3_proxy: Calculating the prediction error of our last scaling action...')
      list_data_cpu = []
      list_data_req_rate = []
      for value in self.predictorScaler_cpu_usage_1h.q:
            list_data_cpu.extend(value)

      for value in self.predictorScaler_req_rate_1h.q:
            list_data_req_rate.extend(value)  
            
      self.optimal_scaling.calculate_error_prediction_cpu(list_data_cpu)
      self.optimal_scaling.calculate_error_prediction_req_rate(list_data_req_rate)
      
      self.calculate_scaling_error = False
      logger.info("ProvisioningV3_proxy: Prediction cpu model: "+str(self.optimal_scaling.get_cpu_prediction_model())+" and Cpu prediction: "+str(self.optimal_scaling.get_cpu_prediction()))
      logger.info("ProvisioningV3_proxy: Prediction req_rate model: "+str(self.optimal_scaling.get_req_rate_prediction_model())+" and Req_rate prediction: "+str(self.optimal_scaling.get_req_rate_prediction()))
    
    ### CONDITIONS TO ABORT A WEB OR BACKEND REMOVAL OPERATION ###
    
    abort_backend_removal = 0
    abort_web_removal = avg_cpu_after_removal = 0
    consolidate_vm = ''
    
    if  len(backend_nodes) > MIN_NUM_BACKENDS and n_backend_to_remove > 0:
       avg_cpu_after_removal = ( float(avg_cpu_user_backend) / (len(backend_nodes) - 1)) + avg_cpu_user_backend
       logger.info("ProvisioningV3: Prediction avg_cpu_after_removal: "+str(avg_cpu_after_removal))
       
    if  len(backend_nodes) > MIN_NUM_BACKENDS and n_backend_to_remove > 0 and avg_cpu_user_backend > 35: 
       
        if len(backend_nodes) == (MIN_NUM_BACKENDS + 1):
            inst_type_1 = self.cost_controller.get_type_instance(backend_nodes[0].ip)
            inst_type_2 = self.cost_controller.get_type_instance(backend_nodes[1].ip)
            if inst_type_1 != inst_type_2:
                consolidate_vm = self.consolidate_vmes(backend_nodes)
        else:       
            consolidate_vm = self.consolidate_vmes(backend_nodes)
        logger.info("ProvisioningV3: consolidate_vm "+str(consolidate_vm))
    
    # if(  (avg_backend_req_rate_lb / len(backend_nodes) > 1.3 and avg_cpu_user_backend > 40 )
    #       or (avg_cpu_user_backend > 40 and self.obtain_prediciton_decision( True, 0.5 * self.slo) )
    #       or (avg_backend_req_rate_lb / len(backend_nodes) > 1.3 and avg_backend_resp_time_lb > 0.5 * self.slo) ):
    #     abort_backend_removal = 1
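    # Abort a backend removal if the projected post-removal CPU exceeds the limit and no
    # consolidation candidate exists, if the CPU is already high and the predictor advises
    # against removal, or if the per-backend request rate and response time are close to the SLO.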
        
    if ((avg_cpu_after_removal > MAX_CPU_USAGE and len(consolidate_vm) == 0)
          or (avg_cpu_user_backend > 40 and self.obtain_prediciton_decision(True, 0.5 * self.slo))
          or (avg_backend_req_rate_lb / len(backend_nodes) > 1.3 and avg_backend_resp_time_lb > 0.5 * self.slo)):
        abort_backend_removal = 1

    if ((avg_web_req_rate_lb >= 4.0 and avg_cpu_web > 40) or (avg_web_req_rate_lb >= 4.0 and avg_web_resp_time_lb > 0.5 * self.slo)):
        abort_web_removal = 1
        
    if (len(backend_nodes) == MIN_NUM_BACKENDS or n_backend_to_add != 0 or abort_backend_removal == 1): 
      n_backend_to_remove = 0
    if (len(web_nodes) == MIN_NUM_WEBS or n_web_to_add != 0 or abort_web_removal == 1):
      n_web_to_remove = 0

    ##################################################################
    
    ret['add_web_nodes'] = n_web_to_add
    ret['remove_web_nodes'] = n_web_to_remove
    ret['add_backend_nodes'] = n_backend_to_add
    ret['remove_backend_nodes'] = n_backend_to_remove
    
    ##### DECIDE VM CANDIDATE OR INSTANCE TYPE TO ADD OR REMOVE #####
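    # New web nodes use the default (first) instance type; a backend removal targets either
    # the consolidation candidate or the VM suggested by the optimal_scaling module.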
    
    if n_web_to_add > 0:
        ret['vm_web_instance'] = self.optimal_scaling.get_vm_inst_types()[0]
    
    if n_web_to_remove > 0:
        ret['node_ip_remove'] = web_nodes[0].ip
    
    
    if n_backend_to_remove > 0:
        self.trigger_prediction = 0
        if len(consolidate_vm) == 0:
            ret['node_ip_remove'] = self.optimal_scaling.remove_backend_vm_candidate(backend_nodes, self.backend_monitoring_data)
        else:
            ret['node_ip_remove'] = consolidate_vm
    
    if n_backend_to_add > 0:
        self.trigger_prediction = 0
        strategy = self.calculate_strategy(avg_cpu_user_backend, backend_nodes, sum_php_req_rate_backends, avg_cpu_user_backend)
        self.calculate_scaling_error = True
        self.last_scaling_operation = time()
        ret['vm_backend_instance'] = strategy
                
    self.cost_controller.print_vm_cost()
      
    logger.info('Provisioning decisions: %s' % str(ret))
    return ret
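  # Shape of the 'actions' dictionary returned above and consumed by execute_actions below
  # (values are illustrative only; the instance-type name is hypothetical):
  #   {'add_web_nodes': 0, 'remove_web_nodes': 0,
  #    'add_backend_nodes': 1, 'remove_backend_nodes': 0,
  #    'vm_backend_instance': [('add', ('small', 1))]}
  #   plus 'vm_web_instance' / 'node_ip_remove' when the corresponding action was decided.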

          
  def execute_actions(self, actions):
    n_backend_to_add = actions['add_backend_nodes']
    n_backend_to_remove = actions['remove_backend_nodes']
    n_web_to_add = actions['add_web_nodes']
    n_web_to_remove = actions['remove_web_nodes']
    
    vm_web_type = actions['vm_web_instance']
    ip = actions['node_ip_remove']

    strategy = actions['vm_backend_instance']

    if (n_backend_to_add > 0 and len(strategy) > 0) or n_web_to_add > 0:
      logger.info('Adding web nodes: %d, backend strategy: %s' % (n_web_to_add, str(strategy)))
      if n_backend_to_add > 0:
          concurrent_ops = False
          perf_info = self.monitoring._performance_info_get()
          backend_nodes = perf_info.getBackendServiceNodes()
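          # 'strategy' is a list of (op, (vm_type, num)) entries; sorted() runs the 'add'
          # operations before the 'remove' ones, and concurrent_ops makes later removals
          # wait so the manager has time to register the newly added nodes.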
          
          for op, (vm_type, num) in sorted(strategy):
            if 'add' in op:                
                if not concurrent_ops:
                    concurrent_ops = True
                num_retries = NUM_RETRIES_SCALING_ACTION
                added_node = False
                while not added_node and num_retries > 0:
                    try:
                        logger.info('Adding backend nodes, quantity: %s , vm_type: %s ' % (str(num), str(vm_type) ))
                        client.add_nodes(MANAGER_HOST, MANAGER_PORT, web=0, backend=num, cloud='default', vm_backend_instance=vm_type, vm_web_instance=vm_web_type)
                        added_node = True
                    except Exception as ex:
                        logger.warning('Error when trying to add a node: ' + str(ex))
                        num_retries = num_retries - 1
                        logger.warning('Node could not be added at this time, retrying in 100 seconds. Number of additional retries: ' + str(num_retries))
                        added_node = False
                        sleep(100)
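            # For 'remove' operations, victim IPs are chosen per instance type and the
            # load-balancer weight bookkeeping is cleaned up for every removed backend.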
                    
            if 'remove' in op:
                logger.info('Removing backend nodes, quantity: %s , vm_type: %s ' % (str(num), str(vm_type) ))
                vmes_ip = []
                vmes_ip = self.optimal_scaling.remove_vmes_type_candidate(backend_nodes, self.backend_monitoring_data, vm_type, num)
                    
                for vm_ip in vmes_ip:
                    if concurrent_ops:
                        ## A 60-second wait was used here before, but it was not enough for the system to register the changes...
                        sleep(100)
                    num_retries = NUM_RETRIES_SCALING_ACTION 
                    removed_node = False
                    while not removed_node and num_retries > 0:
                        try:
                            client.remove_nodes(MANAGER_HOST, MANAGER_PORT, web=0, backend=1, node_ip=vm_ip)
                            # Mark success so the retry loop terminates.
                            removed_node = True

                            server_id = self.dyc_load_balancer.get_updated_backend_weights_id(vm_ip)
                            self.killed_backends.append(server_id)
                            self.dyc_load_balancer.remove_updated_backend_weights(server_id)
                            self.dyc_load_balancer.remove_updated_backend_weights_id(vm_ip)

                        except Exception as ex:
                            logger.warning('Error when trying to remove a node: ' + str(ex))
                            num_retries = num_retries - 1
                            logger.warning('Node could not be removed at this time, retrying in 100 seconds. Number of additional retries: ' + str(num_retries))
                            removed_node = False
                            sleep(100)
                            
          self.last_change_time = time()
            
      if n_web_to_add > 0:
           num_retries = NUM_RETRIES_SCALING_ACTION       
           added_node = False 
           while not added_node and num_retries > 0: 
              try:    
                  logger.info('Adding a web node: %d , inst type: %s ' % (n_web_to_add, str(vm_web_type) ))
                  vm_backend_type=self.optimal_scaling.get_vm_inst_types()[0]
                  client.add_nodes(MANAGER_HOST, MANAGER_PORT, web=n_web_to_add, backend=0, cloud='default', vm_backend_instance=vm_backend_type, vm_web_instance=vm_web_type)
                  added_node = True 
              except Exception as ex:         
                  logger.warning('Error when trying to add a web node: ' + str(ex))
                  num_retries = num_retries - 1
                  logger.warning('Web node could not be added at this time, retrying in 100 seconds. Number of additional retries: ' + str(num_retries))
                  added_node = False
                  sleep(100)
                      
           self.last_change_time = time()    
          
    if ((n_backend_to_remove > 0 or n_web_to_remove > 0) and len(ip) > 0):
      logger.info('Removing web nodes: %d , backend nodes: %d ' % (n_web_to_remove, n_backend_to_remove))
      client.remove_nodes(MANAGER_HOST, MANAGER_PORT, web=n_web_to_remove, backend=n_backend_to_remove, node_ip=ip)
      if n_backend_to_remove > 0:
        try:
          server_id = self.dyc_load_balancer.get_updated_backend_weights_id(ip)
          self.killed_backends.append(server_id)
          self.dyc_load_balancer.remove_updated_backend_weights(server_id)
          self.dyc_load_balancer.remove_updated_backend_weights_id(ip)
        except:
          logger.warning("Backend weight cannot be deleted for the backend with ip "+str(ip))
      self.last_change_time = time()
      logger.info('After triggering the remove operation web nodes: %d , backend nodes: %d ' % (n_web_to_remove, n_backend_to_remove))
      
  def collect_monitoring_data(self):
      self.monitoring.init_collect_monitoring_data()
               
      self.web_monitoring_data = self.monitoring.collect_monitoring_data_web()
      self.backend_monitoring_data = self.monitoring.collect_monitoring_data_backend()
      self.proxy_monitoring_data = self.monitoring.collect_monitoring_data_proxy()
      
      if len(self.proxy_monitoring_data) == 0 or len(self.backend_monitoring_data) == 0 or len(self.web_monitoring_data) == 0:
          return False
      
      return True
  
  def stop_provisioning(self):
      self.autoscaling_running = False
      # try:
      #     os.remove(PATH_LOG_FILE)
      # except OSError as e:
      #     logger.critical('stop_provisioning: Error when removing the autoscaling log '+str(e))
  
  def do_provisioning(self, slo, cooldown_time, slo_fulfillment_degree):
    step_no = 0 
    self.slo = slo
    self.time_between_changes = cooldown_time*60
    self.autoscaling_running = True 
    self.optimal_scaling.set_slo_fulfillment_degree(slo_fulfillment_degree)
    logger.info('Autoscaling: Starting with QoS autoscaling: '+str(slo_fulfillment_degree))
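    # Main control loop: refresh node info, collect monitoring data, decide and execute
    # scaling actions, periodically rebalance backend weights, then sleep for 5 minutes.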
    
    try: 
     while self.autoscaling_running:
      step_no += 1
      tstart = datetime.now()  
      
      print 'Synchronizing node info with manager...'
      self.monitoring.nodes_info_update(self.killed_backends)
      print "Collecting monitoring data..."

      ret = self.collect_monitoring_data()
      
      if not ret:
          logger.warning('Monitoring data was not properly retrieved, will retry later...')
          sleep(60)
          continue
      else:
          self.log_monitoring_data()  
      
          actions = self.decide_actions()
          self.execute_actions(actions)
      
      
          n_backend_to_add = actions['add_backend_nodes']
          n_backend_to_remove = actions['remove_backend_nodes']
          n_web_to_add = actions['add_web_nodes']
          n_web_to_remove = actions['remove_web_nodes']
      
          # Adjust node weights every 2 steps when no scaling action was taken, or when explicitly triggered
          if ( (step_no % 2 == 0 and n_backend_to_add == 0 and n_backend_to_remove == 0 \
             and n_web_to_add == 0 and n_web_to_remove == 0) or self.trigger_weight_balancing ):
              logger.info('Calling adjust_node_weights ...')
              # Pass the bound method and its arguments separately; calling it inline would
              # run it synchronously and hand Thread its return value as the target.
              Thread(target=self.dyc_load_balancer.adjust_node_weights,
                     args=(self.monitoring, self.backend_monitoring_data)).start()
              self.trigger_weight_balancing = False
    
      
          tend = datetime.now()
          logger.info('--> EXECUTION TIME SCALING DECISION: %s ' % str(tend-tstart))
          
          sleep(300)
    
      
     logger.info('Autoscaling: Terminated.')
    except Exception as ex:
        logger.critical('Autoscaling: Error in the autoscaling system '+str(ex))