def process_elb_access_log(elb_buckets_dict, queue):
    """ Primarily handle S3 log processing threads

    :param elb_buckets_dict:
    :param queue:
    :return:
    """
    # collecting access log for each elb with different threads
    elb_data_manager = ThreadingManager()
    for elb_region_str, bucket_name in elb_buckets_dict.iteritems():
        c = S3Connection()
        bucket = c.get_bucket(bucket_name)
        elb_data_manager.start_tasks(target_func=counting_elb_data,
                                     name="elb_data_collector",
                                     para=[bucket, elb_region_str])

    # waiting for all threads to finish parsing S3 log
    # by which time it will be the end of measurement interval
    result_queue = elb_data_manager.collect_results()

    # collect the total data processed by each ELB
    data_in = dict()
    data_out = dict()
    while not result_queue.empty():
        data_tuple = result_queue.get()
        station = data_tuple[0]
        # the amount of data sent and received by each client
        # of ONE station
        client_sent, client_received = data_tuple[1]
        data_in.update({station: client_sent})
        data_out.update({station: client_received})

    # debug
    print_message('Data in (bytes) from clients: %s' % data_in)
    print_message('Data out (bytes) from clients: %s' % data_out)

    queue.put((data_in, data_out))
def calculate_key_prefix(elb_region, elb_name, last_expected_time):
    """Calculate the prefix for bucket key searching

    :param elb_region:
    :param elb_name:
    :return:
    """
    print_message('Retrieving access log for %s ...' % elb_name)
    next_expected_time, max_waiting_time \
        = get_next_nth_elb_log_time(1, last_expected_time)
    last_expected_time = next_expected_time

    year, month, day, hour, minute = next_expected_time.year, \
        next_expected_time.month, \
        next_expected_time.day, \
        next_expected_time.hour, \
        next_expected_time.minute

    # convert month, day, hour and minute to 2-digit representation
    month = '%02d' % month
    day = '%02d' % day
    hour = '%02d' % hour
    minute = '%02d' % minute

    # the time string that the expected log file name should contain
    time_str = "%s%s%sT%s%sZ" % (year, month, day, hour, minute)

    aws_account_id = str(305933725014)
    region = elb_region
    load_balancer_name = elb_name
    end_time = time_str

    key_prefix = 'AWSLogs/{0}/elasticloadbalancing/{1}/{2}/{3}/{4}/{5}' \
                 '_elasticloadbalancing_{6}_{7}_{8}' \
        .format(aws_account_id, region, year, month, day,
                aws_account_id, region, load_balancer_name, end_time)

    request_headers = {'prefix': unicode(key_prefix), 'delimiter': '.log'}

    return request_headers, max_waiting_time, last_expected_time
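# [Example] A minimal sketch (not part of the pipeline) of the key prefix
# built by calculate_key_prefix, assuming a log emitted at
# 2014-06-01 12:05 UTC; the ELB name below is hypothetical.
def _sketch_key_prefix():
    aws_account_id = str(305933725014)
    region = 'us-east-1'
    load_balancer_name = 'my-elb'   # hypothetical name
    end_time = '20140601T1205Z'     # time_str for 12:05 UTC
    key_prefix = 'AWSLogs/{0}/elasticloadbalancing/{1}/2014/06/01/{2}' \
                 '_elasticloadbalancing_{3}_{4}_{5}' \
        .format(aws_account_id, region, aws_account_id, region,
                load_balancer_name, end_time)
    # AWSLogs/305933725014/elasticloadbalancing/us-east-1/2014/06/01/
    # 305933725014_elasticloadbalancing_us-east-1_my-elb_20140601T1205Z
    return key_prefix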
def measure_round_trip_delay(from_host, to_host, from_host_pk,
                             measurement_interval, queue):
    from_host_ip, from_host_name = from_host
    to_host_ip, to_host_name = to_host

    start_time = time.time()
    current_time = start_time
    time_elapsed = current_time - start_time

    measurements = []  # list to store series of measurements
    while time_elapsed < measurement_interval:  # in the unit of seconds
        cmd = ' '.join(["ping", "-c 5", to_host_ip])
        out, err = execute_remote_command(from_host_ip, cmd,
                                          'ubuntu', '', from_host_pk)

        rt_str = [text for text in out.split('\n')
                  if 'min/avg/max/' in text]
        stat_str = rt_str[0].split('=')
        metric_name = stat_str[0]
        values = stat_str[1].split('/')
        avg_idx = metric_name.split('/').index('avg')
        avg_round_trip_delay = values[avg_idx]

        print_message('Average round trip delay between %s and %s: %s (ms)'
                      % (from_host, to_host, avg_round_trip_delay))

        # record current measurement (converted from ms to seconds)
        measurements.append(float(avg_round_trip_delay) / 1000)

        time.sleep(30)  # sleep for half a min
        current_time = time.time()
        time_elapsed = current_time - start_time

    average_round_trip_delay = numpy.mean(measurements)
    queue.put((from_host, to_host, average_round_trip_delay))
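# [Example] A standalone sketch of the rtt-summary parsing used in
# measure_round_trip_delay, fed a sample line in the format printed by
# Linux ping.
def _sketch_parse_ping_summary():
    out = 'rtt min/avg/max/mdev = 20.132/23.456/30.001/2.112 ms'
    rt_str = [text for text in out.split('\n') if 'min/avg/max/' in text]
    metric_name, values_str = rt_str[0].split('=')
    values = values_str.split('/')
    # 'rtt min/avg/max/mdev ' splits to ['rtt min', 'avg', 'max', 'mdev ']
    avg_idx = metric_name.split('/').index('avg')
    return values[avg_idx]  # '23.456' (milliseconds)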
def main():
    setup_logging()

    # line counter for reading csparql logs of each service station
    line_counters = dict()

    # bucket and ELB info for getting access logs from S3
    elb_buckets_dict = dict()
    elb_regions = get_station_region()
    elb_buckets = get_elb_buckets_map()
    for station, region in elb_regions.iteritems():
        elb_region_str = '%s:%s' % (station, region)
        elb_buckets_dict.update(
            {elb_region_str: unicode(elb_buckets[station])})

    stations = get_available_stations()
    for station in stations:
        line_counters.update({station: 0})

    log_base_dir = time.strftime("%Y_%m%d_%H%M")
    total_num_users = cfg.get_int('Default', 'total_num_users')

    # get all available client regions
    available_clients = get_available_clients()

    # counter = 0  # For testing
    while True:
        # Process the csparql log and the ELB access log simultaneously
        # with 2 threads. Start csparql log parsing first, since the ELB
        # access log is emitted with a delay.

        # First we need to calculate how much time to wait before
        # retrieving csparql logs, i.e. how long the actual measurement
        # time is. Since S3 only emits logs at the 5th, 10th, 15th etc.
        # minute of the hour, we can only measure from the time the
        # measurement starts until the last expected log emission time,
        # which is not the actual time the log is obtained, since there
        # is a delay.
        measurement_interval = calculate_waiting_time()

        # no need to wait until the log is actually obtained
        measurement_interval -= 300

        # measure latency between each client region and service station
        latency_manager = ThreadingManager()
        latency_manager.start_task(
            target_func=measure_latency,
            name="latency_manager",
            para=[available_clients, stations, measurement_interval])

        server_log_processor = ThreadingManager()
        server_log_processor.start_task(
            target_func=process_server_logs,
            name="server_log_processor",
            para=[log_base_dir, line_counters,
                  total_num_users, measurement_interval])

        # begin gathering info on the amount of data
        # transferred through each service station
        data_counting_task = ThreadingManager()
        data_counting_task.start_task(
            target_func=process_elb_access_log,
            name="elb_access_log_processor",
            para=[elb_buckets_dict])

        latency_results_dict = latency_manager.collect_results().get()

        # collect the csparql log first, since its processing completes
        # first, while the ELB data might be delayed
        server_metrics_queue = server_log_processor.collect_results()
        (station_metric_list, total_request) = server_metrics_queue.get()
        line_counters = server_metrics_queue.get()

        print_message('')
        print_message('Service station logs processing finished\n')

        # collecting elb data now
        elb_data_queue = data_counting_task.collect_results()
        data_in, data_out = elb_data_queue.get()

        """ Preparing optimisation parameters """

        # Calculate the "average amount of data involved in each request"
        # for each service station and the "total number of requests".
        # These 2 dictionaries store the average data sent and received
        # per request, sent and received by *each service station* from
        # *each client*. The length of each dictionary should equal the
        # number of clients (regions).

        # <Client_name: <Station: data_in_per_req>>
        avg_data_in_per_reqs = dict()
        # <Client_name: <Station: data_out_per_req>>
        avg_data_out_per_reqs = dict()

        # initialise
        for cli_name in available_clients:
            avg_data_in_per_reqs.update({cli_name: {}})
            avg_data_out_per_reqs.update({cli_name: {}})

        # request arrival rate and service rate of each service station
        arrival_rates = dict()
        service_rates = dict()

        for station_metric in station_metric_list:
            # getting metrics
            station_name = station_metric.station_name
            arrival_rate = station_metric.arrival_rate
            service_rate = station_metric.service_rate
            requests = station_metric.total_requests

            print '\nTotal requests for station %s: %s' \
                  % (station_name, requests)
            log_info(metric_record_file,
                     '\nTotal requests for station %s: %s'
                     % (station_name, requests))

            # arrival_rate and service_rate
            arrival_rates.update({station_name: arrival_rate})
            service_rates.update({station_name: service_rate})

            response_time = \
                math.pow(service_rate, -1) / \
                (1 - math.pow(service_rate, -1) * arrival_rate)
            print '[Debug] predicted current response time of service ' \
                  'station \'%s\': %s' % (station_name, response_time)
            log_info(metric_record_file,
                     '[Debug] predicted current response time of service '
                     'station \'%s\': %s' % (station_name, response_time))

        # TODO: calculate total requests for each client
        # TODO: if c-sparql could record the source of each request,
        # TODO: things would be much easier
        total_request_per_client = dict()
        data_in_sum = 0
        client_data_in_sum = dict()
        for a_client in available_clients:
            client_data_in_sum.update({a_client: 0})

        for station_name in stations:
            d_in = data_in.get(station_name)
            for c, sent_data in d_in.iteritems():
                # calculate the total data sent by each client
                # and the total data sent by all clients
                client_data_in_sum[c] += sent_data
                data_in_sum += sent_data

        # calculate the total number of requests sent by each client
        for ac in available_clients:
            # float division: both operands are integer byte counts
            t_request = math.ceil(
                total_request *
                (float(client_data_in_sum[ac]) / data_in_sum))
            t_request = int(t_request)
            # build this so that it can be used for calculating out data
            total_request_per_client.update({ac: t_request})

        for station_name in stations:
            d_in = data_in.get(station_name)
            for c, sent_data in d_in.iteritems():
                t_request = total_request_per_client[c]
                # convert the amount of data to GB
                sent_data = float(sent_data) / math.pow(1024, 3)
                avg_data_in_per_req = sent_data / t_request
                avg_data_in_per_reqs[c].update(
                    {station_name: avg_data_in_per_req})

        for station_name in stations:
            d_out = data_out.get(station_name)
            for c1, received_data in d_out.iteritems():
                t_request = total_request_per_client[c1]
                received_data = float(received_data) / math.pow(1024, 3)
                avg_data_out_per_req = received_data / t_request
                avg_data_out_per_reqs[c1].update(
                    {station_name: avg_data_out_per_req})

        # for testing purposes
        info_str = \
            '\n[Debug] total_request: %s\n' \
            '[Debug] avg_data_in_per_reqs: %s\n' \
            '[Debug] avg_data_out_per_reqs: %s\n' \
            '[Debug] arrival_rates: %s\n' \
            '[Debug] service_rates: %s\n' \
            % (total_request, avg_data_in_per_reqs, avg_data_out_per_reqs,
               arrival_rates, service_rates)
        print info_str
        log_info(metric_record_file, info_str)

        # TODO: get ELB price from config
        # ELB pricing
        elb_prices = [0.008, 0.008]

        # do the optimisation for each client in a new thread
        optimiser = ThreadingManager()
        for client in available_clients:
            optimiser.start_tasks(
                target_func=clients_optimisation,
                name="optimiser",
                para=[avg_data_in_per_reqs, avg_data_out_per_reqs,
                      client, elb_prices, latency_results_dict,
                      measurement_interval, service_rates, stations,
                      total_request_per_client])

        # synchronising threads
        optimiser.collect_results()

        # give Route 53 record changes time to take effect before the
        # next cycle
        time.sleep(60)
def _generate_data(base_path, queue):
    response_file = base_path + '/ResponseInfo.txt'
    cpu_file = base_path + '/CPUUtil.txt'

    # cell(6,2);
    data = [[] for j in range(7)]
    category_map = dict()  # containers.Map;
    category_index = 0
    category_count = 0
    category_list = []
    count = 0

    # no ResponseInfo available in the observer log yet
    if not os.path.exists(response_file):
        print_message('%s does not exist yet\n' % response_file)
        return

    with open(response_file) as f:
        line = f.readline()
        while line:
            # skip odd lines
            if count % 2 != 0:
                line = f.readline()
                count += 1
                continue

            split_str = line.split(',')
            if len(split_str) < 7:
                line = f.readline()
                count += 1
                continue

            date_str = [split_str[j] for j in xrange(7)]
            date = datetime.strptime("".join(date_str), '%Y%m%d%H%M%S%f')
            # date = None
            # # python strptime thread safety bug
            # # http://bugs.python.org/issue11108
            # while not date:
            #     try:
            #         date = strptime("".join(date_str), '%Y%m%d%H%M%S%f')
            #     except AttributeError as e:
            #         print "[Debug]: strptime reported AttributeError\n" + \
            #               "Details: %s" % e
            date_milli = mktime(date.timetuple()) * 1e3 + \
                date.microsecond / 1e3

            category_str = split_str[8]
            if category_str not in category_map:
                category_map[category_str] = category_index
                category_list.append(category_str)
                category_count += 1
                category = category_index
                data = update_data_array(data, 5, category, [])
                data = update_data_array(data, 6, category, [])
                data = update_data_array(data, 7, category, [])
                category_index += 1
            else:
                category = category_map[category_str]

            if split_str[9] == 'Request Begun':
                # read the next line before continuing, otherwise the
                # loop would spin on the same line forever
                line = f.readline()
                count += 1
                continue

            response_time = float(split_str[10])
            arrival_time = date_milli - response_time * 1000
            data = update_data_array(data, 2, category, arrival_time)
            data = update_data_array(data, 3, category, response_time)

            line = f.readline()
            count += 1

    update_data_array(data, 2, category_index, [[]])

    # fill in the gap between the length of data and the target index
    for i in xrange(len(data)):
        for j in range(len(data[2])):
            if len(data[i]) < len(data[2]):
                data[i].append([])

    raw_data = data
    data = format_data(raw_data, 60000, category_list, cpu_file)

    seg = base_path.split('/')
    vm_name = seg[len(seg) - 1]
    results = (vm_name, data)
    queue.put(results)
def process_server_logs(base_dir, line_counters, total_users, waiting_time,
                        queue):
    """
    :param base_dir: Base directory of the monitor log
    :param line_counters: Counters for continuously reading the single
           log file of each station
    :param total_users: The total number of users simulated
    :param queue: Queue to store results when used in a thread
    :param waiting_time: The measurement time
    :return:
    """
    print_message('')
    print_message('Waiting for the next batch of service station monitoring '
                  'logs (%s seconds)...\n' % waiting_time)
    time.sleep(waiting_time)

    module_path = os.path.dirname(client_server.__file__)
    base_dir = module_path + '/logs/' + base_dir + '/'

    # retrieve the service station and observer mapping
    station_observers = get_station_csparql()

    # retrieve and process the log of each service station in a new thread
    csparql_reader = ThreadingManager()

    # python strptime thread safety bug. Has to call strptime once before
    # creating threads. Details can be found at:
    # http://bugs.python.org/issue11108
    time.strptime("30 Nov 00", "%d %b %y")

    for station_name, observer_ip in station_observers.iteritems():
        observer_addr = '%s=%s' % (station_name, observer_ip)
        csparql_reader.start_tasks(target_func=process_monitor_log,
                                   name='csparql_reader',
                                   para=[base_dir, observer_addr,
                                         line_counters[station_name]])

    # wait for all threads to finish and collect their results
    result_queue = csparql_reader.collect_results()

    total_requests = 0

    # Now collect metric data from all service stations and calculate the
    # metrics needed for the optimisation, since these metrics are
    # calculated for the entire online service
    service_station_metric_list = []
    while not result_queue.empty():
        # get the metrics returned
        result_dict = result_queue.get()

        station_name = result_dict['station_name']
        station_total_requests = result_dict['total_requests']
        arrival_rate = result_dict['arrival_rate']
        service_rate_para_list = result_dict['service_rate_para_list']
        line_counter = result_dict['line_counter']

        total_requests += station_total_requests
        service_station_metric = ServiceStationMetric(station_name,
                                                      station_total_requests,
                                                      arrival_rate / 60,
                                                      service_rate_para_list,
                                                      service_rate=0)
        service_station_metric_list.append(service_station_metric)

        # update the current line counter
        line_counters[station_name] = line_counter

    # now calculate the service rate of each station
    for station_metric in service_station_metric_list:
        # parameters needed for calculating the service rate of the
        # servers of one service station
        mu_para_list = station_metric.service_rate_para_list

        # calculate the service rate of each server in one station
        service_time_list = []  # list to store the service time of
                                # each server

        # for the service rate parameters of each server ...
        for s_para in mu_para_list:
            # number of users for this vm (float division: both
            # counters are integers)
            num_of_requests = s_para['num_of_requests']
            num_of_user = int(math.ceil(
                total_users * (float(num_of_requests) / total_requests)))
            num_of_cores = s_para['cpu_cores']
            data = s_para['data']
            mean_service_time = calculate_service_rate(num_of_user,
                                                       num_of_cores, data)
            service_time_list.append(mean_service_time)
            print_message('Mean service time of VM \'%s\' at station '
                          '\'%s\': %s'
                          % (s_para['vm_name'], station_metric.station_name,
                             str(mean_service_time)))

        # The overall service rate is calculated as the number of requests
        # completed by all servers within the time that the slowest server
        # takes to complete a single request
        max_time = max(service_time_list)
        comp_req_sum = 0
        for service_time in service_time_list:
            comp_req_sum += max_time / service_time
        overall_service_rate = comp_req_sum / max_time
        station_metric.service_rate = overall_service_rate

    # store the results of this thread in the result queue
    queue.put((service_station_metric_list, total_requests))
    queue.put(line_counters)
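# [Example] A worked sketch of the overall-service-rate rule used in
# process_server_logs, assuming service times are in seconds: with two
# servers whose mean service times are 0.2s and 0.4s, the slowest server
# takes max_time = 0.4s per request; within that time the two servers
# complete 0.4/0.2 + 0.4/0.4 = 3 requests, so the station as a whole
# serves 3 / 0.4 = 7.5 requests per second.
def _sketch_overall_service_rate():
    service_time_list = [0.2, 0.4]
    max_time = max(service_time_list)
    comp_req_sum = sum(max_time / t for t in service_time_list)
    return comp_req_sum / max_time  # 7.5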
def optimisation(num_of_stations, total_requests, elb_prices,
                 avg_data_in_per_reqs, avg_data_out_per_reqs,
                 in_bandwidths, out_bandwidths, budget, service_rates,
                 measurement_interval, station_latency):
    variables = [1 for i in xrange(num_of_stations)]

    feasible_tuple = []
    # get all combinations that satisfy the constraints
    for i in f_range(1, 99, 0.0001):
        variables[0] = float(i) / 100.0
        variables[1] = 1 - float(i) / 100.0
        satisfy_constrains = constrains_check(
            variables, total_requests, avg_data_in_per_reqs,
            avg_data_out_per_reqs, elb_prices, measurement_interval,
            budget, in_bandwidths, out_bandwidths, service_rates,
            station_latency)
        if satisfy_constrains:
            feasible_tuple.append((variables[0], variables[1]))

    if len(feasible_tuple) == 0:
        print_message('No feasible solution found')
        return

    smallest = float("inf")
    answer = (1, 1)
    # minimisation - find the feasible tuple that gives the minimal value
    for f_tuple_idx, f_tuple_val in enumerate(feasible_tuple):
        objective_result = objective_function(f_tuple_val, total_requests,
                                              avg_data_in_per_reqs,
                                              avg_data_out_per_reqs,
                                              elb_prices,
                                              measurement_interval,
                                              service_rates,
                                              station_latency)
        if objective_result < smallest:
            smallest = objective_result
            answer = f_tuple_val

    # #### test ####
    total_cost = 0
    for i in xrange(len(answer)):
        elb_cost = \
            total_requests * (avg_data_in_per_reqs[i] +
                              avg_data_out_per_reqs[i]) * \
            elb_prices[i] * answer[i]

        total_data_out = total_requests * answer[i] * \
            avg_data_out_per_reqs[i]

        # tiered EC2 data-out pricing (amounts in GB); the tiers were
        # adjusted so the boundary values fall into a tier
        ec2_cost = 0
        if total_data_out < 1:
            ec2_cost = 0
        elif total_data_out <= 10240:
            ec2_cost = total_data_out * 0.12
        elif total_data_out <= 51200:
            ec2_cost = (total_data_out - 10240) * 0.09 + 10240 * 0.12
        elif total_data_out <= 153600:
            ec2_cost = \
                (total_data_out - 51200) * 0.07 + 40960 * 0.09 + \
                10240 * 0.12
        elif total_data_out <= 512000:
            ec2_cost = \
                (total_data_out - 153600) * 0.05 + 102400 * 0.07 + \
                40960 * 0.09 + 10240 * 0.12

        total_cost += elb_cost + ec2_cost

    print_message('')
    print_message('Total cost: $%s ' % total_cost)
    # #### test ####

    return answer
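# [Example] A worked sketch of the tiered EC2 data-out pricing used in
# the test block of optimisation, for a hypothetical 15,000 GB
# transferred out: the first 10,240 GB are billed at $0.12/GB and the
# remaining 4,760 GB at $0.09/GB.
def _sketch_tiered_ec2_cost():
    total_data_out = 15000.0
    ec2_cost = (total_data_out - 10240) * 0.09 + 10240 * 0.12
    return ec2_cost  # 1657.2 (= 428.4 + 1228.8)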
def optimise(num_of_stations, total_requests, elb_prices,
             avg_data_in_per_reqs, avg_data_out_per_reqs, in_bandwidths,
             out_bandwidths, budget, sla_response_t, service_rates,
             measurement_interval, station_latency, k,
             ec2_prices_ranges=None, cost_mode=None):
    """
    :param num_of_stations: total number of receipts of client requests
    :param total_requests: total number of requests generated by the client
    :param elb_prices: list of the pricing of every ELB involved
    :param avg_data_in_per_reqs: the average amount of data transferred in
           per request, for requests received at *each service station*
    :param avg_data_out_per_reqs: the average amount of data transferred
           out per request, for requests received at *each service station*
    :param in_bandwidths: the capacity of the link used by each service
           station to receive requests
    :param out_bandwidths: the capacity of the link used by each service
           station to send responses
    :param service_rates: overall service rate of each service station
    :param station_latency: latency between this client and each station
    :param sla_response_t: Service Level Agreement response time of each
           service station
    :param budget: budget of the OSP
    :param k: coefficient reflecting how much additional cost to pay for
           one unit of throughput improvement
    :param measurement_interval: the length of the measurement time
           (in seconds)
    :param ec2_prices_ranges: the prices charged by EC2 based on different
           amounts of data sent out of EC2 (dict of dicts)
    :param cost_mode: mode for selecting different EC2 data transfer costs
    :return: the current best weight for each request receipt

    # Objective: (e.g. number of service stations = 2)
    # maximise Cost + K * throughput

    # ///////////////////  Update:
    # New objective function:
    # Minimise Cost + latency perceived by users
    # Only the objective function and the response time constraint need
    # to change
    # ////////////////////

    # ************ Deprecated *************
    #
    # total_requests *
    # (avg_data_in_per_req[0] + avg_data_out_per_req[0]) *
    # elb_price[0] *
    # P[0]
    # +
    # total_requests * avg_data_out_per_req[0] * P[0] * ec2_price_range
    # +
    # K * total_requests *
    # P[0]
    # +
    # total_requests *
    # (avg_data_in_per_req[1] + avg_data_out_per_req[1]) *
    # elb_price[1] *
    # P[1]
    # +
    # total_requests * avg_data_out_per_req[0] * P[0] * ec2_price_range
    # +
    # K * total_requests *
    # P[1]

    # "Actual formulation":
    # (total_requests *
    # (avg_data_in_per_req[0] + avg_data_out_per_req[0]) *
    # elb_price[0]
    # +
    # K * total_requests
    # +
    # total_requests * avg_data_out_per_req[0] * ec2_price_range) * P[0]
    #
    # +
    #
    # (total_requests *
    # (avg_data_in_per_req[1] + avg_data_out_per_req[1]) *
    # elb_price[1]
    # +
    # K * total_requests
    # +
    # total_requests * avg_data_out_per_req[1] * ec2_price_range) * P[1]
    #
    # ************ End of Deprecated *************

    # Subject to:

    # "data_in/second < bandwidth of the in_link"
    # (total_requests * avg_data_in_per_req[0] * P[0])
    # / measurement_interval < in_bandwidth[0]

    # (total_requests * avg_data_in_per_req[1] * P[1])
    # / measurement_interval < in_bandwidth[1]

    # "data_out/second < bandwidth of the out_link"
    # (total_requests * avg_data_out_per_req[0] * P[0])
    # / measurement_interval < out_bandwidth[0]

    # (total_requests * avg_data_out_per_req[1] * P[1])
    # / measurement_interval < out_bandwidth[1]

    # "Actual formulation":
    #
    # total_requests * avg_data_in_per_req[0] / measurement_interval * P[0]
    # + 0 * P[1]
    # <= in_bandwidth[0]

    # 0 * P[0] +
    # total_requests * avg_data_in_per_req[1] / measurement_interval * P[1]
    # <= in_bandwidth[1]

    # total_requests * avg_data_out_per_req[0] / measurement_interval * P[0]
    # + 0 * P[1]
    # <= out_bandwidth[0]

    # 0 * P[0] +
    # total_requests * avg_data_out_per_req[1] / measurement_interval * P[1]
    # <= out_bandwidth[1]

    # "(Deprecated) Response time constraint"
    # D_sla[0] * service_rates[0]^-1 * total_requests * P[0]
    # <= measurement_interval * (D_sla[0] - service_rates[0]^-1)

    # D_sla[1] * service_rates[1]^-1 * total_requests * P[1]
    # <= measurement_interval * (D_sla[1] - service_rates[1]^-1)

    # "Actual formulation":
    # D_sla[0] * service_rates[0]^-1 * total_requests * P[0] +
    # 0 * P[1]
    # <= measurement_interval * (D_sla[0] - service_rates[0]^-1)

    # 0 * P[0] +
    # D_sla[1] * service_rates[1]^-1 * total_requests * P[1]
    # <= measurement_interval * (D_sla[1] - service_rates[1]^-1)

    # "Budget of the OSP" (EC2 cost is calculated differently)
    # total_requests * (avg_data_in_per_reqs[0] +
    # avg_data_out_per_reqs[0]) * elb_prices[0] * P[0]
    # +
    # total_requests * (avg_data_in_per_reqs[1] +
    # avg_data_out_per_reqs[1]) * elb_prices[1] * P[1]
    # <= budget

    # "Sum of weights is 1"
    # P[0] + P[1] + ... P[num of stations - 1] = 1

    # "P are all positive"
    # 1 * P[0] + 0 * P[1] + 0 * P[2] .... > 0
    # 0 * P[0] + 1 * P[1] + 0 * P[2] .... > 0
    # 0 * P[0] + 0 * P[1] + 1 * P[2] .... > 0
    # ... ...
    #
    # Variables: P[i]
    """
    coefficients = []
    # right hand side of the constraints
    right_hand_side = []
    # coefficients of the objective function
    obj_func_coef = []

    for i in xrange(num_of_stations):
        # Building the coefficients of the constraint inequalities.
        # Collect the coefficient of each variable for each constraint
        # inequality.

        """ In-bandwidth constraints """
        # | t*a/m   0     0     0   ... | < in_bandwidth[0]
        # |   0   t*a/m   0     0   ... | < in_bandwidth[1]
        # |   0     0   t*a/m   0   ... | < in_bandwidth[2]
        # |   0     0     0   t*a/m ... | ... ...
        in_bandwidth_coef = [0 for i1 in xrange(num_of_stations)]
        in_bandwidth_coef[i] = \
            total_requests * avg_data_in_per_reqs[i] / measurement_interval

        """ Out-bandwidth constraints """
        out_bandwidth_coef = [0 for i2 in xrange(num_of_stations)]
        out_bandwidth_coef[i] = \
            total_requests * avg_data_out_per_reqs[i] / measurement_interval

        # """ Response time constraint """
        response_t_coef = [0 for i3 in xrange(num_of_stations)]
        response_t_coef[i] = \
            sla_response_t[i] * math.pow(service_rates[i], -1) * \
            total_requests

        """ All variables (weights) are positive """
        all_pos_coef = [0 for i4 in xrange(num_of_stations)]
        all_pos_coef[i] = -1  # convert to standard form

        """ Coefficient for the "sum of weights is 1" constraint
        (i.e. all 1) """
        sum_p_coef = 1

        """ Cost less than or equal to the budget """
        cost_coef = \
            total_requests * (avg_data_in_per_reqs[i] +
                              avg_data_out_per_reqs[i]) * elb_prices[i] + \
            0.120 * total_requests * avg_data_out_per_reqs[i]

        # #### test ####
        print_message('Total cost : $%s' % cost_coef)
        # #### test ####

        # store all coefficients of this variable in the above order
        """ Order matters """
        coefficients_for_p_i = []
        coefficients_for_p_i.extend(in_bandwidth_coef)
        coefficients_for_p_i.extend(out_bandwidth_coef)
        coefficients_for_p_i.extend(response_t_coef)
        coefficients_for_p_i.extend(all_pos_coef)

        # In order to turn the "sum of weights is 1" equality constraint
        # into inequality constraints, replace the original equality
        # constraint with 2 new inequalities that represent a very tiny
        # range around the original right hand side of the equality
        # constraint:
        # P1 + P2 + P3 + .... > 1 - 0.0000000001
        # P1 + P2 + P3 + .... < 1 + 0.0000000001
        coefficients_for_p_i.append(sum_p_coef * -1)
        coefficients_for_p_i.append(sum_p_coef)

        coefficients_for_p_i.append(cost_coef)

        # add this list to the coefficient collection as the coefficients
        # of the current variable, i.e. weight
        coefficients.append(coefficients_for_p_i)

        # build the objective function coefficient of this variable
        service_time = math.pow(service_rates[i], -1)
        obj_p_i_coef = \
            total_requests * (avg_data_in_per_reqs[i] +
                              avg_data_out_per_reqs[i]) * elb_prices[i] + \
            0.120 * total_requests * avg_data_out_per_reqs[i] + \
            (measurement_interval - service_time * total_requests) / \
            (service_time * measurement_interval)

        # maximise = minimise the negative form
        obj_func_coef.append(obj_p_i_coef * -1)

    """ Order Matters """
    # Now add the right hand sides.
    # They have to be added in the order that the coefficients were added,
    # e.g. in_bandwidths -> out_bandwidths -> response time constraints
    # -> ...
    right_hand_side.extend([in_bandwidths[n]
                            for n in xrange(num_of_stations)])
    right_hand_side.extend([out_bandwidths[m]
                            for m in xrange(num_of_stations)])
    # NOTE: use a fresh loop variable here; in Python 2 a comprehension
    # variable named `k` would leak and clobber the parameter `k`
    right_hand_side.extend([
        measurement_interval *
        (sla_response_t[s] - math.pow(service_rates[s], -1))
        for s in xrange(num_of_stations)])
    right_hand_side.extend([0 for j in xrange(num_of_stations)])
    right_hand_side.append(0.0000000001 - 1)
    right_hand_side.append(1 + 0.0000000001)
    right_hand_side.append(budget)

    print 'coefficients: %s' % coefficients
    print 'right_hand_side: %s' % right_hand_side
    print 'obj_func_coef: %s' % obj_func_coef

    a = matrix(coefficients)
    b = matrix(right_hand_side)
    c = matrix(obj_func_coef)
    sol = solvers.lp(c, a, b)
    return sol['x']
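# [Example] A minimal sketch of the cvxopt convention that optimise
# relies on: matrix() treats each inner list as one COLUMN (the
# constraint coefficients of one variable), which is why `coefficients`
# is built per weight P[i]. Toy problem: minimise -P0 - P1 subject to
# P0 + P1 <= 1 and P0, P1 >= 0.
def _sketch_cvxopt_lp():
    from cvxopt import matrix, solvers
    a = matrix([[1.0, -1.0, 0.0],   # column of P0
                [1.0, 0.0, -1.0]])  # column of P1
    b = matrix([1.0, 0.0, 0.0])     # right-hand sides
    c = matrix([-1.0, -1.0])        # objective coefficients
    sol = solvers.lp(c, a, b)
    return sol['x']                 # an optimal point on P0 + P1 = 1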
def format_data(data, period, category_list, cpu_file):
    metric_list = ['addtocartbulk', 'checkLogin', 'checkoutoptions',
                   'login', 'logout', 'main', 'orderhistory', 'quickadd']

    delete = []
    for i in xrange(len(data[2]) - 1):
        if not data[2][i]:
            delete.append(i)

    # delete data
    data, category_list, delete = remove_data(data, category_list, delete)

    # find the metrics that are not in metric_list
    for i in xrange(len(data[2]) - 1):
        flag = 0
        for j in range(8):
            if category_list[i] == metric_list[j]:
                flag = 1
        if flag == 0:
            delete.append(i)

    # delete data
    data, category_list, delete = remove_data(data, category_list, delete)

    start_time = min(data[2][0])
    max_time = max(data[2][0])
    for i in xrange(1, len(data[2]) - 1):
        if data[2][i] and start_time > min(data[2][i]):
            start_time = min(data[2][i])
        # NOTE: this must be "max_time < ..." to find the latest arrival;
        # the original "max_time > ..." comparison was inverted
        if data[2][i] and max_time < max(data[2][i]):
            max_time = max(data[2][i])

    samples = int(math.floor((max_time - start_time) / period))
    print_message('Number of samples (interval:%s) : %s' % (period, samples))

    for i in xrange(len(data[2]) - 1):
        end_time = start_time
        departure = [a + r * 1000 for a, r in zip(data[2][i], data[3][i])]
        for n in xrange(samples):
            index = [v[0] for v in enumerate(departure)
                     if end_time <= v[1] < (end_time + period)]
            arr_index = [v[0] for v in enumerate(data[2][i])
                         if end_time <= v[1] < (end_time + period)]
            response_times = [0]
            if index:
                response_times = [data[3][i][idx] for idx in index]
            data[4][i].append(scipy.mean(response_times))
            # float division: len() and period are both integers
            data[5][i].append(float(len(index)) / period * 1000)
            data[6][i].append(len(index))
            data[7][i].append(len(arr_index))
            data[0][i].append(end_time + period)
            end_time += period

    # the number of samples for each request might not be equal
    max_num_requests = 0
    max_requests_idx = 0
    for i in xrange(len(data[2]) - 1):
        if max_num_requests < len(data[2][i]):
            max_num_requests = len(data[2][i])
            max_requests_idx = i

    for i in xrange(len(data[2]) - 1):
        data[0][i] = data[0][max_requests_idx]
        if len(data[4][i]) < len(data[0][i]):
            # pad with zeros up to the common length (extend, not append,
            # so the padding is flattened into the sample list)
            data[4][i].extend([0] * (len(data[0][i]) - len(data[4][i])))
            data[5][i].extend([0] * (len(data[0][i]) - len(data[5][i])))
            data[6][i].extend([0] * (len(data[0][i]) - len(data[6][i])))

    data[0][len(data[0]) - 1] = data[0][0]

    with open(cpu_file) as f:
        count = 0
        cpu = []
        cpu_time = []
        flag = 0
        line = f.readline()
        while line:
            cpu_num = float(line)
            if count % 2 == 0:
                if cpu_num > 1 or math.isnan(cpu_num):
                    flag = 1
                else:
                    cpu.append(cpu_num)
            else:
                if flag:
                    flag = 0
                else:
                    cpu_time.append(cpu_num)
            count += 1
            line = f.readline()

    cpu_time = [e - 3600 * 1000 for e in cpu_time]
    indices = [i[0] for i in sorted(enumerate(cpu_time), key=lambda x: x[1])]
    cpu_time = [cpu_time[i] for i in indices]
    cpu = [cpu[i] for i in indices]

    for i in xrange(len(data[0][0])):
        indices_found = [v[0] for v in enumerate(cpu_time)
                         if data[0][0][i] <= v[1] < data[0][0][i] + period]
        if indices_found:
            mean = scipy.mean([cpu[ci] for ci in indices_found])
            data[1][len(data[1]) - 1].append(mean)

    return data
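# [Example] A standalone sketch of the fixed-window bucketing used in
# format_data: arrival timestamps (in ms) are grouped into consecutive
# windows of `period` ms starting from the earliest arrival, and each
# window is then summarised.
def _sketch_window_bucketing():
    import math
    period = 60000
    arrivals = [1000.0, 20000.0, 70000.0, 130000.0]
    start_time = min(arrivals)
    samples = int(math.floor((max(arrivals) - start_time) / period))
    end_time = start_time
    counts = []
    for _ in xrange(samples):
        index = [v[0] for v in enumerate(arrivals)
                 if end_time <= v[1] < end_time + period]
        counts.append(len(index))
        end_time += period
    return counts  # [2, 1]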
def process_access_log(bucket, elb_region, elb_name):
    """Retrieve the access logs of an ELB that are stored in S3 buckets
    and calculate the total amount of data processed by the ELB.

    Log file format:
    {Bucket}/{Prefix}/AWSLogs/{AWS AccountID}/elasticloadbalancing/
    {Region}/{Year}/{Month}/{Day}/{AWS Account ID}_elasticloadbalancing_
    {Region}_{Load Balancer Name}_{End Time}_{Load Balancer IP}_
    {Random String}.log

    :param bucket: the S3 bucket that stores the access logs of the ELB
    :param elb_region: region of the elastic load balancer whose access
           logs are being retrieved
    :param elb_name: name of the elastic load balancer
    :return: total amount of data processed by the ELB during the
             measurement interval
    """
    if not bucket:
        print 'S3 bucket object required'
        return

    # start new threads for downloading and reading each matching log file
    data_accumulator = DataAccumulatorManager()

    # record the start time to calculate the time elapsed
    # start_time = time.time()

    # Since the actual log emission time varies, we determine when to stop
    # waiting for logs by the number of logs retrieved. Each log represents
    # ELB access of either 5 minutes or 1 hour.

    # counter for the number of logs obtained
    logs_obtained = 0
    expected_logs_to_obtain = get_expected_num_logs()

    # keep track of the last log emission minute that was dealt with
    last_expected_time = None

    # flag indicating that the process needs to stop before obtaining the
    # expected number of logs that matches the measurement interval
    need_to_stop = False

    # check whether it has reached the end of the measurement interval
    # while (time.time() - start_time) / 60 <= m_interval:
    while logs_obtained < expected_logs_to_obtain and not need_to_stop:

        request_headers, max_waiting_time, last_expected_time \
            = calculate_key_prefix(elb_region, elb_name, last_expected_time)

        matching_keys = []
        # In case the total waiting time exceeds the S3 logging interval
        # (e.g. 5 min) we need to recalculate the next expected log name
        time_counter = 0

        # wait for the polling interval while the log is not available
        while not matching_keys or len(matching_keys) < 2:
            print_message('')
            print_message('Searching for bucket key(s) that start with: %s'
                          % request_headers['prefix'])
            matching_keys = bucket.search_key(parameters=request_headers)

            if matching_keys:
                for m_key in matching_keys:
                    print_message('Found %s' % m_key)
                if len(matching_keys) > 1:
                    break

            print_message('Time elapsed since current searching: %s min(s)'
                          % (time_counter / 60))

            # Check whether we need to wait for a new log.
            # There can be up to 5 minutes of delay for the actual log
            # delivery:
            # http://docs.aws.amazon.com/ElasticLoadBalancing/latest/
            # DeveloperGuide/access-log-collection.html
            # The next expected minute should be recalculated based on its
            # last value, i.e. the last "next expected minute"
            if time_counter > max_waiting_time:
                # Generally, if the waiting time exceeds the calculated
                # maximum, either there was some error during the ELB
                # access log emission to S3, in which case the waiting
                # time is unpredictable, OR it is the end of the
                # simulation. In either case we need to stop waiting
                # for S3
                need_to_stop = True
                break

            print_message('Waiting for log to be emitted (polling interval '
                          '%s seconds) ...\n' % log_polling_interval)
            time.sleep(log_polling_interval)
            time_counter += log_polling_interval

        if need_to_stop:
            break

        for key_name in matching_keys:
            key = bucket.get_key(key_name=key_name)

            # compose the log file directory
            segment = key_name.split('/')
            log_file_name = segment[len(segment) - 1]
            log_file_path_dir = log_file_dir + bucket.name
            if not os.path.exists(log_file_path_dir):
                os.makedirs(log_file_path_dir)
            log_file_path = log_file_path_dir + '/' + log_file_name

            # download and process each log file simultaneously
            data_accumulator.start_tasks(data_accumulator.read_log,
                                         'data_accumulator',
                                         (key, log_file_path))

        # collect the results from each thread
        data_accumulator.collect_results()
        print_message('Total amount of data (bytes) received by \'%s\' from '
                      'clients so far: %s'
                      % (elb_name, data_accumulator.client_sent))
        print_message('Total amount of data (bytes) sent by \'%s\' to '
                      'clients so far: %s'
                      % (elb_name, data_accumulator.client_receive))

        logs_obtained += 1
        print_message('Access log of \'%s\' obtained so far : %s\n'
                      % (elb_name, logs_obtained))

    return data_accumulator.client_sent, data_accumulator.client_receive
def clients_optimisation(avg_data_in_per_reqs, avg_data_out_per_reqs,
                         client, elb_prices, latency_results_dict,
                         measurement_interval, service_rates, stations,
                         total_request_per_client, queue):
    # bandwidths for each client
    in_bandwidths = []
    out_bandwidths = []

    # average data per request
    avg_in_data = []
    avg_out_data = []

    service_rates_list = []
    station_latency = []

    in_band_dict, out_band_dict = get_stations_bandwidth(client)

    # Budget e.g. 100,000 / 30 / 24 / 60 / interval
    # Not the total budget; an abstract budget for the interval
    budget = 1000

    client_avg_data_in_per_reqs = avg_data_in_per_reqs[client]
    client_avg_data_out_per_reqs = avg_data_out_per_reqs[client]
    request_sum = total_request_per_client[client]

    # get the latency from this client to each station
    station_latency_dict = dict()
    for key_str, latency_val in latency_results_dict.iteritems():
        src_host, dst_host = key_str.split(',')
        if client == src_host:
            station_latency_dict.update({dst_host: latency_val})

    for station in stations:
        # convert from Mb/s to GB/s
        in_band = float(in_band_dict[station]) / 8 / 1024
        out_band = float(out_band_dict[station]) / 8 / 1024
        in_bandwidths.append(in_band)
        out_bandwidths.append(out_band)

        avg_in_data.append(client_avg_data_in_per_reqs[station])
        avg_out_data.append(client_avg_data_out_per_reqs[station])

        service_rates_list.append(service_rates[station])
        station_latency.append(station_latency_dict[station])

    weights = optimisation(num_of_stations=2,
                           total_requests=request_sum,
                           elb_prices=elb_prices,
                           avg_data_in_per_reqs=avg_in_data,
                           avg_data_out_per_reqs=avg_out_data,
                           in_bandwidths=in_bandwidths,
                           out_bandwidths=out_bandwidths,
                           budget=budget,
                           service_rates=service_rates_list,
                           measurement_interval=measurement_interval,
                           station_latency=station_latency)

    print_message('Weights calculated for client %s: %s' % (client, weights))

    # The weights are fractions initially, but Route 53 only accepts
    # integer weights between 0 and 255, so we convert the ratio of
    # weights to a ratio of integers. This scaling should match the
    # searching step of the optimisation
    weights = [int(val * 255) for val in weights]

    route53_conn = Route53Connection()
    zone = route53_conn.get_zone(base_domain)
    elb_records = station_metadata_map['StationELBDNS']
    alias_zone_id = {'xueshi-station-1': 'Z32O12XQLNTSW2',
                     'xueshi-station-2': 'Z35SXDOTRQ7X7K'}

    # need to be mapped to IPs
    clients_regions = {'ap_south_1_client_1': 'ap-southeast-1',
                       'us_east_1_client_1': 'us-east-1',
                       'us_west_1_client_1': 'us-west-1'}

    identifiers = dict(cfg.items('StationWRRAliasIdentifiers'))

    stations = get_available_stations()

    # Since the optimisation parameters are filled in the order of the
    # available stations, the output weights should be in the same order
    station_weights = {}
    for idx in xrange(len(stations)):
        station_weights.update({stations[idx]: int(round(weights[idx]))})

    rrs = ResourceRecordSets(route53_conn, zone.id)
    for s_name, weights_val in station_weights.iteritems():
        alias_dns_name = elb_records[s_name]
        host_zone_id = alias_zone_id[s_name]

        # client region, not station region
        region_name = clients_regions[client]
        dns_record_name = '%s.%s' % (region_name, base_domain)

        identifier = identifiers[s_name]
        base_record = dict(name=dns_record_name,
                           record_type="A", weight=weights_val,
                           identifier=identifier)

        print_message('[Debug]: weight before sending change request %s'
                      % weights_val)
        new = rrs.add_change(action="UPSERT", **base_record)
        new.set_alias(host_zone_id, unicode(alias_dns_name), False)

        # retry in case the request is rejected because Route 53 is still
        # processing a prior request
        succeed = False
        while not succeed:
            try:
                rrs.commit()
                succeed = True
            except UnsuccessfulRequestError as e:
                retriable_err = 'The request was rejected because Route 53 ' \
                                'was still processing a prior request'
                if retriable_err in e.body:
                    # pause for a while before sending another request
                    time.sleep(2.5)
                    print_message('Previous request to Route 53 in '
                                  'progress.\n Re-sending request...')
                else:
                    # not a retriable error; re-raise rather than
                    # retrying forever
                    raise

    print_message('Weights set for client %s: %s' % (client, weights))
    log_info(metric_record_file,
             'Weights set for client %s: %s' % (client, weights))
    queue.put((client, weights))
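# [Example] A worked sketch of the weight scaling used in
# clients_optimisation: fractional weights from the optimiser are mapped
# onto Route 53's integer range (0-255) while preserving their ratio.
def _sketch_weight_scaling():
    weights = [0.3, 0.7]  # hypothetical optimiser output
    return [int(val * 255) for val in weights]  # [76, 178]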
def process_monitor_log(base_dir, observer_addr, line_counter, queue):
    """Parse the monitor log and calculate various metrics for all servers
    in the service station monitored by the observer

    :param base_dir: Base directory of the monitor log
    :param observer_addr: Service station name and observer IP pair
    :param line_counter: Counter for continuously reading the single log
           file
    :param queue: Queue that stores the metrics needed for the
           optimisation generated by the current thread
    """
    station_name, observer_ip = observer_addr.split('=')

    # every thread stores its data in a separate folder
    base_dir = base_dir + station_name + '/'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    monitor_log_path = base_dir + 'observer_log.txt'

    # flag indicating whether the logs of all servers contain useful data
    all_has_info = False
    while not all_has_info:
        print_message('')
        print_message('Synchronising log from observer at: %s' % observer_ip)
        module_path = os.path.dirname(Resources.__file__)
        private_key_file_path = module_path + '/ec2_private_key'
        sync_files(host_ip=observer_ip, username='******',
                   host_file_path='~/results.txt',
                   pk_path=private_key_file_path, dst_loc=monitor_log_path)

        parsed_log_dir, line_counter = parse_monitor_log(
            base_dir, int(line_counter))

        result_queue = generate_data(parsed_log_dir)

        # the observer log contains ResponseInfo if the queue is not
        # empty; check that every server has response info
        # TODO: the number of servers can be read from config
        if result_queue.qsize() > 1:
            all_has_info = True
        else:
            time.sleep(5)

    data_list = []
    while not result_queue.empty():
        server_data = result_queue.get()
        data_list.append(server_data)

    total_requests, arrival_rate, service_rate_para_list \
        = calculate_metrics(data_list)

    result_dict = {'station_name': station_name,
                   'total_requests': total_requests,
                   'arrival_rate': arrival_rate,
                   'service_rate_para_list': service_rate_para_list,
                   'line_counter': line_counter}
    queue.put(result_dict)
def calculate_metrics(data_list):
    """Calculate the metrics needed for the optimisation.

    What needs to be calculated:
    1. The total number of requests
    2. The request arrival rate of the station (lambda)
    3. The metrics for calculating the service rate of the station (mu)

    :param data_list: list of data for each server in the current service
           station
    """
    # "vm, number of requests, cpu_core, data" dict list
    service_rate_para_list = []

    # list that stores the average arrival rate of each server
    avg_server_arrival_rate = []

    # collecting relevant parameters
    for vm_data in data_list:
        vm_name = vm_data[0]
        data = vm_data[1]

        arrivals_list = []  # list that stores the average arrival rate
                            # of "each sampling interval" for a single
                            # server

        # sum the arrivals of each request type at the same sampling
        # interval
        for i in xrange(len(data[0][0])):
            sampling_interval_arrivals = 0
            for j in xrange(len(data[2]) - 1):
                sampling_interval_arrivals += data[7][j][i]
            # store the overall arrival rate of each sampling interval in
            # order to estimate the service rate with the CPU utilisation,
            # which is also collected during each sampling interval
            arrivals_list.append(sampling_interval_arrivals)

        # The mean of the arrivals over all sampling intervals is the
        # arrival rate, the unit time of which is the sampling interval
        # (in this case 1 minute)
        avg_server_arrival_rate.append(numpy.mean(arrivals_list))

        # calculate the service rate
        num_of_requests = calculate_total_requests(data)
        para_tuple = {'vm_name': vm_name,
                      'num_of_requests': num_of_requests,
                      'data': data}

        vm_cpu_spec = dict(cfg.items('VMSpec'))
        cpu_core = [vm_cpu_spec[spec] for spec in vm_cpu_spec.keys()
                    if spec in vm_name]
        if not cpu_core:
            print 'No specification configured for VM \'%s\'' % vm_name
            return
        print_message('[Debug] Number of CPUs of \'%s\': %s'
                      % (vm_name, cpu_core))

        para_tuple.update({'cpu_cores': cpu_core})
        service_rate_para_list.append(para_tuple)

    # calculate the total number of requests
    total_requests = sum(
        [p['num_of_requests'] for p in service_rate_para_list])
    station_arrival_rate = sum(avg_server_arrival_rate)

    return total_requests, station_arrival_rate, service_rate_para_list
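# [Example] A worked sketch of the arrival-rate calculation in
# calculate_metrics: with two request categories sampled over three
# 1-minute intervals, the per-interval arrivals are summed across
# categories and then averaged.
def _sketch_arrival_rate():
    import numpy
    per_category_arrivals = [[2, 4, 3],   # category 0, arrivals/interval
                             [1, 0, 2]]   # category 1
    interval_sums = [sum(col) for col in zip(*per_category_arrivals)]
    return numpy.mean(interval_sums)  # (3 + 4 + 5) / 3 = 4.0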
def parse_monitor_log(input_files_dir, line_counter):
    if not input_files_dir:
        print 'Please supply the directory that contains observer results'
        return

    original_file_dir = input_files_dir
    parsed_file_dir = original_file_dir + "parsed_results/"
    if not os.path.exists(parsed_file_dir):
        os.makedirs(parsed_file_dir)

    # get the time to store the current reading
    current_time = time.strftime("%Y_%m%d_%H%M")

    files = [f for f in os.listdir(original_file_dir)
             if f.startswith('observer_log')]

    # for i in xrange(len(files)):
    file_name = files[0]
    sub_folder_name = '%s/%s/' % (file_name[0:file_name.rfind('.')],
                                  current_time)

    metric_name = None
    # keep track of the last metric id in order to save the reading of
    # the last metric when a new metric is encountered
    last_metric_id = None
    metric_value = None
    timestamps = None
    vm_id = None

    file_path = os.path.abspath(original_file_dir + '/' + file_name)

    # debug
    # with open(file_path) as test_f:
    #     print_message('File length: %s' % (len(test_f.readlines())))
    #     print_message('Line counter %s\n' % line_counter)

    # counter for the number of skips due to the incorrect format of some
    # csparql log entries
    skip_counter = 0

    with open(file_path) as f:
        count = 0  # counter to skip to the line left over from last time
        if line_counter != 0:
            for current_line in f:
                if count == int(line_counter):
                    break
                count += 1

        # skip to the line after ObserverReceivedTimestamp
        # (the literal below matches the spelling emitted by the observer)
        for current_line in f:
            line_counter += 1
            line_segments = current_line.split("\t")
            if len(line_segments) < 3:
                continue
            if 'ObserverReceivedTimesampt' in line_segments[1]:
                break

        # The csparql log sometimes contains "MonitoringDatum" entries
        # that are mixed up, hence we explicitly set the program to expect
        # metric properties in a fixed order and to skip the mixed
        # entries, since they are way too difficult to parse for an
        # external program like this one; it is really the job of the
        # csparql observer to print metrics in a consistent format, which
        # would be the proper fix for this issue
        expected_properties = ['isAbout', 'isProducedBy',
                               'hasMonitoredMetric', 'hasValue',
                               'hasTimeStamp']
        expected_prop_idx = 0  # counter to keep track of the order of
                               # the expected properties encountered

        for current_line in f:
            line_counter += 1

            # parse the current line of the file:
            # [metric_id, metric_prop, value, dump]
            line_segments = current_line.split("\t")
            if len(line_segments) < 3:
                continue
            metric_id = line_segments[0]
            metric_prop = line_segments[1]
            value = line_segments[2]

            # if the line is not about a metric, continue to the next line
            if 'MonitoringDatum' not in metric_id:
                continue

            # for the first time
            if not last_metric_id:
                last_metric_id = metric_id

            # Check that the metric property is currently expected and
            # that the metric id is the same one we had been reading the
            # other expected properties for. If the property is not
            # expected, skip to the next bundle of metrics, i.e. the next
            # 'isAbout'. OR, if it is expected but the metric ID differs
            # from the one we've been reading, skip as well. The exception
            # is when expecting 'isAbout', i.e. the first property: there
            # we don't check the metric ID, since it could be a new metric
            if metric_prop != expected_properties[expected_prop_idx] or \
                    (expected_prop_idx != 0 and
                     metric_id != last_metric_id):
                # as long as it enters here it is one skip already
                skip_counter += 1
                # if an 'isAbout' is encountered we assume the following
                # lines could possibly be in order
                if metric_prop == 'isAbout':
                    # start fresh
                    expected_prop_idx = 0
                    last_metric_id = metric_id
                else:
                    # skip to the next 'isAbout' line
                    expected_prop_idx = 0  # expecting the 'isAbout'
                    last_metric_id = None
                continue

            # if the metric property is the expected one, expect the next
            if expected_prop_idx == len(expected_properties) - 1:
                expected_prop_idx = 0
            else:
                expected_prop_idx += 1

            # save the reading of the last metric, since the following
            # readings will be of a new metric id
            if last_metric_id != metric_id:
                # Check whether any of the 4 properties (vm_id, metric
                # name, metric value, timestamp) is None, which means the
                # very end of the file has one record that is only
                # partially written; in that case ignore the record.
                # This "if" shouldn't be needed since the mixed-up records
                # are skipped above, but for some reason the values below
                # can still be None. It would be much more convenient to
                # fix the csparql log rather than working out the logic to
                # patch up for this
                if vm_id and metric_name and metric_value and timestamps:
                    result_dir_path = parsed_file_dir + sub_folder_name \
                        + vm_id
                    if not os.path.exists(result_dir_path):
                        os.makedirs(result_dir_path)
                    result_file_path = \
                        result_dir_path + '/' + metric_name + '.txt'
                    with open(result_file_path, 'a') as parsed_f:
                        parsed_f.write(metric_value + '\n')
                        parsed_f.write(timestamps + '\n')

                # the current metric becomes the last one after its value
                # has been processed
                last_metric_id = metric_id

            if metric_prop == "isAbout":
                vm_id = value.replace("Compute#", "")
            elif metric_prop == "hasMonitoredMetric":
                metric_name = value.replace("QoSMetric#", "")
            elif metric_prop == "hasValue":
                metric_value = value
            elif metric_prop == "hasTimeStamp":
                timestamps = value

    print_message('[Debug] Skipped: %s' % skip_counter)
    return parsed_file_dir + sub_folder_name, line_counter
def clients_optimisation(avg_data_in_per_reqs, avg_data_out_per_reqs, client,
                         elb_prices, latency_results_dict,
                         measurement_interval, service_rates, stations,
                         total_request_per_client, queue):
    # bandwidths for each client
    in_bandwidths = []
    out_bandwidths = []
    # average data per request
    avg_in_data = []
    avg_out_data = []
    service_rates_list = []
    station_latency = []

    in_band_dict, out_band_dict = get_stations_bandwidth(client)

    # Budget for the interval, e.g. 100,000 / 30 / 24 / 60 / interval.
    # This is not the total budget but an abstract per-interval budget.
    budget = 1000

    client_avg_data_in_per_reqs = avg_data_in_per_reqs[client]
    client_avg_data_out_per_reqs = avg_data_out_per_reqs[client]
    request_sum = total_request_per_client[client]

    # get the latency from this client to each station
    station_latency_dict = dict()
    for key_str, latency_val in latency_results_dict.iteritems():
        src_host, dst_host = key_str.split(',')
        if client == src_host:
            station_latency_dict.update({dst_host: latency_val})

    for station in stations:
        # convert from Mb/s to GB/s
        in_band = float(in_band_dict[station]) / 8 / 1024
        out_band = float(out_band_dict[station]) / 8 / 1024
        in_bandwidths.append(in_band)
        out_bandwidths.append(out_band)
        avg_in_data.append(client_avg_data_in_per_reqs[station])
        avg_out_data.append(client_avg_data_out_per_reqs[station])
        service_rates_list.append(service_rates[station])
        station_latency.append(station_latency_dict[station])

    weights = optimisation(num_of_stations=2,
                           total_requests=request_sum,
                           elb_prices=elb_prices,
                           avg_data_in_per_reqs=avg_in_data,
                           avg_data_out_per_reqs=avg_out_data,
                           in_bandwidths=in_bandwidths,
                           out_bandwidths=out_bandwidths,
                           budget=budget,
                           service_rates=service_rates_list,
                           measurement_interval=measurement_interval,
                           station_latency=station_latency)

    print_message('Weights calculated for client %s: %s' % (client, weights))

    # The weights are initially fractions, but Route53 only accepts an
    # integer weight between 0 and 255, so we convert the ratio of
    # fractions to a ratio of integers. This scaling should match the
    # search step of the optimisation.
    weights = [int(val * 255) for val in weights]

    route53_conn = Route53Connection()
    zone = route53_conn.get_zone(base_domain)
    elb_records = station_metadata_map['StationELBDNS']
    alias_zone_id = {
        'xueshi-station-1': 'Z32O12XQLNTSW2',
        'xueshi-station-2': 'Z35SXDOTRQ7X7K'
    }
    # client host names mapped to their AWS regions
    clients_regions = {
        'ap_south_1_client_1': 'ap-southeast-1',
        'us_east_1_client_1': 'us-east-1',
        'us_west_1_client_1': 'us-west-1'
    }
    identifiers = dict(cfg.items('StationWRRAliasIdentifiers'))
    stations = get_available_stations()

    # Since the optimisation parameters were supplied in the order of the
    # available stations, the output weights are in the same order
    station_weights = {}
    for idx in xrange(len(stations)):
        station_weights.update({stations[idx]: int(round(weights[idx]))})

    rrs = ResourceRecordSets(route53_conn, zone.id)
    for s_name, weights_val in station_weights.iteritems():
        alias_dns_name = elb_records[s_name]
        host_zone_id = alias_zone_id[s_name]
        # the client region, not the station region
        region_name = clients_regions[client]
        dns_record_name = '%s.%s' % (region_name, base_domain)
        identifier = identifiers[s_name]
        base_record = dict(name=dns_record_name,
                           record_type="A",
                           weight=weights_val,
                           identifier=identifier)
        print_message('[Debug]: weight before sending change request %s'
                      % weights_val)
        new = rrs.add_change(action="UPSERT", **base_record)
        new.set_alias(host_zone_id, unicode(alias_dns_name), False)

    # commit with retry, in case the request is rejected because Route 53
    # is still processing a prior change
    succeed = False
    while not succeed:
        try:
            rrs.commit()
            succeed = True
        except UnsuccessfulRequestError as e:
            retriable_err = 'The request was rejected because Route 53 ' \
                            'was still processing a prior request'
            if retriable_err not in e.body:
                raise  # not a retriable error, so propagate it
            # pause for a while before sending another request
            time.sleep(2.5)
            print_message('Previous request to Route 53 in progress.\n '
                          'Re-sending request...')

    print_message('Weights set for client %s: %s' % (client, weights))
    log_info(metric_record_file,
             'Weights set for client %s: %s' % (client, weights))
    queue.put((client, weights))
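
# ----------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): Route53 weighted
# records require integer weights in [0, 255], so fractional weights are
# scaled by 255 and truncated. The hypothetical helper below also guards
# against a small non-zero fraction truncating to 0, which would
# silently drop a station from the DNS rotation.
def _example_route53_weights(fractions):
    weights = [int(f * 255) for f in fractions]
    return [max(w, 1) if f > 0 else 0
            for w, f in zip(weights, fractions)]

# _example_route53_weights([0.7, 0.3])      -> [178, 76]
# _example_route53_weights([0.999, 0.001])  -> [254, 1]
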
def calculate_metrics(data_list):
    """ Function to calculate the metrics needed for the optimisation.

    What needs to be calculated:

    1. The total number of requests
    2. The request arrival rate of the station (lambda)
    3. The parameters for calculating the service rate of the station (mu)

    :param data_list: list of data for each server in the current
        service station
    """
    # list of "vm, number of requests, cpu_core, data" dicts
    service_rate_para_list = []
    # list that stores the average arrival rate of each server
    avg_server_arrival_rate = []

    # collect the relevant parameters
    for vm_data in data_list:
        vm_name = vm_data[0]
        data = vm_data[1]

        # list that stores the average arrival rate of "each sampling
        # interval" for a single server
        arrivals_list = []
        # sum the arrivals of all request types at the same sampling interval
        for i in xrange(len(data[0][0])):
            sampling_interval_arrivals = 0
            for j in xrange(len(data[2]) - 1):
                sampling_interval_arrivals += data[7][j][i]
            # store the overall arrival rate of each sampling interval in
            # order to estimate the service rate with the CPU utilisation,
            # which is also collected during each sampling interval
            arrivals_list.append(sampling_interval_arrivals)

        # The mean of the arrivals over all sampling intervals is the
        # arrival rate, whose unit of time is the sampling interval
        # (in this case 1 minute)
        avg_server_arrival_rate.append(numpy.mean(arrivals_list))

        # parameters for calculating the service rate
        num_of_requests = calculate_total_requests(data)
        para_tuple = {'vm_name': vm_name,
                      'num_of_requests': num_of_requests,
                      'data': data}

        vm_cpu_spec = dict(cfg.items('VMSpec'))
        cpu_core = [vm_cpu_spec[spec] for spec in vm_cpu_spec.keys()
                    if spec in vm_name]
        if not cpu_core:
            print 'No specification configured for VM \'%s\'' % vm_name
            return
        print_message('[Debug] Number of CPUs of \'%s\': %s'
                      % (vm_name, cpu_core))

        para_tuple.update({'cpu_cores': cpu_core})
        service_rate_para_list.append(para_tuple)

    # calculate the total number of requests
    total_requests = sum([p['num_of_requests']
                          for p in service_rate_para_list])
    station_arrival_rate = sum(avg_server_arrival_rate)

    return total_requests, station_arrival_rate, service_rate_para_list
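
# ----------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): how the station
# arrival rate (lambda) falls out of per-interval counts. The counts
# below are invented; in calculate_metrics they come from the parsed
# observer data (data[7]).
def _example_station_arrival_rate():
    import numpy
    server_arrivals = [
        [120, 130, 110],   # server 1: arrivals per 1-minute interval
        [80, 90, 100],     # server 2
    ]
    # per-server rate: mean arrivals per sampling interval
    per_server_rate = [numpy.mean(a) for a in server_arrivals]
    print per_server_rate        # [120.0, 90.0]
    # station arrival rate: sum over all servers
    print sum(per_server_rate)   # 210.0 requests per minute
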
def format_data(data, period, category_list, cpu_file):
    metric_list = ['addtocartbulk', 'checkLogin', 'checkoutoptions',
                   'login', 'logout', 'main', 'orderhistory', 'quickadd']

    # mark request types with no data for deletion
    delete = []
    for i in xrange(len(data[2]) - 1):
        if not data[2][i]:
            delete.append(i)
    # delete data
    data, category_list, delete = remove_data(data, category_list, delete)

    # find the metrics that are not in metric_list
    for i in xrange(len(data[2]) - 1):
        if category_list[i] not in metric_list:
            delete.append(i)
    # delete data
    data, category_list, delete = remove_data(data, category_list, delete)

    # find the earliest and latest timestamps across all request types
    start_time = min(data[2][0])
    max_time = max(data[2][0])
    for i in xrange(1, len(data[2]) - 1):
        if data[2][i] and start_time > min(data[2][i]):
            start_time = min(data[2][i])
        if data[2][i] and max_time < max(data[2][i]):
            max_time = max(data[2][i])

    samples = int(math.floor((max_time - start_time) / period))
    print_message('Number of samples (interval:%s) : %s' % (period, samples))

    for i in xrange(len(data[2]) - 1):
        end_time = start_time
        # departure time = arrival time + response time (converted to ms)
        departure = [a + r * 1000 for a, r in zip(data[2][i], data[3][i])]
        for k in xrange(samples):
            index = [v[0] for v in enumerate(departure)
                     if end_time <= v[1] < (end_time + period)]
            arr_index = [v[0] for v in enumerate(data[2][i])
                         if end_time <= v[1] < (end_time + period)]
            response_times = [0]
            if index:
                response_times = [data[3][i][idx] for idx in index]
            data[4][i].append(scipy.mean(response_times))  # mean response time
            data[5][i].append(len(index) / period * 1000)  # throughput
            data[6][i].append(len(index))                  # completions
            data[7][i].append(len(arr_index))              # arrivals
            data[0][i].append(end_time + period)
            end_time += period

    # The number of samples for each request type might not be equal, so
    # pad the shorter series with zeros up to the longest one
    max_num_requests = 0
    max_requests_idx = 0
    for i in xrange(len(data[2]) - 1):
        if max_num_requests < len(data[2][i]):
            max_num_requests = len(data[2][i])
            max_requests_idx = i
    for i in xrange(len(data[2]) - 1):
        data[0][i] = data[0][max_requests_idx]
        if len(data[4][i]) < len(data[0][i]):
            # extend (not append) so each series stays a flat list
            data[4][i].extend([0] * (len(data[0][i]) - len(data[4][i])))
            data[5][i].extend([0] * (len(data[0][i]) - len(data[5][i])))
            data[6][i].extend([0] * (len(data[0][i]) - len(data[6][i])))
    data[0][len(data[0]) - 1] = data[0][0]

    # Parse the CPU utilisation file: readings alternate between a
    # utilisation value (even lines) and its timestamp (odd lines).
    # Discard out-of-range or NaN readings together with their timestamps.
    with open(cpu_file) as f:
        count = 0
        cpu = []
        cpu_time = []
        flag = 0
        for line in f:
            cpu_num = float(line)
            if count % 2 == 0:
                if cpu_num > 1 or math.isnan(cpu_num):
                    flag = 1
                else:
                    cpu.append(cpu_num)
            else:
                if flag:
                    flag = 0
                else:
                    cpu_time.append(cpu_num)
            count += 1

    # shift the timestamps by one hour (in ms) and sort the readings
    # chronologically
    cpu_time = [e - 3600 * 1000 for e in cpu_time]
    indices = [i[0] for i in sorted(enumerate(cpu_time), key=lambda x: x[1])]
    cpu_time = [cpu_time[i] for i in indices]
    cpu = [cpu[i] for i in indices]

    # average the CPU readings that fall into each sampling interval
    for i in xrange(len(data[0][0])):
        indices_found = [v[0] for v in enumerate(cpu_time)
                         if data[0][0][i] <= v[1] < data[0][0][i] + period]
        if indices_found:
            mean = scipy.mean([cpu[j] for j in indices_found])
            data[1][len(data[1]) - 1].append(mean)

    return data
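
# ----------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): the core of
# format_data is binning timestamps into fixed [start, start + period)
# windows. The sample timestamps (in ms) are invented.
def _example_bin_events(timestamps, period):
    import math
    start, end = min(timestamps), max(timestamps)
    samples = int(math.floor((end - start) / period))
    counts = []
    for k in xrange(samples):
        lo = start + k * period
        counts.append(len([t for t in timestamps
                           if lo <= t < lo + period]))
    return counts

# _example_bin_events([0, 100, 150, 900, 1400, 2000], 1000) -> [4, 1]
# (the event at 2000 falls outside the last full window, as in
# format_data, where only complete periods are sampled)
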
def process_monitor_log(base_dir, observer_addr, line_counter, queue):
    """Parse the monitor log and calculate the metrics of all servers in
    the service station monitored by the observer

    :param base_dir: Base directory of the monitor log
    :param observer_addr: Service station name and observer IP pair
    :param line_counter: Counter for continuously reading the single log file
    :param queue: Queue that stores the metrics needed for the optimisation
        generated by the current thread
    """
    station_name, observer_ip = observer_addr.split('=')

    # every thread stores its data in a separate folder
    base_dir = base_dir + station_name + '/'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    monitor_log_path = base_dir + 'observer_log.txt'

    # flag indicating whether the logs of all servers contain useful data
    all_has_info = False
    while not all_has_info:
        print_message('')
        print_message('Synchronising log from observer at: %s' % observer_ip)
        module_path = os.path.dirname(Resources.__file__)
        private_key_file_path = module_path + '/ec2_private_key'
        sync_files(host_ip=observer_ip, username='******',
                   host_file_path='~/results.txt',
                   pk_path=private_key_file_path, dst_loc=monitor_log_path)

        parsed_log_dir, line_counter = parse_monitor_log(base_dir,
                                                         int(line_counter))
        result_queue = generate_data(parsed_log_dir)

        # The observer log contains ResponseInfo if the queue is not
        # empty; check that all servers have response info.
        # TODO: the number of servers can be read from the config
        if result_queue.qsize() > 1:
            all_has_info = True
        else:
            time.sleep(5)

    data_list = []
    while not result_queue.empty():
        server_data = result_queue.get()
        data_list.append(server_data)

    total_requests, arrival_rate, service_rate_para_list \
        = calculate_metrics(data_list)

    result_dict = {'station_name': station_name,
                   'total_requests': total_requests,
                   'arrival_rate': arrival_rate,
                   'service_rate_para_list': service_rate_para_list,
                   'line_counter': line_counter}
    queue.put(result_dict)
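
# ----------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): the worker/queue
# pattern that ThreadingManager wraps, shown with the standard library.
# The station names and result dict are invented stand-ins.
def _example_worker_queue():
    import threading
    import Queue  # 'queue' on Python 3

    def worker(station_name, result_queue):
        # stand-in for process_monitor_log: one result dict per station
        result_queue.put({'station_name': station_name,
                          'total_requests': 42})

    result_queue = Queue.Queue()
    threads = [threading.Thread(target=worker, args=(name, result_queue))
               for name in ['xueshi-station-1', 'xueshi-station-2']]
    for t in threads:
        t.start()
    for t in threads:
        t.join()   # wait for all workers before draining the queue
    while not result_queue.empty():
        print result_queue.get()
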
def process_server_logs(base_dir, line_counters, total_users, waiting_time,
                        queue):
    """
    :param base_dir: Base directory of the monitor log
    :param line_counters: Counters for continuously reading the single log
        file of each station
    :param total_users: The total number of users simulated
    :param queue: Queue to store results when running in a thread
    :param waiting_time: The measurement time
    :return:
    """
    print_message('')
    print_message('Waiting for the next batch of service station monitoring '
                  'logs (%s seconds)...\n' % waiting_time)
    time.sleep(waiting_time)

    module_path = os.path.dirname(client_server.__file__)
    base_dir = module_path + '/logs/' + base_dir + '/'

    # retrieve the service station and observer mapping
    station_observers = get_station_csparql()

    # retrieve and process the log of each service station in a new thread
    csparql_reader = ThreadingManager()

    # Python strptime thread-safety bug: strptime has to be called once
    # before creating the threads. Details: http://bugs.python.org/issue11108
    time.strptime("30 Nov 00", "%d %b %y")

    for station_name, observer_ip in station_observers.iteritems():
        observer_addr = '%s=%s' % (station_name, observer_ip)
        csparql_reader.start_tasks(
            target_func=process_monitor_log,
            name='csparql_reader',
            para=[base_dir, observer_addr, line_counters[station_name]])

    # wait for all threads to finish and collect their results
    result_queue = csparql_reader.collect_results()

    total_requests = 0

    # Collect the metric data from all service stations and calculate the
    # metrics needed for the optimisation, since these metrics are
    # calculated for the entire online service
    service_station_metric_list = []
    while not result_queue.empty():
        # get the metrics returned
        result_dict = result_queue.get()
        station_name = result_dict['station_name']
        station_total_requests = result_dict['total_requests']
        arrival_rate = result_dict['arrival_rate']
        service_rate_para_list = result_dict['service_rate_para_list']
        line_counter = result_dict['line_counter']

        total_requests += station_total_requests
        # arrival_rate / 60 converts the per-minute rate to per-second
        service_station_metric = ServiceStationMetric(station_name,
                                                      station_total_requests,
                                                      arrival_rate / 60,
                                                      service_rate_para_list,
                                                      service_rate=0)
        service_station_metric_list.append(service_station_metric)

        # update the current line counter
        line_counters[station_name] = line_counter

    # now calculate the service rate of each station
    for station_metric in service_station_metric_list:
        # parameters needed for calculating the service rates of the
        # servers of one service station
        mu_para_list = station_metric.service_rate_para_list

        # list to store the service time of each server of this station
        service_time_list = []
        # service rate calculation parameters of each server ...
        for s_para in mu_para_list:
            # The number of users apportioned to this vm. Note the float
            # conversion: plain integer division would truncate to 0.
            num_of_requests = s_para['num_of_requests']
            num_of_user = int(math.ceil(
                total_users * (float(num_of_requests) / total_requests)))
            num_of_cores = s_para['cpu_cores']
            data = s_para['data']
            mean_service_time = calculate_service_rate(num_of_user,
                                                       num_of_cores, data)
            service_time_list.append(mean_service_time)
            print_message(
                'Mean service time of VM \'%s\' at station \'%s\': %s'
                % (s_para['vm_name'], station_metric.station_name,
                   str(mean_service_time)))

        # The overall service rate is calculated as the number of requests
        # completed by all servers within the time the slowest server
        # takes to complete a single request
        max_time = max(service_time_list)
        comp_req_sum = 0
        for service_time in service_time_list:
            comp_req_sum += max_time / service_time
        overall_service_rate = comp_req_sum / max_time
        station_metric.service_rate = overall_service_rate

    # store the results of this thread in the result queue
    queue.put((service_station_metric_list, total_requests))
    queue.put(line_counters)
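
# ----------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): a worked instance of
# the overall service rate formula above, with invented per-server mean
# service times (in seconds). Note that sum(max_time / t) / max_time
# reduces to sum(1 / t), i.e. the sum of the individual service rates.
def _example_overall_service_rate():
    service_time_list = [2.0, 4.0]
    max_time = max(service_time_list)   # slowest server: 4.0 s
    comp_req_sum = sum(max_time / t for t in service_time_list)
    print comp_req_sum                  # 3.0 requests completed in 4.0 s
    print comp_req_sum / max_time       # 0.75 requests per second
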