def __init__(self, host, is_secure=True, port=None, debug=0, path='/'):
    """
    Prepare the connection settings for an AWS endpoint.

    The AWS access key and secret key are not parameters; they are read
    from the ``Default`` section of ``cfg``.

    :param host: The host to connect to

    :type is_secure: boolean
    :param is_secure: Whether the connection is over SSL

    :param port: Port to connect to; when falsy, ``PORTS[is_secure]``
        is used instead

    :param debug: Fallback passed to ``cfg.get_int('Default', 'debug',
        ...)``; anything that is not an int/long is replaced by 0 first

    :param path: Stored unchanged as ``self.path``
    """
    # credentials come from the configuration, not from the caller
    self.access_key = cfg.get('Default', 'AWS_ACCESS_KEY')
    self.secret_key = cfg.get('Default', 'AWS_SECRET_KEY')

    # default number of retries value
    self.num_retries = 6

    # endpoint description
    self.is_secure = is_secure
    self.port = port if port else PORTS[is_secure]
    self.protocol = 'https' if is_secure else 'http'
    self.host = host
    self.path = path
    self.host_header = None

    # exceptions from httplib that are caught and retried
    self.http_exceptions = (
        httplib.HTTPException,
        socket.error,
        socket.gaierror,
        httplib.BadStatusLine,
    )

    # only an integer is accepted as the fallback debug level
    fallback_debug = debug if isinstance(debug, (int, long)) else 0
    self.debug = cfg.get_int('Default', 'debug', fallback_debug)

    # Set default socket time out as suggested:
    # http://docs.aws.amazon.com/amazonswf/latest/apireference/
    # API_PollForActivityTask.html
    socket_timeout = cfg.get_int('HTTPConnection', 'http_socket_timeout', 70)
    self.http_connection_kwargs = {'timeout': socket_timeout}

    self._connection = (self.host, self.port, self.is_secure)

    # looked up last, as in the original statement order
    self.auth_handler = auth.get_auth_handler(
        host, cfg, self._target_aws_service())
def __init__(self, host, is_secure=True, port=None, debug=0, path='/'):
    """
    Initialise the settings used to reach an AWS endpoint.

    Credentials are taken from the ``Default`` section of ``cfg``
    (``AWS_ACCESS_KEY`` / ``AWS_SECRET_KEY``), not from arguments.

    :param host: The host to connect to

    :type is_secure: boolean
    :param is_secure: Whether the connection is over SSL

    :param port: Explicit port; a falsy value selects
        ``PORTS[is_secure]``

    :param debug: Fallback debug level handed to ``cfg.get_int``;
        non-integer values are treated as 0

    :param path: Kept as ``self.path``
    """
    self.access_key = cfg.get('Default', 'AWS_ACCESS_KEY')
    self.secret_key = cfg.get('Default', 'AWS_SECRET_KEY')

    # default number of retries value
    self.num_retries = 6

    self.is_secure = is_secure
    if not port:
        # no explicit port: pick the one registered for this scheme
        port = PORTS[is_secure]
    self.port = port

    # catch and retry on certain exceptions from httplib
    retriable = (httplib.HTTPException, socket.error,
                 socket.gaierror, httplib.BadStatusLine)
    self.http_exceptions = retriable

    self.protocol = 'https' if is_secure else 'http'
    self.host, self.path = host, path

    # a debug value that is not an integer falls back to 0
    if isinstance(debug, (int, long)):
        debug_default = debug
    else:
        debug_default = 0
    self.debug = cfg.get_int('Default', 'debug', debug_default)
    self.host_header = None

    # Set default socket time out as suggested:
    # http://docs.aws.amazon.com/amazonswf/latest/apireference/
    # API_PollForActivityTask.html
    self.http_connection_kwargs = dict(
        timeout=cfg.get_int('HTTPConnection', 'http_socket_timeout', 70))

    self._connection = (self.host, self.port, self.is_secure)
    target_service = self._target_aws_service()
    self.auth_handler = auth.get_auth_handler(host, cfg, target_service)
def get_expected_num_logs():
    """
    Return how many ELB access logs are expected in one measurement
    interval (shared helper, kept separate for re-use).

    The result is the floor of ``measurement_interval`` (``Default``
    section, fallback 10) divided by ``log_emitting_time`` (``s3``
    section, fallback 5). The measurement interval is also printed.
    """
    interval = cfg.get_int('Default', 'measurement_interval', 10)

    print_message('')
    print_message('Measurement interval: %s' % interval)

    emit_period = cfg.get_int('s3', 'log_emitting_time', 5)
    return math.floor(interval / emit_period)
class Key(object):
    """
    Represents a key object (metadata) in an S3 bucket.

    A Key is its own iterator: each step reads up to ``BufferSize`` bytes
    from ``self.response`` so large content can be consumed piece by piece.

    NOTE(review): ``open_read()`` and ``close()`` are called below but are
    not defined in this class body as shown; they are presumably provided
    elsewhere - confirm.
    """

    DefaultContentType = 'application/octet-stream'
    # number of bytes requested from the response per iteration step
    BufferSize = cfg.get_int('s3', 'key_buffer_size', 8192)

    def __init__(self, bucket=None, name=None):
        """
        :param bucket: Object the key belongs to; only its ``name``
            attribute is used here (in ``__repr__``)
        :param name: Name of the key
        """
        self.bucket = bucket
        self.name = name
        self.content_type = self.DefaultContentType
        self.entity_tag = None
        self.last_modified = None
        self.accept_ranges = None
        self.response = None
        self.size = None

    def __repr__(self):
        if self.bucket:
            return '<Key: %s,%s>' % (self.bucket.name, self.name)
        return '<Key: None,%s>' % self.name

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next chunk (at most ``BufferSize`` bytes) of the content.

        When the response yields no more data the key is closed and
        ``StopIteration`` is raised.
        """
        self.open_read()
        data = self.response.read(self.BufferSize)
        if not data:
            self.close()
            raise StopIteration
        return data

    # Python 3 spelling of the iterator protocol; ``next`` is kept for
    # Python 2 callers.
    __next__ = next

    def get_contents_to_file(self, fp):
        """
        Write the whole content of the key to ``fp``.

        :param fp: Object with a ``write`` method

        :raises S3Exception: if writing fails with ``ENOSPC``
        :raises IOError: any other I/O error is re-raised unchanged

        If ``self.size`` is still ``None`` after a successful copy it is
        set to the number of bytes written. The key is closed whether the
        copy succeeds or fails, so a failed write no longer leaves the
        response open.
        """
        self.open_read()
        data_size = 0
        try:
            for content_fragment in self:
                fp.write(content_fragment)
                data_size += len(content_fragment)
        except IOError as e:
            if e.errno == errno.ENOSPC:
                # not every file-like object has a ``name`` attribute
                raise S3Exception('Out of space for saving file '
                                  '%s' % getattr(fp, 'name', fp))
            raise
        else:
            # size is recorded before closing, as in the success path of
            # the original implementation
            if self.size is None:
                self.size = data_size
        finally:
            self.close()
from __future__ import division
import os
import time
from connection.s3_connection import S3Connection
from etc.configuration import cfg
from utilities.multi_threading import ThreadingManager
from utilities.utils import print_message, get_expected_num_logs, \
    get_next_nth_elb_log_time, get_available_clients, station_metadata_map

# Read from the ``s3`` section of the configuration, fallback 60.
# NOTE(review): the unit is not visible here - presumably seconds; confirm.
log_polling_interval = cfg.get_int('s3', 'log_polling_interval', 60)
# Directory for ELB access logs, relative to the current working directory
# at import time.
log_file_dir = os.getcwd() + '/data_parser/s3/elb_access_logs/'


class DataAccumulatorManager(ThreadingManager):
    """
    Manages the threads that read data from the logs.

    On construction it records the available clients and builds
    ``client_ips_name_pair``, which maps
    ``station_metadata_map['ip'][client_name]`` to the client name.
    """

    def __init__(self):
        ThreadingManager.__init__(self)
        # running total, starts at zero
        self.data_sum = 0
        # per-client accumulators, filled in elsewhere
        self.client_sent = dict()
        self.client_receive = dict()
        self.available_client = get_available_clients()
        # reverse lookup: value stored under station_metadata_map['ip']
        # (presumably the client's IP address - confirm) -> client name
        self.client_ips_name_pair = dict()
        for client_name in self.available_client:
            self.client_ips_name_pair.update(
                {station_metadata_map['ip'][client_name]: client_name})
        # initialisation
        # NOTE(review): the code this comment introduces is not visible in
        # this excerpt.
def _predicted_response_time(arrival_rate, service_rate):
    """
    Return (1 / service_rate) / (1 - arrival_rate / service_rate).

    ``float('inf')`` is returned when the expression is undefined (zero
    service rate, or arrival rate equal to service rate) instead of
    raising, because the value is only used for debug output.
    """
    try:
        service_time = math.pow(service_rate, -1)
        return service_time / (1 - service_time * arrival_rate)
    except (ValueError, ZeroDivisionError):
        return float('inf')


def _estimate_requests_per_client(total_request, data_in, stations, clients):
    """
    Split ``total_request`` between clients in proportion to the amount of
    data each client sent (summed over all stations), rounding up.

    Returns a dict <client name: request count>. Every client gets 0 when
    no data was sent at all.
    """
    client_data_in_sum = dict((client, 0) for client in clients)
    data_in_sum = 0
    for station_name in stations:
        # a station without any recorded traffic contributes nothing
        sent_by_client = data_in.get(station_name) or {}
        for client, sent_data in sent_by_client.iteritems():
            client_data_in_sum[client] += sent_data
            data_in_sum += sent_data

    requests_per_client = dict()
    for client in clients:
        if data_in_sum:
            # explicit float so the share is not truncated to 0 by
            # Python 2 integer division
            share = float(client_data_in_sum[client]) / data_in_sum
        else:
            share = 0.0
        requests_per_client[client] = int(math.ceil(total_request * share))
    return requests_per_client


def _average_data_per_request(data_by_station, stations, clients,
                              requests_per_client):
    """
    Build <client name: <station name: average GB per request>> from the
    per-station, per-client byte counts in ``data_by_station``.

    A client with an estimated request count of 0 gets an average of 0.0
    rather than a division by zero.
    """
    averages = dict((client, {}) for client in clients)
    bytes_per_gb = math.pow(1024, 3)
    for station_name in stations:
        amount_by_client = data_by_station.get(station_name) or {}
        for client, amount in amount_by_client.iteritems():
            t_request = requests_per_client[client]
            if t_request:
                averages[client][station_name] = \
                    (amount / bytes_per_gb) / t_request
            else:
                averages[client][station_name] = 0.0
    return averages


def main():
    """
    Run the measure-then-optimise loop forever.

    Each round starts three tasks through ``ThreadingManager`` (latency
    measurement, service-station log processing, ELB access-log
    processing), collects their results, derives per-client request counts
    and average data per request, logs the figures, runs
    ``clients_optimisation`` for every available client and finally sleeps
    for 60 seconds.
    """
    setup_logging()

    # line counter for reading csparql logs of each service station
    line_counters = dict()

    # bucket and ELB info for getting access log from S3
    elb_buckets_dict = dict()
    elb_regions = get_station_region()
    elb_buckets = get_elb_buckets_map()
    for station, region in elb_regions.iteritems():
        elb_region_str = '%s:%s' % (station, region)
        elb_buckets_dict.update(
            {elb_region_str: unicode(elb_buckets[station])})

    stations = get_available_stations()
    for station in stations:
        line_counters.update({station: 0})

    log_base_dir = time.strftime("%Y_%m%d_%H%M")
    total_num_users = cfg.get_int('Default', 'total_num_users')

    # Get all available client region
    available_clients = get_available_clients()

    while True:
        # csparql logs and ELB access logs are processed simultaneously.
        # The csparql processing is started first because ELB access logs
        # are emitted with a delay.
        #
        # S3 only emits logs at fixed minutes of the hour, so the interval
        # measured runs from the start of the measurement to the last
        # expected emission time, not to the moment the log is obtained.
        measurement_interval = calculate_waiting_time()
        # no need to wait until log actually being obtained
        # NOTE(review): 300 is presumably seconds (5 minutes) - confirm
        measurement_interval -= 300

        # Measuring latency between each client region and service station
        latency_manager = ThreadingManager()
        latency_manager.start_task(
            target_func=measure_latency,
            name="latency_manager",
            para=[available_clients, stations, measurement_interval])

        server_log_processor = ThreadingManager()
        server_log_processor.start_task(
            target_func=process_server_logs,
            name="server_log_processor",
            para=[log_base_dir, line_counters, total_num_users,
                  measurement_interval])

        # Begin gathering info of the amount of data
        # transferred through each service station
        data_counting_task = ThreadingManager()
        data_counting_task.start_task(
            target_func=process_elb_access_log,
            name="elb_access_log_processor",
            para=[elb_buckets_dict])

        latency_results_dict = latency_manager.collect_results().get()

        # csparql results are collected first since that processing
        # completes first while the ELB data may be delayed. The queue
        # yields two items in this order: the metrics tuple, then the
        # updated line counters used by the next round.
        server_metrics_queue = server_log_processor.collect_results()
        (station_metric_list, total_request) = server_metrics_queue.get()
        line_counters = server_metrics_queue.get()
        print_message('')
        print_message('Service station logs processing finished\n')

        # collecting elb data now
        elb_data_queue = data_counting_task.collect_results()
        data_in, data_out = elb_data_queue.get()

        # ---- Preparing optimisation parameters ----

        # requests arrival rate and service rate of each service station
        arrival_rates = dict()
        service_rates = dict()
        for station_metric in station_metric_list:
            station_name = station_metric.station_name
            arrival_rate = station_metric.arrival_rate
            service_rate = station_metric.service_rate
            requests = station_metric.total_requests

            requests_msg = '\nTotal requests for station %s: %s' \
                % (station_name, requests)
            print(requests_msg)
            log_info(metric_record_file, requests_msg)

            arrival_rates.update({station_name: arrival_rate})
            service_rates.update({station_name: service_rate})

            response_time = _predicted_response_time(arrival_rate,
                                                     service_rate)
            # one message for both sinks; the logged copy used to read
            # 'currentresponse' because of a missing space
            response_msg = \
                '[Debug] predicted current response time of service ' \
                'station \'%s\': %s' % (station_name, response_time)
            print(response_msg)
            log_info(metric_record_file, response_msg)

        # TODO: calculate total requests for each client
        # TODO: if c-sparql could record the source of each requests
        # TODO: things would be much more easier
        total_request_per_client = _estimate_requests_per_client(
            total_request, data_in, stations, available_clients)

        # Average data sent / received per request, for each service
        # station and each client:
        # <Client_name: <Station: data_in_per_req>>
        avg_data_in_per_reqs = _average_data_per_request(
            data_in, stations, available_clients, total_request_per_client)
        # <Client_name: <Station: data_out_per_req>>
        avg_data_out_per_reqs = _average_data_per_request(
            data_out, stations, available_clients, total_request_per_client)

        # For testing purpose
        info_str = \
            '\n[Debug] total_request: %s\n' \
            '[Debug] avg_data_in_per_reqs: %s\n' \
            '[Debug] avg_data_out_per_reqs: %s\n' \
            '[Debug] arrival_rates: %s\n' \
            '[Debug] service_rates: %s\n' \
            % (total_request, avg_data_in_per_reqs, avg_data_out_per_reqs,
               arrival_rates, service_rates)
        print(info_str)
        log_info(metric_record_file, info_str)

        # TODO: Get elb price from config
        # ELB pricing
        elb_prices = [0.008, 0.008]

        # do optimisation for each client in a new thread
        optimiser = ThreadingManager()
        for client in available_clients:
            optimiser.start_tasks(
                target_func=clients_optimisation,
                name="optimiser",
                para=[avg_data_in_per_reqs, avg_data_out_per_reqs, client,
                      elb_prices, latency_results_dict,
                      measurement_interval, service_rates, stations,
                      total_request_per_client])

        # synchronising threads
        optimiser.collect_results()

        # wait 60 seconds for the Route 53 record changes to take effect
        # (the previous comment said "60 mins", which did not match the
        # sleep below)
        time.sleep(60)
def _make_request(self, request, retry_handler=None):
    """
    Execute the HTTP request, retrying on temporary failures.

    :param request: Request object; it is signed with ``authorize`` and
        its ``method``, ``path``, ``body``, ``headers``, ``host`` and
        ``port`` attributes are used to send it

    :param retry_handler: Optional callable invoked as
        ``retry_handler(response, counter)``. A truthy result must be a
        ``(msg, counter)`` pair: the message is logged, the attempt
        counter is replaced and the request is retried after the
        back-off delay.

    :return: The response, as soon as its status is neither a retriable
        5xx (500, 502, 503, 504) nor a redirect carrying a ``location``
        header.

    NOTE(review): the original body never slept or advanced ``counter``
    after a 5xx response or a connection error, so those paths looped
    forever. They now back off and count as an attempt. Once the attempts
    are used up, the error caught on the last attempt is re-raised, or
    else the last response is returned - confirm this is the outcome
    callers expect.
    """
    response = None
    last_error = None
    num_retries = cfg.get_int('HTTPConnection', 'num_retries',
                              self.num_retries)
    # Read once, and as an integer, so min() below compares numbers.
    # NOTE(review): cfg.get presumably returns the raw option text when
    # the option is set - confirm.
    max_retry_delay = cfg.get_int('HTTPConnection', 'max_retry_delay', 60)
    connection = self.new_http_connection(request.host, request.port)
    counter = 0
    while counter <= num_retries:
        # Use binary exponential back-off to avoid traffic congestion
        wait_time = min(random.random() * (2 ** counter), max_retry_delay)
        last_error = None
        try:
            # sign the request with AWS access key
            request.authorize(connection=self)
            # add host to header except requests made to s3
            if 's3' not in self._target_aws_service():
                self.set_host_header(request)
            request.start_time = datetime.now()
            connection.request(request.method, request.path,
                               request.body, request.headers)
            response = connection.getresponse()
            location = response.getheader('location')
            if request.method == 'HEAD' and \
                    getattr(response, 'chunked', False):
                response.chunked = 0

            # the caller-supplied handler is consulted first
            if callable(retry_handler):
                status = retry_handler(response, counter)
                if status:
                    msg, counter = status
                    if msg:
                        log.debug(msg)
                    time.sleep(wait_time)
                    continue

            if response.status in [500, 502, 503, 504]:
                # the arguments must be a tuple: '%' binds tighter than
                # ',' so the unparenthesised form raised TypeError
                msg = 'HTTP response: %s\nRe-attempting in %3.1f ' \
                      'seconds' % (response.status, wait_time)
                log.debug(msg)
                # response has to be consumed
                response.read()
            elif response.status < 300 or response.status >= 400 or \
                    not location:
                # close the connection if it is set
                # to be closed by the other end
                conn_header_value = response.getheader('connection')
                if conn_header_value == 'close':
                    connection.close()
                return response
        except PleaseRetryException as e:
            log.debug('encountered a retry exception: %s' % e)
            connection = self.new_http_connection(request.host,
                                                  request.port)
            response = e.response
            last_error = e
        except self.http_exceptions as e:
            log.debug('encountered %s exception, reconnecting'
                      % e.__class__.__name__)
            connection = self.new_http_connection(request.host,
                                                  request.port)
            last_error = e

        # back off before the next attempt and count this one
        time.sleep(wait_time)
        counter += 1

    # attempts exhausted
    if last_error is not None:
        raise last_error
    return response
def _predicted_response_time(arrival_rate, service_rate):
    """
    Return (1 / service_rate) / (1 - arrival_rate / service_rate).

    ``float('inf')`` is returned when the expression is undefined (zero
    service rate, or arrival rate equal to service rate) instead of
    raising, because the value is only used for debug output.
    """
    try:
        service_time = math.pow(service_rate, -1)
        return service_time / (1 - service_time * arrival_rate)
    except (ValueError, ZeroDivisionError):
        return float('inf')


def _estimate_requests_per_client(total_request, data_in, stations, clients):
    """
    Split ``total_request`` between clients in proportion to the amount of
    data each client sent (summed over all stations), rounding up.

    Returns a dict <client name: request count>. Every client gets 0 when
    no data was sent at all.
    """
    client_data_in_sum = dict((client, 0) for client in clients)
    data_in_sum = 0
    for station_name in stations:
        # a station without any recorded traffic contributes nothing
        sent_by_client = data_in.get(station_name) or {}
        for client, sent_data in sent_by_client.iteritems():
            client_data_in_sum[client] += sent_data
            data_in_sum += sent_data

    requests_per_client = dict()
    for client in clients:
        if data_in_sum:
            # explicit float so the share is not truncated to 0 by
            # Python 2 integer division
            share = float(client_data_in_sum[client]) / data_in_sum
        else:
            share = 0.0
        requests_per_client[client] = int(math.ceil(total_request * share))
    return requests_per_client


def _average_data_per_request(data_by_station, stations, clients,
                              requests_per_client):
    """
    Build <client name: <station name: average GB per request>> from the
    per-station, per-client byte counts in ``data_by_station``.

    A client with an estimated request count of 0 gets an average of 0.0
    rather than a division by zero.
    """
    averages = dict((client, {}) for client in clients)
    bytes_per_gb = math.pow(1024, 3)
    for station_name in stations:
        amount_by_client = data_by_station.get(station_name) or {}
        for client, amount in amount_by_client.iteritems():
            t_request = requests_per_client[client]
            if t_request:
                averages[client][station_name] = \
                    (amount / bytes_per_gb) / t_request
            else:
                averages[client][station_name] = 0.0
    return averages


def main():
    """
    Run the measure-then-optimise loop forever.

    Each round starts three tasks through ``ThreadingManager`` (latency
    measurement, service-station log processing, ELB access-log
    processing), collects their results, derives per-client request counts
    and average data per request, logs the figures, runs
    ``clients_optimisation`` for every available client and finally sleeps
    for 60 seconds.
    """
    setup_logging()

    # line counter for reading csparql logs of each service station
    line_counters = dict()

    # bucket and ELB info for getting access log from S3
    elb_buckets_dict = dict()
    elb_regions = get_station_region()
    elb_buckets = get_elb_buckets_map()
    for station, region in elb_regions.iteritems():
        elb_region_str = '%s:%s' % (station, region)
        elb_buckets_dict.update(
            {elb_region_str: unicode(elb_buckets[station])})

    stations = get_available_stations()
    for station in stations:
        line_counters.update({station: 0})

    log_base_dir = time.strftime("%Y_%m%d_%H%M")
    total_num_users = cfg.get_int('Default', 'total_num_users')

    # Get all available client region
    available_clients = get_available_clients()

    while True:
        # csparql logs and ELB access logs are processed simultaneously.
        # The csparql processing is started first because ELB access logs
        # are emitted with a delay.
        #
        # S3 only emits logs at fixed minutes of the hour, so the interval
        # measured runs from the start of the measurement to the last
        # expected emission time, not to the moment the log is obtained.
        measurement_interval = calculate_waiting_time()
        # no need to wait until log actually being obtained
        # NOTE(review): 300 is presumably seconds (5 minutes) - confirm
        measurement_interval -= 300

        # Measuring latency between each client region and service station
        latency_manager = ThreadingManager()
        latency_manager.start_task(
            target_func=measure_latency,
            name="latency_manager",
            para=[available_clients, stations, measurement_interval])

        server_log_processor = ThreadingManager()
        server_log_processor.start_task(
            target_func=process_server_logs,
            name="server_log_processor",
            para=[log_base_dir, line_counters, total_num_users,
                  measurement_interval])

        # Begin gathering info of the amount of data
        # transferred through each service station
        data_counting_task = ThreadingManager()
        data_counting_task.start_task(
            target_func=process_elb_access_log,
            name="elb_access_log_processor",
            para=[elb_buckets_dict])

        latency_results_dict = latency_manager.collect_results().get()

        # csparql results are collected first since that processing
        # completes first while the ELB data may be delayed. The queue
        # yields two items in this order: the metrics tuple, then the
        # updated line counters used by the next round.
        server_metrics_queue = server_log_processor.collect_results()
        (station_metric_list, total_request) = server_metrics_queue.get()
        line_counters = server_metrics_queue.get()
        print_message('')
        print_message('Service station logs processing finished\n')

        # collecting elb data now
        elb_data_queue = data_counting_task.collect_results()
        data_in, data_out = elb_data_queue.get()

        # ---- Preparing optimisation parameters ----

        # requests arrival rate and service rate of each service station
        arrival_rates = dict()
        service_rates = dict()
        for station_metric in station_metric_list:
            station_name = station_metric.station_name
            arrival_rate = station_metric.arrival_rate
            service_rate = station_metric.service_rate
            requests = station_metric.total_requests

            requests_msg = '\nTotal requests for station %s: %s' \
                % (station_name, requests)
            print(requests_msg)
            log_info(metric_record_file, requests_msg)

            arrival_rates.update({station_name: arrival_rate})
            service_rates.update({station_name: service_rate})

            response_time = _predicted_response_time(arrival_rate,
                                                     service_rate)
            # one message for both sinks; the logged copy used to read
            # 'currentresponse' because of a missing space
            response_msg = \
                '[Debug] predicted current response time of service ' \
                'station \'%s\': %s' % (station_name, response_time)
            print(response_msg)
            log_info(metric_record_file, response_msg)

        # TODO: calculate total requests for each client
        # TODO: if c-sparql could record the source of each requests
        # TODO: things would be much more easier
        total_request_per_client = _estimate_requests_per_client(
            total_request, data_in, stations, available_clients)

        # Average data sent / received per request, for each service
        # station and each client:
        # <Client_name: <Station: data_in_per_req>>
        avg_data_in_per_reqs = _average_data_per_request(
            data_in, stations, available_clients, total_request_per_client)
        # <Client_name: <Station: data_out_per_req>>
        avg_data_out_per_reqs = _average_data_per_request(
            data_out, stations, available_clients, total_request_per_client)

        # For testing purpose
        info_str = \
            '\n[Debug] total_request: %s\n' \
            '[Debug] avg_data_in_per_reqs: %s\n' \
            '[Debug] avg_data_out_per_reqs: %s\n' \
            '[Debug] arrival_rates: %s\n' \
            '[Debug] service_rates: %s\n' \
            % (total_request, avg_data_in_per_reqs, avg_data_out_per_reqs,
               arrival_rates, service_rates)
        print(info_str)
        log_info(metric_record_file, info_str)

        # TODO: Get elb price from config
        # ELB pricing
        elb_prices = [0.008, 0.008]

        # do optimisation for each client in a new thread
        optimiser = ThreadingManager()
        for client in available_clients:
            optimiser.start_tasks(
                target_func=clients_optimisation,
                name="optimiser",
                para=[avg_data_in_per_reqs, avg_data_out_per_reqs, client,
                      elb_prices, latency_results_dict,
                      measurement_interval, service_rates, stations,
                      total_request_per_client])

        # synchronising threads
        optimiser.collect_results()

        # wait 60 seconds for the Route 53 record changes to take effect
        # (the previous comment said "60 mins", which did not match the
        # sleep below)
        time.sleep(60)
def _make_request(self, request, retry_handler=None):
    """
    Execute the HTTP request, retrying on temporary failures.

    :param request: Request object; it is signed with ``authorize`` and
        its ``method``, ``path``, ``body``, ``headers``, ``host`` and
        ``port`` attributes are used to send it

    :param retry_handler: Optional callable invoked as
        ``retry_handler(response, counter)``. A truthy result must be a
        ``(msg, counter)`` pair: the message is logged, the attempt
        counter is replaced and the request is retried after the
        back-off delay.

    :return: The response, as soon as its status is neither a retriable
        5xx (500, 502, 503, 504) nor a redirect carrying a ``location``
        header.

    NOTE(review): the original body never slept or advanced ``counter``
    after a 5xx response or a connection error, so those paths looped
    forever. They now back off and count as an attempt. Once the attempts
    are used up, the error caught on the last attempt is re-raised, or
    else the last response is returned - confirm this is the outcome
    callers expect.
    """
    response = None
    last_error = None
    num_retries = cfg.get_int('HTTPConnection', 'num_retries',
                              self.num_retries)
    # Read once, and as an integer, so min() below compares numbers.
    # NOTE(review): cfg.get presumably returns the raw option text when
    # the option is set - confirm.
    max_retry_delay = cfg.get_int('HTTPConnection', 'max_retry_delay', 60)
    connection = self.new_http_connection(request.host, request.port)
    counter = 0
    while counter <= num_retries:
        # Use binary exponential back-off to avoid traffic congestion
        wait_time = min(random.random() * (2 ** counter), max_retry_delay)
        last_error = None
        try:
            # sign the request with AWS access key
            request.authorize(connection=self)
            # add host to header except requests made to s3
            if 's3' not in self._target_aws_service():
                self.set_host_header(request)
            request.start_time = datetime.now()
            connection.request(request.method, request.path,
                               request.body, request.headers)
            response = connection.getresponse()
            location = response.getheader('location')
            if request.method == 'HEAD' and \
                    getattr(response, 'chunked', False):
                response.chunked = 0

            # the caller-supplied handler is consulted first
            if callable(retry_handler):
                status = retry_handler(response, counter)
                if status:
                    msg, counter = status
                    if msg:
                        log.debug(msg)
                    time.sleep(wait_time)
                    continue

            if response.status in [500, 502, 503, 504]:
                # the arguments must be a tuple: '%' binds tighter than
                # ',' so the unparenthesised form raised TypeError
                msg = 'HTTP response: %s\nRe-attempting in %3.1f ' \
                      'seconds' % (response.status, wait_time)
                log.debug(msg)
                # response has to be consumed
                response.read()
            elif response.status < 300 or response.status >= 400 or \
                    not location:
                # close the connection if it is set
                # to be closed by the other end
                conn_header_value = response.getheader('connection')
                if conn_header_value == 'close':
                    connection.close()
                return response
        except PleaseRetryException as e:
            log.debug('encountered a retry exception: %s' % e)
            connection = self.new_http_connection(request.host,
                                                  request.port)
            response = e.response
            last_error = e
        except self.http_exceptions as e:
            log.debug('encountered %s exception, reconnecting'
                      % e.__class__.__name__)
            connection = self.new_http_connection(request.host,
                                                  request.port)
            last_error = e

        # back off before the next attempt and count this one
        time.sleep(wait_time)
        counter += 1

    # attempts exhausted
    if last_error is not None:
        raise last_error
    return response
def _whole_seconds(delta):
    """Return a timedelta as whole seconds, ignoring microseconds."""
    return delta.days * 24 * 60 * 60 + delta.seconds


def get_next_nth_elb_log_time(n, last_expected_time):
    """
    Work out when the n-th next ELB access log is expected and how long
    to wait for it at most.

    Logs are assumed to be emitted at multiples of the logging interval
    within each hour (e.g. minute 5, 10, 15 ... for a 5 minute interval).

    :param n: How many logging intervals ahead to look (1 = next log)

    :param last_expected_time: ``datetime`` of the previously expected
        log. When falsy, the result is derived from the current UTC time;
        otherwise it is ``last_expected_time`` plus ``n`` intervals.

    :return: Tuple ``(next_expected_time, max_waiting_time)`` where
        ``max_waiting_time`` is in seconds. It is 0 when the expected time
        is already overdue by more than one logging interval.
    """
    time_format = '%Y-%m-%d-%H-%M'
    current_time = datetime.utcnow()
    print_message('Current UTC Time: %s' % current_time.strftime(time_format))

    # NOTE(review): get_expected_num_logs() falls back to 5 for this same
    # option while 60 is used here - confirm which default is intended.
    logging_interval = cfg.get_int('s3', 'log_emitting_time', 60)
    interval_seconds = logging_interval * 60

    if not last_expected_time:
        # Minutes already elapsed in the current logging interval. The
        # next emission is (interval - elapsed) minutes away, or a full
        # interval away when the current minute is itself an emission
        # minute; n - 1 further intervals follow. Both cases reduce to
        # the expression below, which needs no division and therefore
        # cannot be affected by Python 2 integer division.
        elapsed = current_time.minute % logging_interval
        time_delta = n * logging_interval - elapsed
        # NOTE(review): the seconds/microseconds of current_time are kept,
        # so the result is not aligned to the start of the minute.
        next_expected_time = current_time + timedelta(minutes=time_delta)
    else:
        min_delta = n * logging_interval
        next_expected_time = last_expected_time + timedelta(minutes=min_delta)

    if next_expected_time < current_time:
        over_due_sec = _whole_seconds(current_time - next_expected_time)
        if over_due_sec <= interval_seconds:
            # still within one interval of the expected time: wait for
            # the remainder of that interval
            waiting_td = next_expected_time \
                + timedelta(seconds=interval_seconds) - current_time
            # clamp: a sub-second overshoot would otherwise yield -1
            max_waiting_time = max(0, _whole_seconds(waiting_td))
        else:
            max_waiting_time = 0
            print_message('[Fatal!] Waiting for too long\n'
                          'Current expected logging time %s\n'
                          'Current time: %s' % (next_expected_time,
                                                current_time))
    elif next_expected_time > current_time:
        # time until the expected emission plus one interval of slack
        max_waiting_time = \
            _whole_seconds(next_expected_time - current_time) \
            + interval_seconds
    else:
        max_waiting_time = interval_seconds

    print_message('Next expected time (UTC) %s'
                  % next_expected_time.strftime(time_format))
    print_message('[Debug]: max_waiting_minutes: %s' % (max_waiting_time / 60))
    return next_expected_time, max_waiting_time