class Evaluator:
    """Computes deployment metrics over a time window.

    Produces (average_latency, inter_datacenter_traffic) from read logs and
    file-moving logs collected by the Aggregator.
    """

    def __init__(self, start_time, end_time):
        self.set_time(start_time, end_time)
        self.aggregator = Aggregator()

    def set_time(self, start_time, end_time):
        """Set the [start_time, end_time] window over which logs are evaluated."""
        self.start_time = start_time
        self.end_time = end_time

    # read_logs and moving_logs are normally omitted (fetched from the
    # aggregator); they are injectable for testing.
    def evaluate(self, read_logs=None, moving_logs=None):
        """Return (average_latency, inter_datacenter_traffic) for the window.

        average_latency is the mean estimated latency over all read requests
        (0.0 when there are no read requests — previously this raised
        ZeroDivisionError). inter_datacenter_traffic counts moving-log
        entries, treating all files as uniform size.
        """
        # collect server logs when not injected by the caller
        if read_logs is None:
            read_logs = self.aggregator.get_read_log_entries(
                self.start_time, self.end_time)
        if moving_logs is None:
            moving_logs = self.aggregator.get_moving_log_entries(
                self.start_time, self.end_time)

        # divisor converting geographic distance to a latency estimate
        distance_to_latency_unit = 1000.0

        latency_sum = 0.0
        request_count = 0
        # only build the IP-location cache when there is something to look up
        ip_cache = ip_location_cache() if read_logs else None
        for log in read_logs:
            (timestamp, uuid, source, source_uuid, dest, req_type, status,
             response_size) = log
            client_loc = ip_cache.get_lat_lon_from_ip(source)
            server_loc = ip_cache.get_lat_lon_from_ip(dest)
            distance = util.get_distance(client_loc, server_loc)
            latency = distance / distance_to_latency_unit
            request_importance = 1  # hook for weighting requests; uniform for now
            latency_sum += latency * request_importance
            request_count += request_importance
        # guard: no read requests in the window used to raise ZeroDivisionError
        average_latency = latency_sum / request_count if request_count else 0.0

        # treat all files as uniform size: traffic == number of move events
        inter_datacenter_traffic = sum(1 for _ in moving_logs)

        return average_latency, inter_datacenter_traffic
class GreedyReplication:
    """Greedy replica-placement policy.

    Periodically reads access logs, estimates per-content demand, and adds
    replicas where the current placement cannot absorb a small increase in
    load. Currently replicas are only added, never removed.
    """

    def __init__(self):
        self.aggregator = Aggregator()  # to retrieve server logs
        self.client_set = set()         # {client_ip, ...}
        self.server_set = set(util.retrieve_server_list())  # {server_ip, ...}
        self.content_set = set()        # {uuid, ...}
        self.access_map = {}            # {uuid: {client_ip: num_request}}
        self.replica_map = {}           # {uuid: {server_ip: num_replica}}
        self.last_timestamp = 0         # timestamp of the last update
        self.requests_per_replica = 3   # capacity one replica can serve
        self.uuid_to_server = None
        # self.sample_interval = 1000   # time between rounds, in seconds

    # update client_set, server_set, content_set, access_map and replication
    # status; call this before running the greedy algorithm
    def update(self):
        """Refresh content_set/replica_map from server state and access_map
        from read logs generated since the last update."""
        # clear per-round data (client_set accumulates across rounds)
        self.content_set = set()
        self.access_map = {}
        self.replica_map = {}

        # rebuild content_set and replica_map from what each server holds
        for server in self.server_set:
            for file_uuid in util.get_file_list_on_server(server):
                self.content_set.add(file_uuid)
                server_counts = self.replica_map.setdefault(file_uuid, {})
                server_counts[server] = server_counts.get(server, 0) + 1

        current_timestamp = int(time.time())
        logs = self.aggregator.get_read_log_entries(self.last_timestamp,
                                                    current_timestamp)
        # use recently generated logs to update the access map
        for log in logs:
            (timestamp, uuid, source, source_uuid, dest, req_type, status,
             response_size) = log
            if uuid not in self.content_set:
                # ignore requests for content no server currently holds
                continue
            if uuid not in self.access_map:
                self.access_map[uuid] = {}
            self.client_set.add(source)
            if req_type == 'READ':
                self.access_map[uuid][source] = \
                    self.access_map[uuid].get(source, 0) + 1
        self.last_timestamp = current_timestamp

    def run_replication(self):
        """One round: refresh state, then add replicas if the current
        placement cannot absorb a small increase in load."""
        self.update()
        # NOTE(review): under Python 2 integer division this delta was 0,
        # making the probe a no-op; float division keeps it a small positive
        # amount on both Python 2 and 3.
        request_delta = self.requests_per_replica / 10.0
        replica_delta = 1
        if not self.enough_replica_on_increase(request_delta):
            self.add_replica(request_delta, replica_delta)
        # currently we don't remove any replica
        # else:
        #     remove_replica()

    # test whether current replicas can handle more requests
    #
    # delta: the amount of additional requests probed each time
    def enough_replica_on_increase(self, delta):
        """Return False if bumping any single (content, client) demand by
        delta would exceed current replica capacity."""
        for c in self.content_set:
            if c not in self.access_map:
                continue
            for a in self.access_map[c]:
                # temporarily add a small amount of requests for content c
                # from client a
                self.access_map[c][a] += delta
                is_enough = self.enough_replica()
                # backtrack the probe
                self.access_map[c][a] -= delta
                if not is_enough:
                    return False
        return True

    def add_replica(self, request_delta, replica_delta):
        """Greedily place one replica helping the most starved clients; if
        no single placement helps, replicate starved content everywhere."""
        # (client, content) pairs that starve under +request_delta
        starved = []
        for c in self.content_set:
            if c not in self.access_map:
                continue
            for a in self.access_map[c]:
                # probe a small demand increase for content c from client a
                self.access_map[c][a] += request_delta
                if not self.enough_replica():
                    starved.append((a, c))
                # backtrack the probe
                self.access_map[c][a] -= request_delta

        max_satisfied_num = 0
        best_c = None
        best_s = None
        # find the server s to replicate content c so that the maximum number
        # of starved clients can be satisfied
        for a, c in starved:
            for s in self.server_set:
                satisfied_num = 0
                self.access_map[c][a] += request_delta
                if s not in self.replica_map[c]:
                    self.replica_map[c][s] = 0
                self.replica_map[c][s] += replica_delta
                if self.enough_replica():
                    satisfied_num += 1
                # backtrack both probes
                self.access_map[c][a] -= request_delta
                self.replica_map[c][s] -= replica_delta
                if self.replica_map[c][s] == 0:
                    self.replica_map[c].pop(s)
                if satisfied_num > max_satisfied_num:
                    max_satisfied_num = satisfied_num
                    best_c = c
                    best_s = s

        if max_satisfied_num > 0:
            # any server already holding a replica can serve as the source
            source = next(iter(self.replica_map[best_c]))
            if source == best_s:
                # can't hold more than 1 replica on the same server;
                # replicate to a random other server instead
                best_s = random.sample(self.server_set - set([source]), 1)[0]
            self.replicate(best_c, source, best_s)
        else:
            # no single placement helps: replicate everything that starves
            print('replicate to all servers')
            for content in self.content_set:
                if self.enough_replica_for_content(content):
                    continue
                if content not in self.replica_map:
                    continue
                # select the first (any) server holding a replica as source
                source = next(iter(self.replica_map[content]))
                for server in self.server_set:
                    print('replicate ' + 'content: ' + content + ' from: ' +
                          source + ' to ' + server)
                    util.replicate(content, source, server)

    def enough_replica(self):
        # this is an approximate implementation; may need to construct a
        # bipartite graph and run a min-matching algorithm
        return all(self.enough_replica_for_content(c)
                   for c in self.content_set)

    def enough_replica_for_content(self, c):
        """Return True if the replicas of content c can absorb the demand in
        access_map, approximating that each client is served entirely by its
        nearest server."""
        if c not in self.access_map:
            # no client accesses c
            return True
        server_to_request_sum = {}
        for a in self.access_map[c]:
            nearest_server = util.find_closest_servers_with_ip(
                a, self.server_set)[0]['server']
            server_to_request_sum[nearest_server] = \
                server_to_request_sum.get(nearest_server, 0) + \
                self.access_map[c][a]
        for server, request_sum in server_to_request_sum.items():
            if (c not in self.replica_map) or \
                    (server not in self.replica_map[c]):
                return False
            if self.replica_map[c][server] * self.requests_per_replica \
                    < request_sum:
                return False
        return True

    def replicate(self, content, source, dest):
        """Replicate content from source to dest; a same-server request just
        bumps the local replica count instead of copying."""
        # fix: original used a comma instead of %, printing a tuple rather
        # than the formatted message
        print('Greedy: replicate file %s from %s to %s'
              % (content, source, dest))
        if source == dest:
            # fix: original did `self.replica_map[content] = 0`, clobbering
            # the whole per-server dict and then raising TypeError on `+=`
            if dest not in self.replica_map[content]:
                self.replica_map[content][dest] = 0
            self.replica_map[content][dest] += 1
        else:
            util.replicate(content, source, dest)