import logging
import random
import time

import zmq

# NOTE: send_message, the get_*_address helpers, the protobuf messages
# (PinFunction, GenericResponse, ClusterMembership), the MEMORY tier ID,
# NUM_EXEC_THREADS, and util are assumed to be provided by the surrounding
# project; they are referenced but not defined in this excerpt.


def dereplicate_function(self, fname, num_replicas, function_locations):
    # Never dereplicate a function below two replicas.
    if num_replicas < 2:
        return

    # Randomly pick pinned replicas to unpin until we reach the target
    # count. (random.sample requires a sequence on Python 3.11+, hence the
    # list() conversion here and below.)
    while len(function_locations[fname]) > num_replicas:
        ip, tid = random.sample(list(function_locations[fname]), 1)[0]
        send_message(self.context, fname,
                     get_executor_unpin_address(ip, tid))

        function_locations[fname].discard((ip, tid))
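# A minimal sketch (not part of the original module) of the bookkeeping that
# dereplicate_function and replicate_function operate on: function_locations
# maps each function name to the set of (ip, thread-id) pairs the function
# is pinned on. All names and addresses below are hypothetical.
def example_location_bookkeeping():
    function_locations = {'my_func': {('10.0.0.1', 0), ('10.0.0.2', 3)}}

    # Dropping a replica is just discarding one (ip, tid) pair after the
    # executor has been told to unpin the function.
    function_locations['my_func'].discard(('10.0.0.1', 0))
    assert function_locations['my_func'] == {('10.0.0.2', 3)}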
def replicate_function(self, fname, num_replicas, function_locations,
                       cpu_executors, gpu_executors):
    existing_replicas = function_locations[fname]

    msg = PinFunction()
    msg.name = fname
    msg.response_address = self.ip

    # TODO: Add proper support for autoscaling GPU instances and for
    # checking whether batching is enabled.
    if 'gpu' in fname:
        candidate_nodes = gpu_executors.difference(existing_replicas)

        # GPU executors are not shared: exclude any node that already hosts
        # some other GPU function.
        for key in function_locations:
            if 'gpu' in key:
                for location in function_locations[key]:
                    candidate_nodes.discard(location)
    else:
        candidate_nodes = cpu_executors.difference(existing_replicas)

    for _ in range(num_replicas):
        if len(candidate_nodes) == 0:
            continue

        ip, tid = random.sample(list(candidate_nodes), 1)[0]
        send_message(self.context, msg.SerializeToString(),
                     get_executor_pin_address(ip, tid))

        response = GenericResponse()
        try:
            response.ParseFromString(self.pin_accept_socket.recv())
        except zmq.ZMQError:
            logging.error('Pin operation to %s:%d timed out for %s.' %
                          (ip, tid, fname))
            candidate_nodes.remove((ip, tid))
            continue

        if response.success:
            logging.info('Pin operation to %s:%d for %s successful.' %
                         (ip, tid, fname))
            function_locations[fname].add((ip, tid))
        else:
            # The pin operation was rejected; remove the node and try again.
            logging.error('Node %s:%d rejected pin for %s.' %
                          (ip, tid, fname))
            candidate_nodes.remove((ip, tid))
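# A sketch (under assumed names and values) of how the pin_accept_socket
# used above might be configured. replicate_function relies on recv()
# raising zmq.ZMQError when no executor responds, which implies a receive
# timeout like the one set here; the socket type, port, and timeout are
# assumptions, not taken from the source.
def make_pin_accept_socket(context, port=9999, timeout_ms=10000):
    sock = context.socket(zmq.PULL)
    sock.setsockopt(zmq.RCVTIMEO, timeout_ms)  # recv() raises on timeout
    sock.bind('tcp://*:%d' % port)
    return sock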
def replicate_function(self, fname, num_replicas, function_locations,
                       executors):
    if num_replicas < 0:
        return

    for _ in range(num_replicas):
        existing_replicas = function_locations[fname]
        candidate_nodes = executors.difference(existing_replicas)

        if len(candidate_nodes) == 0:
            continue

        ip, tid = random.sample(list(candidate_nodes), 1)[0]

        # This variant sends a bare string pin request with a hard-coded
        # local response address rather than a PinFunction message, and it
        # does not wait for a response.
        msg = '127.0.0.1:' + fname
        send_message(self.context, msg, get_executor_pin_address(ip, tid))
        function_locations[fname].add((ip, tid))
def replicate_function(self, fname, num_replicas, function_locations,
                       executors):
    if num_replicas < 0:
        return

    existing_replicas = function_locations[fname]
    candidate_nodes = executors.difference(existing_replicas)

    for _ in range(num_replicas):
        if len(candidate_nodes) == 0:
            continue

        ip, tid = random.sample(list(candidate_nodes), 1)[0]

        msg = PinFunction()
        msg.name = fname
        msg.response_address = self.ip

        send_message(self.context, msg.SerializeToString(),
                     get_executor_pin_address(ip, tid))

        response = GenericResponse()
        try:
            response.ParseFromString(self.pin_accept_socket.recv())
        except zmq.ZMQError:
            logging.error('Pin operation to %s:%d timed out for %s.' %
                          (ip, tid, fname))
            candidate_nodes.remove((ip, tid))
            continue

        if response.success:
            logging.info('Pin operation to %s:%d for %s successful.' %
                         (ip, tid, fname))
            function_locations[fname].add((ip, tid))
        else:
            # The pin operation was rejected; remove the node and try again.
            logging.error('Node %s:%d rejected pin for %s.' %
                          (ip, tid, fname))
            candidate_nodes.remove((ip, tid))
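# A self-contained sketch of the candidate-selection set arithmetic shared
# by the replicate_function variants above (hypothetical addresses; list()
# is needed because random.sample no longer accepts sets on Python 3.11+).
def pick_candidate(executors, existing_replicas):
    candidate_nodes = executors.difference(existing_replicas)
    if not candidate_nodes:
        return None
    return random.sample(list(candidate_nodes), 1)[0]

executors = {('10.0.0.1', 0), ('10.0.0.1', 1), ('10.0.0.2', 0)}
existing = {('10.0.0.2', 0)}
assert pick_candidate(executors, existing) in executors - existing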
def executor_policy(self, executor_statuses, departing_executors):
    # If no executors have joined yet, we don't need to calculate anything.
    if len(executor_statuses) == 0:
        return

    # We institute a grace period (2 minutes by default) during which no
    # elasticity decisions are made. We start the grace period when we
    # decide to add or remove a VM and wait until after it's over to make
    # sure we don't put the system into hysteresis.
    if time.time() < (self.grace_start + self.grace_period):
        return

    utilization_sum = 0.0
    pinned_function_count = 0
    for status in executor_statuses.values():
        utilization_sum += status.utilization
        pinned_function_count += len(status.functions)

    avg_utilization = utilization_sum / len(executor_statuses)
    avg_pinned_count = pinned_function_count / len(executor_statuses)
    num_nodes = len(executor_statuses) // NUM_EXEC_THREADS

    logging.info(('There are currently %d executor nodes active in the '
                  'system (%d threads).') %
                 (num_nodes, len(executor_statuses)))
    logging.info('Average executor utilization: %.4f' % avg_utilization)
    logging.info('Average pinned function count: %.2f' % avg_pinned_count)

    # We check to see if the average utilization or the number of pinned
    # functions exceeds the policy's thresholds and add machines to the
    # system in both cases.
    if (avg_utilization > self.max_utilization
            or avg_pinned_count > self.max_pin_count):
        if num_nodes < 5:
            logging.info(('Average utilization is %.4f. Adding %d nodes to'
                          ' cluster.') %
                         (avg_utilization, self.scale_increase))
            self.scaler.add_vms('function', self.scale_increase)

        # Start the grace period after deciding to add nodes.
        self.grace_start = time.time()

    # We also look at any individual nodes that might be overloaded. Since
    # we currently only pin one function per node, an overloaded node means
    # that function is very expensive, so we proactively replicate it onto
    # two other threads.
    for status in executor_statuses.values():
        if status.utilization > 0.9:
            logging.info(('Node %s:%d has over 90%% utilization.'
                          ' Replicating its functions.') %
                         (status.ip, status.tid))

            executors = set(executor_statuses.keys())
            for fname in status.functions:
                self.scaler.replicate_function(fname, 2,
                                               self.function_locations,
                                               executors)

    # We only decide to kill nodes if they are underutilized and if there
    # are more than 5 executor nodes in the system -- we never scale down
    # past that.
    if avg_utilization < self.min_utilization and num_nodes > 5:
        ip = random.choice(list(executor_statuses.values())).ip
        logging.info(('Average utilization is %.4f, and there are %d '
                      'executors. Removing IP %s.') %
                     (avg_utilization, len(executor_statuses), ip))

        for tid in range(NUM_EXEC_THREADS):
            send_message(self.scaler.context, '',
                         get_executor_depart_address(ip, tid))
            if (ip, tid) in executor_statuses:
                del executor_statuses[(ip, tid)]

        departing_executors[ip] = NUM_EXEC_THREADS
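# A self-contained sketch of the averaging that drives executor_policy, with
# a namedtuple standing in for the real executor status objects (which are
# protobuf messages in the original system; this stub is illustrative only).
from collections import namedtuple

FakeStatus = namedtuple('FakeStatus', ['utilization', 'functions'])

def policy_averages(executor_statuses):
    n = len(executor_statuses)
    avg_util = sum(s.utilization for s in executor_statuses.values()) / n
    avg_pins = sum(len(s.functions) for s in executor_statuses.values()) / n
    return avg_util, avg_pins

statuses = {('10.0.0.1', 0): FakeStatus(0.85, ['hot_func']),
            ('10.0.0.1', 1): FakeStatus(0.15, [])}
assert policy_averages(statuses) == (0.5, 0.5)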
def check_hash_ring(client, context):
    route_ips = util.get_pod_ips(client, 'role=routing')

    # If there are no routing nodes in the system currently, the system is
    # still starting, so we do nothing.
    if not route_ips:
        return

    ip = random.choice(route_ips)

    # Retrieve a list of all current members of the cluster.
    socket = context.socket(zmq.REQ)
    socket.connect(get_routing_seed_address(ip, 0))
    socket.send_string('')
    resp = socket.recv()

    cluster = ClusterMembership()
    cluster.ParseFromString(resp)
    tiers = cluster.tiers

    # If there are no tiers, then we don't need to evaluate anything.
    if len(tiers) == 0:
        return
    elif len(tiers) == 1:
        # If there is one tier, it will be the memory tier.
        mem_tier, ebs_tier = tiers[0], None
    else:
        # If there are two tiers, we need to make sure that we assign the
        # correct tiers as the memory and EBS tiers, respectively.
        if tiers[0].tier_id == MEMORY:
            mem_tier = tiers[0]
            ebs_tier = tiers[1]
        else:
            mem_tier = tiers[1]
            ebs_tier = tiers[0]

    # Query the Kubernetes master for the list of memory nodes it is aware
    # of -- if any of the nodes in the hash ring aren't currently running,
    # we add them to the departed list.
    mem_ips = util.get_pod_ips(client, 'role=memory')

    departed = []
    for node in mem_tier.servers:
        if node.private_ip not in mem_ips:
            departed.append(('0', node))

    # Perform the same process for the EBS tier if it exists.
    ebs_ips = []
    if ebs_tier:
        ebs_ips = util.get_pod_ips(client, 'role=ebs')
        for node in ebs_tier.servers:
            if node.private_ip not in ebs_ips:
                departed.append(('1', node))

    logging.debug('Found %d departed nodes.' % len(departed))

    mon_ips = util.get_pod_ips(client, 'role=monitoring')
    storage_ips = mem_ips + ebs_ips

    # For each departed node the cluster is unaware of, we inform all
    # storage nodes, all monitoring nodes, and all routing nodes that it has
    # departed.
    for pair in departed:
        logging.info('Informing cluster that node %s/%s has departed.' %
                     (pair[1].public_ip, pair[1].private_ip))

        msg = pair[0] + ':' + pair[1].public_ip + ':' + pair[1].private_ip

        # NOTE: In this code, we presume there are 4 threads per
        # storage/routing node. If there are more, this will be buggy; if
        # there are fewer, this is fine, as the messages will go into the
        # void.
        for ip in storage_ips:
            for t in range(4):
                send_message(context, msg,
                             get_storage_depart_address(ip, t))

        msg = 'depart:' + msg
        for ip in route_ips:
            for t in range(4):
                send_message(context, msg,
                             get_routing_depart_address(ip, t))

        for ip in mon_ips:
            send_message(context, msg, get_monitoring_depart_address(ip))
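# A sketch of the plain-string depart messages constructed above, with a
# hypothetical tier ID and addresses. Storage nodes receive
# 'tier:public_ip:private_ip'; routing and monitoring nodes receive the same
# string with a 'depart:' prefix.
def depart_messages(tier_id, public_ip, private_ip):
    storage_msg = tier_id + ':' + public_ip + ':' + private_ip
    routing_msg = 'depart:' + storage_msg
    return storage_msg, routing_msg

assert depart_messages('0', '3.3.3.3', '10.0.0.5') == \
    ('0:3.3.3.3:10.0.0.5', 'depart:0:3.3.3.3:10.0.0.5')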