Example #1
    def dereplicate_function(self, fname, num_replicas, function_locations):
        if num_replicas < 2:
            return

        while len(function_locations[fname]) > num_replicas:
            ip, tid = random.sample(function_locations[fname], 1)[0]
            send_message(self.context, fname,
                         get_executor_unpin_address(ip, tid))

            function_locations[fname].discard((ip, tid))
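
A minimal usage sketch for the method above, assuming a scheduler object `scheduler` that exposes it; the IP addresses, thread IDs, and function name are hypothetical and only illustrate that `function_locations` maps a function name to a set of (ip, thread id) pairs.

    # Hypothetical replica map: 'my_func' is currently pinned on three threads.
    function_locations = {
        'my_func': {('10.0.0.1', 0), ('10.0.0.2', 1), ('10.0.0.3', 2)}
    }

    # Scale 'my_func' down to at most two replicas; the method keeps sending
    # unpin messages to randomly chosen locations until the target is reached.
    scheduler.dereplicate_function('my_func', 2, function_locations)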
Example #2
    def replicate_function(self, fname, num_replicas, function_locations,
                           cpu_executors, gpu_executors):

        existing_replicas = function_locations[fname]

        msg = PinFunction()
        msg.name = fname
        msg.response_address = self.ip

        # TODO: Add proper support for autoscaling GPU instances and for
        # checking whether batching is enabled.
        if 'gpu' in fname:
            candidate_nodes = gpu_executors.difference(existing_replicas)

            for key in function_locations:
                if 'gpu' in key:
                    for location in function_locations[key]:
                        candidate_nodes.discard(location)
        else:
            candidate_nodes = cpu_executors.difference(existing_replicas)

        for _ in range(num_replicas):
            if len(candidate_nodes) == 0:
                continue

            ip, tid = random.sample(candidate_nodes, 1)[0]
            send_message(self.context, msg.SerializeToString(),
                         get_executor_pin_address(ip, tid))

            response = GenericResponse()
            try:
                response.ParseFromString(self.pin_accept_socket.recv())
            except zmq.ZMQError:
                logging.error('Pin operation to %s:%d timed out for %s.' %
                              (ip, tid, fname))
                candidate_nodes.remove((ip, tid))
                continue

            if response.success:
                logging.info('Pin operation to %s:%d for %s successful.' %
                             (ip, tid, fname))
                function_locations[fname].add((ip, tid))
            else:
                # The pin operation was rejected, remove node and try again.
                logging.error('Node %s:%d rejected pin for %s.' %
                              (ip, tid, fname))
            candidate_nodes.remove((ip, tid))
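
The timeout handling above implies that `self.pin_accept_socket` was created with a receive deadline; a minimal sketch of such a socket, assuming pyzmq and a hypothetical port number, is:

    import zmq

    PIN_ACCEPT_PORT = 5010  # hypothetical port for pin responses

    context = zmq.Context(1)
    pin_accept_socket = context.socket(zmq.PULL)
    # With RCVTIMEO set, recv() raises a ZMQError subclass instead of blocking
    # forever when an executor never answers the pin request.
    pin_accept_socket.setsockopt(zmq.RCVTIMEO, 500)  # milliseconds
    pin_accept_socket.bind('tcp://*:%d' % PIN_ACCEPT_PORT)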
Example #3
    def replicate_function(self, fname, num_replicas, function_locations,
                           executors):
        if num_replicas < 0:
            return

        for _ in range(num_replicas):
            existing_replicas = function_locations[fname]
            candidate_nodes = executors.difference(existing_replicas)

            if len(candidate_nodes) == 0:
                continue

            ip, tid = random.sample(candidate_nodes, 1)[0]
            msg = '127.0.0.1:' + fname
            send_message(self.context, msg, get_executor_pin_address(ip, tid))

            function_locations[fname].add((ip, tid))
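
A brief, hypothetical call for this fire-and-forget variant; `scheduler` and the addresses below are illustrative, and unlike the PinFunction-based variants in this section, this version records the replica without waiting for a response.

    executors = {('10.0.0.1', 0), ('10.0.0.1', 1), ('10.0.0.2', 0)}
    function_locations = {'my_func': {('10.0.0.1', 0)}}

    # Ask for two more replicas of 'my_func'; every pin request is assumed
    # to succeed, so function_locations is updated immediately.
    scheduler.replicate_function('my_func', 2, function_locations, executors)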
Example #4
    def replicate_function(self, fname, num_replicas, function_locations,
                           executors):
        if num_replicas < 0:
            return

        existing_replicas = function_locations[fname]
        candidate_nodes = executors.difference(existing_replicas)

        for _ in range(num_replicas):
            if len(candidate_nodes) == 0:
                continue

            ip, tid = random.sample(candidate_nodes, 1)[0]

            msg = PinFunction()
            msg.name = fname
            msg.response_address = self.ip

            send_message(self.context, msg.SerializeToString(),
                         get_executor_pin_address(ip, tid))

            response = GenericResponse()
            try:
                response.ParseFromString(self.pin_accept_socket.recv())
            except zmq.ZMQError:
                logging.error('Pin operation to %s:%d timed out for %s.' %
                              (ip, tid, fname))
                candidate_nodes.remove((ip, tid))
                continue

            if response.success:
                logging.info('Pin operation to %s:%d for %s successful.' %
                             (ip, tid, fname))
                function_locations[fname].add((ip, tid))
            else:
                # The pin operation was rejected, remove node and try again.
                logging.error('Node %s:%d rejected pin for %s.' %
                              (ip, tid, fname))
            candidate_nodes.remove((ip, tid))
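
For context, a sketch of the executor-side half of this pin handshake; it assumes the same PinFunction and GenericResponse messages, a pyzmq context, and a hypothetical response port, and is illustrative only rather than the project's actual executor code.

    def handle_pin_request(context, pin_socket, pinned_functions, busy):
        # Parse the scheduler's pin request.
        request = PinFunction()
        request.ParseFromString(pin_socket.recv())

        # Accept the pin only if this thread is not already occupied.
        response = GenericResponse()
        response.success = not busy
        if response.success:
            pinned_functions.add(request.name)

        # Reply to the address the scheduler advertised in response_address;
        # the port number here is an assumption for illustration.
        reply_socket = context.socket(zmq.PUSH)
        reply_socket.connect('tcp://%s:5010' % request.response_address)
        reply_socket.send(response.SerializeToString())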
Example #5
    def executor_policy(self, executor_statuses, departing_executors):
        # If no executors have joined yet, we don't need to calculate anything.
        if len(executor_statuses) == 0:
            return

        # We institute a grace period (2 minutes by default) during which no
        # elasticity decisions are made. We start the grace period when we
        # decide to add or remove a VM and wait until after it's over to make
        # sure we don't put the system into hysteresis.
        if time.time() < (self.grace_start + self.grace_period):
            return

        utilization_sum = 0.0
        pinned_function_count = 0
        for status in executor_statuses.values():
            utilization_sum += status.utilization
            pinned_function_count += len(status.functions)

        avg_utilization = utilization_sum / len(executor_statuses)
        avg_pinned_count = pinned_function_count / len(executor_statuses)
        num_nodes = len(executor_statuses) / NUM_EXEC_THREADS

        logging.info(
            ('There are currently %d executor nodes active in the' +
             ' system (%d threads).') % (num_nodes, len(executor_statuses)))
        logging.info('Average executor utilization: %.4f' % (avg_utilization))
        logging.info('Average pinned function count: %.2f' %
                     (avg_pinned_count))

        # We check to see if the average utilization or number of pinned
        # functions exceeds the policy's thresholds and add machines to the
        # system in both cases.
        if (avg_utilization > self.max_utilization
                or avg_pinned_count > self.max_pin_count):
            logging.info(
                ('Average utilization is %.4f. Adding %d nodes to' +
                 ' the cluster.') % (avg_utilization, self.scale_increase))

            if (len(executor_statuses) / NUM_EXEC_THREADS) < 5:
                self.scaler.add_vms('function', self.scale_increase)

            # Start the grace period after adding nodes.
            self.grace_start = time.time()

        # We also look at any individual nodes that might be overloaded. Since
        # we currently only pin one function per node, that means that function
        # is very expensive, so we proactively replicate it onto two other
        # threads.
        for status in executor_statuses.values():
            if status.utilization > .9:
                logging.info(
                    ('Node %s:%d has over 90%% utilization.' +
                     ' Replicating its functions.') % (status.ip, status.tid))

                executors = set(executor_statuses.keys())
                for fname in status.functions:
                    self.scaler.replicate_function(fname, 2,
                                                   self.function_locations,
                                                   executors)

        # We only decide to kill nodes if they are underutilized and if there
        # are at least 5 executors in the system -- we never scale down past
        # that.
        if avg_utilization < self.min_utilization and num_nodes > 5:
            ip = random.choice(list(executor_statuses.values())).ip
            logging.info(('Average utilization is %.4f, and there are %d ' +
                          'executors. Removing IP %s.') %
                         (avg_utilization, len(executor_statuses), ip))

            for tid in range(NUM_EXEC_THREADS):
                send_message(self.scaler.context, '',
                             get_executor_depart_address(ip, tid))

                if (ip, tid) in executor_statuses:
                    del executor_statuses[(ip, tid)]

            departing_executors[ip] = NUM_EXEC_THREADS
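
A sketch of how this policy might be configured and driven; the attribute names mirror the ones the method reads, but every numeric value, as well as `policy`, `executor_statuses`, `departing_executors`, and `REPORT_PERIOD`, is an assumption for illustration.

    import time

    # Illustrative thresholds -- not the project's defaults.
    policy.max_utilization = 0.70   # add VMs above 70% average utilization
    policy.min_utilization = 0.10   # consider removing a VM below 10%
    policy.max_pin_count = 0.80     # add VMs if most threads hold a pin
    policy.scale_increase = 2       # VMs requested per scale-up decision
    policy.grace_period = 120       # seconds between elasticity decisions
    policy.grace_start = time.time()

    # The policy is expected to run periodically with fresh status reports,
    # e.g. from the monitoring loop.
    while True:
        policy.executor_policy(executor_statuses, departing_executors)
        time.sleep(REPORT_PERIOD)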
Example #6
def check_hash_ring(client, context):
    route_ips = util.get_pod_ips(client, 'role=routing')

    # If there are no routing nodes in the system currently, the system is
    # still starting, so we do nothing.
    if not route_ips:
        return

    ip = random.choice(route_ips)

    # Retrieve a list of all current members of the cluster.
    socket = context.socket(zmq.REQ)
    socket.connect(get_routing_seed_address(ip, 0))
    socket.send_string('')
    resp = socket.recv()

    cluster = ClusterMembership()
    cluster.ParseFromString(resp)
    tiers = cluster.tiers

    # If there are no tiers, then we don't need to evaluate anything.
    if len(tiers) == 0:
        return
    elif len(tiers) == 1:
        # If there is one tier, it will be the memory tier.
        mem_tier, ebs_tier = tiers[0], None
    else:
        # If there are two tiers, we need to make sure that we assign the
        # correct tiers as the memory and EBS tiers, respectively.
        if tiers[0].tier_id == MEMORY:
            mem_tier = tiers[0]
            ebs_tier = tiers[1]
        else:
            mem_tier = tiers[1]
            ebs_tier = tiers[0]

    # Queries the Kubernetes master for the list of memory nodes it's aware of
    # -- if any of the nodes in the hash ring aren't currently running, we add
    # them to the departed list.
    mem_ips = util.get_pod_ips(client, 'role=memory')
    departed = []
    for node in mem_tier.servers:
        if node.private_ip not in mem_ips:
            departed.append(('0', node))

    # Performs the same process for the EBS tier if it exists.
    ebs_ips = []
    if ebs_tier:
        ebs_ips = util.get_pod_ips(client, 'role=ebs')
        for node in ebs_tier.servers:
            if node.private_ip not in ebs_ips:
                departed.append(('1', node))

    logging.debug('Found %d departed nodes.' % (len(departed)))
    mon_ips = util.get_pod_ips(client, 'role=monitoring')
    storage_ips = mem_ips + ebs_ips

    # For each departed node the cluster is unaware of, we inform all storage
    # nodes, all monitoring nodes, and all routing nodes that it has departed.
    for pair in departed:
        logging.info('Informing cluster that node %s/%s has departed.' %
                     (pair[1].public_ip, pair[1].private_ip))

        msg = pair[0] + ':' + pair[1].public_ip + ':' + pair[1].private_ip

        # NOTE: In this code, we are presuming there are 4 threads per
        # storage/routing node. If there are more, this will be buggy; if there
        # are fewer, this is fine as the messages will go into the void.
        for ip in storage_ips:
            for t in range(4):
                send_message(context, msg,
                             get_storage_depart_address(ip, t))

        msg = 'depart:' + msg
        for ip in route_ips:
            for t in range(4):
                send_message(context, msg,
                             get_routing_depart_address(ip, t))

        for ip in mon_ips:
            send_message(context, msg, get_monitoring_depart_address(ip))
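
A minimal driver sketch for this check, assuming the official kubernetes Python client and pyzmq; the in-cluster configuration and the 15-second interval are assumptions for illustration.

    import time

    import zmq
    from kubernetes import client as k8s_client, config as k8s_config

    # Load in-cluster credentials (the monitor is assumed to run inside the
    # cluster) and build the API client that util.get_pod_ips is assumed to
    # accept.
    k8s_config.load_incluster_config()
    client = k8s_client.CoreV1Api()

    context = zmq.Context(1)

    while True:
        check_hash_ring(client, context)
        time.sleep(15)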