Exemplo n.º 1
0
    def test_succesful_pin(self):
        '''
        This test executes a pin operation that is supposed to be successful,
        and it checks to make sure that the correct metadata for execution
        and reporting is generated.
        '''
        # Create a new function in the KVS.
        fname = 'incr'

        def func(_, x):
            return x + 1

        create_function(func, self.kvs_client, fname)

        # Create a pin message and put it into the socket.
        msg = PinFunction(name=fname, response_address=self.ip)
        self.socket.inbox.append(msg.SerializeToString())

        # Execute the pin operation (the two trailing flags are passed as
        # False).
        pin(self.socket, self.pusher_cache, self.kvs_client, self.status,
            self.pinned_functions, self.runtimes, self.exec_counts,
            self.user_library, False, False)

        # Check that exactly one response was sent and that it reports
        # success.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 1)
        response = GenericResponse()
        response.ParseFromString(self.pusher_cache.socket.outbox[0])
        self.assertTrue(response.success)

        # Check that the metadata was created. Membership is asserted before
        # the cached function is looked up, so that a missing entry shows up
        # as an assertion failure rather than a KeyError.
        self.assertIn(fname, self.pinned_functions)
        self.assertIn(fname, self.runtimes)
        self.assertIn(fname, self.exec_counts)
        self.assertIn(fname, self.status.functions)

        # The pinned function must behave like the one we registered.
        self.assertEqual(func('', 1), self.pinned_functions[fname]('', 1))
Exemplo n.º 2
0
    def pin_function(self, dag_name, function_ref):
        '''
        Attempts to pin `function_ref` onto one of the currently unpinned
        executors and records the placement for `dag_name` in
        `self.pending_dags`.

        Returns True as soon as an executor accepts the pin request. Returns
        False if there are no unpinned executors to choose from, or if every
        candidate was tried and removed without any of them accepting.
        '''
        # If there are no executors left to choose from, then we return
        # False, indicating that we ran out of resources to use.
        if len(self.unpinned_executors) == 0:
            return False

        if dag_name not in self.pending_dags:
            self.pending_dags[dag_name] = []

        # Make a copy of the set of executors, so that iterating over the
        # candidates is independent of the system's metadata.
        candidates = set(self.unpinned_executors)

        # Construct a PinFunction message to be sent to executors.
        pin_msg = PinFunction()
        pin_msg.name = function_ref.name
        pin_msg.response_address = self.ip

        serialized = pin_msg.SerializeToString()

        while True:
            # In non-local mode every executor that responds is removed from
            # the candidates, so the set can run dry; report that as running
            # out of resources instead of sampling from an empty population.
            if len(candidates) == 0:
                return False

            # Pick a random executor from the set of candidates and attempt to
            # pin this function there. The set is converted to a list because
            # random.sample only accepts sequences on newer Python versions.
            node, tid = sys_random.sample(list(candidates), 1)[0]

            sckt = self.pusher_cache.get(get_pin_address(node, tid))
            sckt.send(serialized)

            response = GenericResponse()
            try:
                response.ParseFromString(self.pin_accept_socket.recv())
            except zmq.ZMQError:
                # The executor is kept as a candidate, so it may be picked
                # again on a later iteration.
                logging.error('Pin operation to %s:%d timed out. Retrying.' %
                              (node, tid))
                continue

            # Do not use this executor either way: If it rejected, it has
            # something else pinned, and if it accepted, it has pinned what we
            # just asked it to pin.
            # In local mode, allow executors to have multiple functions pinned.
            if not self.local:
                self.unpinned_executors.discard((node, tid))
                candidates.discard((node, tid))

            if response.success:
                # The pin operation succeeded, so we record the node and
                # thread ID for this DAG and report success to the caller.
                self.pending_dags[dag_name].append((function_ref.name, (node,
                                                                        tid)))
                return True

            # The pin operation was rejected, so try another candidate.
            logging.error('Node %s:%d rejected pin for %s. Retrying.'
                          % (node, tid, function_ref.name))
Exemplo n.º 3
0
def pin(pin_socket, pusher_cache, kvs, status, function_cache, runtimes,
        exec_counts, user_library, local, batching):
    '''
    Handles a single PinFunction request read from `pin_socket`.

    The request is rejected (an error response is sent and the incoming
    `batching` value is returned unchanged) when we are not in local mode and
    a different function is already present in `function_cache`. Otherwise
    the function is retrieved, stored in `function_cache`, registered in
    `status.functions`, `runtimes` and `exec_counts`, an OK response is sent,
    and the `batching` flag of the request is returned.

    Raises a RuntimeError if the request has batching enabled while more than
    one function is registered in `status.functions`. Note that this happens
    after the metadata above has been updated and before any response is
    sent.
    '''
    serialized = pin_socket.recv()
    pin_msg = PinFunction()
    pin_msg.ParseFromString(serialized)

    sckt = pusher_cache.get(
        sutils.get_pin_accept_port(pin_msg.response_address))
    name = pin_msg.name

    # We currently only allow one pinned function per container in non-local
    # mode.
    if not local and len(function_cache) > 0 and name not in function_cache:
        sckt.send(sutils.error.SerializeToString())
        return batching

    func = utils.retrieve_function(name, kvs, user_library)

    # The function must exist -- because otherwise the DAG couldn't be
    # registered -- so we keep trying to retrieve it.
    while not func:
        func = utils.retrieve_function(name, kvs, user_library)

    if name not in function_cache:
        logging.info("writing function cache for entry %s, it's a type %s"
                     % (name, type(func)))
        import cloudpickle
        if isinstance(func, bytes):
            # NOTE(review): this unpickles bytes retrieved for this function,
            # which executes arbitrary code -- only safe if that data is
            # trusted.
            func = cloudpickle.loads(func)
        function_cache[name] = func

    if name not in status.functions:
        status.functions.append(name)

    # Add metadata tracking for the newly pinned functions.
    runtimes[name] = []
    exec_counts[name] = 0
    logging.info('Adding function %s to my local pinned functions.' % (name))

    if pin_msg.batching and len(status.functions) > 1:
        raise RuntimeError(
            'There is more than one pinned function (we are' +
            ' operating in local mode), and the function' +
            ' attempting to be pinned has batching enabled. This' +
            ' is not allowed -- you can only use batching in' +
            ' cluster mode or in local mode with one function.')

    sckt.send(sutils.ok_resp)

    return pin_msg.batching
Exemplo n.º 4
0
    def test_occupied_pin(self):
        '''
        This test attempts to pin a function onto an executor where another
        function is already pinned. We currently only allow one pinned
        function per executor in non-local mode, so this operation should
        fail.
        '''
        # Create a new function in the KVS.
        fname = 'incr'

        def func(_, x):
            return x + 1

        create_function(func, self.kvs_client, fname)

        # Create a pin message and put it into the socket.
        msg = PinFunction(name=fname, response_address=self.ip)
        self.socket.inbox.append(msg.SerializeToString())

        # Add an already pinned function, so that we reject the request. The
        # execution count is an integer, matching what a successful pin
        # stores.
        self.pinned_functions['square'] = lambda _, x: x * x
        self.runtimes['square'] = []
        self.exec_counts['square'] = 0
        self.status.functions.append('square')

        # Execute the pin operation (the two trailing flags are passed as
        # False).
        pin(self.socket, self.pusher_cache, self.kvs_client, self.status,
            self.pinned_functions, self.runtimes, self.exec_counts,
            self.user_library, False, False)

        # Check that exactly one response was sent and that it reports
        # failure.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 1)
        response = GenericResponse()
        response.ParseFromString(self.pusher_cache.socket.outbox[0])
        self.assertFalse(response.success)

        # Make sure that none of the metadata was corrupted with this failed
        # pin attempt.
        self.assertNotIn(fname, self.pinned_functions)
        self.assertNotIn(fname, self.runtimes)
        self.assertNotIn(fname, self.exec_counts)
        self.assertNotIn(fname, self.status.functions)
Exemplo n.º 5
0
def pin(pin_socket, pusher_cache, kvs, status, function_cache, runtimes,
        exec_counts, user_library, local):
    '''
    Handles a single PinFunction request read from `pin_socket`.

    In non-local mode the request is rejected with an error response when a
    different function is already present in `function_cache`, or when
    `status.running` is falsy. Otherwise an OK response is sent right away
    (before the function has been retrieved), and the function is then
    fetched, stored in `function_cache`, and registered in
    `status.functions`, `runtimes` and `exec_counts`.

    Returns None in both cases.
    '''
    serialized = pin_socket.recv()
    pin_msg = PinFunction()
    pin_msg.ParseFromString(serialized)

    sckt = pusher_cache.get(
        sutils.get_pin_accept_port(pin_msg.response_address))
    name = pin_msg.name

    # We currently only allow one pinned function per container in non-local
    # mode.
    if (not local and ((len(function_cache) > 0 and name not in function_cache)
                       or not status.running)):
        sckt.send(sutils.error.SerializeToString())
        return

    sckt.send(sutils.ok_resp)

    func = utils.retrieve_function(name, kvs, user_library)

    # The function must exist -- because otherwise the DAG couldn't be
    # registered -- so we keep trying to retrieve it.
    while not func:
        func = utils.retrieve_function(name, kvs, user_library)

    if name not in function_cache:
        function_cache[name] = func

    if name not in status.functions:
        status.functions.append(name)

    # Add metadata tracking for the newly pinned functions.
    runtimes[name] = []
    exec_counts[name] = 0
    logging.info('Adding function %s to my local pinned functions.' % (name))
Exemplo n.º 6
0
    def pin_function(self, dag_name, function_ref, colocated):
        '''
        Attempts to pin `function_ref` onto an unpinned executor and records
        the placement for `dag_name` in `self.pending_dags`.

        GPU functions (`function_ref.gpu`) choose from
        `self.unpinned_gpu_executors`; all other functions choose from
        `self.unpinned_cpu_executors`. If `colocated` is non-empty (and the
        function is not a GPU function), the candidates are restricted to
        executors on nodes where a colocated function of this DAG is already
        pending, or -- if there is none yet -- to nodes on which all
        executor threads are unpinned. If no such executor is available, or
        all of them have been tried, pinning is retried without the
        colocation constraint.

        Returns True once an executor accepts the pin request, and False if
        we ran out of executors to try.
        '''
        # If there are no executors left to choose from, then we return
        # False, indicating that we ran out of resources to use.
        if function_ref.gpu and len(self.unpinned_gpu_executors) == 0:
            return False
        elif not function_ref.gpu and len(self.unpinned_cpu_executors) == 0:
            return False

        if dag_name not in self.pending_dags:
            self.pending_dags[dag_name] = []

        # Make a copy of the set of executors, so that iterating over the
        # candidates is independent of the system's metadata.
        if function_ref.gpu:
            candidates = set(self.unpinned_gpu_executors)
        elif len(colocated) == 0:
            # If this is not a GPU function, just look at all of the unpinned
            # executors.
            candidates = set(self.unpinned_cpu_executors)
        else:
            candidates = set()

            # Collect the nodes that already host a pending function of this
            # DAG which we are supposed to be colocated with.
            candidate_nodes = set()
            for fn, thread in self.pending_dags[dag_name]:
                if fn in colocated:
                    candidate_nodes.add(thread[0])  # The node's IP

            if len(candidate_nodes) > 0:
                for node, tid in self.unpinned_cpu_executors:
                    if node in candidate_nodes:
                        candidates.add((node, tid))
            else:
                # If this is the first colocate to be pinned, try to assign to
                # an empty node, i.e., one where every thread is unpinned.
                nodes = {}
                for node, tid in self.unpinned_cpu_executors:
                    if node not in nodes:
                        nodes[node] = 0
                    nodes[node] += 1

                for node in nodes:
                    if nodes[node] == NUM_EXECUTOR_THREADS:
                        # NOTE(review): this assumes thread IDs are
                        # 0..NUM_EXECUTOR_THREADS-1 -- confirm.
                        for i in range(NUM_EXECUTOR_THREADS):
                            candidates.add((node, i))

        if len(candidates) == 0:  # There are no valid executors to colocate on.
            return self.pin_function(dag_name, function_ref, [])

        # Construct a PinFunction message to be sent to executors.
        pin_msg = PinFunction()
        pin_msg.name = function_ref.name
        pin_msg.batching = function_ref.batching
        pin_msg.response_address = self.ip

        serialized = pin_msg.SerializeToString()

        while True:
            # In non-local mode every executor that responds is removed from
            # the candidates, so the set can run dry. This check has to sit
            # at the top of the loop: every path below either returns or
            # continues.
            if len(candidates) == 0:
                if len(colocated) > 0:
                    # Try again without colocation.
                    return self.pin_function(dag_name, function_ref, [])
                return False

            # Pick a random executor from the set of candidates and attempt to
            # pin this function there. The set is converted to a list because
            # random.sample only accepts sequences on newer Python versions.
            node, tid = sys_random.sample(list(candidates), 1)[0]

            sckt = self.pusher_cache.get(get_pin_address(node, tid))
            sckt.send(serialized)

            response = GenericResponse()
            try:
                response.ParseFromString(self.pin_accept_socket.recv())
            except zmq.ZMQError:
                # The executor is kept as a candidate, so it may be picked
                # again on a later iteration.
                logging.error('Pin operation to %s:%d timed out. Retrying.' %
                              (node, tid))
                continue

            # Do not use this executor either way: If it rejected, it has
            # something else pinned, and if it accepted, it has pinned what we
            # just asked it to pin. In local mode, however we allow executors
            # to have multiple functions pinned.
            if not self.local:
                if function_ref.gpu:
                    self.unpinned_gpu_executors.discard((node, tid))
                else:
                    self.unpinned_cpu_executors.discard((node, tid))
                candidates.discard((node, tid))

            if response.success:
                # The pin operation succeeded, so we record the node and
                # thread ID for this DAG and report success to the caller.
                self.pending_dags[dag_name].append((function_ref.name, (node,
                                                                        tid)))
                return True

            # The pin operation was rejected, so try another candidate.
            logging.error('Node %s:%d rejected pin for %s. Retrying.'
                          % (node, tid, function_ref.name))
Exemplo n.º 7
0
    def test_create_gpu_dag(self):
        '''
        This test creates a DAG with a single function that is marked as a
        GPU function, checking that the pin message is sent to the GPU
        executor and that the DAG and the server metadata are stored as
        expected.
        '''
        # Create a simple single-function DAG, mark its function as a GPU
        # function, and add it to the inbound socket.
        dag_name = 'dag'
        fn = 'fn'

        dag = create_linear_dag([None], [fn], self.kvs_client, dag_name)
        dag.functions[0].gpu = True
        self.socket.inbox.append(dag.SerializeToString())

        dags = {}
        call_frequency = {}

        # Register a single unpinned GPU executor with the policy engine.
        address_set = {(self.ip, 1)}
        self.policy.unpinned_gpu_executors.update(address_set)

        # Prepopulate the pin_accept socket with a success message.
        self.pin_socket.inbox.append(sutils.ok_resp)

        create_dag(self.socket, self.pusher_cache, self.kvs_client, dags,
                   self.policy, call_frequency)

        # Test that the correct metadata was created.
        self.assertIn(dag_name, dags)
        created, dag_source = dags[dag_name]
        self.assertEqual(created, dag)
        self.assertEqual(len(dag_source), 1)
        self.assertEqual(list(dag_source)[0], fn)
        self.assertIn(fn, call_frequency)
        self.assertEqual(call_frequency[fn], 0)

        # Test that the DAG is stored in the KVS correctly.
        result = self.kvs_client.get(dag_name)[dag_name]
        created = Dag()
        created.ParseFromString(result.reveal())
        self.assertEqual(created, dag)

        # Test that the correct response was returned to the user. This has
        # to be assertEqual: assertTrue would treat the 1 as a failure message
        # and never compare the length against it.
        self.assertEqual(len(self.socket.outbox), 1)
        response = GenericResponse()
        response.ParseFromString(self.socket.outbox.pop())
        self.assertTrue(response.success)

        # Test that the correct pin messages were sent.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 1)
        messages = self.pusher_cache.socket.outbox
        function_set = {fn}
        for message in messages:
            pin_msg = PinFunction()
            pin_msg.ParseFromString(message)
            self.assertEqual(pin_msg.response_address, self.ip)
            self.assertIn(pin_msg.name, function_set)
            function_set.discard(pin_msg.name)

        self.assertEqual(len(function_set), 0)

        for address in address_set:
            self.assertIn(get_pin_address(*address),
                          self.pusher_cache.addresses)

        # Test that the policy engine has the correct metadata stored. The
        # executor was registered as a GPU executor, so that is the pool that
        # must have been drained.
        self.assertEqual(len(self.policy.unpinned_gpu_executors), 0)
        self.assertEqual(len(self.policy.unpinned_cpu_executors), 0)
        self.assertEqual(len(self.policy.pending_dags), 0)
        self.assertIn(fn, self.policy.function_locations)

        self.assertEqual(len(self.policy.function_locations[fn]), 1)
Exemplo n.º 8
0
    def test_create_dag_insufficient_resources(self):
        '''
        This test attempts to create a DAG even though there are not enough
        free executors in the system. It checks that a pin message is attempted
        to be sent, we run out of resources, and then the request is rejected.
        We check that the metadata is properly restored back to its original
        state.
        '''
        # Create a simple two-function DAG and add it to the inbound socket.
        source = 'source'
        sink = 'sink'
        dag_name = 'dag'

        dag = create_linear_dag([None, None], [source, sink], self.kvs_client,
                                dag_name)
        self.socket.inbox.append(dag.SerializeToString())

        # Add relevant metadata to the policy engine, but set the number of
        # executors to fewer than needed.
        address_set = {(self.ip, 1)}
        self.policy.unpinned_cpu_executors.update(address_set)

        # Prepopulate the pin_accept socket with a success message for the one
        # pin that can go through.
        self.pin_socket.inbox.append(sutils.ok_resp)

        # Attempt to create the DAG.
        dags = {}
        call_frequency = {}
        create_dag(self.socket, self.pusher_cache, self.kvs_client, dags,
                   self.policy, call_frequency)

        # Check that an error was returned to the user.
        self.assertEqual(len(self.socket.outbox), 1)
        response = GenericResponse()
        response.ParseFromString(self.socket.outbox[0])
        self.assertFalse(response.success)
        self.assertEqual(response.error, NO_RESOURCES)

        # Test that the correct pin messages were sent.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 2)
        messages = self.pusher_cache.socket.outbox

        # Checks for the pin message.
        pin_msg = PinFunction()
        pin_msg.ParseFromString(messages[0])
        self.assertEqual(pin_msg.response_address, self.ip)
        self.assertEqual(pin_msg.name, source)

        # Checks for the unpin message.
        self.assertEqual(messages[1], source)

        # The set holds exactly one executor address; fetch it without
        # sampling, since random.sample no longer accepts a set on newer
        # Python versions.
        address = next(iter(address_set))
        addresses = self.pusher_cache.addresses
        self.assertEqual(get_pin_address(*address), addresses[0])
        self.assertEqual(get_unpin_address(*address), addresses[1])

        # Check the state of the policy engine's metadata after the failure.
        self.assertEqual(len(self.policy.unpinned_cpu_executors), 0)
        self.assertEqual(len(self.policy.function_locations), 0)
        self.assertEqual(len(self.policy.pending_dags), 0)

        # Check that no additional metadata was created or sent.
        self.assertEqual(len(call_frequency), 0)
        self.assertEqual(len(dags), 0)
Exemplo n.º 9
0
    def test_create_dag(self):
        '''
        This test creates a new DAG, checking that the correct pin messages are
        sent to executors and that it is persisted in the KVS correctly. It
        also checks that the server metadata was updated as expected.
        '''
        # Create a simple two-function DAG and add it to the inbound socket.
        source = 'source'
        sink = 'sink'
        dag_name = 'dag'

        dag = create_linear_dag([None, None], [source, sink], self.kvs_client,
                                dag_name)
        self.socket.inbox.append(dag.SerializeToString())

        # Add relevant metadata to the policy engine.
        address_set = {(self.ip, 1), (self.ip, 2)}
        self.policy.unpinned_cpu_executors.update(address_set)

        # Prepopulate the pin_accept socket with sufficient success messages.
        self.pin_socket.inbox.append(sutils.ok_resp)
        self.pin_socket.inbox.append(sutils.ok_resp)

        # Call the DAG creation method.
        dags = {}
        call_frequency = {}
        create_dag(self.socket, self.pusher_cache, self.kvs_client, dags,
                   self.policy, call_frequency)

        # Test that the correct metadata was created.
        self.assertIn(dag_name, dags)
        created, dag_source = dags[dag_name]
        self.assertEqual(created, dag)
        self.assertEqual(len(dag_source), 1)
        self.assertEqual(list(dag_source)[0], source)
        self.assertIn(source, call_frequency)
        self.assertIn(sink, call_frequency)
        self.assertEqual(call_frequency[source], 0)
        self.assertEqual(call_frequency[sink], 0)

        # Test that the DAG is stored in the KVS correctly.
        result = self.kvs_client.get(dag_name)[dag_name]
        created = Dag()
        created.ParseFromString(result.reveal())
        self.assertEqual(created, dag)

        # Test that the correct response was returned to the user. This has
        # to be assertEqual: assertTrue would treat the 1 as a failure message
        # and never compare the length against it.
        self.assertEqual(len(self.socket.outbox), 1)
        response = GenericResponse()
        response.ParseFromString(self.socket.outbox.pop())
        self.assertTrue(response.success)

        # Test that the correct pin messages were sent.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 2)
        messages = self.pusher_cache.socket.outbox
        function_set = {source, sink}
        for message in messages:
            pin_msg = PinFunction()
            pin_msg.ParseFromString(message)
            self.assertEqual(pin_msg.response_address, self.ip)
            self.assertIn(pin_msg.name, function_set)
            function_set.discard(pin_msg.name)

        self.assertEqual(len(function_set), 0)

        for address in address_set:
            self.assertIn(get_pin_address(*address),
                          self.pusher_cache.addresses)

        # Test that the policy engine has the correct metadata stored.
        self.assertEqual(len(self.policy.unpinned_cpu_executors), 0)
        self.assertEqual(len(self.policy.pending_dags), 0)
        self.assertIn(source, self.policy.function_locations)
        self.assertIn(sink, self.policy.function_locations)

        self.assertEqual(len(self.policy.function_locations[source]), 1)
        self.assertEqual(len(self.policy.function_locations[sink]), 1)