Пример #1
0
    def testOperandPrepush(self):
        """Start two input operands as READY and wait for the middle operand to finish.

        ``OperandActor._get_raw_execution_ref`` is patched to hand out a single
        ``FakeExecutionActor`` ref and ``AssignerActor.get_worker_assignments``
        is patched to return ``mock_workers``.

        Raises:
            TimeoutError: when the middle operand has not reached
                ``OperandState.FINISHED`` after more than 30 seconds of polling.
        """
        session_id, graph_key = str(uuid.uuid4()), str(uuid.uuid4())
        mock_workers = ['localhost:12345']

        def _mock_worker_assignments(*_):
            return mock_workers

        with self._prepare_test_graph(session_id, graph_key, mock_workers) as (pool, graph_ref):
            op_keys = self._filter_graph_level_op_keys(graph_ref)
            input_op_keys, mid_op_key, _output_op_keys = op_keys
            fake_exec_ref = pool.create_actor(FakeExecutionActor, 0.5)

            def _operand_ref(op_key):
                return pool.actor_ref(OperandActor.gen_uid(session_id, op_key))

            input_refs = [_operand_ref(op_key) for op_key in input_op_keys]
            mid_ref = _operand_ref(mid_op_key)

            with patch_method(OperandActor._get_raw_execution_ref,
                              new=lambda *_, **__: fake_exec_ref), \
                    patch_method(AssignerActor.get_worker_assignments,
                                 new=_mock_worker_assignments):
                for position in (0, 1):
                    input_refs[position].start_operand(OperandState.READY)

                polling_started = time.time()
                # submission without pre-push will fail
                while mid_ref.get_state() != OperandState.FINISHED:
                    pool.sleep(0.1)
                    elapsed = time.time() - polling_started
                    if elapsed > 30:
                        raise TimeoutError('Check middle chunk state timed out.')
Пример #2
0
    def testErrorOnPrepare(self, *_):
        """Check the graph state after errors and cancellations while preparing a graph.

        Three graph actors are executed over the same serialized graph:

        * ``GraphActor.create_operand_actors`` raising ``RuntimeError``; the
          error propagates and the state becomes ``GraphState.FAILED``;
        * ``GraphActor.create_operand_actors`` setting the graph meta state to
          ``CANCELLING``; the state becomes ``GraphState.CANCELLED``;
        * ``GraphAnalyzer.calc_operand_assignments`` doing the same and returning
          an empty dict; the state becomes ``GraphState.CANCELLED``.
        """
        session_id = str(uuid.uuid4())
        addr = '127.0.0.1:%d' % get_next_port()

        with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
            for actor_cls in (ChunkMetaActor, AssignerActor):
                pool.create_actor(actor_cls, uid=actor_cls.default_name())

            for worker in ('localhost:12345', 'localhost:23456'):
                resource_ref.set_worker_meta(worker, dict(hardware=dict(cpu_total=4)))

            def _spawn_graph_actor(key):
                # every scenario gets its own GraphActor over the same graph
                return pool.create_actor(GraphActor, session_id, key, serialized_graph,
                                         uid=GraphActor.gen_name(session_id, key))

            def _request_cancel(key):
                meta_ref = pool.actor_ref(GraphMetaActor.gen_name(session_id, key))
                meta_ref.set_state(GraphState.CANCELLING)

            # error occurred in create_operand_actors
            failing_key = str(uuid.uuid4())
            expr = mt.random.random((8, 2), chunk_size=2) + 1
            serialized_graph = serialize_graph(expr.build_graph(compose=False))
            graph_ref = _spawn_graph_actor(failing_key)

            def _mock_raises(*_, **__):
                raise RuntimeError

            with patch_method(GraphActor.create_operand_actors, new=_mock_raises), \
                    self.assertRaises(RuntimeError):
                graph_ref.execute_graph()
            self.assertEqual(graph_ref.get_state(), GraphState.FAILED)
            graph_ref.destroy()

            # interrupted during create_operand_actors
            cancelled_key = str(uuid.uuid4())
            graph_ref = _spawn_graph_actor(cancelled_key)

            def _cancel_while_creating(*_, **__):
                _request_cancel(cancelled_key)

            with patch_method(GraphActor.create_operand_actors, new=_cancel_while_creating):
                graph_ref.execute_graph()
            self.assertEqual(graph_ref.get_state(), GraphState.CANCELLED)

            # interrupted during previous steps
            early_cancel_key = str(uuid.uuid4())
            graph_ref = _spawn_graph_actor(early_cancel_key)

            def _cancel_while_assigning(*_, **__):
                _request_cancel(early_cancel_key)
                return {}

            with patch_method(GraphAnalyzer.calc_operand_assignments,
                              new=_cancel_while_assigning):
                graph_ref.execute_graph()
            self.assertEqual(graph_ref.get_state(), GraphState.CANCELLED)
Пример #3
0
    def testDestroyCalcActor(self):
        """Check when a ``CpuCalcActor`` disappears after ``mark_destroy``.

        First pool: ``mark_destroy`` is called on an actor with no submitted
        calculation and the actor is expected to be gone 0.8 seconds later.

        Second pool: ``mark_destroy`` is called while a (delayed) calculation
        has started and a second calculation is submitted afterwards; the
        actor must still exist right after submission, both promises must
        complete, and the actor is expected to be gone 0.8 seconds after that.
        """
        import gevent.event

        with self._start_calc_pool() as (_pool, test_actor):
            idle_ref = _pool.actor_ref(CpuCalcActor.default_uid())
            idle_ref.mark_destroy()
            gevent.sleep(0.8)
            self.assertFalse(_pool.has_actor(idle_ref))

        with self._start_calc_pool() as (_pool, test_actor):
            calc_ref = test_actor.promise_ref(CpuCalcActor.default_uid())

            session_id = str(uuid.uuid4())
            arrays = [np.random.random((10, 10)) for _ in range(2)]
            exec_graph, fetch_chunks, add_chunk = self._build_test_graph(arrays)
            exec_graph2, fetch_chunks2, add_chunk2 = self._build_test_graph(arrays[::-1])

            storage_client = test_actor.storage_client

            def _put_into_shared_memory(chunks, values):
                for chunk, value in zip(chunks, values):
                    put_promise = storage_client.put_objects(
                        session_id, [chunk.key], [value],
                        [DataStorageDevice.SHARED_MEMORY])
                    self.waitp(put_promise)

            _put_into_shared_memory(fetch_chunks, arrays)
            _put_into_shared_memory(fetch_chunks2, arrays[::-1])

            real_calc_results = CpuCalcActor._calc_results
            calc_started = gevent.event.Event()

            def _delayed_calc_results(actor_obj, *args, **kwargs):
                # signal the test that calculation began, then stall for a second
                calc_started.set()
                gevent.sleep(1)
                return real_calc_results(actor_obj, *args, **kwargs)

            def _calc_then_store(graph, result_chunk):
                calc_promise = calc_ref.calc(
                    session_id, result_chunk.op.key, serialize_graph(graph),
                    [result_chunk.key], _promise=True)
                return calc_promise.then(lambda *_: calc_ref.store_results(
                    session_id, result_chunk.op.key, [result_chunk.key], None,
                    _promise=True))

            with patch_method(CpuCalcActor._calc_results, _delayed_calc_results):
                p = _calc_then_store(exec_graph, add_chunk)
                calc_started.wait()
                calc_ref.mark_destroy()

                p2 = _calc_then_store(exec_graph2, add_chunk2)

                self.assertTrue(_pool.has_actor(calc_ref._ref))
                self.waitp(p)
                self.waitp(p2)

            gevent.sleep(0.8)
            self.assertFalse(_pool.has_actor(calc_ref._ref))
Пример #4
0
    def testCpuCalcErrorInRunning(self):
        """Check that a ``ValueError`` raised by ``_calc_results`` reaches the waiter.

        ``CpuCalcActor._calc_results`` is patched to raise ``ValueError``; waiting
        on the calc-then-store promise chain must raise that same exception type.
        """
        def _raise_value_error(*_, **__):
            raise ValueError

        with self._start_calc_pool() as (_pool, test_actor):
            calc_ref = test_actor.promise_ref(CpuCalcActor.default_uid())

            session_id = str(uuid.uuid4())
            arrays = [np.random.random((10, 10)) for _ in range(2)]
            exec_graph, fetch_chunks, add_chunk = self._build_test_graph(arrays)

            storage_client = test_actor.storage_client

            # place every input array into shared memory before calculating
            for chunk, value in zip(fetch_chunks, arrays):
                put_promise = storage_client.put_objects(
                    session_id, [chunk.key], [value],
                    [DataStorageDevice.SHARED_MEMORY])
                self.waitp(put_promise)

            def _store_results(*_):
                return calc_ref.store_results(
                    session_id, add_chunk.op.key, [add_chunk.key], None,
                    _promise=True)

            with patch_method(CpuCalcActor._calc_results, _raise_value_error), \
                    self.assertRaises(ValueError):
                calc_promise = calc_ref.calc(
                    session_id, add_chunk.op.key, serialize_graph(exec_graph),
                    [add_chunk.key], _promise=True)
                self.waitp(calc_promise.then(_store_results))
Пример #5
0
    def testReadyState(self, *_):
        """Check which state the middle operand reaches once its inputs finish.

        Without chunk meta for the inputs the middle operand is expected to
        stay ``UNSCHEDULED``; after meta is registered for every input chunk
        it is expected to become ``READY``. ``ResourceActor.get_workers_meta``
        is patched to report one CPU and 1 GiB of memory per mock worker.
        """
        session_id = str(uuid.uuid4())
        graph_key = str(uuid.uuid4())
        mock_workers = ['localhost:12345', 'localhost:23456']

        def _mock_get_workers_meta(*_, **__):
            return {
                worker: {'hardware': {'cpu_total': 1, 'memory': 1024 ** 3}}
                for worker in mock_workers
            }

        with patch_method(ResourceActor.get_workers_meta, new=_mock_get_workers_meta), \
                self._prepare_test_graph(session_id, graph_key, mock_workers) as (pool, graph_ref):
            op_keys = self._filter_graph_level_op_keys(graph_ref)
            input_op_keys, mid_op_key, _output_op_keys = op_keys

            cluster_info_ref = pool.actor_ref(SchedulerClusterInfoActor.default_uid())
            meta_client = ChunkMetaClient(pool, cluster_info_ref)
            op_ref = pool.actor_ref(OperandActor.gen_uid(session_id, mid_op_key))
            resource_ref = pool.actor_ref(ResourceActor.default_uid())

            input_refs = []
            for input_key in input_op_keys:
                input_refs.append(
                    pool.actor_ref(OperandActor.gen_uid(session_id, input_key)))

            def _expect_state_after_inputs_finish(target):
                # reset the middle operand and its inputs to UNSCHEDULED
                for input_key in input_op_keys:
                    op_ref.remove_finished_predecessor(input_key)
                op_ref.start_operand(OperandState.UNSCHEDULED)
                for input_ref in input_refs:
                    input_ref.start_operand(OperandState.UNSCHEDULED)

                # finish the inputs one by one; the middle operand must not
                # have left UNSCHEDULED before each of them is finished
                for input_ref in input_refs:
                    current_state = op_ref.get_state()
                    self.assertEqual(current_state, OperandState.UNSCHEDULED)
                    input_ref.start_operand(OperandState.FINISHED)

                pool.sleep(1)
                self.assertEqual(target, op_ref.get_state())
                for worker in mock_workers:
                    resource_ref.deallocate_resource(session_id, mid_op_key, worker)

            # test entering state with no input meta
            _expect_state_after_inputs_finish(OperandState.UNSCHEDULED)

            # fill meta
            chunk_keys = self._filter_graph_level_chunk_keys(graph_ref)
            input_chunk_keys, _mid_chunk_key, _output_chunk_keys = chunk_keys
            for chunk_key in input_chunk_keys:
                meta_client.set_chunk_meta(
                    session_id, chunk_key, workers=('localhost:12345', ), size=800)

            # test successful entering state
            _expect_state_after_inputs_finish(OperandState.READY)
Пример #6
0
    def testReadyState(self, *_):
        """Check which state the middle operand reaches once its inputs finish.

        The middle operand is expected to stay ``UNSCHEDULED`` when no input
        chunk meta exists and when ``ChunkMetaClient.batch_get_chunk_size`` is
        patched to return ``[None, None]``; with meta present and no patch it
        is expected to become ``READY``.
        """
        session_id = str(uuid.uuid4())
        graph_key = str(uuid.uuid4())
        mock_workers = ['localhost:12345', 'localhost:23456']

        def _no_chunk_sizes(*_):
            return [None, None]

        with self._prepare_test_graph(session_id, graph_key, mock_workers) as (pool, graph_ref):
            op_keys = self._filter_graph_level_op_keys(graph_ref)
            input_op_keys, mid_op_key, _output_op_keys = op_keys

            cluster_info_ref = pool.actor_ref(SchedulerClusterInfoActor.default_name())
            meta_client = ChunkMetaClient(pool, cluster_info_ref)
            op_ref = pool.actor_ref(OperandActor.gen_uid(session_id, mid_op_key))

            input_refs = []
            for input_key in input_op_keys:
                input_refs.append(
                    pool.actor_ref(OperandActor.gen_uid(session_id, input_key)))

            def _expect_state_after_inputs_finish(target):
                # reset the middle operand and its inputs to UNSCHEDULED
                for input_key in input_op_keys:
                    op_ref.remove_finished_predecessor(input_key)
                op_ref.start_operand(OperandState.UNSCHEDULED)
                for input_ref in input_refs:
                    input_ref.start_operand(OperandState.UNSCHEDULED)

                # finish the inputs one by one; the middle operand must not
                # have left UNSCHEDULED before each of them is finished
                for input_ref in input_refs:
                    current_state = op_ref.get_state()
                    self.assertEqual(current_state, OperandState.UNSCHEDULED)
                    input_ref.start_operand(OperandState.FINISHED)

                pool.sleep(0.5)
                self.assertEqual(target, op_ref.get_state())

            # test entering state with no input meta
            _expect_state_after_inputs_finish(OperandState.UNSCHEDULED)

            # fill meta
            chunk_keys = self._filter_graph_level_chunk_keys(graph_ref)
            input_chunk_keys, _mid_chunk_key, _output_chunk_keys = chunk_keys
            for chunk_key in input_chunk_keys:
                meta_client.set_chunk_meta(
                    session_id, chunk_key, workers=('localhost:12345', ), size=800)

            # test entering state with failure in fetching sizes
            with patch_method(ChunkMetaClient.batch_get_chunk_size, new=_no_chunk_sizes):
                _expect_state_after_inputs_finish(OperandState.UNSCHEDULED)

            # test successful entering state
            _expect_state_after_inputs_finish(OperandState.READY)
Пример #7
0
    def testMemQuotaAllocation(self):
        """Check that a quota request completes only after mocked memory grows.

        ``resource.virtual_memory`` is patched to report 50 available out of
        300. A request for 100 is issued, the mocked available/free figures
        are raised to 150 after 0.5 seconds, and the time between issuing the
        request and running its callback must exceed 0.4 seconds.
        """
        from mars import resource
        from mars.utils import AttributeDict

        mock_mem_stat = AttributeDict(
            {'total': 300, 'available': 50, 'used': 0, 'free': 50})

        def _mock_virtual_memory():
            return mock_mem_stat

        local_pool_addr = 'localhost:%d' % get_next_port()
        with create_actor_pool(n_process=1, backend='gevent', address=local_pool_addr) as pool, \
                patch_method(resource.virtual_memory, new=_mock_virtual_memory):
            pool.create_actor(WorkerClusterInfoActor, schedulers=[local_pool_addr],
                              uid=WorkerClusterInfoActor.default_name())
            pool.create_actor(StatusActor, local_pool_addr,
                              uid=StatusActor.default_name())

            for actor_cls in (DispatchActor, ProcessHelperActor):
                pool.create_actor(actor_cls, uid=actor_cls.default_name())
            quota_ref = pool.create_actor(
                MemQuotaActor, 300, refresh_time=0.1, uid=MemQuotaActor.default_name())

            # time_recs[0]: request issued, time_recs[1]: quota callback executed
            time_recs = []
            with self.run_actor_test(pool) as test_actor:
                ref = test_actor.promise_ref(quota_ref)
                time_recs.append(time.time())

                def actual_exec(x):
                    ref.release_quota(x)
                    time_recs.append(time.time())
                    test_actor.set_result(None)

                on_granted = functools.partial(actual_exec, 'req')
                ref.request_quota('req', 100, _promise=True).then(on_granted)

                pool.sleep(0.5)
                mock_mem_stat['available'] = 150
                mock_mem_stat['free'] = 150

                self.get_result(2)

            waited = abs(time_recs[0] - time_recs[1])
            self.assertGreater(waited, 0.4)
Пример #8
0
    def testServiceArgs(self):
        """Check how ``WorkerService`` interprets its constructor arguments.

        Covers the integer type of the computed memory limits, parsing of
        ``total_mem`` given as a number or as a size string, ``MemoryError``
        for inconsistent cache sizes, normalisation of ``spill_dirs`` into a
        list, and the resulting ``n_process`` with and without spill dirs.
        """
        mebibyte = 1024 ** 2

        service = WorkerService(ignore_avail_mem=True)
        self.assertGreaterEqual(service._cache_mem_size, 0)
        for limit in (service._soft_mem_limit, service._hard_mem_limit,
                      service._cache_mem_size):
            self.assertIsInstance(limit, int)

        service = WorkerService(ignore_avail_mem=True, total_mem=256 * mebibyte)
        self.assertEqual(service._total_mem, 256 * mebibyte)

        service = WorkerService(ignore_avail_mem=True, total_mem='512m')
        self.assertEqual(service._total_mem, 512 * mebibyte)

        with self.assertRaises(MemoryError):
            WorkerService(soft_mem_limit='128m', cache_mem_size='256m')

        def _zero_plasma_size(*_, **__):
            return 0

        with self.assertRaises(MemoryError), \
                patch_method(WorkerService._get_plasma_size, new=_zero_plasma_size):
            WorkerService(min_cache_mem_size='1g', cache_mem_size='256m')

        service = WorkerService(ignore_avail_mem=True, spill_dirs='/tmp/a',
                                min_cache_mem_size=0)
        self.assertListEqual(service._spill_dirs, ['/tmp/a'])

        # arguments shared by all process-count checks below
        process_kwargs = dict(ignore_avail_mem=True, n_cpu_process=4,
                              n_net_process=2, min_cache_mem_size=0)

        service = WorkerService(**process_kwargs)
        self.assertEqual(service.n_process, 7)

        # one or two spill directories both yield the same process count
        for spill_dirs in ('/tmp/a', ['/tmp/a', '/tmp/b']):
            service = WorkerService(spill_dirs=spill_dirs, **process_kwargs)
            self.assertEqual(service.n_process, 8)
Пример #9
0
    def testOperandActorWithAssignRetryAndFail(self, *_):
        """Run the operand case while resource allocation always times out.

        ``AssignEvaluationActor._allocate_resource`` is patched to raise
        ``TimeoutError`` and ``options.scheduler.retry_delay`` is set to 0 for
        the duration of the run. Execution actors are ``FakeExecutionActor``
        instances created with ``fail_count=5``.

        The retry delay that was configured before the test is restored
        afterwards, whatever it was, so this test does not leak a hard-coded
        value into tests that run later.
        """
        arr = mt.random.randint(10, size=(10, 8), chunk_size=4)
        arr_add = mt.random.randint(10, size=(10, 8), chunk_size=4)
        arr2 = arr + arr_add

        def _allocate_raises(*_, **__):
            raise TimeoutError

        session_id = str(uuid.uuid4())
        graph_key = str(uuid.uuid4())
        # remember the value in effect now instead of assuming a default
        prev_retry_delay = options.scheduler.retry_delay
        try:
            options.scheduler.retry_delay = 0
            with patch_method(AssignEvaluationActor._allocate_resource,
                              new=_allocate_raises):
                self._run_operand_case(
                    session_id, graph_key, arr2,
                    lambda pool, uid: pool.create_actor(
                        FakeExecutionActor, fail_count=5, uid=uid))
        finally:
            options.scheduler.retry_delay = prev_retry_delay
Пример #10
0
    def testSender(self):
        """Exercise ``SenderActor.send_data`` against ``MockReceiverActor`` pools.

        Scenarios, in order: the data key does not exist (``DependencyMissing``),
        data present only as a spill file, data present in the plasma store,
        sending to two receivers at once (including a repeated request while
        the first is outstanding), sending to an address whose pool has been
        shut down (``BrokenPipeError``), and a receiver whose
        ``receive_data_part`` raises ``ChecksumMismatch``.
        """
        send_pool_addr = 'localhost:%d' % get_next_port()
        recv_pool_addr = 'localhost:%d' % get_next_port()
        recv_pool_addr2 = 'localhost:%d' % get_next_port()

        # NOTE(review): this global option is set here and not restored within
        # this method — confirm that a fixture/tearDown resets it.
        options.worker.spill_directory = os.path.join(
            tempfile.gettempdir(), 'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())

        @contextlib.contextmanager
        def start_send_recv_pool():
            # yields (sender pool, receiver pool); the receiver pool registers a
            # MockReceiverActor under the uid of the real ReceiverActor
            with start_transfer_test_pool(
                    address=send_pool_addr, plasma_size=self.plasma_storage_size) as sp:
                sp.create_actor(SenderActor, uid=SenderActor.default_name())
                with start_transfer_test_pool(
                        address=recv_pool_addr, plasma_size=self.plasma_storage_size) as rp:
                    rp.create_actor(MockReceiverActor, uid=ReceiverActor.default_name())
                    yield sp, rp

        with start_send_recv_pool() as (send_pool, recv_pool):
            chunk_holder_ref = send_pool.actor_ref(ChunkHolderActor.default_name())
            sender_ref = send_pool.actor_ref(SenderActor.default_name())
            receiver_ref = recv_pool.actor_ref(ReceiverActor.default_name())

            sender_mapper_ref = send_pool.actor_ref(PlasmaKeyMapActor.default_name())
            store = PlasmaChunkStore(self._plasma_client, sender_mapper_ref)

            with self.run_actor_test(send_pool) as test_actor:
                # send when data missing
                sender_ref_p = test_actor.promise_ref(sender_ref)
                # every scenario reports its outcome through test_actor.set_result;
                # self.get_result(5) then returns or re-raises it
                sender_ref_p.send_data(session_id, str(uuid.uuid4()), recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                with self.assertRaises(DependencyMissing):
                    self.get_result(5)

                # send data in spill
                write_spill_file(chunk_key1, mock_data)

                sender_ref_p.send_data(session_id, chunk_key1, recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                self.get_result(5)
                assert_array_equal(mock_data, receiver_ref.get_result_data(session_id, chunk_key1))
                # remove the spill file so the next scenario reads from plasma only
                os.unlink(build_spill_file_name(chunk_key1))

                # send data in plasma store
                store.put(session_id, chunk_key1, mock_data)
                chunk_holder_ref.register_chunk(session_id, chunk_key1)

                sender_ref_p.send_data(session_id, chunk_key1, recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                self.get_result(5)
                assert_array_equal(mock_data, receiver_ref.get_result_data(session_id, chunk_key1))

                # send data to multiple targets
                with start_transfer_test_pool(
                        address=recv_pool_addr2, plasma_size=self.plasma_storage_size) as rp2:
                    recv_ref2 = rp2.create_actor(MockReceiverActor, uid=ReceiverActor.default_name())

                    # first request is fired without callbacks; only the repeated
                    # request below reports back to the test actor
                    sender_ref_p.send_data(session_id, chunk_key1,
                                           [recv_pool_addr, recv_pool_addr2], _promise=True)
                    # send data to already transferred / transferring
                    sender_ref_p.send_data(session_id, chunk_key1,
                                           [recv_pool_addr, recv_pool_addr2], _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                    self.get_result(5)
                    assert_array_equal(mock_data, recv_ref2.get_result_data(session_id, chunk_key1))

                # send data to non-exist endpoint which causes error
                # (the pool at recv_pool_addr2 was closed when the block above exited)
                store.put(session_id, chunk_key2, mock_data)
                chunk_holder_ref.register_chunk(session_id, chunk_key2)

                sender_ref_p.send_data(session_id, chunk_key2, recv_pool_addr2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                with self.assertRaises(BrokenPipeError):
                    self.get_result(5)

                # receiver-side failure: every received part raises ChecksumMismatch
                def mocked_receive_data_part(*_):
                    raise ChecksumMismatch

                with patch_method(MockReceiverActor.receive_data_part, new=mocked_receive_data_part):
                    sender_ref_p.send_data(session_id, chunk_key2, recv_pool_addr, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                    with self.assertRaises(ChecksumMismatch):
                        self.get_result(5)
Пример #11
0
    def testClientSpill(self, *_):
        """Exercise ``storage_client.copy_to`` between storage devices.

        Scenarios, in order: copying a key that does not exist (``KeyError``),
        copying to a device that already holds the data, a copy whose
        ``StorageHandler.load_from`` is patched to fail with ``SystemError``,
        copying two objects with SHARED_MEMORY and DISK as targets, and
        copying one more object into SHARED_MEMORY after the earlier fill.
        """
        test_addr = '127.0.0.1:%d' % get_next_port()
        with self.create_pool(n_process=1, address=test_addr) as pool:
            pool.create_actor(WorkerDaemonActor,
                              uid=WorkerDaemonActor.default_uid())
            storage_manager_ref = pool.create_actor(
                StorageManagerActor, uid=StorageManagerActor.default_uid())

            pool.create_actor(DispatchActor, uid=DispatchActor.default_uid())
            pool.create_actor(IORunnerActor)

            # a plain QuotaActor is registered under MemQuotaActor's uid
            pool.create_actor(QuotaActor,
                              1024**2,
                              uid=MemQuotaActor.default_uid())
            pool.create_actor(InProcHolderActor)

            pool.create_actor(PlasmaKeyMapActor,
                              uid=PlasmaKeyMapActor.default_uid())
            pool.create_actor(SharedHolderActor,
                              self.plasma_storage_size,
                              uid=SharedHolderActor.default_uid())

            session_id = str(uuid.uuid4())
            # 20 int16 arrays of 655360 elements each, with one key per array
            data_list = [
                np.random.randint(0, 32767, (655360, ), np.int16)
                for _ in range(20)
            ]
            data_keys = [str(uuid.uuid4()) for _ in range(20)]

            with self.run_actor_test(pool) as test_actor:
                storage_client = test_actor.storage_client
                idx = 0

                shared_handler = storage_client.get_storage_handler(
                    (0, DataStorageDevice.SHARED_MEMORY))
                proc_handler = storage_client.get_storage_handler(
                    (0, DataStorageDevice.PROC_MEMORY))

                def _fill_data():
                    # Put objects into shared memory until StorageFull is raised
                    # (or the data runs out) and return ``idx`` plus the offset
                    # of the last object attempted.
                    # NOTE(review): keys are sliced from ``idx`` but data is not;
                    # the two stay aligned only because idx is 0 at the single
                    # call below.
                    i = 0
                    for i, (key,
                            data) in enumerate(zip(data_keys[idx:],
                                                   data_list)):
                        try:
                            shared_handler.put_objects(session_id, [key],
                                                       [data])
                        except StorageFull:
                            break
                    return i + idx

                idx = _fill_data()

                # test copying non-existing keys
                storage_client.copy_to(session_id, ['non-exist-key'], [DataStorageDevice.SHARED_MEMORY]) \
                    .then(lambda *_: test_actor.set_result(None),
                          lambda *exc: test_actor.set_result(exc, accept=False))
                with self.assertRaises(KeyError):
                    self.get_result(5)

                # test copying into containing locations
                storage_client.copy_to(session_id, [data_keys[0]], [DataStorageDevice.SHARED_MEMORY]) \
                    .then(lambda *_: test_actor.set_result(None),
                          lambda *exc: test_actor.set_result(exc, accept=False))
                self.get_result(5)

                # the object must still be recorded in shared memory only
                self.assertEqual(
                    sorted(
                        storage_manager_ref.get_data_locations(
                            session_id, [data_keys[0]])[0]),
                    [(0, DataStorageDevice.SHARED_MEMORY)])

                # test unsuccessful copy when no data at target
                def _mock_load_from(*_, **__):
                    # an already-finished promise rejected with SystemError
                    return promise.finished(*build_exc_info(SystemError),
                                            _accept=False)

                with patch_method(StorageHandler.load_from, _mock_load_from), \
                        self.assertRaises(SystemError):
                    storage_client.copy_to(session_id, [data_keys[0]], [DataStorageDevice.DISK]) \
                        .then(lambda *_: test_actor.set_result(None),
                              lambda *exc: test_actor.set_result(exc, accept=False))
                    self.get_result(5)

                # test successful copy for multiple objects
                storage_client.delete(session_id, [data_keys[idx - 1]])
                # weak references let the test verify below that the arrays
                # are no longer referenced after being copied and deleted
                ref_data = weakref.ref(data_list[idx])
                ref_data2 = weakref.ref(data_list[idx + 1])
                proc_handler.put_objects(session_id, data_keys[idx:idx + 2],
                                         data_list[idx:idx + 2])
                # drop the test's own references to the two arrays
                data_list[idx:idx + 2] = [None, None]

                storage_client.copy_to(session_id, data_keys[idx:idx + 2],
                                       [DataStorageDevice.SHARED_MEMORY, DataStorageDevice.DISK]) \
                    .then(lambda *_: test_actor.set_result(None),
                          lambda *exc: test_actor.set_result(exc, accept=False))
                self.get_result(5)

                proc_handler.delete(session_id, data_keys[idx:idx + 2])

                # expected: first object in shared memory, second on disk
                self.assertEqual(
                    storage_manager_ref.get_data_locations(
                        session_id, data_keys[idx:idx + 2]),
                    [{(0, DataStorageDevice.SHARED_MEMORY)},
                     {(0, DataStorageDevice.DISK)}])
                self.assertIsNone(ref_data())
                self.assertIsNone(ref_data2())

                # test copy with spill
                idx += 2
                proc_handler.put_objects(session_id, [data_keys[idx]],
                                         [data_list[idx]])

                storage_client.copy_to(session_id, [data_keys[idx]], [DataStorageDevice.SHARED_MEMORY]) \
                    .then(lambda *_: test_actor.set_result(None),
                          lambda *exc: test_actor.set_result(exc, accept=False))
                self.get_result(5)

                # the object is now recorded in both process and shared memory
                self.assertEqual(
                    sorted(
                        storage_manager_ref.get_data_locations(
                            session_id, [data_keys[idx]])[0]),
                    [(0, DataStorageDevice.PROC_MEMORY),
                     (0, DataStorageDevice.SHARED_MEMORY)])
Пример #12
0
    def testCpuCalcSingleFetches(self):
        import gc
        with self._start_calc_pool() as (_pool, test_actor):
            quota_ref = test_actor.promise_ref(MemQuotaActor.default_uid())
            calc_ref = test_actor.promise_ref(CpuCalcActor.default_uid())

            session_id = str(uuid.uuid4())
            data_list = [np.random.random((10, 10)) for _ in range(3)]
            exec_graph, fetch_chunks, add_chunk = self._build_test_graph(
                data_list)

            storage_client = test_actor.storage_client

            for fetch_chunk, d in zip(fetch_chunks, data_list):
                self.waitp(
                    storage_client.put_objects(
                        session_id, [fetch_chunk.key], [d],
                        [DataStorageDevice.SHARED_MEMORY]), )
            self.assertEqual(
                list(
                    storage_client.get_data_locations(
                        session_id, [fetch_chunks[0].key])[0]),
                [(0, DataStorageDevice.SHARED_MEMORY)])

            quota_batch = {
                build_quota_key(session_id, add_chunk.key, add_chunk.op.key):
                data_list[0].nbytes,
            }

            for idx in [1, 2]:
                quota_batch[build_quota_key(session_id, fetch_chunks[idx].key, add_chunk.op.key)] \
                    = data_list[idx].nbytes

                self.waitp(
                    storage_client.copy_to(
                        session_id, [fetch_chunks[idx].key],
                        [DataStorageDevice.DISK
                         ]).then(lambda *_: storage_client.delete(
                             session_id, [fetch_chunks[idx].key],
                             [DataStorageDevice.SHARED_MEMORY])))
                self.assertEqual(
                    list(
                        storage_client.get_data_locations(
                            session_id, [fetch_chunks[idx].key])[0]),
                    [(0, DataStorageDevice.DISK)])

            self.waitp(
                quota_ref.request_batch_quota(quota_batch, _promise=True), )

            o_create = PlasmaSharedStore.create

            def _mock_plasma_create(store, session_id, data_key, size):
                if data_key == fetch_chunks[2].key:
                    raise StorageFull
                return o_create(store, session_id, data_key, size)

            id_type_set = set()

            def _extract_value_ref(*_):
                inproc_handler = storage_client.get_storage_handler(
                    (0, DataStorageDevice.PROC_MEMORY))
                obj = inproc_handler.get_objects(session_id,
                                                 [add_chunk.key])[0]
                id_type_set.add((id(obj), type(obj)))
                del obj

            with patch_method(PlasmaSharedStore.create, _mock_plasma_create):
                self.waitp(
                    calc_ref.calc(session_id,
                                  add_chunk.op.key,
                                  serialize_graph(exec_graph), [add_chunk.key],
                                  _promise=True).then(_extract_value_ref).then(
                                      lambda *_: calc_ref.store_results(
                                          session_id,
                                          add_chunk.op.key, [add_chunk.key],
                                          None,
                                          _promise=True)))

            self.assertTrue(
                all((id(obj), type(obj)) not in id_type_set
                    for obj in gc.get_objects()))

            quota_dump = quota_ref.dump_data()
            self.assertEqual(len(quota_dump.allocations), 0)
            self.assertEqual(len(quota_dump.requests), 0)
            self.assertEqual(len(quota_dump.proc_sizes), 0)
            self.assertEqual(len(quota_dump.hold_sizes), 0)

            self.assertEqual(
                sorted(
                    storage_client.get_data_locations(
                        session_id, [fetch_chunks[0].key])[0]),
                [(0, DataStorageDevice.SHARED_MEMORY)])
            self.assertEqual(
                sorted(
                    storage_client.get_data_locations(
                        session_id, [fetch_chunks[1].key])[0]),
                [(0, DataStorageDevice.DISK)])
            self.assertEqual(
                sorted(
                    storage_client.get_data_locations(
                        session_id, [fetch_chunks[2].key])[0]),
                [(0, DataStorageDevice.DISK)])
            self.assertEqual(
                sorted(
                    storage_client.get_data_locations(session_id,
                                                      [add_chunk.key])[0]),
                [(0, DataStorageDevice.SHARED_MEMORY)])
Пример #13
0
    def testReceiver(self):
        """Drive a ReceiverActor through its receive workflow.

        Covers, in order: status checks for data already in storage, writer
        creation with and without promises, checksum mismatches, cancellation
        in the middle of a transfer, transfer into memory, transfer while
        ``PlasmaSharedStore.create`` is patched to fail, receive timeout and
        dead-sender notification.
        """
        pool_addr = 'localhost:%d' % get_next_port()
        options.worker.spill_directory = tempfile.mkdtemp(
            prefix='mars_test_receiver_')
        session_id = str(uuid.uuid4())

        # One small array is serialized once; its buffer and the CRC32 of the
        # full buffer are reused as payload / checksum throughout the test.
        mock_data = np.array([1, 2, 3, 4])
        serialized_arrow_data = dataserializer.serialize(mock_data)
        data_size = serialized_arrow_data.total_bytes
        serialized_mock_data = serialized_arrow_data.to_buffer()
        serialized_crc32 = zlib.crc32(serialized_arrow_data.to_buffer())

        # Distinct random keys so that the sections below do not interfere.
        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())
        chunk_key4 = str(uuid.uuid4())
        chunk_key5 = str(uuid.uuid4())
        chunk_key6 = str(uuid.uuid4())
        chunk_key7 = str(uuid.uuid4())
        chunk_key8 = str(uuid.uuid4())

        with start_transfer_test_pool(
                address=pool_addr,
                plasma_size=self.plasma_storage_size) as pool:
            receiver_ref = pool.create_actor(ReceiverActor,
                                             uid=str(uuid.uuid4()))

            with self.run_actor_test(pool) as test_actor:
                storage_client = test_actor.storage_client

                # check_status on receiving and received
                self.assertEqual(
                    receiver_ref.check_status(session_id, chunk_key1),
                    ReceiveStatus.NOT_STARTED)

                # Write the serialized chunk into DISK storage through the
                # storage client; the receiver is then expected to report it
                # as RECEIVED.
                self.waitp(
                    storage_client.create_writer(
                        session_id, chunk_key1,
                        serialized_arrow_data.total_bytes,
                        [DataStorageDevice.DISK
                         ]).then(lambda writer: promise.finished().then(
                             lambda *_: writer.write(serialized_arrow_data)).
                                 then(lambda *_: writer.close())))
                self.assertEqual(
                    receiver_ref.check_status(session_id, chunk_key1),
                    ReceiveStatus.RECEIVED)
                storage_client.delete(session_id, chunk_key1)

                # Same key again, this time put as an object in SHARED_MEMORY.
                self.waitp(
                    storage_client.put_object(
                        session_id, chunk_key1, mock_data,
                        [DataStorageDevice.SHARED_MEMORY]))

                self.assertEqual(
                    receiver_ref.check_status(session_id, chunk_key1),
                    ReceiveStatus.RECEIVED)

                receiver_ref_p = test_actor.promise_ref(receiver_ref)

                # cancel on an un-run / missing result will result in nothing
                receiver_ref_p.cancel_receive(session_id, chunk_key2)

                # start creating writer
                # chunk_key1 is already stored, so both the promise and the
                # non-promise call are expected to answer RECEIVED.
                receiver_ref_p.create_data_writer(session_id, chunk_key1, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(
                    self.get_result(5),
                    (receiver_ref.address, ReceiveStatus.RECEIVED))

                result = receiver_ref_p.create_data_writer(session_id,
                                                           chunk_key1,
                                                           data_size,
                                                           test_actor,
                                                           use_promise=False)
                self.assertTupleEqual(
                    result, (receiver_ref.address, ReceiveStatus.RECEIVED))

                # First request for a fresh key: the status slot is None.
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(self.get_result(5),
                                      (receiver_ref.address, None))

                # Repeated requests for the same key while the first writer is
                # still open are expected to answer RECEIVING.
                result = receiver_ref_p.create_data_writer(session_id,
                                                           chunk_key2,
                                                           data_size,
                                                           test_actor,
                                                           use_promise=False)
                self.assertTupleEqual(
                    result, (receiver_ref.address, ReceiveStatus.RECEIVING))

                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(
                    self.get_result(5),
                    (receiver_ref.address, ReceiveStatus.RECEIVING))

                # Cancelling is expected to bring the key back to NOT_STARTED.
                receiver_ref_p.cancel_receive(session_id, chunk_key2)
                self.assertEqual(
                    receiver_ref.check_status(session_id, chunk_key2),
                    ReceiveStatus.NOT_STARTED)

                # test checksum error on receive_data_part
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.get_result(5)

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                # The full payload is sent with checksum 0 instead of its CRC32.
                receiver_ref_p.receive_data_part(session_id, chunk_key2,
                                                 serialized_mock_data, 0)

                with self.assertRaises(ChecksumMismatch):
                    self.get_result(5)

                # test checksum error on finish_receive
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(self.get_result(5),
                                      (receiver_ref.address, None))

                # The part carries the right checksum, but finish_receive is
                # given 0 as the overall checksum.
                receiver_ref_p.receive_data_part(session_id, chunk_key2,
                                                 serialized_mock_data,
                                                 serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key2, 0)

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                with self.assertRaises(ChecksumMismatch):
                    self.get_result(5)

                receiver_ref_p.cancel_receive(session_id, chunk_key2)

                # test intermediate cancellation
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(self.get_result(5),
                                      (receiver_ref.address, None))

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                # Send the first 64 bytes, cancel, then send the remainder; the
                # finish callback is expected to fail with ExecutionInterrupted.
                receiver_ref_p.receive_data_part(
                    session_id, chunk_key2, serialized_mock_data[:64],
                    zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.cancel_receive(session_id, chunk_key2)
                receiver_ref_p.receive_data_part(session_id, chunk_key2,
                                                 serialized_mock_data[64:],
                                                 serialized_crc32)
                with self.assertRaises(ExecutionInterrupted):
                    self.get_result(5)

                # test transfer in memory
                # Here the finish callback is registered before the writer
                # exists.
                receiver_ref_p.register_finish_callback(session_id, chunk_key3, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(self.get_result(5),
                                      (receiver_ref.address, None))

                # Two parts: the first with the CRC32 of its own 64 bytes, the
                # second with the CRC32 of the full buffer.
                receiver_ref_p.receive_data_part(
                    session_id, chunk_key3, serialized_mock_data[:64],
                    zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.receive_data_part(session_id, chunk_key3,
                                                 serialized_mock_data[64:],
                                                 serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key3,
                                              serialized_crc32)

                # The finish callback resolves with no arguments on success.
                self.assertTupleEqual((), self.get_result(5))

                receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(
                    self.get_result(5),
                    (receiver_ref.address, ReceiveStatus.RECEIVED))

                # test transfer in spill file
                # Replacement for PlasmaSharedStore.create that always raises
                # StorageFull.
                def mocked_store_create(*_):
                    raise StorageFull

                with patch_method(PlasmaSharedStore.create,
                                  new=mocked_store_create):
                    # With ensure_cached=True the StorageFull error is expected
                    # to reach the caller.
                    with self.assertRaises(StorageFull):
                        receiver_ref_p.create_data_writer(session_id,
                                                          chunk_key4,
                                                          data_size,
                                                          test_actor,
                                                          ensure_cached=True,
                                                          use_promise=False)
                    # test receive aborted
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key4, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s))
                    self.assertTupleEqual(self.get_result(5),
                                          (receiver_ref.address, None))

                    receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                    receiver_ref_p.receive_data_part(
                        session_id, chunk_key4, serialized_mock_data[:64],
                        zlib.crc32(serialized_mock_data[:64]))
                    receiver_ref_p.cancel_receive(session_id, chunk_key4)
                    with self.assertRaises(ExecutionInterrupted):
                        self.get_result(5)

                    # test receive into spill
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key4, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s))
                    self.assertTupleEqual(self.get_result(5),
                                          (receiver_ref.address, None))

                    receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                    receiver_ref_p.receive_data_part(session_id, chunk_key4,
                                                     serialized_mock_data,
                                                     serialized_crc32)
                    receiver_ref_p.finish_receive(session_id, chunk_key4,
                                                  serialized_crc32)

                    self.assertTupleEqual((), self.get_result(5))

                # test intermediate error
                # The helper name is rebound on purpose: this version raises
                # SpillNotConfigured instead of StorageFull.
                def mocked_store_create(*_):
                    raise SpillNotConfigured

                with patch_method(PlasmaSharedStore.create,
                                  new=mocked_store_create):
                    # The second callback passed to then() forwards the failure
                    # to the test actor as a rejected result.
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key5, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s),
                              lambda *s: test_actor.set_result(s, accept=False))

                    with self.assertRaises(SpillNotConfigured):
                        self.get_result(5)

                # test receive timeout
                receiver_ref_p.register_finish_callback(session_id, chunk_key6, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                # Writer created with timeout=2; only the first 64 bytes are
                # sent and finish_receive is never called.
                receiver_ref_p.create_data_writer(session_id, chunk_key6, data_size, test_actor,
                                                  timeout=2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(self.get_result(5),
                                      (receiver_ref.address, None))
                receiver_ref_p.receive_data_part(
                    session_id, chunk_key6, serialized_mock_data[:64],
                    zlib.crc32(serialized_mock_data[:64]))

                with self.assertRaises(TimeoutError):
                    self.get_result(5)

                # test sender halt
                receiver_ref_p.register_finish_callback(session_id, chunk_key7, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                # The sender ref uses the test actor's uid with the placeholder
                # address 'MOCK_ADDR', which is later reported as dead.
                mock_ref = pool.actor_ref(test_actor.uid, address='MOCK_ADDR')
                receiver_ref_p.create_data_writer(
                    session_id, chunk_key7, data_size, mock_ref, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s))
                self.assertTupleEqual(self.get_result(5),
                                      (receiver_ref.address, None))
                receiver_ref_p.receive_data_part(
                    session_id, chunk_key7, serialized_mock_data[:64],
                    zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.notify_dead_senders(['MOCK_ADDR'])

                with self.assertRaises(WorkerDead):
                    self.get_result(5)

                # test checksum error on finish_receive
                # Variant with a writer created via use_promise=False. Nothing
                # is asserted after finish_receive is given checksum 0; the
                # call is only exercised.
                result = receiver_ref_p.create_data_writer(session_id,
                                                           chunk_key8,
                                                           data_size,
                                                           test_actor,
                                                           use_promise=False)
                self.assertTupleEqual(result, (receiver_ref.address, None))

                receiver_ref_p.receive_data_part(session_id, chunk_key8,
                                                 serialized_mock_data,
                                                 serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key8, 0)
Пример #14
0
    def testSender(self):
        """Exercise SenderActor.send_data against mock receivers.

        Covers, in order: sending a key that was never stored, sending data
        held on disk, sending data held in shared memory, sending to two
        receivers at once, sending to an address with no running receiver pool
        and a receiver that raises ChecksumMismatch.
        """
        send_pool_addr = 'localhost:%d' % get_next_port()
        recv_pool_addr = 'localhost:%d' % get_next_port()
        recv_pool_addr2 = 'localhost:%d' % get_next_port()

        options.worker.spill_directory = tempfile.mkdtemp(
            prefix='mars_test_sender_')
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())

        @contextlib.contextmanager
        def start_send_recv_pool():
            """Start the sender pool and the first receiver pool.

            The sender pool gets a SenderActor; the receiver pool gets a
            MockReceiverActor registered under ReceiverActor's default uid.
            Yields ``(sender_pool, receiver_pool)``.
            """
            with start_transfer_test_pool(
                    address=send_pool_addr,
                    plasma_size=self.plasma_storage_size) as sp:
                sp.create_actor(SenderActor, uid=SenderActor.default_uid())
                with start_transfer_test_pool(
                        address=recv_pool_addr,
                        plasma_size=self.plasma_storage_size) as rp:
                    rp.create_actor(MockReceiverActor,
                                    uid=ReceiverActor.default_uid())
                    yield sp, rp

        with start_send_recv_pool() as (send_pool, recv_pool):
            sender_ref = send_pool.actor_ref(SenderActor.default_uid())
            receiver_ref = recv_pool.actor_ref(ReceiverActor.default_uid())

            with self.run_actor_test(send_pool) as test_actor:
                storage_client = test_actor.storage_client

                # send when data missing
                # A freshly generated key that was never stored is used here.
                sender_ref_p = test_actor.promise_ref(sender_ref)
                sender_ref_p.send_data(session_id, str(uuid.uuid4()), recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))
                with self.assertRaises(DependencyMissing):
                    self.get_result(5)

                # send data in spill
                # The serialized array is written into DISK storage first.
                serialized = dataserializer.serialize(mock_data)
                self.waitp(
                    storage_client.create_writer(
                        session_id, chunk_key1, serialized.total_bytes,
                        [DataStorageDevice.DISK
                         ]).then(lambda writer: promise.finished().then(
                             lambda *_: writer.write(serialized)).then(
                                 lambda *_: writer.close())))

                sender_ref_p.send_data(session_id, chunk_key1, recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))
                self.get_result(5)
                assert_array_equal(
                    mock_data,
                    receiver_ref.get_result_data(session_id, chunk_key1))
                # Remove the stored chunk so the next section can put it in
                # shared memory under the same key.
                storage_client.delete(session_id, chunk_key1)

                # send data in plasma store
                self.waitp(
                    storage_client.put_object(
                        session_id, chunk_key1, mock_data,
                        [DataStorageDevice.SHARED_MEMORY]))

                sender_ref_p.send_data(session_id, chunk_key1, recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))
                self.get_result(5)
                assert_array_equal(
                    mock_data,
                    receiver_ref.get_result_data(session_id, chunk_key1))

                # send data to multiple targets
                # A second receiver pool exists only inside this `with` block.
                with start_transfer_test_pool(
                        address=recv_pool_addr2,
                        plasma_size=self.plasma_storage_size) as rp2:
                    recv_ref2 = rp2.create_actor(
                        MockReceiverActor, uid=ReceiverActor.default_uid())

                    self.waitp(
                        sender_ref_p.send_data(
                            session_id,
                            chunk_key1, [recv_pool_addr, recv_pool_addr2],
                            _promise=True))
                    # send data to already transferred / transferring
                    sender_ref_p.send_data(session_id, chunk_key1,
                                           [recv_pool_addr, recv_pool_addr2], _promise=True) \
                        .then(lambda *s: test_actor.set_result(s)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False))
                    self.get_result(5)
                    assert_array_equal(
                        mock_data,
                        recv_ref2.get_result_data(session_id, chunk_key1))

                # send data to non-exist endpoint which causes error
                # recv_pool_addr2 was only served inside the `with` block
                # above, which has already exited at this point.
                self.waitp(
                    storage_client.put_object(
                        session_id, chunk_key2, mock_data,
                        [DataStorageDevice.SHARED_MEMORY]))

                sender_ref_p.send_data(session_id, chunk_key2, recv_pool_addr2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))
                with self.assertRaises(BrokenPipeError):
                    self.get_result(5)

                # Replacement for MockReceiverActor.receive_data_part that
                # always raises ChecksumMismatch; the error is expected to
                # surface through send_data's promise.
                def mocked_receive_data_part(*_):
                    raise ChecksumMismatch

                with patch_method(MockReceiverActor.receive_data_part,
                                  new=mocked_receive_data_part):
                    sender_ref_p.send_data(session_id, chunk_key2, recv_pool_addr, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False))
                    with self.assertRaises(ChecksumMismatch):
                        self.get_result(5)
Пример #15
0
    def testReceiverWorker(self):
        """Exercise ReceiverWorkerActor through the numbered scenarios below.

        The scenarios cover batched writer creation and chunked writes, writer
        creation failures, transfer timeout, cancellation, dead-sender
        notification and the non-promise (``use_promise=False``) call path.
        """
        pool_addr = f'localhost:{get_next_port()}'
        options.worker.spill_directory = tempfile.mkdtemp(
            prefix='mars_test_receiver_')
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        serialized_arrow_data = dataserializer.serialize(mock_data)
        data_size = serialized_arrow_data.total_bytes
        # Byte payload that is sent through receive_data_part, whole or split
        # into halves.
        dumped_mock_data = dataserializer.dumps(mock_data)

        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())
        chunk_key4 = str(uuid.uuid4())
        chunk_key5 = str(uuid.uuid4())
        chunk_key6 = str(uuid.uuid4())
        chunk_key7 = str(uuid.uuid4())
        chunk_key8 = str(uuid.uuid4())
        chunk_key9 = str(uuid.uuid4())

        with start_transfer_test_pool(address=pool_addr, plasma_size=self.plasma_storage_size) as pool, \
                self.run_actor_test(pool) as test_actor:
            storage_client = test_actor.storage_client
            receiver_ref = test_actor.promise_ref(
                pool.create_actor(ReceiverWorkerActor, uid=str(uuid.uuid4())))
            # The manager actor is not created here; it is looked up by its
            # default uid.
            receiver_manager_ref = test_actor.promise_ref(
                ReceiverManagerActor.default_uid())

            # SCENARIO 1: create two writers and write with chunks
            self.waitp(
                receiver_ref.create_data_writers(session_id,
                                                 [chunk_key1, chunk_key2],
                                                 [data_size] * 2,
                                                 test_actor,
                                                 _promise=True))
            # chunk_key1 gets the whole payload, chunk_key2 only the first
            # half. Judging by the status assertions, the boolean list appears
            # to flag which keys are complete after this part (TODO: confirm
            # against ReceiverWorkerActor).
            receiver_ref.receive_data_part(
                session_id, [chunk_key1, chunk_key2], [True, False],
                dumped_mock_data,
                dumped_mock_data[:len(dumped_mock_data) // 2])
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.RECEIVED)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                             ReceiveStatus.RECEIVING)
            # Second half for chunk_key2.
            receiver_ref.receive_data_part(
                session_id, [chunk_key2], [True],
                dumped_mock_data[len(dumped_mock_data) // 2:])
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                             ReceiveStatus.RECEIVED)
            # Both chunks must be readable from SHARED_MEMORY and equal the
            # original array.
            assert_array_equal(
                storage_client.get_object(session_id,
                                          chunk_key1,
                                          [DataStorageDevice.SHARED_MEMORY],
                                          _promise=False), mock_data)
            assert_array_equal(
                storage_client.get_object(session_id,
                                          chunk_key2,
                                          [DataStorageDevice.SHARED_MEMORY],
                                          _promise=False), mock_data)

            # SCENARIO 2: one of the writers failed to create,
            # will test both existing and non-existing keys
            old_create_writer = StorageClient.create_writer

            def _create_writer_with_fail(self, session_id, chunk_key, *args,
                                         **kwargs):
                """Stand-in for StorageClient.create_writer that fails for
                ``fail_key`` and delegates to the original otherwise.

                ``fail_key`` is a variable of the enclosing test; it is
                assigned inside each ``patch_method`` block before the call
                that reaches this function. When the ``_promise`` keyword is
                truthy (the default used here) the failure is returned as a
                rejected promise carrying ValueError; otherwise ValueError is
                raised directly.
                """
                if chunk_key == fail_key:
                    if kwargs.get('_promise', True):
                        return promise.finished(*build_exc_info(ValueError),
                                                **dict(_accept=False))
                    else:
                        raise ValueError
                return old_create_writer(self, session_id, chunk_key, *args,
                                         **kwargs)

            # Non-existing keys: the middle key of three fails, and afterwards
            # all three keys are expected to be NOT_STARTED.
            with patch_method(StorageClient.create_writer, new=_create_writer_with_fail), \
                    self.assertRaises(ValueError):
                fail_key = chunk_key4
                self.waitp(
                    receiver_ref.create_data_writers(
                        session_id, [chunk_key3, chunk_key4, chunk_key5],
                        [data_size] * 3,
                        test_actor,
                        ensure_cached=False,
                        _promise=True))
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key3),
                             ReceiveStatus.NOT_STARTED)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key4),
                             ReceiveStatus.NOT_STARTED)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key5),
                             ReceiveStatus.NOT_STARTED)

            # Existing key: chunk_key2 was fully received in scenario 1. No
            # exception is asserted here, so the call is expected to complete.
            with patch_method(StorageClient.create_writer,
                              new=_create_writer_with_fail):
                fail_key = chunk_key2
                self.waitp(
                    receiver_ref.create_data_writers(session_id,
                                                     [chunk_key2, chunk_key3],
                                                     [data_size] * 2,
                                                     test_actor,
                                                     ensure_cached=False,
                                                     _promise=True))

            # SCENARIO 3: transfer timeout
            # A writer is created with timeout=1 and no data is ever sent; the
            # manager's keys callback is expected to fail with TimeoutError.
            receiver_manager_ref.register_pending_keys(session_id,
                                                       [chunk_key6])
            self.waitp(
                receiver_ref.create_data_writers(session_id, [chunk_key6],
                                                 [data_size],
                                                 test_actor,
                                                 timeout=1,
                                                 _promise=True))
            with self.assertRaises(TimeoutError):
                self.waitp(
                    receiver_manager_ref.add_keys_callback(session_id,
                                                           [chunk_key6],
                                                           _promise=True))

            # SCENARIO 4: cancelled transfer (both before and during transfer)
            receiver_manager_ref.register_pending_keys(session_id,
                                                       [chunk_key7])
            self.waitp(
                receiver_ref.create_data_writers(session_id, [chunk_key7],
                                                 [data_size],
                                                 test_actor,
                                                 timeout=1,
                                                 _promise=True))
            # Cancels chunk_key2 together with chunk_key7, whose writer was
            # just created.
            receiver_ref.cancel_receive(session_id, [chunk_key2, chunk_key7])
            # After cancellation, sending data and waiting on the key are both
            # expected to raise KeyError.
            with self.assertRaises(KeyError):
                receiver_ref.receive_data_part(
                    session_id, [chunk_key7], [False],
                    dumped_mock_data[:len(dumped_mock_data) // 2])
            with self.assertRaises(KeyError):
                self.waitp(
                    receiver_manager_ref.add_keys_callback(session_id,
                                                           [chunk_key7],
                                                           _promise=True))

            # SCENARIO 5: sender halt and receiver is notified (reusing previous unsuccessful key)
            receiver_manager_ref.register_pending_keys(session_id,
                                                       [chunk_key7])
            # The sender ref uses the test actor's uid with the placeholder
            # address 'MOCK_ADDR', which is reported as dead below.
            mock_ref = pool.actor_ref(test_actor.uid, address='MOCK_ADDR')
            self.waitp(
                receiver_ref.create_data_writers(session_id, [chunk_key7],
                                                 [data_size],
                                                 mock_ref,
                                                 timeout=1,
                                                 _promise=True))
            receiver_ref.notify_dead_senders(['MOCK_ADDR'])
            with self.assertRaises(WorkerDead):
                self.waitp(
                    receiver_manager_ref.add_keys_callback(session_id,
                                                           [chunk_key7],
                                                           _promise=True))

            # SCENARIO 6: successful transfer without promise
            receiver_ref.create_data_writers(session_id, [chunk_key8],
                                             [data_size],
                                             mock_ref,
                                             use_promise=False)
            receiver_ref.receive_data_part(session_id, [chunk_key8], [True],
                                           dumped_mock_data)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key8),
                             ReceiveStatus.RECEIVED)
            assert_array_equal(
                storage_client.get_object(session_id,
                                          chunk_key8,
                                          [DataStorageDevice.SHARED_MEMORY],
                                          _promise=False), mock_data)

            # SCENARIO 7: failed transfer without promise
            # With use_promise=False the ValueError is expected to be raised
            # by the call itself, so no waitp() is involved.
            with patch_method(StorageClient.create_writer, new=_create_writer_with_fail), \
                    self.assertRaises(ValueError):
                fail_key = chunk_key9
                receiver_ref.create_data_writers(session_id, [chunk_key9],
                                                 [data_size],
                                                 mock_ref,
                                                 use_promise=False)
Пример #16
0
    def testReceiver(self):
        """Drive a ReceiverActor through its status, writer and transfer paths.

        Scenarios, in order: ``check_status`` for a missing, spilled and stored
        chunk; cancelling a receive; checksum mismatches reported on
        ``receive_data_part`` and on ``finish_receive``; cancellation in the
        middle of a transfer; a successful two-part transfer; transfers while
        ``PlasmaChunkStore.create`` is patched to raise ``StoreFull``; writer
        creation failing with ``SpillNotConfigured``; and a receive timeout.
        """
        pool_addr = 'localhost:%d' % get_next_port()
        # NOTE(review): this overwrites a global option and nothing in this
        # method restores it afterwards.
        options.worker.spill_directory = os.path.join(
            tempfile.gettempdir(), 'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        serialized_mock_data = dataserializer.dumps(mock_data)
        # CRC32 of the whole serialized payload; used below as the "correct" checksum
        serialized_crc32 = zlib.crc32(serialized_mock_data)

        # one random key per scenario group
        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())
        chunk_key4 = str(uuid.uuid4())
        chunk_key5 = str(uuid.uuid4())
        chunk_key6 = str(uuid.uuid4())

        with start_transfer_test_pool(address=pool_addr, plasma_size=self.plasma_storage_size) as pool:
            chunk_holder_ref = pool.actor_ref(ChunkHolderActor.default_name())
            mapper_ref = pool.actor_ref(PlasmaKeyMapActor.default_name())
            receiver_ref = pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))

            store = PlasmaChunkStore(self._plasma_client, mapper_ref)

            # check_status on receiving and received
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.NOT_STARTED)

            # a spill file for the key is expected to be reported as RECEIVED
            write_spill_file(chunk_key1, mock_data)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.RECEIVED)
            os.unlink(build_spill_file_name(chunk_key1))

            # after the spill file is removed, put the data into the store and
            # register it with the chunk holder; the status must be RECEIVED again
            ref = store.put(session_id, chunk_key1, mock_data)
            data_size = store.get_actual_size(session_id, chunk_key1)
            chunk_holder_ref.register_chunk(session_id, chunk_key1)
            del ref
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.RECEIVED)

            with self.run_actor_test(pool) as test_actor:
                receiver_ref_p = test_actor.promise_ref(receiver_ref)

                # cancel on an un-run / missing result will result in nothing
                receiver_ref_p.cancel_receive(session_id, chunk_key2)

                # start creating writer
                # chunk_key1 is already stored, so the promise resolves with RECEIVED
                receiver_ref_p.create_data_writer(session_id, chunk_key1, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVED))

                # first request for a fresh key resolves with status None
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                # a second request for the same key reports RECEIVING
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVING))

                # cancelling puts the key back to NOT_STARTED
                receiver_ref_p.cancel_receive(session_id, chunk_key2)
                self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                                 ReceiveStatus.NOT_STARTED)

                # test checksum error on receive_data_part
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                # the checksum argument 0 does not match the data on purpose
                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data, 0)

                with self.assertRaises(ChecksumMismatch):
                    self.get_result(5)

                # test checksum error on finish_receive
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                # the part carries the matching checksum; finish_receive is given 0 instead
                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data, serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key2, 0)

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                with self.assertRaises(ChecksumMismatch):
                    self.get_result(5)

                receiver_ref_p.cancel_receive(session_id, chunk_key2)

                # test intermediate cancellation
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                # send the first 64 bytes, cancel, then send the remainder anyway;
                # the finish callback is expected to be rejected with ExecutionInterrupted
                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.cancel_receive(session_id, chunk_key2)
                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data[64:],
                                                 serialized_crc32)
                with self.assertRaises(ExecutionInterrupted):
                    self.get_result(5)

                # test transfer in memory
                # here the finish callback is registered before the writer is created
                receiver_ref_p.register_finish_callback(session_id, chunk_key3, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                # two parts: checksum of the first 64 bytes, then checksum of the whole payload
                receiver_ref_p.receive_data_part(session_id, chunk_key3, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.receive_data_part(session_id, chunk_key3, serialized_mock_data[64:], serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key3, serialized_crc32)

                # the finish callback resolves with no arguments
                self.assertTupleEqual((), self.get_result(5))

                # asking for a writer again now reports the chunk as RECEIVED
                receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVED))

                # test transfer in spill file
                # replacement for PlasmaChunkStore.create that always raises StoreFull
                def mocked_store_create(*_):
                    raise StoreFull

                with patch_method(PlasmaChunkStore.create, new=mocked_store_create):
                    # test receive aborted
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key4, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False))
                    self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                    receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                    receiver_ref_p.receive_data_part(session_id, chunk_key4, serialized_mock_data[:64],
                                                     zlib.crc32(serialized_mock_data[:64]))
                    receiver_ref_p.cancel_receive(session_id, chunk_key4)
                    with self.assertRaises(ExecutionInterrupted):
                        self.get_result(5)

                    # test receive into spill
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key4, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False))
                    self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                    receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                    # whole payload in a single part this time
                    receiver_ref_p.receive_data_part(session_id, chunk_key4, serialized_mock_data, serialized_crc32)
                    receiver_ref_p.finish_receive(session_id, chunk_key4, serialized_crc32)

                    self.assertTupleEqual((), self.get_result(5))

                # test intermediate error
                # rebinds the helper name: create now raises SpillNotConfigured instead
                def mocked_store_create(*_):
                    raise SpillNotConfigured

                with patch_method(PlasmaChunkStore.create, new=mocked_store_create):
                    # the second callback passed to then() forwards the rejection to the test actor
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key5, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False),
                              lambda *s: test_actor.set_result(s, accept=False, destroy=False))

                    with self.assertRaises(SpillNotConfigured):
                        self.get_result(5)

                # test receive timeout
                receiver_ref_p.register_finish_callback(session_id, chunk_key6, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                # writer created with timeout=2; only the first part is sent and
                # finish_receive is never called, so TimeoutError is expected
                receiver_ref_p.create_data_writer(session_id, chunk_key6, data_size, test_actor,
                                                  timeout=2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))
                receiver_ref_p.receive_data_part(session_id, chunk_key6, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))

                with self.assertRaises(TimeoutError):
                    self.get_result(5)
Пример #17
0
    def testPrepareQuota(self, *_):
        """Check that graph execution only finishes once pinning data keys stops failing.

        ``SharedHolderActor.pin_data_keys`` is patched to raise
        ``PinDataKeyFailed`` while the local ``pinned`` flag is true. A
        background thread clears the flag after 0.5 seconds, and the test
        asserts the finish callback fires no earlier than that.
        """
        # while True, the patched pin_data_keys below raises PinDataKeyFailed
        pinned = True

        # keep the real implementation so the mock can delegate to it later
        orig_pin = SharedHolderActor.pin_data_keys

        def _mock_pin(self, session_id, chunk_keys, token):
            from mars.errors import PinDataKeyFailed
            if pinned:
                raise PinDataKeyFailed
            return orig_pin(self, session_id, chunk_keys, token)

        pool_address = '127.0.0.1:%d' % get_next_port()
        session_id = str(uuid.uuid4())
        mock_data = np.array([1, 2, 3, 4])
        with patch_method(SharedHolderActor.pin_data_keys, new=_mock_pin), \
                create_actor_pool(n_process=1, backend='gevent', address=pool_address) as pool:
            self.create_standard_actors(pool, pool_address, with_daemon=False, with_status=False)
            pool.create_actor(MockSenderActor, [mock_data], 'in', uid='w:mock_sender')
            pool.create_actor(CpuCalcActor)
            pool.create_actor(InProcHolderActor)
            # NOTE(review): the returned reference is discarded
            pool.actor_ref(WorkerClusterInfoActor.default_uid())

            import mars.tensor as mt
            from mars.tensor.fetch import TensorFetch
            # graph under test: ones((4,)) + array(mock_data)
            arr = mt.ones((4,), chunk_size=4)
            arr_add = mt.array(mock_data)
            result_tensor = arr + arr_add
            graph = result_tensor.build_graph(compose=False, tiled=True)

            arr_add = get_tiled(arr_add)
            result_tensor = get_tiled(result_tensor)

            # swap the op of the data chunk for a TensorFetch that keeps the
            # original op key and outputs
            modified_chunk = arr_add.chunks[0]
            arr_add.chunks[0]._op = TensorFetch(
                dtype=modified_chunk.dtype, _outputs=[weakref.ref(o) for o in modified_chunk.op.outputs],
                _key=modified_chunk.op.key)
            # meta for the fetched chunk: byte size, shape and a tuple of two
            # addresses, the second being this pool's address spelled with
            # 'localhost' (presumably the workers holding the data; verify
            # against WorkerMeta)
            metas = {modified_chunk.key: WorkerMeta(
                mock_data.nbytes, mock_data.shape,
                ('0.0.0.0:1234', pool_address.replace('127.0.0.1', 'localhost')))}
            with self.run_actor_test(pool) as test_actor:
                graph_key = str(uuid.uuid4())
                execution_ref = test_actor.promise_ref(ExecutionActor.default_uid())

                start_time = time.time()

                execution_ref.execute_graph(
                    session_id, graph_key, serialize_graph(graph),
                    dict(chunks=[result_tensor.chunks[0].key]), metas, _tell=True)

                # on success the result is the wall-clock time of completion
                execution_ref.add_finish_callback(session_id, graph_key, _promise=True) \
                    .then(lambda *_: test_actor.set_result(time.time())) \
                    .catch(lambda *exc: test_actor.set_result(exc, False))

                def _delay_fun():
                    # let pinning succeed after half a second
                    nonlocal pinned
                    time.sleep(0.5)
                    pinned = False

                threading.Thread(target=_delay_fun).start()

            # the result is fetched after the run_actor_test block has exited
            finish_time = self.get_result()
            # completion must not happen before the flag was cleared
            self.assertGreaterEqual(finish_time, start_time + 0.5)
Пример #18
0
    def testQuota(self):
        """Exercise QuotaActor requests, holds, alterations, releases and cancellation.

        Covers direct (non-promise) requests and their boolean results,
        promise-based requests that are cancelled or rejected, and errors
        raised from patched ``QuotaActor`` methods being delivered to the
        promise ``catch`` handler.
        """
        def _raiser(*_, **__):
            # stand-in for patched QuotaActor methods: always fails
            raise ValueError

        local_pool_addr = 'localhost:%d' % get_next_port()
        with create_actor_pool(n_process=1, backend='gevent', address=local_pool_addr) as pool:
            pool.create_actor(WorkerClusterInfoActor, [local_pool_addr],
                              uid=WorkerClusterInfoActor.default_uid())
            pool.create_actor(StatusActor, local_pool_addr, uid=StatusActor.default_uid())

            # presumably 300 is the total quota size - TODO confirm against QuotaActor
            quota_ref = pool.create_actor(QuotaActor, 300, uid=QuotaActor.default_uid())

            # operations on an unknown key are expected not to raise
            quota_ref.process_quota('non_exist')
            quota_ref.hold_quota('non_exist')
            quota_ref.release_quota('non_exist')

            # a request of 1000 is expected to raise ValueError
            with self.assertRaises(ValueError):
                quota_ref.request_quota('ERROR', 1000)

            # repeated requests for the same key are all granted
            self.assertTrue(quota_ref.request_quota('0', 100))
            self.assertTrue(quota_ref.request_quota('0', 50))
            self.assertTrue(quota_ref.request_quota('0', 200))

            quota_ref.process_quota('0')
            self.assertIn('0', quota_ref.dump_data().proc_sizes)
            # change the size to 190 and move the allocation to key ('0', 0)
            quota_ref.alter_allocation('0', 190, new_key=('0', 0))
            self.assertEqual(quota_ref.dump_data().allocations[('0', 0)], 190)

            quota_ref.hold_quota(('0', 0))
            self.assertIn(('0', 0), quota_ref.dump_data().hold_sizes)
            # renaming without a new size keeps the allocation at 190
            quota_ref.alter_allocation(('0', 0), new_key=('0', 1))
            self.assertEqual(quota_ref.dump_data().allocations[('0', 1)], 190)

            with self.run_actor_test(pool) as test_actor:
                ref = test_actor.promise_ref(QuotaActor.default_uid())

                # promise-based request for '1'; it is cancelled further below
                ref.request_quota('1', 150, _promise=True) \
                    .then(lambda *_: test_actor.set_result(True)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))
                pool.sleep(0.5)

                # direct requests made while '1' is outstanding all return False
                self.assertFalse(quota_ref.request_quota('2', 50))
                self.assertFalse(quota_ref.request_quota('3', 200))

                self.assertFalse(quota_ref.request_quota('3', 180))

                self.assertNotIn('2', quota_ref.dump_data().allocations)

                # cancelling '1' rejects its promise with the supplied OSError
                ref.cancel_requests(('1',), reject_exc=build_exc_info(OSError))
                with self.assertRaises(OSError):
                    self.get_result(5)

                # with _request_quota patched to raise, both the single and the
                # batch request deliver ValueError to the catch handler
                with patch_method(QuotaActor._request_quota, new=_raiser):
                    ref.request_quota('err_raise', 1, _promise=True) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                    with self.assertRaises(ValueError):
                        self.get_result(5)

                    ref.request_batch_quota({'err_raise': 1}, _promise=True) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                    with self.assertRaises(ValueError):
                        self.get_result(5)

            # after the block above: '1' is no longer requested, '2' now has an
            # allocation and '3' does not
            self.assertNotIn('1', quota_ref.dump_data().requests)
            self.assertIn('2', quota_ref.dump_data().allocations)
            self.assertNotIn('3', quota_ref.dump_data().allocations)

            # releasing ('0', 1) lets '3' obtain its allocation
            quota_ref.release_quotas([('0', 1)])
            self.assertIn('3', quota_ref.dump_data().allocations)

            # '4' is refused at first, and is allocated once '3' is altered to 50
            self.assertFalse(quota_ref.request_quota('4', 180))
            quota_ref.alter_allocations(['3'], [50])
            self.assertIn('4', quota_ref.dump_data().allocations)

            with self.run_actor_test(pool) as test_actor:
                ref = test_actor.promise_ref(QuotaActor.default_uid())
                # only a catch handler is attached: the test expects a rejection
                ref.request_quota('5', 50, _promise=True) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False))

                # with alter_allocation patched to raise, releasing '2' is
                # expected to surface ValueError through the promise for '5'
                with patch_method(QuotaActor.alter_allocation, new=_raiser):
                    quota_ref.release_quota('2')

                    with self.assertRaises(ValueError):
                        self.get_result(5)