Пример #1
0
    def testKVStoreActor(self):
        """Write, batch-write, read and delete keys through a KVStoreActor on etcd."""
        port = get_next_port()
        etcd = EtcdProcessHelper(port_range_start=port)
        # Point the global kv_store option at the etcd instance started below.
        options.kv_store = 'etcd://127.0.0.1:%s' % port
        with etcd.run():
            with create_actor_pool(n_process=1, backend='gevent') as pool:
                actor_uid = KVStoreActor.default_name()
                kv_ref = pool.create_actor(KVStoreActor, uid=actor_uid)

                # Single writes followed by a batch that rewrites v2 and adds v3.
                kv_ref.write('/node/v1', 'value1')
                kv_ref.write('/node/v2', 'value2')
                batch_items = [('/node/v2', 'value2'), ('/node/v3', 'value3')]
                kv_ref.write_batch(batch_items)

                single = kv_ref.read('/node/v1')
                self.assertEqual(single.value, 'value1')
                batch = kv_ref.read_batch(['/node/v2', '/node/v3'])
                self.assertListEqual([item.value for item in batch],
                                     ['value2', 'value3'])

                # First delete succeeds, the second must raise, and the
                # silent variant must not.
                kv_ref.delete('/node', dir=True, recursive=True)
                with self.assertRaises(KeyError):
                    kv_ref.delete('/node', dir=True, recursive=True)
                kv_ref.delete('/node', dir=True, recursive=True, silent=True)
Пример #2
0
class Test(unittest.TestCase):
    """KVStoreActor test that runs against a locally started etcd process."""

    def tearDown(self):
        """Reset the global kv_store option to the in-process backend."""
        super(Test, self).tearDown()
        options.kv_store = ':inproc:'

    @unittest.skipIf(sys.platform == 'win32', 'does not run in windows')
    @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testKVStoreActor(self):
        """Write, batch-write, read and delete keys through the actor."""
        port = get_next_port()
        etcd = EtcdProcessHelper(port_range_start=port)
        options.kv_store = 'etcd://127.0.0.1:%s' % port
        with etcd.run():
            with create_actor_pool(n_process=1, backend='gevent') as pool:
                actor_uid = KVStoreActor.default_name()
                kv_ref = pool.create_actor(KVStoreActor, uid=actor_uid)

                kv_ref.write('/node/v1', 'value1')
                kv_ref.write('/node/v2', 'value2')
                kv_ref.write_batch([('/node/v2', 'value2'), ('/node/v3', 'value3')])

                first = kv_ref.read('/node/v1')
                self.assertEqual(first.value, 'value1')
                fetched = kv_ref.read_batch(['/node/v2', '/node/v3'])
                self.assertListEqual([entry.value for entry in fetched],
                                     ['value2', 'value3'])

                # Deleting twice must fail with KeyError unless silent=True.
                kv_ref.delete('/node', dir=True, recursive=True)
                with self.assertRaises(KeyError):
                    kv_ref.delete('/node', dir=True, recursive=True)
                kv_ref.delete('/node', dir=True, recursive=True, silent=True)
Пример #3
0
    def testEtcdWatch(self):
        """Check single-key, recursive and eternal watches on a local etcd."""
        with EtcdProcessHelper(port_range_start=51342).run():
            store = get('etcd://localhost:51342')
            store.write('/node/subnode/v1', 'value1')
            store.write('/node/v2', 'value2')

            def run_pair(write_fn, watch_fn):
                # Spawn the writer first, then the watcher, wait for both and
                # hand back whatever the watcher returned.
                writer_glet = gevent.spawn(write_fn)
                watcher_glet = gevent.spawn(watch_fn)
                gevent.joinall([writer_glet, watcher_glet])
                return watcher_glet.value

            # Case 1: watch a single key that is rewritten after one second.
            def watch_key():
                return store.watch('/node/v2', timeout=10)

            def rewrite_key():
                gevent.sleep(1)
                store.write('/node/v2', 'value2\'')

            seen = run_pair(rewrite_key, watch_key)
            self.assertEqual(seen.value, 'value2\'')

            store.delete('/node/v2')

            # Case 2: recursive watch on a directory whose child is rewritten.
            def watch_dir():
                return store.watch('/node/subnode', timeout=10, recursive=True)

            def rewrite_child():
                gevent.sleep(1)
                store.write('/node/subnode/v1', 'value1\'')

            seen = run_pair(rewrite_child, watch_dir)
            self.assertEqual(seen.children[0].value, 'value1\'')

            store.write('/node/subnode/v3', '-1')

            # Case 3: eternal watch collecting five consecutive updates.
            def collect_updates():
                collected = []
                for position, event in enumerate(
                        store.eternal_watch('/node/subnode/v3')):
                    collected.append(int(event.value))
                    if position == 4:
                        break
                return collected

            def emit_updates():
                gevent.sleep(0.1)
                for number in range(5):
                    store.write('/node/subnode/v3', str(number))
                    gevent.sleep(0.1)

            seen = run_pair(emit_updates, collect_updates)
            self.assertEqual(seen, list(range(5)))

            store.delete('/node', dir=True, recursive=True)
Пример #4
0
    def testKVStoreActor(self):
        """Write and read keys through a KVStoreActor while etcd is running."""
        etcd = EtcdProcessHelper(port_range_start=54131)
        with etcd.run():
            with create_actor_pool(n_process=1, backend='gevent') as pool:
                actor_uid = KVStoreActor.default_name()
                kv_ref = pool.create_actor(KVStoreActor, uid=actor_uid)

                kv_ref.write('/node/v1', 'value1')
                kv_ref.write('/node/v2', 'value2')
                kv_ref.write_batch([('/node/v2', 'value2'), ('/node/v3', 'value3')])

                first = kv_ref.read('/node/v1')
                self.assertEqual(first.value, 'value1')
                fetched = kv_ref.read_batch(['/node/v2', '/node/v3'])
                self.assertListEqual([entry.value for entry in fetched],
                                     ['value2', 'value3'])
Пример #5
0
class Test(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Set the worker spill directory option and open the configured kv store."""
        from mars import kvstore as kv_module

        temp_root = tempfile.gettempdir()
        options.worker.spill_directory = os.path.join(temp_root, 'mars_test_spill')
        cls._kv_store = kv_module.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        """Remove the spill directory and any file named by DELAY_STATE_FILE."""
        import shutil

        spill_dir = options.worker.spill_directory
        if os.path.exists(spill_dir):
            shutil.rmtree(spill_dir)

        leftover = os.environ.get('DELAY_STATE_FILE')
        if not leftover:
            return
        try:
            os.unlink(leftover)
        except OSError:
            # Best-effort removal: failures to unlink are ignored.
            pass

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = []
        self.etcd_helper = None

    def tearDown(self):
        """Remove state files, stop all spawned processes and reset kv_store."""
        for state_path in self.state_files:
            if os.path.exists(state_path):
                os.unlink(state_path)

        children = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for child in children:
            child.send_signal(signal.SIGINT)

        # Give the processes up to five seconds to exit after SIGINT.
        started_at = time.time()
        while True:
            if all(child.poll() is not None for child in children):
                break
            time.sleep(0.1)
            if time.time() - started_at > 5:
                break

        # Anything still running after the grace period is killed.
        for child in children:
            if child.poll() is None:
                child.kill()

        if self.etcd_helper:
            self.etcd_helper.stop()
        options.kv_store = ':inproc:'

    def add_state_file(self, environ):
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ.lower(), os.getpid(), id(self)))
        self.state_files.append(fn)
        return fn

    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True):
        """Launch scheduler and worker subprocesses and wait until they are ready.

        :param n_schedulers: number of ``mars.scheduler`` processes to spawn
        :param n_workers: number of ``mars.worker`` processes to spawn
        :param etcd: when true, start an etcd helper and pass ``--kv-store`` to
            every process; otherwise pass the endpoints via ``--schedulers``
        :param modules: optional module names passed as ``--load-modules``
        :param log_scheduler: run schedulers at ``debug`` level when true,
            ``warning`` otherwise
        :param log_worker: run workers at ``debug`` level when true,
            ``warning`` otherwise
        :raises: the last readiness error, once polling has lasted more than
            20 seconds
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )
        # The widened NOT_ERROR setting must be undone even when start-up
        # fails, otherwise it leaks into every test that runs afterwards.
        try:
            scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
            self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

            append_args = []
            if modules:
                append_args.extend(['--load-modules', ','.join(modules)])

            if etcd:
                etcd_port = get_next_port()
                self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
                self.etcd_helper.run()
                options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
                append_args.extend(['--kv-store', options.kv_store])
            else:
                append_args.extend(
                    ['--schedulers', ','.join(self.scheduler_endpoints)])

            if 'DUMP_GRAPH_DATA' in os.environ:
                append_args += ['-Dscheduler.dump_graph_data=true']

            scheduler_level = 'debug' if log_scheduler else 'warning'
            worker_level = 'debug' if log_worker else 'warning'

            self.proc_schedulers = [
                subprocess.Popen([
                    sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                    '--level', scheduler_level, '-p', p,
                    '--format', '%(asctime)-15s %(message)s',
                    '-Dscheduler.retry_delay=5'
                ] + append_args) for p in scheduler_ports
            ]
            self.proc_workers = [
                subprocess.Popen([
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--cpu-procs', '1', '--level', worker_level,
                    '--cache-mem', '16m',
                    '--ignore-avail-mem', '-Dworker.prepare_data_timeout=30'
                ] + append_args) for _ in range(n_workers)
            ]

            actor_client = new_client()
            self.cluster_info = actor_client.actor_ref(
                ClusterInfoActor.default_name(),
                address=self.scheduler_endpoints[0])

            # Poll every 0.1s until all schedulers and workers have registered.
            check_time = time.time()
            while True:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                    if len(started_schedulers) < n_schedulers:
                        raise RuntimeError(
                            'Schedulers does not met requirement: %d < %d.' %
                            (len(started_schedulers), n_schedulers))
                    actor_address = self.cluster_info.get_scheduler(
                        SessionManagerActor.default_name())
                    self.session_manager_ref = actor_client.actor_ref(
                        SessionManagerActor.default_name(), address=actor_address)

                    actor_address = self.cluster_info.get_scheduler(
                        ResourceActor.default_name())
                    resource_ref = actor_client.actor_ref(
                        ResourceActor.default_name(), address=actor_address)

                    # Read the count once so the message reports the value
                    # that was actually compared.
                    worker_count = resource_ref.get_worker_count()
                    if worker_count < n_workers:
                        raise RuntimeError(
                            'Workers does not met requirement: %d < %d.' %
                            (worker_count, n_workers))
                    break
                except:  # noqa: E722
                    # NOTE(review): the bare except is kept so every failure is
                    # retried as before; narrowing it could change which errors
                    # are retried - confirm before tightening.
                    if time.time() - check_time > 20:
                        raise
                    time.sleep(0.1)
        finally:
            gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                raise SystemError('Scheduler not started. exit code %s' %
                                  self.proc_scheduler.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None:
                raise SystemError('Worker not started. exit code %s' %
                                  worker_proc.poll())

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    pass
            if session_ref.graph_state(
                    graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)

    def testMainWithoutEtcd(self):
        """Run four tensor graphs on a cluster started without etcd and check results."""
        self.start_processes()

        session_id = uuid.uuid1()
        client = new_client()
        session_address = self.session_manager_ref.create_session(session_id)
        session = client.actor_ref(session_address)

        def execute(tensor):
            # Submit the tensor's graph, require SUCCEEDED and return the
            # serialized result fetched from the session.
            tensor_graph = tensor.build_graph()
            target_keys = [tensor.key]
            key = uuid.uuid1()
            session.submit_tensor_graph(json.dumps(tensor_graph.to_json()),
                                        key,
                                        target_tensors=target_keys)
            final_state = self.wait_for_termination(client, session, key)
            self.assertEqual(final_state, GraphState.SUCCEEDED)
            return session.fetch_result(key, tensor.key)

        # Element-wise arithmetic followed by a full reduction.
        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        total = (a * b * 2 + 1).sum()
        raw = execute(total)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(raw), expected.sum())

        # Matrix product of two chunked tensors.
        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        product = a.dot(b)
        raw = execute(product)
        assert_allclose(loads(raw), np.ones((100, 200)) * 450)

        # Sum of ten identical slices of a random array.
        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        raw = execute(sumv)
        expected = reduce(operator.add,
                          [base_arr[:10, :10] for _ in range(10)])
        assert_allclose(loads(raw), expected)

        # Reshape with the '_reshape_with_shuffle' op parameter switched on.
        a = mt.ones((31, 27), chunk_size=10)
        reshaped = a.reshape(27, 31)
        reshaped.op.params['_reshape_with_shuffle'] = True
        raw = execute(reshaped)
        assert_allclose(loads(raw), np.ones((27, 31)))

    def testMainWithEtcd(self):
        """Run one tensor graph on a cluster started with etcd and check the result."""
        self.start_processes(etcd=True)

        session_id = uuid.uuid1()
        client = new_client()
        session_address = self.session_manager_ref.create_session(session_id)
        session = client.actor_ref(session_address)

        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        tensor_graph = total.build_graph()
        target_keys = [total.key]
        key = uuid.uuid1()
        serialized_graph = json.dumps(tensor_graph.to_json())
        session.submit_tensor_graph(serialized_graph, key,
                                    target_tensors=target_keys)

        final_state = self.wait_for_termination(client, session, key)
        self.assertEqual(final_state, GraphState.SUCCEEDED)

        raw = session.fetch_result(key, total.key)
        expected = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(raw), expected.sum())

    def testWorkerFailOver(self):
        """Kill one worker while a graph is running and expect the graph to succeed."""
        def kill_process_tree(proc):
            """Kill ``proc`` and its descendants, then remove the plasma socket directory."""
            # Imported locally so the helper does not depend on module-level
            # imports; the only other shutil import visible nearby is local
            # to tearDownClass.
            import psutil
            import shutil
            proc = psutil.Process(proc.pid)
            plasma_sock_dir = None
            for p in proc.children(recursive=True):
                try:
                    if 'plasma' in p.name():
                        socks = [
                            conn.laddr for conn in p.connections('unix')
                            if 'plasma' in conn.laddr
                        ]
                        if socks:
                            plasma_sock_dir = os.path.dirname(socks[0])
                    p.kill()
                except psutil.NoSuchProcess:
                    # The child exited between being listed and being
                    # inspected or killed; nothing is left to do for it.
                    continue
            proc.kill()
            if plasma_sock_dir:
                shutil.rmtree(plasma_sock_dir, ignore_errors=True)

        # The delay file exists from the start and is removed after the kill.
        delay_file = self.add_state_file('DELAY_STATE_FILE')
        open(delay_file, 'w').close()

        terminate_file = self.add_state_file('TERMINATE_STATE_FILE')

        self.start_processes(modules=['mars.scheduler.tests.op_delayer'],
                             log_worker=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        np_a = np.random.random((100, 100))
        np_b = np.random.random((100, 100))

        a = mt.array(np_a, chunk_size=30) * 2 + 1
        b = mt.array(np_b, chunk_size=30) * 2 + 1
        c = a.dot(b) * 2 + 1
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        # Wait until the terminate state file appears before killing a worker.
        # NOTE(review): this loop has no timeout of its own - confirm that is intended.
        while not os.path.exists(terminate_file):
            actor_client.sleep(0.05)

        kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        # Drop the killed worker so later process checks ignore it.
        self.proc_workers = self.proc_workers[1:]
        os.unlink(delay_file)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
        assert_allclose(loads(result), expected)
Пример #6
0
    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True):
        """Launch scheduler and worker subprocesses and wait until they are ready.

        :param n_schedulers: number of ``mars.scheduler`` processes to spawn
        :param n_workers: number of ``mars.worker`` processes to spawn
        :param etcd: when true, start an etcd helper and pass ``--kv-store`` to
            every process; otherwise pass the endpoints via ``--schedulers``
        :param modules: optional module names passed as ``--load-modules``
        :param log_scheduler: run schedulers at ``debug`` level when true,
            ``warning`` otherwise
        :param log_worker: run workers at ``debug`` level when true,
            ``warning`` otherwise
        :raises: the last readiness error, once polling has lasted more than
            20 seconds
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )
        # Undo the widened NOT_ERROR setting even when start-up fails, so a
        # failing call does not change hub behaviour for later callers.
        try:
            scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
            self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

            append_args = []
            if modules:
                append_args.extend(['--load-modules', ','.join(modules)])

            if etcd:
                etcd_port = get_next_port()
                self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
                self.etcd_helper.run()
                options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
                append_args.extend(['--kv-store', options.kv_store])
            else:
                append_args.extend(
                    ['--schedulers', ','.join(self.scheduler_endpoints)])

            if 'DUMP_GRAPH_DATA' in os.environ:
                append_args += ['-Dscheduler.dump_graph_data=true']

            scheduler_level = 'debug' if log_scheduler else 'warning'
            worker_level = 'debug' if log_worker else 'warning'

            self.proc_schedulers = [
                subprocess.Popen([
                    sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                    '--level', scheduler_level, '-p', p,
                    '--format', '%(asctime)-15s %(message)s',
                    '-Dscheduler.retry_delay=5'
                ] + append_args) for p in scheduler_ports
            ]
            self.proc_workers = [
                subprocess.Popen([
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--cpu-procs', '1', '--level', worker_level,
                    '--cache-mem', '16m',
                    '--ignore-avail-mem', '-Dworker.prepare_data_timeout=30'
                ] + append_args) for _ in range(n_workers)
            ]

            actor_client = new_client()
            self.cluster_info = actor_client.actor_ref(
                ClusterInfoActor.default_name(),
                address=self.scheduler_endpoints[0])

            # Poll every 0.1s until all schedulers and workers have registered.
            check_time = time.time()
            while True:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                    if len(started_schedulers) < n_schedulers:
                        raise RuntimeError(
                            'Schedulers does not met requirement: %d < %d.' %
                            (len(started_schedulers), n_schedulers))
                    actor_address = self.cluster_info.get_scheduler(
                        SessionManagerActor.default_name())
                    self.session_manager_ref = actor_client.actor_ref(
                        SessionManagerActor.default_name(), address=actor_address)

                    actor_address = self.cluster_info.get_scheduler(
                        ResourceActor.default_name())
                    resource_ref = actor_client.actor_ref(
                        ResourceActor.default_name(), address=actor_address)

                    # Read the count once so the message reports the value
                    # that was actually compared.
                    worker_count = resource_ref.get_worker_count()
                    if worker_count < n_workers:
                        raise RuntimeError(
                            'Workers does not met requirement: %d < %d.' %
                            (worker_count, n_workers))
                    break
                except:  # noqa: E722
                    # NOTE(review): the bare except is kept so every failure is
                    # retried as before; narrowing it could change which errors
                    # are retried - confirm before tightening.
                    if time.time() - check_time > 20:
                        raise
                    time.sleep(0.1)
        finally:
            gevent.hub.Hub.NOT_ERROR = old_not_errors
Пример #7
0
class SchedulerIntegratedTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Configure the worker spill directory option and open the kv store."""
        from mars import kvstore as kv_module

        spill_path = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        options.worker.spill_directory = spill_path
        cls._kv_store = kv_module.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        """Delete the spill directory if it exists."""
        import shutil

        spill_dir = options.worker.spill_directory
        if not os.path.exists(spill_dir):
            return
        shutil.rmtree(spill_dir)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = dict()
        self.etcd_helper = None
        self.intentional_death_pids = set()

    def tearDown(self):
        """Drop state files and their env vars, stop all processes, reset kv_store."""
        for env_name, state_path in self.state_files.items():
            os.environ.pop(env_name)
            if os.path.exists(state_path):
                os.unlink(state_path)

        children = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for child in children:
            child.send_signal(signal.SIGINT)

        # Allow up to five seconds for a graceful exit after SIGINT.
        started_at = time.time()
        while True:
            if all(child.poll() is not None for child in children):
                break
            time.sleep(0.1)
            if time.time() - started_at > 5:
                break

        # Survivors are killed together with their descendants.
        for child in children:
            if child.poll() is None:
                self.kill_process_tree(child)

        if self.etcd_helper:
            self.etcd_helper.stop()
        options.kv_store = ':inproc:'

    def kill_process_tree(self, proc, intentional=True):
        """Kill ``proc`` with all its descendants and remove the plasma socket directory.

        :param proc: process object exposing ``pid`` (e.g. a ``subprocess.Popen``)
        :param intentional: when true, record the pid in
            ``self.intentional_death_pids`` so ``check_process_statuses`` does
            not treat the exit as a failure
        """
        if intentional:
            self.intentional_death_pids.add(proc.pid)

        # Imported locally so the method does not rely on module-level imports;
        # the only other shutil import visible nearby is local to tearDownClass.
        import psutil
        import shutil

        try:
            proc = psutil.Process(proc.pid)
            children = proc.children(recursive=True)
        except psutil.NoSuchProcess:
            # The process is already gone, so there is nothing left to kill.
            return

        plasma_sock_dir = None
        for p in children:
            try:
                if 'plasma' in p.name():
                    socks = [
                        conn.laddr for conn in p.connections('unix')
                        if 'plasma' in conn.laddr
                    ]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            except psutil.NoSuchProcess:
                continue
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            # The root may exit on its own once its children are killed.
            pass
        if plasma_sock_dir:
            shutil.rmtree(plasma_sock_dir, ignore_errors=True)

    def add_state_file(self, environ):
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ.lower(), os.getpid(), id(self)))
        self.state_files[environ] = fn
        return fn

    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        cuda=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True,
                        env=None):
        """Launch scheduler and worker subprocesses and wait until they are ready.

        :param n_schedulers: number of ``mars.scheduler`` processes to spawn
        :param n_workers: number of ``mars.worker`` processes to spawn
        :param etcd: when true, start an etcd helper and pass ``--kv-store`` to
            every process; otherwise pass the endpoints via ``--schedulers``
        :param cuda: when false, workers are started with ``--no-cuda``
        :param modules: optional module names passed as ``--load-modules``
        :param log_scheduler: run schedulers at ``debug`` level when true,
            ``warning`` otherwise
        :param log_worker: run workers at ``debug`` level when true,
            ``warning`` otherwise
        :param env: optional mapping merged over a copy of ``os.environ`` for
            the spawned processes
        :raises: the last readiness error, once polling has lasted more than
            20 seconds
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )
        # The widened NOT_ERROR setting must be undone even when start-up
        # fails, otherwise it leaks into every test that runs afterwards.
        try:
            scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
            self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

            append_args = []
            append_args_scheduler = []
            append_args_worker = []
            if modules:
                append_args.extend(['--load-modules', ','.join(modules)])

            if etcd:
                etcd_port = get_next_port()
                self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
                self.etcd_helper.run()
                options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
                append_args.extend(['--kv-store', options.kv_store])
            else:
                append_args.extend(
                    ['--schedulers', ','.join(self.scheduler_endpoints)])

            if 'DUMP_GRAPH_DATA' in os.environ:
                append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
            if not cuda:
                append_args_worker += ['--no-cuda']

            proc_env = os.environ.copy()
            if env:
                proc_env.update(env)

            scheduler_level = 'debug' if log_scheduler else 'warning'
            worker_level = 'debug' if log_worker else 'warning'

            self.proc_schedulers = [
                subprocess.Popen([
                    sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                    '-p', p, '--log-level', scheduler_level, '--log-format',
                    'SCH%d %%(asctime)-15s %%(message)s' % idx,
                    '-Dscheduler.retry_delay=5', '-Dscheduler.default_cpu_usage=0',
                    '-Dscheduler.status_timeout=10'
                ] + append_args + append_args_scheduler,
                                 env=proc_env)
                for idx, p in enumerate(scheduler_ports)
            ]
            cuda_count = resource.cuda_count()
            self.proc_workers = [
                subprocess.Popen([
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--cpu-procs', '1', '--log-level', worker_level,
                    '--log-format',
                    'WOR%d %%(asctime)-15s %%(message)s' % idx, '--cache-mem',
                    '16m', '--ignore-avail-mem', '--cuda-device',
                    # Spread workers over the devices round-robin; '0' when
                    # no device is reported.
                    str(idx % cuda_count) if cuda_count else '0',
                    '-Dworker.prepare_data_timeout=30'
                ] + append_args + append_args_worker,
                                 env=proc_env) for idx in range(n_workers)
            ]

            actor_client = new_client()
            self.cluster_info = actor_client.actor_ref(
                SchedulerClusterInfoActor.default_uid(),
                address=self.scheduler_endpoints[0])

            # Poll every 0.1s until all schedulers and workers have registered.
            check_time = time.time()
            while True:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                    if len(started_schedulers) < n_schedulers:
                        raise ProcessRequirementUnmetError(
                            'Schedulers does not met requirement: %d < %d.' %
                            (len(started_schedulers), n_schedulers))
                    actor_address = self.cluster_info.get_scheduler(
                        SessionManagerActor.default_uid())
                    self.session_manager_ref = actor_client.actor_ref(
                        SessionManagerActor.default_uid(), address=actor_address)

                    actor_address = self.cluster_info.get_scheduler(
                        ResourceActor.default_uid())
                    resource_ref = actor_client.actor_ref(
                        ResourceActor.default_uid(), address=actor_address)

                    # Read the count once so the message reports the value
                    # that was actually compared.
                    worker_count = resource_ref.get_worker_count()
                    if worker_count < n_workers:
                        raise ProcessRequirementUnmetError(
                            'Workers does not met requirement: %d < %d.' %
                            (worker_count, n_workers))
                    break
                except:  # noqa: E722
                    # NOTE(review): the bare except is kept so every failure is
                    # retried as before; narrowing it could change which errors
                    # are retried - confirm before tightening.
                    if time.time() - check_time > 20:
                        raise
                    time.sleep(0.1)
        finally:
            gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                raise ProcessRequirementUnmetError(
                    'Scheduler not started. exit code %s' %
                    self.proc_scheduler.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll(
            ) is not None and worker_proc.pid not in self.intentional_death_pids:
                raise ProcessRequirementUnmetError(
                    'Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    pass
            if session_ref.graph_state(
                    graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)
Пример #8
0
    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        cuda=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True,
                        env=None):
        """Launch scheduler and worker subprocesses and wait until they are ready.

        :param n_schedulers: number of ``mars.scheduler`` processes to spawn
        :param n_workers: number of ``mars.worker`` processes to spawn
        :param etcd: when true, start an etcd helper and pass ``--kv-store`` to
            every process; otherwise pass the endpoints via ``--schedulers``
        :param cuda: when false, workers are started with ``--no-cuda``
        :param modules: optional module names passed as ``--load-modules``
        :param log_scheduler: run schedulers at ``debug`` level when true,
            ``warning`` otherwise
        :param log_worker: run workers at ``debug`` level when true,
            ``warning`` otherwise
        :param env: optional mapping merged over a copy of ``os.environ`` for
            the spawned processes
        :raises: the last readiness error, once polling has lasted more than
            20 seconds
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )
        # Undo the widened NOT_ERROR setting even when start-up fails, so a
        # failing call does not change hub behaviour for later callers.
        try:
            scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
            self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

            append_args = []
            append_args_scheduler = []
            append_args_worker = []
            if modules:
                append_args.extend(['--load-modules', ','.join(modules)])

            if etcd:
                etcd_port = get_next_port()
                self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
                self.etcd_helper.run()
                options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
                append_args.extend(['--kv-store', options.kv_store])
            else:
                append_args.extend(
                    ['--schedulers', ','.join(self.scheduler_endpoints)])

            if 'DUMP_GRAPH_DATA' in os.environ:
                append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
            if not cuda:
                append_args_worker += ['--no-cuda']

            proc_env = os.environ.copy()
            if env:
                proc_env.update(env)

            scheduler_level = 'debug' if log_scheduler else 'warning'
            worker_level = 'debug' if log_worker else 'warning'

            self.proc_schedulers = [
                subprocess.Popen([
                    sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                    '-p', p, '--log-level', scheduler_level, '--log-format',
                    'SCH%d %%(asctime)-15s %%(message)s' % idx,
                    '-Dscheduler.retry_delay=5', '-Dscheduler.default_cpu_usage=0',
                    '-Dscheduler.status_timeout=10'
                ] + append_args + append_args_scheduler,
                                 env=proc_env)
                for idx, p in enumerate(scheduler_ports)
            ]
            cuda_count = resource.cuda_count()
            # Device ids come from CUDA_VISIBLE_DEVICES when it is set,
            # otherwise from the reported device count.
            visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
            if visible_devices:
                cuda_devices = [int(d) for d in visible_devices.split(',')]
            else:
                cuda_devices = list(range(cuda_count))
            self.proc_workers = [
                subprocess.Popen([
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--cpu-procs', '1', '--log-level', worker_level,
                    '--log-format',
                    'WOR%d %%(asctime)-15s %%(message)s' % idx, '--cache-mem',
                    '16m', '--ignore-avail-mem', '--cuda-device',
                    # Wrap around the device list itself, so a list shorter
                    # than cuda_count cannot be indexed out of range.
                    str(cuda_devices[idx % len(cuda_devices)])
                    if cuda_count else '0',
                    '-Dworker.prepare_data_timeout=30'
                ] + append_args + append_args_worker,
                                 env=proc_env) for idx in range(n_workers)
            ]

            actor_client = new_client()
            self.cluster_info = actor_client.actor_ref(
                SchedulerClusterInfoActor.default_uid(),
                address=self.scheduler_endpoints[0])

            # Poll every 0.1s until all schedulers and workers have registered.
            check_time = time.time()
            while True:
                try:
                    try:
                        started_schedulers = self.cluster_info.get_schedulers()
                    except Exception as e:
                        raise ProcessRequirementUnmetError(
                            'Failed to get scheduler numbers, %s' % e)
                    if len(started_schedulers) < n_schedulers:
                        raise ProcessRequirementUnmetError(
                            'Schedulers does not met requirement: %d < %d.' %
                            (len(started_schedulers), n_schedulers))
                    actor_address = self.cluster_info.get_scheduler(
                        SessionManagerActor.default_uid())
                    self.session_manager_ref = actor_client.actor_ref(
                        SessionManagerActor.default_uid(), address=actor_address)

                    actor_address = self.cluster_info.get_scheduler(
                        ResourceActor.default_uid())
                    resource_ref = actor_client.actor_ref(
                        ResourceActor.default_uid(), address=actor_address)

                    # Read the count once so the message reports the value
                    # that was actually compared.
                    worker_count = resource_ref.get_worker_count()
                    if worker_count < n_workers:
                        raise ProcessRequirementUnmetError(
                            'Workers does not met requirement: %d < %d.' %
                            (worker_count, n_workers))
                    break
                except:  # noqa: E722
                    # NOTE(review): the bare except is kept so every failure is
                    # retried as before; narrowing it could change which errors
                    # are retried - confirm before tightening.
                    if time.time() - check_time > 20:
                        raise
                    time.sleep(0.1)
        finally:
            gevent.hub.Hub.NOT_ERROR = old_not_errors
Пример #9
0
class Test(SchedulerIntegratedTest):
    def testMainTensorWithoutEtcd(self):
        """Run several tensor graphs on a cluster started with default options.

        Every case submits a tileable graph through the session actor, waits
        until it terminates with ``GraphState.SUCCEEDED`` and compares the
        fetched result with the equivalent NumPy computation.
        """
        self.start_processes()

        session_id = uuid.uuid1()
        client = new_client()
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        def run_to_success(tileable):
            # Submit the graph of ``tileable`` under a fresh graph key, require
            # a successful termination and return the deserialized result.
            tileable_graph = tileable.build_graph()
            key = uuid.uuid1()
            session_ref.submit_tileable_graph(
                json.dumps(tileable_graph.to_json()), key,
                target_tileables=[tileable.key])
            final_state = self.wait_for_termination(client, session_ref, key)
            self.assertEqual(final_state, GraphState.SUCCEEDED)
            return loads(session_ref.fetch_result(key, tileable.key))

        # element-wise arithmetic followed by a full reduction
        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()
        outcome = run_to_success(total)
        reference = (np.ones(lhs.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(outcome, reference.sum())

        # matrix product: every entry is 50 * 3 * 3 == 450
        lhs = mt.ones((100, 50), chunk_size=35) * 2 + 1
        rhs = mt.ones((50, 200), chunk_size=35) * 2 + 1
        product = lhs.dot(rhs)
        assert_allclose(run_to_success(product), np.ones((100, 200)) * 450)

        # the same slice added to itself ten times
        base = np.random.random((100, 100))
        tensor = mt.array(base)
        summed = reduce(operator.add, [tensor[:10, :10] for _ in range(10)])
        outcome = run_to_success(summed)
        reference = reduce(operator.add, [base[:10, :10] for _ in range(10)])
        assert_allclose(outcome, reference)

        # reshape flagged with `_reshape_with_shuffle`, then a row-wise sum
        source = mt.ones((31, 27), chunk_size=10)
        reshaped = source.reshape(27, 31)
        reshaped.op.extra_params['_reshape_with_shuffle'] = True
        row_sums = reshaped.sum(axis=1)
        assert_allclose(run_to_success(row_sums),
                        np.ones((27, 31)).sum(axis=1))

        # fancy indexing where one of the indexes is itself a tensor
        raw = np.random.RandomState(0).rand(10, 10)
        tensor = mt.tensor(raw, chunk_size=(5, 4))
        picked = tensor[tensor.argmin(axis=1), mt.tensor(np.arange(10))]
        np.testing.assert_array_equal(
            run_to_success(picked), raw[raw.argmin(axis=1), np.arange(10)])

    @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testMainTensorWithEtcd(self):
        """Run one arithmetic-and-sum tensor graph on a cluster started with ``etcd=True``."""
        self.start_processes(etcd=True)

        session_id = uuid.uuid1()
        client = new_client()
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        # submit the graph under a fresh key and wait for it to terminate
        tileable_graph = total.build_graph()
        target_keys = [total.key]
        key = uuid.uuid1()
        session_ref.submit_tileable_graph(
            json.dumps(tileable_graph.to_json()), key,
            target_tileables=target_keys)
        final_state = self.wait_for_termination(client, session_ref, key)
        self.assertEqual(final_state, GraphState.SUCCEEDED)

        # compare with the same expression evaluated by NumPy
        fetched = session_ref.fetch_result(key, total.key)
        reference = (np.ones(lhs.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(loads(fetched), reference.sum())

    @require_cupy
    @require_cudf
    def testMainTensorWithCuda(self):
        """Run one arithmetic-and-sum graph built from ``gpu=True`` tensors on a ``cuda=True`` cluster."""
        self.start_processes(cuda=True)

        session_id = uuid.uuid1()
        client = new_client()
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        lhs = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        # submit the graph under a fresh key and wait for it to terminate
        tileable_graph = total.build_graph()
        target_keys = [total.key]
        key = uuid.uuid1()
        session_ref.submit_tileable_graph(
            json.dumps(tileable_graph.to_json()), key,
            target_tileables=target_keys)
        final_state = self.wait_for_termination(client, session_ref, key)
        self.assertEqual(final_state, GraphState.SUCCEEDED)

        # the reference value is computed on the host with NumPy
        fetched = session_ref.fetch_result(key, total.key)
        reference = (np.ones(lhs.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(loads(fetched), reference.sum())

    def testMainDataFrameWithoutEtcd(self):
        """Add DataFrames with differing chunking and labels, then round-trip a Series.

        The cluster is started without etcd and with the scheduler option
        ``-Dscheduler.aggressive_assign=true``. Each result is compared with
        the same operation performed directly by pandas.
        """
        import pandas as pd
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.datasource.series import from_pandas as from_pandas_series
        from mars.dataframe.arithmetic import add

        self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])

        session_id = uuid.uuid1()
        client = new_client()
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        def run_to_success(tileable):
            # Submit the graph of ``tileable`` under a fresh graph key, require
            # a successful termination and return the deserialized result.
            tileable_graph = tileable.build_graph()
            key = uuid.uuid1()
            session_ref.submit_tileable_graph(
                json.dumps(tileable_graph.to_json()), key,
                target_tileables=[tileable.key])
            final_state = self.wait_for_termination(client, session_ref, key)
            self.assertEqual(final_state, GraphState.SUCCEEDED)
            return loads(session_ref.fetch_result(key, tileable.key))

        # default labels, different chunk sizes on both operands
        left = pd.DataFrame(np.random.rand(10, 10))
        left_df = from_pandas_df(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10))
        right_df = from_pandas_df(right, chunk_size=6)
        outcome = run_to_success(add(left_df, right_df))
        pd.testing.assert_frame_equal(left + right, outcome)

        # shuffled column labels, reversed/offset index on the right operand
        left = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left_df = from_pandas_df(left, chunk_size=(10, 5))
        right = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right_df = from_pandas_df(right, chunk_size=(10, 6))
        outcome = run_to_success(add(left_df, right_df))
        pd.testing.assert_frame_equal(left + right, outcome)

        # unordered index labels on both operands
        left = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left_df = from_pandas_df(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right_df = from_pandas_df(right, chunk_size=6)
        outcome = run_to_success(add(left_df, right_df))
        pd.testing.assert_frame_equal(left + right, outcome)

        # a Series created from pandas must come back unchanged
        raw_series = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        outcome = run_to_success(from_pandas_series(raw_series))
        pd.testing.assert_series_equal(raw_series, outcome)

    def testIterativeTilingWithoutEtcd(self):
        """Execute graphs containing in-place sorts and a histogram without etcd.

        Besides comparing results with NumPy, the test checks that fetching
        the sorted intermediate tensor of the first graph (which was not a
        submitted target) with ``check=False`` raises ``KeyError``.
        """
        self.start_processes(etcd=False)

        session_id = uuid.uuid1()
        client = new_client()
        rng = np.random.RandomState(0)

        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        def submit_and_wait(tileable_graph, target_keys):
            # Submit ``tileable_graph`` under a fresh graph key, require a
            # successful termination and return that key for later fetches.
            key = uuid.uuid1()
            session_ref.submit_tileable_graph(
                json.dumps(tileable_graph.to_json()), key,
                target_tileables=target_keys)
            final_state = self.wait_for_termination(client, session_ref, key)
            self.assertEqual(final_state, GraphState.SUCCEEDED)
            return key

        def fetch(key, tileable):
            return loads(session_ref.fetch_result(key, tileable.key))

        # sort, then take the five smallest values
        raw = rng.rand(100)
        sorted_t = mt.tensor(raw, chunk_size=10)
        sorted_t.sort()
        head = sorted_t[:5]

        graph_key = submit_and_wait(head.build_graph(), [head.key])
        assert_allclose(fetch(graph_key, head), np.sort(raw)[:5])

        with self.assertRaises(KeyError):
            session_ref.fetch_result(graph_key, sorted_t.key, check=False)

        # two sorts in one graph, joined by a concatenation
        raw1 = rng.rand(20)
        raw2 = rng.rand(20)
        first = mt.tensor(raw1, chunk_size=10)
        first.sort()
        second = mt.tensor(raw2, chunk_size=15) + 1
        joined = mt.concatenate([first[:10], second])
        joined.sort()
        head = joined[:5]

        graph_key = submit_and_wait(head.build_graph(), [head.key])
        reference = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
        assert_allclose(fetch(graph_key, head), reference)

        # histogram over sorted integers; both outputs are targets
        raw = rng.randint(100, size=(100,))
        sorted_t = mt.tensor(raw, chunk_size=53)
        sorted_t.sort()
        hist = mt.histogram(sorted_t, bins='scott')

        graph_key = submit_and_wait(build_tileable_graph(hist, set()),
                                    [hist[0].key, hist[1].key])

        fetched = (session_ref.fetch_result(graph_key, hist[0].key),
                   session_ref.fetch_result(graph_key, hist[1].key))
        reference = np.histogram(np.sort(raw), bins='scott')
        assert_allclose(loads(fetched[0]), reference[0])
        assert_allclose(loads(fetched[1]), reference[1])

    def testDistributedContext(self):
        """Read metadata and data of an executed, named tensor via ``DistributedContext``.

        A tensor is submitted with the name ``'test'``; the context must then
        resolve the name to the tensor key, report the expected ``nsplits``
        and return full as well as indexed data equal to the NumPy source.
        """
        self.start_processes(etcd=False)

        session_id = uuid.uuid1()
        client = new_client()
        rng = np.random.RandomState(0)

        context = DistributedContext(scheduler_address=self.scheduler_endpoints[0],
                                     session_id=session_id)

        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(session_id))
        raw1 = rng.rand(10, 10)
        tensor = mt.tensor(raw1, chunk_size=4)

        # execute the tensor and register it under the name 'test'
        tileable_graph = tensor.build_graph()
        target_keys = [tensor.key]
        key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(tileable_graph.to_json()), key,
                                          target_tileables=target_keys, names=['test'])
        final_state = self.wait_for_termination(client, session_ref, key)
        self.assertEqual(final_state, GraphState.SUCCEEDED)

        self.assertEqual(tensor.key, context.get_tileable_key_by_name('test'))

        metas = context.get_tileable_metas([tensor.key], filter_fields=['nsplits'])
        self.assertEqual(((4, 4, 2), (4, 4, 2)), metas[0][0])

        # whole tensor
        data = context.get_tileable_data(tensor.key)
        np.testing.assert_array_equal(raw1, data)

        # slice-only and mixed list/slice indexing
        for indexes in ([slice(3, 9), slice(0, 7)],
                        [[1, 4, 2, 4, 5], slice(None, None, None)]):
            data = context.get_tileable_data(tensor.key, indexes)
            np.testing.assert_array_equal(raw1[tuple(indexes)], data)

        # fancy indexing on both axes
        indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
        data = context.get_tileable_data(tensor.key, indexes)
        np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], data)

    def testOperandsWithoutPrepareInputs(self):
        """Run a ``NoPrepareOperand`` whose two inputs are expected on different workers.

        Both inputs are marked with ``_prepare_inputs = [False, False]`` on the
        consuming operand; the graph only has to terminate successfully.
        """
        self.start_processes(etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])

        session_id = uuid.uuid1()
        client = new_client()

        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        resource_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
        resource_ref = client.actor_ref(ResourceActor.default_uid(),
                                        address=resource_address)
        worker_endpoints = resource_ref.get_worker_endpoints()

        # one random input per worker endpoint (the first two endpoints are used)
        inputs = []
        for endpoint_idx in range(2):
            piece = mt.random.rand(10)
            piece.op._expect_worker = worker_endpoints[endpoint_idx]
            inputs.append(piece)

        combined = NoPrepareOperand().new_tileable(inputs)
        combined.op._prepare_inputs = [False, False]

        tileable_graph = combined.build_graph()
        target_keys = [combined.key]
        key = uuid.uuid1()
        session_ref.submit_tileable_graph(
            json.dumps(tileable_graph.to_json()), key,
            target_tileables=target_keys)

        final_state = self.wait_for_termination(client, session_ref, key)
        self.assertEqual(final_state, GraphState.SUCCEEDED)
Пример #10
0
    def testEtcdPathStore(self):
        """Exercise write, read and delete on an etcd-backed path store.

        An etcd process is started on port 51342 for the duration of the test.
        Listings are compared through ``repr`` against hand-built
        ``PathResult`` trees.
        """
        with EtcdProcessHelper(port_range_start=51342).run():
            store = get(u'etcd://localhost:51342')

            # entries that show up in the expected listings below
            node_dir = PathResult(key=u'/node', dir=True)
            subnode_dir = PathResult(key=u'/node/subnode', dir=True)
            leaf_v1 = PathResult(key=u'/node/subnode/v1', value=u'value1')
            leaf_v2 = PathResult(key=u'/node/v2', value=u'value2')
            leaf_v3 = PathResult(key=u'/node/v3', value=u'value3')
            leaf_v4 = PathResult(key=u'/node/subnode2/v4', value=u'value4')

            def assert_listing(path, children, **read_options):
                # Read ``path`` sorted and compare it with a directory result
                # holding exactly ``children``.
                actual = store.read(path, sort=True, **read_options)
                wanted = PathResult(key=path, dir=True, children=children)
                self.assertEqual(repr(actual), repr(wanted))

            store.write(u'/node/subnode/v1', u'value1')
            store.write(u'/node/v2', u'value2')

            # the plain read lists the sub-directory, the recursive read its leaf
            assert_listing(u'/node', [subnode_dir, leaf_v2])
            assert_listing(u'/node', [leaf_v1, leaf_v2], recursive=True)

            store.write(u'/node/v3', u'value3')
            # writing below an existing value key must fail
            with self.assertRaises(KeyError):
                store.write(u'/node/v2/invalid_value', value=u'invalid')

            assert_listing('/', [node_dir], recursive=False)
            assert_listing('/', [leaf_v1, leaf_v2, leaf_v3], recursive=True)

            store.write(u'/node/subnode2/v4', u'value4')

            # /node/subnode still holds v1, so deleting it as a directory fails
            with self.assertRaises(KeyError):
                store.delete(u'/node/subnode', dir=True)

            store.delete(u'/node/subnode/v1')
            assert_listing('/', [subnode_dir, leaf_v4, leaf_v2, leaf_v3],
                           recursive=True)

            store.delete(u'/node', recursive=True, dir=True)
Пример #11
0
class Test(SchedulerIntegratedTest):
    def testMainTensorWithoutEtcd(self):
        """Run several tensor computations through a session on a default cluster.

        Each result is fetched through the session and compared with the
        equivalent NumPy computation.
        """
        self.start_processes()
        sess = new_session(self.session_manager_ref.address)

        def compute(tileable):
            # Execute with the suite timeout and fetch through the same session.
            executed = tileable.execute(session=sess, timeout=self.timeout)
            return executed.fetch(session=sess)

        # element-wise arithmetic followed by a full reduction
        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()
        outcome = compute(total)
        reference = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(outcome, reference.sum())

        # matrix product: every entry is 50 * 3 * 3 == 450
        lhs = mt.ones((100, 50), chunk_size=35) * 2 + 1
        rhs = mt.ones((50, 200), chunk_size=35) * 2 + 1
        product = lhs.dot(rhs)
        np.testing.assert_allclose(compute(product), np.ones((100, 200)) * 450)

        # the same slice added to itself ten times
        base = np.random.random((100, 100))
        tensor = mt.array(base)
        summed = reduce(operator.add, [tensor[:10, :10] for _ in range(10)])
        outcome = compute(summed)
        reference = reduce(operator.add, [base[:10, :10] for _ in range(10)])
        np.testing.assert_allclose(outcome, reference)

        # reshape flagged with `_reshape_with_shuffle`, then a row-wise sum
        source = mt.ones((31, 27), chunk_size=10)
        reshaped = source.reshape(27, 31)
        reshaped.op.extra_params['_reshape_with_shuffle'] = True
        row_sums = reshaped.sum(axis=1)
        np.testing.assert_allclose(compute(row_sums),
                                   np.ones((27, 31)).sum(axis=1))

        # fancy indexing where one of the indexes is itself a tensor
        raw = np.random.RandomState(0).rand(10, 10)
        tensor = mt.tensor(raw, chunk_size=(5, 4))
        picked = tensor[tensor.argmin(axis=1), mt.tensor(np.arange(10))]
        np.testing.assert_array_equal(
            compute(picked), raw[raw.argmin(axis=1), np.arange(10)])

    @unittest.skipIf('CI' not in os.environ
                     and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testMainTensorWithEtcd(self):
        """Run one arithmetic-and-sum tensor computation on a cluster started with ``etcd=True``."""
        self.start_processes(etcd=True)
        sess = new_session(self.session_manager_ref.address)

        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        executed = total.execute(session=sess, timeout=self.timeout)
        outcome = executed.fetch(session=sess)

        # compare with the same expression evaluated by NumPy
        reference = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(outcome, reference.sum())

    @require_cupy
    @require_cudf
    def testMainTensorWithCuda(self):
        """Run one arithmetic-and-sum computation on ``gpu=True`` tensors with a ``cuda=True`` cluster."""
        self.start_processes(cuda=True)
        sess = new_session(self.session_manager_ref.address)

        lhs = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        executed = total.execute(session=sess, timeout=self.timeout)
        outcome = executed.fetch(session=sess)

        # the reference value is computed on the host with NumPy
        reference = ((np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1).sum()
        np.testing.assert_allclose(outcome, reference)

    def testMainDataFrameWithoutEtcd(self):
        """Add and sort DataFrames and round-trip a Series through a session.

        The cluster is started without etcd and with the scheduler option
        ``-Dscheduler.aggressive_assign=true``. Each result is compared with
        the same operation performed directly by pandas.
        """
        self.start_processes(
            etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        def compute(tileable):
            # Execute with the suite timeout and fetch through the same session.
            executed = tileable.execute(session=sess, timeout=self.timeout)
            return executed.fetch(session=sess)

        # default labels, different chunk sizes on both operands
        left = pd.DataFrame(np.random.rand(10, 10))
        left_df = md.DataFrame(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10))
        right_df = md.DataFrame(right, chunk_size=6)
        outcome = compute(left_df + right_df)
        pd.testing.assert_frame_equal(outcome, left + right)

        # shuffled column labels, reversed/offset index on the right operand
        left = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left_df = md.DataFrame(left, chunk_size=(10, 5))
        right = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right_df = md.DataFrame(right, chunk_size=(10, 6))
        outcome = compute(left_df + right_df)
        pd.testing.assert_frame_equal(outcome, left + right)

        # unordered index labels on both operands
        left = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left_df = md.DataFrame(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right_df = md.DataFrame(right, chunk_size=6)
        outcome = compute(left_df + right_df)
        pd.testing.assert_frame_equal(outcome, left + right)

        # sort by a column whose values were converted to strings
        frame = pd.DataFrame(np.random.rand(10, 10))
        frame[0] = frame[0].apply(str)
        frame_df = md.DataFrame(frame, chunk_size=5)
        outcome = compute(frame_df.sort_values(0))
        pd.testing.assert_frame_equal(outcome, frame.sort_values(0))

        # a Series created from pandas must come back unchanged
        raw_series = pd.Series(np.random.rand(10),
                               index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        outcome = compute(md.Series(raw_series, chunk_size=6))
        pd.testing.assert_series_equal(outcome, raw_series)

    def testIterativeTilingWithoutEtcd(self):
        """Execute computations containing in-place sorts and a histogram without etcd.

        After the first computation the test also expects ``KeyError`` when
        the chunk data of the sorted intermediate tensor is looked up and
        fetched through the graph actor and the session API.
        """
        self.start_processes(etcd=False)
        sess = new_session(self.session_manager_ref.address)
        client = sess._api.actor_client
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(sess.session_id))
        rng = np.random.RandomState(0)

        def compute(tileable):
            # Execute with the suite timeout and fetch through the same session.
            executed = tileable.execute(session=sess, timeout=self.timeout)
            return executed.fetch(session=sess)

        # sort, then take the five smallest values
        raw = rng.rand(100)
        sorted_t = mt.tensor(raw, chunk_size=10)
        sorted_t.sort()
        head = sorted_t[:5]
        outcome = compute(head)
        np.testing.assert_allclose(outcome, np.sort(raw)[:5])

        # the sorted intermediate was not a target of the executed graph
        graph_key = sess._get_tileable_graph_key(head.key)
        graph_ref = client.actor_ref(session_ref.get_graph_refs()[graph_key])
        with self.assertRaises(KeyError):
            _, chunk_keys, _ = graph_ref.get_tileable_metas([sorted_t.key])[0]
            sess._api.fetch_chunk_data(sess.session_id, chunk_keys[0])

        # two sorts in one graph, joined by a concatenation
        raw1 = rng.rand(20)
        raw2 = rng.rand(20)
        first = mt.tensor(raw1, chunk_size=10)
        first.sort()
        second = mt.tensor(raw2, chunk_size=15) + 1
        joined = mt.concatenate([first[:10], second])
        joined.sort()
        outcome = compute(joined[:5])
        reference = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
        np.testing.assert_allclose(outcome, reference)

        # histogram over sorted integers; compare counts and bin edges
        raw = rng.randint(100, size=(100, ))
        sorted_t = mt.tensor(raw, chunk_size=53)
        sorted_t.sort()
        outcome = compute(mt.histogram(sorted_t, bins='scott'))
        reference = np.histogram(np.sort(raw), bins='scott')
        np.testing.assert_allclose(outcome[0], reference[0])
        np.testing.assert_allclose(outcome[1], reference[1])

    def testDistributedContext(self):
        """Read metadata and data of an executed, named tensor via ``DistributedContext``.

        A tensor is executed with ``name='test'``; the context must then
        report its key and shape under that name, the expected ``nsplits``,
        and return full as well as indexed data equal to the NumPy source.
        """
        self.start_processes(etcd=False)
        sess = new_session(self.session_manager_ref.address)
        rng = np.random.RandomState(0)
        context = DistributedContext(
            scheduler_address=self.session_manager_ref.address,
            session_id=sess.session_id)

        raw1 = rng.rand(10, 10)
        tensor = mt.tensor(raw1, chunk_size=4)
        tensor.execute(session=sess, timeout=self.timeout, name='test')

        named = context.get_named_tileable_infos('test')
        self.assertEqual(tensor.key, named.tileable_key)
        self.assertEqual(tensor.shape, named.tileable_shape)

        metas = context.get_tileable_metas([tensor.key],
                                           filter_fields=['nsplits'])
        self.assertEqual(((4, 4, 2), (4, 4, 2)), metas[0][0])

        # whole tensor
        data = context.get_tileable_data(tensor.key)
        np.testing.assert_array_equal(raw1, data)

        # slice-only and mixed list/slice indexing
        for indexes in ([slice(3, 9), slice(0, 7)],
                        [[1, 4, 2, 4, 5], slice(None, None, None)]):
            data = context.get_tileable_data(tensor.key, indexes)
            np.testing.assert_array_equal(raw1[tuple(indexes)], data)

        # fancy indexing on both axes
        indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
        data = context.get_tileable_data(tensor.key, indexes)
        np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], data)

    def testOperandsWithoutPrepareInputs(self):
        """Execute a ``NoPrepareOperand`` whose two inputs are expected on different workers.

        Both inputs are marked with ``_prepare_inputs = [False, False]`` on the
        consuming operand; the execution only has to complete.
        """
        self.start_processes(
            etcd=False,
            modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        sess = new_session(self.session_manager_ref.address)

        resource_address = self.cluster_info.get_scheduler(
            ResourceActor.default_uid())
        resource_ref = sess._api.actor_client.actor_ref(
            ResourceActor.default_uid(), address=resource_address)
        worker_endpoints = resource_ref.get_worker_endpoints()

        # one random input per worker endpoint (the first two endpoints are used)
        inputs = []
        for endpoint_idx in range(2):
            piece = mt.random.rand(10)
            piece.op._expect_worker = worker_endpoints[endpoint_idx]
            inputs.append(piece)

        combined = NoPrepareOperand().new_tileable(inputs)
        combined.op._prepare_inputs = [False, False]
        combined.execute(session=sess, timeout=self.timeout)

    def testRemoteWithoutEtcd(self):
        """Execute functions submitted with ``spawn`` and check their results.

        Covers a function returning ``None``, chained spawned calls passed as
        positional and keyword arguments, and a function that itself executes
        a tensor expression. Finally every worker's ``DispatchActor`` must
        report exactly one ``'cpu'`` slot.
        """
        from mars.scheduler.resource import ResourceActor
        from mars.worker.dispatcher import DispatchActor

        self.start_processes(
            etcd=False,
            modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        sess = new_session(self.session_manager_ref.address)
        resource_address = self.cluster_info.get_scheduler(
            ResourceActor.default_uid())
        resource_ref = sess._api.actor_client.actor_ref(
            ResourceActor.default_uid(), address=resource_address)
        worker_ips = resource_ref.get_worker_endpoints()

        def compute(remote_obj):
            # Execute with the suite timeout and fetch through the same session.
            executed = remote_obj.execute(session=sess, timeout=self.timeout)
            return executed.fetch(session=sess)

        rs = np.random.RandomState(0)
        raw1 = rs.rand(10, 10)
        raw2 = rs.rand(10, 10)

        def f_none(_x):
            return None

        # a spawned function may return None
        self.assertIsNone(compute(spawn(f_none, raw1)))

        def f1(x):
            return x + 1

        def f2(x, y, z=None):
            return x * y * (z[0] + z[1])

        # results of spawned calls feed another call, also nested in kwargs
        r1 = spawn(f1, raw1)
        r2 = spawn(f1, raw2)
        r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})
        outcome = compute(r3)
        reference = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
        np.testing.assert_allclose(outcome, reference)

        def f(t, x):
            mul = (t * x).execute()
            return mul.sum().to_numpy()

        # restart the random stream so `raw` is generated from seed 0 again
        rs = np.random.RandomState(0)
        raw = rs.rand(5, 4)

        t1 = mt.tensor(raw, chunk_size=3)
        t2 = t1.sum(axis=0)
        outcome = compute(spawn(f, args=(t2, 3)))
        reference = (raw.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(outcome, reference)

        # pause for a second before inspecting the dispatchers' cpu slots
        time.sleep(1)
        for worker_ip in worker_ips:
            dispatch_ref = sess._api.actor_client.actor_ref(
                DispatchActor.default_uid(), address=worker_ip)
            self.assertEqual(len(dispatch_ref.get_slots('cpu')), 1)

    def testNoWorkerException(self):
        """Executing on a cluster started with ``n_workers=0`` must fail.

        The execution is required to raise ``ExecutionFailed`` whose
        ``__cause__`` is a ``RuntimeError``. Using ``assertRaises`` makes the
        test fail when no exception is raised at all, instead of passing
        vacuously as a bare ``try``/``except`` would.
        """
        self.start_processes(etcd=False, n_workers=0)

        a = mt.ones((10, 10))
        b = mt.ones((10, 10))
        c = (a + b)

        endpoint = self.scheduler_endpoints[0]
        sess = new_session(endpoint)

        with self.assertRaises(ExecutionFailed) as raised:
            c.execute(session=sess, timeout=self.timeout)
        # inspect the cause only after the context manager captured the error
        self.assertIsInstance(raised.exception.__cause__, RuntimeError)
Пример #12
0
class Test(SchedulerIntegratedTest):
    def testMainTensorWithoutEtcd(self):
        """Check several tensor computations end to end.

        The processes are started with default arguments. Covered cases:
        arithmetic with a full reduction, a matrix product, a chain of sliced
        additions, a reshape flagged to use shuffle, fancy indexing and
        median.
        """
        self.start_processes()
        session = new_session(self.session_manager_ref.address)

        def compute(tileable):
            # run the tileable in the session, then pull its data back
            executed = tileable.execute(session=session, timeout=self.timeout)
            return executed.fetch(session=session)

        # arithmetic followed by a full reduction
        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        actual = compute((lhs * rhs * 2 + 1).sum())
        dense = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(actual, dense.sum())

        # matrix product of two constant matrices
        lhs = mt.ones((100, 50), chunk_size=35) * 2 + 1
        rhs = mt.ones((50, 200), chunk_size=35) * 2 + 1
        actual = compute(lhs.dot(rhs))
        np.testing.assert_allclose(actual, np.ones((100, 200)) * 450)

        # the same slice added to itself ten times
        base = np.random.random((100, 100))
        tensor = mt.array(base)
        summed = reduce(operator.add, [tensor[:10, :10] for _ in range(10)])
        actual = compute(summed)
        wanted = reduce(operator.add, [base[:10, :10] for _ in range(10)])
        np.testing.assert_allclose(actual, wanted)

        # reshape with the shuffle flag set on the op, then reduce per row
        reshaped = mt.ones((31, 27), chunk_size=10).reshape(27, 31)
        reshaped.op.extra_params['_reshape_with_shuffle'] = True
        actual = compute(reshaped.sum(axis=1))
        np.testing.assert_allclose(actual, np.ones((27, 31)).sum(axis=1))

        # fancy indexing driven by an argmin tensor
        values = np.random.RandomState(0).rand(10, 10)
        tensor = mt.tensor(values, chunk_size=(5, 4))
        picked = tensor[tensor.argmin(axis=1), mt.tensor(np.arange(10))]
        actual = compute(picked)
        np.testing.assert_array_equal(
            actual, values[values.argmin(axis=1), np.arange(10)])

        # median over a chunked vector
        values = np.random.RandomState(0).rand(1000)
        actual = compute(mt.median(mt.tensor(values, chunk_size=100)))
        np.testing.assert_array_equal(actual, np.median(values))

    @unittest.skipIf('CI' not in os.environ
                     and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testMainTensorWithEtcd(self):
        """Run an arithmetic-plus-sum tensor computation with etcd enabled."""
        self.start_processes(etcd=True)
        session = new_session(self.session_manager_ref.address)

        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        # execute in the session first, then fetch the computed scalar
        executed = total.execute(session=session, timeout=self.timeout)
        actual = executed.fetch(session=session)

        dense = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(actual, dense.sum())

    @require_cupy
    @require_cudf
    def testMainTensorWithCuda(self):
        """Run an arithmetic-plus-sum computation on tensors created with gpu=True."""
        self.start_processes(cuda=True)
        session = new_session(self.session_manager_ref.address)

        lhs = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        # execute in the session first, then fetch the computed scalar
        executed = total.execute(session=session, timeout=self.timeout)
        actual = executed.fetch(session=session)

        dense = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(actual, dense.sum())

    def testMainDataFrameWithoutEtcd(self):
        """Run a collection of DataFrame/Series computations without etcd.

        The processes are started with ``etcd=False`` and the scheduler
        argument ``-Dscheduler.aggressive_assign=true``. Each section builds
        pandas data, runs the Mars counterpart in the session and compares
        the fetched result with what pandas computes on the same data.
        """
        self.start_processes(
            etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        # test binary arithmetics with different indices
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess,
                           timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        # same row labels in opposite order, partially overlapping columns
        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=(10, 5))
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(11, 1, -1),
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=(10, 6))
        r = df1 + df2
        result = r.execute(session=sess,
                           timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        # unordered row labels and unordered columns on both sides
        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess,
                           timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        # test sort_values
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        # the first column is turned into strings before sorting on it
        raw1[0] = raw1[0].apply(str)
        raw1.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
        df1 = md.DataFrame(raw1, chunk_size=5)
        r = df1.sort_values([('A', 'C')])
        result = r.execute(session=sess,
                           timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1.sort_values([('A', 'C')]))

        # filter, then sort by a column stored with md.ArrowStringDtype
        rs = np.random.RandomState(0)
        raw2 = pd.DataFrame({
            'a': rs.rand(10),
            'b': [f's{rs.randint(1000)}' for _ in range(10)]
        })
        raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
        mdf = md.DataFrame(raw2, chunk_size=4)
        filtered = mdf[mdf['a'] > 0.5]
        df2 = filtered.sort_values(by='b')
        result = df2.execute(session=sess,
                             timeout=self.timeout).fetch(session=sess)
        expected = raw2[raw2['a'] > 0.5].sort_values(by='b')
        pd.testing.assert_frame_equal(result, expected)

        # plain round trip of a Series with an unordered index
        s1 = pd.Series(np.random.rand(10),
                       index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = md.Series(s1, chunk_size=6)
        result = series1.execute(session=sess,
                                 timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_series_equal(result, s1)

        # test reindex
        data = pd.DataFrame(np.random.rand(10, 5),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df3 = md.DataFrame(data, chunk_size=4)
        r = df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3))

        result = r.execute(session=sess,
                           timeout=self.timeout).fetch(session=sess)
        expected = data.reindex(index=np.arange(10, 1, -1))
        pd.testing.assert_frame_equal(result, expected)

        # test rebalance
        df4 = md.DataFrame(data)
        r = df4.rebalance()

        result = r.execute(session=sess,
                           timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, data)
        # collect the distinct workers recorded in the chunk metas of the
        # rebalanced frame; the test expects exactly two of them
        chunk_metas = sess.get_tileable_chunk_metas(r.key)
        workers = list(
            set(itertools.chain(*(m.workers for m in chunk_metas.values()))))
        self.assertEqual(len(workers), 2)

        # test nunique
        data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df5 = md.DataFrame(data, chunk_size=4)
        r = df5.nunique()

        result = r.execute(session=sess,
                           timeout=self.timeout).fetch(session=sess)
        expected = data.nunique()
        pd.testing.assert_series_equal(result, expected)

        # test re-execute df.groupby().agg().sort_values()
        rs = np.random.RandomState(0)
        data = pd.DataFrame({
            'col1': rs.rand(100),
            'col2': rs.randint(10, size=100)
        })
        df6 = md.DataFrame(data, chunk_size=40)
        # first execute the aggregation alone, then build on the executed
        # result
        grouped = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
            .execute(session=sess, timeout=self.timeout)
        r = grouped.sort_values(by='cnt').head().execute(session=sess,
                                                         timeout=self.timeout)
        result = r.fetch(session=sess)
        expected = data.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
            .sort_values(by='cnt').head()
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))
        # the same pipeline executed in one go must give the same answer
        r2 = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}).sort_values(by='cnt').head() \
            .execute(session=sess, timeout=self.timeout)
        result = r2.fetch(session=sess)
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        # test groupby with sample
        src_data_list = []
        sample_count = 10
        for b in range(5):
            # each group gets between 40 and 99 rows, always more than
            # sample_count
            data_count = int(np.random.randint(40, 100))
            src_data_list.append(
                pd.DataFrame({
                    'a': np.random.randint(0, 100, size=data_count),
                    'b': np.array([b] * data_count),
                    'c': np.random.randint(0, 100, size=data_count),
                    'd': np.random.randint(0, 100, size=data_count),
                }))
        data = pd.concat(src_data_list)
        # shuffle the rows so that the groups are interleaved across chunks
        shuffle_idx = np.arange(len(data))
        np.random.shuffle(shuffle_idx)
        data = data.iloc[shuffle_idx].reset_index(drop=True)

        df7 = md.DataFrame(data, chunk_size=40)
        sampled = df7.groupby('b').sample(sample_count)
        r = sampled.execute(session=sess, timeout=self.timeout)
        result = r.fetch(session=sess)
        # every group must hold exactly sample_count rows in every column;
        # .any().any() reduces over all columns instead of looking up the
        # per-column result with an integer key on a string-labelled Series.
        self.assertFalse(
            (result.groupby('b').count() - sample_count).any().any())

    def testIterativeTilingWithoutEtcd(self):
        """Run computations built on in-place sorted tensors without etcd.

        Covers slicing a sorted tensor, sorting a concatenation that contains
        a sorted slice, and a histogram with ``bins='scott'``.
        """
        self.start_processes(etcd=False)
        session = new_session(self.session_manager_ref.address)
        client = session._api.actor_client
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(session.session_id))
        rng = np.random.RandomState(0)

        def compute(tileable):
            # run the tileable in the session, then pull its data back
            executed = tileable.execute(session=session, timeout=self.timeout)
            return executed.fetch(session=session)

        # five smallest values of a sorted tensor
        values = rng.rand(100)
        sorted_tensor = mt.tensor(values, chunk_size=10)
        sorted_tensor.sort()
        head = sorted_tensor[:5]
        actual = compute(head)
        np.testing.assert_allclose(actual, np.sort(values)[:5])

        # looking up the metas of the sorted tensor and fetching its first
        # chunk is expected to raise KeyError somewhere along the way
        graph_key = session._get_tileable_graph_key(head.key)
        graph_handle = session_ref.get_graph_refs()[graph_key]
        graph_ref = client.actor_ref(graph_handle)
        with self.assertRaises(KeyError):
            meta = graph_ref.get_tileable_metas(
                [sorted_tensor.key],
                filter_fields=['nsplits', 'chunk_keys', 'chunk_indexes'])[0]
            _nsplits, chunk_keys, _chunk_indexes = meta
            session._api.fetch_chunk_data(session.session_id, chunk_keys[0])

        # sort a concatenation of a sorted slice and a shifted tensor
        first = rng.rand(20)
        second = rng.rand(20)
        left = mt.tensor(first, chunk_size=10)
        left.sort()
        right = mt.tensor(second, chunk_size=15) + 1
        merged = mt.concatenate([left[:10], right])
        merged.sort()
        actual = compute(merged[:5])
        joined = np.concatenate([np.sort(first)[:10], second + 1])
        np.testing.assert_allclose(actual, np.sort(joined)[:5])

        # histogram of a sorted integer tensor
        samples = rng.randint(100, size=(100, ))
        tensor = mt.tensor(samples, chunk_size=53)
        tensor.sort()
        histogram = compute(mt.histogram(tensor, bins='scott'))
        reference = np.histogram(np.sort(samples), bins='scott')
        np.testing.assert_allclose(histogram[0], reference[0])
        np.testing.assert_allclose(histogram[1], reference[1])

    def testDistributedContext(self):
        """Read back a named, executed tensor through a DistributedContext."""
        self.start_processes(etcd=False)
        session = new_session(self.session_manager_ref.address)
        rng = np.random.RandomState(0)
        ctx = DistributedContext(
            scheduler_address=self.session_manager_ref.address,
            session_id=session.session_id)

        values = rng.rand(10, 10)
        tensor = mt.tensor(values, chunk_size=4)
        tensor.execute(session=session, timeout=self.timeout, name='test')

        # the name given at execution time resolves to the tensor's infos
        infos = ctx.get_named_tileable_infos('test')
        self.assertEqual(tensor.key, infos.tileable_key)
        self.assertEqual(tensor.shape, infos.tileable_shape)

        # a 10x10 tensor cut with chunk_size=4 splits as (4, 4, 2) per axis
        metas = ctx.get_tileable_metas([tensor.key], filter_fields=['nsplits'])
        self.assertEqual(((4, 4, 2), (4, 4, 2)), metas[0][0])

        # whole-tensor fetch
        fetched = ctx.get_tileable_data(tensor.key)
        np.testing.assert_array_equal(values, fetched)

        # partial fetches: slices, list plus slice, and paired fancy indexes
        selections = (
            [slice(3, 9), slice(0, 7)],
            [[1, 4, 2, 4, 5], slice(None, None, None)],
            ([9, 1, 2, 0], [0, 0, 4, 4]),
        )
        for selection in selections:
            fetched = ctx.get_tileable_data(tensor.key, selection)
            np.testing.assert_array_equal(values[tuple(selection)], fetched)

    def testOperandsWithPureDepends(self):
        """Execute a PureDependsOperand whose two inputs are pinned to workers."""
        self.start_processes(
            etcd=False,
            modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        session = new_session(self.session_manager_ref.address)

        # ask the resource actor on the scheduler for the worker endpoints
        resource_address = self.cluster_info.get_scheduler(
            ResourceActor.default_uid())
        resource_ref = session._api.actor_client.actor_ref(
            ResourceActor.default_uid(), address=resource_address)
        endpoints = resource_ref.get_worker_endpoints()

        # one random tensor per expected worker, for the first two endpoints
        inputs = []
        for position in range(2):
            tensor = mt.random.rand(10)
            tensor.op._expect_worker = endpoints[position]
            inputs.append(tensor)

        dependent = PureDependsOperand().new_tileable(inputs)
        dependent.op._pure_depends = [True, True]
        dependent.execute(session=session, timeout=self.timeout)

    def testRemoteWithoutEtcd(self):
        """Execute spawned Python functions and check worker cpu slots afterwards."""
        from mars.scheduler.resource import ResourceActor
        from mars.worker.dispatcher import DispatchActor

        self.start_processes(
            etcd=False,
            modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        session = new_session(self.session_manager_ref.address)

        def compute(remote_obj):
            # run the object in the session, then pull its result back
            executed = remote_obj.execute(session=session,
                                          timeout=self.timeout)
            return executed.fetch(session=session)

        resource_address = self.cluster_info.get_scheduler(
            ResourceActor.default_uid())
        resource_ref = session._api.actor_client.actor_ref(
            ResourceActor.default_uid(), address=resource_address)
        worker_addresses = resource_ref.get_worker_endpoints()

        rng = np.random.RandomState(0)
        first = rng.rand(10, 10)
        second = rng.rand(10, 10)

        # a function returning None must come back as None
        def f_none(_x):
            return None

        self.assertIsNone(compute(spawn(f_none, first)))

        # spawned results used as positional and keyword inputs of another
        def f1(x):
            return x + 1

        def f2(x, y, z=None):
            return x * y * (z[0] + z[1])

        plus_one_a = spawn(f1, first)
        plus_one_b = spawn(f1, second)
        combined = spawn(f2, (plus_one_a, plus_one_b),
                         {'z': [plus_one_a, plus_one_b]})
        actual = compute(combined)
        wanted = (first + 1) * (second + 1) * (first + 1 + second + 1)
        np.testing.assert_allclose(actual, wanted)

        # a spawned function that itself executes a tensor expression
        def f(t, x):
            mul = (t * x).execute()
            return mul.sum().to_numpy()

        rng = np.random.RandomState(0)
        values = rng.rand(5, 4)

        column_sums = mt.tensor(values, chunk_size=3).sum(axis=0)
        nested = spawn(f, args=(column_sums, 3))

        actual = compute(nested)
        wanted = (values.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(actual, wanted)

        # after a short pause every worker must report one cpu slot
        time.sleep(1)
        for address in worker_addresses:
            dispatch_ref = session._api.actor_client.actor_ref(
                DispatchActor.default_uid(), address=address)
            cpu_slots = dispatch_ref.get_slots('cpu')
            self.assertEqual(len(cpu_slots), 1)

    def testFetchLogWithoutEtcd(self):
        """Check that output printed by executed functions can be fetched as logs.

        Logs are read in three ways: directly from the path reported by
        ``CustomLogMetaActor``, through ``DistributedContext`` and through
        ``fetch_log`` on the executed objects, including offset/size
        selection, nested spawns and ``map_chunk`` on a DataFrame.
        """
        # test fetch log
        with tempfile.TemporaryDirectory() as temp_dir:
            # the temporary directory is handed to the scheduler as
            # custom_log_dir
            self.start_processes(
                etcd=False,
                modules=['mars.scheduler.tests.integrated.no_prepare_op'],
                scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])
            sess = new_session(self.session_manager_ref.address)

            def f():
                print('test')

            r = spawn(f)
            r.execute(session=sess)

            custom_log_actor = sess._api.actor_client.actor_ref(
                CustomLogMetaActor.default_uid(),
                address=self.cluster_info.get_scheduler(
                    CustomLogMetaActor.default_uid()))

            # exactly one log entry is expected for the single spawned op
            chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
                sess.session_id, r.op.key)
            paths = list(chunk_key_to_log_path.values())
            self.assertEqual(len(paths), 1)
            # element 1 of the entry is used as the log file path
            # (presumably the entry is an (address, path) pair -- TODO confirm)
            log_path = paths[0][1]
            # NOTE(review): this rebinds the local name `f` from the spawned
            # function to the file object; `f` is not called again afterwards
            with open(log_path) as f:
                self.assertEqual(f.read().strip(), 'test')

            # the same log read through a DistributedContext
            context = DistributedContext(
                scheduler_address=self.session_manager_ref.address,
                session_id=sess.session_id)
            log_result = context.fetch_tileable_op_logs(r.op.key)
            log = next(iter(log_result.values()))['log']
            self.assertEqual(log.strip(), 'test')

            # and through the executed object itself
            log = r.fetch_log()
            self.assertEqual(str(log).strip(), 'test')

            # test multiple functions
            def f1(size):
                print('f1' * size)
                sys.stdout.flush()

            fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
            fs.execute(session=sess)
            # 10 characters starting at offset 20 of each function's output
            log = fs.fetch_log(offsets=20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets
            # the expected text includes the trailing newline written by
            # print, so the window is taken from 'f1' * n + '\n'
            log = fs.fetch_log(offsets=-20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets which represented in string
            # the expectations are the same as for offsets=-20, sizes=10
            log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # a spawned function that spawns, executes and prints the log of
            # another function
            def test_nested():
                print('level0')
                fr = spawn(f1, 1)
                fr.execute()
                print(fr.fetch_log())

            r = spawn(test_nested)
            # fetching the log before execution must raise ValueError
            with self.assertRaises(ValueError):
                r.fetch_log()
            r.execute(session=sess)
            log = str(r.fetch_log())
            self.assertIn('level0', log)
            self.assertIn('f1', log)

            df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

            def df_func(c):
                print('df func')
                return c

            df2 = df.map_chunk(df_func)
            df2.execute(session=sess)
            log = df2.fetch_log()
            self.assertIn('Chunk op key:', str(log))
            self.assertIn('df func', repr(log))
            # the source DataFrame itself is expected to have an empty log
            self.assertEqual(len(str(df.fetch_log(session=sess))), 0)

    def testNoWorkerException(self):
        """Executing a tensor on a cluster started with no workers must fail.

        The processes are started with ``etcd=False`` and ``n_workers=0``;
        running ``ones + ones`` is then expected to raise ``ExecutionFailed``
        whose ``__cause__`` is a ``RuntimeError``.
        """
        self.start_processes(etcd=False, n_workers=0)

        a = mt.ones((10, 10))
        b = mt.ones((10, 10))
        c = (a + b)

        sess = new_session(self.session_manager_ref.address)

        # assertRaises (instead of a bare try/except) makes the test fail when
        # the execution unexpectedly succeeds, rather than passing without
        # having checked anything.
        with self.assertRaises(ExecutionFailed) as raised:
            c.execute(session=sess, timeout=self.timeout)
        self.assertIsInstance(raised.exception.__cause__, RuntimeError)
Пример #13
0
class Test(SchedulerIntegratedTest):
    def testMainTensorWithoutEtcd(self):
        """Submit several tensor graphs through the session actor and verify them.

        The processes are started with default arguments. Every case is
        serialized to JSON, submitted under a fresh graph key, awaited until
        it reports ``GraphState.SUCCEEDED`` and then fetched and compared.
        """
        self.start_processes()

        sid = uuid.uuid1()
        client = new_client()
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(sid))

        def run_graph(tileable):
            # submit the tileable's graph, wait for success, return its data
            payload = json.dumps(tileable.build_graph().to_json())
            key = uuid.uuid1()
            session_ref.submit_tileable_graph(
                payload, key, target_tileables=[tileable.key])
            final_state = self.wait_for_termination(client, session_ref, key)
            self.assertEqual(final_state, GraphState.SUCCEEDED)
            return loads(session_ref.fetch_result(key, tileable.key))

        # arithmetic followed by a full reduction
        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        actual = run_graph((lhs * rhs * 2 + 1).sum())
        dense = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(actual, dense.sum())

        # matrix product of two constant matrices
        lhs = mt.ones((100, 50), chunk_size=35) * 2 + 1
        rhs = mt.ones((50, 200), chunk_size=35) * 2 + 1
        actual = run_graph(lhs.dot(rhs))
        assert_allclose(actual, np.ones((100, 200)) * 450)

        # the same slice added to itself ten times
        base = np.random.random((100, 100))
        tensor = mt.array(base)
        summed = reduce(operator.add, [tensor[:10, :10] for _ in range(10)])
        actual = run_graph(summed)
        wanted = reduce(operator.add, [base[:10, :10] for _ in range(10)])
        assert_allclose(actual, wanted)

        # reshape with the shuffle flag set on the op, then reduce per row
        reshaped = mt.ones((31, 27), chunk_size=10).reshape(27, 31)
        reshaped.op.extra_params['_reshape_with_shuffle'] = True
        actual = run_graph(reshaped.sum(axis=1))
        assert_allclose(actual, np.ones((27, 31)).sum(axis=1))

        # fancy indexing driven by an argmin tensor
        values = np.random.RandomState(0).rand(10, 10)
        tensor = mt.tensor(values, chunk_size=(5, 4))
        picked = tensor[tensor.argmin(axis=1), mt.tensor(np.arange(10))]
        actual = run_graph(picked)
        np.testing.assert_array_equal(
            actual, values[values.argmin(axis=1), np.arange(10)])

    @unittest.skipIf('CI' not in os.environ
                     and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testMainTensorWithEtcd(self):
        """Submit an arithmetic-plus-sum tensor graph with etcd enabled."""
        self.start_processes(etcd=True)

        sid = uuid.uuid1()
        client = new_client()
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(sid))

        lhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        # serialize the graph and submit it under a fresh key
        payload = json.dumps(total.build_graph().to_json())
        key = uuid.uuid1()
        session_ref.submit_tileable_graph(
            payload, key, target_tileables=[total.key])

        final_state = self.wait_for_termination(client, session_ref, key)
        self.assertEqual(final_state, GraphState.SUCCEEDED)

        fetched = session_ref.fetch_result(key, total.key)
        dense = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(fetched), dense.sum())

    @require_cupy
    @require_cudf
    def testMainTensorWithCuda(self):
        """Submit an arithmetic-plus-sum graph of tensors created with gpu=True."""
        self.start_processes(cuda=True)

        sid = uuid.uuid1()
        client = new_client()
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(sid))

        lhs = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        rhs = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        total = (lhs * rhs * 2 + 1).sum()

        # serialize the graph and submit it under a fresh key
        payload = json.dumps(total.build_graph().to_json())
        key = uuid.uuid1()
        session_ref.submit_tileable_graph(
            payload, key, target_tileables=[total.key])

        final_state = self.wait_for_termination(client, session_ref, key)
        self.assertEqual(final_state, GraphState.SUCCEEDED)

        fetched = session_ref.fetch_result(key, total.key)
        dense = (np.ones(lhs.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(fetched), dense.sum())

    def testMainDataFrameWithoutEtcd(self):
        """Submit DataFrame additions and a Series round trip without etcd.

        Every case is serialized to JSON, submitted under a fresh graph key,
        awaited until it reports ``GraphState.SUCCEEDED`` and then compared
        with the pandas result.
        """
        import pandas as pd
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.datasource.series import from_pandas as from_pandas_series
        from mars.dataframe.arithmetic import add

        self.start_processes(etcd=False)

        sid = uuid.uuid1()
        client = new_client()
        session_ref = client.actor_ref(
            self.session_manager_ref.create_session(sid))

        def run_graph(tileable):
            # submit the tileable's graph, wait for success, return its data
            payload = json.dumps(tileable.build_graph().to_json())
            key = uuid.uuid1()
            session_ref.submit_tileable_graph(
                payload, key, target_tileables=[tileable.key])
            final_state = self.wait_for_termination(client, session_ref, key)
            self.assertEqual(final_state, GraphState.SUCCEEDED)
            return loads(session_ref.fetch_result(key, tileable.key))

        # default labels, different chunk sizes on each side
        left = pd.DataFrame(np.random.rand(10, 10))
        left_df = from_pandas_df(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10))
        right_df = from_pandas_df(right, chunk_size=6)
        actual = run_graph(add(left_df, right_df))
        pd.testing.assert_frame_equal(left + right, actual)

        # row labels in opposite order, partially overlapping columns
        left = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left_df = from_pandas_df(left, chunk_size=(10, 5))
        right = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right_df = from_pandas_df(right, chunk_size=(10, 6))
        actual = run_graph(add(left_df, right_df))
        pd.testing.assert_frame_equal(left + right, actual)

        # unordered row labels and unordered columns on both sides
        left = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left_df = from_pandas_df(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right_df = from_pandas_df(right, chunk_size=6)
        actual = run_graph(add(left_df, right_df))
        pd.testing.assert_frame_equal(left + right, actual)

        # plain round trip of a Series with an unordered index
        source = pd.Series(np.random.rand(10),
                           index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        actual = run_graph(from_pandas_series(source))
        pd.testing.assert_series_equal(source, actual)
Пример #14
0
class Test(unittest.TestCase):
    def testLocalPathStore(self):
        """Exercise write, read and delete on the ``:inproc:`` path store.

        Every listing is compared through ``repr`` with a hand-built
        ``PathResult`` tree. The scenario covers non-recursive and recursive
        reads, writing beneath an existing value node (expects ``KeyError``),
        deleting a non-empty directory without ``recursive`` (expects
        ``KeyError``) and a recursive directory delete.
        """
        store = get(':inproc:')

        def value_node(key, value):
            # Expected entry for a key that holds a value.
            return PathResult(key=key, value=value)

        def dir_node(key):
            # Expected entry for a directory listed without its children.
            return PathResult(key=key, dir=True)

        def assert_listing(actual, key, children):
            # Compare a read result with the expected directory tree via repr.
            wanted = PathResult(key=key, dir=True, children=children)
            self.assertEqual(repr(actual), repr(wanted))

        store.write('/node/subnode/v1', 'value1')
        store.write('/node/v2', 'value2')

        assert_listing(store.read('/node', sort=True), '/node', [
            dir_node('/node/subnode'),
            value_node('/node/v2', 'value2'),
        ])

        assert_listing(store.read('/node', recursive=True, sort=True), '/node', [
            value_node('/node/subnode/v1', 'value1'),
            value_node('/node/v2', 'value2'),
        ])

        store.write('/node/v3', 'value3')
        # '/node/v2' already holds a value, so nothing can be written below it.
        self.assertRaises(KeyError, store.write,
                          '/node/v2/invalid_value', value='invalid')

        assert_listing(store.read('/', recursive=False, sort=True), '/', [
            dir_node('/node'),
        ])

        assert_listing(store.read('/', recursive=True, sort=True), '/', [
            value_node('/node/subnode/v1', 'value1'),
            value_node('/node/v2', 'value2'),
            value_node('/node/v3', 'value3'),
        ])

        store.write('/node/subnode2/v4', 'value4')

        # A directory that still has children is not removed without recursive.
        self.assertRaises(KeyError, store.delete, '/node/subnode', dir=True)

        store.delete('/node/subnode/v1')
        assert_listing(store.read('/', recursive=True, sort=True), '/', [
            dir_node('/node/subnode'),
            value_node('/node/subnode2/v4', 'value4'),
            value_node('/node/v2', 'value2'),
            value_node('/node/v3', 'value3'),
        ])

        store.delete('/node/subnode2', dir=True, recursive=True)
        assert_listing(store.read('/', recursive=True, sort=True), '/', [
            dir_node('/node/subnode'),
            value_node('/node/v2', 'value2'),
            value_node('/node/v3', 'value3'),
        ])

    @unittest.skipIf(sys.platform == 'win32', 'does not run in windows')
    @unittest.skipIf('CI' not in os.environ
                     and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testEtcdPathStore(self):
        with EtcdProcessHelper(port_range_start=51342).run():
            kvstore = get(u'etcd://*****:*****@unittest.skipIf(sys.platform == 'win32', 'does not run in windows')
    @unittest.skipIf('CI' not in os.environ
                     and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testEtcdWatch(self):
        """Check ``watch`` and ``eternal_watch`` against a running etcd process.

        Three scenarios run in sequence, each pairing a writer greenlet with a
        watcher greenlet: watching a single key, watching a directory
        recursively, and collecting five consecutive updates of one key from
        ``eternal_watch``.
        """
        with EtcdProcessHelper(port_range_start=51342).run():
            store = get('etcd://localhost:51342')
            store.write('/node/subnode/v1', 'value1')
            store.write('/node/v2', 'value2')

            def run_pair(write_fn, watch_fn):
                # Spawn the writer first, then the watcher, wait for both and
                # hand back whatever the watcher returned.
                writer_glet = gevent.spawn(write_fn)
                watcher_glet = gevent.spawn(watch_fn)
                gevent.joinall([writer_glet, watcher_glet])
                return watcher_glet.value

            # Scenario 1: watch a single key.
            def watch_key():
                return store.watch('/node/v2', timeout=10)

            def update_key():
                gevent.sleep(1)
                store.write('/node/v2', 'value2\'')

            self.assertEqual(run_pair(update_key, watch_key).value, 'value2\'')

            store.delete('/node/v2')

            # Scenario 2: watch a directory recursively.
            def watch_dir():
                return store.watch('/node/subnode',
                                   timeout=10,
                                   recursive=True)

            def update_child():
                gevent.sleep(1)
                store.write('/node/subnode/v1', 'value1\'')

            self.assertEqual(
                run_pair(update_child, watch_dir).children[0].value, 'value1\'')

            store.write('/node/subnode/v3', '-1')

            # Scenario 3: gather five successive updates from eternal_watch.
            def watch_stream():
                collected = []
                for update in store.eternal_watch('/node/subnode/v3'):
                    collected.append(int(update.value))
                    if len(collected) == 5:
                        break
                return collected

            def update_stream():
                gevent.sleep(0.1)
                for number in range(5):
                    store.write('/node/subnode/v3', str(number))
                    gevent.sleep(0.1)

            self.assertEqual(run_pair(update_stream, watch_stream), list(range(5)))

            store.delete('/node', dir=True, recursive=True)
Пример #15
0
class Test(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        import tempfile
        from mars import kvstore

        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.etcd_helper = None

    def tearDown(self):
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                p.kill()

        if self.etcd_helper:
            self.etcd_helper.stop()

    def start_processes(self, n_schedulers=1, n_workers=2, etcd=False, modules=None):
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '--level', 'debug',
                              '-p', p,
                              '--format', '%(asctime)-15s %(message)s']
                             + append_args)
            for p in scheduler_ports]
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', '1',
                              '--level', 'debug',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem']
                             + append_args)
            for _ in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError('Schedulers does not met requirement: %d < %d.' % (
                        len(started_schedulers), n_schedulers
                    ))
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(ResourceActor.default_name(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError('Workers does not met requirement: %d < %d.' % (
                        resource_ref.get_worker_count(), n_workers
                    ))
                break
            except:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                raise SystemError('Scheduler not started. exit code %s' % self.proc_scheduler.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None:
                raise SystemError('Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, session_ref, graph_key):
        check_time = time.time()
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > 60:
                raise SystemError('Check graph status timeout')
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)

    def testMainWithoutEtcd(self):
        self.start_processes(n_schedulers=2)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())

        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        # todo this behavior may change when eager mode is introduced
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.FAILED)

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_array_equal(loads(result), np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_array_equal(loads(result), expected)

    def testMainWithEtcd(self):
        self.start_processes(n_schedulers=2, etcd=True)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())
Пример #16
0
class SchedulerIntegratedTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Configure the spill directory, open the KV store and read the timeout.

        The timeout comes from the ``CHECK_TIMEOUT`` environment variable and
        falls back to 120.
        """
        from mars import kvstore as kvstore_module

        spill_dir = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        options.worker.spill_directory = spill_dir
        cls._kv_store = kvstore_module.get(options.kv_store)

        raw_timeout = os.environ.get('CHECK_TIMEOUT', 120)
        cls.timeout = int(raw_timeout)

    @classmethod
    def tearDownClass(cls):
        """Delete the worker spill directory when it is present."""
        import shutil

        if not os.path.exists(options.worker.spill_directory):
            return
        shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = dict()
        self.etcd_helper = None
        self.intentional_death_pids = set()

    def tearDown(self):
        for env, fn in self.state_files.items():
            os.environ.pop(env)
            if os.path.exists(fn):
                os.unlink(fn)

        self.terminate_processes()
        options.kv_store = ':inproc:'

    def terminate_processes(self):
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                self.kill_process_tree(p)

        if self.etcd_helper:
            self.etcd_helper.stop()

    def kill_process_tree(self, proc, intentional=True):
        """Kill ``proc`` through the module-level ``kill_process_tree`` helper.

        :param proc: process object exposing ``pid``
        :param intentional: when true, the pid is first added to
            ``self.intentional_death_pids``
        """
        pid = proc.pid
        if intentional:
            self.intentional_death_pids.add(pid)
        kill_process_tree(pid)

    def add_state_file(self, environ):
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(), f'test-main-{environ.lower()}-{os.getpid()}-{id(self)}')
        self.state_files[environ] = fn
        return fn

    def start_processes(self, *args, **kwargs):
        fail_count = 0
        while True:
            try:
                self._start_processes(*args, **kwargs)
                break
            except ProcessRequirementUnmetError:
                self.terminate_processes()
                fail_count += 1
                if fail_count >= 10:
                    raise
                time.sleep(5)
                logger.error('Failed to start service, retrying')

    def _start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False, modules=None,
                         log_scheduler=True, log_worker=True, env=None, scheduler_args=None,
                         worker_args=None, worker_cpu=1):
        """Spawn scheduler and worker subprocesses and wait until they are usable.

        :param n_schedulers: number of ``mars.scheduler`` processes to launch
        :param n_workers: number of ``mars.worker`` processes to launch
        :param etcd: when true, start an etcd helper and pass ``--kv-store``;
            otherwise pass the scheduler endpoints via ``--schedulers``
        :param cuda: accepted for interface compatibility; not read here
        :param modules: optional module names passed via ``--load-modules``
        :param log_scheduler: use log level ``debug`` (else ``warning``) for schedulers
        :param log_worker: use log level ``debug`` (else ``warning``) for workers
        :param env: extra environment variables for the subprocesses
        :param scheduler_args: extra command-line arguments for schedulers
        :param worker_args: extra command-line arguments for workers
        :param worker_cpu: value passed to the workers' ``--cpu-procs``
        :raises ProcessRequirementUnmetError: when schedulers, the session
            manager or workers are still missing after polling for 20 seconds
            (other errors raised while polling are re-raised unchanged)

        On success ``self.cluster_info`` and ``self.session_manager_ref`` are set.
        """
        # NOTE(review): Hub.NOT_ERROR is widened while the cluster is polled,
        # presumably so failed probes are not reported by the gevent hub --
        # confirm against the gevent documentation.
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)
        try:
            scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
            self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

            append_args = []
            # Copy the caller's lists: they are extended below and this method
            # may be retried with the very same argument objects.
            append_args_scheduler = list(scheduler_args or [])
            append_args_worker = list(worker_args or [])
            if modules:
                append_args.extend(['--load-modules', ','.join(modules)])

            if etcd:
                etcd_port = get_next_port()
                self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
                self.etcd_helper.run()
                options.kv_store = f'etcd://127.0.0.1:{etcd_port}'
                append_args.extend(['--kv-store', options.kv_store])
            else:
                append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

            if 'DUMP_GRAPH_DATA' in os.environ:
                append_args_scheduler += ['-Dscheduler.dump_graph_data=true']

            proc_env = os.environ.copy()
            if env:
                proc_env.update(env)

            self.proc_schedulers = [
                subprocess.Popen(
                    [sys.executable, '-m', 'mars.scheduler',
                     '-H', '127.0.0.1',
                     '-p', p,
                     '--log-level', 'debug' if log_scheduler else 'warning',
                     # The trailing comma matters: without it the next literal
                     # is concatenated into the log format string.
                     '--log-format', f'SCH{idx} %(asctime)-15s %(message)s',
                     '-Dscheduler.retry_delay=5',
                     '-Dscheduler.default_cpu_usage=0',
                     '-Dscheduler.status_timeout=10']
                    + append_args + append_args_scheduler, env=proc_env)
                for idx, p in enumerate(scheduler_ports)]
            cuda_count = resource.cuda_count()
            cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
                if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
            self.proc_workers = [
                subprocess.Popen(
                    [sys.executable, '-m', 'mars.worker',
                     '-a', '127.0.0.1',
                     '--cpu-procs', str(worker_cpu),
                     '--log-level', 'debug' if log_worker else 'warning',
                     '--log-format', f'WOR{idx} %(asctime)-15s %(message)s',
                     '--cache-mem', '16m',
                     '--ignore-avail-mem',
                     # Workers are assigned devices round-robin; an empty
                     # string is passed when no CUDA device is reported.
                     '--cuda-device', str(cuda_devices[idx % cuda_count]) if cuda_count else '',
                     '-Dworker.prepare_data_timeout=30']
                    + append_args + append_args_worker, env=proc_env)
                for idx in range(n_workers)
            ]

            actor_client = new_client()
            self.cluster_info = actor_client.actor_ref(
                SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

            check_time = time.time()
            while True:
                try:
                    try:
                        started_schedulers = self.cluster_info.get_schedulers()
                    except Exception as e:
                        raise ProcessRequirementUnmetError(
                            f'Failed to get scheduler numbers, {e}') from e
                    if len(started_schedulers) < n_schedulers:
                        raise ProcessRequirementUnmetError(
                            f'Schedulers does not met requirement: {len(started_schedulers)} < {n_schedulers}.')
                    actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
                    self.session_manager_ref = actor_client.actor_ref(
                        SessionManagerActor.default_uid(), address=actor_address)

                    actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
                    resource_ref = actor_client.actor_ref(ResourceActor.default_uid(), address=actor_address)

                    if not actor_client.has_actor(self.session_manager_ref) \
                            or resource_ref.get_worker_count() < n_workers:
                        raise ProcessRequirementUnmetError(
                            f'Workers does not met requirement: {resource_ref.get_worker_count()} < {n_workers}')
                    break
                except:  # noqa: E722
                    # Any failure while the processes are still coming up is
                    # retried until 20 seconds have passed, then re-raised.
                    if time.time() - check_time > 20:
                        raise
                    time.sleep(0.1)
        finally:
            # Restore the hub setting even when startup fails, so a retry does
            # not capture the widened tuple as the "old" value.
            gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                raise ProcessRequirementUnmetError(
                    f'Scheduler not started. exit code {self.proc_scheduler.poll()}')
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None and worker_proc.pid not in self.intentional_death_pids:
                raise ProcessRequirementUnmetError(
                    f'Worker not started. exit code {worker_proc.poll()}')

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        """Poll every 0.1 seconds until the graph reaches a terminated state.

        Roughly every 10 seconds the graph actor, if it is listed by the
        session, is asked to dump its unfinished terminals.

        :param actor_client: client used to obtain the graph actor reference
        :param session_ref: session actor reference
        :param graph_key: key of the submitted graph
        :return: the terminated state reported for the graph
        :raises SystemError: when the graph has not terminated within
            ``CHECK_TIMEOUT`` seconds (environment variable, default 120)

        Errors raised by ``check_process_statuses`` propagate unchanged.
        """
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    # The graph is not (or no longer) listed; nothing to dump.
                    pass
            # Query once so the returned state is the one that was tested.
            state = session_ref.graph_state(graph_key)
            if state in GraphState.TERMINATED_STATES:
                return state