Example #1
    def testKVStoreActor(self):
        etcd_port = get_next_port()
        proc_helper = EtcdProcessHelper(port_range_start=etcd_port)
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        with proc_helper.run(), create_actor_pool(n_process=1,
                                                  backend='gevent') as pool:
            store_ref = pool.create_actor(KVStoreActor,
                                          uid=KVStoreActor.default_name())

            store_ref.write('/node/v1', 'value1')
            store_ref.write('/node/v2', 'value2')
            store_ref.write_batch([
                ('/node/v2', 'value2'),
                ('/node/v3', 'value3'),
            ])

            self.assertEqual(store_ref.read('/node/v1').value, 'value1')
            self.assertListEqual([
                v.value for v in store_ref.read_batch(['/node/v2', '/node/v3'])
            ], ['value2', 'value3'])

            store_ref.delete('/node', dir=True, recursive=True)
            with self.assertRaises(KeyError):
                store_ref.delete('/node', dir=True, recursive=True)
            store_ref.delete('/node', dir=True, recursive=True, silent=True)
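This example (and the next) relies on names the listing does not import. A plausible header, reconstructed from the code above — the exact module paths vary across Mars versions, so treat each one as an assumption:

from mars.config import options                    # assumed path
from mars.utils import get_next_port               # assumed path
from mars.actors import create_actor_pool          # assumed path
from mars.scheduler.kvstore import KVStoreActor    # assumed path
from mars.tests.core import EtcdProcessHelper      # assumed path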
Example #2
    def testKVStoreActor(self):
        proc_helper = EtcdProcessHelper(port_range_start=54131)
        with proc_helper.run(), create_actor_pool(n_process=1,
                                                  backend='gevent') as pool:
            store_ref = pool.create_actor(KVStoreActor,
                                          uid=KVStoreActor.default_name())

            store_ref.write('/node/v1', 'value1')
            store_ref.write('/node/v2', 'value2')
            store_ref.write_batch([
                ('/node/v2', 'value2'),
                ('/node/v3', 'value3'),
            ])

            self.assertEqual(store_ref.read('/node/v1').value, 'value1')
            self.assertListEqual([
                v.value for v in store_ref.read_batch(['/node/v2', '/node/v3'])
            ], ['value2', 'value3'])
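The same write/read round trip can also be exercised without actors, through the plain kv store client that Example #3 obtains in setUpClass. A minimal sketch, assuming the in-process backend mirrors the actor API shown above (inferred from these snippets, not from documentation):

from mars import kvstore

# ':inproc:' is the default backend restored in Example #3's tearDown
store = kvstore.get(':inproc:')
store.write('/node/v1', 'value1')
assert store.read('/node/v1').value == 'value1'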
Example #3
File: test_main.py  Project: kevintsok/mars
class Test(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        from mars import kvstore

        options.worker.spill_directory = os.path.join(tempfile.gettempdir(),
                                                      'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

        try:
            delay_state_file = os.environ.get('DELAY_STATE_FILE')
            if delay_state_file:
                os.unlink(delay_state_file)
        except OSError:
            pass

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = []
        self.etcd_helper = None

    def tearDown(self):
        for fn in self.state_files:
            if os.path.exists(fn):
                os.unlink(fn)

        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                p.kill()

        if self.etcd_helper:
            self.etcd_helper.stop()
        options.kv_store = ':inproc:'

    def add_state_file(self, environ):
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ.lower(), os.getpid(), id(self)))
        self.state_files.append(fn)
        return fn

    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True):
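        # gevent's hub prints exceptions it considers errors; widening
        # NOT_ERROR to (Exception,) silences those reports while the child
        # processes start. The original tuple is restored at the end of
        # this method.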
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(
                ['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args += ['-Dscheduler.dump_graph_data=true']

        self.proc_schedulers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                '--level', 'debug' if log_scheduler else 'warning', '-p', p,
                '--format', '%(asctime)-15s %(message)s',
                '-Dscheduler.retry_delay=5'
            ] + append_args) for p in scheduler_ports
        ]
        self.proc_workers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                '--cpu-procs', '1', '--level',
                'debug' if log_worker else 'warning', '--cache-mem', '16m',
                '--ignore-avail-mem', '-Dworker.prepare_data_timeout=30'
            ] + append_args) for _ in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(),
            address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError(
                        'Scheduler count does not meet requirement: %d < %d.' %
                        (len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_name(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError(
                        'Worker count does not meet requirement: %d < %d.' %
                        (resource_ref.get_worker_count(), n_workers))
                break
            except:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                raise SystemError('Scheduler not started. exit code %s' %
                                  scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None:
                raise SystemError('Worker not started. exit code %s' %
                                  worker_proc.poll())

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
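            # Every ~10 seconds, ask the graph actor to log its unfinished
            # terminal operands, which helps diagnose a hang before the
            # timeout above fires.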
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    pass
            if session_ref.graph_state(
                    graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)

    def testMainWithoutEtcd(self):
        self.start_processes()

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
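        # a and b are filled with 2 * 1 + 1 == 3, so each element of
        # a * b * 2 + 1 equals 3 * 3 * 2 + 1 == 19 and the sum over the
        # 100x100 grid is 19 * 100 * 100 == 190000.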
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
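        # Every entry is a length-50 inner product of 3 * 3 terms,
        # hence 9 * 50 == 450.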
        assert_allclose(loads(result), np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = reduce(operator.add,
                          [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_allclose(loads(result), expected)

        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.params['_reshape_with_shuffle'] = True
        graph = b.build_graph()
        targets = [b.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, b.key)
        assert_allclose(loads(result), np.ones((27, 31)))

    def testMainWithEtcd(self):
        self.start_processes(etcd=True)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

    def testWorkerFailOver(self):
        def kill_process_tree(proc):
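            # Kill the worker and all of its children; remember the plasma
            # store's unix socket directory on the way, since a killed
            # plasma process leaves its socket files behind.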
            import psutil
            proc = psutil.Process(proc.pid)
            plasma_sock_dir = None
            for p in proc.children(recursive=True):
                if 'plasma' in p.name():
                    socks = [
                        conn.laddr for conn in p.connections('unix')
                        if 'plasma' in conn.laddr
                    ]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            proc.kill()
            if plasma_sock_dir:
                shutil.rmtree(plasma_sock_dir, ignore_errors=True)

        delay_file = self.add_state_file('DELAY_STATE_FILE')
        open(delay_file, 'w').close()

        terminate_file = self.add_state_file('TERMINATE_STATE_FILE')

        self.start_processes(modules=['mars.scheduler.tests.op_delayer'],
                             log_worker=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        np_a = np.random.random((100, 100))
        np_b = np.random.random((100, 100))

        a = mt.array(np_a, chunk_size=30) * 2 + 1
        b = mt.array(np_b, chunk_size=30) * 2 + 1
        c = a.dot(b) * 2 + 1
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        while not os.path.exists(terminate_file):
            actor_client.sleep(0.05)

        kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        self.proc_workers = self.proc_workers[1:]
        os.unlink(delay_file)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
        assert_allclose(loads(result), expected)
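The Test class above uses a sizable import header that the listing omits. The standard-library and numpy parts can be reconstructed with certainty from the names used; the Mars-internal paths are assumptions and are left commented:

import json
import logging
import operator
import os
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import uuid
from functools import reduce

import gevent.hub
import numpy as np
from numpy.testing import assert_allclose

import mars.tensor as mt
# Mars-internal imports; exact paths vary between versions:
# from mars.config import options
# from mars.actors import new_client
# from mars.utils import get_next_port
# from mars.scheduler import ClusterInfoActor, SessionManagerActor, ResourceActor
# from mars.scheduler.graph import GraphState
# from mars.serialize.dataserializer import loads

logger = logging.getLogger(__name__)  # a module-level logger is also assumed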
Example #4
File: base.py  Project: Haxine/mars-1
class SchedulerIntegratedTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        from mars import kvstore

        options.worker.spill_directory = os.path.join(tempfile.gettempdir(),
                                                      'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = dict()
        self.etcd_helper = None
        self.intentional_death_pids = set()

    def tearDown(self):
        for env, fn in self.state_files.items():
            os.environ.pop(env)
            if os.path.exists(fn):
                os.unlink(fn)

        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                self.kill_process_tree(p)

        if self.etcd_helper:
            self.etcd_helper.stop()
        options.kv_store = ':inproc:'

    def kill_process_tree(self, proc, intentional=True):
        if intentional:
            self.intentional_death_pids.add(proc.pid)

        import psutil
        proc = psutil.Process(proc.pid)
        plasma_sock_dir = None
        for p in proc.children(recursive=True):
            try:
                if 'plasma' in p.name():
                    socks = [
                        conn.laddr for conn in p.connections('unix')
                        if 'plasma' in conn.laddr
                    ]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            except psutil.NoSuchProcess:
                continue
        proc.kill()
        if plasma_sock_dir:
            shutil.rmtree(plasma_sock_dir, ignore_errors=True)

    def add_state_file(self, environ):
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ.lower(), os.getpid(), id(self)))
        self.state_files[environ] = fn
        return fn

    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        cuda=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True,
                        env=None):
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = []
        append_args_worker = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(
                ['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
        if not cuda:
            append_args_worker += ['--no-cuda']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                '-p', p, '--log-level',
                'debug' if log_scheduler else 'warning', '--log-format',
                'SCH%d %%(asctime)-15s %%(message)s' % idx,
                '-Dscheduler.retry_delay=5', '-Dscheduler.default_cpu_usage=0',
                '-Dscheduler.status_timeout=10'
            ] + append_args + append_args_scheduler,
                             env=proc_env)
            for idx, p in enumerate(scheduler_ports)
        ]
        cuda_count = resource.cuda_count()
        self.proc_workers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                '--cpu-procs', '1', '--log-level',
                'debug' if log_worker else 'warning', '--log-format',
                'WOR%d %%(asctime)-15s %%(message)s' % idx, '--cache-mem',
                '16m', '--ignore-avail-mem', '--cuda-device',
                str(idx % cuda_count) if cuda_count else '0',
                '-Dworker.prepare_data_timeout=30'
            ] + append_args + append_args_worker,
                             env=proc_env) for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(),
            address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        'Scheduler count does not meet requirement: %d < %d.' %
                        (len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_uid(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        'Worker count does not meet requirement: %d < %d.' %
                        (resource_ref.get_worker_count(), n_workers))
                break
            except:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                raise ProcessRequirementUnmetError(
                    'Scheduler not started. exit code %s' %
                    scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if (worker_proc.poll() is not None
                    and worker_proc.pid not in self.intentional_death_pids):
                raise ProcessRequirementUnmetError(
                    'Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    pass
            if session_ref.graph_state(
                    graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)
Example #5
class Test(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        import tempfile
        from mars import kvstore

        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.etcd_helper = None

    def tearDown(self):
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                p.kill()

        if self.etcd_helper:
            self.etcd_helper.stop()

    def start_processes(self, n_schedulers=1, n_workers=2, etcd=False, modules=None):
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '--level', 'debug',
                              '-p', p,
                              '--format', '%(asctime)-15s %(message)s']
                             + append_args)
            for p in scheduler_ports]
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', '1',
                              '--level', 'debug',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem']
                             + append_args)
            for _ in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError('Scheduler count does not meet requirement: %d < %d.' % (
                        len(started_schedulers), n_schedulers
                    ))
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(ResourceActor.default_name(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError('Worker count does not meet requirement: %d < %d.' % (
                        resource_ref.get_worker_count(), n_workers
                    ))
                break
            except:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                raise SystemError('Scheduler not started. exit code %s' % scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None:
                raise SystemError('Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, session_ref, graph_key):
        check_time = time.time()
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > 60:
                raise SystemError('Check graph status timeout')
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)

    def testMainWithoutEtcd(self):
        self.start_processes(n_schedulers=2)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())

        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        # todo this behavior may change when eager mode is introduced
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.FAILED)

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_array_equal(loads(result), np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_array_equal(loads(result), expected)

    def testMainWithEtcd(self):
        self.start_processes(n_schedulers=2, etcd=True)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())
Example #6
File: base.py  Project: winningsix/mars
class SchedulerIntegratedTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        from mars import kvstore

        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)
        cls.timeout = int(os.environ.get('CHECK_TIMEOUT', 120))

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = dict()
        self.etcd_helper = None
        self.intentional_death_pids = set()

    def tearDown(self):
        for env, fn in self.state_files.items():
            os.environ.pop(env)
            if os.path.exists(fn):
                os.unlink(fn)

        self.terminate_processes()
        options.kv_store = ':inproc:'

    def terminate_processes(self):
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                self.kill_process_tree(p)

        if self.etcd_helper:
            self.etcd_helper.stop()

    def kill_process_tree(self, proc, intentional=True):
        if intentional:
            self.intentional_death_pids.add(proc.pid)
        kill_process_tree(proc.pid)

    def add_state_file(self, environ):
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(), f'test-main-{environ.lower()}-{os.getpid()}-{id(self)}')
        self.state_files[environ] = fn
        return fn

    def start_processes(self, *args, **kwargs):
        fail_count = 0
        while True:
            try:
                self._start_processes(*args, **kwargs)
                break
            except ProcessRequirementUnmetError:
                self.terminate_processes()
                fail_count += 1
                if fail_count >= 10:
                    raise
                time.sleep(5)
                logger.error('Failed to start service, retrying')

    def _start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False, modules=None,
                         log_scheduler=True, log_worker=True, env=None, scheduler_args=None,
                         worker_args=None, worker_cpu=1):
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = scheduler_args or []
        append_args_worker = worker_args or []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = f'etcd://127.0.0.1:{etcd_port}'
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '-p', p,
                              '--log-level', 'debug' if log_scheduler else 'warning',
                              '--log-format', f'SCH{idx} %(asctime)-15s %(message)s',
                              '-Dscheduler.retry_delay=5',
                              '-Dscheduler.default_cpu_usage=0',
                              '-Dscheduler.status_timeout=10']
                             + append_args + append_args_scheduler, env=proc_env)
            for idx, p in enumerate(scheduler_ports)]
        cuda_count = resource.cuda_count()
        cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
            if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
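        # Spread workers across the visible CUDA devices round-robin; when
        # no device is available, an empty --cuda-device value is passed.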
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', str(worker_cpu),
                              '--log-level', 'debug' if log_worker else 'warning',
                              '--log-format', f'WOR{idx} %(asctime)-15s %(message)s',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem',
                              '--cuda-device', str(cuda_devices[idx % cuda_count]) if cuda_count else '',
                              '-Dworker.prepare_data_timeout=30']
                             + append_args + append_args_worker, env=proc_env)
            for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                except Exception as e:
                    raise ProcessRequirementUnmetError(f'Failed to get scheduler numbers, {e}')
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        f'Scheduler count does not meet requirement: {len(started_schedulers)} < {n_schedulers}.')
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(ResourceActor.default_uid(), address=actor_address)

                if not actor_client.has_actor(self.session_manager_ref) \
                        or resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        f'Worker count does not meet requirement: {resource_ref.get_worker_count()} < {n_workers}')
                break
            except:  # noqa: E722
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                raise ProcessRequirementUnmetError(
                    f'Scheduler not started. exit code {scheduler_proc.poll()}')
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None and worker_proc.pid not in self.intentional_death_pids:
                raise ProcessRequirementUnmetError(
                    f'Worker not started. exit code {worker_proc.poll()}')

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    pass
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)
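A base class like this is meant to be subclassed, as the Test classes in Examples #3 and #5 illustrate. A condensed, hypothetical sketch of a derived test (names such as MyIntegrationTest and testSimpleGraph are illustrative only):

class MyIntegrationTest(SchedulerIntegratedTest):
    def testSimpleGraph(self):
        self.start_processes(n_schedulers=2, n_workers=2)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        # Build a small tensor graph and submit it for execution.
        c = (mt.ones((100, 100), chunk_size=30) * 2 + 1).sum()
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(c.build_graph().to_json()),
                                        graph_key, target_tensors=[c.key])

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)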