Exemplo n.º 1
0
    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        cuda=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True,
                        env=None):
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = []
        append_args_worker = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(
                ['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
        if not cuda:
            append_args_worker += ['--no-cuda']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                '-p', p, '--log-level',
                'debug' if log_scheduler else 'warning', '--log-format',
                'SCH%d %%(asctime)-15s %%(message)s' % idx,
                '-Dscheduler.retry_delay=5', '-Dscheduler.default_cpu_usage=0',
                '-Dscheduler.status_timeout=10'
            ] + append_args + append_args_scheduler,
                             env=proc_env)
            for idx, p in enumerate(scheduler_ports)
        ]
        cuda_count = resource.cuda_count()
        self.proc_workers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                '--cpu-procs', '1', '--log-level',
                'debug' if log_worker else 'warning', '--log-format',
                'WOR%d %%(asctime)-15s %%(message)s' % idx, '--cache-mem',
                '16m', '--ignore-avail-mem', '--cuda-device',
                str(idx % cuda_count) if cuda_count else '0',
                '-Dworker.prepare_data_timeout=30'
            ] + append_args + append_args_worker,
                             env=proc_env) for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(),
            address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        'Schedulers does not met requirement: %d < %d.' %
                        (len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_uid(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        'Workers does not met requirement: %d < %d.' %
                        (resource_ref.get_worker_count(), n_workers))
                break
            except:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors
Exemplo n.º 2
0
    def _start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False, modules=None,
                         log_scheduler=True, log_worker=True, env=None, scheduler_args=None,
                         worker_args=None, worker_cpu=1):
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = scheduler_args or []
        append_args_worker = worker_args or []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = f'etcd://127.0.0.1:{etcd_port}'
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '-p', p,
                              '--log-level', 'debug' if log_scheduler else 'warning',
                              '--log-format', f'SCH{idx} %(asctime)-15s %(message)s'
                              '-Dscheduler.retry_delay=5',
                              '-Dscheduler.default_cpu_usage=0',
                              '-Dscheduler.status_timeout=10']
                             + append_args + append_args_scheduler, env=proc_env)
            for idx, p in enumerate(scheduler_ports)]
        cuda_count = resource.cuda_count()
        cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
            if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', str(worker_cpu),
                              '--log-level', 'debug' if log_worker else 'warning',
                              '--log-format', f'WOR{idx} %(asctime)-15s %(message)s',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem',
                              '--cuda-device', str(cuda_devices[idx % cuda_count]) if cuda_count else '',
                              '-Dworker.prepare_data_timeout=30']
                             + append_args + append_args_worker, env=proc_env)
            for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                except Exception as e:
                    raise ProcessRequirementUnmetError(f'Failed to get scheduler numbers, {e}')
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        f'Schedulers does not met requirement: {len(started_schedulers)} < {n_schedulers}.')
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(ResourceActor.default_uid(), address=actor_address)

                if not actor_client.has_actor(self.session_manager_ref) \
                        or resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        f'Workers does not met requirement: {resource_ref.get_worker_count()} < {n_workers}')
                break
            except:  # noqa: E722
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors