def start(self): """ Start multiple workerpools, possibly on remote servers via ssh, assuming there is an active streamer. """ starting = [] for host, cores in self.host_cores: if general.socket_ready((host, self.ctrl_port)): print('%s:%s already running' % (host, self.ctrl_port)) continue ctrl_url = 'tcp://0.0.0.0:%s' % self.ctrl_port if host == '127.0.0.1': # localhost args = [sys.executable] else: args = [ 'ssh', '-f', '-T', f'{self.remote_user}@{host}', self.remote_python ] args += [ '-m', 'openquake.baselib.workerpool', ctrl_url, '-n', cores ] if host != '127.0.0.1': print('%s: if it hangs, check the ssh keys' % ' '.join(args)) self.popens.append(subprocess.Popen(args)) starting.append(host) return 'starting %s' % starting
def set_concurrent_tasks_default(calc):
    """
    Set the default for concurrent_tasks based on the available
    worker pools.
    """
    num_workers = 0
    w = config.zworkers
    if w.host_cores:
        host_cores = [hc.split() for hc in w.host_cores.split(',')]
    else:
        host_cores = []
    for host, _cores in host_cores:
        url = 'tcp://%s:%s' % (host, w.ctrl_port)
        with z.Socket(url, z.zmq.REQ, 'connect') as sock:
            if not general.socket_ready(url):
                logging.warning('%s is not running', host)
                continue
            num_workers += sock.send('get_num_workers')
    if num_workers == 0:
        num_workers = os.cpu_count()
        logging.warning('Missing host_cores, no idea about how many cores '
                        'are available, using %d', num_workers)
    parallel.CT = num_workers * 2
    OqParam.concurrent_tasks.default = num_workers * 2
    logging.warning('Using %d zmq workers', num_workers)
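# Illustration of the zworkers.host_cores format consumed above: a
# comma-separated list of "<host> <cores>" entries (values hypothetical),
# split exactly as in the loop.
host_cores = '192.168.1.10 8,192.168.1.11 16'
pairs = [hc.split() for hc in host_cores.split(',')]
assert pairs == [['192.168.1.10', '8'], ['192.168.1.11', '16']]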
def start(self, streamer=False):
    """
    Start multiple workerpools, possibly on remote servers via ssh,
    and possibly a streamer, depending on the `streamer` flag.

    :param streamer: if True, starts a streamer with
        multiprocessing.Process
    """
    if streamer and not general.socket_ready(self.task_in_url):
        # the streamer is not already running
        self.streamer = multiprocessing.Process(
            target=_streamer,
            args=(self.master_host, self.task_in_port, self.task_out_port))
        self.streamer.start()
    starting = []
    for host, cores in self.host_cores:
        if self.status(host)[0][1] == 'running':
            print('%s:%s already running' % (host, self.ctrl_port))
            continue
        ctrl_url = 'tcp://%s:%s' % (host, self.ctrl_port)
        if host == '127.0.0.1':  # localhost
            args = [sys.executable]
        else:
            args = ['ssh', host, self.remote_python]
        args += ['-m', 'openquake.baselib.workerpool', ctrl_url,
                 self.task_out_url, cores]
        starting.append(' '.join(args))
        po = subprocess.Popen(args)
        self.pids.append(po.pid)
    return 'starting %s' % starting
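# _streamer is referenced above but not defined in these snippets; a
# plausible sketch (an assumption, not the library source) is a zmq
# PULL->PUSH proxy forwarding tasks from the master to the workers:
import zmq

def _streamer(host, task_in_port, task_out_port):
    context = zmq.Context()
    frontend = context.socket(zmq.PULL)  # tasks submitted by the master
    backend = context.socket(zmq.PUSH)   # tasks pulled by the workers
    frontend.bind('tcp://%s:%s' % (host, task_in_port))
    backend.bind('tcp://%s:%s' % (host, task_out_port))
    zmq.proxy(frontend, backend)  # blocks forever, forwarding messages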
@classmethod
def setUpClass(cls):
    cls.z = config.zworkers.copy()
    host_cores = '127.0.0.1 4'
    hostport = '127.0.0.1', int(cls.z['ctrl_port']) + 1
    if not socket_ready(hostport):
        raise unittest.SkipTest('The task streamer is off')
    cls.master = WorkerMaster('127.0.0.1', cls.z['ctrl_port'], host_cores)
    cls.master.start()
def get_status(address=None):
    """
    Check if the DbServer is up.

    :param address: pair (hostname, port)
    :returns: 'running' or 'not-running'
    """
    address = address or (config.dbserver.host, DBSERVER_PORT)
    return 'running' if socket_ready(address) else 'not-running'
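# socket_ready is used throughout but not shown; a plausible sketch (an
# assumption, not the openquake.baselib.general source) is a TCP connect
# probe. Note the callers pass both (host, port) pairs and
# 'tcp://host:port' strings, so both forms are accepted here.
import socket

def socket_ready(hostport):
    if isinstance(hostport, str):  # e.g. 'tcp://127.0.0.1:1907'
        host, port = hostport.rsplit(':', 1)
        hostport = (host.replace('tcp://', ''), int(port))
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        return sock.connect_ex(hostport) == 0  # 0 means something listens
    finally:
        sock.close()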
def status(self, host=None):
    """
    :returns: a list of pairs (hostname, 'running'|'not-running')
    """
    if host is None:
        host_cores = self.host_cores
    else:
        host_cores = [hc for hc in self.host_cores if hc[0] == host]
    lst = []
    for host, _ in host_cores:
        ready = general.socket_ready((host, self.ctrl_port))
        lst.append((host, 'running' if ready else 'not-running'))
    return lst
def check_status(**kw):
    """
    :returns: a non-empty error string if the streamer or worker pools
        are down
    """
    c = config.zworkers.copy()
    c.update(kw)
    hostport = config.dbserver.listen, int(c['ctrl_port']) + 1
    errors = []
    if not general.socket_ready(hostport):
        errors.append('The task streamer on %s:%s is down' % hostport)
    for host, status in WorkerMaster(**c).status():
        if status != 'running':
            errors.append('The workerpool on %s is down' % host)
    return '\n'.join(errors)
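# Usage sketch (hypothetical): run the health check before submitting
# work and abort with the collected error lines if anything is down.
import sys
errors = check_status()
if errors:
    sys.exit(errors)  # e.g. "The task streamer on 127.0.0.1:1908 is down"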
def status(self): """ :returns: a list [(host, running, total), ...] """ executing = [] for host, _cores in self.host_cores: if not general.socket_ready((host, self.ctrl_port)): continue ctrl_url = 'tcp://%s:%s' % (host, self.ctrl_port) with z.Socket(ctrl_url, z.zmq.REQ, 'connect') as sock: running = len(sock.send('get_executing').split()) total = sock.send('get_num_workers') executing.append((host, running, total)) return executing
def set_concurrent_tasks_default(job_id):
    """
    Set the default for concurrent_tasks based on the available
    worker pools.
    """
    num_workers = 0
    w = config.zworkers
    for host, _cores in [hc.split() for hc in w.host_cores.split(',')]:
        url = 'tcp://%s:%s' % (host, w.ctrl_port)
        with z.Socket(url, z.zmq.REQ, 'connect') as sock:
            if not general.socket_ready(url):
                logs.LOG.warn('%s is not running', host)
                continue
            num_workers += sock.send('get_num_workers')
    OqParam.concurrent_tasks.default = num_workers * 2
    logs.LOG.warn('Using %d zmq workers', num_workers)
def kill(self):
    """
    Send a "kill" command to all worker pools
    """
    killed = []
    for host, _ in self.host_cores:
        if not general.socket_ready((host, self.ctrl_port)):
            continue
        ctrl_url = 'tcp://%s:%s' % (host, self.ctrl_port)
        with z.Socket(ctrl_url, z.zmq.REQ, 'connect') as sock:
            sock.send('kill')
            killed.append(host)
    for popen in self.popens:
        popen.kill()
    self.popens = []
    return 'killed %s' % killed
def set_concurrent_tasks_default(job_id):
    """
    Set the default for concurrent_tasks based on the available
    worker pools.
    """
    num_workers = 0
    w = config.zworkers
    for host, _cores in [hc.split() for hc in w.host_cores.split(',')]:
        url = 'tcp://%s:%s' % (host, w.ctrl_port)
        with z.Socket(url, z.zmq.REQ, 'connect') as sock:
            if not general.socket_ready(url):
                logs.LOG.warn('%s is not running', host)
                continue
            num_workers += sock.send('get_num_workers')
    OqParam.concurrent_tasks.default = num_workers * 3
    logs.LOG.warn('Using %d zmq workers', num_workers)
def stop(self):
    """
    Send a "stop" command to all worker pools
    """
    stopped = []
    for host, _ in self.host_cores:
        if not general.socket_ready((host, self.ctrl_port)):
            continue
        ctrl_url = 'tcp://%s:%s' % (host, self.ctrl_port)
        with z.Socket(ctrl_url, z.zmq.REQ, 'connect') as sock:
            sock.send('stop')
            stopped.append(host)
    for popen in self.popens:
        popen.terminate()
        # since we are not consuming any output from the spawned process,
        # we must call wait() after terminate() so that Popen fully
        # deallocates the process file descriptors; otherwise zombies
        # will arise
        popen.wait()
    self.popens = []
    return 'stopped %s' % stopped
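# Design note: stop() uses terminate() (SIGTERM on POSIX) for a graceful
# shutdown, while kill() above uses SIGKILL; both also notify the remote
# pools over the control socket. The terminate()+wait() pairing is the
# standard way to reap a child process, as this standalone demo shows:
import subprocess, sys
po = subprocess.Popen([sys.executable, '-c', 'import time; time.sleep(60)'])
po.terminate()
po.wait()  # reap the child; without this it would linger as a zombie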