def _create_sockets(self):
    """ Each worker has three sockets at start:

    (1) request_master_socket: sends the job address to the master node.
    (2) reply_job_socket: receives the job_address from the subprocess.
    (3) kill_job_socket: receives commands to kill the job from jobs.

    When a job starts, a new heartbeat socket is created to receive
    heartbeat signals from the job.
    """
    self.worker_ip = get_ip_address()

    # request_master_socket: sends the job address to the master
    self.request_master_socket = self.ctx.socket(zmq.REQ)
    self.request_master_socket.linger = 0

    # wait for 0.5 seconds to check whether the master is started
    self.request_master_socket.setsockopt(zmq.RCVTIMEO, 500)
    self.request_master_socket.connect("tcp://" + self.master_address)

    # reply_job_socket: receives the job_address from the subprocess
    self.reply_job_socket = self.ctx.socket(zmq.REP)
    self.reply_job_socket.linger = 0
    reply_job_port = self.reply_job_socket.bind_to_random_port("tcp://*")
    self.reply_job_address = "{}:{}".format(self.worker_ip, reply_job_port)

    # kill_job_socket: receives kill commands from jobs
    self.kill_job_socket = self.ctx.socket(zmq.REP)
    self.kill_job_socket.linger = 0
    kill_job_port = self.kill_job_socket.bind_to_random_port("tcp://*")
    self.kill_job_address = "{}:{}".format(self.worker_ip, kill_job_port)
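# --- Hedged sketch (not part of the original worker code) -------------------
# The docstring above notes that a new heartbeat socket is created for every
# job that starts. A minimal helper for that step might look like the sketch
# below; the name `create_job_heartbeat_socket` and the 500 ms timeout are
# assumptions for illustration only.
import zmq

def create_job_heartbeat_socket(ctx, worker_ip):
    """Bind a fresh REP socket on a random port for one job's heartbeats."""
    heartbeat_socket = ctx.socket(zmq.REP)
    heartbeat_socket.linger = 0
    heartbeat_socket.setsockopt(zmq.RCVTIMEO, 500)  # assumed timeout
    port = heartbeat_socket.bind_to_random_port("tcp://*")
    return heartbeat_socket, "{}:{}".format(worker_ip, port)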
def status():
    if _IS_WINDOWS:
        cmd = r'''wmic process where "commandline like '%remote\\start.py --name worker --address%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''
    else:
        cmd = r'ps -ef | grep remote/start.py\ --name\ worker\ --address'
    content = os.popen(cmd).read().strip()
    pattern = re.compile('--address (.*?) --cpu')
    clusters = set(pattern.findall(content))
    if len(clusters) == 0:
        click.echo('No active cluster is found.')
    else:
        ctx = zmq.Context()
        status = []
        for cluster in clusters:
            if _IS_WINDOWS:
                cmd = r'''wmic process where "commandline like '%address {}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format(
                    cluster)
            else:
                cmd = r'ps -ef | grep address\ {}'.format(cluster)
            content = os.popen(cmd).read()
            pattern = re.compile('--monitor_port (.*?)\n', re.S)
            monitors = pattern.findall(content)
            if len(monitors):
                monitor_port, _, master_address = monitors[0].split(' ')
                monitor_address = "{}:{}".format(get_ip_address(),
                                                 monitor_port)
                socket = ctx.socket(zmq.REQ)
                socket.setsockopt(zmq.RCVTIMEO, 10000)
                socket.connect('tcp://{}'.format(master_address))
                try:
                    socket.send_multipart([STATUS_TAG])
                    monitor_info = to_str(socket.recv_multipart()[1])
                except zmq.error.Again as e:
                    click.echo(
                        'Cannot connect to cluster {}, please try again later.'.
                        format(master_address))
                    socket.close(0)
                    continue
                msg = """
# Cluster {} {}
# If you want to check cluster status, please view: http://{}
""".format(master_address, monitor_info, monitor_address)
                status.append(msg)
                socket.close(0)
            else:
                msg = """
# Cluster {} fails to start the cluster monitor.
""".format(cluster)
                status.append(msg)

        for monitor_status in status:
            click.echo(monitor_status)
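# --- Hedged illustration (standalone, with a made-up sample line) -----------
# Shows how the `--address (.*?) --cpu` pattern used in status() extracts
# cluster addresses from a process listing. The sample command line below is
# hypothetical.
import re

sample = ("user 1234 1 0 python remote/start.py --name worker "
          "--address 192.168.0.5:8010 --cpu_num 4")
pattern = re.compile('--address (.*?) --cpu')
print(set(pattern.findall(sample)))  # expected: {'192.168.0.5:8010'}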
def __init__(self, port, monitor_port=None):
    self.ctx = zmq.Context()
    self.master_ip = get_ip_address()
    self.monitor_url = "http://{}:{}".format(self.master_ip, monitor_port)
    logger.set_dir(
        os.path.expanduser('~/.parl_data/master/{}_{}'.format(
            self.master_ip, port)))
    self.client_socket = self.ctx.socket(zmq.REP)
    self.client_socket.bind("tcp://*:{}".format(port))
    self.client_socket.linger = 0
    self.port = port

    self.job_center = JobCenter(self.master_ip)
    self.cluster_monitor = ClusterMonitor()
    self.master_is_alive = True
    self.client_hostname = defaultdict(int)
def __init__(self, worker_address, log_server_address):
    """
    Args:
        worker_address(str): worker address to which the job sends its
            information (e.g., pid).
        log_server_address(str): address of the log server.

    Attributes:
        pid (int): Job process ID.
        max_memory (float): Maximum memory (MB) that can be used by each
            remote instance.
    """
    self.max_memory = None

    self.job_address_receiver, job_address_sender = Pipe()
    self.job_id_receiver, job_id_sender = Pipe()

    self.worker_address = worker_address
    self.log_server_address = log_server_address
    self.job_ip = get_ip_address()
    self.pid = os.getpid()

    self.run_job_process = Process(
        target=self.run, args=(job_address_sender, job_id_sender))
    self.run_job_process.start()
    """
    NOTE:
        On Windows, creating a threading.Lock before starting
        multiprocessing.Process raises errors.
    """
    self.lock = threading.Lock()
    self._create_sockets()

    process = psutil.Process(self.pid)
    self.init_memory = float(process.memory_info()[0]) / (1024**2)

    self.run_job_process.join()

    with self.lock:
        self.kill_job_socket.send_multipart(
            [remote_constants.KILLJOB_TAG,
             to_byte(self.job_address)])
        try:
            _ = self.kill_job_socket.recv_multipart()
        except zmq.error.Again as e:
            pass
    os._exit(0)
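# --- Hedged sketch (hypothetical helper, not in the original file) ----------
# `init_memory` recorded above and the `max_memory` attribute could be
# combined roughly like this to decide whether a remote instance has exceeded
# its memory budget (both values in MB).
import psutil

def exceeds_memory_limit(pid, init_memory, max_memory):
    if max_memory is None:
        return False
    used_mb = float(psutil.Process(pid).memory_info()[0]) / (1024**2)
    return used_mb - init_memory > max_memory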
def run(self, job_address_sender, job_id_sender):
    """An infinite loop waiting for a new task.

    Args:
        job_address_sender(sending end of multiprocessing.Pipe): sends the
            job address of reply_socket to the main process.
        job_id_sender(sending end of multiprocessing.Pipe): sends the job id
            to the main process.
    """
    ctx = zmq.Context()

    # create the reply_socket
    reply_socket = ctx.socket(zmq.REP)
    job_port = reply_socket.bind_to_random_port(addr="tcp://*")
    reply_socket.linger = 0
    job_ip = get_ip_address()
    job_address = "{}:{}".format(job_ip, job_port)

    job_id = job_address.replace(':', '_') + '_' + str(int(time.time()))
    self.log_dir = os.path.expanduser('~/.parl_data/job/{}'.format(job_id))
    logger.set_dir(self.log_dir)
    logger.info(
        "[Job] Job {} initialized. Reply heartbeat socket Address: {}.".
        format(job_id, job_address))

    job_address_sender.send(job_address)
    job_id_sender.send(job_id)

    try:
        # receive the source code from the actor, then put the unpacked
        # directory on sys.path and switch to it
        envdir = self.wait_for_files(reply_socket, job_address)
        sys.path.insert(0, envdir)
        os.chdir(envdir)

        obj = self.wait_for_connection(reply_socket)
        assert obj is not None
        self.single_task(obj, reply_socket, job_address)
    except Exception as e:
        logger.error(
            "Error occurred while running a single task. This job will be reset."
            "\nReason: {}".format(e))
        traceback_str = str(traceback.format_exc())
        logger.error("traceback:\n{}".format(traceback_str))
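# --- Hedged standalone illustration of the Pipe hand-off used above ---------
# run() sends the bound job address (and job id) back through
# multiprocessing.Pipe; the parent process reads them from the receiving ends
# created in __init__. The self-contained demo below mirrors that pattern with
# a made-up address.
from multiprocessing import Pipe, Process

def _child(sender):
    sender.send("127.0.0.1:56789")  # stand-in for the real job address

if __name__ == '__main__':
    receiver, sender = Pipe()
    p = Process(target=_child, args=(sender,))
    p.start()
    print(receiver.recv())  # parent obtains the job address
    p.join()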
def start_worker(address, cpu_num, log_server_port_range):
    start, end = parse_port_range(log_server_port_range)
    log_server_port = get_port_from_range(start, end)

    if not is_master_started(address):
        raise Exception("Worker cannot connect to the master node; "
                        "please check whether the input address {} "
                        "is correct.".format(address))
    cpu_num = str(cpu_num) if cpu_num else ''
    start_file = __file__.replace('scripts.pyc', 'start.py')
    start_file = start_file.replace('scripts.py', 'start.py')

    command = [
        sys.executable, start_file, "--name", "worker", "--address", address,
        "--cpu_num", str(cpu_num), "--log_server_port", str(log_server_port)
    ]
    p = subprocess.Popen(command)

    if not is_log_server_started(get_ip_address(), log_server_port):
        click.echo("# Failed to start the log server.")
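# --- Hedged sketch (hypothetical reimplementation for illustration) ---------
# `is_master_started` is assumed here to probe the master address with a
# short-timeout REQ socket; the probe payload and timeout are made up and do
# not reflect the real helper.
import zmq

def is_master_started_sketch(address, timeout_ms=500):
    ctx = zmq.Context()
    socket = ctx.socket(zmq.REQ)
    socket.linger = 0
    socket.setsockopt(zmq.RCVTIMEO, timeout_ms)
    socket.connect("tcp://" + address)
    socket.send_multipart([b'[PROBE]'])  # hypothetical probe tag
    try:
        socket.recv_multipart()
        return True
    except zmq.error.Again:
        return False
    finally:
        socket.close(0)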
def _reply_heartbeat(self):
    """Reply heartbeat signals to the master node."""
    socket = self.ctx.socket(zmq.REP)
    socket.linger = 0
    socket.setsockopt(zmq.RCVTIMEO,
                      remote_constants.HEARTBEAT_RCVTIMEO_S * 1000)
    reply_master_heartbeat_port = \
        socket.bind_to_random_port(addr="tcp://*")
    self.reply_master_heartbeat_address = "{}:{}".format(
        get_ip_address(), reply_master_heartbeat_port)
    self.heartbeat_socket_initialized.set()

    connected = False
    while self.client_is_alive and self.master_is_alive:
        try:
            message = socket.recv_multipart()
            elapsed_time = datetime.timedelta(
                seconds=int(time.time() - self.start_time))
            socket.send_multipart([
                remote_constants.HEARTBEAT_TAG,
                to_byte(self.executable_path),
                to_byte(str(self.actor_num)),
                to_byte(str(elapsed_time)),
                to_byte(str(self.log_monitor_url)),
            ])  # TODO: remove additional information
        except zmq.error.Again as e:
            if connected:
                logger.warning("[Client] Cannot connect to the master. "
                               "Please check if it is still alive.")
            else:
                logger.warning(
                    "[Client] Cannot connect to the master. "
                    "Please check the firewall between the client and the "
                    "master (e.g., ping the master IP).")
            self.master_is_alive = False
    socket.close(0)
    logger.warning("Client stops replying heartbeat signals to the master.")
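# --- Hedged sketch (hypothetical counterpart on the master side) ------------
# The REP socket above expects a peer that periodically sends a request and
# reads back the five status fields replied in send_multipart. A minimal
# pinger could look like this; the interval, payload, and function name are
# assumptions for illustration only.
import time
import zmq

def ping_client_heartbeat(ctx, heartbeat_address, interval_s=10):
    socket = ctx.socket(zmq.REQ)
    socket.linger = 0
    socket.setsockopt(zmq.RCVTIMEO, interval_s * 1000)
    socket.connect("tcp://" + heartbeat_address)
    while True:
        socket.send_multipart([b'[HEARTBEAT]'])  # assumed ping payload
        # the five fields mirror the client's reply above
        tag, exe_path, actor_num, elapsed, log_url = socket.recv_multipart()
        time.sleep(interval_s)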
def start_master(port, cpu_num, monitor_port, debug, log_server_port_range):
    if debug:
        os.environ['DEBUG'] = 'True'

    if not is_port_available(port):
        raise Exception(
            "The master address localhost:{} is already in use.".format(port))

    if monitor_port and not is_port_available(monitor_port):
        raise Exception(
            "The input monitor port localhost:{} is already in use.".format(
                monitor_port))

    cpu_num = int(
        cpu_num) if cpu_num is not None else multiprocessing.cpu_count()

    start_file = __file__.replace('scripts.pyc', 'start.py')
    start_file = start_file.replace('scripts.py', 'start.py')
    monitor_file = __file__.replace('scripts.pyc', 'monitor.py')
    monitor_file = monitor_file.replace('scripts.py', 'monitor.py')

    monitor_port = monitor_port if monitor_port else get_free_tcp_port()

    start, end = parse_port_range(log_server_port_range)
    log_server_port = get_port_from_range(start, end)
    while log_server_port == monitor_port or log_server_port == port:
        log_server_port = get_port_from_range(start, end)

    master_command = [
        sys.executable, start_file, "--name", "master", "--port", port,
        "--monitor_port", monitor_port,
    ]
    worker_command = [
        sys.executable, start_file, "--name", "worker", "--address",
        "localhost:" + str(port), "--cpu_num", str(cpu_num),
        '--log_server_port', str(log_server_port)
    ]
    monitor_command = [
        sys.executable, monitor_file, "--monitor_port", str(monitor_port),
        "--address", "localhost:" + str(port)
    ]

    FNULL = open(os.devnull, 'w')

    # Redirect the output to DEVNULL to suppress warning logs.
    _ = subprocess.Popen(
        master_command, stdout=FNULL, stderr=subprocess.STDOUT)
    if cpu_num > 0:
        # Sleep 1s to wait for the master to be ready.
        time.sleep(1)
        _ = subprocess.Popen(
            worker_command, stdout=FNULL, stderr=subprocess.STDOUT)

    if _IS_WINDOWS:
        # TODO(@zenghsh3) redirecting stdout of monitor subprocess to FNULL will cause occasional failure
        tmp_file = tempfile.TemporaryFile()
        _ = subprocess.Popen(monitor_command, stdout=tmp_file)
        tmp_file.close()
    else:
        _ = subprocess.Popen(
            monitor_command, stdout=FNULL, stderr=subprocess.STDOUT)
    FNULL.close()

    if cpu_num > 0:
        monitor_info = """
# The Parl cluster is started at localhost:{}.
# A local worker with {} CPUs is connected to the cluster.
# Starting the cluster monitor...""".format(
            port,
            cpu_num,
        )
    else:
        monitor_info = """
# The Parl cluster is started at localhost:{}.
# Starting the cluster monitor...""".format(port)
    click.echo(monitor_info)

    # check whether the monitor is started
    monitor_is_started = False
    if _IS_WINDOWS:
        cmd = r'''wmic process where "commandline like '%remote\\monitor.py --monitor_port {} --address localhost:{}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format(
            monitor_port, port)
    else:
        cmd = r'ps -ef | grep -v grep | grep remote/monitor.py\ --monitor_port\ {}\ --address\ localhost:{}'.format(
            monitor_port, port)
    for i in range(3):
        check_monitor_is_started = os.popen(cmd).read()
        if len(check_monitor_is_started) > 0:
            monitor_is_started = True
            break
        time.sleep(3)

    master_ip = get_ip_address()
    if monitor_is_started:
        start_info = """
## If you want to check cluster status, please view: http://{}:{} or call: xparl status""".format(
            master_ip, monitor_port)
    else:
        start_info = "# Failed to start the cluster monitor."

    monitor_info = """
{}
## If you want to add more CPU resources, please call: xparl connect --address {}:{}
## If you want to shutdown the cluster, please call: xparl stop
""".format(start_info, master_ip, port)
    click.echo(monitor_info)

    if not is_log_server_started(master_ip, log_server_port):
        click.echo("# Failed to start the log server.")
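# --- Usage note (hedged) -----------------------------------------------------
# start_master() backs the `xparl` command line referenced in its own output
# above; a typical invocation (flag names assumed from the commands built in
# this function) would be:
#
#     xparl start --port 8010 --cpu_num 4
#
# followed by `xparl status` to query the monitor, `xparl connect --address
# <master_ip>:8010` to add workers, and `xparl stop` to shut the cluster down.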