def __init__(self, sandbox_dir, source_addresses=None, source_pub_addresses=None,
             rep_addresses=None, pub_addresses=None, experiment_uuid='',
             name='obci_process_supervisor'):
    """Initialise process-supervisor state, then run common peer setup.

    :param sandbox_dir: working directory for launched peers (falls back
        to ``settings.DEFAULT_SANDBOX_DIR`` when empty).
    :param source_addresses: addresses of the parent experiment to
        register with (handled by the base class).
    :param source_pub_addresses: PUB addresses of the parent, used both
        for subscribing and for choosing the multiplexer address.
    :param experiment_uuid: uuid of the owning experiment.
    """
    self.peers = {}
    self.status = launcher_tools.READY_TO_LAUNCH
    self.source_pub_addresses = source_pub_addresses
    self.machine = socket.gethostname()
    self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
    # The ZMQ context must exist before set_mx_data() — it opens a probe
    # socket to reserve a port for the multiplexer.
    self.ctx = zmq.Context()
    self.mx_data = self.set_mx_data()
    self.env = self.peer_env(self.mx_data)
    self.launch_data = []
    self.peer_order = []
    self._running_peer_order = []
    self._current_part = None
    self.experiment_uuid = experiment_uuid
    self.peers_to_launch = []
    self.processes = {}
    self.restarting = []
    super(OBCIProcessSupervisor, self).__init__(
        source_addresses=source_addresses,
        rep_addresses=rep_addresses,
        pub_addresses=pub_addresses,
        name=name)
    # Pass our logger so subprocess events land in the supervisor's log,
    # consistent with the other control peers in this file.
    self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
def __init__(self, rep_addresses=None, pub_addresses=None, name='obci_server'):
    """Set up server registries, ports and the two discovery threads."""
    self.experiments = {}
    self.exp_process_supervisors = {}
    self._nearby_servers = net.DNS()

    # Base-class init creates logger, sockets, message tool, uuid.
    super(OBCIServer, self).__init__(None, rep_addresses, pub_addresses, name)

    self.machine = socket.gethostname()
    self.rep_port = int(net.server_rep_port())
    self.pub_port = int(net.server_pub_port())
    broadcast_port = int(net.server_bcast_port())
    self._nearby_servers.logger = self.logger

    # Beacon thread announcing this server's ports on the local network.
    self._bcast_server = threading.Thread(
        target=broadcast_server,
        args=[self.uuid, self.rep_port, self.pub_port, broadcast_port])
    self._bcast_server.daemon = True
    self._bcast_server.start()

    # Listener thread keeping the nearby-server registry fresh.
    self._nearby_updater = threading.Thread(
        target=update_nearby_servers,
        args=[self._nearby_servers, broadcast_port, self.ctx, self._push_addr])
    self._nearby_updater.daemon = True
    self._nearby_updater.start()

    self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
def __init__(
    self,
    sandbox_dir,
    source_addresses=None,
    source_pub_addresses=None,
    rep_addresses=None,
    pub_addresses=None,
    experiment_uuid="",
    name="obci_process_supervisor",
):
    """Initialise process-supervisor state, then run common peer setup.

    :param sandbox_dir: working directory for launched peers (falls back
        to ``settings.DEFAULT_SANDBOX_DIR`` when empty).
    :param source_pub_addresses: PUB addresses of the parent experiment,
        used for subscribing and for choosing the multiplexer address.
    :param experiment_uuid: uuid of the owning experiment.
    """
    self.peers = {}
    self.status = launcher_tools.READY_TO_LAUNCH
    self.source_pub_addresses = source_pub_addresses
    self.machine = socket.gethostname()
    self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
    # Context must exist before set_mx_data() — it opens a probe socket.
    self.ctx = zmq.Context()
    self.mx_data = self.set_mx_data()
    self.env = self.peer_env(self.mx_data)
    self.launch_data = []
    self.peer_order = []
    self._running_peer_order = []
    self._current_part = None
    self.experiment_uuid = experiment_uuid
    self.peers_to_launch = []
    self.processes = {}
    self.restarting = []
    super(OBCIProcessSupervisor, self).__init__(
        source_addresses=source_addresses, rep_addresses=rep_addresses, pub_addresses=pub_addresses, name=name
    )
    # Pass our logger so subprocess events land in the supervisor's log,
    # consistent with the other control peers in this file.
    self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
def __init__(self, rep_addresses=None, pub_addresses=None, name='obci_server'):
    """Create an OBCI server: experiment registries, ports, discovery threads."""
    self.experiments = {}
    self.exp_process_supervisors = {}
    self._nearby_servers = net.DNS()
    super(OBCIServer, self).__init__(None, rep_addresses, pub_addresses, name)
    self.machine = socket.gethostname()
    self.rep_port = int(net.server_rep_port())
    self.pub_port = int(net.server_pub_port())
    bcast = int(net.server_bcast_port())
    self._nearby_servers.logger = self.logger

    def _spawn(target, arglist):
        # Start a daemonised background worker and return the thread.
        thr = threading.Thread(target=target, args=arglist)
        thr.daemon = True
        thr.start()
        return thr

    # Beacon announcing our ports, and listener tracking nearby servers.
    self._bcast_server = _spawn(
        broadcast_server, [self.uuid, self.rep_port, self.pub_port, bcast])
    self._nearby_updater = _spawn(
        update_nearby_servers, [self._nearby_servers, bcast, self.ctx, self._push_addr])

    self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
def __init__(self, source_addresses=None, rep_addresses=None, pub_addresses=None, name='obci_control_peer'):
    """Common initialisation for every OBCI control peer.

    Sets up identity (uuid, name, type), logging, the ZeroMQ context and
    sockets (via ``net_init``), optionally registers with a parent peer
    at *source_addresses*, and installs SIGTERM/SIGINT handlers.
    """
    # TODO TODO TODO !!!!
    # cleaner subclassing of obci_control_peer!!!
    self.hostname = socket.gethostname()
    self.source_addresses = source_addresses if source_addresses else []
    self.rep_addresses = rep_addresses
    self.pub_addresses = pub_addresses
    self._all_sockets = []
    # In-process endpoints used by the publisher / subprocess-monitor threads.
    self._pull_addr = 'inproc://publisher_msg'
    self._push_addr = 'inproc://publisher'
    self._subpr_push_addr = 'inproc://subprocess_info'
    self.uuid = str(uuid.uuid4())
    self.name = str(name)
    self.type = self.peer_type()
    log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR, self.name + '-' + self.uuid[:8])
    # A subclass may already have created its own logger — don't clobber it.
    if not hasattr(self, 'logger'):
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        self.logger = get_logger(self.peer_type(), log_dir=log_dir,
                                 stream_level=net_tools.peer_loglevel(),
                                 obci_peer=self)
    self.mtool = self.message_tool()
    # Subclasses (e.g. the process supervisor) may supply their own context.
    if not hasattr(self, "ctx"):
        self.ctx = zmq.Context()
    self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
    self.net_init()
    # With a parent to talk to, register and process its response;
    # otherwise (e.g. top-level server) there is nothing to register with.
    if self.source_addresses:
        self.registration_response = self.register()
        self._handle_registration_response(self.registration_response)
    else:
        self.registration_response = None
    self.interrupted = False
    signal.signal(signal.SIGTERM, self.signal_handler())
    signal.signal(signal.SIGINT, self.signal_handler())
def __init__(self, sandbox_dir, source_addresses=None, source_pub_addresses=None,
             rep_addresses=None, pub_addresses=None, experiment_uuid='',
             name='obci_process_supervisor'):
    """Initialise supervisor state (incl. morph/config bookkeeping), then
    run the common control-peer setup via the base class."""
    self.peers = {}
    self.status = launcher_tools.READY_TO_LAUNCH
    self.source_pub_addresses = source_pub_addresses
    self.machine = socket.gethostname()
    self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
    # Context must exist before set_mx_data() — it opens a probe socket.
    self.ctx = zmq.Context()
    self.mx_data = self.set_mx_data()
    self.env = self.peer_env(self.mx_data)
    self.launch_data = []
    self.peer_order = []
    self._running_peer_order = []
    self._current_part = None
    # Config-server restart/morph state; presumably guarded by __cfg_lock
    # below — usage is not visible in this chunk, confirm against callers.
    self.__cfg_launch_info = None
    self.__cfg_morph = False
    self.experiment_uuid = experiment_uuid
    self.peers_to_launch = []
    self.processes = {}
    self.restarting = []
    self.rqs = 0
    self._nearby_machines = net.DNS()
    self.test_count = 0
    self.__cfg_lock = threading.RLock()
    super(OBCIProcessSupervisor, self).__init__(
        source_addresses=source_addresses,
        rep_addresses=rep_addresses,
        pub_addresses=pub_addresses,
        name=name)
    self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
class OBCIServer(OBCIControlPeer):
    """Top-level OBCI control peer: one per machine.

    Tracks running experiments, launches experiment and process-supervisor
    subprocesses, and answers client requests.  Message handlers are
    registered via the ``msg_handlers.handler`` decorator.
    """

    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, rep_addresses=None, pub_addresses=None, name='obci_server'):
        """Set up registries, ports and the broadcast/discovery threads."""
        self.experiments = {}
        self.exp_process_supervisors = {}
        self._nearby_servers = net.DNS()
        super(OBCIServer, self).__init__(None, rep_addresses, pub_addresses, name)
        self.machine = socket.gethostname()
        self.rep_port = int(net.server_rep_port())
        self.pub_port = int(net.server_pub_port())
        bcast_port = int(net.server_bcast_port())
        self._nearby_servers.logger = self.logger
        # Beacon announcing this server's ports on the local network.
        self._bcast_server = threading.Thread(target=broadcast_server,
                                              args=[self.uuid, self.rep_port, self.pub_port, bcast_port])
        self._bcast_server.daemon = True
        self._bcast_server.start()
        # Listener keeping the nearby-server registry up to date.
        self._nearby_updater = threading.Thread(target=update_nearby_servers,
                                                args=[self._nearby_servers, bcast_port,
                                                      self.ctx, self._push_addr])
        self._nearby_updater.daemon = True
        self._nearby_updater.start()
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def nearby_server_addrs(self):
        """IPs of the other OBCI servers currently visible on the network."""
        snap = self._nearby_servers.snapshot()
        return [srv.ip for srv in snap.values()]

    def nearby_servers(self):
        """Snapshot of the nearby-server registry."""
        return self._nearby_servers.snapshot()

    def my_ip(self):
        """Best-effort network IP of this server (fallback: 127.0.1.1)."""
        addr = "127.0.1.1"
        try:
            addr = self._nearby_servers.this_addr_network()
        except Exception as e:
            self.logger.error(str(e))
        return addr

    def network_ready(self):
        # i know my network IP
        return self.my_ip() != self.machine

    def handle_socket_read_error(self, socket, error):
        """Re-create the REP socket that raised a read error.

        NOTE(review): the *socket* parameter shadows the ``socket`` module
        inside this method.
        """
        if socket == self.rep_socket:
            self.logger.warning("reinitialising REP socket")
            self._all_sockets.remove(self.rep_socket)
            # NOTE(review): ``in`` on client_rq raises TypeError when
            # client_rq is None — confirm it is always a tuple here.
            if socket in self.client_rq:
                self.client_rq = None
            self.rep_socket.close()  # linger=0)
            self.rep_socket = None
            time.sleep(0.2)
            (self.rep_socket, self.rep_addresses) = self._init_socket(
                ['tcp://*:' + str(self.rep_port)], zmq.REP)
            self.rep_socket.setsockopt(zmq.LINGER, 0)
            self._all_sockets.append(self.rep_socket)
            self.logger.info(self.rep_addresses)
        elif socket == self.exp_rep:
            self.logger.info("reinitialising EXPERIMENT REP socket")
            self.exp_rep.close()  # linger=0)
            (self.exp_rep, self.exp_rep_addrs) = self._init_socket(
                self.exp_rep_addrs, zmq.REP)
            self.exp_rep.setsockopt(zmq.LINGER, 0)
            self._all_sockets.append(self.exp_rep)

    def peer_type(self):
        return 'obci_server'

    def net_init(self):
        """Create the experiment-facing REP socket and the TCP proxy."""
        (self.exp_rep, self.exp_rep_addrs) = self._init_socket(
            [], zmq.REP)
        # (self.exp_pub, self.exp_pub_addrs) = self._init_socket(
        #     [], zmq.PUB)
        # self.exp_pub.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.exp_rep)
        # self._all_sockets.append(self.exp_pub)
        tcp_port = int(net.server_tcp_proxy_port())
        self._tcp_proxy_thr, tcp_port = twisted_tcp_handling.run_twisted_server(
            ('0.0.0.0', tcp_port), self.ctx, self.rep_addresses[0])
        self.tcp_addresses = [(self.my_ip(), tcp_port),
                              (socket.gethostname(), tcp_port)]
        super(OBCIServer, self).net_init()

    def custom_sockets(self):
        return [self.exp_rep]  # , self.srv_rep, self.srv_pub]

    def clean_up(self):
        # self._tcp_srv.shutdown()
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Broadcast kill/shutdown and kill all process supervisors."""
        send_msg(self._publish_socket,  # self.exp_pub,
                 self.mtool.fill_msg("kill", receiver=""))
        send_msg(self._publish_socket,
                 self.mtool.fill_msg("launcher_shutdown", sender=self.uuid))
        for sup in self.exp_process_supervisors:
            self.exp_process_supervisors[sup].kill()
        self.logger.info('sent KILL to experiments')

    def _args_for_experiment(self, sandbox_dir, launch_file, local=False, name=None, overwrites=None):
        """Build the command-line argument list for a new experiment process."""
        args = ['--sv-addresses']
        args += self.exp_rep_addrs
        args.append('--sv-pub-addresses')
        # if local:
        #     addrs = net.choose_local(self.exp_pub_addrs)
        # else:
        #     addrs = net.choose_not_local(self.exp_pub_addrs)
        addrs = net.choose_local(self.pub_addresses)  # self.exp_pub_addrs
        args += addrs
        exp_name = name if name else os.path.basename(launch_file)
        args += [
            '--sandbox-dir', str(sandbox_dir),
            '--launch-file', str(launch_file),
            '--name', exp_name,
            '--current-ip', self.my_ip()]
        if overwrites is not None:
            args += peer_cmd.peer_overwrites_cmd(overwrites)
        # print '{0} [{1}] -- experiment args: {2}'.format(self.name, self.peer_type(), args)
        return args

    def start_experiment_process(self, sandbox_dir, launch_file, name=None, overwrites=None):
        """Spawn an obci_experiment subprocess; returns (process, details)."""
        path = 'obci_experiment'
        args = self._args_for_experiment(sandbox_dir, launch_file,
                                         local=True, name=name, overwrites=overwrites)
        return self.subprocess_mgr.new_local_process(path, args,
                                                     proc_type='obci_experiment',
                                                     capture_io=NO_STDIO)

    def handle_register_experiment(self, message, sock):
        """Record a newly started experiment and notify the waiting client."""
        machine, pid = message.other_params['origin_machine'], message.other_params['pid']
        status, det = message.other_params['status_name'], message.other_params['details']
        launch_file = message.other_params['launch_file_path']
        tcp_addr = message.other_params['tcp_addrs']
        exp_proc = self.subprocess_mgr.process(machine, pid)
        if exp_proc is None:
            send_msg(sock, self.mtool.fill_msg("rq_error", err_code="experiment_not_found"))
            return
        info = self.experiments[message.uuid] = ExperimentInfo(message.uuid,
                                                               message.name,
                                                               message.rep_addrs,
                                                               message.pub_addrs,
                                                               time.time(),
                                                               machine, pid,
                                                               status, det,
                                                               launch_file, tcp_addr,
                                                               self._nearby_servers.this_addr_network())
        exp_proc.registered(info)
        # NOTE(review): this loop rebinds the local name ``addrs`` only;
        # info.rep_addrs / info.pub_addrs are never modified — confirm
        # whether prepending the network address was intended to persist.
        for addrs in [info.rep_addrs, info.pub_addrs]:
            one = addrs[0]
            port = net.port(one)
            addrs = [self._nearby_servers.this_addr_network() + ':' + str(port)] + addrs
        info_msg = self.mtool.fill_msg("experiment_created",
                                       uuid=info.uuid,
                                       name=info.name,
                                       rep_addrs=info.rep_addrs,
                                       pub_addrs=info.pub_addrs,
                                       origin_machine=info.origin_machine,
                                       status_name=status,
                                       details=det,
                                       launch_file_path=launch_file,
                                       tcp_addrs=tcp_addr)
        # Answer the client whose create_experiment request started this one.
        if self.client_rq:
            msg_type = self.client_rq[0].type
            rq_sock = self.client_rq[1]
            if msg_type == "create_experiment":
                self.client_rq = None
                send_msg(rq_sock, info_msg)
        send_msg(sock, self.mtool.fill_msg("rq_ok",
                                           params=self._nearby_servers.dict_snapshot()))
        send_msg(self._publish_socket, info_msg)

    def _handle_register_experiment_timeout(self, exp):
        """Kill an experiment that failed to register in time; report error."""
        self.logger.error("New experiment process failed to "
                          "register before timeout" + str(exp.pid))
        if exp.returncode is None:
            exp.kill()
            exp.wait()
        # msg_type = self.client_rq[0].type
        rq_sock = self.client_rq[1]
        send_msg(rq_sock, self.mtool.fill_msg("rq_error",
                                              err_code="create_experiment_error",
                                              request=vars(self.client_rq[0])))

    @msg_handlers.handler("register_peer")
    def handle_register_peer(self, message, sock):
        """Register peer"""
        if message.peer_type == "obci_client":
            send_msg(sock, self.mtool.fill_msg("rq_ok"))
        elif message.peer_type == "obci_experiment":
            self.handle_register_experiment(message, sock)
        else:
            super(OBCIServer, self).handle_register_peer(message, sock)

    @msg_handlers.handler("create_experiment")
    def handle_create_experiment(self, message, sock):
        """Launch an experiment process on behalf of a client request."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        launch_file = message.launch_file
        sandbox = message.sandbox_dir
        name = message.name
        overwrites = message.overwrites
        sandbox = sandbox if sandbox else settings.DEFAULT_SANDBOX_DIR
        exp, details = self.start_experiment_process(
            sandbox, launch_file, name, overwrites)
        if exp is None:
            self.logger.error("failed to launch experiment "
                              "process, request: " + str(vars(message)))
            send_msg(sock, self.mtool.fill_msg("rq_error", request=vars(message),
                                               err_code='launch_error', details=details))
        else:
            self.logger.info("experiment process "
                             "launched: {0}".format(exp.pid))
            # Remember who asked; the reply is sent on experiment registration.
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                self.client_rq = (message, sock)

    @msg_handlers.handler("list_experiments")
    def handle_list_experiments(self, message, sock):
        """Reply with info on running experiments and nearby machines."""
        exp_data = {}
        for exp_id in self.experiments:
            exp_data[exp_id] = self.experiments[exp_id].info()
        nearby = self.nearby_servers()
        nearby_dict = {}
        for srv in nearby.values():
            nearby_dict[srv.ip] = srv.hostname
        info = '\n{'
        for srv in nearby_dict:
            info += '\n' + srv + ' : ' + nearby_dict[srv] + ','
        info += '}'
        self.logger.debug("nearby servers: count: {0}, {1}".format(
            len(nearby), info))
        send_msg(sock, self.mtool.fill_msg("running_experiments",
                                           exp_data=exp_data,
                                           nearby_machines=nearby_dict))

    @msg_handlers.handler("list_nearby_machines")
    def handle_list_nearby_machines(self, message, sock):
        send_msg(sock, self.mtool.fill_msg('nearby_machines',
                                           nearby_machines=self._nearby_servers.dict_snapshot()))

    def _handle_match_name(self, message, sock, this_machine=False):
        """Resolve *message.strname* to exactly one experiment, or reply
        with an error (not found / ambiguous / wrong machine)."""
        matches = self.exp_matching(message.strname)
        match = None
        msg = None
        if not matches:
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code='experiment_not_found')
        elif len(matches) > 1:
            matches = [(exp.uuid, exp.name) for exp in matches]
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code='ambiguous_exp_name', details=matches)
        else:
            match = matches.pop()
            if this_machine and match.origin_machine != self.machine:
                msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                          err_code='exp_not_on_this_machine',
                                          details=match.origin_machine)
                match = None
        if msg and sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, msg)
        return match

    @msg_handlers.handler("get_experiment_contact")
    def handle_get_experiment_contact(self, message, sock):
        """Reply with the addresses of the experiment matching strname."""
        self.logger.debug("##### rq contact for: %s", message.strname)
        info = self._handle_match_name(message, sock)
        if info:
            send_msg(sock, self.mtool.fill_msg("experiment_contact",
                                               uuid=info.uuid,
                                               name=info.name,
                                               rep_addrs=info.rep_addrs,
                                               pub_addrs=info.pub_addrs,
                                               tcp_addrs=info.tcp_addrs,
                                               machine=info.origin_machine,
                                               status_name=info.status_name,
                                               details=info.details))

    @msg_handlers.handler("experiment_status_change")
    def handle_experiment_status_change(self, message, sock):
        """Update stored experiment status and republish the change."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error',
                                                   err_code='experiment_not_found'))
            return
        exp.status_name = message.status_name
        exp.details = message.details
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("experiment_info_change")
    def handle_experiment_info_change(self, message, sock):
        """Update experiment name/launch-file and republish the change."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            self.logger.warning("UUID not found " + message.uuid)
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error',
                                                   err_code='experiment_not_found'))
            return
        exp.name = message.name
        exp.launch_file_path = message.launch_file_path
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        self.logger.info("INFO CHANGED %s", exp.launch_file_path)
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("experiment_transformation")
    def handle_experiment_transformation(self, message, sock):
        """Update status, name and launch file after an experiment morph."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error',
                                                   err_code='experiment_not_found'))
            return
        exp.status_name = message.status_name
        exp.details = message.details
        exp.launch_file_path = message.launch_file
        exp.name = message.name
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        send_msg(self._publish_socket, message.SerializeToString())

    def exp_matching(self, strname):
        """Match *strname* against all created experiment IDs and names.

        Return those experiment descriptions which name or uuid starts
        with strname.
        """
        match_names = {}
        for uid, exp in self.experiments.items():
            if exp.name.startswith(strname):
                match_names[uid] = exp
        ids = self.experiments.keys()
        match_ids = [uid for uid in ids if uid.startswith(strname)]
        experiments = set()
        for uid in match_ids:
            experiments.add(self.experiments[uid])
        for name, exp in match_names.items():
            experiments.add(exp)
        return experiments

    @msg_handlers.handler("kill_experiment")
    def handle_kill_experiment(self, message, sock):
        """Send kill to a matching experiment and arm a force-kill timer."""
        match = self._handle_match_name(message, sock, this_machine=True)
        if match:
            if match.kill_timer is not None:
                send_msg(sock, self.mtool.fill_msg("rq_error",
                                                   err_code="already_killed",
                                                   details="Experiment already shutting down"))
            elif not message.force:
                self.logger.info("sending kill to experiment "
                                 "{0} ({1})".format(match.uuid, match.name))
                send_msg(self._publish_socket,  # self.exp_pub,
                         self.mtool.fill_msg("kill", receiver=match.uuid))
                send_msg(sock, self.mtool.fill_msg("kill_sent",
                                                   experiment_id=match.uuid))
                pid = match.experiment_pid
                uid = match.uuid
                self.logger.info("Waiting for experiment process {0} to terminate".format(uid))
                # Hard-kill the process if it has not exited after 1.1 s.
                match.kill_timer = threading.Timer(1.1,
                                                   self._handle_killing_exp,
                                                   args=[pid, uid])
                match.kill_timer.start()
                send_msg(self._publish_socket, self.mtool.fill_msg('kill_sent',
                                                                   experiment_id=match.uuid
                                                                   ))

    def _handle_killing_exp(self, pid, uid):
        """Timer callback: force-kill the experiment and drop its record."""
        proc = self.subprocess_mgr.process(self.machine, pid)
        if proc.process_is_running():
            proc.kill()
        self.logger.info("experiment {0} FINISHED".format(uid))
        proc.delete = True
        del self.experiments[uid]
        return proc.popen_obj.returncode

    @msg_handlers.handler("launch_process")
    def handle_launch_process(self, message, sock):
        # Only process supervisors are launched through this request.
        if message.proc_type == 'obci_process_supervisor':
            self._handle_launch_process_supervisor(message, sock)

    def _handle_launch_process_supervisor(self, message, sock):
        """Spawn a process supervisor and report success or failure."""
        sv_obj, details = self._start_obci_supervisor_process(message)
        self.logger.info("LAUNCH PROCESS SV " + str(sv_obj) + str(details))
        if sv_obj:
            self.exp_process_supervisors[message.sender] = sv_obj
            send_msg(sock, self.mtool.fill_msg("launched_process_info",
                                               sender=self.uuid,
                                               machine=self.machine,
                                               pid=sv_obj.pid,
                                               proc_type=sv_obj.proc_type,
                                               name=sv_obj.name,
                                               path=sv_obj.path))
            self.logger.info("CONFIRMED LAUNCH")
        else:
            send_msg(sock, self.mtool.fill_msg('rq_error',
                                               request=message.dict(),
                                               err_code="launch_error",
                                               details=details))
            self.logger.error("PROCESS SUPERVISOR LAUNCH FAILURE")

    @msg_handlers.handler("kill_process")
    def handle_kill_process_supervisor(self, message, sock):
        """Kill a supervised process identified by (machine, pid)."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if not proc:
            send_msg(sock, self.mtool.fill_msg("rq_error", err_code="process_not_found"))
        else:
            # TODO
            # name = proc.name
            proc.kill()
            proc.mark_delete()
            send_msg(sock, self.mtool.fill_msg("rq_ok"))
            del self.exp_process_supervisors[proc.name]

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """Log an unexpected child death and mark the record for deletion."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            status, details = proc.status()
            self.logger.warning("Process " + proc.proc_type +
                                " dead: " + status + str(details) + proc.name + str(proc.pid))
            # Placeholders for type-specific handling (not implemented).
            if proc.proc_type == 'obci_process_supervisor':
                pass
            elif proc.proc_type == 'obci_experiment':
                pass
            if status == subprocess_monitor.FAILED:
                pass

    @msg_handlers.handler("find_eeg_experiments")
    def handle_find_eeg_experiments(self, message, sock):
        """Search nearby machines for EEG experiments in a worker thread."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        finder_thr = threading.Thread(target=find_eeg_experiments_and_push_results,
                                      args=[self.ctx, self.rep_addresses,
                                            message,
                                            self._nearby_servers.copy()])
        finder_thr.daemon = True
        finder_thr.start()

    @msg_handlers.handler("find_eeg_amplifiers")
    def handle_find_new_eeg_amplifiers(self, message, sock):
        """Search for available EEG amplifiers in a worker thread."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        amp_thr = threading.Thread(target=find_new_experiments_and_push_results,
                                   args=[self.ctx, message])
        amp_thr.daemon = True
        amp_thr.start()

    @msg_handlers.handler("start_eeg_signal")
    def handle_start_eeg_signal(self, message, sock):
        """Start an EEG-signal experiment in a worker thread."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        start_thr = threading.Thread(target=start_eeg_signal_experiment,
                                     args=[self.ctx, self.rep_addresses, message])
        start_thr.daemon = True
        start_thr.start()

    def _start_obci_supervisor_process(self, rq_message):
        """Spawn obci_process_supervisor; returns (process, False) or (None, details)."""
        path = obci_process_supervisor.__file__
        # Run the .py source even if __file__ points at the .pyc.
        path = '.'.join([path.rsplit('.', 1)[0], 'py'])
        start_params = rq_message.dict()
        start_params['path'] = path
        # Strip message-envelope fields; the rest become launch kwargs.
        del start_params['type']
        del start_params['sender']
        del start_params['sender_ip']
        del start_params['receiver']
        sv_obj, details = self.subprocess_mgr.new_local_process(**start_params)
        if sv_obj is None:
            return None, details
        return sv_obj, False

    def _crash_extra_data(self, exception=None):
        """Attach running-experiment info to crash reports."""
        data = super(OBCIServer, self)._crash_extra_data(exception)
        data.update({
            'experiments': [e.info() for e in self.experiments.values()]
        })
        return data
class OBCIProcessSupervisor(OBCIControlPeer):
    """Per-machine supervisor that launches and monitors experiment peers
    (multiplexer, config server, amplifier, obci peers) on behalf of an
    experiment.  Message handlers are registered via the
    ``msg_handlers.handler`` decorator.
    """

    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    def __init__(
        self,
        sandbox_dir,
        source_addresses=None,
        source_pub_addresses=None,
        rep_addresses=None,
        pub_addresses=None,
        experiment_uuid="",
        name="obci_process_supervisor",
    ):
        """Initialise supervisor state, then run common peer setup."""
        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        # Context must exist before set_mx_data() — it opens a probe socket.
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []
        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses, rep_addresses=rep_addresses, pub_addresses=pub_addresses, name=name
        )
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)

    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
        """Create the source SUB socket and the config-server PULL socket."""
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "")
        self._all_sockets.append(self.source_sub_socket)
        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)
        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")
        # Prefer a non-local address for the config server; fall back to local.
        self.cs_addr = net.choose_not_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]
        self._all_sockets.append(self.config_server_socket)
        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        """Registration payload: pid, machine and multiplexer contact data."""
        return dict(
            pid=os.getpid(),
            machine=self.machine,
            mx_data=[self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])),
                     self.mx_data[1]],
        )

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        """Store launch data and peer ordering received from the experiment."""
        self.launch_data = response.params["launch_data"]
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params["peer_order"]
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        print self.name, "[", self.type, "]", "RECEIVED LAUNCH DATA: ", self.launch_data

    def set_mx_data(self):
        """Reserve a port for the multiplexer if it should run on this host.

        Returns ``((bind_addr, port), passwd)`` when the multiplexer source
        address resolves to this machine, otherwise ``(None, None)``.
        """
        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        # Strip the 'tcp://' scheme prefix and the port.
        src = src[6:].split(":")[0]
        if src == socket.gethostname():
            # Bind a throwaway socket just to reserve a free port number.
            sock = self.ctx.socket(zmq.REP)
            port = str(
                sock.bind_to_random_port(
                    "tcp://127.0.0.1", min_port=settings.PORT_RANGE[0], max_port=settings.PORT_RANGE[1]
                )
            )
            sock.close()
            return ("0.0.0.0", port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        """Format ``((addr, port), passwd)`` as ``"addr:port"`` (or None)."""
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        print self.name, "[", self.type, "]", "mx addr str", addr + ":" + str(port)
        return addr + ":" + str(port)

    def peer_env(self, mx_data):
        """Environment for launched peers with MULTIPLEXER_* variables set."""
        if mx_data[0] is None:
            return None
        env = os.environ.copy()
        addr, port = mx_data[0]
        _env = {
            "MULTIPLEXER_ADDRESSES": socket.gethostname() + ":" + str(port),
            "MULTIPLEXER_PASSWORD": mx_data[1],
            "MULTIPLEXER_RULES": launcher_tools.mx_rules_path(),
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        """Launch the multiplexer if it is assigned to this machine."""
        if "mx" in self.launch_data and self.mx_data[0] is not None:
            print self.name, "[", self.type, "]", "..starting multiplexer"
            self.peer_order.remove(["mx"])
            self.peers_to_launch.remove("mx")
            path = launcher_tools.mx_path()
            args = [
                "run_multiplexer",
                self.mx_addr_str((("0.0.0.0", self.mx_data[0][1]), self.mx_data[1])),
                "--multiplexer-password",
                self.mx_data[1],
                "--rules",
                launcher_tools.mx_rules_path(),
            ]
            proc, details = self._launch_process(path, args, "multiplexer", "mx", env=self.env)
            self.processes["mx"] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        self._launch_processes(self.launch_data)

    def test(self):
        """Ad-hoc ZMQ throughput check (development helper, not used in
        normal operation)."""
        # for i in range(SEND):
        #     send_msg(self.push, str(i))
        self.pull = self.ctx.socket(zmq.SUB)
        self.pull.bind("tcp://*:16789")
        received = 0
        prev = -1
        for i in range(SEND):
            msg = recv_msg(self.pull)
            if int(msg):
                # prev = int(msg)
                received += 1
                if received % 10000 == 0:
                    print "zmq: received ", received, "messages, last: ", msg
        if received == SEND:
            print "zmq: OK"
        else:
            print "WUT?", received
        # self.push.close()
        self.pull.close()

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        """Kill and (re)start peers as part of a scenario morph.

        The config server is always restarted so the new peer set gets a
        fresh configuration.
        """
        if not message.receiver == self.uuid:
            return
        message.kill_peers.append("config_server")
        message.start_peers_data["config_server"] = dict(self.launch_data["config_server"])
        # Peers that survive the morph keep their config on restart.
        restore_config = [peer for peer in self.processes if peer not in message.kill_peers]
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                print self.name, "[", self.type, "]", "peer to kill not found:", peer
                continue
            print "MORPH: KILLING ", peer
            proc.kill()
            print "MORPH: KILLED ", peer
            del self.processes[peer]
            del self.launch_data[peer]
        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]
        self._launch_processes(message.start_peers_data, restore_config=restore_config)

    def _launch_processes(self, launch_data, restore_config=[]):
        """Launch all peers from *launch_data*, config server and amplifier
        first; on any failure kill everything and publish obci_launch_failed.

        NOTE(review): mutable default argument ``restore_config=[]`` — safe
        only while no caller mutates it.
        """
        proc, details = None, None
        success = True
        path, args = None, None
        self.status = launcher_tools.LAUNCHING
        ldata = []
        # Ordering: config_server, then amplifier, then everything else.
        if "config_server" in launch_data:
            ldata.append(("config_server", launch_data["config_server"]))
        if "amplifier" in launch_data:
            ldata.append(("amplifier", launch_data["amplifier"]))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata:
                ldata.append((peer, data))
        for peer, data in ldata:  # self.launch_data.iteritems():
            wait = 0
            if peer.startswith("mx"):
                continue
            path = os.path.join(launcher_tools.obci_root(), data["path"])
            args = data["args"]
            if peer.startswith("config_server"):
                args += ["-p", "launcher_socket_addr", self.cs_addr]
                args += ["-p", "experiment_uuid", self.experiment_uuid]
                if restore_config:
                    args += ["-p", "restore_peers", " ".join(restore_config)]
                # Give the config server a head start before its clients.
                wait = 0.4
            proc, details = self._launch_process(path, args, data["peer_type"], peer, env=self.env, capture_io=NO_STDIO)
            if proc is not None:
                self.processes[peer] = proc
            else:
                success = False
                break
            time.sleep(wait)
        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched", machine=self.machine))
        else:
            print self.name, "[", self.type, "]", "OBCI LAUNCH FAILED"
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg("obci_launch_failed", machine=self.machine, path=path, args=args, details=details),
            )
            self.processes = {}
            self.subprocess_mgr.killall()

    def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO):
        """Start one local subprocess and publish launch success/failure."""
        proc, details = self.subprocess_mgr.new_local_process(
            path, args, proc_type=proc_type, name=name, monitoring_optflags=RETURNCODE, capture_io=capture_io, env=env
        )
        if proc is None:
            print self.name, "[", self.type, "]", "process launch FAILED:", path, args
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg(
                    "launch_error",
                    sender=self.uuid,
                    details=dict(machine=self.machine, path=path, args=args, error=details),
                ),
            )
        else:
            print self.name, "[", self.type, "]", "process launch success:", path, args, proc.pid
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg(
                    "launched_process_info",
                    sender=self.uuid,
                    machine=self.machine,
                    pid=proc.pid,
                    proc_type=proc_type,
                    name=name,
                    path=path,
                    args=args,
                ),
            )
        return proc, details

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        """Publish the last lines of a peer's captured stdout."""
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]["experiment_id"]
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(
            self._publish_socket,
            self.mtool.fill_msg("tail", txt=txt, sender=self.uuid, experiment_id=experiment_id, peer_id=peer),
        )

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        self.subprocess_mgr.killall()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """Publish obci_peer_dead unless the peer died as part of a restart."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            print "~~~~~ ~~~~~ ", name, self.restarting, message.status[0]
            if (proc.proc_type == "obci_peer" or proc.proc_type == "multiplexer") and not (
                name in self.restarting and message.status[0] == "terminated"
            ):
                print "KILLLLLING and sending obci_peer_dead", proc.name
                send_msg(
                    self._publish_socket,
                    self.mtool.fill_msg(
                        "obci_peer_dead",
                        sender=self.uuid,
                        sender_ip=self.machine,
                        peer_id=proc.name,
                        path=proc.path,
                        status=proc.status(),
                    ),
                )
            if name in self.restarting:
                self.restarting.remove(name)

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        # Forward to subscribers unchanged.
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        # Forward to subscribers unchanged.
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        print self.name, "got!", message.type
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    # NOTE(review): this second def reuses the name handle_obci_control_message,
    # shadowing the one above at class level; the decorator registers both
    # handlers before the shadowing happens, but the duplicate name (and the
    # 'messsage' typo below) deserve a cleanup in a behavior-changing pass.
    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, messsage, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        # Drop references only; actual killing happens in clean_up().
        self.processes = {}
        # self.subprocess_mgr.killall()

    def clean_up(self):
        """Kill and forget every supervised subprocess."""
        print self.name, "[", self.type, "]", "cleaning up"
        self.processes = {}
        self.subprocess_mgr.killall()
        self.subprocess_mgr.delete_all()
class OBCIControlPeer(object):
    """Base class for OBCI control peers (server, experiment, process supervisor).

    Owns the ZMQ sockets (REP for requests, PUB via a publisher thread, a REQ
    socket towards its "source" peer), a subprocess monitor, registration with
    the source peer, and the main poll/dispatch loop.  Message handlers are
    registered on the class-level ``msg_handlers`` collection via decorators.
    """

    msg_handlers = HandlerCollection()

    def __init__(self, source_addresses=None, rep_addresses=None, pub_addresses=None,
                 name='obci_control_peer'):
        # TODO TODO TODO !!!!
        # cleaner subclassing of obci_control_peer!!!
        self.hostname = socket.gethostname()
        self.source_addresses = source_addresses if source_addresses else []
        self.rep_addresses = rep_addresses
        self.pub_addresses = pub_addresses
        self._all_sockets = []

        # inproc endpoints used by the publisher / subprocess-monitor threads.
        # NOTE(review): naming is from the *thread's* perspective — see
        # _prepare_publisher for how they are wired together.
        self._pull_addr = 'inproc://publisher_msg'
        self._push_addr = 'inproc://publisher'
        self._subpr_push_addr = 'inproc://subprocess_info'

        self.uuid = str(uuid.uuid4())
        self.name = str(name)
        self.type = self.peer_type()

        log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR,
                               self.name + '-' + self.uuid[:8])
        # A subclass may have configured its own logger before calling us.
        if not hasattr(self, 'logger'):
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            self.logger = get_logger(self.peer_type(), log_dir=log_dir,
                                     stream_level=net_tools.peer_loglevel(),
                                     obci_peer=self)

        self.mtool = self.message_tool()

        # A subclass may have created the ZMQ context already (e.g. for mx setup).
        if not hasattr(self, "ctx"):
            self.ctx = zmq.Context()

        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

        self.net_init()
        if self.source_addresses:
            self.registration_response = self.register()
            self._handle_registration_response(self.registration_response)
        else:
            self.registration_response = None

        self.interrupted = False
        signal.signal(signal.SIGTERM, self.signal_handler())
        signal.signal(signal.SIGINT, self.signal_handler())

    def signal_handler(self):
        """Return a handler that flags the main loop to stop on SIGTERM/SIGINT."""
        def handler(signum, frame):
            self.logger.info("[!!!!] %s %s %s %s", self.name, "got signal", signum, frame)
            self.interrupted = True
        return handler

    def peer_type(self):
        return 'obci_control_peer'

    def message_tool(self):
        return OBCIMessageTool(message_templates)

    def _publisher_thread(self, pub_addrs, pull_address, push_addr):
        """Forward everything pushed to *pull_address* out on the PUB socket.

        Sends a one-byte handshake to *push_addr* once its sockets are ready so
        the main thread knows publishing is live.
        """
        # FIXME aaaaahhh pub_addresses are set here, not in the main thread
        # (which reads them in _register method)
        pub_sock, self.pub_addresses = self._init_socket(pub_addrs, zmq.PUB)
        pull_sock = self.ctx.socket(zmq.PULL)
        pull_sock.bind(pull_address)
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)
        send_msg(push_sock, b'1')
        po = PollingObject()
        while not self._stop_publishing:
            try:
                to_publish, det = po.poll_recv(pull_sock, 500)
                if to_publish:
                    send_msg(pub_sock, to_publish)
            except Exception:
                # Was a bare ``except:`` — narrowed so SystemExit/KeyboardInterrupt
                # are not silently swallowed.  Any socket error ends the thread.
                # print self.name, '.Publisher -- STOP.'
                break
        # self.logger.info( "close sock %s %s", pub_addrs, pub_sock)
        pub_sock.close()
        pull_sock.close()
        push_sock.close()

    def _subprocess_info(self, push_addr):
        """Poll the subprocess monitor and push a 'dead_process' message per corpse."""
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)
        send_msg(push_sock, b'1')  # handshake: tell the main thread we are up
        while not self._stop_monitoring:
            dead = self.subprocess_mgr.not_running_processes()
            if dead:
                # self.logger.warning("DEAD process" + str(dead))
                for key, status in dead.items():
                    # key is (machine, pid) — see handle_dead_process consumers.
                    send_msg(push_sock, self.mtool.fill_msg('dead_process',
                                                            machine=key[0],
                                                            pid=key[1],
                                                            status=status))
            time.sleep(0.5)
        push_sock.close()

    def _push_sock(self, ctx, addr):
        """Create and connect a PUSH socket to *addr*."""
        sock = ctx.socket(zmq.PUSH)
        sock.connect(addr)
        return sock

    def _prepare_publisher(self):
        """Start the publisher thread and wait for its readiness handshake."""
        tmp_pull = self.ctx.socket(zmq.PULL)
        tmp_pull.bind(self._pull_addr)
        # NOTE: the thread binds a PULL on self._push_addr (messages to publish)
        # and sends its handshake to self._pull_addr (this tmp socket).
        self.pub_thr = threading.Thread(target=self._publisher_thread,
                                        args=[self.pub_addresses,
                                              self._push_addr,
                                              self._pull_addr])
        self.pub_thr.daemon = True
        self._stop_publishing = False
        self.pub_thr.start()
        recv_msg(tmp_pull)  # block until the publisher thread is live
        self._publish_socket = self._push_sock(self.ctx, self._push_addr)
        self._all_sockets.append(self._publish_socket)
        tmp_pull.close()

    def _prepare_subprocess_info(self):
        """Start the subprocess-monitor thread and wait for its handshake."""
        self._subprocess_pull = self.ctx.socket(zmq.PULL)
        self._subprocess_pull.bind(self._subpr_push_addr)
        self.subprocess_thr = threading.Thread(target=self._subprocess_info,
                                               args=[self._subpr_push_addr])
        self.subprocess_thr.daemon = True
        self._stop_monitoring = False
        self.subprocess_thr.start()
        recv_msg(self._subprocess_pull)  # block until the monitor thread is live
        self._all_sockets.append(self._subprocess_pull)

    def net_init(self):
        """Create all network sockets: publisher, subprocess monitor, REP, source REQ."""
        # (self.pub_socket, self.pub_addresses) = self._init_socket(
        #                                         self.pub_addresses, zmq.PUB)
        self._all_sockets = []
        self._prepare_publisher()
        self._prepare_subprocess_info()
        (self.rep_socket, self.rep_addresses) = self._init_socket(
            self.rep_addresses, zmq.REP)
        self.rep_socket.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.rep_socket)

        print("\n\tname: {0}\n\tpeer_type: {1}\n\tuuid: {2}\n".format(
            self.name, self.peer_type(), self.uuid))
        print("rep: {0}".format(self.rep_addresses))
        print("pub: {0}\n".format(self.pub_addresses))

        self.source_req_socket = self.ctx.socket(zmq.REQ)
        if self.source_addresses:
            for addr in self.source_addresses:
                self.source_req_socket.connect(addr)
        self._all_sockets.append(self.source_req_socket)
        self._set_poll_sockets()

    def _init_socket(self, addrs, zmq_type):
        """Bind a socket of *zmq_type* to *addrs* (random port when none given).

        Returns ``(socket, advertised_addrs)`` where wildcard binds are expanded
        to hostname/localhost forms that other peers can actually connect to.
        """
        # print self.peer_type(), "addresses for socket init:", addrs
        addresses = addrs if addrs else ['tcp://*']
        random_port = True if not addrs else False
        sock = self.ctx.socket(zmq_type)
        port = None
        try:
            for i, addr in enumerate(addresses):
                if random_port and net.is_net_addr(addr):
                    port = str(sock.bind_to_random_port(addr,
                                                        min_port=settings.PORT_RANGE[0],
                                                        max_port=settings.PORT_RANGE[1]))
                    addresses[i] = addr + ':' + str(port)
                else:
                    sock.bind(addr)
        except Exception as e:
            self.logger.critical("CRITICAL error: %s", str(e))
            raise(e)

        advertised_addrs = []
        for addr in addresses:
            if addr.startswith('tcp://*'):
                port = addr.rsplit(':', 1)[1]
                advertised_addrs.append('tcp://' + socket.gethostname() + ':' + str(port))
                advertised_addrs.append('tcp://' + 'localhost:' + str(port))
            else:
                advertised_addrs.append(addr)
        return sock, advertised_addrs

    def _register(self, rep_addrs, pub_addrs, params):
        """Register this peer with its source peer; exit(2) on refusal."""
        message = self.mtool.fill_msg("register_peer", peer_type=self.type,
                                      uuid=self.uuid,
                                      rep_addrs=rep_addrs,
                                      pub_addrs=pub_addrs,
                                      name=self.name,
                                      other_params=params)
        self.logger.debug("_register() " + str(message))
        send_msg(self.source_req_socket, message)
        response_str = recv_msg(self.source_req_socket)
        response = self.mtool.unpack_msg(response_str)
        if response.type == "rq_error":
            self.logger.critical("Registration failed: {0}".format(response_str))
            sys.exit(2)
        return response

    def register(self):
        params = self.params_for_registration()
        return self._register(self.rep_addresses, self.pub_addresses, params)

    def _handle_registration_response(self, response):
        """Hook for subclasses; default does nothing."""
        pass

    def shutdown(self):
        self.logger.info("SHUTTING DOWN")
        sys.exit(0)

    def params_for_registration(self):
        """Extra params sent on registration; subclasses override."""
        return {}

    def basic_sockets(self):
        return [self.rep_socket, self._subprocess_pull]

    def custom_sockets(self):
        """
        subclass this
        """
        return []

    def all_sockets(self):
        return self.basic_sockets() + self.custom_sockets()

    def _set_poll_sockets(self):
        self._poll_sockets = self.all_sockets()

    @log_crash
    def run(self):
        """Main loop: poll all sockets, drain each readable one, dispatch messages.

        Exits when ``self.interrupted`` is set by the signal handler or when an
        unhandled exception escapes a handler (cleanup runs in both cases).
        """
        self.pre_run()
        poller = zmq.Poller()
        poll_sockets = list(self._poll_sockets)
        for sock in poll_sockets:
            poller.register(sock, zmq.POLLIN)
        try:
            while True:
                socks = []
                try:
                    socks = dict(poller.poll())
                except zmq.ZMQError as e:
                    self.logger.warning(": zmq.poll(): " + str(e.strerror))
                for sock in socks:
                    if socks[sock] == zmq.POLLIN:
                        more = True
                        while more:
                            # Drain the socket: NOBLOCK until EAGAIN (REP sockets
                            # are read once — they can hold only one request).
                            try:
                                msg = recv_msg(sock, flags=zmq.NOBLOCK)
                            except zmq.ZMQError as e:
                                if e.errno == zmq.EAGAIN or \
                                        sock.getsockopt(zmq.TYPE) == zmq.REP:
                                    more = False
                                else:
                                    self.logger.error(
                                        "handling socket read error: %s %d %s",
                                        e, e.errno, sock)
                                    poller.unregister(sock)
                                    if sock in poll_sockets:
                                        poll_sockets.remove(sock)
                                    self.handle_socket_read_error(sock, e)
                                    break
                            else:
                                self.handle_message(msg, sock)
                    else:
                        self.logger.warning("sock not zmq.POLLIN! Ignore !")
                if self.interrupted:
                    break
                self._update_poller(poller, poll_sockets)
        except Exception as e:
            # from urllib2 import HTTPError
            # try:
            #     self.logger.critical("UNHANDLED EXCEPTION IN %s!!! ABORTING! Exception data: %s, e.args: %s, %s",
            #                 self.name, e, e.args, vars(e), exc_info=True,
            #                 extra={'stack': True})
            # except HTTPError, e:
            #     self.logger.info('sentry sending failed....')
            self._clean_up()
            raise(e)
        self._clean_up()

    def _crash_extra_description(self, exception=None):
        return ""

    def _crash_extra_data(self, exception=None):
        return {}

    def _crash_extra_tags(self, exception=None):
        return {'obci_part': 'launcher'}

    def _update_poller(self, poller, curr_sockets):
        """Re-sync *poller* registrations with the current socket set.

        Mutates *curr_sockets* in place so the caller's list tracks reality.
        (BUGFIX: the original rebound the local name ``curr_sockets`` instead,
        leaving the caller's list stale — later calls would then try to
        ``poller.unregister`` sockets that were already removed, raising
        ``KeyError``.)
        """
        self._set_poll_sockets()
        new_sockets = list(self._poll_sockets)
        for sock in new_sockets:
            if sock not in curr_sockets:
                poller.register(sock, zmq.POLLIN)
        for sock in curr_sockets:
            if sock not in new_sockets:
                poller.unregister(sock)
        curr_sockets[:] = new_sockets

    def handle_socket_read_error(self, socket, error):
        """Hook for subclasses; default does nothing."""
        pass

    def pre_run(self):
        """Hook for subclasses; default does nothing."""
        pass

    def _clean_up(self):
        """Stop worker threads, close every socket, then call subclass clean_up()."""
        time.sleep(0.01)
        self._stop_publishing = True
        self._stop_monitoring = True
        self.pub_thr.join()
        self.subprocess_thr.join()
        for sock in self._all_sockets:
            # print self.name, "closing ", sock
            sock.close()
        # try:
        #     self.ctx.term()
        # except zmq.ZMQError(), e:
        #     print "Ctx closing interrupted."
        self.clean_up()

    def clean_up(self):
        self.logger.info("CLEANING UP")

    # message handling ######################################

    def handle_message(self, message, sock):
        """Unpack *message* and dispatch it to the registered handler.

        Falls back to the error handler for unparsable messages on REP sockets,
        and to the unsupported handler for unknown message types.
        """
        handler = self.msg_handlers.default
        try:
            msg = self.mtool.unpack_msg(message)
            if msg.type != "ping" and msg.type != "rq_ok":
                self.logger.debug("got message: {0}".format(msg.type))
            if msg.type == "get_tail":
                print(self.msg_handlers)
        except ValueError as e:
            print("{0} [{1}], Bad message format! {2}".format(
                self.name, self.peer_type(), message))
            if sock.getsockopt(zmq.TYPE) == zmq.REP:
                handler = self.msg_handlers.error
            msg = message
            print(e)
        else:
            msg_type = msg.type
            handler = self.msg_handlers.handler_for(msg_type)
            if handler is None:
                # print "{0} [{1}], Unknown message type: {2}".format(
                #                     self.name, self.peer_type(),msg_type)
                # print message
                handler = self.msg_handlers.unsupported
        handler(self, msg, sock)

    @msg_handlers.handler("register_peer")
    def handle_register_peer(self, message, sock):
        """Subclass this."""
        result = self.mtool.fill_msg("rq_error", request=vars(message),
                                     err_code="unsupported_peer_type")
        send_msg(sock, result)

    @msg_handlers.handler("ping")
    def handle_ping(self, message, sock):
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg("pong"))

    @msg_handlers.default_handler()
    def default_handler(self, message, sock):
        """Ignore message"""
        pass

    @msg_handlers.unsupported_handler()
    def unsupported_msg_handler(self, message, sock):
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code="unsupported_msg_type",
                                      sender=self.uuid)
            send_msg(sock, msg)
            # print "--"

    @msg_handlers.error_handler()
    def bad_msg_handler(self, message, sock):
        msg = self.mtool.fill_msg("rq_error", request=message,
                                  err_code="invalid_msg_format")
        send_msg(sock, msg)

    @msg_handlers.handler("kill")
    def handle_kill(self, message, sock):
        # An empty receiver means "everyone"; otherwise only obey our own kill.
        if not message.receiver or message.receiver == self.uuid:
            self.cleanup_before_net_shutdown(message, sock)
            self._clean_up()
            self.shutdown()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        for sock in self._all_sockets:
            sock.close()
class OBCIProcessSupervisor(OBCIControlPeer):
    """Per-machine supervisor: launches and monitors the multiplexer, the
    config server and OBCI peer processes for one experiment, and relays their
    lifecycle events over the publisher socket.
    """

    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, sandbox_dir,
                 source_addresses=None,
                 source_pub_addresses=None,
                 rep_addresses=None,
                 pub_addresses=None,
                 experiment_uuid='',
                 name='obci_process_supervisor'):
        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        # Pending "launched_process_info" for config_server: held back until the
        # server confirms its MX connection (or a timer gives up waiting).
        self.__cfg_launch_info = None
        self.__cfg_morph = False
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []
        self.rqs = 0
        self._nearby_machines = net.DNS()
        self.test_count = 0
        self.__cfg_lock = threading.RLock()
        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
        """Add the experiment SUB socket and the config-server PULL socket."""
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt_string(zmq.SUBSCRIBE, "")
        self._all_sockets.append(self.source_sub_socket)
        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)
        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")
        self.cs_addr = net.choose_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_not_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]
        self._all_sockets.append(self.config_server_socket)
        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        """Advertise pid, machine and (when we host MX) its connectable address."""
        mx_data = None
        if None not in self.mx_data:
            mx_data = [self.mx_addr_str(
                ((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])),
                self.mx_data[1]]
        return dict(pid=os.getpid(), machine=self.machine, mx_data=mx_data)

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        self.launch_data = response.params['launch_data']
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params['peer_order']
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        self.logger.info("RECEIVED LAUNCH DATA: %s", self.launch_data)

    def set_mx_data(self):
        """Reserve a local port for the multiplexer if it should run here.

        Returns ``((addr, port), passwd)`` when MX belongs on this machine,
        otherwise ``(None, None)``.
        """
        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        src = src[6:].split(':')[0]  # strip 'tcp://' prefix and port

        if src == socket.gethostname():
            # Bind-then-close to reserve a free port number for MX.
            sock = self.ctx.socket(zmq.REP)
            port = str(sock.bind_to_random_port("tcp://127.0.0.1",
                                                min_port=settings.PORT_RANGE[0],
                                                max_port=settings.PORT_RANGE[1]))
            sock.close()
            return ('0.0.0.0', port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        self.logger.info("mx addr str: " + addr + ':' + str(port))
        return addr + ':' + str(port)

    def peer_env(self, mx_data):
        """Build the child-process environment carrying the MX address."""
        if mx_data[0] is None:
            return None
        env = os.environ.copy()
        addr, port = mx_data[0]
        if addr == '0.0.0.0':
            addr = socket.gethostname()
        _env = {
            "MULTIPLEXER_ADDRESSES": str(addr) + ':' + str(port)
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_broker")
    def handle_start_broker(self, message, sock):
        if 'mx' in self.launch_data and self.mx_data[0] is not None:
            self.logger.info("..starting multiplexer")
            self.peer_order.remove(['mx'])
            self.peers_to_launch.remove('mx')
            path = launcher_tools.broker_path()
            args = [
                'run_multiplexer',
                self.mx_addr_str((('0.0.0.0', self.mx_data[0][1]), self.mx_data[1]))
            ]
            proc, details = self._launch_process(path, args, 'multiplexer', 'mx',
                                                 env=self.env)
            self.processes['mx'] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_config_server")
    def handle_start_config_srv(self, message, sock):
        # MX may live on another machine — adopt its address from the message.
        if 'mx' not in self.launch_data:
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
        if "config_server" in self.launch_data:
            proc, details, wait, info_obj = \
                self.launch_process("config_server",
                                    self.launch_data["config_server"],
                                    restore_config=message.restore_config)
            # If the config server never confirms its MX connection, flush the
            # held-back launch info anyway after a grace period.
            tim = threading.Timer(1.5, self.__if_config_server_conn_didnt_work)
            tim.start()

    def __if_config_server_conn_didnt_work(self):
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None
                self.logger.info("connection to config server is shaky :(")

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        self.logger.info("start peers -- my mx_data: %s, received mx_data: %s",
                         self.mx_data, message.mx_data)
        if 'mx' not in self.launch_data:
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
            # tmp.workarounds: wait for mx on other machine to initialize
            time.sleep(0.75)
        if message.add_launch_data:
            if self.machine in message.add_launch_data:
                self._launch_processes(message.add_launch_data[self.machine])
        else:
            self._launch_processes(self.launch_data)

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        """Kill the listed peers and start the replacements (morph support)."""
        if not message.receiver == self.uuid:
            return
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                self.logger.error("peer to kill not found: %s", peer)
                continue
            self.logger.info("MORPH: KILLING %s ", peer)
            proc.kill_with_force()
            self.logger.info("MORPH: KILLED %s ", peer)
            del self.processes[peer]
            del self.launch_data[peer]
        for peer, data in message.start_peers_data.items():
            self.launch_data[peer] = data
        # Peers both killed and restarted: their death must not be broadcast.
        self.restarting = [peer for peer in message.start_peers_data
                           if peer in message.kill_peers]
        self._launch_processes(message.start_peers_data)

    def _launch_processes(self, launch_data, restore_config=None):
        """Launch every peer in *launch_data* ('amplifier' first, 'mx' and
        'config_server' excluded); broadcast 'all_peers_launched' on success.

        ``restore_config`` default was a mutable ``[]`` — replaced with the
        None-sentinel idiom (backward compatible).
        """
        if restore_config is None:
            restore_config = []
        proc, details, info_obj = None, None, None
        success = True
        self.status = launcher_tools.LAUNCHING
        ldata = []
        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        for peer, data in launch_data.items():
            if (peer, data) not in ldata and peer != 'config_server':
                ldata.append((peer, data))
        for peer, data in ldata:  # self.launch_data.iteritems():
            if peer.startswith('mx'):
                continue
            proc, details, wait, info_obj = self.launch_process(
                peer, data, restore_config=restore_config)
            time.sleep(wait)
            if proc is None:
                success = False
                break
        if success:
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("all_peers_launched", machine=self.machine))

    def launch_process(self, peer, launch_data, restore_config=None):
        """Launch a single OBCI peer process.

        Returns ``(proc, details, wait, info_obj)``; on failure ``proc`` is
        None and every local process is force-killed.
        """
        if restore_config is None:
            restore_config = []
        data = launch_data
        wait = 0
        p = os.path.expanduser(data['path'])
        if not os.path.isabs(p):
            path = os.path.join(launcher_tools.obci_root(), p)
            path = os.path.abspath(path)
        else:
            path = os.path.realpath(p)
        args = data['args']
        args = self._attach_base_config_path(path, args)
        args += ['-p', 'experiment_uuid', self.experiment_uuid]
        if peer.startswith('config_server'):
            args += ['-p', 'launcher_socket_addr', self.cs_addr]
            if restore_config:
                args += ['-p', 'restore_peers', ' '.join(restore_config)]
            # wait = 0.5
        if "log_dir" in args:
            idx = args.index("log_dir") + 1
            log_dir = args[idx]
            log_dir = os.path.join(log_dir, self.name)
            args[idx] = log_dir
        else:
            log_dir = os.path.join(CONFIG_DEFAULTS["log_dir"], self.name)
            args += ['-p', 'log_dir', log_dir]
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        proc, details = self._launch_process(path, args, data['peer_type'],
                                             peer, env=self.env,
                                             capture_io=NO_STDIO)
        info_obj = {
            "path": path,
            "args": args,
            "peer": peer
        }
        if proc is not None:
            self.processes[peer] = proc
        else:
            self.logger.error("OBCI LAUNCH FAILED")
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("obci_launch_failed",
                                         machine=self.machine,
                                         path=info_obj['path'],
                                         args=info_obj['args'],
                                         details=details))
            self.processes = {}
            self.subprocess_mgr.killall(force=True)
        return proc, details, wait, info_obj

    def _launch_process(self, path, args, proc_type, name, env=None,
                        capture_io=NO_STDIO):
        """Spawn a local process via the subprocess monitor and publish the result.

        'launched_process_info' for config_server is held back until the server
        confirms its MX connection (see handle_config_server_ready).
        """
        self.logger.debug("launching..... %s %s", path, args)
        proc, details = self.subprocess_mgr.new_local_process(
            path, args,
            proc_type=proc_type,
            name=name,
            monitoring_optflags=RETURNCODE,
            capture_io=capture_io,
            env=env)
        if proc is None:
            self.logger.error("process launch FAILED: %s --- %s", path, str(args))
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("launch_error", sender=self.uuid,
                                         details=dict(machine=self.machine,
                                                      path=path, args=args,
                                                      error=details,
                                                      peer_id=name)))
        else:
            self.logger.info("process launch success:" + path + str(args) + str(proc.pid))
            msg = self.mtool.fill_msg("launched_process_info",
                                      sender=self.uuid,
                                      machine=self.machine,
                                      pid=proc.pid,
                                      proc_type=proc_type,
                                      name=name,
                                      path=path,
                                      args=args)
            if name == "config_server":
                self.__cfg_launch_info = msg
            else:
                send_msg(self._publish_socket, msg)
        return proc, details

    def _attach_base_config_path(self, launch_path, launch_args):
        """Insert the peer's default .ini path (derived from its script path)."""
        peer_id = launch_args[0]
        base = launch_path.rsplit('.', 1)[0]
        ini = '.'.join([base, 'ini'])
        return [peer_id, ini] + launch_args[1:]

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(self._publish_socket,
                 self.mtool.fill_msg("tail", txt=txt,
                                     sender=self.uuid,
                                     experiment_id=experiment_id,
                                     peer_id=peer))

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler('nearby_machines')
    def handle_nearby_machines(self, message, sock):
        self._nearby_machines.mass_update(message.nearby_machines)

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("_kill_peer")
    def handle_kill_peer(self, message, sock):
        proc = self.processes.get(message.peer_id, None)
        if proc is not None:
            # is on this machine
            if message.morph and message.peer_id == 'config_server':
                self.__cfg_morph = True
            proc.kill_with_force()

    @msg_handlers.handler("rq_ok")
    def handle_rq_ok(self, message, sock):
        self.rqs += 1
        # print "--> ", self.rqs
        if self.rqs == 10000:
            self.logger.debug("GOT %s %s", str(self.rqs), "messages!")
            self.rqs = 0

    @msg_handlers.handler("experiment_launch_error")
    def handle_experiment_launch_error(self, message, sock):
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            # Broadcast the death unless this peer is being deliberately
            # restarted and terminated cleanly.
            if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \
                    not (name in self.restarting and message.status[0] == 'terminated'):
                self.logger.info("KILLLING! sending obci_peer_"
                                 "dead for process %s", proc.name)
                send_msg(self._publish_socket,
                         self.mtool.fill_msg("obci_peer_dead",
                                             sender=self.uuid,
                                             sender_ip=self.machine,
                                             peer_id=proc.name,
                                             path=proc.path,
                                             status=proc.status()
                                             ))
            if name in self.restarting:
                self.restarting.remove(name)
            if self.__cfg_morph and name == 'config_server':
                self.__cfg_morph = False

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        self.logger.info("got! " + message.type)
        send_msg(self._publish_socket, message.SerializeToString())

    # BUGFIX: was also named handle_obci_peer_ready, silently shadowing the
    # method above (dispatch still worked via the decorator registry, but the
    # class attribute was clobbered).  Renamed to a unique name.
    @msg_handlers.handler("config_server_ready")
    def handle_config_server_ready(self, message, sock):
        # config_server successfully connected to MX, now send "launched_process_info"
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    # BUGFIX: was also named handle_obci_control_message (shadowing); renamed.
    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_peer_dead_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, message, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        self.processes = {}
        self.subprocess_mgr.killall(force=True)

    def clean_up(self):
        self.logger.info("cleaning up")
        self.processes = {}
        self.subprocess_mgr.killall(force=True)
        self.subprocess_mgr.delete_all()

    def _crash_extra_data(self, exception=None):
        data = super(OBCIProcessSupervisor, self)._crash_extra_data(exception)
        data.update({
            'experiment_uuid': self.experiment_uuid,
            'name': self.name
        })
        return data
class OBCIProcessSupervisor(OBCIControlPeer): msg_handlers = OBCIControlPeer.msg_handlers.copy() @log_crash def __init__(self, sandbox_dir, source_addresses=None, source_pub_addresses=None, rep_addresses=None, pub_addresses=None, experiment_uuid='', name='obci_process_supervisor'): self.peers = {} self.status = launcher_tools.READY_TO_LAUNCH self.source_pub_addresses = source_pub_addresses self.machine = socket.gethostname() self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR self.ctx = zmq.Context() self.mx_data = self.set_mx_data() self.env = self.peer_env(self.mx_data) self.launch_data = [] self.peer_order = [] self._running_peer_order = [] self._current_part = None self.__cfg_launch_info = None self.__cfg_morph = False self.experiment_uuid = experiment_uuid self.peers_to_launch = [] self.processes = {} self.restarting = [] self.rqs = 0 self._nearby_machines = net.DNS() self.test_count = 0 self.__cfg_lock = threading.RLock() super(OBCIProcessSupervisor, self).__init__( source_addresses=source_addresses, rep_addresses=rep_addresses, pub_addresses=pub_addresses, name=name) self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger) def peer_type(self): return "obci_process_supervisor" def net_init(self): self.source_sub_socket = self.ctx.socket(zmq.SUB) self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "") self._all_sockets.append(self.source_sub_socket) if self.source_pub_addresses: for addr in self.source_pub_addresses: self.source_sub_socket.connect(addr) (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL) # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "") self.cs_addr = net.choose_local(self.cs_addresses) if not self.cs_addr: self.cs_addr = net.choose_not_local(self.cs_addresses)[0] else: self.cs_addr = self.cs_addr[0] self._all_sockets.append(self.config_server_socket) super(OBCIProcessSupervisor, self).net_init() def params_for_registration(self): mx_data = None if None not 
in self.mx_data: mx_data = [self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]] return dict(pid=os.getpid(), machine=self.machine, mx_data=mx_data) def custom_sockets(self): return [self.source_sub_socket, self.config_server_socket] def _handle_registration_response(self, response): self.launch_data = response.params['launch_data'] self.peers_to_launch = list(self.launch_data.keys()) self.peer_order = response.params['peer_order'] for part in self.peer_order: self._running_peer_order.append(list(part)) self.logger.info("RECEIVED LAUNCH DATA: %s", self.launch_data) def set_mx_data(self): src_ = net.choose_not_local(self.source_pub_addresses)[:1] if not src_: src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1] src = src_[0] src = src[6:].split(':')[0] if src == socket.gethostname(): sock = self.ctx.socket(zmq.REP) port = str(sock.bind_to_random_port("tcp://127.0.0.1", min_port=settings.PORT_RANGE[0], max_port=settings.PORT_RANGE[1])) sock.close() return ('0.0.0.0', port), "" # empty passwd else: return None, None def mx_addr_str(self, mx_data): if mx_data[0] is None: return None addr, port = mx_data[0] self.logger.info("mx addr str: " + addr + ':' + str(port)) return addr + ':' + str(port) def peer_env(self, mx_data): if mx_data[0] is None: return None env = os.environ.copy() addr, port = mx_data[0] if addr == '0.0.0.0': addr = socket.gethostname() _env = { "MULTIPLEXER_ADDRESSES": str(addr) + ':' + str(port), "MULTIPLEXER_PASSWORD": '', # mx_data[1], "MULTIPLEXER_RULES": str(launcher_tools.mx_rules_path()) } env.update(_env) return env @msg_handlers.handler("start_mx") def handle_start_mx(self, message, sock): if 'mx' in self.launch_data and self.mx_data[0] is not None: self.logger.info("..starting multiplexer") self.peer_order.remove(['mx']) self.peers_to_launch.remove('mx') path = launcher_tools.mx_path() args = ['run_multiplexer', self.mx_addr_str( (('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])), 
                '--multiplexer-password', self.mx_data[1],
                '--rules', launcher_tools.mx_rules_path()]
            # tail of handle_start_mx (method header is above this chunk):
            # spawn the multiplexer and remember its process handle.
            proc, details = self._launch_process(path, args, 'multiplexer', 'mx', env=self.env)
            self.processes['mx'] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_config_server")
    def handle_start_config_srv(self, message, sock):
        """Adopt the remote mx address (if mx runs elsewhere) and launch config_server.

        A 1.5 s Timer acts as a fallback: if the config server does not call back
        ("config_server_ready"), the buffered launch info is published anyway.
        """
        if 'mx' not in self.launch_data:
            # mx runs on another machine -- rebuild self.mx_data from the
            # "host:port" string carried in the message.
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
        if "config_server" in self.launch_data:
            proc, details, wait, info_obj = \
                self.launch_process("config_server", self.launch_data["config_server"],
                                    restore_config=message.restore_config)
            tim = threading.Timer(1.5, self.__if_config_server_conn_didnt_work)
            tim.start()

    def __if_config_server_conn_didnt_work(self):
        """Timer fallback: publish buffered config_server launch info if still pending."""
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None
                self.logger.info("connection to config server is shaky :(")

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        """Launch all locally assigned peers (plus optional per-machine extras)."""
        self.logger.info("start peers -- my mx_data: %s, received mx_data: %s",
                         self.mx_data, message.mx_data)
        if 'mx' not in self.launch_data:
            # same remote-mx address adoption as in handle_start_config_srv
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
            # tmp.workarounds: wait for mx on other machine to initialize
            time.sleep(0.75)
        if message.add_launch_data:
            if self.machine in message.add_launch_data:
                self._launch_processes(message.add_launch_data[self.machine])
        else:
            self._launch_processes(self.launch_data)

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        """Kill the listed peers and start replacements (scenario morph).

        Peers present in both kill_peers and start_peers_data are marked as
        "restarting" so handle_dead_process does not report them as dead.
        """
        if not message.receiver == self.uuid:
            return
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                self.logger.error("peer to kill not found: %s", peer)
                continue
            self.logger.info("MORPH: KILLING %s ", peer)
            proc.kill_with_force()
            self.logger.info("MORPH: KILLED %s ", peer)
            del self.processes[peer]
            del self.launch_data[peer]
        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]
        self._launch_processes(message.start_peers_data)

    # NOTE(review): mutable default argument (restore_config=[]); it is only
    # read here, but a None-default would be the safer idiom.
    def _launch_processes(self, launch_data, restore_config=[]):
        """Launch every peer in launch_data ('amplifier' first, 'mx'/'config_server' skipped).

        Publishes "all_peers_launched" on success; stops at the first failed launch
        (launch_process already reports the failure).
        """
        proc, details, info_obj = None, None, None
        success = True
        path, args = None, None
        self.status = launcher_tools.LAUNCHING
        ldata = []
        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata and peer != 'config_server':
                ldata.append((peer, data))
        for peer, data in ldata:  # self.launch_data.iteritems():
            if peer.startswith('mx'):
                continue
            proc, details, wait, info_obj = self.launch_process(peer, data,
                                                               restore_config=restore_config)
            time.sleep(wait)
            if proc is None:
                success = False
                break
        if success:
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("all_peers_launched", machine=self.machine))

    def launch_process(self, peer, launch_data, restore_config=[]):
        """Prepare path/args/log_dir for one peer and launch it.

        Returns (proc, details, wait, info_obj); proc is None on failure, in which
        case "obci_launch_failed" is published and all subprocesses are killed.
        """
        data = launch_data
        wait = 0
        p = os.path.expanduser(data['path'])
        if not os.path.isabs(p):
            path = os.path.join(launcher_tools.obci_root(), p)
            path = os.path.abspath(path)
        else:
            path = os.path.realpath(p)
        args = data['args']
        # _attach_base_config_path returns a NEW list, so the '+=' below does not
        # mutate the list stored in launch_data.
        args = self._attach_base_config_path(path, args)
        args += ['-p', 'experiment_uuid', self.experiment_uuid]
        if peer.startswith('config_server'):
            args += ['-p', 'launcher_socket_addr', self.cs_addr]
            if restore_config:
                args += ['-p', 'restore_peers', ' '.join(restore_config)]
            # wait = 0.5
        if "log_dir" in args:
            # per-supervisor subdirectory under the configured log_dir
            idx = args.index("log_dir") + 1
            log_dir = args[idx]
            log_dir = os.path.join(log_dir, self.name)
            args[idx] = log_dir
        else:
            log_dir = os.path.join(CONFIG_DEFAULTS["log_dir"], self.name)
            args += ['-p', 'log_dir', log_dir]
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        proc, details = self._launch_process(path, args, data['peer_type'], peer,
                                             env=self.env, capture_io=NO_STDIO)
        info_obj = {
            "path": path,
            "args": args,
            "peer": peer
        }
        if proc is not None:
            self.processes[peer] = proc
        else:
            self.logger.error("OBCI LAUNCH FAILED")
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("obci_launch_failed",
                                         machine=self.machine, path=info_obj['path'],
                                         args=info_obj['args'], details=details))
            self.processes = {}
            self.subprocess_mgr.killall(force=True)
        return proc, details, wait, info_obj

    def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO):
        """Low-level spawn via SubprocessMonitor; publishes launch success/failure.

        For config_server the "launched_process_info" message is buffered in
        self.__cfg_launch_info instead of being published immediately -- it is
        released by handle_obci_peer_ready("config_server_ready") or by the
        fallback timer.
        """
        self.logger.debug("launching..... %s %s", path, args)
        proc, details = self.subprocess_mgr.new_local_process(path, args,
                                                              proc_type=proc_type, name=name,
                                                              monitoring_optflags=RETURNCODE,
                                                              capture_io=capture_io, env=env)
        if proc is None:
            self.logger.error("process launch FAILED: %s --- %s", path, str(args))
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("launch_error", sender=self.uuid,
                                         details=dict(machine=self.machine, path=path,
                                                      args=args, error=details, peer_id=name)))
        else:
            self.logger.info("process launch success:" + path + str(args) + str(proc.pid))
            msg = self.mtool.fill_msg("launched_process_info", sender=self.uuid,
                                      machine=self.machine,
                                      pid=proc.pid, proc_type=proc_type, name=name,
                                      path=path, args=args)
            if name == "config_server":
                self.__cfg_launch_info = msg
            else:
                send_msg(self._publish_socket, msg)
        return proc, details

    def _attach_base_config_path(self, launch_path, launch_args):
        """Return new args with '<launch_path base>.ini' inserted after the peer id."""
        peer_id = launch_args[0]
        base = launch_path.rsplit('.', 1)[0]
        ini = '.'.join([base, 'ini'])
        return [peer_id, ini] + launch_args[1:]

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        """Publish the last N stdout lines of a managed peer."""
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(self._publish_socket,
                 self.mtool.fill_msg("tail", txt=txt,
                                     sender=self.uuid,
                                     experiment_id=experiment_id,
                                     peer_id=peer))

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler('nearby_machines')
    def handle_nearby_machines(self, message, sock):
        # NOTE(review): self._nearby_machines is not assigned in this class's
        # __init__ -- presumably set by a base class; verify before relying on it.
        self._nearby_machines.mass_update(message.nearby_machines)

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("_kill_peer")
    def handle_kill_peer(self, message, sock):
        """Force-kill one local peer; flags __cfg_morph when config_server dies for a morph."""
        proc = self.processes.get(message.peer_id, None)
        if proc is not None:
            # is on this machine
            if message.morph and message.peer_id == 'config_server':
                self.__cfg_morph = True
            proc.kill_with_force()

    @msg_handlers.handler("rq_ok")
    def handle_rq_ok(self, message, sock):
        # throughput counter: log and reset every 10000 messages
        self.rqs += 1
        # print "--> ", self.rqs
        if self.rqs == 10000:
            self.logger.debug("GOT %s %s", str(self.rqs), "messages!")
            self.rqs = 0

    @msg_handlers.handler("experiment_launch_error")
    def handle_experiment_launch_error(self, message, sock):
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """React to a monitored subprocess dying.

        Publishes "obci_peer_dead" unless the peer is in self.restarting and was
        terminated on purpose; also clears restart/morph bookkeeping.
        """
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \
                    not (name in self.restarting and message.status[0] == 'terminated'):
                self.logger.info("KILLLING! sending obci_peer_"
                                 "dead for process %s", proc.name)
                send_msg(self._publish_socket,
                         self.mtool.fill_msg("obci_peer_dead", sender=self.uuid,
                                             sender_ip=self.machine,
                                             peer_id=proc.name,
                                             path=proc.path,
                                             status=proc.status()
                                             ))
            if name in self.restarting:
                self.restarting.remove(name)
            if self.__cfg_morph and name == 'config_server':
                self.__cfg_morph = False

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        # relay to subscribers unchanged
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        self.logger.info("got! " + message.type)
        send_msg(self._publish_socket, message.SerializeToString())

    # NOTE(review): this def reuses the name handle_obci_peer_ready and rebinds
    # the class attribute defined just above. Both handlers stay registered in
    # msg_handlers (the decorator runs at class-body execution time), but the
    # second method should be renamed (e.g. handle_config_server_ready).
    @msg_handlers.handler("config_server_ready")
    def handle_obci_peer_ready(self, message, sock):
        # config_server successfully connected to MX, now send "launched_process_info"
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    # NOTE(review): duplicate method name again -- shadows the
    # handle_obci_control_message defined directly above.
    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, messsage, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Drop process handles and force-kill subprocesses before sockets close."""
        self.processes = {}
        self.subprocess_mgr.killall(force=True)

    def clean_up(self):
        """Final cleanup: force-kill and forget all managed subprocesses."""
        self.logger.info("cleaning up")
        self.processes = {}
        self.subprocess_mgr.killall(force=True)
        self.subprocess_mgr.delete_all()

    def _crash_extra_data(self, exception=None):
        """Extend base crash-report data with this supervisor's experiment identity."""
        data = super(OBCIProcessSupervisor, self)._crash_extra_data(exception)
        data.update({
            'experiment_uuid': self.experiment_uuid,
            'name': self.name
        })
        return data
class OBCIControlPeer(object):
    """Base class for OBCI control peers: zmq REP/PUB endpoints, a publisher
    thread, a subprocess-monitor thread, registration with a source peer, and
    SIGTERM/SIGINT handling (sets self.interrupted)."""

    # shared registry of message-type -> handler, populated by @msg_handlers.handler
    msg_handlers = HandlerCollection()

    def __init__(self, source_addresses=None,
                 rep_addresses=None, pub_addresses=None, name='obci_control_peer'):
        ###TODO TODO TODO !!!!
        ###cleaner subclassing of obci_control_peer!!!
        self.hostname = socket.gethostname()
        self.source_addresses = source_addresses if source_addresses else []
        self.rep_addresses = rep_addresses
        self.pub_addresses = pub_addresses
        self._all_sockets = []
        # inproc endpoints wiring the publisher / subprocess-monitor threads
        self._pull_addr = 'inproc://publisher_msg'
        self._push_addr = 'inproc://publisher'
        self._subpr_push_addr = 'inproc://subprocess_info'
        self.uuid = str(uuid.uuid4())
        self.name = str(name)
        self.type = self.peer_type()
        log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR,
                               self.name + '-' + self.uuid[:8])
        # subclasses may have configured a logger already; don't clobber it
        if not hasattr(self, 'logger'):
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            self.logger = get_logger(self.peer_type(), log_dir=log_dir,
                                     stream_level=net_tools.peer_loglevel(),
                                     obci_peer=self)
        self.mtool = self.message_tool()
        if not hasattr(self, "ctx"):
            self.ctx = zmq.Context()
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
        self.net_init()
        if self.source_addresses:
            self.registration_response = self.register()
            self._handle_registration_response(self.registration_response)
        else:
            self.registration_response = None
        self.interrupted = False
        signal.signal(signal.SIGTERM, self.signal_handler())
        signal.signal(signal.SIGINT, self.signal_handler())

    def signal_handler(self):
        """Return a signal handler closure that marks this peer as interrupted."""
        def handler(signum, frame):
            self.logger.info("[!!!!] %s %s %s %s", self.name, "got signal", signum, frame)
            self.interrupted = True
        return handler

    def peer_type(self):
        return 'obci_control_peer'

    def message_tool(self):
        return OBCIMessageTool(message_templates)

    def _publisher_thread(self, pub_addrs, pull_address, push_addr):
        """Thread body: bind PUB + PULL, signal readiness, then forward every
        message pulled from pull_address out on the PUB socket until
        self._stop_publishing is set (or recv raises)."""
        #FIXME aaaaahhh pub_addresses are set here, not in the main thread
        # (which reads them in _register method)
        pub_sock, self.pub_addresses = self._init_socket(
            pub_addrs, zmq.PUB)
        pull_sock = self.ctx.socket(zmq.PULL)
        pull_sock.bind(pull_address)
        # readiness handshake: one message to the temporary PULL socket in
        # _prepare_publisher, which blocks until this arrives
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)
        send_msg(push_sock, u'1')
        po = PollingObject()
        while not self._stop_publishing:
            try:
                # 500 ms poll so _stop_publishing is re-checked regularly
                to_publish, det = po.poll_recv(pull_sock, 500)
                if to_publish:
                    send_msg(pub_sock, to_publish)
            except:
                # bare except: any recv/poll error (e.g. context terminated)
                # ends the thread
                #print self.name, '.Publisher -- STOP.'
                break
        # self.logger.info( "close sock %s %s", pub_addrs, pub_sock)
        pub_sock.close()
        pull_sock.close()
        push_sock.close()

    def _subprocess_info(self, push_addr):
        """Thread body: every 0.5 s push a 'dead_process' message for each
        subprocess the monitor reports as no longer running."""
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)
        # readiness handshake, consumed in _prepare_subprocess_info
        send_msg(push_sock, u'1')
        while not self._stop_monitoring:
            dead = self.subprocess_mgr.not_running_processes()
            if dead:
                # self.logger.warning("DEAD process" + str(dead))
                # key is (machine, pid)
                for key, status in dead.iteritems():
                    send_msg(push_sock, self.mtool.fill_msg('dead_process',
                                                            machine=key[0],
                                                            pid=key[1], status=status))
            time.sleep(0.5)
        push_sock.close()

    def _push_sock(self, ctx, addr):
        """Create a PUSH socket connected to addr."""
        sock = ctx.socket(zmq.PUSH)
        sock.connect(addr)
        return sock

    def _prepare_publisher(self):
        """Start the publisher thread and block until it signals readiness.

        NOTE(review): the args list passes self._push_addr as the thread's
        pull_address and self._pull_addr as its push_addr -- the names are
        swapped relative to _publisher_thread's parameters, but the wiring is
        consistent: the thread binds PULL on 'inproc://publisher' (where
        self._publish_socket connects) and pushes its ready-signal to
        'inproc://publisher_msg' (where tmp_pull is bound).
        """
        tmp_pull = self.ctx.socket(zmq.PULL)
        tmp_pull.bind(self._pull_addr)
        self.pub_thr = threading.Thread(target=self._publisher_thread,
                                        args=[self.pub_addresses,
                                              self._push_addr,
                                              self._pull_addr])
        self.pub_thr.daemon = True
        self._stop_publishing = False
        self.pub_thr.start()
        recv_msg(tmp_pull)  # wait for the thread's readiness message
        self._publish_socket = self._push_sock(self.ctx, self._push_addr)
        self._all_sockets.append(self._publish_socket)
        tmp_pull.close()

    def _prepare_subprocess_info(self):
        """Start the subprocess-monitor thread and block until it signals readiness."""
        self._subprocess_pull = self.ctx.socket(zmq.PULL)
        self._subprocess_pull.bind(self._subpr_push_addr)
        self.subprocess_thr = threading.Thread(target=self._subprocess_info,
                                               args=[self._subpr_push_addr])
        self.subprocess_thr.daemon = True
        self._stop_monitoring = False
        self.subprocess_thr.start()
        recv_msg(self._subprocess_pull)  # readiness handshake
        self._all_sockets.append(self._subprocess_pull)

    def net_init(self):
        """Create this peer's sockets: publisher + monitor threads, REP socket,
        and a REQ socket connected to any source addresses."""
        # (self.pub_socket, self.pub_addresses) = self._init_socket(
        #                                 self.pub_addresses, zmq.PUB)
        self._all_sockets = []
        self._prepare_publisher()
        self._prepare_subprocess_info()
        (self.rep_socket, self.rep_addresses) = self._init_socket(
            self.rep_addresses, zmq.REP)
        self.rep_socket.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.rep_socket)
        print "\n\tname: {0}\n\tpeer_type: {1}\n\tuuid: {2}\n".format(
            self.name, self.peer_type(), self.uuid)
        print "rep: {0}".format(self.rep_addresses)
        print "pub: {0}\n".format(self.pub_addresses)
        self.source_req_socket = self.ctx.socket(zmq.REQ)
        if self.source_addresses:
            for addr in self.source_addresses:
                self.source_req_socket.connect(addr)
        self._all_sockets.append(self.source_req_socket)
        self._set_poll_sockets()

    def _init_socket(self, addrs, zmq_type):
        """Bind a socket of zmq_type to addrs (random port on 'tcp://*' when no
        addrs given) and return (socket, advertised address list) with wildcard
        addresses expanded to hostname and localhost forms."""
        # print self.peer_type(), "addresses for socket init:", addrs
        addresses = addrs if addrs else ['tcp://*']
        random_port = True if not addrs else False
        sock = self.ctx.socket(zmq_type)
        port = None
        try:
            for i, addr in enumerate(addresses):
                if random_port and net.is_net_addr(addr):
                    port = str(sock.bind_to_random_port(addr,
                                                        min_port=settings.PORT_RANGE[0],
                                                        max_port=settings.PORT_RANGE[1]))
                    addresses[i] = addr + ':' + str(port)
                else:
                    sock.bind(addr)
        except Exception, e:
            self.logger.critical("CRITICAL error: %s", str(e))
            raise(e)
        advertised_addrs = []
        for addr in addresses:
            if addr.startswith('tcp://*'):
                port = addr.rsplit(':', 1)[1]
                advertised_addrs.append('tcp://' + socket.gethostname() + ':' + str(port))
                advertised_addrs.append('tcp://' + 'localhost:' + str(port))
            else:
                advertised_addrs.append(addr)
        return sock, advertised_addrs
# NOTE(review): this entire class is a verbatim second definition of
# OBCIControlPeer (an identical copy appears earlier in this file). At import
# time this later definition rebinds the name and shadows the first; one of
# the two copies should be removed.
class OBCIControlPeer(object):
    """Base class for OBCI control peers: zmq REP/PUB endpoints, a publisher
    thread, a subprocess-monitor thread, registration with a source peer, and
    SIGTERM/SIGINT handling (sets self.interrupted)."""

    msg_handlers = HandlerCollection()

    def __init__(self, source_addresses=None,
                 rep_addresses=None, pub_addresses=None, name='obci_control_peer'):
        # TODO TODO TODO !!!!
        # cleaner subclassing of obci_control_peer!!!
        self.hostname = socket.gethostname()
        self.source_addresses = source_addresses if source_addresses else []
        self.rep_addresses = rep_addresses
        self.pub_addresses = pub_addresses
        self._all_sockets = []
        # inproc endpoints for the helper threads
        self._pull_addr = 'inproc://publisher_msg'
        self._push_addr = 'inproc://publisher'
        self._subpr_push_addr = 'inproc://subprocess_info'
        self.uuid = str(uuid.uuid4())
        self.name = str(name)
        self.type = self.peer_type()
        log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR,
                               self.name + '-' + self.uuid[:8])
        if not hasattr(self, 'logger'):
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            self.logger = get_logger(self.peer_type(), log_dir=log_dir,
                                     stream_level=net_tools.peer_loglevel(),
                                     obci_peer=self)
        self.mtool = self.message_tool()
        if not hasattr(self, "ctx"):
            self.ctx = zmq.Context()
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
        self.net_init()
        if self.source_addresses:
            self.registration_response = self.register()
            self._handle_registration_response(self.registration_response)
        else:
            self.registration_response = None
        self.interrupted = False
        signal.signal(signal.SIGTERM, self.signal_handler())
        signal.signal(signal.SIGINT, self.signal_handler())

    def signal_handler(self):
        """Return a signal handler closure that marks this peer as interrupted."""
        def handler(signum, frame):
            self.logger.info("[!!!!] %s %s %s %s", self.name, "got signal", signum, frame)
            self.interrupted = True
        return handler

    def peer_type(self):
        return 'obci_control_peer'

    def message_tool(self):
        return OBCIMessageTool(message_templates)

    def _publisher_thread(self, pub_addrs, pull_address, push_addr):
        """Thread body: forward messages pulled from pull_address to the PUB socket."""
        # FIXME aaaaahhh pub_addresses are set here, not in the main thread
        # (which reads them in _register method)
        pub_sock, self.pub_addresses = self._init_socket(
            pub_addrs, zmq.PUB)
        pull_sock = self.ctx.socket(zmq.PULL)
        pull_sock.bind(pull_address)
        # readiness handshake with _prepare_publisher
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)
        send_msg(push_sock, u'1')
        po = PollingObject()
        while not self._stop_publishing:
            try:
                to_publish, det = po.poll_recv(pull_sock, 500)
                if to_publish:
                    send_msg(pub_sock, to_publish)
            except:
                # print self.name, '.Publisher -- STOP.'
                break
        # self.logger.info( "close sock %s %s", pub_addrs, pub_sock)
        pub_sock.close()
        pull_sock.close()
        push_sock.close()

    def _subprocess_info(self, push_addr):
        """Thread body: report dead subprocesses as 'dead_process' messages every 0.5 s."""
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)
        send_msg(push_sock, u'1')
        while not self._stop_monitoring:
            dead = self.subprocess_mgr.not_running_processes()
            if dead:
                # self.logger.warning("DEAD process" + str(dead))
                for key, status in dead.iteritems():
                    send_msg(push_sock, self.mtool.fill_msg('dead_process',
                                                            machine=key[0],
                                                            pid=key[1], status=status))
            time.sleep(0.5)
        push_sock.close()

    def _push_sock(self, ctx, addr):
        sock = ctx.socket(zmq.PUSH)
        sock.connect(addr)
        return sock

    def _prepare_publisher(self):
        """Start the publisher thread and wait for its readiness message."""
        tmp_pull = self.ctx.socket(zmq.PULL)
        tmp_pull.bind(self._pull_addr)
        self.pub_thr = threading.Thread(target=self._publisher_thread,
                                        args=[self.pub_addresses,
                                              self._push_addr,
                                              self._pull_addr])
        self.pub_thr.daemon = True
        self._stop_publishing = False
        self.pub_thr.start()
        recv_msg(tmp_pull)
        self._publish_socket = self._push_sock(self.ctx, self._push_addr)
        self._all_sockets.append(self._publish_socket)
        tmp_pull.close()

    def _prepare_subprocess_info(self):
        """Start the subprocess-monitor thread and wait for its readiness message."""
        self._subprocess_pull = self.ctx.socket(zmq.PULL)
        self._subprocess_pull.bind(self._subpr_push_addr)
        self.subprocess_thr = threading.Thread(target=self._subprocess_info,
                                               args=[self._subpr_push_addr])
        self.subprocess_thr.daemon = True
        self._stop_monitoring = False
        self.subprocess_thr.start()
        recv_msg(self._subprocess_pull)
        self._all_sockets.append(self._subprocess_pull)

    def net_init(self):
        """Create this peer's sockets and connect to any source addresses."""
        # (self.pub_socket, self.pub_addresses) = self._init_socket(
        #                                 self.pub_addresses, zmq.PUB)
        self._all_sockets = []
        self._prepare_publisher()
        self._prepare_subprocess_info()
        (self.rep_socket, self.rep_addresses) = self._init_socket(
            self.rep_addresses, zmq.REP)
        self.rep_socket.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.rep_socket)
        print "\n\tname: {0}\n\tpeer_type: {1}\n\tuuid: {2}\n".format(
            self.name, self.peer_type(), self.uuid)
        print "rep: {0}".format(self.rep_addresses)
        print "pub: {0}\n".format(self.pub_addresses)
        self.source_req_socket = self.ctx.socket(zmq.REQ)
        if self.source_addresses:
            for addr in self.source_addresses:
                self.source_req_socket.connect(addr)
        self._all_sockets.append(self.source_req_socket)
        self._set_poll_sockets()

    def _init_socket(self, addrs, zmq_type):
        """Bind to addrs (random port on 'tcp://*' when none given) and return
        (socket, advertised addresses) with wildcards expanded."""
        # print self.peer_type(), "addresses for socket init:", addrs
        addresses = addrs if addrs else ['tcp://*']
        random_port = True if not addrs else False
        sock = self.ctx.socket(zmq_type)
        port = None
        try:
            for i, addr in enumerate(addresses):
                if random_port and net.is_net_addr(addr):
                    port = str(sock.bind_to_random_port(addr,
                                                        min_port=settings.PORT_RANGE[0],
                                                        max_port=settings.PORT_RANGE[1]))
                    addresses[i] = addr + ':' + str(port)
                else:
                    sock.bind(addr)
        except Exception, e:
            self.logger.critical("CRITICAL error: %s", str(e))
            raise(e)
        advertised_addrs = []
        for addr in addresses:
            if addr.startswith('tcp://*'):
                port = addr.rsplit(':', 1)[1]
                advertised_addrs.append('tcp://' + socket.gethostname() + ':' + str(port))
                advertised_addrs.append('tcp://' + 'localhost:' + str(port))
            else:
                advertised_addrs.append(addr)
        return sock, advertised_addrs
class OBCIProcessSupervisor(OBCIControlPeer):
    """Per-machine supervisor: receives launch data on registration, starts the
    multiplexer / config server / peers as subprocesses, relays their lifecycle
    messages, and kills them on shutdown.

    NOTE(review): an alternative, logging-based implementation of many of these
    methods appears earlier in this file; this print-based variant looks like an
    older copy of the same class.
    """

    # own copy so handlers registered below do not leak into the base class
    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    def __init__(self, sandbox_dir,
                 source_addresses=None,
                 source_pub_addresses=None,
                 rep_addresses=None,
                 pub_addresses=None,
                 experiment_uuid='',
                 name='obci_process_supervisor'):
        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        # replaced by a dict in _handle_registration_response
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []
        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)

    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
        """Add SUB socket (source events) and a PULL socket for the config
        server before the base-class socket setup."""
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "")
        self._all_sockets.append(self.source_sub_socket)
        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)
        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")
        # prefer a non-local address to advertise to the config server
        self.cs_addr = net.choose_not_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]
        self._all_sockets.append(self.config_server_socket)
        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        """Registration payload: pid, machine, and advertised mx address/password."""
        return dict(pid=os.getpid(), machine=self.machine,
                    mx_data=[self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]),
                                               self.mx_data[1])), self.mx_data[1]])

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        """Store launch_data and peer_order received from the experiment."""
        self.launch_data = response.params['launch_data']
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params['peer_order']
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        print self.name, '[', self.type, ']', "RECEIVED LAUNCH DATA: ", self.launch_data

    def set_mx_data(self):
        """Decide whether mx runs on this machine.

        Returns (('0.0.0.0', free_port), '') when the chosen source address is
        this host (the port is probed by binding and closing a throwaway REP
        socket), else (None, None).
        """
        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        src = src[6:].split(':')[0]  # strip 'tcp://' prefix and port
        if src == socket.gethostname():
            sock = self.ctx.socket(zmq.REP)
            port = str(sock.bind_to_random_port("tcp://127.0.0.1",
                                                min_port=settings.PORT_RANGE[0],
                                                max_port=settings.PORT_RANGE[1]))
            sock.close()
            return ('0.0.0.0', port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        """Format mx_data's (addr, port) pair as 'addr:port'; None if no mx here."""
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        print self.name, '[', self.type, ']', "mx addr str", addr + ':' + str(port)
        return addr + ':' + str(port)

    def peer_env(self, mx_data):
        """Copy of os.environ extended with MULTIPLEXER_* variables; None if no mx."""
        if mx_data[0] is None:
            return None
        env = os.environ.copy()
        addr, port = mx_data[0]
        _env = {
            "MULTIPLEXER_ADDRESSES": socket.gethostname() + ':' + str(port),
            "MULTIPLEXER_PASSWORD": mx_data[1],
            "MULTIPLEXER_RULES": launcher_tools.mx_rules_path()
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        """Launch the multiplexer subprocess if assigned to this machine."""
        if 'mx' in self.launch_data and self.mx_data[0] is not None:
            print self.name, '[', self.type, ']', "..starting multiplexer"
            self.peer_order.remove(['mx'])
            self.peers_to_launch.remove('mx')
            path = launcher_tools.mx_path()
            args = ['run_multiplexer', self.mx_addr_str(
                (('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])),
                '--multiplexer-password', self.mx_data[1],
                '--rules', launcher_tools.mx_rules_path()]
            proc, details = self._launch_process(path, args, 'multiplexer', 'mx', env=self.env)
            self.processes['mx'] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        self._launch_processes(self.launch_data)

    def test(self):
        """Ad-hoc zmq throughput check -- presumably dev/debug-only code.

        NOTE(review): binds a SUB socket but never sets a SUBSCRIBE option on
        it, so recv_msg will not deliver anything -- verify before using.
        """
        # for i in range(SEND):
        #     send_msg(self.push, str(i))
        self.pull = self.ctx.socket(zmq.SUB)
        self.pull.bind('tcp://*:16789')
        received = 0
        prev = -1
        for i in range(SEND):
            msg = recv_msg(self.pull)
            if int(msg):
                # prev = int(msg)
                received += 1
                if received % 10000 == 0:
                    print "zmq: received ", received, "messages, last: ", msg
        if received == SEND:
            print "zmq: OK"
        else:
            print "WUT?", received
        # self.push.close()
        self.pull.close()

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        """Morph support: always restart config_server, kill listed peers,
        then relaunch with the surviving peers passed as restore_config.

        NOTE(review): mutates the incoming message's kill_peers /
        start_peers_data in place.
        """
        if not message.receiver == self.uuid:
            return
        message.kill_peers.append('config_server')
        message.start_peers_data['config_server'] = dict(self.launch_data['config_server'])
        # peers that keep running and whose state config_server should restore
        restore_config = [peer for peer in self.processes if peer not in message.kill_peers]
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                print self.name, '[', self.type, ']', "peer to kill not found:", peer
                continue
            print "MORPH: KILLING ", peer
            proc.kill()
            print "MORPH: KILLED ", peer
            del self.processes[peer]
            del self.launch_data[peer]
        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]
        self._launch_processes(message.start_peers_data, restore_config=restore_config)

    # NOTE(review): mutable default argument (restore_config=[]); only read here.
    def _launch_processes(self, launch_data, restore_config=[]):
        """Launch peers in order: config_server, amplifier, then the rest
        ('mx' skipped). Publishes all_peers_launched / obci_launch_failed.

        NOTE(review): `args = data['args']` followed by `args += [...]` mutates
        the list stored in launch_data in place, so repeated launches keep
        appending to the same args list (the newer variant of this method
        avoids this by building a fresh list first).
        """
        proc, details = None, None
        success = True
        path, args = None, None
        self.status = launcher_tools.LAUNCHING
        ldata = []
        if 'config_server' in launch_data:
            ldata.append(('config_server', launch_data['config_server']))
        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata:
                ldata.append((peer, data))
        for peer, data in ldata:  # self.launch_data.iteritems():
            wait = 0
            if peer.startswith('mx'):
                continue
            path = os.path.join(launcher_tools.obci_root(), data['path'])
            args = data['args']
            if peer.startswith('config_server'):
                args += ['-p', 'launcher_socket_addr', self.cs_addr]
                args += ['-p', 'experiment_uuid', self.experiment_uuid]
                if restore_config:
                    args += ['-p', 'restore_peers', ' '.join(restore_config)]
                # give config_server a head start before the other peers
                wait = 0.4
            proc, details = self._launch_process(path, args, data['peer_type'], peer,
                                                 env=self.env, capture_io=NO_STDIO)
            if proc is not None:
                self.processes[peer] = proc
            else:
                success = False
                break
            time.sleep(wait)
        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched",
                                                               machine=self.machine))
        else:
            print self.name, '[', self.type, ']', "OBCI LAUNCH FAILED"
            send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed",
                                                               machine=self.machine, path=path,
                                                               args=args, details=details))
            self.processes = {}
            self.subprocess_mgr.killall()

    def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO):
        """Low-level spawn via SubprocessMonitor; publishes launch success/failure."""
        proc, details = self.subprocess_mgr.new_local_process(path, args,
                                                              proc_type=proc_type, name=name,
                                                              monitoring_optflags=RETURNCODE,
                                                              capture_io=capture_io, env=env)
        if proc is None:
            print self.name, '[', self.type, ']', "process launch FAILED:", path, args
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("launch_error", sender=self.uuid,
                                         details=dict(machine=self.machine, path=path,
                                                      args=args, error=details)))
        else:
            print self.name, '[', self.type, ']', "process launch success:", path, args, proc.pid
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("launched_process_info", sender=self.uuid,
                                         machine=self.machine,
                                         pid=proc.pid, proc_type=proc_type,
                                         name=name, path=path, args=args))
        return proc, details

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        """Publish the last N stdout lines of a managed peer."""
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(self._publish_socket,
                 self.mtool.fill_msg("tail", txt=txt,
                                     sender=self.uuid,
                                     experiment_id=experiment_id,
                                     peer_id=peer))

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        self.subprocess_mgr.killall()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """Publish obci_peer_dead for an unexpectedly dead peer/mx process;
        deliberate terminations of restarting peers are suppressed."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            print '~~~~~ ~~~~~ ', name, self.restarting, message.status[0]
            if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \
                    not (name in self.restarting and message.status[0] == 'terminated'):
                print "KILLLLLING and sending obci_peer_dead", proc.name
                send_msg(self._publish_socket,
                         self.mtool.fill_msg("obci_peer_dead", sender=self.uuid,
                                             sender_ip=self.machine,
                                             peer_id=proc.name,
                                             path=proc.path,
                                             status=proc.status()
                                             ))
            if name in self.restarting:
                self.restarting.remove(name)

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        # relay to subscribers unchanged
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        print self.name, "got!", message.type
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    # NOTE(review): duplicate method name -- rebinds the
    # handle_obci_control_message defined directly above; both handlers remain
    # registered (the decorator runs at class-body execution time), but the
    # second def should be renamed.
    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, messsage, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Drop process handles before sockets close (killall intentionally commented out here)."""
        self.processes = {}
        #self.subprocess_mgr.killall()

    def clean_up(self):
        """Final cleanup: kill and forget all managed subprocesses."""
        print self.name, '[', self.type, ']', "cleaning up"
        self.processes = {}
        self.subprocess_mgr.killall()
        self.subprocess_mgr.delete_all()