Example #1
class OBCIServer(OBCIControlPeer):

    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, rep_addresses=None, pub_addresses=None, name='obci_server'):

        self.experiments = {}
        self.exp_process_supervisors = {}
        self._nearby_servers = net.DNS()
        super(OBCIServer, self).__init__(None, rep_addresses,
                                         pub_addresses,
                                         name)

        self.machine = socket.gethostname()

        self.rep_port = int(net.server_rep_port())
        self.pub_port = int(net.server_pub_port())
        bcast_port = int(net.server_bcast_port())
        self._nearby_servers.logger = self.logger
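        # daemon thread: broadcast this server's uuid and REP/PUB ports on the
        # broadcast port so other obci_server instances can discover it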
        self._bcast_server = threading.Thread(target=broadcast_server,
                                              args=[self.uuid,
                                                    self.rep_port, self.pub_port, bcast_port])
        self._bcast_server.daemon = True
        self._bcast_server.start()

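        # daemon thread: listen for broadcasts from other servers and keep the
        # nearby-servers registry (self._nearby_servers) up to date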
        self._nearby_updater = threading.Thread(target=update_nearby_servers,
                                                args=[self._nearby_servers,
                                                      bcast_port,
                                                      self.ctx,
                                                      self._push_addr])

        self._nearby_updater.daemon = True
        self._nearby_updater.start()
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def nearby_server_addrs(self):
        snap = self._nearby_servers.snapshot()
        return [srv.ip for srv in snap.values()]

    def nearby_servers(self):
        return self._nearby_servers.snapshot()

    def my_ip(self):
        addr = "127.0.1.1"
        try:
            addr = self._nearby_servers.this_addr_network()
        except Exception as e:
            self.logger.error(str(e))
        return addr

    def network_ready(self):
        # the network is ready once we know our real network IP
        return self.my_ip() != self.machine

    def handle_socket_read_error(self, socket, error):
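        """Reinitialise the main REP socket or the experiment REP socket after a read error."""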
        if socket == self.rep_socket:
            self.logger.warning("reinitialising REP socket")
            self._all_sockets.remove(self.rep_socket)
            if self.client_rq and socket in self.client_rq:
                self.client_rq = None
            self.rep_socket.close()  # linger=0)
            self.rep_socket = None
            time.sleep(0.2)
            (self.rep_socket, self.rep_addresses) = self._init_socket(
                ['tcp://*:' + str(self.rep_port)], zmq.REP)
            self.rep_socket.setsockopt(zmq.LINGER, 0)
            self._all_sockets.append(self.rep_socket)
            self.logger.info(self.rep_addresses)

        elif socket == self.exp_rep:
            self.logger.info("reinitialising EXPERIMENT REP socket")
            self._all_sockets.remove(self.exp_rep)
            self.exp_rep.close()  # linger=0)

            (self.exp_rep, self.exp_rep_addrs) = self._init_socket(
                self.exp_rep_addrs, zmq.REP)
            self.exp_rep.setsockopt(zmq.LINGER, 0)
            self._all_sockets.append(self.exp_rep)

    def peer_type(self):
        return 'obci_server'

    def net_init(self):
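        """Create the experiment REP socket and the TCP proxy, then run the base class net_init."""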

        (self.exp_rep, self.exp_rep_addrs) = self._init_socket(
            [], zmq.REP)
        # (self.exp_pub, self.exp_pub_addrs) = self._init_socket(
        #                                         [], zmq.PUB)
        # self.exp_pub.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.exp_rep)
        # self._all_sockets.append(self.exp_pub)
        tcp_port = int(net.server_tcp_proxy_port())

        self._tcp_proxy_thr, tcp_port = twisted_tcp_handling.run_twisted_server(
            ('0.0.0.0', tcp_port),
            self.ctx,
            self.rep_addresses[0])

        self.tcp_addresses = [(self.my_ip(), tcp_port),
                              (socket.gethostname(), tcp_port)]
        super(OBCIServer, self).net_init()

    def custom_sockets(self):
        return [self.exp_rep]  # , self.srv_rep, self.srv_pub]

    def clean_up(self):
        # self._tcp_srv.shutdown()
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        send_msg(self._publish_socket,  # self.exp_pub,
                 self.mtool.fill_msg("kill", receiver=""))
        send_msg(self._publish_socket, self.mtool.fill_msg("launcher_shutdown",
                                                           sender=self.uuid))
        for sup in self.exp_process_supervisors:
            self.exp_process_supervisors[sup].kill()
        self.logger.info('sent KILL to experiments')

    def _args_for_experiment(self, sandbox_dir, launch_file, local=False, name=None, overwrites=None):
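        """Build the command-line argument list for a new obci_experiment process."""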

        args = ['--sv-addresses']
        args += self.exp_rep_addrs
        args.append('--sv-pub-addresses')
        # if local:
        #     addrs = net.choose_local(self.exp_pub_addrs)
        # else:
        #     addrs = net.choose_not_local(self.exp_pub_addrs)
        addrs = net.choose_local(self.pub_addresses)  # self.exp_pub_addrs

        args += addrs
        exp_name = name if name else os.path.basename(launch_file)

        args += [
            '--sandbox-dir', str(sandbox_dir),
            '--launch-file', str(launch_file),
            '--name', exp_name,
            '--current-ip', self.my_ip()]
        if overwrites is not None:
            args += peer_cmd.peer_overwrites_cmd(overwrites)
        # print '{0} [{1}] -- experiment args: {2}'.format(self.name, self.peer_type(), args)
        return args

    def start_experiment_process(self, sandbox_dir, launch_file, name=None, overwrites=None):
        path = 'obci_experiment'
        args = self._args_for_experiment(sandbox_dir, launch_file,
                                         local=True, name=name, overwrites=overwrites)
        return self.subprocess_mgr.new_local_process(path, args,
                                                     proc_type='obci_experiment',
                                                     capture_io=NO_STDIO)

    def handle_register_experiment(self, message, sock):
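        """Store info about a newly registered experiment and notify the requesting client and PUB subscribers."""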
        machine, pid = message.other_params['origin_machine'], message.other_params['pid']
        status, det = message.other_params['status_name'], message.other_params['details']
        launch_file = message.other_params['launch_file_path']
        tcp_addr = message.other_params['tcp_addrs']

        exp_proc = self.subprocess_mgr.process(machine, pid)

        if exp_proc is None:
            send_msg(sock, self.mtool.fill_msg("rq_error", err_code="experiment_not_found"))
            return

        info = self.experiments[message.uuid] = ExperimentInfo(message.uuid,
                                                               message.name,
                                                               message.rep_addrs,
                                                               message.pub_addrs,
                                                               time.time(),
                                                               machine,
                                                               pid,
                                                               status,
                                                               det,
                                                               launch_file,
                                                               tcp_addr,
                                                               self._nearby_servers.this_addr_network())

        exp_proc.registered(info)
        # prepend this server's network-visible address so remote peers can
        # reach the experiment's REP and PUB sockets
        for addrs in [info.rep_addrs, info.pub_addrs]:
            port = net.port(addrs[0])
            addrs.insert(0, self._nearby_servers.this_addr_network() + ':' + str(port))

        info_msg = self.mtool.fill_msg("experiment_created",
                                       uuid=info.uuid,
                                       name=info.name,
                                       rep_addrs=info.rep_addrs,
                                       pub_addrs=info.pub_addrs,
                                       origin_machine=info.origin_machine,
                                       status_name=status,
                                       details=det,
                                       launch_file_path=launch_file,
                                       tcp_addrs=tcp_addr)

        if self.client_rq:
            msg_type = self.client_rq[0].type
            rq_sock = self.client_rq[1]
            if msg_type == "create_experiment":
                self.client_rq = None
                send_msg(rq_sock, info_msg)

        send_msg(sock, self.mtool.fill_msg("rq_ok", params=self._nearby_servers.dict_snapshot()))
        send_msg(self._publish_socket, info_msg)

    def _handle_register_experiment_timeout(self, exp):
        self.logger.error("New experiment process failed to "
                          "register before timeout" + str(exp.pid))

        if exp.returncode is None:
            exp.kill()
            exp.wait()

        # msg_type = self.client_rq[0].type
        rq_sock = self.client_rq[1]
        send_msg(rq_sock, self.mtool.fill_msg("rq_error",
                                              err_code="create_experiment_error",
                                              request=vars(self.client_rq[0])))

    @msg_handlers.handler("register_peer")
    def handle_register_peer(self, message, sock):
        """Register peer"""
        if message.peer_type == "obci_client":
            send_msg(sock, self.mtool.fill_msg("rq_ok"))
        elif message.peer_type == "obci_experiment":
            self.handle_register_experiment(message, sock)
        else:
            super(OBCIServer, self).handle_register_peer(message, sock)

    @msg_handlers.handler("create_experiment")
    def handle_create_experiment(self, message, sock):

        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return

        launch_file = message.launch_file
        sandbox = message.sandbox_dir
        name = message.name
        overwrites = message.overwrites

        sandbox = sandbox if sandbox else settings.DEFAULT_SANDBOX_DIR

        exp, details = self.start_experiment_process(
            sandbox, launch_file, name, overwrites)

        if exp is None:
            self.logger.error("failed to launch experiment "
                              "process, request: " + str(vars(message)))
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               request=vars(message),
                                               err_code='launch_error', details=details))
        else:
            self.logger.info("experiment process "
                             "launched:  {0}".format(exp.pid))
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                self.client_rq = (message, sock)

    @msg_handlers.handler("list_experiments")
    def handle_list_experiments(self, message, sock):
        exp_data = {}
        for exp_id in self.experiments:
            exp_data[exp_id] = self.experiments[exp_id].info()

        nearby = self.nearby_servers()
        nearby_dict = {}
        for srv in nearby.values():
            nearby_dict[srv.ip] = srv.hostname
        self.logger.debug("nearby servers: count: {0}, {1}".format(
            len(nearby), nearby_dict))
        send_msg(sock, self.mtool.fill_msg("running_experiments",
                                           exp_data=exp_data,
                                           nearby_machines=nearby_dict))

    @msg_handlers.handler("list_nearby_machines")
    def handle_list_nearby_machines(self, message, sock):
        send_msg(sock, self.mtool.fill_msg('nearby_machines',
                                           nearby_machines=self._nearby_servers.dict_snapshot()))

    def _handle_match_name(self, message, sock, this_machine=False):
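        """Resolve message.strname to a single experiment; send an error reply when there
        is no match or the name is ambiguous, and return the match (or None)."""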
        matches = self.exp_matching(message.strname)
        match = None
        msg = None
        if not matches:
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code='experiment_not_found')

        elif len(matches) > 1:
            matches = [(exp.uuid, exp.name) for exp in matches]
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code='ambiguous_exp_name',
                                      details=matches)
        else:
            match = matches.pop()
            if this_machine and match.origin_machine != self.machine:
                msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                          err_code='exp_not_on_this_machine', details=match.origin_machine)
                match = None
        if msg and sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, msg)
        return match

    @msg_handlers.handler("get_experiment_contact")
    def handle_get_experiment_contact(self, message, sock):
        self.logger.debug("##### rq contact for: %s", message.strname)

        info = self._handle_match_name(message, sock)
        if info:
            send_msg(sock, self.mtool.fill_msg("experiment_contact",
                                               uuid=info.uuid,
                                               name=info.name,
                                               rep_addrs=info.rep_addrs,
                                               pub_addrs=info.pub_addrs,
                                               tcp_addrs=info.tcp_addrs,
                                               machine=info.origin_machine,
                                               status_name=info.status_name,
                                               details=info.details))

    @msg_handlers.handler("experiment_status_change")
    def handle_experiment_status_change(self, message, sock):
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error', err_code='experiment_not_found'))
            return
        exp.status_name = message.status_name
        exp.details = message.details
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))

        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("experiment_info_change")
    def handle_experiment_info_change(self, message, sock):
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            self.logger.warning("UUID not found  " + message.uuid)
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error', err_code='experiment_not_found'))
            return
        exp.name = message.name
        exp.launch_file_path = message.launch_file_path
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        self.logger.info("INFO CHANGED %s", exp.launch_file_path)
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("experiment_transformation")
    def handle_experiment_transformation(self, message, sock):
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error', err_code='experiment_not_found'))
            return
        exp.status_name = message.status_name
        exp.details = message.details
        exp.launch_file_path = message.launch_file
        exp.name = message.name
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        send_msg(self._publish_socket, message.SerializeToString())

    def exp_matching(self, strname):
        """Match *strname* against all created experiment IDs and
        names. Return those experiment descriptions which name
        or uuid starts with strname.
        """
        match_names = {}
        for uid, exp in self.experiments.items():
            if exp.name.startswith(strname):
                match_names[uid] = exp

        ids = self.experiments.keys()
        match_ids = [uid for uid in ids if uid.startswith(strname)]

        experiments = set()
        for uid in match_ids:
            experiments.add(self.experiments[uid])
        for name, exp in match_names.items():
            experiments.add(exp)

        return experiments

    @msg_handlers.handler("kill_experiment")
    def handle_kill_experiment(self, message, sock):
        match = self._handle_match_name(message, sock, this_machine=True)

        if match:
            if match.kill_timer is not None:
                send_msg(sock, self.mtool.fill_msg("rq_error", err_code="already_killed",
                                                   details="Experiment already shutting down"))

            elif not message.force:
                self.logger.info("sending kill to experiment "
                                 "{0} ({1})".format(match.uuid, match.name))
                send_msg(self._publish_socket,  # self.exp_pub,
                         self.mtool.fill_msg("kill", receiver=match.uuid))

                send_msg(sock, self.mtool.fill_msg("kill_sent", experiment_id=match.uuid))
                pid = match.experiment_pid
                uid = match.uuid
                self.logger.info("Waiting for experiment process {0} to terminate".format(uid))
                match.kill_timer = threading.Timer(1.1,
                                                   self._handle_killing_exp, args=[pid, uid])
                match.kill_timer.start()
                send_msg(self._publish_socket, self.mtool.fill_msg('kill_sent',
                                                                   experiment_id=match.uuid
                                                                   ))

    def _handle_killing_exp(self, pid, uid):
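        """Force-kill the experiment process if it is still running and drop it from the registry."""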
        proc = self.subprocess_mgr.process(self.machine, pid)
        if proc.process_is_running():
            proc.kill()
        self.logger.info("experiment {0} FINISHED".format(uid))
        proc.delete = True
        del self.experiments[uid]

        return proc.popen_obj.returncode

    @msg_handlers.handler("launch_process")
    def handle_launch_process(self, message, sock):
        if message.proc_type == 'obci_process_supervisor':
            self._handle_launch_process_supervisor(message, sock)

    def _handle_launch_process_supervisor(self, message, sock):
        sv_obj, details = self._start_obci_supervisor_process(message)

        self.logger.info("LAUNCH PROCESS SV   " + str(sv_obj) + str(details))
        if sv_obj:
            self.exp_process_supervisors[message.sender] = sv_obj
            send_msg(sock,
                     self.mtool.fill_msg("launched_process_info",
                                         sender=self.uuid, machine=self.machine,
                                         pid=sv_obj.pid, proc_type=sv_obj.proc_type,
                                         name=sv_obj.name,
                                         path=sv_obj.path))
            self.logger.info("CONFIRMED LAUNCH")
        else:
            send_msg(sock, self.mtool.fill_msg('rq_error', request=message.dict(),
                                               err_code="launch_error",
                                               details=details))
            self.logger.error("PROCESS SUPERVISOR LAUNCH FAILURE")

    @msg_handlers.handler("kill_process")
    def handle_kill_process_supervisor(self, message, sock):
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if not proc:
            send_msg(sock, self.mtool.fill_msg("rq_error", err_code="process_not_found"))
        else:
            # TODO
            # name = proc.name
            proc.kill()
            proc.mark_delete()
            send_msg(sock, self.mtool.fill_msg("rq_ok"))
            del self.exp_process_supervisors[proc.name]

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            status, details = proc.status()
            self.logger.warning("Process " + proc.proc_type + " dead: " +
                                status + str(details) + proc.name + str(proc.pid))
            if proc.proc_type == 'obci_process_supervisor':
                pass
            elif proc.proc_type == 'obci_experiment':
                pass
            if status == subprocess_monitor.FAILED:
                pass

    @msg_handlers.handler("find_eeg_experiments")
    def handle_find_eeg_experiments(self, message, sock):

        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return

        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        finder_thr = threading.Thread(target=find_eeg_experiments_and_push_results,
                                      args=[self.ctx, self.rep_addresses,
                                            message,
                                            self._nearby_servers.copy()])
        finder_thr.daemon = True
        finder_thr.start()

    @msg_handlers.handler("find_eeg_amplifiers")
    def handle_find_new_eeg_amplifiers(self, message, sock):
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return

        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        amp_thr = threading.Thread(target=find_new_experiments_and_push_results,
                                   args=[self.ctx,
                                         message])
        amp_thr.daemon = True
        amp_thr.start()

    @msg_handlers.handler("start_eeg_signal")
    def handle_start_eeg_signal(self, message, sock):
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        start_thr = threading.Thread(target=start_eeg_signal_experiment,
                                     args=[self.ctx, self.rep_addresses,
                                           message])
        start_thr.daemon = True
        start_thr.start()

    def _start_obci_supervisor_process(self, rq_message):
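        """Spawn an obci_process_supervisor using parameters taken from the request message."""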
        path = obci_process_supervisor.__file__
        path = '.'.join([path.rsplit('.', 1)[0], 'py'])
        start_params = rq_message.dict()
        start_params['path'] = path
        del start_params['type']
        del start_params['sender']
        del start_params['sender_ip']
        del start_params['receiver']
        sv_obj, details = self.subprocess_mgr.new_local_process(**start_params)
        if sv_obj is None:
            return None, details

        return sv_obj, False

    def _crash_extra_data(self, exception=None):
        data = super(OBCIServer, self)._crash_extra_data(exception)
        data.update({
            'experiments': [e.info() for e in self.experiments.values()]
        })
        return data
Example #2
class OBCIProcessSupervisor(OBCIControlPeer):

    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    def __init__(
        self,
        sandbox_dir,
        source_addresses=None,
        source_pub_addresses=None,
        rep_addresses=None,
        pub_addresses=None,
        experiment_uuid="",
        name="obci_process_supervisor",
    ):

        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []

        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses, rep_addresses=rep_addresses, pub_addresses=pub_addresses, name=name
        )
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)

    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "")

        self._all_sockets.append(self.source_sub_socket)

        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)

        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")

        self.cs_addr = net.choose_not_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]

        self._all_sockets.append(self.config_server_socket)

        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        return dict(
            pid=os.getpid(),
            machine=self.machine,
            mx_data=[self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]],
        )

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        self.launch_data = response.params["launch_data"]
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params["peer_order"]
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        print self.name, "[", self.type, "]", "RECEIVED LAUNCH DATA: ", self.launch_data

    def set_mx_data(self):
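        """Reserve a free local port for the multiplexer when its source address points at
        this host; return (None, None) when the multiplexer runs elsewhere."""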

        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        src = src[6:].split(":")[0]

        if src == socket.gethostname():
            sock = self.ctx.socket(zmq.REP)
            port = str(
                sock.bind_to_random_port(
                    "tcp://127.0.0.1", min_port=settings.PORT_RANGE[0], max_port=settings.PORT_RANGE[1]
                )
            )
            sock.close()
            return ("0.0.0.0", port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        print self.name, "[", self.type, "]", "mx addr str", addr + ":" + str(port)
        return addr + ":" + str(port)

    def peer_env(self, mx_data):
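        """Return a copy of os.environ extended with MULTIPLEXER_* variables, or None when there is no local multiplexer."""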

        if mx_data[0] is None:
            return None

        env = os.environ.copy()
        addr, port = mx_data[0]

        _env = {
            "MULTIPLEXER_ADDRESSES": socket.gethostname() + ":" + str(port),
            "MULTIPLEXER_PASSWORD": mx_data[1],
            "MULTIPLEXER_RULES": launcher_tools.mx_rules_path(),
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        if "mx" in self.launch_data and self.mx_data[0] is not None:
            print self.name, "[", self.type, "]", "..starting multiplexer"
            self.peer_order.remove(["mx"])
            self.peers_to_launch.remove("mx")
            path = launcher_tools.mx_path()

            args = [
                "run_multiplexer",
                self.mx_addr_str((("0.0.0.0", self.mx_data[0][1]), self.mx_data[1])),
                "--multiplexer-password",
                self.mx_data[1],
                "--rules",
                launcher_tools.mx_rules_path(),
            ]
            proc, details = self._launch_process(path, args, "multiplexer", "mx", env=self.env)
            self.processes["mx"] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        self._launch_processes(self.launch_data)

    def test(self):
        # for i in range(SEND):
        #     send_msg(self.push, str(i))
        self.pull = self.ctx.socket(zmq.SUB)
        self.pull.bind("tcp://*:16789")

        received = 0
        prev = -1
        for i in range(SEND):
            msg = recv_msg(self.pull)
            if int(msg):
                # prev = int(msg)
                received += 1
            if received % 10000 == 0:
                print "zmq: received ", received, "messages, last: ", msg

        if received == SEND:
            print "zmq: OK"
        else:
            print "WUT?", received
        # self.push.close()
        self.pull.close()

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        if not message.receiver == self.uuid:
            return
        message.kill_peers.append("config_server")

        message.start_peers_data["config_server"] = dict(self.launch_data["config_server"])
        restore_config = [peer for peer in self.processes if peer not in message.kill_peers]
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                print self.name, "[", self.type, "]", "peer to kill not found:", peer
                continue
            print "MORPH:  KILLING ", peer
            proc.kill()
            print "MORPH:  KILLED ", peer
            del self.processes[peer]
            del self.launch_data[peer]

        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]

        self._launch_processes(message.start_peers_data, restore_config=restore_config)

    def _launch_processes(self, launch_data, restore_config=[]):
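        """Launch config_server and amplifier first, then the remaining peers (mx is skipped);
        publish all_peers_launched on success or obci_launch_failed on failure."""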
        proc, details = None, None
        success = True
        path, args = None, None

        self.status = launcher_tools.LAUNCHING

        ldata = []
        if "config_server" in launch_data:
            ldata.append(("config_server", launch_data["config_server"]))
        if "amplifier" in launch_data:
            ldata.append(("amplifier", launch_data["amplifier"]))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata:
                ldata.append((peer, data))

        for peer, data in ldata:  # self.launch_data.iteritems():
            wait = 0
            if peer.startswith("mx"):
                continue
            path = os.path.join(launcher_tools.obci_root(), data["path"])
            args = data["args"]
            if peer.startswith("config_server"):
                args += ["-p", "launcher_socket_addr", self.cs_addr]
                args += ["-p", "experiment_uuid", self.experiment_uuid]

                if restore_config:
                    args += ["-p", "restore_peers", " ".join(restore_config)]
                wait = 0.4
            proc, details = self._launch_process(path, args, data["peer_type"], peer, env=self.env, capture_io=NO_STDIO)
            if proc is not None:
                self.processes[peer] = proc
            else:
                success = False
                break
            time.sleep(wait)
        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched", machine=self.machine))
        else:
            print self.name, "[", self.type, "]", "OBCI LAUNCH FAILED"
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg("obci_launch_failed", machine=self.machine, path=path, args=args, details=details),
            )
            self.processes = {}
            self.subprocess_mgr.killall()

    def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO):
        proc, details = self.subprocess_mgr.new_local_process(
            path, args, proc_type=proc_type, name=name, monitoring_optflags=RETURNCODE, capture_io=capture_io, env=env
        )
        if proc is None:
            print self.name, "[", self.type, "]", "process launch FAILED:", path, args
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg(
                    "launch_error",
                    sender=self.uuid,
                    details=dict(machine=self.machine, path=path, args=args, error=details),
                ),
            )
        else:
            print self.name, "[", self.type, "]", "process launch success:", path, args, proc.pid
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg(
                    "launched_process_info",
                    sender=self.uuid,
                    machine=self.machine,
                    pid=proc.pid,
                    proc_type=proc_type,
                    name=name,
                    path=path,
                    args=args,
                ),
            )
        return proc, details

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]["experiment_id"]
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(
            self._publish_socket,
            self.mtool.fill_msg("tail", txt=txt, sender=self.uuid, experiment_id=experiment_id, peer_id=peer),
        )

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):

        self.subprocess_mgr.killall()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            print "~~~~~   ~~~~~   ", name, self.restarting, message.status[0]

            if (proc.proc_type == "obci_peer" or proc.proc_type == "multiplexer") and not (
                name in self.restarting and message.status[0] == "terminated"
            ):
                print "KILLLLLING     and sending obci_peer_dead", proc.name
                send_msg(
                    self._publish_socket,
                    self.mtool.fill_msg(
                        "obci_peer_dead",
                        sender=self.uuid,
                        sender_ip=self.machine,
                        peer_id=proc.name,
                        path=proc.path,
                        status=proc.status(),
                    ),
                )
            if name in self.restarting:
                self.restarting.remove(name)

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        print self.name, "got!", message.type
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_peer_dead(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, message, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        self.processes = {}
        # self.subprocess_mgr.killall()

    def clean_up(self):
        print self.name, "[", self.type, "]", "cleaning up"

        self.processes = {}
        self.subprocess_mgr.killall()
        self.subprocess_mgr.delete_all()
Example #3
class OBCIProcessSupervisor(OBCIControlPeer):
    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, sandbox_dir,
                 source_addresses=None,
                 source_pub_addresses=None,
                 rep_addresses=None,
                 pub_addresses=None,
                 experiment_uuid='',
                 name='obci_process_supervisor'):

        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.__cfg_launch_info = None
        self.__cfg_morph = False
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []
        self.rqs = 0
        self._nearby_machines = net.DNS()

        self.test_count = 0
        self.__cfg_lock = threading.RLock()

        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "")

        self._all_sockets.append(self.source_sub_socket)

        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)

        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")

        self.cs_addr = net.choose_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_not_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]

        self._all_sockets.append(self.config_server_socket)

        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        mx_data = None
        if None not in self.mx_data:
            mx_data = [self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]]
        return dict(pid=os.getpid(), machine=self.machine,
                    mx_data=mx_data)

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        self.launch_data = response.params['launch_data']
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params['peer_order']
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        self.logger.info("RECEIVED LAUNCH DATA: %s", self.launch_data)

    def set_mx_data(self):
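        """Reserve a free local port for the multiplexer when its source address points at
        this host; return (None, None) when the multiplexer runs elsewhere."""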

        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        src = src[6:].split(':')[0]

        if src == socket.gethostname():
            sock = self.ctx.socket(zmq.REP)
            port = str(sock.bind_to_random_port("tcp://127.0.0.1",
                                                min_port=settings.PORT_RANGE[0],
                                                max_port=settings.PORT_RANGE[1]))
            sock.close()
            return ('0.0.0.0', port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        self.logger.info("mx addr str:  " + addr + ':' + str(port))
        return addr + ':' + str(port)

    def peer_env(self, mx_data):

        if mx_data[0] is None:
            return None

        env = os.environ.copy()
        addr, port = mx_data[0]
        if addr == '0.0.0.0':
            addr = socket.gethostname()

        _env = {
            "MULTIPLEXER_ADDRESSES": str(addr) + ':' + str(port),
            "MULTIPLEXER_PASSWORD": '',  # mx_data[1],
            "MULTIPLEXER_RULES": str(launcher_tools.mx_rules_path())
        }

        env.update(_env)
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        if 'mx' in self.launch_data and self.mx_data[0] is not None:
            self.logger.info("..starting multiplexer")
            self.peer_order.remove(['mx'])
            self.peers_to_launch.remove('mx')
            path = launcher_tools.mx_path()

            args = ['run_multiplexer',
                    self.mx_addr_str((('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])),
                    '--multiplexer-password', self.mx_data[1],
                    '--rules', launcher_tools.mx_rules_path()]
            proc, details = self._launch_process(path, args, 'multiplexer', 'mx',
                                                 env=self.env)
            self.processes['mx'] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_config_server")
    def handle_start_config_srv(self, message, sock):
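        """Adopt the multiplexer address announced in the message when mx runs elsewhere,
        launch config_server, and start a timer that publishes its launch info if the
        config server never confirms its MX connection."""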
        if 'mx' not in self.launch_data:
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
        if "config_server" in self.launch_data:
            proc, details, wait, info_obj = \
                self.launch_process("config_server", self.launch_data["config_server"],
                                    restore_config=message.restore_config)
            tim = threading.Timer(1.5, self.__if_config_server_conn_didnt_work)
            tim.start()

    def __if_config_server_conn_didnt_work(self):
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None
                self.logger.info("connection to config server is shaky :(")

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        self.logger.info("start peers --  my mx_data: %s, received mx_data: %s",
                         self.mx_data, message.mx_data)
        if 'mx' not in self.launch_data:
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
            # temporary workaround: wait for mx on the other machine to initialize
            time.sleep(0.75)

        if message.add_launch_data:
            if self.machine in message.add_launch_data:
                self._launch_processes(message.add_launch_data[self.machine])
        else:
            self._launch_processes(self.launch_data)

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        if not message.receiver == self.uuid:
            return

        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                self.logger.error("peer to kill not found: %s", peer)
                continue
            self.logger.info("MORPH:  KILLING %s ", peer)
            proc.kill_with_force()
            self.logger.info("MORPH:  KILLED %s ", peer)
            del self.processes[peer]
            del self.launch_data[peer]

        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]

        self._launch_processes(message.start_peers_data)

    def _launch_processes(self, launch_data, restore_config=[]):
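        """Launch the amplifier first, then the remaining peers; mx and config_server are
        handled by separate messages."""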
        proc, details, info_obj = None, None, None
        success = True
        path, args = None, None

        self.status = launcher_tools.LAUNCHING

        ldata = []

        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata and peer != 'config_server':
                ldata.append((peer, data))

        for peer, data in ldata:  # self.launch_data.iteritems():
            if peer.startswith('mx'):
                continue
            proc, details, wait, info_obj = self.launch_process(peer, data, restore_config=restore_config)
            time.sleep(wait)
            if proc is None:
                success = False
                break

        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched",
                                                               machine=self.machine))

    def launch_process(self, peer, launch_data, restore_config=[]):
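        """Resolve the peer's path, arguments and log directory, then start its process;
        return (proc, details, wait, info_obj)."""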
        data = launch_data
        wait = 0
        p = os.path.expanduser(data['path'])
        if not os.path.isabs(p):
            path = os.path.join(launcher_tools.obci_root(), p)
            path = os.path.abspath(path)
        else:
            path = os.path.realpath(p)

        args = data['args']
        args = self._attach_base_config_path(path, args)
        args += ['-p', 'experiment_uuid', self.experiment_uuid]
        if peer.startswith('config_server'):
            args += ['-p', 'launcher_socket_addr', self.cs_addr]

            if restore_config:
                args += ['-p', 'restore_peers', ' '.join(restore_config)]
            # wait = 0.5
        if "log_dir" in args:
            idx = args.index("log_dir") + 1
            log_dir = args[idx]
            log_dir = os.path.join(log_dir, self.name)
            args[idx] = log_dir
        else:
            log_dir = os.path.join(CONFIG_DEFAULTS["log_dir"], self.name)
            args += ['-p', 'log_dir', log_dir]
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        proc, details = self._launch_process(path, args, data['peer_type'],
                                             peer, env=self.env, capture_io=NO_STDIO)
        info_obj = {
            "path": path,
            "args": args,
            "peer": peer
        }
        if proc is not None:
            self.processes[peer] = proc
        else:
            self.logger.error("OBCI LAUNCH FAILED")
            send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed",
                                                               machine=self.machine, path=info_obj['path'],
                                                               args=info_obj['args'], details=details))
            self.processes = {}
            self.subprocess_mgr.killall(force=True)

        return proc, details, wait, info_obj

    def _launch_process(self, path, args, proc_type, name,
                        env=None, capture_io=NO_STDIO):
        self.logger.debug("launching..... %s %s", path, args)
        proc, details = self.subprocess_mgr.new_local_process(path, args,
                                                              proc_type=proc_type,
                                                              name=name,
                                                              monitoring_optflags=RETURNCODE,
                                                              capture_io=capture_io,
                                                              env=env)

        if proc is None:
            self.logger.error("process launch FAILED: %s --- %s",
                              path, str(args))
            send_msg(self._publish_socket, self.mtool.fill_msg("launch_error",
                                                               sender=self.uuid,
                                                               details=dict(machine=self.machine, path=path, args=args,
                                                                            error=details, peer_id=name)))
        else:
            self.logger.info("process launch success:" +
                             path + str(args) + str(proc.pid))
            msg = self.mtool.fill_msg("launched_process_info",
                                      sender=self.uuid,
                                      machine=self.machine,
                                      pid=proc.pid,
                                      proc_type=proc_type, name=name,
                                      path=path,
                                      args=args)
            if name == "config_server":
                self.__cfg_launch_info = msg
            else:
                send_msg(self._publish_socket, msg)
        return proc, details

    def _attach_base_config_path(self, launch_path, launch_args):
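        """Insert the peer's default .ini path (derived from the launch path) right after
        the peer id in the argument list."""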
        peer_id = launch_args[0]
        base = launch_path.rsplit('.', 1)[0]
        ini = '.'.join([base, 'ini'])
        return [peer_id, ini] + launch_args[1:]

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(self._publish_socket, self.mtool.fill_msg("tail", txt=txt,
                                                           sender=self.uuid,
                                                           experiment_id=experiment_id,
                                                           peer_id=peer))

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler('nearby_machines')
    def handle_nearby_machines(self, message, sock):
        self._nearby_machines.mass_update(message.nearby_machines)

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("_kill_peer")
    def handle_kill_peer(self, message, sock):
        proc = self.processes.get(message.peer_id, None)

        if proc is not None:  # is on this machine
            if message.morph and message.peer_id == 'config_server':
                self.__cfg_morph = True
            proc.kill_with_force()

    @msg_handlers.handler("rq_ok")
    def handle_rq_ok(self, message, sock):
        self.rqs += 1
        if self.rqs == 10000:
            self.logger.debug("GOT %d messages!", self.rqs)
            self.rqs = 0

    @msg_handlers.handler("experiment_launch_error")
    def handle_experiment_launch_error(self, message, sock):
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \
                    not (name in self.restarting and message.status[0] == 'terminated'):
                self.logger.info("KILLLING! sending obci_peer_"
                                 "dead for process %s", proc.name)
                send_msg(self._publish_socket, self.mtool.fill_msg("obci_peer_dead",
                                                                   sender=self.uuid,
                                                                   sender_ip=self.machine,
                                                                   peer_id=proc.name,
                                                                   path=proc.path,
                                                                   status=proc.status()
                                                                   ))
            if name in self.restarting:
                self.restarting.remove(name)
            if self.__cfg_morph and name == 'config_server':
                self.__cfg_morph = False

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        self.logger.info("got! " + message.type)
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("config_server_ready")
    def handle_config_server_ready(self, message, sock):
        # config_server successfully connected to MX, now send "launched_process_info"
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_peer_dead(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, message, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        self.processes = {}
        self.subprocess_mgr.killall(force=True)

    def clean_up(self):
        self.logger.info("cleaning up")

        self.processes = {}
        self.subprocess_mgr.killall(force=True)
        self.subprocess_mgr.delete_all()

    def _crash_extra_data(self, exception=None):
        data = super(OBCIProcessSupervisor, self)._crash_extra_data(exception)
        data.update({
            'experiment_uuid': self.experiment_uuid,
            'name': self.name
        })
        return data
Example #4
class OBCIProcessSupervisor(OBCIControlPeer):
    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, sandbox_dir,
                 source_addresses=None,
                 source_pub_addresses=None,
                 rep_addresses=None,
                 pub_addresses=None,
                 experiment_uuid='',
                 name='obci_process_supervisor'):

        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.__cfg_launch_info = None
        self.__cfg_morph = False
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []
        self.rqs = 0
        self._nearby_machines = net.DNS()

        self.test_count = 0
        self.__cfg_lock = threading.RLock()

        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
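        # SUB socket: listen to everything the source publishers send.
        # PULL socket: its address (cs_addr, preferring a local one) is handed
        # to config_server at launch as launcher_socket_addr.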
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt_string(zmq.SUBSCRIBE, "")

        self._all_sockets.append(self.source_sub_socket)

        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)

        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")

        self.cs_addr = net.choose_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_not_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]

        self._all_sockets.append(self.config_server_socket)

        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
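        # Registration payload: pid, hostname and, when the multiplexer is
        # hosted on this machine, its address and password data.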
        mx_data = None
        if None not in self.mx_data:
            mx_data = [
                self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])),
                self.mx_data[1]
            ]
        return dict(pid=os.getpid(), machine=self.machine,
                    mx_data=mx_data)

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
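        # Cache the launch data and peer ordering delivered in the registration
        # response.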
        self.launch_data = response.params['launch_data']
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params['peer_order']
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        self.logger.info("RECEIVED LAUNCH DATA: %s", self.launch_data)

    def set_mx_data(self):
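        # Decide where the multiplexer lives: if the chosen source address points
        # at this host, reserve a free local port for it and return
        # (('0.0.0.0', port), '') with an empty password; otherwise (None, None).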

        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        src = src[6:].split(':')[0]

        if src == socket.gethostname():
            sock = self.ctx.socket(zmq.REP)
            port = str(sock.bind_to_random_port("tcp://127.0.0.1",
                                                min_port=settings.PORT_RANGE[0],
                                                max_port=settings.PORT_RANGE[1]))
            sock.close()
            return ('0.0.0.0', port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        self.logger.info("mx addr str:  " + addr + ':' + str(port))
        return addr + ':' + str(port)

    def peer_env(self, mx_data):
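        # Copy os.environ and point MULTIPLEXER_ADDRESSES at the multiplexer
        # host:port, replacing the 0.0.0.0 wildcard with this machine's hostname.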
        if mx_data[0] is None:
            return None

        env = os.environ.copy()
        addr, port = mx_data[0]
        if addr == '0.0.0.0':
            addr = socket.gethostname()

        _env = {
            "MULTIPLEXER_ADDRESSES": str(addr) + ':' + str(port)
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_broker")
    def handle_start_broker(self, message, sock):
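        # Launch the multiplexer locally only when it is in our launch data and
        # set_mx_data() reserved a port for it on this host.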
        if 'mx' in self.launch_data and self.mx_data[0] is not None:
            self.logger.info("..starting multiplexer")
            self.peer_order.remove(['mx'])
            self.peers_to_launch.remove('mx')
            path = launcher_tools.broker_path()

            args = [
                'run_multiplexer',
                self.mx_addr_str((('0.0.0.0', self.mx_data[0][1]), self.mx_data[1]))
            ]
            proc, details = self._launch_process(path, args, 'multiplexer', 'mx',
                                                 env=self.env)
            self.processes['mx'] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_config_server")
    def handle_start_config_srv(self, message, sock):
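        # When the multiplexer runs on another machine, adopt its address from
        # the message; then launch config_server and arm a timer that publishes
        # the buffered launch info if config_server never reports readiness.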
        if 'mx' not in self.launch_data:
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
        if "config_server" in self.launch_data:
            proc, details, wait, info_obj = \
                self.launch_process("config_server", self.launch_data["config_server"],
                                    restore_config=message.restore_config)
            tim = threading.Timer(1.5, self.__if_config_server_conn_didnt_work)
            tim.start()

    def __if_config_server_conn_didnt_work(self):
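        # Timer fallback: if "config_server_ready" has not arrived yet, publish
        # the buffered launched_process_info anyway.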
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None
                self.logger.info("connection to config server is shaky :(")

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
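        # Adopt the remote multiplexer address if needed, then launch either the
        # peers assigned to this machine (add_launch_data) or the full launch data.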
        self.logger.info("start peers --  my mx_data: %s, received mx_data: %s",
                         self.mx_data, message.mx_data)
        if 'mx' not in self.launch_data:
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
            # temporary workaround: wait for the mx on the other machine to initialize
            time.sleep(0.75)

        if message.add_launch_data:
            if self.machine in message.add_launch_data:
                self._launch_processes(message.add_launch_data[self.machine])
        else:
            self._launch_processes(self.launch_data)

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
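        # Morph support: kill the requested peers, merge the new launch data and
        # relaunch the replacements.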
        if message.receiver != self.uuid:
            return

        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                self.logger.error("peer to kill not found: %s", peer)
                continue
            self.logger.info("MORPH:  KILLING %s ", peer)
            proc.kill_with_force()
            self.logger.info("MORPH:  KILLED %s ", peer)
            del self.processes[peer]
            del self.launch_data[peer]

        for peer, data in message.start_peers_data.items():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]

        self._launch_processes(message.start_peers_data)

    def _launch_processes(self, launch_data, restore_config=[]):
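        # Launch 'amplifier' first, then the remaining peers ('mx' and
        # 'config_server' are started by their own handlers); publish
        # "all_peers_launched" when everything came up.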
        proc, details, info_obj = None, None, None
        success = True

        self.status = launcher_tools.LAUNCHING

        ldata = []

        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        for peer, data in launch_data.items():
            if (peer, data) not in ldata and peer != 'config_server':
                ldata.append((peer, data))

        for peer, data in ldata:  # self.launch_data.iteritems():
            if peer.startswith('mx'):
                continue
            proc, details, wait, info_obj = self.launch_process(peer, data, restore_config=restore_config)
            time.sleep(wait)
            if proc is None:
                success = False
                break

        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched",
                                                               machine=self.machine))

    def launch_process(self, peer, launch_data, restore_config=[]):
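        # Resolve the peer's executable path (relative paths are rooted at
        # obci_root()), append experiment_uuid and log_dir parameters (plus
        # launcher_socket_addr and restore_peers for config_server), then spawn
        # the process.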
        data = launch_data
        wait = 0
        p = os.path.expanduser(data['path'])
        if not os.path.isabs(p):
            path = os.path.join(launcher_tools.obci_root(), p)
            path = os.path.abspath(path)
        else:
            path = os.path.realpath(p)

        args = data['args']
        args = self._attach_base_config_path(path, args)
        args += ['-p', 'experiment_uuid', self.experiment_uuid]
        if peer.startswith('config_server'):
            args += ['-p', 'launcher_socket_addr', self.cs_addr]

            if restore_config:
                args += ['-p', 'restore_peers', ' '.join(restore_config)]
            # wait = 0.5
        if "log_dir" in args:
            idx = args.index("log_dir") + 1
            log_dir = args[idx]
            log_dir = os.path.join(log_dir, self.name)
            args[idx] = log_dir
        else:
            log_dir = os.path.join(CONFIG_DEFAULTS["log_dir"], self.name)
            args += ['-p', 'log_dir', log_dir]
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        proc, details = self._launch_process(path, args, data['peer_type'],
                                             peer, env=self.env, capture_io=NO_STDIO)
        info_obj = {
            "path": path,
            "args": args,
            "peer": peer
        }
        if proc is not None:
            self.processes[peer] = proc
        else:
            self.logger.error("OBCI LAUNCH FAILED")
            send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed",
                                                               machine=self.machine, path=info_obj['path'],
                                                               args=info_obj['args'], details=details))
            self.processes = {}
            self.subprocess_mgr.killall(force=True)

        return proc, details, wait, info_obj

    def _launch_process(self, path, args, proc_type, name,
                        env=None, capture_io=NO_STDIO):
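        # Spawn through SubprocessMonitor; publish "launch_error" on failure,
        # otherwise publish "launched_process_info" (for config_server it is
        # buffered until the "config_server_ready" confirmation).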
        self.logger.debug("launching..... %s %s", path, args)
        proc, details = self.subprocess_mgr.new_local_process(path, args,
                                                              proc_type=proc_type,
                                                              name=name,
                                                              monitoring_optflags=RETURNCODE,
                                                              capture_io=capture_io,
                                                              env=env)

        if proc is None:
            self.logger.error("process launch FAILED: %s --- %s",
                              path, str(args))
            send_msg(self._publish_socket, self.mtool.fill_msg("launch_error",
                                                               sender=self.uuid,
                                                               details=dict(machine=self.machine, path=path, args=args,
                                                                            error=details, peer_id=name)))
        else:
            self.logger.info("process launch success:" +
                             path + str(args) + str(proc.pid))
            msg = self.mtool.fill_msg("launched_process_info",
                                      sender=self.uuid,
                                      machine=self.machine,
                                      pid=proc.pid,
                                      proc_type=proc_type, name=name,
                                      path=path,
                                      args=args)
            if name == "config_server":
                self.__cfg_launch_info = msg
            else:
                send_msg(self._publish_socket, msg)
        return proc, details

    def _attach_base_config_path(self, launch_path, launch_args):
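        # Insert the peer's default .ini config (launch path with its extension
        # replaced by ".ini") right after the peer id in the argument list.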
        peer_id = launch_args[0]
        base = launch_path.rsplit('.', 1)[0]
        ini = '.'.join([base, 'ini'])
        return [peer_id, ini] + launch_args[1:]

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(self._publish_socket, self.mtool.fill_msg("tail", txt=txt,
                                                           sender=self.uuid,
                                                           experiment_id=experiment_id,
                                                           peer_id=peer))

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler('nearby_machines')
    def handle_nearby_machines(self, message, sock):
        self._nearby_machines.mass_update(message.nearby_machines)

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("_kill_peer")
    def handle_kill_peer(self, message, sock):
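        # Kill a peer supervised on this machine; remember when config_server is
        # killed as part of a morph (__cfg_morph).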
        proc = self.processes.get(message.peer_id, None)

        if proc is not None:  # is on this machine
            if message.morph and message.peer_id == 'config_server':
                self.__cfg_morph = True
            proc.kill_with_force()

    @msg_handlers.handler("rq_ok")
    def handle_rq_ok(self, message, sock):
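        # Throughput test hook: count rq_ok messages and log every 10000 of them.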
        self.rqs += 1
        # print "--> ", self.rqs
        if self.rqs == 10000:
            self.logger.debug("GOT %s messages!", self.rqs)
            self.rqs = 0

    @msg_handlers.handler("experiment_launch_error")
    def handle_experiment_launch_error(self, message, sock):
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
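        # A monitored process exited: mark it for deletion and broadcast
        # "obci_peer_dead", unless the peer is on the restarting list and its
        # status is 'terminated'.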
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \
                    not (name in self.restarting and message.status[0] == 'terminated'):
                self.logger.info("KILLLING! sending obci_peer_"
                                 "dead for process %s", proc.name)
                send_msg(self._publish_socket, self.mtool.fill_msg("obci_peer_dead",
                                                                   sender=self.uuid,
                                                                   sender_ip=self.machine,
                                                                   peer_id=proc.name,
                                                                   path=proc.path,
                                                                   status=proc.status()
                                                                   ))
            if name in self.restarting:
                self.restarting.remove(name)
            if self.__cfg_morph and name == 'config_server':
                self.__cfg_morph = False

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        self.logger.info("got! " + message.type)
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("config_server_ready")
    def handle_obci_peer_ready(self, message, sock):
        # config_server successfully connected to MX, now send "launched_process_info"
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, message, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        self.processes = {}
        self.subprocess_mgr.killall(force=True)

    def clean_up(self):
        self.logger.info("cleaning up")

        self.processes = {}
        self.subprocess_mgr.killall(force=True)
        self.subprocess_mgr.delete_all()

    def _crash_extra_data(self, exception=None):
        data = super(OBCIProcessSupervisor, self)._crash_extra_data(exception)
        data.update({
            'experiment_uuid': self.experiment_uuid,
            'name': self.name
        })
        return data
Example #5
0
class OBCIProcessSupervisor(OBCIControlPeer):

    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    def __init__(self, sandbox_dir,
                                        source_addresses=None,
                                        source_pub_addresses=None,
                                        rep_addresses=None,
                                        pub_addresses=None,
                                        experiment_uuid='',
                                        name='obci_process_supervisor'):

        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []

        super(OBCIProcessSupervisor, self).__init__(
                                            source_addresses=source_addresses,
                                            rep_addresses=rep_addresses,
                                            pub_addresses=pub_addresses,
                                            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)


    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "")

        self._all_sockets.append(self.source_sub_socket)

        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)

        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")

        self.cs_addr = net.choose_not_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]

        self._all_sockets.append(self.config_server_socket)

        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        return dict(pid=os.getpid(), machine=self.machine,
                    mx_data=[self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]])

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]


    def _handle_registration_response(self, response):
        self.launch_data = response.params['launch_data']
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params['peer_order']
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        print self.name,'[', self.type, ']',  "RECEIVED LAUNCH DATA: ", self.launch_data


    def set_mx_data(self):

        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        src = src[6:].split(':')[0]

        if src == socket.gethostname():
            sock = self.ctx.socket(zmq.REP)
            port = str(sock.bind_to_random_port("tcp://127.0.0.1", 
                                            min_port=settings.PORT_RANGE[0],
                                            max_port=settings.PORT_RANGE[1]))
            sock.close()
            return ('0.0.0.0', port), "" #empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        print self.name,'[', self.type, ']', "mx addr str", addr + ':' + str(port)
        return addr + ':' + str(port)


    def peer_env(self, mx_data):

        if mx_data[0] is None:
            return None

        env = os.environ.copy()
        addr, port = mx_data[0]

        _env = {
            "MULTIPLEXER_ADDRESSES": socket.gethostname() + ':' + str(port),
            "MULTIPLEXER_PASSWORD": mx_data[1],
            "MULTIPLEXER_RULES": launcher_tools.mx_rules_path()
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        if 'mx' in self.launch_data and self.mx_data[0] is not None:
            print self.name,'[', self.type, ']', "..starting multiplexer"
            self.peer_order.remove(['mx'])
            self.peers_to_launch.remove('mx')
            path = launcher_tools.mx_path()

            args = ['run_multiplexer', self.mx_addr_str(
                                (('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])),
                    '--multiplexer-password', self.mx_data[1],
                    '--rules', launcher_tools.mx_rules_path()]
            proc, details = self._launch_process(path, args, 'multiplexer', 'mx',
                                                env=self.env)
            self.processes['mx'] = proc
            if proc is not None:
                self.mx = proc


    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        self._launch_processes(self.launch_data)

    def test(self):
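        # Throughput check: receive SEND messages on a SUB socket bound to port
        # 16789 and report progress every 10000 messages.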
        # for i in range(SEND):
        #     send_msg(self.push, str(i))
        self.pull = self.ctx.socket(zmq.SUB)
        self.pull.setsockopt(zmq.SUBSCRIBE, "")  # without a subscription a SUB socket delivers nothing
        self.pull.bind('tcp://*:16789')

        received = 0
        prev = -1
        for i in range(SEND):
            msg = recv_msg(self.pull)
            if int(msg):
                # prev = int(msg)
                received += 1
            if received % 10000 == 0:
                print "zmq: received ", received, "messages, last: ", msg

        if received == SEND:
            print "zmq: OK"
        else:
            print "WUT?", received
        # self.push.close()
        self.pull.close()


    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
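        # Morph (older variant): always restart config_server, remember which
        # peers survive (restore_config) and relaunch the requested peers.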
        if message.receiver != self.uuid:
            return
        message.kill_peers.append('config_server')
        
        message.start_peers_data['config_server'] = dict(self.launch_data['config_server'])
        restore_config = [peer for peer in self.processes if peer not in message.kill_peers]
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                print self.name,'[', self.type, ']', "peer to kill not found:", peer
                continue
            print "MORPH:  KILLING ", peer
            proc.kill()
            print "MORPH:  KILLED ", peer
            del self.processes[peer]
            del self.launch_data[peer]

        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]
        
        self._launch_processes(message.start_peers_data, restore_config=restore_config)


    def _launch_processes(self, launch_data, restore_config=[]):
        proc, details = None, None
        success = True
        path, args = None, None

        self.status = launcher_tools.LAUNCHING

        ldata = []
        if 'config_server' in launch_data:
            ldata.append(('config_server', launch_data['config_server']))
        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata:
                ldata.append((peer, data))

        for peer, data in ldata:  # self.launch_data.iteritems()
            wait = 0
            if peer.startswith('mx'):
                continue
            path = os.path.join(launcher_tools.obci_root(), data['path'])
            args = data['args']
            if peer.startswith('config_server'):
                args += ['-p', 'launcher_socket_addr', self.cs_addr]
                args += ['-p', 'experiment_uuid', self.experiment_uuid]
                
                if restore_config:
                    args += ['-p', 'restore_peers', ' '.join(restore_config)]
                wait = 0.4
            proc, details = self._launch_process(path, args, data['peer_type'],
                                                        peer, env=self.env, capture_io=NO_STDIO)
            if proc is not None:
                self.processes[peer] = proc
            else:
                success = False
                break
            time.sleep(wait)
        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched",
                                                    machine=self.machine))
        else:
            print self.name,'[', self.type, ']', "OBCI LAUNCH FAILED"
            send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed",
                                                    machine=self.machine, path=path,
                                                    args=args, details=details))
            self.processes = {}
            self.subprocess_mgr.killall()


    def _launch_process(self, path, args, proc_type, name,
                                    env=None, capture_io=NO_STDIO):
        proc, details = self.subprocess_mgr.new_local_process(path, args,
                                                        proc_type=proc_type,
                                                        name=name,
                                                        monitoring_optflags=RETURNCODE,
                                                        capture_io=capture_io,
                                                        env=env)
        if proc is None:
            print self.name,'[', self.type, ']', "process launch FAILED:", path, args
            send_msg(self._publish_socket, self.mtool.fill_msg("launch_error",
                                            sender=self.uuid,
                                            details=dict(machine=self.machine, path=path, args=args,
                                                        error=details)))
        else:
            print self.name,'[', self.type, ']', "process launch success:", path, args, proc.pid
            send_msg(self._publish_socket, self.mtool.fill_msg("launched_process_info",
                                            sender=self.uuid,
                                            machine=self.machine,
                                            pid=proc.pid,
                                            proc_type=proc_type, name=name,
                                            path=path,
                                            args=args))
        return proc, details

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(self._publish_socket, self.mtool.fill_msg("tail", txt=txt,
                                                    sender=self.uuid,
                                                    experiment_id=experiment_id,
                                                peer_id=peer))


    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):

        self.subprocess_mgr.killall()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            print '~~~~~   ~~~~~   ', name, self.restarting, message.status[0]

            if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \
                                not (name in self.restarting and message.status[0] == 'terminated'):
                print "KILLLLLING     and sending obci_peer_dead", proc.name
                send_msg(self._publish_socket, self.mtool.fill_msg("obci_peer_dead",
                                                sender=self.uuid,
                                                sender_ip=self.machine,
                                                peer_id=proc.name,
                                                path=proc.path,
                                                status=proc.status()
                                                ))
            if name in self.restarting:
                self.restarting.remove(name)

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        print self.name , "got!", message.type
        send_msg(self._publish_socket, message.SerializeToString())


    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, message, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        self.processes = {}
        #self.subprocess_mgr.killall()

    def clean_up(self):
        print self.name,'[', self.type, ']',  "cleaning up"

        self.processes = {}
        self.subprocess_mgr.killall()
        self.subprocess_mgr.delete_all()