Пример #1
0
    def __init__(self, sandbox_dir,
                 source_addresses=None,
                 source_pub_addresses=None,
                 rep_addresses=None,
                 pub_addresses=None,
                 experiment_uuid='',
                 name='obci_process_supervisor'):
        """Set up supervisor bookkeeping, then finish common peer init."""
        # Managed peers and launch-sequencing state.
        self.peers = {}
        self.processes = {}
        self.launch_data = []
        self.peers_to_launch = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.restarting = []
        self.status = launcher_tools.READY_TO_LAUNCH

        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir or settings.DEFAULT_SANDBOX_DIR
        self.experiment_uuid = experiment_uuid
        self.source_pub_addresses = source_pub_addresses

        # Multiplexer configuration and the environment handed to child peers.
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)

        super(OBCIProcessSupervisor, self).__init__(source_addresses=source_addresses,
                                                    rep_addresses=rep_addresses,
                                                    pub_addresses=pub_addresses,
                                                    name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)
Пример #2
0
    def __init__(self, rep_addresses=None, pub_addresses=None, name='obci_server'):
        """Initialise server state and start broadcast/discovery helper threads."""
        self.experiments = {}
        self.exp_process_supervisors = {}
        self._nearby_servers = net.DNS()
        super(OBCIServer, self).__init__(None, rep_addresses, pub_addresses, name)

        self.machine = socket.gethostname()

        self.rep_port = int(net.server_rep_port())
        self.pub_port = int(net.server_pub_port())
        bcast_port = int(net.server_bcast_port())
        self._nearby_servers.logger = self.logger

        # Announce this server's presence on the network in the background.
        self._bcast_server = threading.Thread(
            target=broadcast_server,
            args=[self.uuid, self.rep_port, self.pub_port, bcast_port])
        self._bcast_server.daemon = True
        self._bcast_server.start()

        # Keep the nearby-servers registry updated in the background.
        self._nearby_updater = threading.Thread(
            target=update_nearby_servers,
            args=[self._nearby_servers, bcast_port, self.ctx, self._push_addr])
        self._nearby_updater.daemon = True
        self._nearby_updater.start()

        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
Пример #3
0
    def __init__(self, sandbox_dir, source_addresses=None,
                 source_pub_addresses=None, rep_addresses=None,
                 pub_addresses=None, experiment_uuid="",
                 name="obci_process_supervisor"):
        """Prepare supervisor state and complete base peer initialisation."""
        # Launch bookkeeping for the peers this supervisor manages.
        self.peers = {}
        self.processes = {}
        self.launch_data = []
        self.peers_to_launch = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.restarting = []
        self.status = launcher_tools.READY_TO_LAUNCH

        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir or settings.DEFAULT_SANDBOX_DIR
        self.experiment_uuid = experiment_uuid
        self.source_pub_addresses = source_pub_addresses

        # ZMQ context, multiplexer data and child-peer environment.
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)

        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)
Пример #4
0
    def __init__(self, rep_addresses=None, pub_addresses=None,
                 name='obci_server'):
        """Create the server peer and launch its discovery helper threads."""
        self.experiments = {}
        self.exp_process_supervisors = {}
        self._nearby_servers = net.DNS()
        super(OBCIServer, self).__init__(None, rep_addresses, pub_addresses, name)

        self.machine = socket.gethostname()
        self.rep_port = int(net.server_rep_port())
        self.pub_port = int(net.server_pub_port())
        bcast_port = int(net.server_bcast_port())
        self._nearby_servers.logger = self.logger

        # Daemon thread announcing this server on the network.
        bcast = threading.Thread(target=broadcast_server,
                                 args=[self.uuid, self.rep_port,
                                       self.pub_port, bcast_port])
        bcast.daemon = True
        self._bcast_server = bcast
        bcast.start()

        # Daemon thread tracking other OBCI servers.
        updater = threading.Thread(target=update_nearby_servers,
                                   args=[self._nearby_servers, bcast_port,
                                         self.ctx, self._push_addr])
        updater.daemon = True
        self._nearby_updater = updater
        updater.start()

        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid,
                                                logger=self.logger)
Пример #5
0
    def __init__(self, source_addresses=None,
                 rep_addresses=None, pub_addresses=None, name='obci_control_peer'):
        """Common initialisation shared by all OBCI control peers."""
        # TODO TODO TODO !!!!
        # cleaner subclassing of obci_control_peer!!!
        self.hostname = socket.gethostname()
        self.source_addresses = source_addresses or []
        self.rep_addresses = rep_addresses
        self.pub_addresses = pub_addresses
        self._all_sockets = []

        # In-process endpoints shared with helper threads.
        self._pull_addr = 'inproc://publisher_msg'
        self._push_addr = 'inproc://publisher'
        self._subpr_push_addr = 'inproc://subprocess_info'

        self.uuid = str(uuid.uuid4())
        self.name = str(name)
        self.type = self.peer_type()

        log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR,
                               self.name + '-' + self.uuid[:8])
        if not hasattr(self, 'logger'):
            # A subclass may have configured logging before calling here.
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            self.logger = get_logger(self.peer_type(),
                                     log_dir=log_dir,
                                     stream_level=net_tools.peer_loglevel(),
                                     obci_peer=self)

        self.mtool = self.message_tool()

        if not hasattr(self, "ctx"):
            self.ctx = zmq.Context()

        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
        self.net_init()

        # Register with the source peer only when we know where it lives.
        self.registration_response = None
        if self.source_addresses:
            self.registration_response = self.register()
            self._handle_registration_response(self.registration_response)

        self.interrupted = False
        for sig in (signal.SIGTERM, signal.SIGINT):
            signal.signal(sig, self.signal_handler())
Пример #6
0
    def __init__(self, source_addresses=None,
                 rep_addresses=None, pub_addresses=None, name='obci_control_peer'):
        """Base initialiser: identity, logging, sockets and registration."""
        # TODO TODO TODO !!!!
        # cleaner subclassing of obci_control_peer!!!
        self.hostname = socket.gethostname()
        self.source_addresses = source_addresses if source_addresses else []
        self.rep_addresses = rep_addresses
        self.pub_addresses = pub_addresses
        self._all_sockets = []

        # inproc endpoints used for internal message routing.
        self._pull_addr = 'inproc://publisher_msg'
        self._push_addr = 'inproc://publisher'
        self._subpr_push_addr = 'inproc://subprocess_info'

        self.uuid = str(uuid.uuid4())
        self.name = str(name)
        self.type = self.peer_type()

        peer_log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR,
                                    '-'.join([self.name, self.uuid[:8]]))
        if not hasattr(self, 'logger'):
            # Only build a logger when a subclass has not provided one.
            if not os.path.exists(peer_log_dir):
                os.makedirs(peer_log_dir)
            self.logger = get_logger(self.peer_type(), log_dir=peer_log_dir,
                                     stream_level=net_tools.peer_loglevel(),
                                     obci_peer=self)

        self.mtool = self.message_tool()

        if not hasattr(self, "ctx"):
            self.ctx = zmq.Context()

        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
        self.net_init()

        if self.source_addresses:
            self.registration_response = self.register()
            self._handle_registration_response(self.registration_response)
        else:
            self.registration_response = None

        self.interrupted = False
        signal.signal(signal.SIGTERM, self.signal_handler())
        signal.signal(signal.SIGINT, self.signal_handler())
Пример #7
0
    def __init__(self, sandbox_dir,
                 source_addresses=None,
                 source_pub_addresses=None,
                 rep_addresses=None,
                 pub_addresses=None,
                 experiment_uuid='',
                 name='obci_process_supervisor'):
        """Set up supervisor state, then run the common peer initialiser."""
        # Managed peers and launch-sequencing state.
        self.peers = {}
        self.processes = {}
        self.launch_data = []
        self.peers_to_launch = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.restarting = []
        self.status = launcher_tools.READY_TO_LAUNCH

        # Config-server launch coordination state.
        self.__cfg_launch_info = None
        self.__cfg_morph = False
        self.__cfg_lock = threading.RLock()

        self.experiment_uuid = experiment_uuid
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir or settings.DEFAULT_SANDBOX_DIR
        self.rqs = 0
        self._nearby_machines = net.DNS()
        self.test_count = 0

        # ZMQ context, multiplexer data and child-peer environment.
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)

        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
Пример #8
0
class OBCIServer(OBCIControlPeer):
    """Central OBCI control peer.

    Tracks running experiments, nearby OBCI servers discovered on the
    network and launched process supervisors, and answers launcher
    requests arriving over its ZMQ sockets.
    """

    # Handler registry, extended below via @msg_handlers.handler(...).
    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, rep_addresses=None, pub_addresses=None, name='obci_server'):
        """Initialise server state and start broadcast/discovery threads."""
        self.experiments = {}
        self.exp_process_supervisors = {}
        self._nearby_servers = net.DNS()
        super(OBCIServer, self).__init__(None, rep_addresses,
                                         pub_addresses,
                                         name)

        self.machine = socket.gethostname()

        self.rep_port = int(net.server_rep_port())
        self.pub_port = int(net.server_pub_port())
        bcast_port = int(net.server_bcast_port())
        self._nearby_servers.logger = self.logger
        # Daemon thread announcing this server's contact ports on the network.
        self._bcast_server = threading.Thread(target=broadcast_server,
                                              args=[self.uuid,
                                                    self.rep_port, self.pub_port, bcast_port])
        self._bcast_server.daemon = True
        self._bcast_server.start()

        # Daemon thread keeping the nearby-servers registry up to date.
        self._nearby_updater = threading.Thread(target=update_nearby_servers,
                                                args=[self._nearby_servers,

                                                      bcast_port,
                                                      self.ctx,
                                                      self._push_addr])

        self._nearby_updater.daemon = True
        self._nearby_updater.start()
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def nearby_server_addrs(self):
        """Return the IP addresses of currently known nearby servers."""
        snap = self._nearby_servers.snapshot()
        return [srv.ip for srv in snap.values()]

    def nearby_servers(self):
        """Return a snapshot of the nearby-servers registry."""
        return self._nearby_servers.snapshot()

    def my_ip(self):
        """Return this server's network IP; fall back to 127.0.1.1 on error."""
        addr = "127.0.1.1"
        try:
            addr = self._nearby_servers.this_addr_network()
        except Exception as e:
            self.logger.error(str(e))
        return addr

    def network_ready(self):
        # i know my network IP
        return self.my_ip() != self.machine

    def handle_socket_read_error(self, socket, error):
        """Recreate whichever REP socket (main or experiment) failed to read."""
        if socket == self.rep_socket:
            self.logger.warning("reinitialising REP socket")
            self._all_sockets.remove(self.rep_socket)
            if socket in self.client_rq:
                self.client_rq = None
            self.rep_socket.close()  # linger=0)
            self.rep_socket = None
            time.sleep(0.2)
            # Rebind on the same TCP port so advertised addresses stay valid.
            (self.rep_socket, self.rep_addresses) = self._init_socket(
                ['tcp://*:' + str(self.rep_port)], zmq.REP)
            self.rep_socket.setsockopt(zmq.LINGER, 0)
            self._all_sockets.append(self.rep_socket)
            self.logger.info(self.rep_addresses)

        elif socket == self.exp_rep:
            self.logger.info("reinitialising EXPERIMENT REP socket")
            self.exp_rep.close()  # linger=0)

            (self.exp_rep, self.exp_rep_addrs) = self._init_socket(
                self.exp_rep_addrs, zmq.REP)
            self.exp_rep.setsockopt(zmq.LINGER, 0)
            self._all_sockets.append(self.exp_rep)

    def peer_type(self):
        """Return the peer-type identifier of this peer."""
        return 'obci_server'

    def net_init(self):
        """Open the experiment REP socket and TCP proxy, then base net init."""
        (self.exp_rep, self.exp_rep_addrs) = self._init_socket(
            [], zmq.REP)
        # (self.exp_pub, self.exp_pub_addrs) = self._init_socket(
        #                                         [], zmq.PUB)
        # self.exp_pub.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.exp_rep)
        # self._all_sockets.append(self.exp_pub)
        tcp_port = int(net.server_tcp_proxy_port())

        self._tcp_proxy_thr, tcp_port = twisted_tcp_handling.run_twisted_server(
            ('0.0.0.0', tcp_port),
            self.ctx,
            self.rep_addresses[0])

        self.tcp_addresses = [(self.my_ip(), tcp_port),
                              (socket.gethostname(), tcp_port)]
        super(OBCIServer, self).net_init()

    def custom_sockets(self):
        """Return extra sockets this peer polls besides the base ones."""
        return [self.exp_rep]  # , self.srv_rep, self.srv_pub]

    def clean_up(self):
        """No extra cleanup beyond the base class."""
        # self._tcp_srv.shutdown()
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Broadcast kill/shutdown and kill all experiment process supervisors."""
        send_msg(self._publish_socket,  # self.exp_pub,
                 self.mtool.fill_msg("kill", receiver=""))
        send_msg(self._publish_socket, self.mtool.fill_msg("launcher_shutdown",
                                                           sender=self.uuid))
        for sup in self.exp_process_supervisors:
            self.exp_process_supervisors[sup].kill()
        self.logger.info('sent KILL to experiments')

    def _args_for_experiment(self, sandbox_dir, launch_file, local=False, name=None, overwrites=None):
        """Compose the command-line arguments for a new experiment process."""
        args = ['--sv-addresses']
        args += self.exp_rep_addrs
        args.append('--sv-pub-addresses')
        # if local:
        #     addrs = net.choose_local(self.exp_pub_addrs)
        # else:
        #     addrs = net.choose_not_local(self.exp_pub_addrs)
        addrs = net.choose_local(self.pub_addresses)  # self.exp_pub_addrs

        args += addrs
        # Default the experiment name to the launch-file basename.
        exp_name = name if name else os.path.basename(launch_file)

        args += [
            '--sandbox-dir', str(sandbox_dir),
            '--launch-file', str(launch_file),
            '--name', exp_name,
            '--current-ip', self.my_ip()]
        if overwrites is not None:
            args += peer_cmd.peer_overwrites_cmd(overwrites)
        # print '{0} [{1}] -- experiment args: {2}'.format(self.name, self.peer_type(), args)
        return args

    def start_experiment_process(self, sandbox_dir, launch_file, name=None, overwrites=None):
        """Spawn a local 'obci_experiment' subprocess; return (process, details)."""
        path = 'obci_experiment'
        args = self._args_for_experiment(sandbox_dir, launch_file,
                                         local=True, name=name, overwrites=overwrites)
        return self.subprocess_mgr.new_local_process(path, args,
                                                     proc_type='obci_experiment',
                                                     capture_io=NO_STDIO)

    def handle_register_experiment(self, message, sock):
        """Record a newly registered experiment and broadcast its contact info."""
        machine, pid = message.other_params['origin_machine'], message.other_params['pid']
        status, det = message.other_params['status_name'], message.other_params['details']
        launch_file = message.other_params['launch_file_path']
        tcp_addr = message.other_params['tcp_addrs']

        exp_proc = self.subprocess_mgr.process(machine, pid)

        if exp_proc is None:
            send_msg(sock, self.mtool.fill_msg("rq_error", err_code="experiment_not_found"))
            return

        info = self.experiments[message.uuid] = ExperimentInfo(message.uuid,
                                                               message.name,
                                                               message.rep_addrs,
                                                               message.pub_addrs,
                                                               time.time(),
                                                               machine,
                                                               pid,
                                                               status,
                                                               det,
                                                               launch_file,
                                                               tcp_addr,
                                                               self._nearby_servers.this_addr_network())

        exp_proc.registered(info)
        # NOTE(review): this loop only rebinds the local name 'addrs'; the
        # prepended network address never reaches info.rep_addrs/pub_addrs.
        # Looks unintentional -- confirm intent before changing.
        for addrs in [info.rep_addrs, info.pub_addrs]:
            one = addrs[0]
            port = net.port(one)
            addrs = [self._nearby_servers.this_addr_network() + ':' + str(port)] + addrs

        info_msg = self.mtool.fill_msg("experiment_created",
                                       uuid=info.uuid,
                                       name=info.name,
                                       rep_addrs=info.rep_addrs,
                                       pub_addrs=info.pub_addrs,
                                       origin_machine=info.origin_machine,
                                       status_name=status,
                                       details=det,
                                       launch_file_path=launch_file,
                                       tcp_addrs=tcp_addr)

        # Answer the pending create_experiment request, if any.
        if self.client_rq:
            msg_type = self.client_rq[0].type
            rq_sock = self.client_rq[1]
            if msg_type == "create_experiment":
                self.client_rq = None
                send_msg(rq_sock, info_msg)

        send_msg(sock, self.mtool.fill_msg("rq_ok", params=self._nearby_servers.dict_snapshot()))
        send_msg(self._publish_socket, info_msg)

    def _handle_register_experiment_timeout(self, exp):
        """Kill an experiment that failed to register in time; report the error."""
        self.logger.error("New experiment process failed to "
                          "register before timeout" + str(exp.pid))

        if exp.returncode is None:
            exp.kill()
            exp.wait()

        # msg_type = self.client_rq[0].type
        # NOTE(review): assumes self.client_rq is still set here -- confirm.
        rq_sock = self.client_rq[1]
        send_msg(rq_sock, self.mtool.fill_msg("rq_error",
                                              err_code="create_experiment_error",
                                              request=vars(self.client_rq[0])))

    @msg_handlers.handler("register_peer")
    def handle_register_peer(self, message, sock):
        """Register peer"""
        if message.peer_type == "obci_client":
            send_msg(sock, self.mtool.fill_msg("rq_ok"))
        elif message.peer_type == "obci_experiment":
            self.handle_register_experiment(message, sock)
        else:
            super(OBCIServer, self).handle_register_peer(message, sock)

    @msg_handlers.handler("create_experiment")
    def handle_create_experiment(self, message, sock):
        """Launch a new experiment process for a client request."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return

        launch_file = message.launch_file
        sandbox = message.sandbox_dir
        name = message.name
        overwrites = message.overwrites

        sandbox = sandbox if sandbox else settings.DEFAULT_SANDBOX_DIR

        exp, details = self.start_experiment_process(
            sandbox, launch_file, name, overwrites)

        if exp is None:
            self.logger.error("failed to launch experiment "
                              "process, request: " + str(vars(message)))
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               request=vars(message),
                                               err_code='launch_error', details=details))
        else:
            self.logger.info("experiment process "
                             "launched:  {0}".format(exp.pid))
            # Defer the reply until the experiment registers itself.
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                self.client_rq = (message, sock)

    @msg_handlers.handler("list_experiments")
    def handle_list_experiments(self, message, sock):
        """Reply with all known experiments and nearby machines."""
        exp_data = {}
        for exp_id in self.experiments:
            exp_data[exp_id] = self.experiments[exp_id].info()

        nearby = self.nearby_servers()
        nearby_dict = {}
        for srv in nearby.values():
            nearby_dict[srv.ip] = srv.hostname
        info = '\n{'
        for srv in nearby_dict:
            info += '\n' + srv + ' : ' + nearby_dict[srv] + ','
        info += '}'
        self.logger.debug("nearby servers:  count: {0}, {1}".format(
            len(nearby), info))
        send_msg(sock, self.mtool.fill_msg("running_experiments",
                                           exp_data=exp_data,
                                           nearby_machines=nearby_dict))

    @msg_handlers.handler("list_nearby_machines")
    def handle_list_nearby_machines(self, message, sock):
        """Reply with a snapshot of the nearby-machines registry."""
        send_msg(sock, self.mtool.fill_msg('nearby_machines',
                                           nearby_machines=self._nearby_servers.dict_snapshot()))

    def _handle_match_name(self, message, sock, this_machine=False):
        """Resolve message.strname to exactly one experiment.

        Replies with an rq_error on no match / ambiguous match / wrong
        machine; returns the matched experiment info or None.
        """
        matches = self.exp_matching(message.strname)
        match = None
        msg = None
        if not matches:
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code='experiment_not_found')

        elif len(matches) > 1:
            matches = [(exp.uuid, exp.name) for exp in matches]
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code='ambiguous_exp_name',
                                      details=matches)
        else:
            match = matches.pop()
            if this_machine and match.origin_machine != self.machine:
                msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                          err_code='exp_not_on_this_machine', details=match.origin_machine)
                match = None
        if msg and sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, msg)
        return match

    @msg_handlers.handler("get_experiment_contact")
    def handle_get_experiment_contact(self, message, sock):
        """Reply with contact addresses for the experiment named in the request."""
        self.logger.debug("##### rq contact for: %s", message.strname)

        info = self._handle_match_name(message, sock)
        if info:
            send_msg(sock, self.mtool.fill_msg("experiment_contact",
                                               uuid=info.uuid,
                                               name=info.name,
                                               rep_addrs=info.rep_addrs,
                                               pub_addrs=info.pub_addrs,
                                               tcp_addrs=info.tcp_addrs,
                                               machine=info.origin_machine,
                                               status_name=info.status_name,
                                               details=info.details))

    @msg_handlers.handler("experiment_status_change")
    def handle_experiment_status_change(self, message, sock):
        """Update an experiment's status and republish the change."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error', err_code='experiment_not_found'))
            return
        exp.status_name = message.status_name
        exp.details = message.details
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))

        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("experiment_info_change")
    def handle_experiment_info_change(self, message, sock):
        """Update an experiment's name/launch file and republish the change."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            self.logger.warning("UUID not found  " + message.uuid)
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error', err_code='experiment_not_found'))
            return
        exp.name = message.name
        exp.launch_file_path = message.launch_file_path
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        self.logger.info("INFO CHANGED %s", exp.launch_file_path)
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("experiment_transformation")
    def handle_experiment_transformation(self, message, sock):
        """Apply a full experiment transformation (status, file, name) and republish."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error', err_code='experiment_not_found'))
            return
        exp.status_name = message.status_name
        exp.details = message.details
        exp.launch_file_path = message.launch_file
        exp.name = message.name
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        send_msg(self._publish_socket, message.SerializeToString())

    def exp_matching(self, strname):
        """Match *strname* against all created experiment IDs and
        names. Return those experiment descriptions which name
        or uuid starts with strname.
        """
        match_names = {}
        for uid, exp in self.experiments.items():
            if exp.name.startswith(strname):
                match_names[uid] = exp

        ids = self.experiments.keys()
        match_ids = [uid for uid in ids if uid.startswith(strname)]

        experiments = set()
        for uid in match_ids:
            experiments.add(self.experiments[uid])
        for name, exp in match_names.items():
            experiments.add(exp)

        return experiments

    @msg_handlers.handler("kill_experiment")
    def handle_kill_experiment(self, message, sock):
        """Send a kill to the named experiment and arm a hard-kill timer."""
        match = self._handle_match_name(message, sock, this_machine=True)

        # NOTE(review): when message.force is true neither branch below runs,
        # so no reply is sent -- verify that is intended.
        if match:
            if match.kill_timer is not None:
                send_msg(sock, self.mtool.fill_msg("rq_error", err_code="already_killed",
                                                   details="Experiment already shutting down"))

            elif not message.force:
                self.logger.info("sending kill to experiment "
                                 "{0} ({1})".format(match.uuid, match.name))
                send_msg(self._publish_socket,  # self.exp_pub,
                         self.mtool.fill_msg("kill", receiver=match.uuid))

                send_msg(sock, self.mtool.fill_msg("kill_sent", experiment_id=match.uuid))
                pid = match.experiment_pid
                uid = match.uuid
                self.logger.info("Waiting for experiment process {0} to terminate".format(uid))
                # Hard-kill fallback if the process ignores the kill message.
                match.kill_timer = threading.Timer(1.1,
                                                   self._handle_killing_exp, args=[pid, uid])
                match.kill_timer.start()
                send_msg(self._publish_socket, self.mtool.fill_msg('kill_sent',
                                                                   experiment_id=match.uuid
                                                                   ))

    def _handle_killing_exp(self, pid, uid):
        """Force-kill an experiment process and drop it from the registry."""
        proc = self.subprocess_mgr.process(self.machine, pid)
        if proc.process_is_running():
            proc.kill()
        self.logger.info("experiment {0} FINISHED".format(uid))
        proc.delete = True
        del self.experiments[uid]

        return proc.popen_obj.returncode

    @msg_handlers.handler("launch_process")
    def handle_launch_process(self, message, sock):
        """Dispatch launch requests; only process supervisors are supported."""
        if message.proc_type == 'obci_process_supervisor':
            self._handle_launch_process_supervisor(message, sock)

    def _handle_launch_process_supervisor(self, message, sock):
        """Start a process supervisor and reply with its launch info or an error."""
        sv_obj, details = self._start_obci_supervisor_process(message)

        self.logger.info("LAUNCH PROCESS SV   " + str(sv_obj) + str(details))
        if sv_obj:
            self.exp_process_supervisors[message.sender] = sv_obj
            send_msg(sock,
                     self.mtool.fill_msg("launched_process_info",
                                         sender=self.uuid, machine=self.machine,
                                         pid=sv_obj.pid, proc_type=sv_obj.proc_type,
                                         name=sv_obj.name,
                                         path=sv_obj.path))
            self.logger.info("CONFIRMED LAUNCH")
        else:
            send_msg(sock, self.mtool.fill_msg('rq_error', request=message.dict(),
                                               err_code="launch_error",
                                               details=details))
            self.logger.error("PROCESS SUPERVISOR LAUNCH FAILURE")

    @msg_handlers.handler("kill_process")
    def handle_kill_process_supervisor(self, message, sock):
        """Kill a tracked subprocess by machine+pid and drop its registry entry."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if not proc:
            send_msg(sock, self.mtool.fill_msg("rq_error", err_code="process_not_found"))
        else:
            # TODO
            # name = proc.name
            proc.kill()
            proc.mark_delete()
            send_msg(sock, self.mtool.fill_msg("rq_ok"))
            # NOTE(review): assumes proc.name is a key in
            # exp_process_supervisors; a missing key raises KeyError -- confirm.
            del self.exp_process_supervisors[proc.name]

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """Log a dead subprocess and mark it for deletion."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            status, details = proc.status()
            self.logger.warning("Process " + proc.proc_type + " dead: " +
                                status + str(details) + proc.name + str(proc.pid))
            # Placeholder branches: no per-type handling implemented yet.
            if proc.proc_type == 'obci_process_supervisor':
                pass
            elif proc.proc_type == 'obci_experiment':
                pass
            if status == subprocess_monitor.FAILED:
                pass

    @msg_handlers.handler("find_eeg_experiments")
    def handle_find_eeg_experiments(self, message, sock):
        """Search for EEG experiments in a background thread; ack immediately."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return

        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        finder_thr = threading.Thread(target=find_eeg_experiments_and_push_results,
                                      args=[self.ctx, self.rep_addresses,
                                            message,
                                            self._nearby_servers.copy()])
        finder_thr.daemon = True
        finder_thr.start()

    @msg_handlers.handler("find_eeg_amplifiers")
    def handle_find_new_eeg_amplifiers(self, message, sock):
        """Search for EEG amplifiers in a background thread; ack immediately."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return

        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        amp_thr = threading.Thread(target=find_new_experiments_and_push_results,
                                   args=[self.ctx,
                                         message])
        amp_thr.daemon = True
        amp_thr.start()

    @msg_handlers.handler("start_eeg_signal")
    def handle_start_eeg_signal(self, message, sock):
        """Start an EEG signal experiment in a background thread; ack immediately."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        start_thr = threading.Thread(target=start_eeg_signal_experiment,
                                     args=[self.ctx, self.rep_addresses,
                                           message])
        start_thr.daemon = True
        start_thr.start()

    def _start_obci_supervisor_process(self, rq_message):
        """Spawn a local process supervisor from a launch request message.

        Returns (process, False) on success or (None, details) on failure.
        """
        path = obci_process_supervisor.__file__
        # Ensure we launch the .py source, not a compiled .pyc path.
        path = '.'.join([path.rsplit('.', 1)[0], 'py'])
        start_params = rq_message.dict()
        start_params['path'] = path
        # Strip routing fields; the rest are new_local_process() kwargs.
        del start_params['type']
        del start_params['sender']
        del start_params['sender_ip']
        del start_params['receiver']
        sv_obj, details = self.subprocess_mgr.new_local_process(**start_params)
        if sv_obj is None:
            return None, details

        return sv_obj, False

    def _crash_extra_data(self, exception=None):
        """Extend base crash-report data with the current experiment list."""
        data = super(OBCIServer, self)._crash_extra_data(exception)
        data.update({
            'experiments': [e.info() for e in self.experiments.values()]
        })
        return data
Пример #9
0
class OBCIProcessSupervisor(OBCIControlPeer):

    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    def __init__(
        self,
        sandbox_dir,
        source_addresses=None,
        source_pub_addresses=None,
        rep_addresses=None,
        pub_addresses=None,
        experiment_uuid="",
        name="obci_process_supervisor",
    ):

        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []

        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses, rep_addresses=rep_addresses, pub_addresses=pub_addresses, name=name
        )
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)

    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "")

        self._all_sockets.append(self.source_sub_socket)

        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)

        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")

        self.cs_addr = net.choose_not_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]

        self._all_sockets.append(self.config_server_socket)

        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        return dict(
            pid=os.getpid(),
            machine=self.machine,
            mx_data=[self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]],
        )

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        self.launch_data = response.params["launch_data"]
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params["peer_order"]
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        print self.name, "[", self.type, "]", "RECEIVED LAUNCH DATA: ", self.launch_data

    def set_mx_data(self):

        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        src = src[6:].split(":")[0]

        if src == socket.gethostname():
            sock = self.ctx.socket(zmq.REP)
            port = str(
                sock.bind_to_random_port(
                    "tcp://127.0.0.1", min_port=settings.PORT_RANGE[0], max_port=settings.PORT_RANGE[1]
                )
            )
            sock.close()
            return ("0.0.0.0", port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        print self.name, "[", self.type, "]", "mx addr str", addr + ":" + str(port)
        return addr + ":" + str(port)

    def peer_env(self, mx_data):

        if mx_data[0] is None:
            return None

        env = os.environ.copy()
        addr, port = mx_data[0]

        _env = {
            "MULTIPLEXER_ADDRESSES": socket.gethostname() + ":" + str(port),
            "MULTIPLEXER_PASSWORD": mx_data[1],
            "MULTIPLEXER_RULES": launcher_tools.mx_rules_path(),
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        if "mx" in self.launch_data and self.mx_data[0] is not None:
            print self.name, "[", self.type, "]", "..starting multiplexer"
            self.peer_order.remove(["mx"])
            self.peers_to_launch.remove("mx")
            path = launcher_tools.mx_path()

            args = [
                "run_multiplexer",
                self.mx_addr_str((("0.0.0.0", self.mx_data[0][1]), self.mx_data[1])),
                "--multiplexer-password",
                self.mx_data[1],
                "--rules",
                launcher_tools.mx_rules_path(),
            ]
            proc, details = self._launch_process(path, args, "multiplexer", "mx", env=self.env)
            self.processes["mx"] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        self._launch_processes(self.launch_data)

    def test(self):
        # for i in range(SEND):
        #     send_msg(self.push, str(i))
        self.pull = self.ctx.socket(zmq.SUB)
        self.pull.bind("tcp://*:16789")

        received = 0
        prev = -1
        for i in range(SEND):
            msg = recv_msg(self.pull)
            if int(msg):
                # prev = int(msg)
                received += 1
            if received % 10000 == 0:
                print "zmq: received ", received, "messages, last: ", msg

        if received == SEND:
            print "zmq: OK"
        else:
            print "WUT?", received
        # self.push.close()
        self.pull.close()

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        if not message.receiver == self.uuid:
            return
        message.kill_peers.append("config_server")

        message.start_peers_data["config_server"] = dict(self.launch_data["config_server"])
        restore_config = [peer for peer in self.processes if peer not in message.kill_peers]
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                print self.name, "[", self.type, "]", "peer to kill not found:", peer
                continue
            print "MORPH:  KILLING ", peer
            proc.kill()
            print "MORPH:  KILLED ", peer
            del self.processes[peer]
            del self.launch_data[peer]

        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]

        self._launch_processes(message.start_peers_data, restore_config=restore_config)

    def _launch_processes(self, launch_data, restore_config=[]):
        proc, details = None, None
        success = True
        path, args = None, None

        self.status = launcher_tools.LAUNCHING

        ldata = []
        if "config_server" in launch_data:
            ldata.append(("config_server", launch_data["config_server"]))
        if "amplifier" in launch_data:
            ldata.append(("amplifier", launch_data["amplifier"]))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata:
                ldata.append((peer, data))

        for peer, data in ldata:  # self.launch_data.iteritems():
            wait = 0
            if peer.startswith("mx"):
                continue
            path = os.path.join(launcher_tools.obci_root(), data["path"])
            args = data["args"]
            if peer.startswith("config_server"):
                args += ["-p", "launcher_socket_addr", self.cs_addr]
                args += ["-p", "experiment_uuid", self.experiment_uuid]

                if restore_config:
                    args += ["-p", "restore_peers", " ".join(restore_config)]
                wait = 0.4
            proc, details = self._launch_process(path, args, data["peer_type"], peer, env=self.env, capture_io=NO_STDIO)
            if proc is not None:
                self.processes[peer] = proc
            else:
                success = False
                break
            time.sleep(wait)
        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched", machine=self.machine))
        else:
            print self.name, "[", self.type, "]", "OBCI LAUNCH FAILED"
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg("obci_launch_failed", machine=self.machine, path=path, args=args, details=details),
            )
            self.processes = {}
            self.subprocess_mgr.killall()

    def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO):
        proc, details = self.subprocess_mgr.new_local_process(
            path, args, proc_type=proc_type, name=name, monitoring_optflags=RETURNCODE, capture_io=capture_io, env=env
        )
        if proc is None:
            print self.name, "[", self.type, "]", "process launch FAILED:", path, args
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg(
                    "launch_error",
                    sender=self.uuid,
                    details=dict(machine=self.machine, path=path, args=args, error=details),
                ),
            )
        else:
            print self.name, "[", self.type, "]", "process launch success:", path, args, proc.pid
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg(
                    "launched_process_info",
                    sender=self.uuid,
                    machine=self.machine,
                    pid=proc.pid,
                    proc_type=proc_type,
                    name=name,
                    path=path,
                    args=args,
                ),
            )
        return proc, details

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]["experiment_id"]
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(
            self._publish_socket,
            self.mtool.fill_msg("tail", txt=txt, sender=self.uuid, experiment_id=experiment_id, peer_id=peer),
        )

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):

        self.subprocess_mgr.killall()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            print "~~~~~   ~~~~~   ", name, self.restarting, message.status[0]

            if (proc.proc_type == "obci_peer" or proc.proc_type == "multiplexer") and not (
                name in self.restarting and message.status[0] == "terminated"
            ):
                print "KILLLLLING     and sending obci_peer_dead", proc.name
                send_msg(
                    self._publish_socket,
                    self.mtool.fill_msg(
                        "obci_peer_dead",
                        sender=self.uuid,
                        sender_ip=self.machine,
                        peer_id=proc.name,
                        path=proc.path,
                        status=proc.status(),
                    ),
                )
            if name in self.restarting:
                self.restarting.remove(name)

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        print self.name, "got!", message.type
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, messsage, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        self.processes = {}
        # self.subprocess_mgr.killall()

    def clean_up(self):
        print self.name, "[", self.type, "]", "cleaning up"

        self.processes = {}
        self.subprocess_mgr.killall()
        self.subprocess_mgr.delete_all()
Пример #10
0
class OBCIControlPeer(object):
    """Base class for OBCI control peers (server, experiment, supervisor).

    Provides REP/PUB zmq sockets, registration with a parent peer,
    a subprocess monitor, the main poll loop and message dispatch.
    """

    msg_handlers = HandlerCollection()

    def __init__(self, source_addresses=None,
                 rep_addresses=None, pub_addresses=None, name='obci_control_peer'):
        """Create sockets and helper threads; register with the parent peer
        at *source_addresses* when given."""
        # TODO TODO TODO !!!!
        # cleaner subclassing of obci_control_peer!!!
        self.hostname = socket.gethostname()
        self.source_addresses = source_addresses if source_addresses else []
        self.rep_addresses = rep_addresses
        self.pub_addresses = pub_addresses
        self._all_sockets = []
        # inproc endpoints shared with the publisher/monitor threads.
        self._pull_addr = 'inproc://publisher_msg'
        self._push_addr = 'inproc://publisher'
        self._subpr_push_addr = 'inproc://subprocess_info'

        self.uuid = str(uuid.uuid4())
        self.name = str(name)
        self.type = self.peer_type()

        log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR,
                               self.name + '-' + self.uuid[:8])
        if not hasattr(self, 'logger'):
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            self.logger = get_logger(self.peer_type(), log_dir=log_dir,
                                     stream_level=net_tools.peer_loglevel(), obci_peer=self)

        self.mtool = self.message_tool()

        # A subclass may have created the context already.
        if not hasattr(self, "ctx"):
            self.ctx = zmq.Context()

        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
        self.net_init()

        if self.source_addresses:
            self.registration_response = self.register()
            self._handle_registration_response(self.registration_response)
        else:
            self.registration_response = None

        self.interrupted = False
        signal.signal(signal.SIGTERM, self.signal_handler())
        signal.signal(signal.SIGINT, self.signal_handler())

    def signal_handler(self):
        """Return a handler that flags the main loop to stop on SIGTERM/SIGINT."""
        def handler(signum, frame):
            self.logger.info("[!!!!] %s %s %s %s",
                             self.name, "got signal", signum, frame)
            self.interrupted = True
        return handler

    def peer_type(self):
        """Identifier of this control-peer type; subclasses override."""
        return 'obci_control_peer'

    def message_tool(self):
        """Return the (de)serialization tool for launcher messages."""
        return OBCIMessageTool(message_templates)

    def _publisher_thread(self, pub_addrs, pull_address, push_addr):
        """Thread body: forward messages from an inproc PULL socket to PUB.

        Sends b'1' on the push socket once ready, so _prepare_publisher can
        synchronize with this thread's startup.
        """
        # FIXME aaaaahhh pub_addresses are set here, not in the main thread
        # (which reads them in _register method)
        pub_sock, self.pub_addresses = self._init_socket(
            pub_addrs, zmq.PUB)

        pull_sock = self.ctx.socket(zmq.PULL)
        pull_sock.bind(pull_address)

        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)

        send_msg(push_sock, b'1')
        po = PollingObject()

        while not self._stop_publishing:
            try:
                to_publish, det = po.poll_recv(pull_sock, 500)

                if to_publish:
                    send_msg(pub_sock, to_publish)

            except Exception:
                # Narrowed from a bare `except:`, which would also swallow
                # SystemExit/KeyboardInterrupt. Any error means: stop publishing.
                # print self.name, '.Publisher -- STOP.'
                break
        # self.logger.info( "close  sock %s %s", pub_addrs, pub_sock)
        pub_sock.close()
        pull_sock.close()
        push_sock.close()

    def _subprocess_info(self, push_addr):
        """Thread body: poll the subprocess monitor and push a 'dead_process'
        message for every process found not running."""
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)

        send_msg(push_sock, b'1')
        while not self._stop_monitoring:
            dead = self.subprocess_mgr.not_running_processes()
            if dead:
                # self.logger.warning("DEAD  process" +  str(dead))
                for key, status in dead.items():
                    send_msg(push_sock, self.mtool.fill_msg('dead_process', machine=key[0],
                                                            pid=key[1], status=status))
            time.sleep(0.5)
        push_sock.close()

    def _push_sock(self, ctx, addr):
        """Create a PUSH socket connected to *addr*."""
        sock = ctx.socket(zmq.PUSH)
        sock.connect(addr)
        return sock

    def _prepare_publisher(self):
        """Start the publisher thread and wait until it is ready."""
        tmp_pull = self.ctx.socket(zmq.PULL)
        tmp_pull.bind(self._pull_addr)
        self.pub_thr = threading.Thread(target=self._publisher_thread,
                                        args=[self.pub_addresses,
                                              self._push_addr,
                                              self._pull_addr])
        self.pub_thr.daemon = True

        self._stop_publishing = False
        self.pub_thr.start()
        # Block until the thread signals readiness (sends b'1').
        recv_msg(tmp_pull)
        self._publish_socket = self._push_sock(self.ctx, self._push_addr)
        self._all_sockets.append(self._publish_socket)
        tmp_pull.close()

    def _prepare_subprocess_info(self):
        """Start the subprocess-monitoring thread and wait until it is ready."""
        self._subprocess_pull = self.ctx.socket(zmq.PULL)
        self._subprocess_pull.bind(self._subpr_push_addr)

        self.subprocess_thr = threading.Thread(target=self._subprocess_info,
                                               args=[self._subpr_push_addr])
        self.subprocess_thr.daemon = True
        self._stop_monitoring = False

        self.subprocess_thr.start()
        recv_msg(self._subprocess_pull)

        self._all_sockets.append(self._subprocess_pull)

    def net_init(self):
        """Create all base sockets and helper threads."""
        # (self.pub_socket, self.pub_addresses) = self._init_socket(
        #                                         self.pub_addresses, zmq.PUB)
        self._all_sockets = []
        self._prepare_publisher()
        self._prepare_subprocess_info()

        (self.rep_socket, self.rep_addresses) = self._init_socket(
            self.rep_addresses, zmq.REP)
        self.rep_socket.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.rep_socket)

        print("\n\tname: {0}\n\tpeer_type: {1}\n\tuuid: {2}\n".format(
            self.name, self.peer_type(), self.uuid))
        print("rep: {0}".format(self.rep_addresses))
        print("pub: {0}\n".format(self.pub_addresses))

        self.source_req_socket = self.ctx.socket(zmq.REQ)

        if self.source_addresses:
            for addr in self.source_addresses:
                self.source_req_socket.connect(addr)
        self._all_sockets.append(self.source_req_socket)
        self._set_poll_sockets()

    def _init_socket(self, addrs, zmq_type):
        """Bind a socket of *zmq_type* to *addrs* (random ports on 'tcp://*'
        when *addrs* is empty) and return (socket, advertised_addresses)."""
        # print self.peer_type(), "addresses for socket init:", addrs
        addresses = addrs if addrs else ['tcp://*']

        random_port = True if not addrs else False

        sock = self.ctx.socket(zmq_type)
        port = None
        try:
            for i, addr in enumerate(addresses):
                if random_port and net.is_net_addr(addr):
                    port = str(sock.bind_to_random_port(addr,
                                                        min_port=settings.PORT_RANGE[0],
                                                        max_port=settings.PORT_RANGE[1]))
                    addresses[i] = addr + ':' + str(port)
                else:
                    sock.bind(addr)
        except Exception as e:
            self.logger.critical("CRITICAL error: %s", str(e))
            # Bare `raise` preserves the original traceback
            # (`raise(e)` would reset it).
            raise

        # Replace wildcard binds with concrete hostname/localhost addresses.
        advertised_addrs = []
        for addr in addresses:
            if addr.startswith('tcp://*'):
                port = addr.rsplit(':', 1)[1]
                advertised_addrs.append('tcp://' + socket.gethostname() + ':' + str(port))
                advertised_addrs.append('tcp://' + 'localhost:' + str(port))
            else:
                advertised_addrs.append(addr)
        return sock, advertised_addrs

    def _register(self, rep_addrs, pub_addrs, params):
        """Send a register_peer request to the parent peer and return the
        unpacked response; exits the process on registration failure."""
        message = self.mtool.fill_msg("register_peer", peer_type=self.type,
                                      uuid=self.uuid,
                                      rep_addrs=rep_addrs,
                                      pub_addrs=pub_addrs,
                                      name=self.name,
                                      other_params=params)
        self.logger.debug("_register()  " + str(message))
        send_msg(self.source_req_socket, message)
        response_str = recv_msg(self.source_req_socket)
        response = self.mtool.unpack_msg(response_str)
        if response.type == "rq_error":
            self.logger.critical("Registration failed: {0}".format(response_str))
            sys.exit(2)
        return response

    def register(self):
        """Register this peer with its parent."""
        params = self.params_for_registration()
        return self._register(self.rep_addresses, self.pub_addresses, params)

    def _handle_registration_response(self, response):
        """Subclass hook: process the registration response."""
        pass

    def shutdown(self):
        """Terminate the process."""
        self.logger.info("SHUTTING DOWN")
        sys.exit(0)

    def params_for_registration(self):
        """Subclass hook: extra parameters sent with register_peer."""
        return {}

    def basic_sockets(self):
        """Sockets every control peer polls."""
        return [self.rep_socket, self._subprocess_pull]

    def custom_sockets(self):
        """
        subclass this
        """
        return []

    def all_sockets(self):
        return self.basic_sockets() + self.custom_sockets()

    def _set_poll_sockets(self):
        self._poll_sockets = self.all_sockets()

    @log_crash
    def run(self):
        """Main loop: poll all sockets and dispatch incoming messages until
        interrupted; always clean up on exit."""
        self.pre_run()
        poller = zmq.Poller()
        poll_sockets = list(self._poll_sockets)
        for sock in poll_sockets:
            poller.register(sock, zmq.POLLIN)

        try:
            while True:
                socks = []
                try:
                    socks = dict(poller.poll())
                except zmq.ZMQError as e:
                    self.logger.warning(": zmq.poll(): " + str(e.strerror))
                for sock in socks:
                    if socks[sock] == zmq.POLLIN:
                        more = True
                        while more:
                            try:
                                msg = recv_msg(sock, flags=zmq.NOBLOCK)
                            except zmq.ZMQError as e:
                                if e.errno == zmq.EAGAIN or sock.getsockopt(zmq.TYPE) == zmq.REP:
                                    more = False
                                else:
                                    self.logger.error("handling socket read error: %s  %d  %s",
                                                      e, e.errno, sock)
                                    poller.unregister(sock)
                                    if sock in poll_sockets:
                                        poll_sockets.remove(sock)
                                    self.handle_socket_read_error(sock, e)
                                    break
                            else:
                                self.handle_message(msg, sock)
                    else:
                        self.logger.warning("sock not zmq.POLLIN! Ignore !")

                if self.interrupted:
                    break
                self._update_poller(poller, poll_sockets)
        except Exception:
            # from urllib2 import HTTPError
            # try:
            #     self.logger.critical("UNHANDLED EXCEPTION IN %s!!! ABORTING!  Exception data: %s, e.args: %s, %s",
            #                         self.name, e, e.args, vars(e), exc_info=True,
            #                         extra={'stack': True})
            # except HTTPError, e:
            #     self.logger.info('sentry sending failed....')
            self._clean_up()
            # Bare `raise` preserves the original traceback.
            raise

        self._clean_up()

    def _crash_extra_description(self, exception=None):
        """Subclass hook: human-readable crash-report description."""
        return ""

    def _crash_extra_data(self, exception=None):
        """Subclass hook: structured crash-report data."""
        return {}

    def _crash_extra_tags(self, exception=None):
        """Subclass hook: crash-report tags."""
        return {'obci_part': 'launcher'}

    def _update_poller(self, poller, curr_sockets):
        """Re-sync *poller* with the current all_sockets() result and update
        *curr_sockets* in place so the caller's list stays accurate."""
        self._set_poll_sockets()
        new_sockets = list(self._poll_sockets)

        for sock in new_sockets:
            if sock not in curr_sockets:
                poller.register(sock, zmq.POLLIN)
        for sock in curr_sockets:
            if sock not in new_sockets:
                poller.unregister(sock)
        # Mutate in place: the previous `curr_sockets = new_sockets` only
        # rebound the local name, so the caller's list never changed and
        # later diffs ran against a stale socket list.
        curr_sockets[:] = new_sockets

    def handle_socket_read_error(self, socket, error):
        """Subclass hook: react to a failed socket read."""
        pass

    def pre_run(self):
        """Subclass hook: called once before the main loop starts."""
        pass

    def _clean_up(self):
        """Stop helper threads, close all sockets, then run clean_up()."""
        time.sleep(0.01)
        self._stop_publishing = True
        self._stop_monitoring = True
        self.pub_thr.join()
        self.subprocess_thr.join()

        for sock in self._all_sockets:
            # print self.name, "closing ", sock
            sock.close()
        # try:
        #     self.ctx.term()
        # except zmq.ZMQError(), e:
        #     print "Ctx closing interrupted."
        self.clean_up()

    def clean_up(self):
        """Subclass hook: extra cleanup after sockets are closed."""
        self.logger.info("CLEANING UP")


# message handling ######################################

    def handle_message(self, message, sock):
        """Unpack *message* and dispatch it to the registered handler;
        fall back to the error/unsupported/default handlers."""
        handler = self.msg_handlers.default

        try:
            msg = self.mtool.unpack_msg(message)
            if msg.type != "ping" and msg.type != "rq_ok":
                self.logger.debug("got message: {0}".format(msg.type))
                if msg.type == "get_tail":
                    print(self.msg_handlers)
        except ValueError as e:
            print("{0} [{1}], Bad message format! {2}".format(
                self.name, self.peer_type(), message))
            # Only REP sockets require a reply to a malformed request.
            if sock.getsockopt(zmq.TYPE) == zmq.REP:
                handler = self.msg_handlers.error
            msg = message
            print(e)
        else:
            msg_type = msg.type

            handler = self.msg_handlers.handler_for(msg_type)
            if handler is None:
                # print "{0} [{1}], Unknown message type: {2}".format(
                #                         self.name, self.peer_type(),msg_type)
                # print message

                handler = self.msg_handlers.unsupported
        handler(self, msg, sock)

    @msg_handlers.handler("register_peer")
    def handle_register_peer(self, message, sock):
        """Subclass this."""
        result = self.mtool.fill_msg("rq_error",
                                     request=vars(message), err_code="unsupported_peer_type")
        send_msg(sock, result)

    @msg_handlers.handler("ping")
    def handle_ping(self, message, sock):
        """Reply "pong" on reply-capable sockets."""
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg("pong"))

    @msg_handlers.default_handler()
    def default_handler(self, message, sock):
        """Ignore message"""
        pass

    @msg_handlers.unsupported_handler()
    def unsupported_msg_handler(self, message, sock):
        """Reply rq_error on reply-capable sockets; otherwise ignore."""
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            msg = self.mtool.fill_msg("rq_error",
                                      request=vars(message), err_code="unsupported_msg_type", sender=self.uuid)
            send_msg(sock, msg)
        # print "--"

    @msg_handlers.error_handler()
    def bad_msg_handler(self, message, sock):
        """Reply rq_error for messages that failed to parse."""
        msg = self.mtool.fill_msg("rq_error",
                                  request=message, err_code="invalid_msg_format")
        send_msg(sock, msg)

    @msg_handlers.handler("kill")
    def handle_kill(self, message, sock):
        """Shut down when the kill message targets this peer (or everyone)."""
        if not message.receiver or message.receiver == self.uuid:
            self.cleanup_before_net_shutdown(message, sock)
            self._clean_up()
            self.shutdown()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """Subclass hook: react to a dead subprocess notification."""
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Close all sockets before the network layer goes down."""
        for sock in self._all_sockets:
            sock.close()
Пример #11
0
class OBCIProcessSupervisor(OBCIControlPeer):
    """Machine-local supervisor of OBCI processes.

    Registers with its parent control peer, receives launch data for this
    machine and starts, monitors and kills the multiplexer, the config
    server and the experiment peers.
    """

    # inherit and extend the base class's message-handler registry
    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, sandbox_dir,
                 source_addresses=None,
                 source_pub_addresses=None,
                 rep_addresses=None,
                 pub_addresses=None,
                 experiment_uuid='',
                 name='obci_process_supervisor'):
        """Initialise supervisor state, then run the base-class setup.

        :param sandbox_dir: working directory for peers; falls back to
            ``settings.DEFAULT_SANDBOX_DIR`` when empty/None
        :param source_addresses: parent control peer REP addresses
        :param source_pub_addresses: parent PUB addresses (subscribed in
            ``net_init``)
        :param rep_addresses: own REP bind addresses
        :param pub_addresses: own PUB bind addresses
        :param experiment_uuid: uuid of the experiment this supervisor serves
        :param name: human-readable peer name
        """
        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        # mx_data/env must be computed before the base-class net setup runs
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        # launch_data starts as an empty list; replaced by a dict after
        # registration (see _handle_registration_response)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        # pending "launched_process_info" for config_server; published once
        # the server confirms its MX connection (or after a timer fires)
        self.__cfg_launch_info = None
        self.__cfg_morph = False
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []
        self.rqs = 0
        self._nearby_machines = net.DNS()

        self.test_count = 0
        self.__cfg_lock = threading.RLock()

        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def peer_type(self):
        """Identify this peer on the control network."""
        return "obci_process_supervisor"

    def net_init(self):
        """Create the SUB socket to the parent and the PULL socket on which
        the launched config server reports back, then run base net init."""
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt_string(zmq.SUBSCRIBE, "")

        self._all_sockets.append(self.source_sub_socket)

        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)

        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)

        # prefer a local address for config-server communication
        self.cs_addr = net.choose_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_not_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]

        self._all_sockets.append(self.config_server_socket)

        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        """Return pid, hostname and multiplexer address data for the parent."""
        mx_data = None
        if None not in self.mx_data:
            mx_data = [self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]]
        return dict(pid=os.getpid(), machine=self.machine,
                    mx_data=mx_data)

    def custom_sockets(self):
        """Extra sockets polled by the event loop besides the base ones."""
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        """Store launch data and peer start order received from the parent."""
        self.launch_data = response.params['launch_data']
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params['peer_order']
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        self.logger.info("RECEIVED LAUNCH DATA: %s", self.launch_data)

    def set_mx_data(self):
        """Decide whether the multiplexer should run on this machine.

        :return: ``((address, port), password)`` when mx is to be started
            locally, ``(None, None)`` when another machine hosts it.
        """
        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        # assumes "tcp://host:port" form -- [6:] strips the scheme prefix;
        # TODO confirm against the net helpers' address format
        src = src[6:].split(':')[0]

        if src == socket.gethostname():
            # reserve a free port by binding and closing a throwaway socket
            sock = self.ctx.socket(zmq.REP)
            port = str(sock.bind_to_random_port("tcp://127.0.0.1",
                                                min_port=settings.PORT_RANGE[0],
                                                max_port=settings.PORT_RANGE[1]))
            sock.close()
            return ('0.0.0.0', port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        """Format mx data as "host:port"; None when no mx address is set."""
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        self.logger.info("mx addr str:  " + addr + ':' + str(port))
        return addr + ':' + str(port)

    def peer_env(self, mx_data):
        """Environment for child processes: ``os.environ`` plus the
        multiplexer address; None when the mx address is unknown."""
        if mx_data[0] is None:
            return None

        env = os.environ.copy()
        addr, port = mx_data[0]
        if addr == '0.0.0.0':
            # a wildcard bind address is not connectable -- use the hostname
            addr = socket.gethostname()

        _env = {
            "MULTIPLEXER_ADDRESSES": str(addr) + ':' + str(port)
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_broker")
    def handle_start_broker(self, message, sock):
        """Launch the multiplexer, if this machine is supposed to run it."""
        if 'mx' in self.launch_data and self.mx_data[0] is not None:
            self.logger.info("..starting multiplexer")
            self.peer_order.remove(['mx'])
            self.peers_to_launch.remove('mx')
            path = launcher_tools.broker_path()

            args = [
                'run_multiplexer',
                self.mx_addr_str((('0.0.0.0', self.mx_data[0][1]), self.mx_data[1]))
            ]
            proc, details = self._launch_process(path, args, 'multiplexer', 'mx',
                                                 env=self.env)
            self.processes['mx'] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_config_server")
    def handle_start_config_srv(self, message, sock):
        """Start the config server; adopt the remote mx address when the
        multiplexer runs on another machine."""
        if 'mx' not in self.launch_data:
            # parse "host:port" from the message and rebuild the child env
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
        if "config_server" in self.launch_data:
            proc, details, wait, info_obj = \
                self.launch_process("config_server", self.launch_data["config_server"],
                                    restore_config=message.restore_config)
            # publish the launch info anyway if the server does not confirm
            # its MX connection in time
            tim = threading.Timer(1.5, self.__if_config_server_conn_didnt_work)
            tim.start()

    def __if_config_server_conn_didnt_work(self):
        """Timer callback: flush the pending config-server launch info."""
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None
                self.logger.info("connection to config server is shaky :(")

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        """Launch the experiment peers assigned to this machine."""
        self.logger.info("start peers --  my mx_data: %s, received mx_data: %s",
                         self.mx_data, message.mx_data)
        if 'mx' not in self.launch_data:
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
            # tmp workaround: wait for mx on the other machine to initialize
            time.sleep(0.75)

        if message.add_launch_data:
            if self.machine in message.add_launch_data:
                self._launch_processes(message.add_launch_data[self.machine])
        else:
            self._launch_processes(self.launch_data)

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        """Kill and/or (re)start a set of peers during a scenario morph.

        Only acts when the message is addressed to this supervisor.
        """
        if not message.receiver == self.uuid:
            return

        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                self.logger.error("peer to kill not found: %s", peer)
                continue
            self.logger.info("MORPH:  KILLING %s ", peer)
            proc.kill_with_force()
            self.logger.info("MORPH:  KILLED %s ", peer)
            del self.processes[peer]
            del self.launch_data[peer]

        for peer, data in message.start_peers_data.items():
            self.launch_data[peer] = data
        # peers that are both killed and started again are "restarting"
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]

        self._launch_processes(message.start_peers_data)

    def _launch_processes(self, launch_data, restore_config=None):
        """Launch every peer in *launch_data* except mx and config_server;
        the amplifier, if present, goes first.

        Publishes "all_peers_launched" on success; stops at the first
        failed launch.

        :param restore_config: peer ids whose config should be restored
            (None means no restoration; avoids a mutable default argument)
        """
        restore_config = restore_config if restore_config is not None else []
        success = True

        self.status = launcher_tools.LAUNCHING

        ldata = []

        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        for peer, data in launch_data.items():
            if (peer, data) not in ldata and peer != 'config_server':
                ldata.append((peer, data))

        for peer, data in ldata:
            if peer.startswith('mx'):
                continue
            proc, details, wait, info_obj = self.launch_process(peer, data, restore_config=restore_config)
            time.sleep(wait)
            if proc is None:
                success = False
                break

        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched",
                                                               machine=self.machine))

    def launch_process(self, peer, launch_data, restore_config=None):
        """Launch one peer process.

        Resolves the program path, attaches the base .ini config, the
        experiment uuid and the log-dir parameters, then spawns the process.

        :param peer: peer id (e.g. "config_server", "amplifier")
        :param launch_data: dict with 'path', 'args' and 'peer_type'
        :param restore_config: peer ids forwarded to config_server as
            'restore_peers' (avoids a mutable default argument)
        :return: ``(proc, details, wait, info_obj)``
        """
        restore_config = restore_config if restore_config is not None else []
        data = launch_data
        wait = 0
        p = os.path.expanduser(data['path'])
        if not os.path.isabs(p):
            path = os.path.join(launcher_tools.obci_root(), p)
            path = os.path.abspath(path)
        else:
            path = os.path.realpath(p)

        args = data['args']
        args = self._attach_base_config_path(path, args)
        args += ['-p', 'experiment_uuid', self.experiment_uuid]
        if peer.startswith('config_server'):
            # tell the config server where to report back (our PULL socket)
            args += ['-p', 'launcher_socket_addr', self.cs_addr]

            if restore_config:
                args += ['-p', 'restore_peers', ' '.join(restore_config)]
            # wait = 0.5
        if "log_dir" in args:
            idx = args.index("log_dir") + 1
            log_dir = args[idx]
            log_dir = os.path.join(log_dir, self.name)
            args[idx] = log_dir
        else:
            log_dir = os.path.join(CONFIG_DEFAULTS["log_dir"], self.name)
            args += ['-p', 'log_dir', log_dir]
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        proc, details = self._launch_process(path, args, data['peer_type'],
                                             peer, env=self.env, capture_io=NO_STDIO)
        info_obj = {
            "path": path,
            "args": args,
            "peer": peer
        }
        if proc is not None:
            self.processes[peer] = proc
        else:
            self.logger.error("OBCI LAUNCH FAILED")
            send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed",
                                                               machine=self.machine, path=info_obj['path'],
                                                               args=info_obj['args'], details=details))
            # a failed launch aborts the whole experiment on this machine
            self.processes = {}
            self.subprocess_mgr.killall(force=True)

        return proc, details, wait, info_obj

    def _launch_process(self, path, args, proc_type, name,
                        env=None, capture_io=NO_STDIO):
        """Spawn one local process via the subprocess monitor and publish
        the launch outcome.

        On failure a "launch_error" message is published.  On success a
        "launched_process_info" message is published -- except for the
        config server, whose message is parked in ``__cfg_launch_info``
        until the server confirms its MX connection.

        :return: ``(proc, details)`` -- ``proc`` is None on failure.
        """
        self.logger.debug("launching..... %s %s", path, args)
        proc, details = self.subprocess_mgr.new_local_process(path, args,
                                                              proc_type=proc_type,
                                                              name=name,
                                                              monitoring_optflags=RETURNCODE,
                                                              capture_io=capture_io,
                                                              env=env)

        if proc is None:
            self.logger.error("process launch FAILED: %s --- %s",
                              path, str(args))
            send_msg(self._publish_socket, self.mtool.fill_msg("launch_error",
                                                               sender=self.uuid,
                                                               details=dict(machine=self.machine, path=path, args=args,
                                                                            error=details, peer_id=name)))
        else:
            self.logger.info("process launch success:" +
                             path + str(args) + str(proc.pid))
            msg = self.mtool.fill_msg("launched_process_info",
                                      sender=self.uuid,
                                      machine=self.machine,
                                      pid=proc.pid,
                                      proc_type=proc_type, name=name,
                                      path=path,
                                      args=args)
            if name == "config_server":
                # deferred; flushed by handle_config_server_ready or the timer
                self.__cfg_launch_info = msg
            else:
                send_msg(self._publish_socket, msg)
        return proc, details

    def _attach_base_config_path(self, launch_path, launch_args):
        """Insert the default .ini path (derived from the program path)
        right after the peer id in the argument list."""
        peer_id = launch_args[0]
        base = launch_path.rsplit('.', 1)[0]
        ini = '.'.join([base, 'ini'])
        return [peer_id, ini] + launch_args[1:]

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        """Publish the last lines of a peer's captured stdout."""
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(self._publish_socket, self.mtool.fill_msg("tail", txt=txt,
                                                           sender=self.uuid,
                                                           experiment_id=experiment_id,
                                                           peer_id=peer))

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        """Intentionally a no-op in the supervisor."""
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        """Intentionally a no-op in the supervisor."""
        pass

    @msg_handlers.handler('nearby_machines')
    def handle_nearby_machines(self, message, sock):
        """Refresh the registry of machines seen on the network."""
        self._nearby_machines.mass_update(message.nearby_machines)

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        """Forcibly terminate every process this supervisor started."""
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("_kill_peer")
    def handle_kill_peer(self, message, sock):
        """Force-kill one peer process, if it runs on this machine."""
        proc = self.processes.get(message.peer_id, None)

        if proc is not None:  # is on this machine
            if message.morph and message.peer_id == 'config_server':
                # remember that config_server dies on purpose (morph)
                self.__cfg_morph = True
            proc.kill_with_force()

    @msg_handlers.handler("rq_ok")
    def handle_rq_ok(self, message, sock):
        """Count acknowledgements; log and reset every 10000th one."""
        self.rqs += 1
        if self.rqs == 10000:
            self.logger.debug("GOT %s %s", str(self.rqs), "messages!")
            self.rqs = 0

    @msg_handlers.handler("experiment_launch_error")
    def handle_experiment_launch_error(self, message, sock):
        """A launch failed somewhere -- tear down everything here too."""
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """React to a child dying: publish "obci_peer_dead" unless the peer
        terminated as part of a planned restart."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \
                    not (name in self.restarting and message.status[0] == 'terminated'):
                self.logger.info("KILLLING! sending obci_peer_"
                                 "dead for process %s", proc.name)
                send_msg(self._publish_socket, self.mtool.fill_msg("obci_peer_dead",
                                                                   sender=self.uuid,
                                                                   sender_ip=self.machine,
                                                                   peer_id=proc.name,
                                                                   path=proc.path,
                                                                   status=proc.status()
                                                                   ))
            if name in self.restarting:
                self.restarting.remove(name)
            if self.__cfg_morph and name == 'config_server':
                self.__cfg_morph = False

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        """Forward the event to our subscribers."""
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        """Forward the event to our subscribers."""
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        """Forward the event to our subscribers."""
        self.logger.info("got! " + message.type)
        send_msg(self._publish_socket, message.SerializeToString())

    # renamed from a duplicate "handle_obci_peer_ready" definition that
    # shadowed the handler above as a class attribute (dispatch still worked
    # because the decorator registers each function at definition time)
    @msg_handlers.handler("config_server_ready")
    def handle_config_server_ready(self, message, sock):
        """config_server connected to MX: publish the deferred
        "launched_process_info"."""
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        """Intentionally ignored."""
        pass

    # renamed from a duplicate "handle_obci_control_message" definition
    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_peer_dead(self, message, sock):
        """Intentionally ignored (deaths are handled via "dead_process")."""
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, message, sock):
        """Intentionally ignored."""
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Drop process references and force-kill all children before the
        network layer closes."""
        self.processes = {}
        self.subprocess_mgr.killall(force=True)

    def clean_up(self):
        """Final cleanup: kill all children and clear the monitor state."""
        self.logger.info("cleaning up")

        self.processes = {}
        self.subprocess_mgr.killall(force=True)
        self.subprocess_mgr.delete_all()

    def _crash_extra_data(self, exception=None):
        """Extend crash-report data with experiment identity."""
        data = super(OBCIProcessSupervisor, self)._crash_extra_data(exception)
        data.update({
            'experiment_uuid': self.experiment_uuid,
            'name': self.name
        })
        return data
# ---- Example #12 (snippet-site separator from the original paste; score: 0) ----
class OBCIProcessSupervisor(OBCIControlPeer):
    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, sandbox_dir,
                 source_addresses=None,
                 source_pub_addresses=None,
                 rep_addresses=None,
                 pub_addresses=None,
                 experiment_uuid='',
                 name='obci_process_supervisor'):
        """Set up supervisor state, then run the base-class initialisation."""
        # identity / environment
        self.machine = socket.gethostname()
        self.experiment_uuid = experiment_uuid
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR

        # launch bookkeeping (launch_data becomes a dict after registration)
        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.launch_data = []
        self.peers_to_launch = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.processes = {}
        self.restarting = []

        # config-server handshake state
        self.__cfg_launch_info = None
        self.__cfg_morph = False
        self.__cfg_lock = threading.RLock()

        # counters / peer discovery
        self.rqs = 0
        self.test_count = 0
        self._nearby_machines = net.DNS()

        # multiplexer connection data (requires ctx and the source addresses)
        self.source_pub_addresses = source_pub_addresses
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)

        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name)
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def peer_type(self):
        """Identify this peer on the control network."""
        return "obci_process_supervisor"

    def net_init(self):
        """Create supervisor-specific sockets, then run the base net init.

        Opens a SUB socket connected to the parent's PUB addresses and a
        PULL socket on which the launched config server reports back.
        """
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        # subscribe to everything; setsockopt_string is required for str
        # topics on Python 3 (plain setsockopt expects bytes)
        self.source_sub_socket.setsockopt_string(zmq.SUBSCRIBE, "")

        self._all_sockets.append(self.source_sub_socket)

        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)

        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)

        # prefer a local address for config-server communication
        self.cs_addr = net.choose_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_not_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]

        self._all_sockets.append(self.config_server_socket)

        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        """Return registration parameters: pid, hostname and mx address info."""
        if None in self.mx_data:
            mx_data = None
        else:
            addr_part = ((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])
            mx_data = [self.mx_addr_str(addr_part), self.mx_data[1]]
        return dict(pid=os.getpid(), machine=self.machine, mx_data=mx_data)

    def custom_sockets(self):
        """Extra sockets polled by the event loop besides the base ones."""
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        """Store launch data and peer start order received from the parent."""
        self.launch_data = response.params['launch_data']
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params['peer_order']
        self._running_peer_order = [list(part) for part in self.peer_order]
        self.logger.info("RECEIVED LAUNCH DATA: %s", self.launch_data)

    def set_mx_data(self):
        """Decide whether the multiplexer should run on this machine.

        :return: ``((address, port), password)`` when mx is to be started
            locally, ``(None, None)`` when another machine hosts it.
        """
        # prefer a non-local source address; fall back to a local ip address
        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        # assumes addresses look like "tcp://host:port" -- [6:] strips the
        # scheme prefix; TODO confirm against the net helpers' output format
        src = src[6:].split(':')[0]

        if src == socket.gethostname():
            # reserve a free port by binding and closing a throwaway socket
            sock = self.ctx.socket(zmq.REP)
            port = str(sock.bind_to_random_port("tcp://127.0.0.1",
                                                min_port=settings.PORT_RANGE[0],
                                                max_port=settings.PORT_RANGE[1]))
            sock.close()
            return ('0.0.0.0', port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        """Format mx data as "host:port"; None when no mx address is set."""
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        text = addr + ':' + str(port)
        self.logger.info("mx addr str:  " + text)
        return text

    def peer_env(self, mx_data):
        """Environment for child processes: ``os.environ`` plus multiplexer
        connection variables; None when the mx address is unknown."""
        if mx_data[0] is None:
            return None

        addr, port = mx_data[0]
        if addr == '0.0.0.0':
            # a wildcard bind address is not connectable -- use the hostname
            addr = socket.gethostname()

        env = os.environ.copy()
        env.update({
            "MULTIPLEXER_ADDRESSES": str(addr) + ':' + str(port),
            "MULTIPLEXER_PASSWORD": '',  # mx_data[1],
            "MULTIPLEXER_RULES": str(launcher_tools.mx_rules_path())
        })
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        """Launch the multiplexer, if this machine is supposed to run it."""
        if 'mx' not in self.launch_data or self.mx_data[0] is None:
            return
        self.logger.info("..starting multiplexer")
        self.peer_order.remove(['mx'])
        self.peers_to_launch.remove('mx')
        path = launcher_tools.mx_path()

        bind_data = (('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])
        args = ['run_multiplexer', self.mx_addr_str(bind_data),
                '--multiplexer-password', self.mx_data[1],
                '--rules', launcher_tools.mx_rules_path()]
        proc, details = self._launch_process(path, args, 'multiplexer', 'mx',
                                             env=self.env)
        self.processes['mx'] = proc
        if proc is not None:
            self.mx = proc

    @msg_handlers.handler("start_config_server")
    def handle_start_config_srv(self, message, sock):
        """Start the config server; adopt the remote mx address when the
        multiplexer runs on another machine."""
        if 'mx' not in self.launch_data:
            # parse "host:port" from the message and rebuild the child env
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
        if "config_server" in self.launch_data:
            proc, details, wait, info_obj = \
                self.launch_process("config_server", self.launch_data["config_server"],
                                    restore_config=message.restore_config)
            # publish the launch info anyway if the server does not confirm
            # its MX connection in time
            tim = threading.Timer(1.5, self.__if_config_server_conn_didnt_work)
            tim.start()

    def __if_config_server_conn_didnt_work(self):
        """Timer callback: publish pending config-server launch info if the
        server has not confirmed its MX connection yet."""
        with self.__cfg_lock:
            if not self.__cfg_launch_info:
                return
            send_msg(self._publish_socket, self.__cfg_launch_info)
            self.__cfg_launch_info = None
            self.logger.info("connection to config server is shaky :(")

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        """Launch the experiment peers assigned to this machine."""
        self.logger.info("start peers --  my mx_data: %s, received mx_data: %s",
                         self.mx_data, message.mx_data)
        if 'mx' not in self.launch_data:
            # mx runs on another machine: parse "host:port" from the message
            # and rebuild the child-process environment accordingly
            mx_addr = message.mx_data[1].split(':')
            mx_addr[1] = int(mx_addr[1])
            md = list(self.mx_data)
            md[0] = tuple(mx_addr)
            self.mx_data = tuple(md)
            self.env = self.peer_env(self.mx_data)
            # tmp workaround: wait for mx on the other machine to initialize
            # (NOTE: this sleep executes inside the if-branch above)
            time.sleep(0.75)

        if message.add_launch_data:
            if self.machine in message.add_launch_data:
                self._launch_processes(message.add_launch_data[self.machine])
        else:
            self._launch_processes(self.launch_data)

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        """Kill and/or (re)start a set of peers during a scenario morph.

        Only acts when the message is addressed to this supervisor.
        """
        if not message.receiver == self.uuid:
            return

        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                self.logger.error("peer to kill not found: %s", peer)
                continue
            self.logger.info("MORPH:  KILLING %s ", peer)
            proc.kill_with_force()
            self.logger.info("MORPH:  KILLED %s ", peer)
            del self.processes[peer]
            del self.launch_data[peer]

        # dict.iteritems() does not exist on Python 3 -- use items()
        for peer, data in message.start_peers_data.items():
            self.launch_data[peer] = data
        # peers that are both killed and started again are "restarting"
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]

        self._launch_processes(message.start_peers_data)

    def _launch_processes(self, launch_data, restore_config=None):
        """Launch every peer in *launch_data* except mx and config_server;
        the amplifier, if present, goes first.

        Publishes "all_peers_launched" on success; stops at the first
        failed launch.

        :param restore_config: peer ids whose config should be restored
            (None means none; avoids a mutable default argument)
        """
        restore_config = restore_config if restore_config is not None else []
        success = True

        self.status = launcher_tools.LAUNCHING

        ldata = []

        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        # dict.iteritems() does not exist on Python 3 -- use items()
        for peer, data in launch_data.items():
            if (peer, data) not in ldata and peer != 'config_server':
                ldata.append((peer, data))

        for peer, data in ldata:
            if peer.startswith('mx'):
                continue
            proc, details, wait, info_obj = self.launch_process(peer, data, restore_config=restore_config)
            time.sleep(wait)
            if proc is None:
                success = False
                break

        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched",
                                                               machine=self.machine))

    def launch_process(self, peer, launch_data, restore_config=None):
        """Launch one peer process.

        Resolves the program path, attaches the base .ini config, the
        experiment uuid and the log-dir parameters, then spawns the process.

        :param peer: peer id (e.g. "config_server", "amplifier")
        :param launch_data: dict with 'path', 'args' and 'peer_type'
        :param restore_config: peer ids forwarded to config_server as
            'restore_peers' (avoids a mutable default argument)
        :return: ``(proc, details, wait, info_obj)``
        """
        restore_config = restore_config if restore_config is not None else []
        data = launch_data
        wait = 0
        p = os.path.expanduser(data['path'])
        if not os.path.isabs(p):
            path = os.path.join(launcher_tools.obci_root(), p)
            path = os.path.abspath(path)
        else:
            path = os.path.realpath(p)

        args = data['args']
        args = self._attach_base_config_path(path, args)
        args += ['-p', 'experiment_uuid', self.experiment_uuid]
        if peer.startswith('config_server'):
            # tell the config server where to report back (our PULL socket)
            args += ['-p', 'launcher_socket_addr', self.cs_addr]

            if restore_config:
                args += ['-p', 'restore_peers', ' '.join(restore_config)]
            # wait = 0.5
        if "log_dir" in args:
            idx = args.index("log_dir") + 1
            log_dir = args[idx]
            log_dir = os.path.join(log_dir, self.name)
            args[idx] = log_dir
        else:
            log_dir = os.path.join(CONFIG_DEFAULTS["log_dir"], self.name)
            args += ['-p', 'log_dir', log_dir]
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        proc, details = self._launch_process(path, args, data['peer_type'],
                                             peer, env=self.env, capture_io=NO_STDIO)
        info_obj = {
            "path": path,
            "args": args,
            "peer": peer
        }
        if proc is not None:
            self.processes[peer] = proc
        else:
            self.logger.error("OBCI LAUNCH FAILED")
            send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed",
                                                               machine=self.machine, path=info_obj['path'],
                                                               args=info_obj['args'], details=details))
            # a failed launch aborts the whole experiment on this machine
            self.processes = {}
            self.subprocess_mgr.killall(force=True)

        return proc, details, wait, info_obj

    def _launch_process(self, path, args, proc_type, name,
                        env=None, capture_io=NO_STDIO):
        """Spawn one local process via the subprocess monitor and publish
        the launch outcome.

        On failure a "launch_error" message is published.  On success a
        "launched_process_info" message is published -- except for the
        config server, whose message is parked in ``__cfg_launch_info``
        until the server confirms its MX connection.

        :return: ``(proc, details)`` -- ``proc`` is None on failure.
        """
        self.logger.debug("launching..... %s %s", path, args)
        proc, details = self.subprocess_mgr.new_local_process(path, args,
                                                              proc_type=proc_type,
                                                              name=name,
                                                              monitoring_optflags=RETURNCODE,
                                                              capture_io=capture_io,
                                                              env=env)

        if proc is None:
            self.logger.error("process launch FAILED: %s --- %s",
                              path, str(args))
            send_msg(self._publish_socket, self.mtool.fill_msg("launch_error",
                                                               sender=self.uuid,
                                                               details=dict(machine=self.machine, path=path, args=args,
                                                                            error=details, peer_id=name)))
        else:
            self.logger.info("process launch success:" +
                             path + str(args) + str(proc.pid))
            msg = self.mtool.fill_msg("launched_process_info",
                                      sender=self.uuid,
                                      machine=self.machine,
                                      pid=proc.pid,
                                      proc_type=proc_type, name=name,
                                      path=path,
                                      args=args)
            if name == "config_server":
                # deferred publication; flushed when the config server
                # confirms its MX connection or when the 1.5 s timer fires.
                # NOTE(review): written without holding __cfg_lock -- confirm
                # this is safe w.r.t. the timer callback.
                self.__cfg_launch_info = msg
            else:
                send_msg(self._publish_socket, msg)
        return proc, details

    def _attach_base_config_path(self, launch_path, launch_args):
        """Insert the default .ini path (derived from the program path)
        right after the peer id in the argument list."""
        peer_id, rest = launch_args[0], launch_args[1:]
        ini_path = launch_path.rsplit('.', 1)[0] + '.ini'
        return [peer_id, ini_path] + rest

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        """Publish the last captured stdout lines of a locally running peer."""
        line_count = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            # not a peer of this supervisor -- ignore the request
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        tail_text = self.processes[peer].tail_stdout(lines=line_count)
        send_msg(self._publish_socket,
                 self.mtool.fill_msg("tail", txt=tail_text,
                                     sender=self.uuid,
                                     experiment_id=experiment_id,
                                     peer_id=peer))

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        """Deliberate no-op: this peer does not react to experiment end."""

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        """Deliberate no-op: morphing is handled elsewhere in this variant."""

    @msg_handlers.handler('nearby_machines')
    def handle_nearby_machines(self, message, sock):
        # Merge the advertised machine list into the local registry.
        # NOTE(review): self._nearby_machines is not assigned in the visible
        # __init__ code (the server variant uses `_nearby_servers`) -- confirm
        # a subclass/mixin sets this attribute before the handler can fire.
        self._nearby_machines.mass_update(message.nearby_machines)

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        """Force-kill every child process managed by this peer."""
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("_kill_peer")
    def handle_kill_peer(self, message, sock):
        """Forcefully terminate a peer process running on this machine."""
        proc = self.processes.get(message.peer_id)
        if proc is None:
            # the peer is not supervised here -- nothing to do
            return
        if message.morph and message.peer_id == 'config_server':
            # remember that config_server dies as part of a morph so its
            # death is not treated as a failure later
            self.__cfg_morph = True
        proc.kill_with_force()

    @msg_handlers.handler("rq_ok")
    def handle_rq_ok(self, message, sock):
        """Count throughput-test acknowledgements; log and reset at 10000."""
        self.rqs += 1
        if self.rqs == 10000:
            self.logger.debug("GOT %s %s", str(self.rqs), "messages!")
            self.rqs = 0

    @msg_handlers.handler("experiment_launch_error")
    def handle_experiment_launch_error(self, message, sock):
        """On a launch error anywhere in the experiment, force-kill all
        local children."""
        self.subprocess_mgr.killall(force=True)

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """React to a child-process death reported by the monitor thread.

        Broadcasts 'obci_peer_dead' unless the death is an expected part of
        a restart (peer listed in self.restarting with status 'terminated').
        """
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is None:
            return
        proc.mark_delete()
        name = proc.name
        is_peer_proc = proc.proc_type in ('obci_peer', 'multiplexer')
        expected_death = name in self.restarting and message.status[0] == 'terminated'
        if is_peer_proc and not expected_death:
            self.logger.info("KILLLING! sending obci_peer_"
                             "dead for process %s", proc.name)
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("obci_peer_dead",
                                         sender=self.uuid,
                                         sender_ip=self.machine,
                                         peer_id=proc.name,
                                         path=proc.path,
                                         status=proc.status()))
        if name in self.restarting:
            self.restarting.remove(name)
        if self.__cfg_morph and name == 'config_server':
            # config_server died as planned during a morph
            self.__cfg_morph = False

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        """Forward the peer-registration event to our PUB subscribers."""
        payload = message.SerializeToString()
        send_msg(self._publish_socket, payload)

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        """Forward the parameter-change event to our PUB subscribers."""
        payload = message.SerializeToString()
        send_msg(self._publish_socket, payload)

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        """Log a peer's 'ready' notification and re-broadcast it."""
        # lazy %-style args instead of eager string concatenation in the log
        # call; rendered output is identical ("got! <type>")
        self.logger.info("got! %s", message.type)
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("config_server_ready")
    def handle_config_server_ready(self, message, sock):
        """Publish the deferred 'launched_process_info' for config_server.

        The message is stashed by the launching code until config_server
        confirms it connected to MX; publish it exactly once.
        """
        # Renamed from a second `handle_obci_peer_ready` definition, which
        # shadowed the "obci_peer_ready" handler method above.  Dispatch is
        # by decorator registration, so behavior is unchanged.
        with self.__cfg_lock:
            if self.__cfg_launch_info:
                send_msg(self._publish_socket, self.__cfg_launch_info)
                self.__cfg_launch_info = None

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        """Deliberately ignored."""

    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_peer_dead(self, message, sock):
        """Deliberately ignored.

        Renamed from a second `handle_obci_control_message` definition,
        which shadowed the handler method above; dispatch is by decorator
        registration, so behavior is unchanged.
        """
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, message, sock):
        """Deliberately ignored."""
        # fixed parameter-name typo: 'messsage' -> 'message' (handlers are
        # dispatched positionally, so this is interface-safe)
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Forget tracked processes and force-kill children before the
        peer's sockets are torn down."""
        self.processes = {}
        self.subprocess_mgr.killall(force=True)

    def clean_up(self):
        """Force-kill every child process and drop all bookkeeping."""
        self.logger.info("cleaning up")
        self.processes = {}
        self.subprocess_mgr.killall(force=True)
        self.subprocess_mgr.delete_all()

    def _crash_extra_data(self, exception=None):
        """Extend the base crash-report data with supervisor identity."""
        data = super(OBCIProcessSupervisor, self)._crash_extra_data(exception)
        data['experiment_uuid'] = self.experiment_uuid
        data['name'] = self.name
        return data
Пример #13
0
class OBCIControlPeer(object):
    """Base class for OBCI control peers (Python 2 variant).

    Sets up ZeroMQ sockets (REP for requests, PUB served by a dedicated
    publisher thread, REQ towards optional "source" addresses), starts a
    background thread reporting dead subprocesses, optionally registers
    with the source peer and installs SIGTERM/SIGINT handlers.
    """

    # message-type -> handler-method registry, populated by the
    # @msg_handlers.handler(...) decorators in subclasses
    msg_handlers = HandlerCollection()

    def __init__(self, source_addresses=None,
                    rep_addresses=None, pub_addresses=None, name='obci_control_peer'):
        """Initialize logging, messaging, sockets and monitor threads.

        :param source_addresses: addresses of a peer to register with
            (registration is skipped when empty/None)
        :param rep_addresses: REP socket addresses (random local port when None)
        :param pub_addresses: PUB socket addresses (random local port when None)
        :param name: human-readable peer name used in logs
        """
        ###TODO TODO TODO !!!!
        ###cleaner subclassing of obci_control_peer!!!
        self.hostname = socket.gethostname()
        self.source_addresses = source_addresses if source_addresses else []
        self.rep_addresses = rep_addresses
        self.pub_addresses = pub_addresses
        self._all_sockets = []
        # inproc endpoints used to talk to the publisher / monitor threads
        self._pull_addr = 'inproc://publisher_msg'
        self._push_addr = 'inproc://publisher'
        self._subpr_push_addr = 'inproc://subprocess_info'

        self.uuid = str(uuid.uuid4())
        self.name = str(name)
        self.type = self.peer_type()

        log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR,
                                self.name + '-' + self.uuid[:8])
        if not hasattr(self, 'logger'):
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            self.logger = get_logger(self.peer_type(), log_dir=log_dir,
                                    stream_level=net_tools.peer_loglevel(), obci_peer=self)

        self.mtool = self.message_tool()

        # a subclass may have created the zmq context already
        if not hasattr(self, "ctx"):
            self.ctx = zmq.Context()

        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
        self.net_init()

        if self.source_addresses:
            self.registration_response = self.register()
            self._handle_registration_response(self.registration_response)
        else: self.registration_response = None

        self.interrupted = False
        signal.signal(signal.SIGTERM, self.signal_handler())
        signal.signal(signal.SIGINT, self.signal_handler())



    def signal_handler(self):
        """Return a handler that only flags the peer as interrupted."""
        def handler(signum, frame):
            self.logger.info("[!!!!] %s %s %s %s",
                        self.name, "got signal", signum, frame)
            self.interrupted = True
        return handler

    def peer_type(self):
        # subclasses override this identifier
        return 'obci_control_peer'

    def message_tool(self):
        # factory for the message serializer; subclasses may override
        return OBCIMessageTool(message_templates)

    def _publisher_thread(self, pub_addrs, pull_address, push_addr):
        """Bind the PUB socket and forward everything from *pull_address*.

        Runs in a background thread; signals readiness by pushing '1' to
        *push_addr*, then loops until self._stop_publishing is set.
        """
        #FIXME aaaaahhh pub_addresses are set here, not in the main thread
        # (which reads them in _register method)
        pub_sock, self.pub_addresses = self._init_socket(
                                    pub_addrs, zmq.PUB)

        pull_sock = self.ctx.socket(zmq.PULL)
        pull_sock.bind(pull_address)

        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)

        # handshake: tell _prepare_publisher() the PUB socket is bound
        send_msg(push_sock, u'1')
        po = PollingObject()

        while not self._stop_publishing:
            try:
                to_publish, det = po.poll_recv(pull_sock, 500)

                if to_publish:
                    send_msg(pub_sock, to_publish)

            except:
                #print self.name, '.Publisher -- STOP.'
                break
        # self.logger.info( "close  sock %s %s", pub_addrs, pub_sock)
        pub_sock.close()
        pull_sock.close()
        push_sock.close()

    def _subprocess_info(self, push_addr):
        """Poll the subprocess monitor and push 'dead_process' messages.

        Runs in a background thread until self._stop_monitoring is set.
        """
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)

        # handshake with _prepare_subprocess_info()
        send_msg(push_sock, u'1')
        while not self._stop_monitoring:
            dead = self.subprocess_mgr.not_running_processes()
            if dead:
                # self.logger.warning("DEAD  process" +  str(dead))
                for key, status in dead.iteritems():
                    send_msg(push_sock, self.mtool.fill_msg('dead_process', machine=key[0],
                                                        pid=key[1], status=status))
            time.sleep(0.5)
        push_sock.close()


    def _push_sock(self, ctx, addr):
        # helper: connected PUSH socket
        sock = ctx.socket(zmq.PUSH)
        sock.connect(addr)
        return sock

    def _prepare_publisher(self):
        """Start the publisher thread; block until its PUB socket is bound."""
        tmp_pull = self.ctx.socket(zmq.PULL)
        tmp_pull.bind(self._pull_addr)
        self.pub_thr = threading.Thread(target=self._publisher_thread,
                                        args=[self.pub_addresses,
                                            self._push_addr,
                                            self._pull_addr])
        self.pub_thr.daemon = True

        self._stop_publishing = False
        self.pub_thr.start()
        # wait for the readiness handshake (see _publisher_thread)
        recv_msg(tmp_pull)
        self._publish_socket = self._push_sock(self.ctx, self._push_addr)
        self._all_sockets.append(self._publish_socket)
        tmp_pull.close()

    def _prepare_subprocess_info(self):
        """Start the subprocess-monitor thread; wait for its handshake."""
        self._subprocess_pull = self.ctx.socket(zmq.PULL)
        self._subprocess_pull.bind(self._subpr_push_addr)

        self.subprocess_thr = threading.Thread(target=self._subprocess_info,
                                            args=[self._subpr_push_addr])
        self.subprocess_thr.daemon = True
        self._stop_monitoring = False

        self.subprocess_thr.start()
        recv_msg(self._subprocess_pull)

        self._all_sockets.append(self._subprocess_pull)


    def net_init(self):
        """Create and bind all sockets; order matters (publisher first)."""
        # (self.pub_socket, self.pub_addresses) = self._init_socket(
        #                                         self.pub_addresses, zmq.PUB)
        self._all_sockets = []
        self._prepare_publisher()
        self._prepare_subprocess_info()

        (self.rep_socket, self.rep_addresses) = self._init_socket(
                                                self.rep_addresses, zmq.REP)
        self.rep_socket.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.rep_socket)

        print "\n\tname: {0}\n\tpeer_type: {1}\n\tuuid: {2}\n".format(
                                    self.name, self.peer_type(), self.uuid)
        print "rep: {0}".format(self.rep_addresses)
        print "pub: {0}\n".format(self.pub_addresses)

        self.source_req_socket = self.ctx.socket(zmq.REQ)

        if self.source_addresses:
            for addr in self.source_addresses:
                self.source_req_socket.connect(addr)
        self._all_sockets.append(self.source_req_socket)
        self._set_poll_sockets()


    def _init_socket(self, addrs, zmq_type):
        """Bind a socket of *zmq_type*; pick random ports when no address given.

        Returns (socket, advertised_addresses), where wildcard 'tcp://*'
        bindings are rewritten to concrete hostname/localhost addresses.
        """
        # print self.peer_type(), "addresses for socket init:", addrs
        addresses = addrs if addrs else ['tcp://*']

        random_port = True if not addrs else False

        sock = self.ctx.socket(zmq_type)
        port = None
        try:
            for i, addr in enumerate(addresses):
                if random_port and net.is_net_addr(addr):
                    port = str(sock.bind_to_random_port(addr,
                                                min_port=settings.PORT_RANGE[0],
                                                max_port=settings.PORT_RANGE[1]))
                    addresses[i] = addr + ':' + str(port)
                else:
                    sock.bind(addr)
        except Exception, e:
            self.logger.critical("CRITICAL error: %s", str(e))
            raise(e)

        advertised_addrs = []
        for addr in addresses:
            if addr.startswith('tcp://*'):
                port = addr.rsplit(':', 1)[1]
                advertised_addrs.append('tcp://' + socket.gethostname() + ':' +str(port))
                advertised_addrs.append('tcp://' + 'localhost:' + str(port))
            else:
                advertised_addrs.append(addr)
        return sock, advertised_addrs
Пример #14
0
class OBCIControlPeer(object):
    """Base class for OBCI control peers (PEP8-formatted Python 2 variant).

    Sets up ZeroMQ sockets (REP for requests, PUB served by a dedicated
    publisher thread, REQ towards optional "source" addresses), starts a
    background thread reporting dead subprocesses, optionally registers
    with the source peer and installs SIGTERM/SIGINT handlers.
    """

    # message-type -> handler-method registry, populated by the
    # @msg_handlers.handler(...) decorators in subclasses
    msg_handlers = HandlerCollection()

    def __init__(self, source_addresses=None,
                 rep_addresses=None, pub_addresses=None, name='obci_control_peer'):
        """Initialize logging, messaging, sockets and monitor threads.

        :param source_addresses: addresses of a peer to register with
            (registration is skipped when empty/None)
        :param rep_addresses: REP socket addresses (random local port when None)
        :param pub_addresses: PUB socket addresses (random local port when None)
        :param name: human-readable peer name used in logs
        """
        # TODO TODO TODO !!!!
        # cleaner subclassing of obci_control_peer!!!
        self.hostname = socket.gethostname()
        self.source_addresses = source_addresses if source_addresses else []
        self.rep_addresses = rep_addresses
        self.pub_addresses = pub_addresses
        self._all_sockets = []
        # inproc endpoints used to talk to the publisher / monitor threads
        self._pull_addr = 'inproc://publisher_msg'
        self._push_addr = 'inproc://publisher'
        self._subpr_push_addr = 'inproc://subprocess_info'

        self.uuid = str(uuid.uuid4())
        self.name = str(name)
        self.type = self.peer_type()

        log_dir = os.path.join(settings.OBCI_CONTROL_LOG_DIR,
                               self.name + '-' + self.uuid[:8])
        if not hasattr(self, 'logger'):
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            self.logger = get_logger(self.peer_type(), log_dir=log_dir,
                                     stream_level=net_tools.peer_loglevel(), obci_peer=self)

        self.mtool = self.message_tool()

        # a subclass may have created the zmq context already
        if not hasattr(self, "ctx"):
            self.ctx = zmq.Context()

        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)
        self.net_init()

        if self.source_addresses:
            self.registration_response = self.register()
            self._handle_registration_response(self.registration_response)
        else:
            self.registration_response = None

        self.interrupted = False
        signal.signal(signal.SIGTERM, self.signal_handler())
        signal.signal(signal.SIGINT, self.signal_handler())

    def signal_handler(self):
        """Return a handler that only flags the peer as interrupted."""
        def handler(signum, frame):
            self.logger.info("[!!!!] %s %s %s %s",
                             self.name, "got signal", signum, frame)
            self.interrupted = True
        return handler

    def peer_type(self):
        # subclasses override this identifier
        return 'obci_control_peer'

    def message_tool(self):
        # factory for the message serializer; subclasses may override
        return OBCIMessageTool(message_templates)

    def _publisher_thread(self, pub_addrs, pull_address, push_addr):
        """Bind the PUB socket and forward everything from *pull_address*.

        Runs in a background thread; signals readiness by pushing '1' to
        *push_addr*, then loops until self._stop_publishing is set.
        """
        # FIXME aaaaahhh pub_addresses are set here, not in the main thread
        # (which reads them in _register method)
        pub_sock, self.pub_addresses = self._init_socket(
            pub_addrs, zmq.PUB)

        pull_sock = self.ctx.socket(zmq.PULL)
        pull_sock.bind(pull_address)

        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)

        # handshake: tell _prepare_publisher() the PUB socket is bound
        send_msg(push_sock, u'1')
        po = PollingObject()

        while not self._stop_publishing:
            try:
                to_publish, det = po.poll_recv(pull_sock, 500)

                if to_publish:
                    send_msg(pub_sock, to_publish)

            except:
                # print self.name, '.Publisher -- STOP.'
                break
        # self.logger.info( "close  sock %s %s", pub_addrs, pub_sock)
        pub_sock.close()
        pull_sock.close()
        push_sock.close()

    def _subprocess_info(self, push_addr):
        """Poll the subprocess monitor and push 'dead_process' messages.

        Runs in a background thread until self._stop_monitoring is set.
        """
        push_sock = self.ctx.socket(zmq.PUSH)
        push_sock.connect(push_addr)

        # handshake with _prepare_subprocess_info()
        send_msg(push_sock, u'1')
        while not self._stop_monitoring:
            dead = self.subprocess_mgr.not_running_processes()
            if dead:
                # self.logger.warning("DEAD  process" +  str(dead))
                for key, status in dead.iteritems():
                    send_msg(push_sock, self.mtool.fill_msg('dead_process', machine=key[0],
                                                            pid=key[1], status=status))
            time.sleep(0.5)
        push_sock.close()

    def _push_sock(self, ctx, addr):
        # helper: connected PUSH socket
        sock = ctx.socket(zmq.PUSH)
        sock.connect(addr)
        return sock

    def _prepare_publisher(self):
        """Start the publisher thread; block until its PUB socket is bound."""
        tmp_pull = self.ctx.socket(zmq.PULL)
        tmp_pull.bind(self._pull_addr)
        self.pub_thr = threading.Thread(target=self._publisher_thread,
                                        args=[self.pub_addresses,
                                              self._push_addr,
                                              self._pull_addr])
        self.pub_thr.daemon = True

        self._stop_publishing = False
        self.pub_thr.start()
        # wait for the readiness handshake (see _publisher_thread)
        recv_msg(tmp_pull)
        self._publish_socket = self._push_sock(self.ctx, self._push_addr)
        self._all_sockets.append(self._publish_socket)
        tmp_pull.close()

    def _prepare_subprocess_info(self):
        """Start the subprocess-monitor thread; wait for its handshake."""
        self._subprocess_pull = self.ctx.socket(zmq.PULL)
        self._subprocess_pull.bind(self._subpr_push_addr)

        self.subprocess_thr = threading.Thread(target=self._subprocess_info,
                                               args=[self._subpr_push_addr])
        self.subprocess_thr.daemon = True
        self._stop_monitoring = False

        self.subprocess_thr.start()
        recv_msg(self._subprocess_pull)

        self._all_sockets.append(self._subprocess_pull)

    def net_init(self):
        """Create and bind all sockets; order matters (publisher first)."""
        # (self.pub_socket, self.pub_addresses) = self._init_socket(
        #                                         self.pub_addresses, zmq.PUB)
        self._all_sockets = []
        self._prepare_publisher()
        self._prepare_subprocess_info()

        (self.rep_socket, self.rep_addresses) = self._init_socket(
            self.rep_addresses, zmq.REP)
        self.rep_socket.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.rep_socket)

        print "\n\tname: {0}\n\tpeer_type: {1}\n\tuuid: {2}\n".format(
            self.name, self.peer_type(), self.uuid)
        print "rep: {0}".format(self.rep_addresses)
        print "pub: {0}\n".format(self.pub_addresses)

        self.source_req_socket = self.ctx.socket(zmq.REQ)

        if self.source_addresses:
            for addr in self.source_addresses:
                self.source_req_socket.connect(addr)
        self._all_sockets.append(self.source_req_socket)
        self._set_poll_sockets()

    def _init_socket(self, addrs, zmq_type):
        """Bind a socket of *zmq_type*; pick random ports when no address given.

        Returns (socket, advertised_addresses), where wildcard 'tcp://*'
        bindings are rewritten to concrete hostname/localhost addresses.
        """
        # print self.peer_type(), "addresses for socket init:", addrs
        addresses = addrs if addrs else ['tcp://*']

        random_port = True if not addrs else False

        sock = self.ctx.socket(zmq_type)
        port = None
        try:
            for i, addr in enumerate(addresses):
                if random_port and net.is_net_addr(addr):
                    port = str(sock.bind_to_random_port(addr,
                                                        min_port=settings.PORT_RANGE[0],
                                                        max_port=settings.PORT_RANGE[1]))
                    addresses[i] = addr + ':' + str(port)
                else:
                    sock.bind(addr)
        except Exception, e:
            self.logger.critical("CRITICAL error: %s", str(e))
            raise(e)

        advertised_addrs = []
        for addr in addresses:
            if addr.startswith('tcp://*'):
                port = addr.rsplit(':', 1)[1]
                advertised_addrs.append('tcp://' + socket.gethostname() + ':' + str(port))
                advertised_addrs.append('tcp://' + 'localhost:' + str(port))
            else:
                advertised_addrs.append(addr)
        return sock, advertised_addrs
Пример #15
0
class OBCIProcessSupervisor(OBCIControlPeer):

    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    def __init__(self, sandbox_dir,
                                        source_addresses=None,
                                        source_pub_addresses=None,
                                        rep_addresses=None,
                                        pub_addresses=None,
                                        experiment_uuid='',
                                        name='obci_process_supervisor'):
        """Set up supervisor state, then initialize the base control peer.

        :param sandbox_dir: working directory for peers (falls back to
            settings.DEFAULT_SANDBOX_DIR)
        :param source_pub_addresses: PUB addresses of the experiment source,
            used to decide where the multiplexer runs (see set_mx_data)
        :param experiment_uuid: id of the experiment this supervisor serves
        """
        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        # the context must exist before set_mx_data() binds its probe socket
        self.ctx = zmq.Context()
        self.mx_data = self.set_mx_data()
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        self.processes = {}
        self.restarting = []

        super(OBCIProcessSupervisor, self).__init__(
                                            source_addresses=source_addresses,
                                            rep_addresses=rep_addresses,
                                            pub_addresses=pub_addresses,
                                            name=name)
        # NOTE(review): this replaces the SubprocessMonitor that the base
        # __init__ creates (with logger=...) -- confirm that is intentional.
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)


    def peer_type(self):
        """Identify this control peer as a process supervisor."""
        return "obci_process_supervisor"

    def net_init(self):
        """Create supervisor-specific sockets, then the base peer sockets.

        Adds a SUB socket connected to the experiment source's PUB addresses
        and a PULL socket (random port) for the config server to report to.
        """
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "")

        self._all_sockets.append(self.source_sub_socket)

        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)

        # bound on a random port; the chosen address is later handed to
        # config_server via its 'launcher_socket_addr' parameter
        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")

        # prefer an externally reachable address over a local one
        self.cs_addr = net.choose_not_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]

        self._all_sockets.append(self.config_server_socket)

        # NOTE(review): the base net_init re-initializes _all_sockets = [],
        # which would discard the sockets appended above -- verify.
        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        """Registration payload: local pid, hostname and multiplexer data."""
        mx_str = self.mx_addr_str(
            ((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1]))
        return dict(pid=os.getpid(), machine=self.machine,
                    mx_data=[mx_str, self.mx_data[1]])

    def custom_sockets(self):
        """Sockets this peer polls in addition to the base class set."""
        return [self.source_sub_socket, self.config_server_socket]


    def _handle_registration_response(self, response):
        # launch_data: peer_id -> launch parameters, as sent by the server
        self.launch_data = response.params['launch_data']
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params['peer_order']
        # keep a mutable copy of the ordered launch "parts"
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        print self.name,'[', self.type, ']',  "RECEIVED LAUNCH DATA: ", self.launch_data


    def set_mx_data(self):
        """Decide whether the multiplexer should run on this machine.

        Returns ((address, port), password) when the source address points
        at the local host, otherwise (None, None).
        """
        candidates = net.choose_not_local(self.source_pub_addresses)[:1]
        if not candidates:
            candidates = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        # strip the 'tcp://' prefix and the ':port' suffix
        host = candidates[0][6:].split(':')[0]

        if host != socket.gethostname():
            return None, None

        # reserve a free local port for the multiplexer
        probe = self.ctx.socket(zmq.REP)
        port = str(probe.bind_to_random_port("tcp://127.0.0.1",
                                             min_port=settings.PORT_RANGE[0],
                                             max_port=settings.PORT_RANGE[1]))
        probe.close()
        return ('0.0.0.0', port), ""  # empty password

    def mx_addr_str(self, mx_data):
        """Format ((addr, port), passwd) multiplexer data as 'addr:port'.

        Returns None when no multiplexer address is set.
        """
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        print self.name,'[', self.type, ']', "mx addr str", addr + ':' + str(port)
        return addr + ':' + str(port)


    def peer_env(self, mx_data):
        """Build the environment for child peers.

        Returns a copy of os.environ extended with multiplexer settings,
        or None when mx_data carries no local multiplexer address.
        """
        if mx_data[0] is None:
            return None

        addr, port = mx_data[0]
        env = os.environ.copy()
        env["MULTIPLEXER_ADDRESSES"] = socket.gethostname() + ':' + str(port)
        env["MULTIPLEXER_PASSWORD"] = mx_data[1]
        env["MULTIPLEXER_RULES"] = launcher_tools.mx_rules_path()
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        """Launch the multiplexer locally if this supervisor is responsible.

        Only acts when 'mx' is present in the launch data and set_mx_data()
        chose a local address (self.mx_data[0] is not None).
        """
        if 'mx' in self.launch_data and self.mx_data[0] is not None:
            print self.name,'[', self.type, ']', "..starting multiplexer"
            # mx is launched here, so drop it from the ordinary launch queue
            self.peer_order.remove(['mx'])
            self.peers_to_launch.remove('mx')
            path = launcher_tools.mx_path()

            args = ['run_multiplexer', self.mx_addr_str(
                                (('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])),
                    '--multiplexer-password', self.mx_data[1],
                    '--rules', launcher_tools.mx_rules_path()]
            proc, details = self._launch_process(path, args, 'multiplexer', 'mx',
                                                env=self.env)
            # NOTE(review): proc may be None on failure; 'mx' then maps to
            # None in self.processes
            self.processes['mx'] = proc
            if proc is not None:
                self.mx = proc


    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        """Launch every peer described in the stored launch data."""
        self._launch_processes(self.launch_data)

    def test(self):
        """Ad-hoc zmq throughput check: count SEND messages on a SUB socket.

        NOTE(review): debugging utility; SEND is defined outside this chunk.
        The SUB socket never sets a SUBSCRIBE option here, so it should not
        receive anything -- verify before relying on this method.
        """
        # for i in range(SEND):
        #     send_msg(self.push, str(i))
        self.pull = self.ctx.socket(zmq.SUB)
        self.pull.bind('tcp://*:16789')

        received = 0
        prev = -1
        for i in range(SEND):
            msg = recv_msg(self.pull)
            if int(msg):
                # prev = int(msg)
                received += 1
            if received % 10000 == 0:
                print "zmq: received ", received, "messages, last: ", msg

        if received == SEND:
            print "zmq: OK"
        else:
            print "WUT?", received
        # self.push.close()
        self.pull.close()


    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        """Kill the listed peers and start the new ones (scenario morphing).

        config_server is always killed and restarted; surviving peers are
        passed to the new config_server via its 'restore_peers' parameter.
        """
        if not message.receiver == self.uuid:
            return
        message.kill_peers.append('config_server')

        message.start_peers_data['config_server'] = dict(self.launch_data['config_server'])
        restore_config = [peer for peer in self.processes if peer not in message.kill_peers]
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                print self.name,'[', self.type, ']', "peer to kill not found:", peer
                continue
            print "MORPH:  KILLING ", peer
            proc.kill()
            print "MORPH:  KILLED ", peer
            del self.processes[peer]
            del self.launch_data[peer]

        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        # peers killed and immediately restarted: their 'terminated' status
        # must not be broadcast as a failure (see handle_dead_process)
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]

        self._launch_processes(message.start_peers_data, restore_config=restore_config)


    def _launch_processes(self, launch_data, restore_config=[]):
        """Launch all peers from *launch_data* (config_server and amplifier
        first), publishing 'all_peers_launched' or 'obci_launch_failed'.

        NOTE(review): mutable default argument restore_config=[] -- never
        mutated here, but worth fixing.
        :param restore_config: peer ids whose config the freshly started
            config_server should restore
        """
        proc, details = None, None
        success = True
        path, args = None, None

        self.status = launcher_tools.LAUNCHING

        # launch order: config_server, then amplifier, then everything else
        ldata = []
        if 'config_server' in launch_data:
            ldata.append(('config_server', launch_data['config_server']))
        if 'amplifier' in launch_data:
            ldata.append(('amplifier', launch_data['amplifier']))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata:
                ldata.append((peer, data))

        for peer, data in ldata:#self.launch_data.iteritems():
            wait = 0
            if peer.startswith('mx'):
                # the multiplexer is launched separately (handle_start_mx)
                continue
            path = os.path.join(launcher_tools.obci_root(), data['path'])
            args = data['args']
            if peer.startswith('config_server'):
                args += ['-p', 'launcher_socket_addr', self.cs_addr]
                args += ['-p', 'experiment_uuid', self.experiment_uuid]

                if restore_config:
                    args += ['-p', 'restore_peers', ' '.join(restore_config)]
                # give config_server a head start before the other peers
                wait = 0.4
            proc, details = self._launch_process(path, args, data['peer_type'],
                                                        peer, env=self.env, capture_io=NO_STDIO)
            if proc is not None:
                self.processes[peer] = proc
            else:
                success = False
                break
            time.sleep(wait)
        if success:
            send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched",
                                                    machine=self.machine))
        else:
            print self.name,'[', self.type, ']', "OBCI LAUNCH FAILED"
            send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed",
                                                    machine=self.machine, path=path,
                                                    args=args, details=details))
            self.processes = {}
            self.subprocess_mgr.killall()


    def _launch_process(self, path, args, proc_type, name,
                                    env=None, capture_io=NO_STDIO):
        """Start one child process via the subprocess monitor and publish
        either 'launched_process_info' or 'launch_error'.

        Returns (process, details); process is None on failure.
        """
        proc, details = self.subprocess_mgr.new_local_process(path, args,
                                                        proc_type=proc_type,
                                                        name=name,
                                                        monitoring_optflags=RETURNCODE,
                                                        capture_io=capture_io,
                                                        env=env)
        if proc is None:
            print self.name,'[', self.type, ']', "process launch FAILED:", path, args
            send_msg(self._publish_socket, self.mtool.fill_msg("launch_error",
                                            sender=self.uuid,
                                            details=dict(machine=self.machine, path=path, args=args,
                                                        error=details)))
        else:
            print self.name,'[', self.type, ']', "process launch success:", path, args, proc.pid
            send_msg(self._publish_socket, self.mtool.fill_msg("launched_process_info",
                                            sender=self.uuid,
                                            machine=self.machine,
                                            pid=proc.pid,
                                            proc_type=proc_type, name=name,
                                            path=path,
                                            args=args))
        return proc, details

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        """Publish the last captured stdout lines of a locally running peer."""
        line_count = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]['experiment_id']
        tail_text = self.processes[peer].tail_stdout(lines=line_count)
        send_msg(self._publish_socket,
                 self.mtool.fill_msg("tail", txt=tail_text,
                                     sender=self.uuid,
                                     experiment_id=experiment_id,
                                     peer_id=peer))


    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        """Handler stub: "experiment_finished" messages are currently ignored here."""
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        """Handler stub: morphing to a new scenario is not implemented in this class."""
        pass

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        """Terminate every subprocess tracked by this supervisor's monitor."""
        self.subprocess_mgr.killall()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            print '~~~~~   ~~~~~   ', name, self.restarting, message.status[0]

            if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \
                                not (name in self.restarting and message.status[0] == 'terminated'):
                print "KILLLLLING     and sending obci_peer_dead", proc.name
                send_msg(self._publish_socket, self.mtool.fill_msg("obci_peer_dead",
                                                sender=self.uuid,
                                                sender_ip=self.machine,
                                                peer_id=proc.name,
                                                path=proc.path,
                                                status=proc.status()
                                                ))
            if name in self.restarting:
                self.restarting.remove(name)

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        """Re-broadcast a peer registration message to our subscribers."""
        serialized = message.SerializeToString()
        send_msg(self._publish_socket, serialized)

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        """Re-broadcast a peer parameter-change message to our subscribers."""
        serialized = message.SerializeToString()
        send_msg(self._publish_socket, serialized)

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        print self.name , "got!", message.type
        send_msg(self._publish_socket, message.SerializeToString())


    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        """Deliberate no-op: generic control messages need no action here."""
        # ignore :)
        pass

    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_peer_dead(self, message, sock):
        """Deliberate no-op for incoming "obci_peer_dead" broadcasts.

        This supervisor emits "obci_peer_dead" itself (see handle_dead_process),
        so copies arriving over the wire require no further action.
        """
        # Renamed from handle_obci_control_message: the old name duplicated
        # the "obci_control_message" handler defined above and shadowed it as
        # a class attribute (flake8 F811). The decorator registration by
        # message type is unaffected.
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, message, sock):
        """Deliberate no-op: supervisor registration messages need no action here."""
        # Fixed misspelled parameter name ('messsage' -> 'message') to match
        # the (message, sock) signature used by every other handler.
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Drop all tracked process references before network shutdown.

        Subprocesses are NOT killed here (a killall call was deliberately
        commented out in the original); full teardown happens in clean_up().
        """
        self.processes = {}

    def clean_up(self):
        print self.name,'[', self.type, ']',  "cleaning up"

        self.processes = {}
        self.subprocess_mgr.killall()
        self.subprocess_mgr.delete_all()