예제 #1
0
class Minion:
    """
    Base class for minions.

    Holds the identifiers of the application being served, a
    ``StateControlRedis`` wrapper built from the given connection and the
    catalog of status messages. Subclasses must override :meth:`process`.
    """
    MSG_PROCESSED = 'message_processed'

    def __init__(self, redis_conn, workflow_id, app_id, config):
        """
        Store the collaborators and build the (translated) message catalog.

        :param redis_conn: connection handed to ``StateControlRedis``.
        :param workflow_id: identifier of the workflow being executed.
        :param app_id: identifier of the application (workflow instance).
        :param config: configuration object, stored as-is.
        """
        self.redis_conn = redis_conn
        self.state_control = StateControlRedis(redis_conn)
        self.workflow_id = workflow_id
        self.app_id = app_id
        self.config = config

        # Catalog of (code, translated text) pairs. Each text is wrapped
        # in _() with a literal so that gettext can extract it.
        self.MNN000 = ('MNN000', _('Success.'))
        self.MNN001 = ('MNN001', _('Port output format not supported.'))
        self.MNN002 = ('MNN002', _('Success getting data from task.'))
        self.MNN003 = ('MNN003', _('State does not exists, processing app.'))
        self.MNN004 = ('MNN004', _('Invalid port.'))
        self.MNN005 = (
            'MNN005',
            _('Unable to retrieve data because a previous error.'))
        self.MNN006 = (
            'MNN006',
            _('Invalid Python code or incorrect encoding: {}'))
        self.MNN007 = ('MNN007', _('Job {} was canceled'))
        self.MNN008 = ('MNN008', _('App {} was terminated'))
        self.MNN009 = ('MNN009', _('Workflow specification is missing'))
        self.MNN010 = (
            'MNN010',
            _('Task completed, but not executed (not used in the workflow).'))

        # These texts are only used by the template file; they are listed
        # here so that gettext detects them.
        self.msgs = [
            _('Task running'),
            _('Task completed'),
            _('Task running (cached data)'),
        ]

    def process(self):
        """Run the minion. Must be implemented by subclasses."""
        raise NotImplementedError()

    def _generate_output(self, message, status=None, code=None):
        """
        Push a JSON feedback record to the output queue of this app.

        :param message: text of the feedback.
        :param status: status label; ``'OK'`` is used when it is ``None``.
        :param code: optional code stored in the record.
        """
        effective_status = 'OK' if status is None else status
        # Key order is kept stable because it defines the serialized JSON.
        record = {
            'message': message,
            'workflow_id': self.workflow_id,
            'app_id': self.app_id,
            'code': code,
            'date': datetime.datetime.now().isoformat(),
            'status': effective_status,
        }
        self.state_control.push_app_output_queue(self.app_id,
                                                 json.dumps(record))

    def _perform_ping(self):
        """Publish a READY status (with this process pid) expiring in 10s."""
        payload = json.dumps({'status': 'READY', 'pid': os.getpid()})
        self.state_control.set_minion_status(self.app_id,
                                             payload,
                                             ex=10,
                                             nx=False)

    @staticmethod
    def reload_code(q):
        """
        Watch ``_watch_dir`` recursively with pyinotify and loop forever
        dispatching events to an ``EventHandler``.

        :param q: not used by this method.
        """
        watch_manager = pyinotify.WatchManager()
        event_notifier = pyinotify.Notifier(watch_manager, EventHandler())
        watch_manager.add_watch(_watch_dir, pyinotify.ALL_EVENTS, rec=True)
        event_notifier.loop()

    def ping(self, q):
        """
        Ping redis every 5 seconds to inform the master that this minion is
        online; stops as soon as ``q`` is no longer empty.
        """
        log.info('Start ping')
        while True:
            if not q.empty():
                break
            self._perform_ping()
            time.sleep(5)
예제 #2
0
파일: server.py 프로젝트: yuanbw/juicer
class JuicerServer:
    """
    The JuicerServer is responsible for managing the lifecycle of minions.
    A minion controls an application, i.e., an active instance of a workflow.
    Thus, the JuicerServer receives launch requests from clients, launches and
    manages minion processes and takes care of their proper termination.
    """
    STARTED = 'STARTED'
    LOADED = 'LOADED'
    TERMINATED = 'TERMINATED'
    HELP_UNHANDLED_EXCEPTION = 1
    HELP_STATE_LOST = 2
    BANNER = """
     ██╗██╗   ██╗██╗ ██████╗███████╗██████╗ 
     ██║██║   ██║██║██╔════╝██╔════╝██╔══██╗
     ██║██║   ██║██║██║     █████╗  ██████╔╝
██   ██║██║   ██║██║██║     ██╔══╝  ██╔══██╗
╚█████╔╝╚██████╔╝██║╚██████╗███████╗██║  ██║
 ╚════╝  ╚═════╝ ╚═╝ ╚═════╝╚══════╝╚═╝  ╚═╝
"""

    def __init__(self,
                 config,
                 minion_executable,
                 log_dir='/tmp',
                 config_file_path=None):
        """
        Store the configuration, install the SIGTERM handler and read the
        minion-related settings.

        :param config: configuration mapping; the ``juicer`` section is read.
        :param minion_executable: path of the script used to launch minions.
        :param log_dir: directory for minion logs. When falsy, the value of
            ``juicer.log.path`` in the configuration (default ``/tmp``) is
            used instead.
        :param config_file_path: configuration file path forwarded to the
            minions through the ``-c`` option.
        """
        # Process handles. Only some of them are assigned elsewhere in the
        # visible code (see process()).
        self.start_process = None
        self.minion_support_process = None
        self.new_minion_watch_process = None
        self.minion_watch_process = None
        self.minion_status_process = None
        self.state_control = None

        # (workflow_id, app_id) -> data about the launched minion
        self.active_minions = {}

        self.config = config
        configuration.set_config(config)
        self.minion_executable = minion_executable
        self.config_file_path = config_file_path

        if log_dir:
            self.log_dir = log_dir
        else:
            log_settings = self.config['juicer'].get('log', {})
            self.log_dir = log_settings.get('path', '/tmp')

        signal.signal(signal.SIGTERM, self._terminate)

        minion_settings = config['juicer'].get('minion', {})

        port_bounds = minion_settings.get('libprocess_port_range',
                                          [36000, 36500])
        self.port_range = list(range(*port_bounds))
        self.advertise_ip = minion_settings.get('libprocess_advertise_ip')

        # Minion requires 3 different ports:
        # 1 for libprocess/Mesos communication
        # 1 for driver port
        # 1 for block manager
        self.port_offset = minion_settings.get('port_offset', 100)

        redis_url = config['juicer']['servers']['redis_url']
        self.mgr = socketio.RedisManager(redis_url, 'job_output')

    def _emit_event(self, room, name, namespace, message, status, identifier,
                    **kwargs):
        data = {'message': message, 'status': status, 'id': identifier}
        data.update(kwargs)
        print('-' * 20)
        print('Emiting', data)
        print('-' * 20)
        self.mgr.emit(name, data=data, room=str(room), namespace=namespace)

    def start(self):
        """
        Body of the "master" process: resume pending apps, then consume the
        "start" queue forever.

        Installs ``_terminate_minions`` as the SIGTERM handler, connects to
        the Redis server named in ``juicer.servers.redis_url`` and, for every
        ``queue_app_*`` key that still holds a message, launches a minion
        using the cluster, platform and job id found in the first message.
        It then loops calling :meth:`read_start_queue`; this method never
        returns normally.
        """
        signal.signal(signal.SIGTERM, self._terminate_minions)
        log.info(_('Starting master process. Reading "start" queue'))

        parsed_url = urlparse(self.config['juicer']['servers']['redis_url'])
        redis_conn = redis.StrictRedis(host=parsed_url.hostname,
                                       port=parsed_url.port,
                                       decode_responses=True)

        # Start pending minions
        apps = [q.split('_')[-1] for q in redis_conn.keys('queue_app_*')]
        self.state_control = StateControlRedis(redis_conn)

        for app_id in apps:
            # Only peek at the head of the queue; the message stays there
            # for the minion to consume.
            pending = redis_conn.lrange('queue_app_{}'.format(app_id), 0, 0)
            if not pending:
                log.warning(_("Pending queue is empty"))
                continue

            msg = json.loads(pending[0])
            log.warning(_('Starting pending app_id {}').format(app_id))
            # FIXME: cluster
            cluster = msg['cluster']
            platform = msg['workflow']['platform']['slug']
            job_id = msg['job_id']

            # The app id is also used as the workflow id here.
            # NOTE(review): minions started here are not recorded in
            # self.active_minions, so _get_next_available_port() does not
            # see their ports as taken — confirm whether this is intended.
            self._start_minion(app_id,
                               app_id,
                               job_id,
                               self.state_control,
                               platform,
                               cluster=cluster)

        while True:
            self.read_start_queue(redis_conn)

    # noinspection PyMethodMayBeStatic
    def read_start_queue(self, redis_conn):
        """
        Pop one message from the "start" queue and dispatch it.

        Execute messages are forwarded to the minion of the app (which is
        launched when needed); terminate messages are forwarded and then
        the minion is terminated. Any other type is only logged.

        Errors never propagate: connection errors are logged and followed
        by a one second pause, ``KeyboardInterrupt`` is ignored, and other
        exceptions are logged and, when the app id is already known,
        reported in the output queue of the app.

        :param redis_conn: Redis connection wrapped in a new
            ``StateControlRedis`` on every call.
        """
        app_id = None
        try:
            self.state_control = StateControlRedis(redis_conn)
            # Process next message
            log.info(_('Reading "start" queue.'))
            msg = self.state_control.pop_start_queue()
            log.info(_('Forwarding message to minion.'))
            msg_info = json.loads(msg)

            # Extract message type and common parameters
            msg_type = msg_info['type']
            workflow_id = str(msg_info['workflow_id'])
            app_id = str(msg_info['app_id'])
            job_id = str(msg_info.get('job_id', 0))
            # NOTE(review): if juicer_protocol.EXECUTE is a plain string,
            # `in` is a substring test rather than an equality test —
            # confirm against the protocol module.
            if msg_type in juicer_protocol.EXECUTE:
                platform = msg_info['workflow'].get('platform',
                                                    {}).get('slug', 'spark')
                cluster = msg_info['cluster']
                self._forward_to_minion(msg_type, workflow_id, app_id, job_id,
                                        msg, platform, cluster)

            elif msg_type == juicer_protocol.TERMINATE:
                cluster = msg_info.get('cluster')
                platform = msg_info.get('workflow',
                                        {}).get('platform',
                                                {}).get('slug', 'spark')
                # FIXME
                job_id = 0
                self._forward_to_minion(msg_type, workflow_id, app_id, job_id,
                                        msg, platform, cluster)
                self._terminate_minion(workflow_id, app_id)

            else:
                log.warning(_('Unknown message type %s'), msg_type)

        except ConnectionError as cx:
            log.exception(cx)
            # Give the server a moment before the caller retries.
            time.sleep(1)

        except JuicerException as je:
            log.exception(je)
            if app_id:
                self.state_control.push_app_output_queue(
                    app_id, json.dumps({
                        'code': je.code,
                        'message': str(je)
                    }))
        except KeyboardInterrupt:
            pass
        except Exception as ex:
            log.exception(ex)
            if app_id:
                self.state_control.push_app_output_queue(
                    app_id, json.dumps({
                        'code': 500,
                        'message': str(ex)
                    }))

    def _forward_to_minion(self, msg_type, workflow_id, app_id, job_id, msg,
                           platform, cluster):
        """
        Deliver ``msg`` to the minion of ``app_id``, launching it if needed.

        A minion is considered running when a status is registered for the
        app. Otherwise a new one is started (after terminating a stale entry
        in ``active_minions``, if any) and recorded. In both cases the
        message is pushed to the app queue, the workflow is marked as
        ``STARTED`` and a notice is pushed to the app output queue.
        """
        minion_info = self.state_control.get_minion_status(app_id)
        log.info(_('Minion status for (workflow_id=%s,app_id=%s): %s'),
                 workflow_id, app_id, minion_info)

        if not minion_info:
            minion_key = (workflow_id, app_id)
            # Special case: the minion timed out (no status registered) but
            # is still tracked, so it is killed before starting a new one.
            if minion_key in self.active_minions:
                self._terminate_minion(workflow_id, app_id)

            launched = self._start_minion(workflow_id,
                                          app_id,
                                          job_id,
                                          self.state_control,
                                          platform,
                                          cluster=cluster)
            # FIXME Kubernetes
            launched_pid = launched.pid if launched else 0
            self.active_minions[minion_key] = {
                'pid': launched_pid,
                'process': launched,
                'cluster': cluster,
                'port': self._get_next_available_port()
            }
        else:
            # A registered status means the minion is already running, so
            # there is nothing to launch.
            log.info(_('Minion (workflow_id=%s,app_id=%s) is running on %s.'),
                     workflow_id, app_id, platform)

        # Forward the message to the minion, which can be an execute or a
        # deliver command
        self.state_control.push_app_queue(app_id, msg)
        self.state_control.set_workflow_status(workflow_id, self.STARTED)

        log.info(
            _('Message %s forwarded to minion (workflow_id=%s,app_id=%s)'),
            msg_type, workflow_id, app_id)

        notice = json.dumps({
            'code': 0,
            'message': 'Minion is processing message %s' % msg_type
        })
        self.state_control.push_app_output_queue(app_id, notice)

    def _start_minion(self,
                      workflow_id,
                      app_id,
                      job_id,
                      state_control,
                      platform,
                      restart=False,
                      cluster=None):
        """
        Launch a minion with the strategy that matches the cluster type.

        Clusters whose ``type`` is ``'KUBERNETES'`` use
        :meth:`_start_kubernetes_minion`; everything else (including a
        missing cluster) uses :meth:`_start_subprocess_minion`.

        :return: whatever the selected launcher returns.
        """
        log.info('Cluster: %s', cluster)
        cluster = {} if cluster is None else cluster

        if cluster.get('type') == 'KUBERNETES':
            launcher = self._start_kubernetes_minion
        else:
            launcher = self._start_subprocess_minion
        return launcher(workflow_id, app_id, job_id, state_control, platform,
                        restart, cluster)

    def _start_kubernetes_minion(self,
                                 workflow_id,
                                 app_id,
                                 job_id,
                                 state_control,
                                 platform,
                                 restart=False,
                                 cluster=None):
        """
        Launch the minion as a Kubernetes job.

        Emits an ``update job`` event for ``job_id``, asks
        ``create_kb8s_job`` to run the minion command and registers a
        minion status with a placeholder pid (1) that expires in 300
        seconds. ``restart`` is accepted but not used.

        :return: an empty dict (no local process object exists).
        """
        cluster = {} if cluster is None else cluster
        from juicer.kb8s import create_kb8s_job

        self._emit_event(room=job_id,
                         namespace='/stand',
                         name='update job',
                         message=_('Creating a JOB in Kubernetes.'),
                         status='INFO',
                         identifier=job_id)

        minion_id = 'minion_{}_{}'.format(workflow_id, app_id)
        log.info(_('Starting minion %s in Kubernetes.'), minion_id)

        minion_cmd = ['python', '/usr/local/juicer/juicer/runner/minion.py']
        minion_cmd += ['-w', str(workflow_id)]
        minion_cmd += ['-a', str(app_id)]
        minion_cmd += ['-t', platform]
        minion_cmd += ['-c', self.config_file_path]
        log.info(_('Minion command: %s'), json.dumps(minion_cmd))
        create_kb8s_job(workflow_id, minion_cmd, cluster)

        # Expires in 300 seconds (enough to KB8s start the pod?)
        placeholder_status = json.dumps({'pid': 1})
        state_control.set_minion_status(app_id,
                                        placeholder_status,
                                        ex=300,
                                        nx=False)
        return {}

    def _start_subprocess_minion(self,
                                 workflow_id,
                                 app_id,
                                 job_id,
                                 state_control,
                                 platform,
                                 restart=False,
                                 cluster=None):
        """
        Launch the minion as a local child process.

        The child runs ``self.minion_executable`` under ``nohup`` with its
        stdout/stderr appended to ``minion_<workflow>_<app>_out.log`` and
        ``..._err.log`` inside ``self.log_dir``. Port settings are passed
        through environment variables, and a minion status holding the pid
        and port is registered with a 30 second expiration.

        ``job_id``, ``restart`` and ``cluster`` are accepted for signature
        compatibility with the other launcher but are not used here.

        :return: the ``subprocess.Popen`` object of the minion.
        :raises ValueError: if no port is available.
        """
        if cluster is None:
            cluster = {}
        minion_id = 'minion_{}_{}'.format(workflow_id, app_id)
        stdout_log = os.path.join(self.log_dir, minion_id + '_out.log')
        stderr_log = os.path.join(self.log_dir, minion_id + '_err.log')
        log.info(_('Forking minion %s.'), minion_id)

        port = self._get_next_available_port()

        # Setup command and launch the minion script. We return the subprocess
        # created as part of an active minion.
        # spark.driver.port and spark.driver.blockManager.port are required
        # when running the driver inside a docker container.
        minion_cmd = [
            'nohup',
            sys.executable,
            self.minion_executable,
            '-w',
            str(workflow_id),
            '-a',
            str(app_id),
            '-t',
            platform,
            '-c',
            self.config_file_path,
        ]
        log.info(_('Minion command: %s'), json.dumps(minion_cmd))

        # Mesos / libprocess configuration. See:
        # http://mesos.apache.org/documentation/latest/configuration/libprocess/
        cloned_env = os.environ.copy()
        cloned_env['LIBPROCESS_PORT'] = str(port)
        cloned_env['SPARK_DRIVER_PORT'] = str(port + self.port_offset)
        cloned_env['SPARK_DRIVER_BLOCKMANAGER_PORT'] = str(port + 2 *
                                                           self.port_offset)

        if self.advertise_ip is not None:
            cloned_env['LIBPROCESS_ADVERTISE_IP'] = self.advertise_ip

        # The child receives its own copies of the log descriptors, so the
        # parent closes its handles right after spawning; otherwise two
        # descriptors would leak in the server for every minion launched.
        with open(stdout_log, 'a') as out_file, \
                open(stderr_log, 'a') as err_file:
            proc = subprocess.Popen(minion_cmd,
                                    stdout=out_file,
                                    stderr=err_file,
                                    env=cloned_env)

        # Expires in 30 seconds. nx=False is passed, so presumably an
        # existing status is overwritten (TODO confirm against
        # StateControlRedis.set_minion_status).
        proc_id = int(proc.pid)
        state_control.set_minion_status(app_id,
                                        json.dumps({
                                            'pid': proc_id,
                                            'port': port
                                        }),
                                        ex=30,
                                        nx=False)
        return proc

    def _terminate_minion(self, workflow_id, app_id):
        """
        Terminate the minion of ``(workflow_id, app_id)`` and forget it.

        Kubernetes minions are removed with ``delete_kb8s_job``; local ones
        receive SIGTERM. The entry is dropped from ``active_minions`` in
        both cases, which also releases its port. When the pair is not
        being tracked only log messages are produced.

        A process that no longer exists is not an error: the entry is still
        removed. Entries without a usable pid are removed without signaling,
        because ``os.kill(0, ...)`` would signal the server's own process
        group and ``os.kill(None, ...)`` raises ``TypeError``.
        """
        # In this case we got a request for terminating this workflow
        # execution instance (app). Thus, we are going to explicitly
        # terminate the workflow, clear any remaining metadata and return
        key = (workflow_id, app_id)
        tracked = key in self.active_minions
        if not tracked:
            log.warning('(%s, %s) not in active minions ', workflow_id,
                        app_id)
        log.info(_("Terminating (workflow_id=%s,app_id=%s)"), workflow_id,
                 app_id)
        if not tracked:
            return

        minion_data = self.active_minions[key] or {}
        cluster = minion_data.get('cluster') or {}
        if cluster.get('type') == 'KUBERNETES':
            # try to kill Job in KB8s
            delete_kb8s_job(workflow_id, cluster)
        else:
            pid = minion_data.get('pid')
            if pid:
                try:
                    os.kill(pid, signal.SIGTERM)
                except ProcessLookupError:
                    log.info(_('Minion process %s is already gone.'), pid)
            else:
                log.warning(
                    _('No valid pid registered for (workflow_id=%s,'
                      'app_id=%s); nothing to signal.'), workflow_id,
                    app_id)
        self.active_minions.pop(key, None)

    def minion_support(self):
        """
         Control minion resource allocation and execution.
         Improve: define a parameter for sleeping time
        """
        # while True:
        #     print(self.active_minions)
        #     for (workflow_id, app_id), minion_data in list(
        #             self.active_minions.items()):
        #         cluster = minion_data.get('cluster', {})
        #         if cluster is not None and cluster.get('type') == 'KUBERNETES'
        #             eval_and_kill_pending_jobs(cluster)
        #     time.sleep(10)
        pass

    #
    # def read_minion_support_queue(self, redis_conn):
    #     try:
    #         state_control = StateControlRedis(redis_conn)
    #         ticket = json.loads(state_control.pop_master_queue())
    #         workflow_id = ticket.get('workflow_id')
    #         app_id = ticket.get('app_id', ticket.get('workflow_id'))
    #         reason = ticket.get('reason')
    #         log.info(_("Master received a ticket for app %s"), app_id)
    #         if reason == self.HELP_UNHANDLED_EXCEPTION:
    #             # Let's kill the minion and start another
    #             minion_info = json.loads(
    #                 state_control.get_minion_status(app_id))
    #             while True:
    #                 try:
    #                     os.kill(minion_info['pid'], signal.SIGKILL)
    #                 except OSError as err:
    #                     if err.errno == errno.ESRCH:
    #                         break
    #                 time.sleep(.5)
    #
    #             # Review with cluster
    #             # FIXME: platform
    #             platform = 'spark'
    #             self._start_minion(workflow_id, app_id, state_control,
    #                                platform)
    #
    #         elif reason == self.HELP_STATE_LOST:
    #             pass
    #         else:
    #             log.warn(_("Unknown help reason %s"), reason)
    #     except KeyboardInterrupt:
    #         pass
    #     except ConnectionError as cx:
    #         log.exception(cx)
    #         time.sleep(1)
    #
    #     except Exception as ex:
    #         log.exception(ex)

    def _get_next_available_port(self):
        used_ports = set(
            [minion['port'] for minion in list(self.active_minions.values())])
        for i in self.port_range:
            if i not in used_ports:
                return i
        raise ValueError(
            _('Unable to launch minion: there is not available '
              'port for libprocess.'))

    def watch_new_minion(self):
        """
        Follow minion status keys through Redis keyspace notifications.

        Subscribes to events on ``key_minion_app*`` keys. The app id is the
        numeric suffix of the notified key; other channels are ignored.

        * ``del``/``expired`` on a tracked minion: the entry is dropped and,
          if its app queue still holds messages, a new minion is started.
        * ``set`` on an untracked minion: it is registered as an externally
          launched minion, with the pid found in its status and the next
          available port.

        ``KeyboardInterrupt`` ends the method silently; a connection error
        is logged and followed by a one second pause before returning.

        NOTE(review): entries are keyed here by ``(int, int)`` using the app
        id twice, while ``_forward_to_minion`` receives string ids from
        ``read_start_queue`` — confirm that both are meant to share
        ``active_minions``.
        """
        try:
            log.info(_('Watching minions events.'))

            parsed_url = urlparse(
                self.config['juicer']['servers']['redis_url'])
            redis_conn = redis.StrictRedis(host=parsed_url.hostname,
                                           port=parsed_url.port)
            redis_conn.config_set('notify-keyspace-events', 'KE$gx')
            pub_sub = redis_conn.pubsub()
            pub_sub.psubscribe('__keyspace*__:key_minion_app*')
            for msg in pub_sub.listen():
                # print('|{}|'.format(msg.get('channel')))
                # A missing channel must not break the loop: str has no
                # decode(), so only bytes are decoded.
                channel = msg.get('channel') or b''
                if isinstance(channel, bytes):
                    channel = channel.decode('utf8')
                app_id = channel.split('_')[-1]
                if not app_id.isdigit():
                    continue

                app_id = int(app_id)
                key = (app_id, app_id)
                data = msg.get('data', '')
                if key in self.active_minions:
                    if data not in (b'del', b'expired'):
                        continue
                    del self.active_minions[key]
                    log.info(_('Minion {} finished.').format(app_id))
                    pending = redis_conn.lrange(
                        'queue_app_{}'.format(app_id), 0, 0)
                    if pending:
                        log.warning(
                            _('There are messages to process in app {} '
                              'queue, starting minion.').format(app_id))
                        if self.state_control is None:
                            self.state_control = StateControlRedis(
                                redis_conn)
                        # FIXME: Cluster and platform and job_id
                        print('-' * 10)
                        print(pending)
                        print('-' * 10)
                        platform = 'spark'
                        self._start_minion(app_id, app_id, 0,
                                           self.state_control, platform)

                elif data == b'set':
                    # Externally launched minion
                    raw_info = redis_conn.get(
                        'key_minion_app_{}'.format(app_id))
                    if raw_info is None:
                        # The key was deleted or expired between the
                        # notification and this read; nothing to register.
                        continue
                    minion_info = json.loads(raw_info.decode('utf8'))
                    port = self._get_next_available_port()
                    self.active_minions[key] = {
                        'pid': minion_info.get('pid'),
                        'port': port
                    }
                    log.info(
                        _('Minion {} joined (pid: {}, port: {}).').format(
                            app_id, minion_info.get('pid'), port))
        except KeyboardInterrupt:
            pass
        except ConnectionError as cx:
            log.exception(cx)
            time.sleep(1)

    def process(self):
        """
        Start the three helper processes and wait for them.

        The processes run :meth:`start` ("master"), :meth:`minion_support`
        ("help_desk") and :meth:`watch_new_minion` ("minion_status"), all
        as non-daemon processes. A ``KeyboardInterrupt`` while waiting
        triggers :meth:`_terminate`.
        """
        log.info(_('Juicer server started (pid=%s)'), os.getpid())

        self.start_process = multiprocessing.Process(target=self.start,
                                                     name="master")
        self.minion_support_process = multiprocessing.Process(
            target=self.minion_support, name="help_desk")
        self.new_minion_watch_process = multiprocessing.Process(
            target=self.watch_new_minion, name="minion_status")

        # Start and join order: master, help desk, minion watcher.
        workers = (self.start_process, self.minion_support_process,
                   self.new_minion_watch_process)
        for worker in workers:
            worker.daemon = False

        for worker in workers:
            worker.start()

        try:
            for worker in workers:
                worker.join()
        except KeyboardInterrupt:
            self._terminate(None, None)

    # noinspection PyUnusedLocal
    def _terminate_minions(self, _signal, _frame):
        """
        Signal handler: terminate every active minion and exit with
        status 0. Both arguments are ignored.
        """
        log.info(_('Terminating %s active minions'), len(self.active_minions))
        # Iterate over a snapshot of the keys, because terminating a minion
        # may remove its entry from the dictionary.
        for workflow_id, app_id in list(self.active_minions):
            self._terminate_minion(workflow_id, app_id)
        sys.exit(0)

    # noinspection PyUnusedLocal
    def _terminate(self, _signal, _frame):
        """
        Stop the helper processes created by :meth:`process`.

        Installed as the SIGTERM handler in ``__init__`` and also called
        when ``process`` is interrupted from the keyboard; both arguments
        are ignored. The "master" process receives SIGTERM, while the
        support and watcher processes receive SIGKILL.

        Helpers that were never started (no pid yet) or that have already
        exited are skipped, so one missing process does not prevent the
        others from being signaled.
        """
        log.info(_('Killing juicer server subprocesses and terminating'))
        # if self.minion_watch_process:
        #     os.kill(self.minion_watch_process.pid, signal.SIGKILL)
        targets = (
            (self.start_process, signal.SIGTERM),
            (self.minion_support_process, signal.SIGKILL),
            (self.new_minion_watch_process, signal.SIGKILL),
        )
        for proc, sig in targets:
            if not proc or proc.pid is None:
                continue
            try:
                os.kill(proc.pid, sig)
            except ProcessLookupError:
                # The process has already finished; nothing to kill.
                log.debug('Process %s (pid=%s) already exited', proc.name,
                          proc.pid)