Exemplo n.º 1
0
    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        """Store the cluster settings, load the Mesos plugin, then connect.

        Each argument is kept on the instance under its own name. A falsy
        ``default_volumes`` is stored as an empty list. Construction ends
        with a call to ``self.connect()``, so building an instance is not
        side-effect free.
        """
        # Settings handed in by the caller.
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.principal = principal
        self.secret = secret
        self.mesos_role = mesos_role
        self.framework_id = framework_id
        self.enabled = enabled
        self.default_volumes = default_volumes if default_volumes else []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

        # Runtime state: nothing is connected or tracked yet.
        self.tasks = {}
        self.runner = None
        self.deferred = None
        processor = TaskProcessor()
        self.processor = processor
        self.queue = PyDeferredQueue()

        processor.load_plugin(provider_module='task_processing.plugins.mesos')
        self.connect()
Exemplo n.º 2
0
    def stop(self):
        """Shut the framework down and fail every task still being tracked.

        Forgets the framework id, stops the runner if there is one, cancels
        the deferred waiting on the event queue, swaps in a fresh queue, and
        finally marks each tracked task as exited with ``None`` before
        dropping it from ``self.tasks``.
        """
        self.framework_id = None

        runner = self.runner
        if runner:
            runner.stop()

        # Drop any pending events: cancel the waiter and start a new queue.
        waiter = self.deferred
        if waiter:
            waiter.cancel()
        self.queue = PyDeferredQueue()

        # Iterate over a snapshot because entries are deleted as we go.
        tracked = tuple(self.tasks.items())
        for task_id, tracked_task in tracked:
            tracked_task.exited(None)
            del self.tasks[task_id]
Exemplo n.º 3
0
    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        """Remember the given settings, prepare empty state, and connect.

        All arguments are copied onto same-named attributes; when
        ``default_volumes`` is falsy an empty list is stored instead. The
        task_processing Mesos plugin is loaded into a new ``TaskProcessor``
        and ``self.connect()`` is invoked before returning.
        """
        # Caller-supplied configuration.
        self.enabled = enabled
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.mesos_role = mesos_role
        self.principal = principal
        self.secret = secret
        self.framework_id = framework_id
        self.offer_timeout = offer_timeout
        self.dockercfg_location = dockercfg_location
        self.default_volumes = default_volumes if default_volumes else []

        # No runner, no pending deferred and no tasks until connect() runs.
        self.runner = None
        self.deferred = None
        self.tasks = {}

        task_processor = TaskProcessor()
        self.processor = task_processor
        self.queue = PyDeferredQueue()

        plugin_module = 'task_processing.plugins.mesos'
        task_processor.load_plugin(provider_module=plugin_module)
        self.connect()
Exemplo n.º 4
0
    def stop(self, fail_tasks=False):
        """Stop the framework and reset the event queue.

        Clears the framework id, stops the runner when one exists, cancels
        and forgets the deferred waiting on the queue, and replaces the
        queue with a new one.

        Args:
            fail_tasks: when true, every tracked task is additionally
                marked as exited with ``None`` and removed from
                ``self.tasks``; otherwise the tracked tasks are left alone.
        """
        self.framework_id = None

        runner = self.runner
        if runner:
            runner.stop()

        # Clear message queue: cancel the waiter, then start from scratch.
        waiter = self.deferred
        if waiter:
            waiter.cancel()
            self.deferred = None
        self.queue = PyDeferredQueue()

        if not fail_tasks:
            return

        # Snapshot first; entries are deleted while we walk them.
        tracked = tuple(self.tasks.items())
        for task_id, tracked_task in tracked:
            tracked_task.exited(None)
            del self.tasks[task_id]
Exemplo n.º 5
0
class MesosCluster:
    """One Mesos master connection plus bookkeeping for the tasks sent to it.

    An instance holds a task_processing runner (``self.runner``), the queue
    that is handed to that runner (``self.queue``), the deferred currently
    waiting on that queue (``self.deferred``), and the tasks it has submitted
    or is recovering (``self.tasks``, keyed by Mesos task id). Events are
    taken from the queue one at a time; see ``handle_next_event``.
    """

    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        """Store the settings, load the Mesos plugin, and connect.

        Args:
            mesos_address: address passed to ``get_mesos_leader`` when a
                runner is created; also used as the key for
                ``MesosClusterRepository`` calls.
            mesos_master_port: passed to ``get_mesos_leader`` alongside the
                address.
            secret: forwarded as ``secret`` in the executor config.
            principal: forwarded as ``principal`` in the executor config.
            mesos_role: forwarded as ``role`` in the executor config.
            framework_id: forwarded as ``framework_id`` in the executor
                config; reset to ``None`` by ``stop``.
            enabled: when false, no runner is created and submitted tasks
                are failed immediately.
            default_volumes: volumes combined with each task's extra
                volumes in ``create_task``; a falsy value becomes ``[]``.
            dockercfg_location: if truthy, used as the single entry of each
                task's ``uris``.
            offer_timeout: forwarded as ``offer_timeout`` in each task
                config.
        """
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.secret = secret
        self.principal = principal
        self.mesos_role = mesos_role
        self.enabled = enabled
        self.default_volumes = default_volumes or []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout
        self.framework_id = framework_id

        self.processor = TaskProcessor()
        self.queue = PyDeferredQueue()
        self.deferred = None
        self.runner = None
        self.tasks = {}

        self.processor.load_plugin(
            provider_module='task_processing.plugins.mesos',
        )
        self.connect()

    def set_enabled(self, is_enabled):
        """Turn the cluster on or off.

        Enabling calls ``connect``; disabling calls ``stop`` with
        ``fail_tasks=True`` so every tracked task is marked as exited.
        """
        self.enabled = is_enabled
        if is_enabled:
            self.connect()
        else:
            self.stop(fail_tasks=True)

    def configure_tasks(
        self,
        default_volumes,
        dockercfg_location,
        offer_timeout,
    ):
        """Replace the settings applied to tasks created from now on.

        A falsy ``default_volumes`` is stored as ``[]``, matching
        ``__init__``, so ``create_task`` never hands ``None`` to
        ``combine_volumes``.
        """
        self.default_volumes = default_volumes or []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

    def connect(self):
        """Obtain a runner (if enabled) and start waiting for events."""
        self.runner = self.get_runner(self.mesos_address, self.queue)
        self.handle_next_event()

    def handle_next_event(self, deferred_result=None):
        """Wait for the next queue item and arrange for it to be processed.

        Does nothing except log a warning if the current deferred has not
        fired yet. Otherwise a new deferred is requested from the queue and
        wired to ``_process_event``; this method is then attached as both
        callback and errback so the wait is renewed after every event.

        Args:
            deferred_result: unused; accepted so the method can be
                registered as a callback/errback, which are called with the
                previous result.
        """
        if self.deferred and not self.deferred.called:
            log.warning(
                'Already have handlers waiting for next event in queue, '
                'not adding more'
            )
            return
        self.deferred = self.queue.get()
        self.deferred.addCallback(self._process_event)
        self.deferred.addCallback(self.handle_next_event)
        self.deferred.addErrback(logError)
        self.deferred.addErrback(self.handle_next_event)

    def _check_connection(self):
        """Reconnect or re-arm the event handler if either has gone away.

        A missing runner is treated like a stopping one, and a missing
        deferred like one that has already fired, so neither state raises
        ``AttributeError`` here.
        """
        if self.runner is None or self.runner.stopping:
            # Last framework was terminated for some reason, re-connect.
            log.info('Last framework stopped, re-connecting')
            self.connect()
        elif self.deferred is None or self.deferred.called:
            # Just in case callbacks are missing, re-add.
            self.handle_next_event()

    def submit(self, task):
        """Start tracking ``task`` and hand its config to the runner.

        A falsy ``task`` is ignored. When the cluster is disabled the task
        is marked as exited with status 1 instead of being run.

        If the task environment contains ``CLUSTERMAN_RESOURCES`` and
        ``get_clusterman_metrics()`` returns something truthy, each entry of
        that JSON object is written as an app metric stamped with the
        current time before the task is run.
        """
        if not task:
            return

        if not self.enabled:
            task.log.info('Task failed to start, Mesos is disabled.')
            task.exited(1)
            return
        self._check_connection()

        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        env = task.get_config()['environment']
        clusterman_resource_str = env.get('CLUSTERMAN_RESOURCES')
        clusterman_metrics = get_clusterman_metrics()
        if clusterman_resource_str and clusterman_metrics:
            clusterman_resources = json.loads(clusterman_resource_str)
            # EXECUTOR_* variables take precedence over the PAASTA_* ones.
            cluster = env.get('EXECUTOR_CLUSTER', env.get('PAASTA_CLUSTER'))
            pool = env.get('EXECUTOR_POOL', env.get('PAASTA_POOL'))
            aws_region = staticconf.read(
                f'clusters.{cluster}.aws_region',
                namespace='clusterman',
            )
            metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(
                region_name=aws_region,
                app_identifier=pool,
            )
            with metrics_client.get_writer(
                clusterman_metrics.APP_METRICS,
                aggregate_meteorite_dims=True,
            ) as writer:
                for metric_key, metric_value in clusterman_resources.items():
                    writer.send((metric_key, int(time.time()), metric_value))
        self.runner.run(task.get_config())
        log.info(
            'Submitting task {} to {}'.format(
                mesos_task_id,
                self.mesos_address,
            ),
        )
        task.report_resources()

    def recover(self, task):
        """Resume tracking ``task`` and ask the runner to reconcile it.

        A falsy ``task`` is ignored. When the cluster is disabled the task
        is marked as exited with ``None`` instead.
        """
        if not task:
            return

        if not self.enabled:
            task.log.info('Could not recover task, Mesos is disabled.')
            task.exited(None)
            return
        self._check_connection()

        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        task.log.info(
            'TRON RESTARTED! Starting recovery procedure by reconciling state for this task from Mesos'
        )
        task.started()
        self.runner.reconcile(task.get_config())
        task.report_resources()

    def create_task(
        self,
        action_run_id,
        command,
        cpus,
        mem,
        disk,
        constraints,
        docker_image,
        docker_parameters,
        env,
        extra_volumes,
        serializer,
        task_id=None,
    ):
        """Build a ``MesosTask`` for an action run.

        The task config is created through the runner's
        ``TASK_CONFIG_INTERFACE``; the cluster's default volumes,
        dockercfg URI and offer timeout are merged in.

        Args:
            task_id: optional id to apply to the config with
                ``set_task_id``.

        Returns:
            The new ``MesosTask``, or ``None`` when there is no runner or
            ``set_task_id`` rejects ``task_id`` with ``ValueError``.
        """
        if not self.runner:
            return None

        uris = [self.dockercfg_location] if self.dockercfg_location else []
        volumes = combine_volumes(self.default_volumes, extra_volumes)
        task_kwargs = {
            'name': action_run_id,
            'cmd': command,
            'cpus': cpus,
            'mem': mem,
            'disk': disk,
            'constraints': constraints,
            'image': docker_image,
            'docker_parameters': docker_parameters,
            'environment': env,
            'volumes': volumes,
            'uris': uris,
            'offer_timeout': self.offer_timeout,
        }
        task_config = self.runner.TASK_CONFIG_INTERFACE(**task_kwargs)

        if task_id is not None:
            try:
                # set_task_id returns the config to use from here on.
                task_config = task_config.set_task_id(task_id)
            except ValueError:
                log.error(f'Invalid {task_id} for {action_run_id}')
                return None

        return MesosTask(action_run_id, task_config, serializer)

    def get_runner(self, mesos_address, queue):
        """Return a runner for this cluster, creating one if necessary.

        Returns ``None`` when the cluster is disabled, and the existing
        runner when there is one that is not stopping. Otherwise a
        ``mesos_task`` executor is built, wrapped in a ``logging`` executor
        that forwards task output to per-task loggers, and returned inside
        a ``Subscription`` bound to ``queue``.
        """
        if not self.enabled:
            log.info('Mesos is disabled, not creating a framework.')
            return None

        if self.runner and not self.runner.stopping:
            log.info('Already have a running framework, not creating one.')
            return self.runner

        framework_name = 'tron-{}'.format(socket.gethostname())
        executor = self.processor.executor_from_config(
            provider='mesos_task',
            provider_config={
                'secret': self.secret,
                'principal': self.principal,
                'mesos_address': get_mesos_leader(
                    mesos_address,
                    self.mesos_master_port,
                ),
                'role': self.mesos_role,
                'framework_name': framework_name,
                'framework_id': self.framework_id,
                'failover': True,
            },
        )

        def log_output(task_id, message, stream):
            # One logger per task and stream, named under TASK_OUTPUT_LOGGER.
            logger = logging.getLogger(
                '{}.{}.{}'.format(
                    TASK_OUTPUT_LOGGER,
                    task_id,
                    stream,
                ),
            )
            logger.info(message)

        logging_executor = self.processor.executor_from_config(
            provider='logging',
            provider_config={
                'downstream_executor': executor,
                'handler': log_output,
                'format_string': '{line}',
            },
        )
        return Subscription(logging_executor, queue)

    def _process_event(self, event):
        """Dispatch one queue event by its ``kind``.

        ``control`` events: ``stop`` stops this cluster and removes it from
        ``MesosClusterRepository``; ``registered`` saves the framework id
        there; ``unknown`` and anything else are only logged.

        ``task`` events are forwarded to the tracked task with the matching
        ``task_id``; the task is dropped from ``self.tasks`` once it
        reports ``is_done``. Events without a ``task_id`` or for an
        untracked task are logged and ignored.

        Any other kind is logged as unknown.
        """
        if event.kind == 'control':
            message = getattr(event, 'message', None)
            if message == 'stop':
                # Framework has been removed, stop it.
                log.warning('Framework has been stopped: {}'.format(event.raw))
                self.stop()
                MesosClusterRepository.remove(self.mesos_address)
            elif message == 'unknown':
                log.warning(
                    'Unknown error from Mesos master: {}'.format(event.raw)
                )
            elif message == 'registered':
                framework_id = event.raw['framework_id']['value']
                MesosClusterRepository.save(self.mesos_address, framework_id)
            else:
                log.warning('Unknown type of control event: {}'.format(event))

        elif event.kind == 'task':
            if not hasattr(event, 'task_id'):
                log.warning('Task event missing task_id: {}'.format(event))
                return
            if event.task_id not in self.tasks:
                log.warning(
                    'Received event for unknown task {}: {}'.format(
                        event.task_id,
                        event,
                    ),
                )
                return
            task = self.tasks[event.task_id]
            task.handle_event(event)
            if task.is_done:
                del self.tasks[event.task_id]
        else:
            log.warning('Unknown type of event: {}'.format(event))

    def stop(self, fail_tasks=False):
        """Stop the framework and reset the event queue.

        Clears the framework id, stops the runner if there is one, cancels
        and forgets the pending deferred, and replaces the queue.

        Args:
            fail_tasks: when true, every tracked task is marked as exited
                with ``None`` and removed from ``self.tasks``.
        """
        self.framework_id = None
        if self.runner:
            self.runner.stop()

        # Clear message queue
        if self.deferred:
            self.deferred.cancel()
            self.deferred = None
        self.queue = PyDeferredQueue()

        if fail_tasks:
            # Iterate over a copy because entries are deleted as we go.
            for key, task in list(self.tasks.items()):
                task.exited(None)
                del self.tasks[key]

    def kill(self, task_id):
        """Ask the runner to kill ``task_id``.

        Returns:
            Whatever the runner's ``kill`` returns, or ``False`` when there
            is no runner (for example while the cluster is disabled), so a
            kill request on a disconnected cluster fails instead of raising
            ``AttributeError``.
        """
        if not self.runner:
            return False
        return self.runner.kill(task_id)
Exemplo n.º 6
0
class MesosCluster:
    """A connection to one Mesos master and the tasks running through it.

    The instance keeps a task_processing runner in ``self.runner``, the
    queue given to that runner in ``self.queue``, the deferred currently
    waiting on the queue in ``self.deferred``, and every task it submitted
    or is recovering in ``self.tasks`` (keyed by Mesos task id).
    """

    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        """Keep the settings, load the Mesos plugin, and connect.

        Args:
            mesos_address: address given to ``get_mesos_leader`` when a
                runner is built; also the key used with
                ``MesosClusterRepository``.
            mesos_master_port: given to ``get_mesos_leader`` together with
                the address.
            secret: sent as ``secret`` in the executor config.
            principal: sent as ``principal`` in the executor config.
            mesos_role: sent as ``role`` in the executor config.
            framework_id: sent as ``framework_id`` in the executor config;
                ``stop`` resets it to ``None``.
            enabled: when false, no runner is created and submitted tasks
                fail immediately.
            default_volumes: merged with each task's extra volumes in
                ``create_task``; a falsy value is stored as ``[]``.
            dockercfg_location: when truthy, becomes the only entry of a
                task's ``uris``.
            offer_timeout: sent as ``offer_timeout`` in each task config.
        """
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.secret = secret
        self.principal = principal
        self.mesos_role = mesos_role
        self.enabled = enabled
        self.default_volumes = default_volumes or []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout
        self.framework_id = framework_id

        self.processor = TaskProcessor()
        self.queue = PyDeferredQueue()
        self.deferred = None
        self.runner = None
        self.tasks = {}

        self.processor.load_plugin(
            provider_module='task_processing.plugins.mesos'
        )
        self.connect()

    def set_enabled(self, is_enabled):
        """Enable or disable the cluster.

        Enabling reconnects; disabling stops the framework and fails every
        tracked task (``stop(fail_tasks=True)``).
        """
        self.enabled = is_enabled
        if is_enabled:
            self.connect()
        else:
            self.stop(fail_tasks=True)

    def configure_tasks(
        self,
        default_volumes,
        dockercfg_location,
        offer_timeout,
    ):
        """Update the settings used for tasks created afterwards.

        As in ``__init__``, a falsy ``default_volumes`` is stored as ``[]``
        so that ``create_task`` always passes a list to
        ``combine_volumes``.
        """
        self.default_volumes = default_volumes or []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

    def connect(self):
        """Get a runner (when enabled) and begin waiting for events."""
        self.runner = self.get_runner(self.mesos_address, self.queue)
        self.handle_next_event()

    def handle_next_event(self, deferred_result=None):
        """Arm a handler for the next item on the event queue.

        If the current deferred has not fired yet, only a warning is
        logged. Otherwise a new deferred is taken from the queue and chained
        to ``_process_event``, with this method attached as both callback
        and errback so that the next wait is set up after each event.

        Args:
            deferred_result: ignored; it exists because callbacks and
                errbacks are invoked with the previous result.
        """
        if self.deferred and not self.deferred.called:
            log.warning(
                'Already have handlers waiting for next event in queue, '
                'not adding more'
            )
            return
        self.deferred = self.queue.get()
        self.deferred.addCallback(self._process_event)
        self.deferred.addCallback(self.handle_next_event)
        self.deferred.addErrback(logError)
        self.deferred.addErrback(self.handle_next_event)

    def _check_connection(self):
        """Make sure there is a live runner and an armed event handler.

        A runner that is ``None`` is handled like one that is stopping, and
        a deferred that is ``None`` like one that already fired, instead of
        raising ``AttributeError``.
        """
        if self.runner is None or self.runner.stopping:
            # Last framework was terminated for some reason, re-connect.
            log.info('Last framework stopped, re-connecting')
            self.connect()
        elif self.deferred is None or self.deferred.called:
            # Just in case callbacks are missing, re-add.
            self.handle_next_event()

    def submit(self, task):
        """Track ``task`` and pass its config to the runner.

        A falsy ``task`` is ignored. If the cluster is disabled the task is
        marked as exited with status 1 and never reaches the runner.
        """
        if not task:
            return

        if not self.enabled:
            task.log.info('Task failed to start, Mesos is disabled.')
            task.exited(1)
            return
        self._check_connection()

        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        self.runner.run(task.get_config())
        log.info(
            'Submitting task {} to {}'.format(
                mesos_task_id,
                self.mesos_address,
            ),
        )
        task.report_resources()

    def recover(self, task):
        """Track ``task`` again and have the runner reconcile its state.

        A falsy ``task`` is ignored. If the cluster is disabled the task is
        marked as exited with ``None`` instead.
        """
        if not task:
            return

        if not self.enabled:
            task.log.info('Could not recover task, Mesos is disabled.')
            task.exited(None)
            return
        self._check_connection()

        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        task.log.info('Reconciling state for this task from Mesos')
        task.started()
        self.runner.reconcile(task.get_config())
        task.report_resources()

    def create_task(
        self,
        action_run_id,
        command,
        cpus,
        mem,
        disk,
        constraints,
        docker_image,
        docker_parameters,
        env,
        extra_volumes,
        serializer,
        task_id=None,
    ):
        """Create a ``MesosTask`` describing one action run.

        The config is built with the runner's ``TASK_CONFIG_INTERFACE`` and
        includes the cluster-wide default volumes, dockercfg URI and offer
        timeout.

        Args:
            task_id: optional id applied to the config through
                ``set_task_id``.

        Returns:
            The ``MesosTask``, or ``None`` if no runner exists or
            ``set_task_id`` raises ``ValueError`` for ``task_id``.
        """
        if not self.runner:
            return None

        uris = [self.dockercfg_location] if self.dockercfg_location else []
        volumes = combine_volumes(self.default_volumes, extra_volumes)
        task_kwargs = {
            'name': action_run_id,
            'cmd': command,
            'cpus': cpus,
            'mem': mem,
            'disk': disk,
            'constraints': constraints,
            'image': docker_image,
            'docker_parameters': docker_parameters,
            'environment': env,
            'volumes': volumes,
            'uris': uris,
            'offer_timeout': self.offer_timeout,
        }
        task_config = self.runner.TASK_CONFIG_INTERFACE(**task_kwargs)

        if task_id is not None:
            try:
                # set_task_id hands back the config to keep using.
                task_config = task_config.set_task_id(task_id)
            except ValueError:
                log.error(f'Invalid {task_id} for {action_run_id}')
                return None

        return MesosTask(action_run_id, task_config, serializer)

    def get_runner(self, mesos_address, queue):
        """Return the runner to use, building a new one when needed.

        Yields ``None`` if the cluster is disabled and the current runner
        if it exists and is not stopping. Otherwise builds a ``mesos_task``
        executor, wraps it in a ``logging`` executor whose handler writes
        task output to per-task loggers, and returns a ``Subscription``
        tying that executor to ``queue``.
        """
        if not self.enabled:
            log.info('Mesos is disabled, not creating a framework.')
            return None

        if self.runner and not self.runner.stopping:
            log.info('Already have a running framework, not creating one.')
            return self.runner

        framework_name = 'tron-{}'.format(socket.gethostname())
        executor = self.processor.executor_from_config(
            provider='mesos_task',
            provider_config={
                'secret': self.secret,
                'principal': self.principal,
                'mesos_address': get_mesos_leader(
                    mesos_address,
                    self.mesos_master_port,
                ),
                'role': self.mesos_role,
                'framework_name': framework_name,
                'framework_id': self.framework_id,
                'failover': True,
            }
        )

        def log_output(task_id, message, stream):
            # Logger name is TASK_OUTPUT_LOGGER, task id and stream, dotted.
            logger = logging.getLogger(
                '{}.{}.{}'.format(
                    TASK_OUTPUT_LOGGER,
                    task_id,
                    stream,
                )
            )
            logger.info(message)

        logging_executor = self.processor.executor_from_config(
            provider='logging',
            provider_config={
                'downstream_executor': executor,
                'handler': log_output,
                'format_string': '{line}',
            },
        )
        return Subscription(logging_executor, queue)

    def _process_event(self, event):
        """Handle a single event taken from the queue.

        For ``control`` events, ``stop`` stops the cluster and removes it
        from ``MesosClusterRepository``, ``registered`` stores the
        framework id there, and ``unknown`` or any other message is logged.

        For ``task`` events, the event goes to the tracked task with the
        same ``task_id``, which is forgotten once ``is_done`` is true.
        Events lacking ``task_id`` or naming an untracked task are logged
        and dropped.

        Events of any other kind are logged as unknown.
        """
        if event.kind == 'control':
            message = getattr(event, 'message', None)
            if message == 'stop':
                # Framework has been removed, stop it.
                log.warning('Framework has been stopped: {}'.format(event.raw))
                self.stop()
                MesosClusterRepository.remove(self.mesos_address)
            elif message == 'unknown':
                log.warning(
                    'Unknown error from Mesos master: {}'.format(event.raw)
                )
            elif message == 'registered':
                framework_id = event.raw['framework_id']['value']
                MesosClusterRepository.save(self.mesos_address, framework_id)
            else:
                log.warning('Unknown type of control event: {}'.format(event))

        elif event.kind == 'task':
            if not hasattr(event, 'task_id'):
                log.warning('Task event missing task_id: {}'.format(event))
                return
            if event.task_id not in self.tasks:
                log.warning(
                    'Received event for unknown task {}: {}'.format(
                        event.task_id,
                        event,
                    ),
                )
                return
            task = self.tasks[event.task_id]
            task.handle_event(event)
            if task.is_done:
                del self.tasks[event.task_id]
        else:
            log.warning('Unknown type of event: {}'.format(event))

    def stop(self, fail_tasks=False):
        """Stop the framework and start over with an empty event queue.

        Resets the framework id, stops the runner when present, cancels and
        drops the pending deferred, and installs a new queue.

        Args:
            fail_tasks: if true, each tracked task is marked as exited with
                ``None`` and removed from ``self.tasks``.
        """
        self.framework_id = None
        if self.runner:
            self.runner.stop()

        # Clear message queue
        if self.deferred:
            self.deferred.cancel()
            self.deferred = None
        self.queue = PyDeferredQueue()

        if fail_tasks:
            # Walk a copy: entries are removed inside the loop.
            for key, task in list(self.tasks.items()):
                task.exited(None)
                del self.tasks[key]

    def kill(self, task_id):
        """Forward a kill request for ``task_id`` to the runner.

        Returns:
            The runner's ``kill`` result, or ``False`` if there is no
            runner (e.g. the cluster is disabled), so the request fails
            cleanly instead of raising ``AttributeError``.
        """
        if not self.runner:
            return False
        return self.runner.kill(task_id)