def main():
    args = parse_args()
    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': args.secret,
            'mesos_address': args.master,
            'pool': args.pool,
            'role': args.role,
        },
    )
    executor = processor.executor_from_config(
        provider='logging',
        provider_config={
            'downstream_executor': mesos_executor,
        },
    )
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    runner = Sync(executor=executor)
    task_config = TaskConfig(
        image="ubuntu:14.04",
        cmd="bash -c 'for i in $(seq 1 5); do echo $i && sleep 10; done'",
    )
    result = runner.run(task_config)
    print(result)
    runner.stop()
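# Several of the examples here call parse_args(). This is a minimal,
# illustrative sketch of what that helper must provide, not the real one
# shipped with the examples: the only grounded requirement is that the
# returned args expose .secret, .master, .pool, and .role; everything else
# (flag names, defaults) is an assumption.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(
        description='Runs a task on a Mesos cluster via taskproc',
    )
    parser.add_argument('-m', '--master', required=True,
                        help='Mesos master address, e.g. mesosmaster:5050')
    parser.add_argument('-s', '--secret', required=True,
                        help='secret used to authenticate the framework')
    parser.add_argument('-p', '--pool', default=None,
                        help='resource pool to run in (optional)')
    parser.add_argument('-r', '--role', default='*',
                        help='Mesos role to register with')
    return parser.parse_args()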
def main():
    mesos_address = os.getenv('MESOS', 'mesosmaster:5050')
    with open('./examples/cluster/secret') as f:
        secret = f.read().strip()
    processor = TaskProcessor()
    for p in ['mesos', 'stateful']:
        processor.load_plugin(provider_module='task_processing.plugins.' + p)
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': secret,
            'mesos_address': mesos_address,
            'role': 'taskproc',
        },
    )
    executor = processor.executor_from_config(
        provider='stateful',
        provider_config={
            'downstream_executor': mesos_executor,
            'persister': FilePersistence(output_file='/tmp/foo'),
        },
    )
    runner = Sync(executor=executor)
    tasks = set()
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    for _ in range(1, 2):
        task_config = TaskConfig(image='busybox', cmd='/bin/true')
        tasks.add(task_config.task_id)
        runner.run(task_config)
        print(executor.status(task_config.task_id))
def main():
    args = parse_args()
    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': args.secret,
            'mesos_address': args.master,
            'role': args.role,
        },
    )
    counter = Counter()
    runner = Async(
        executor,
        [EventHandler(predicate=lambda x: x.terminal, cb=counter.process_event)],
    )
    TaskConfig = executor.TASK_CONFIG_INTERFACE
    tasks_to_launch = 2
    for _ in range(tasks_to_launch):
        task_config = TaskConfig(image='busybox', cmd='/bin/true')
        runner.run(task_config)
    for _ in range(5):
        print('terminated {} tasks'.format(counter.terminated))
        if counter.terminated >= tasks_to_launch:
            break
        time.sleep(2)
    runner.stop()
    return 0 if counter.terminated >= tasks_to_launch else 1
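# A minimal sketch of the Counter helper the Async examples assume; the
# real class ships alongside these examples and may differ. What is
# grounded: Async delivers events matching the handler predicate (terminal
# events here) to process_event, and callers read the .terminated count.
# The lock is an assumption, since the callback may fire from the runner's
# own thread while main() polls the counter.
import threading


class Counter(object):
    def __init__(self):
        self._lock = threading.Lock()
        self.terminated = 0

    def process_event(self, event):
        # every event delivered through the predicate is terminal, so each
        # callback corresponds to one finished task
        with self._lock:
            self.terminated += 1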
def main():
    args = parse_args()
    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': args.secret,
            'mesos_address': args.master,
            'pool': args.pool,
            'role': args.role,
        },
    )
    TaskConfig = executor.TASK_CONFIG_INTERFACE
    task_config = TaskConfig(image="busybox", cmd='/bin/true')
    # This only works on agents that have added mesos as a containerizer
    # task_config = TaskConfig(containerizer='MESOS', cmd='/bin/true')
    with ThreadPoolExecutor(max_workers=2) as futures_executor:
        runner = Promise(executor, futures_executor)
        future = runner.run(task_config)
        wait([future])
        result = future.result()
        print(result)
        print(result.raw)
        runner.stop()
    return 0 if result.success else 1
def main():
    args = parse_args()
    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': args.secret,
            'mesos_address': args.master,
            'pool': args.pool,
            'role': args.role,
        },
    )
    executor = processor.executor_from_config(
        provider='timeout',
        provider_config={
            'downstream_executor': mesos_executor,
        },
    )
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    runner = Sync(executor=executor)
    task_config = TaskConfig(
        image='docker-dev.yelpcorp.com/dumb-busybox',
        cmd='exec dumb-init /bin/sleep 30',
        timeout=10,
    )
    result = runner.run(task_config)
    print(result)
    runner.stop()
def test_load_plugin():
    tp = TaskProcessor()
    tp.load_plugin('tests.mock_plugin')
    assert 'mock_plugin' in tp.registry.plugin_modules
    assert 'dummy' in tp.registry.task_executors
    assert 'dummy2' in tp.registry.task_executors
    # loading the same plugin a second time is rejected
    with pytest.raises(ValueError):
        tp.load_plugin('tests.mock_plugin')
def test_executor_from_config():
    tp = TaskProcessor()
    tp.load_plugin('tests.mock_plugin')
    executor = tp.executor_from_config(
        provider='dummy',
        provider_config={'arg': 'foobar'},
    )
    assert executor.arg == 'foobar'
    executor.run(None)
    executor.kill(None)
    # asking for an unregistered provider raises
    with pytest.raises(ValueError):
        tp.executor_from_config('lol')
def main():
    mesos_address = os.getenv('MESOS', 'mesosmaster:5050')
    with open('./examples/cluster/secret') as f:
        secret = f.read().strip()
    processor = TaskProcessor()
    for p in ['mesos', 'stateful']:
        processor.load_plugin(provider_module='task_processing.plugins.' + p)
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': secret,
            'mesos_address': mesos_address,
            'role': 'taskproc',
        },
    )
    s = session.Session(
        region_name='foo',
        aws_access_key_id='foo',
        aws_secret_access_key='bar',
    )
    dynamo_address = os.getenv('DYNAMO', 'http://dynamodb:5050')
    client = s.client(
        service_name='dynamodb',
        endpoint_url=dynamo_address,
    )
    try:
        create_table(client)
    except ClientError:
        # most likely the table already exists
        pass
    executor = processor.executor_from_config(
        provider='stateful',
        provider_config={
            'downstream_executor': mesos_executor,
            'persister': DynamoDBPersister(
                table_name='events',
                endpoint_url=dynamo_address,
                session=s,
            ),
        },
    )
    runner = Sync(executor=executor)
    tasks = set()
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    for _ in range(1, 2):
        task_config = TaskConfig(image='ubuntu:14.04', cmd='/bin/sleep 2')
        tasks.add(task_config.task_id)
        runner.run(task_config)
        print(executor.status(task_config.task_id))
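# Hedged sketch of the create_table() helper called above. The 'events'
# table name matches the DynamoDBPersister configuration; the key schema
# and throughput values are assumptions about how taskproc keys its
# persisted events, not taken from the source.
def create_table(client):
    client.create_table(
        TableName='events',
        KeySchema=[
            {'AttributeName': 'task_id', 'KeyType': 'HASH'},
        ],
        AttributeDefinitions=[
            {'AttributeName': 'task_id', 'AttributeType': 'S'},
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 5,
            'WriteCapacityUnits': 5,
        },
    )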
import os
import sys

from task_processing.runners.sync import Sync
from task_processing.task_processor import TaskProcessor


def main():
    # get the address of the Mesos cluster
    mesos_address = os.getenv('MESOS', 'mesosmaster:5050')

    # read in the secret; this is used to authenticate the taskproc
    # scheduler with Mesos
    with open('./examples/cluster/secret') as f:
        secret = f.read().strip()

    # create a processor instance
    processor = TaskProcessor()

    # configure plugins
    processor.load_plugin(provider_module='task_processing.plugins.mesos')

    # create an executor (a taskproc executor, NOT to be confused with a
    # Mesos executor) using this defined configuration. this config can
    # also be used to specify other Mesos properties, such as which role
    # to use
    executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': secret,
            'mesos_address': mesos_address,
            'role': 'taskproc',
        },
    )

    # create a new Sync runner that will execute tasks synchronously
    # (i.e. block until completion)
    runner = Sync(executor)

    # next, create a TaskConfig to run. this is where properties of the
    # Mesos task can be specified; in this example, we use the busybox
    # Docker image and just echo "hello world"
    TaskConfig = executor.TASK_CONFIG_INTERFACE
    task_config = TaskConfig(image="busybox", cmd='echo "hello world"')

    # run our task and print the result
    result = runner.run(task_config)
    print(result)

    # this stops the taskproc framework and unregisters it from Mesos
    runner.stop()
    return 0 if result.success else 1
def main():
    mesos_address = os.environ['MESOS']
    with open('./examples/cluster/secret') as f:
        secret = f.read().strip()
    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': secret,
            'mesos_address': mesos_address,
            'role': 'taskproc',
        },
    )
    queue = Queue(100)
    runner = Subscription(executor, queue)
    tasks = set()
    TaskConfig = executor.TASK_CONFIG_INTERFACE
    for _ in range(2):
        task_config = TaskConfig(image='busybox', cmd='/bin/true')
        tasks.add(task_config.task_id)
        runner.run(task_config)
    print('Running {} tasks: {}'.format(len(tasks), tasks))
    while len(tasks) > 0:
        try:
            event = queue.get(block=True, timeout=10)
        except Empty:
            print('Timeout while waiting for {}'.format(tasks))
            break
        if event.terminal:
            tasks.discard(event.task_id)
    runner.stop()
    return 0 if len(tasks) == 0 else 1
def main():
    c = Counter()
    args = parse_args()
    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': args.secret,
            'mesos_address': args.master,
            'pool': args.pool,
            'role': args.role,
        },
    )
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    runner = Async(
        mesos_executor,
        [EventHandler(
            predicate=lambda x: x.terminal,
            cb=c.process_event,
        )],
    )
    # request more resources than an agent is likely to offer, so that the
    # task presumably hits its offer_timeout rather than running
    timeout_task_config = TaskConfig(
        image='busybox',
        cmd='exec /bin/sleep 100',
        offer_timeout=5.0,
        cpus=20,
        mem=2048,
        disk=2000,
    )
    runner.run(timeout_task_config)
    for _ in range(50):
        if c.terminated >= 1:
            break
        print("waiting for task %s to finish" % (timeout_task_config.task_id))
        time.sleep(2)
    runner.stop()
    return 0
def remote_run_start(args):
    system_paasta_config, service, cluster, \
        soa_dir, instance, instance_type = extract_args(args)
    overrides_dict = {}

    constraints_json = args.constraints_json
    if constraints_json:
        try:
            constraints = json.loads(constraints_json)
        except Exception as e:
            paasta_print("Error while parsing constraints: %s" % e)
            # fall through with no constraints rather than raising a
            # NameError on the check below
            constraints = None
        if constraints:
            overrides_dict['constraints'] = constraints

    if args.cmd:
        overrides_dict['cmd'] = args.cmd

    if args.instances:
        overrides_dict['instances'] = args.instances

    run_id = args.run_id
    if run_id is None:
        run_id = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(8)
        )
        paasta_print("Assigned random run-id: %s" % run_id)

    if args.detach:
        paasta_print("Running in background")
        # double-fork to daemonize the process
        if os.fork() > 0:
            return
        os.setsid()
        if os.fork() > 0:
            return
        sys.stdout = open('/dev/null', 'w')
        sys.stderr = open('/dev/null', 'w')

    paasta_print('Scheduling a task on Mesos')

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    processor.load_plugin(provider_module='task_processing.plugins.stateful')
    MesosExecutor = processor.executor_cls(provider='mesos')

    native_job_config = load_paasta_native_job_config(
        service,
        instance,
        cluster,
        soa_dir=soa_dir,
        instance_type=instance_type,
        config_overrides=overrides_dict,
        load_deployments=not args.docker_image,
    )
    try:
        task_config = MesosExecutor.TASK_CONFIG_INTERFACE(
            **paasta_to_task_config_kwargs(
                service=service,
                instance=instance,
                system_paasta_config=system_paasta_config,
                native_job_config=native_job_config,
                config_overrides=overrides_dict,
                docker_image=args.docker_image,
                offer_timeout=args.staging_timeout,
            ),
        )
    except InvariantException as e:
        if len(e.missing_fields) > 0:
            paasta_print(
                PaastaColors.red(
                    "Mesos task config is missing the following fields: "
                    "{}".format(', '.join(e.missing_fields)),
                ),
            )
        elif len(e.invariant_errors) > 0:
            paasta_print(
                PaastaColors.red(
                    "Mesos task config is failing the following checks: "
                    "{}".format(', '.join(str(ie) for ie in e.invariant_errors)),
                ),
            )
        else:
            paasta_print(PaastaColors.red(f"Mesos task config error: {e}"))
        traceback.print_exc()
        emit_counter_metric('paasta.remote_run.start.failed', service, instance)
        sys.exit(1)
    except PTypeError as e:
        paasta_print(
            PaastaColors.red(
                f"Mesos task config is failing a type check: {e}",
            ),
        )
        traceback.print_exc()
        emit_counter_metric('paasta.remote_run.start.failed', service, instance)
        sys.exit(1)

    # runner is only assigned inside the try block below; initialize it so
    # the signal handler can safely check it before then
    runner = None

    def handle_interrupt(_signum, _frame):
        paasta_print(
            PaastaColors.red("Signal received, shutting down scheduler."),
        )
        if runner is not None:
            runner.stop()
        if _signum == signal.SIGTERM:
            sys.exit(143)
        else:
            sys.exit(1)

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_interrupt)

    default_role = system_paasta_config.get_remote_run_config().get(
        'default_role')
    assert default_role

    try:
        executor_stack = build_executor_stack(
            processor=processor,
            service=service,
            instance=instance,
            role=native_job_config.get_role() or default_role,
            pool=native_job_config.get_pool(),
            cluster=cluster,
            run_id=run_id,
            system_paasta_config=system_paasta_config,
            framework_staging_timeout=args.staging_timeout,
        )
        runner = Sync(executor_stack)
        terminal_event = runner.run(task_config)
        runner.stop()
    except Exception as e:
        paasta_print("Exception while running executor stack: %s" % e)
        traceback.print_exc()
        emit_counter_metric('paasta.remote_run.start.failed', service, instance)
        sys.exit(1)

    if terminal_event.success:
        paasta_print("Task finished successfully")
        sys.exit(0)
    else:
        paasta_print(PaastaColors.red(f"Task failed: {terminal_event.raw}"))
        # This is not necessarily an infrastructure failure; it may just be
        # an application failure.
        emit_counter_metric('paasta.remote_run.start.failed', service, instance)
        sys.exit(1)
def test_import():
    from task_processing.task_processor import TaskProcessor
    tp = TaskProcessor()
    tp.load_plugin('task_processing.plugins.mesos')
def remote_run_start(args):
    system_paasta_config, service, cluster, \
        soa_dir, instance, instance_type = extract_args(args)
    overrides_dict = {}

    constraints_json = args.constraints_json
    if constraints_json:
        try:
            constraints = json.loads(constraints_json)
        except Exception as e:
            paasta_print("Error while parsing constraints: %s" % e)
            # fall through with no constraints rather than raising a
            # NameError on the check below
            constraints = None
        if constraints:
            overrides_dict['constraints'] = constraints

    if args.cmd:
        overrides_dict['cmd'] = args.cmd

    if args.instances:
        overrides_dict['instances'] = args.instances

    run_id = args.run_id
    if run_id is None:
        run_id = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(8)
        )
        paasta_print("Assigned random run-id: %s" % run_id)

    if args.detach:
        paasta_print("Running in background")
        # double-fork to daemonize the process
        if os.fork() > 0:
            return
        os.setsid()
        if os.fork() > 0:
            return
        sys.stdout = open('/dev/null', 'w')
        sys.stderr = open('/dev/null', 'w')

    paasta_print('Scheduling a task on Mesos')

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    processor.load_plugin(provider_module='task_processing.plugins.stateful')
    MesosExecutor = processor.executor_cls(provider='mesos')

    task_config = MesosExecutor.TASK_CONFIG_INTERFACE(
        **paasta_to_task_config_kwargs(
            service,
            instance,
            cluster,
            system_paasta_config,
            instance_type,
            soa_dir=soa_dir,
            config_overrides=overrides_dict,
        ),
    )

    executor_stack = build_executor_stack(
        processor,
        service,
        instance,
        cluster,
        run_id,
        system_paasta_config,
        args.staging_timeout,
    )
    runner = Sync(executor_stack)

    def handle_interrupt(_signum, _frame):
        paasta_print(
            PaastaColors.red("Signal received, shutting down scheduler."),
        )
        runner.stop()
        if _signum == signal.SIGTERM:
            sys.exit(143)
        else:
            sys.exit(1)

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_interrupt)

    terminal_event = runner.run(task_config)
    runner.stop()
    if terminal_event.success:
        paasta_print("Task finished successfully")
        sys.exit(0)
    else:
        paasta_print(
            PaastaColors.red("Task failed: {}".format(terminal_event.raw)),
        )
        sys.exit(1)
def remote_run_start(args): """ Start a task in Mesos Steps: 1. Accumulate overrides 2. Create task configuration 3. Build executor stack 4. Run the task on the executor stack """ # accumulate all configuration needed to build what we need to run a task system_paasta_config, service, cluster, \ soa_dir, instance, instance_type = extract_args(args) # TODO: move run_id into task identifier? run_id = args.run_id or generate_run_id(length=10) framework_name = create_framework_name(service, instance, run_id) overrides = accumulate_config_overrides(args, service, instance) # TODO: implement DryRunExecutor? taskproc_config = system_paasta_config.get_taskproc() native_job_config = load_paasta_native_job_config( service, instance, cluster, soa_dir=soa_dir, instance_type=instance_type, config_overrides=overrides, load_deployments=not args.docker_image, ) region = args.aws_region or taskproc_config.get('aws_region') default_role = system_paasta_config.get_remote_run_config().get( 'default_role') assert default_role role = native_job_config.get_role() or default_role pool = native_job_config.get_pool() processor = TaskProcessor() processor.load_plugin(provider_module='task_processing.plugins.stateful') processor.load_plugin(provider_module='task_processing.plugins.mesos') if args.detach: paasta_print("Running in background") if os.fork() > 0: return os.setsid() if os.fork() > 0: return sys.stdout = open('/dev/null', 'w') sys.stderr = open('/dev/null', 'w') # create factory functions for task_config and executors, which makes it # easier to recreate them for retry purposes def task_config_factory(): return create_mesos_task_config( processor=processor, service=service, instance=instance, system_paasta_config=system_paasta_config, native_job_config=native_job_config, offer_timeout=args.staging_timeout, docker_image=args.docker_image, ) framework_config = dict( cluster=cluster, framework_name=framework_name, framework_staging_timeout=args.staging_timeout, role=role, pool=pool, ) executor_kwargs = dict( # used to create mesos executor processor=processor, system_paasta_config=system_paasta_config, taskproc_config=taskproc_config, **framework_config, ) def executor_factory(): mesos_executor = create_mesos_executor(**executor_kwargs) return build_executor_stack( processor, mesos_executor, taskproc_config, cluster, region, ) if args.dry_run: task_config_dict = task_config_to_dict(task_config_factory()) pp = pprint.PrettyPrinter(indent=2) paasta_print( PaastaColors.green("Would have run task with:"), PaastaColors.green("Framework config:"), pp.pformat(framework_config), PaastaColors.green("Task config:"), pp.pformat(task_config_dict), sep='\n', ) return terminals = run_tasks_with_retries( executor_factory, task_config_factory, retries=args.retries, ) final_event, final_task_config = terminals[-1] exit_code = handle_terminal_event( event=final_event, service=service, instance=instance, run_id=run_id, email_address=args.notification_email, framework_config=framework_config, task_config=final_task_config, ) sys.exit(exit_code)
class MesosCluster:
    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.secret = secret
        self.principal = principal
        self.mesos_role = mesos_role
        self.enabled = enabled
        self.default_volumes = default_volumes or []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout
        self.framework_id = framework_id
        self.processor = TaskProcessor()
        self.queue = PyDeferredQueue()
        self.deferred = None
        self.runner = None
        self.tasks = {}
        self.processor.load_plugin(
            provider_module='task_processing.plugins.mesos')
        self.connect()

    def set_enabled(self, is_enabled):
        self.enabled = is_enabled
        if is_enabled:
            self.connect()
        else:
            self.stop(fail_tasks=True)

    def configure_tasks(
        self,
        default_volumes,
        dockercfg_location,
        offer_timeout,
    ):
        self.default_volumes = default_volumes
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

    def connect(self):
        self.runner = self.get_runner(self.mesos_address, self.queue)
        self.handle_next_event()

    def handle_next_event(self, deferred_result=None):
        if self.deferred and not self.deferred.called:
            log.warning(
                'Already have handlers waiting for next event in queue, '
                'not adding more')
            return
        self.deferred = self.queue.get()
        self.deferred.addCallback(self._process_event)
        self.deferred.addCallback(self.handle_next_event)
        self.deferred.addErrback(logError)
        self.deferred.addErrback(self.handle_next_event)

    def _check_connection(self):
        if self.runner.stopping:
            # Last framework was terminated for some reason, re-connect.
            log.info('Last framework stopped, re-connecting')
            self.connect()
        elif self.deferred.called:
            # Just in case callbacks are missing, re-add.
            self.handle_next_event()

    def submit(self, task):
        if not task:
            return
        if not self.enabled:
            task.log.info('Task failed to start, Mesos is disabled.')
            task.exited(1)
            return
        self._check_connection()
        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task

        env = task.get_config()['environment']
        clusterman_resource_str = env.get('CLUSTERMAN_RESOURCES')
        clusterman_metrics = get_clusterman_metrics()
        if clusterman_resource_str and clusterman_metrics:
            clusterman_resources = json.loads(clusterman_resource_str)
            cluster = env.get('EXECUTOR_CLUSTER', env.get('PAASTA_CLUSTER'))
            pool = env.get('EXECUTOR_POOL', env.get('PAASTA_POOL'))
            aws_region = staticconf.read(
                f'clusters.{cluster}.aws_region', namespace='clusterman')
            metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(
                region_name=aws_region,
                app_identifier=pool,
            )
            with metrics_client.get_writer(
                    clusterman_metrics.APP_METRICS,
                    aggregate_meteorite_dims=True) as writer:
                for metric_key, metric_value in clusterman_resources.items():
                    writer.send((metric_key, int(time.time()), metric_value))

        self.runner.run(task.get_config())
        log.info(
            'Submitting task {} to {}'.format(
                mesos_task_id,
                self.mesos_address,
            ),
        )
        task.report_resources()

    def recover(self, task):
        if not task:
            return
        if not self.enabled:
            task.log.info('Could not recover task, Mesos is disabled.')
            task.exited(None)
            return
        self._check_connection()
        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        task.log.info(
            'TRON RESTARTED! Starting recovery procedure by reconciling '
            'state for this task from Mesos')
        task.started()
        self.runner.reconcile(task.get_config())
        task.report_resources()

    def create_task(
        self,
        action_run_id,
        command,
        cpus,
        mem,
        disk,
        constraints,
        docker_image,
        docker_parameters,
        env,
        extra_volumes,
        serializer,
        task_id=None,
    ):
        if not self.runner:
            return None
        uris = [self.dockercfg_location] if self.dockercfg_location else []
        volumes = combine_volumes(self.default_volumes, extra_volumes)
        task_kwargs = {
            'name': action_run_id,
            'cmd': command,
            'cpus': cpus,
            'mem': mem,
            'disk': disk,
            'constraints': constraints,
            'image': docker_image,
            'docker_parameters': docker_parameters,
            'environment': env,
            'volumes': volumes,
            'uris': uris,
            'offer_timeout': self.offer_timeout,
        }
        task_config = self.runner.TASK_CONFIG_INTERFACE(**task_kwargs)
        if task_id is not None:
            try:
                task_config = task_config.set_task_id(task_id)
            except ValueError:
                log.error(f'Invalid {task_id} for {action_run_id}')
                return
        return MesosTask(action_run_id, task_config, serializer)

    def get_runner(self, mesos_address, queue):
        if not self.enabled:
            log.info('Mesos is disabled, not creating a framework.')
            return None
        if self.runner and not self.runner.stopping:
            log.info('Already have a running framework, not creating one.')
            return self.runner
        framework_name = 'tron-{}'.format(socket.gethostname())
        executor = self.processor.executor_from_config(
            provider='mesos_task',
            provider_config={
                'secret': self.secret,
                'principal': self.principal,
                'mesos_address': get_mesos_leader(
                    mesos_address, self.mesos_master_port),
                'role': self.mesos_role,
                'framework_name': framework_name,
                'framework_id': self.framework_id,
                'failover': True,
            })

        def log_output(task_id, message, stream):
            logger = logging.getLogger('{}.{}.{}'.format(
                TASK_OUTPUT_LOGGER,
                task_id,
                stream,
            ))
            logger.info(message)

        logging_executor = self.processor.executor_from_config(
            provider='logging',
            provider_config={
                'downstream_executor': executor,
                'handler': log_output,
                'format_string': '{line}',
            },
        )
        return Subscription(logging_executor, queue)

    def _process_event(self, event):
        if event.kind == 'control':
            message = getattr(event, 'message', None)
            if message == 'stop':
                # Framework has been removed, stop it.
                log.warning('Framework has been stopped: {}'.format(event.raw))
                self.stop()
                MesosClusterRepository.remove(self.mesos_address)
            elif message == 'unknown':
                log.warning('Unknown error from Mesos master: {}'.format(
                    event.raw))
            elif message == 'registered':
                framework_id = event.raw['framework_id']['value']
                MesosClusterRepository.save(self.mesos_address, framework_id)
            else:
                log.warning('Unknown type of control event: {}'.format(event))
        elif event.kind == 'task':
            if not hasattr(event, 'task_id'):
                log.warning('Task event missing task_id: {}'.format(event))
                return
            if event.task_id not in self.tasks:
                log.warning(
                    'Received event for unknown task {}: {}'.format(
                        event.task_id,
                        event,
                    ),
                )
                return
            task = self.tasks[event.task_id]
            task.handle_event(event)
            if task.is_done:
                del self.tasks[event.task_id]
        else:
            log.warning('Unknown type of event: {}'.format(event))

    def stop(self, fail_tasks=False):
        self.framework_id = None
        if self.runner:
            self.runner.stop()
        # Clear the message queue
        if self.deferred:
            self.deferred.cancel()
        self.deferred = None
        self.queue = PyDeferredQueue()
        if fail_tasks:
            for key, task in list(self.tasks.items()):
                task.exited(None)
                del self.tasks[key]

    def kill(self, task_id):
        return self.runner.kill(task_id)
class MesosCluster:
    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.secret = secret
        self.principal = principal
        self.mesos_role = mesos_role
        self.enabled = enabled
        self.default_volumes = default_volumes or []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout
        self.framework_id = framework_id
        self.processor = TaskProcessor()
        self.queue = PyDeferredQueue()
        self.deferred = None
        self.runner = None
        self.tasks = {}
        self.processor.load_plugin(
            provider_module='task_processing.plugins.mesos'
        )
        self.connect()

    def set_enabled(self, is_enabled):
        self.enabled = is_enabled
        if is_enabled:
            self.connect()
        else:
            self.stop(fail_tasks=True)

    def configure_tasks(
        self,
        default_volumes,
        dockercfg_location,
        offer_timeout,
    ):
        self.default_volumes = default_volumes
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

    def connect(self):
        self.runner = self.get_runner(self.mesos_address, self.queue)
        self.handle_next_event()

    def handle_next_event(self, deferred_result=None):
        if self.deferred and not self.deferred.called:
            log.warning(
                'Already have handlers waiting for next event in queue, '
                'not adding more'
            )
            return
        self.deferred = self.queue.get()
        self.deferred.addCallback(self._process_event)
        self.deferred.addCallback(self.handle_next_event)
        self.deferred.addErrback(logError)
        self.deferred.addErrback(self.handle_next_event)

    def _check_connection(self):
        if self.runner.stopping:
            # Last framework was terminated for some reason, re-connect.
            log.info('Last framework stopped, re-connecting')
            self.connect()
        elif self.deferred.called:
            # Just in case callbacks are missing, re-add.
            self.handle_next_event()

    def submit(self, task):
        if not task:
            return
        if not self.enabled:
            task.log.info('Task failed to start, Mesos is disabled.')
            task.exited(1)
            return
        self._check_connection()
        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        self.runner.run(task.get_config())
        log.info(
            'Submitting task {} to {}'.format(
                mesos_task_id,
                self.mesos_address,
            ),
        )
        task.report_resources()

    def recover(self, task):
        if not task:
            return
        if not self.enabled:
            task.log.info('Could not recover task, Mesos is disabled.')
            task.exited(None)
            return
        self._check_connection()
        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        task.log.info('Reconciling state for this task from Mesos')
        task.started()
        self.runner.reconcile(task.get_config())
        task.report_resources()

    def create_task(
        self,
        action_run_id,
        command,
        cpus,
        mem,
        disk,
        constraints,
        docker_image,
        docker_parameters,
        env,
        extra_volumes,
        serializer,
        task_id=None,
    ):
        if not self.runner:
            return None
        uris = [self.dockercfg_location] if self.dockercfg_location else []
        volumes = combine_volumes(self.default_volumes, extra_volumes)
        task_kwargs = {
            'name': action_run_id,
            'cmd': command,
            'cpus': cpus,
            'mem': mem,
            'disk': disk,
            'constraints': constraints,
            'image': docker_image,
            'docker_parameters': docker_parameters,
            'environment': env,
            'volumes': volumes,
            'uris': uris,
            'offer_timeout': self.offer_timeout,
        }
        task_config = self.runner.TASK_CONFIG_INTERFACE(**task_kwargs)
        if task_id is not None:
            try:
                task_config = task_config.set_task_id(task_id)
            except ValueError:
                log.error(f'Invalid {task_id} for {action_run_id}')
                return
        return MesosTask(action_run_id, task_config, serializer)

    def get_runner(self, mesos_address, queue):
        if not self.enabled:
            log.info('Mesos is disabled, not creating a framework.')
            return None
        if self.runner and not self.runner.stopping:
            log.info('Already have a running framework, not creating one.')
            return self.runner
        framework_name = 'tron-{}'.format(socket.gethostname())
        executor = self.processor.executor_from_config(
            provider='mesos_task',
            provider_config={
                'secret': self.secret,
                'principal': self.principal,
                'mesos_address': get_mesos_leader(
                    mesos_address, self.mesos_master_port),
                'role': self.mesos_role,
                'framework_name': framework_name,
                'framework_id': self.framework_id,
                'failover': True,
            }
        )

        def log_output(task_id, message, stream):
            logger = logging.getLogger(
                '{}.{}.{}'.format(
                    TASK_OUTPUT_LOGGER,
                    task_id,
                    stream,
                )
            )
            logger.info(message)

        logging_executor = self.processor.executor_from_config(
            provider='logging',
            provider_config={
                'downstream_executor': executor,
                'handler': log_output,
                'format_string': '{line}',
            },
        )
        return Subscription(logging_executor, queue)

    def _process_event(self, event):
        if event.kind == 'control':
            message = getattr(event, 'message', None)
            if message == 'stop':
                # Framework has been removed, stop it.
                log.warning('Framework has been stopped: {}'.format(event.raw))
                self.stop()
                MesosClusterRepository.remove(self.mesos_address)
            elif message == 'unknown':
                log.warning(
                    'Unknown error from Mesos master: {}'.format(event.raw)
                )
            elif message == 'registered':
                framework_id = event.raw['framework_id']['value']
                MesosClusterRepository.save(self.mesos_address, framework_id)
            else:
                log.warning('Unknown type of control event: {}'.format(event))
        elif event.kind == 'task':
            if not hasattr(event, 'task_id'):
                log.warning('Task event missing task_id: {}'.format(event))
                return
            if event.task_id not in self.tasks:
                log.warning(
                    'Received event for unknown task {}: {}'.format(
                        event.task_id,
                        event,
                    ),
                )
                return
            task = self.tasks[event.task_id]
            task.handle_event(event)
            if task.is_done:
                del self.tasks[event.task_id]
        else:
            log.warning('Unknown type of event: {}'.format(event))

    def stop(self, fail_tasks=False):
        self.framework_id = None
        if self.runner:
            self.runner.stop()
        # Clear the message queue
        if self.deferred:
            self.deferred.cancel()
        self.deferred = None
        self.queue = PyDeferredQueue()
        if fail_tasks:
            for key, task in list(self.tasks.items()):
                task.exited(None)
                del self.tasks[key]

    def kill(self, task_id):
        return self.runner.kill(task_id)
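# Hypothetical usage sketch for the MesosCluster class above, wiring it up
# the way Tron would. The constructor and create_task() arguments mirror
# the signatures defined above; the address, secret, role, and task
# parameter values are placeholders, and serializer=None is purely for
# illustration.
cluster = MesosCluster(
    mesos_address='mesosmaster:5050',
    secret='framework-secret',
    mesos_role='tron',
)
task = cluster.create_task(
    action_run_id='my_job.0.my_action',
    command='/bin/true',
    cpus=0.1,
    mem=64,
    disk=128,
    constraints=[],
    docker_image='busybox',
    docker_parameters=[],
    env={},
    extra_volumes=[],
    serializer=None,
)
# create_task returns None if no framework runner is available
if task is not None:
    cluster.submit(task)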