def mesos_task_instance_from_assigned_task(assigned_task):
  """Deserialize MesosTaskInstance from an AssignedTask thrift."""
  thermos_task = assigned_task.task.executorConfig.data

  if not thermos_task:
    raise TaskInfoError('Task did not have a thermos config!')

  try:
    json_blob = json.loads(thermos_task)
  except (TypeError, ValueError) as e:
    raise TaskInfoError('Could not deserialize thermos config: %s' % e)

  # TODO(wickman) Determine if there are any serialized MesosTaskInstances in the wild;
  # kill this code if not.
  if 'instance' in json_blob:
    return MesosTaskInstance.json_loads(thermos_task)

  # This is a MesosJob
  task_instance = task_instance_from_job(MesosJob.json_loads(thermos_task),
                                         assigned_task.instanceId)
  try:
    ThermosTaskValidator.assert_valid_task(task_instance.task())
    ThermosTaskValidator.assert_all_refs_bound(task_instance)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise UnexpectedUnboundRefsError('Got invalid task: %s' % e)

  task_instance, _ = task_instance.interpolate()
  return task_instance

def mesos_task_instance_from_assigned_task(assigned_task):
  """Deserialize MesosTaskInstance from an AssignedTask thrift."""
  thermos_task = assigned_task.task.executorConfig.data

  if not thermos_task:
    raise TaskInfoError('Task did not have a thermos config!')

  try:
    json_blob = json.loads(thermos_task)
  except (TypeError, ValueError) as e:
    raise TaskInfoError('Could not deserialize thermos config: %s' % e)

  # TODO(wickman) Determine if there are any serialized MesosTaskInstances in the wild;
  # kill this code if not.
  if 'instance' in json_blob:
    return MesosTaskInstance.json_loads(thermos_task)

  # This is a MesosJob
  task_instance = task_instance_from_job(MesosJob.json_loads(thermos_task),
                                         assigned_task.instanceId,
                                         assigned_task.slaveHost)
  try:
    ThermosTaskValidator.assert_valid_task(task_instance.task())
    ThermosTaskValidator.assert_all_refs_bound(task_instance)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise UnexpectedUnboundRefsError('Got invalid task: %s' % e)

  task_instance, _ = task_instance.interpolate()
  return task_instance

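# A minimal usage sketch for the deserializer above (hedged: the thrift types
# are assumed to come from gen.apache.aurora.api.ttypes, and the hostname and
# executor name are hypothetical). The executorConfig.data payload must be the
# json_dumps() of a MesosJob; it is elided here.
from gen.apache.aurora.api.ttypes import AssignedTask, ExecutorConfig, TaskConfig

job_json = ...  # MesosJob(...).json_dumps(); elided
assigned_task = AssignedTask(
    instanceId=0,
    slaveHost='agent-01.example.com',
    task=TaskConfig(executorConfig=ExecutorConfig(name='AuroraExecutor', data=job_json)))
task_instance = mesos_task_instance_from_assigned_task(assigned_task)
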
def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
             task_id=None, portmap=None, user=None, chroot=False, clock=time,
             universal_handler=None, planner_class=TaskPlanner, hostname=None,
             process_logger_destination=None, process_logger_mode=None,
             rotate_log_size_mb=None, rotate_log_backups=None,
             preserve_env=False):
  """
    required:
      task (config.Task) = the task to run
      checkpoint_root (path) = the checkpoint root
      sandbox (path) = the sandbox in which the path will be run
                       [if None, cwd will be assumed, but garbage collection will be
                        disabled for this task.]

    optional:
      log_dir (string) = directory to house stdout/stderr logs. If not specified, logs will be
                         written into the sandbox directory under .logs/
      task_id (string) = bind to this task id. if not specified, will synthesize an id based
                         upon task.name()
      portmap (dict) = a map (string => integer) from name to port, e.g. { 'http': 80 }
      user (string) = the user to run the task as. if not current user, requires setuid
                      privileges.
      chroot (boolean) = whether or not to chroot into the sandbox prior to exec.
      clock (time interface) = the clock to use throughout
      universal_handler = checkpoint record handler (only used for testing)
      planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                      planning policy.
      process_logger_destination (string) = The destination of logger to use for all processes.
      process_logger_mode (string) = The mode of logger to use for all processes.
      rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
      rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
      preserve_env (boolean) = whether or not env variables for the runner should be in the
                               env for the task being run
  """
  if not issubclass(planner_class, TaskPlanner):
    raise TypeError('planner_class must be a TaskPlanner.')
  self._clock = clock
  launch_time = self._clock.time()
  launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
  if not task_id:
    self._task_id = '%s-%s.%s' % (
        task.name(),
        time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
        launch_time_ms)
  else:
    self._task_id = task_id
  current_user = TaskRunnerHelper.get_actual_user()
  self._user = user or current_user
  # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
  if self._user != current_user:
    if os.geteuid() != 0:
      raise ValueError('task specifies user as %s, but %s does not have setuid permission!'
                       % (self._user, current_user))
  self._portmap = portmap or {}
  self._launch_time = launch_time
  self._log_dir = log_dir or os.path.join(sandbox, '.logs')
  self._process_logger_destination = process_logger_destination
  self._process_logger_mode = process_logger_mode
  self._rotate_log_size_mb = rotate_log_size_mb
  self._rotate_log_backups = rotate_log_backups
  self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
  self._hostname = hostname or socket.gethostname()
  try:
    ThermosTaskValidator.assert_valid_task(task)
    ThermosTaskValidator.assert_valid_ports(task, self._portmap)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise self.InvalidTask('Invalid task: %s' % e)
  context = ThermosContext(
      task_id=self._task_id,
      ports=self._portmap,
      user=self._user)
  self._task, uninterp = (task % Environment(thermos=context)).interpolate()
  if len(uninterp) > 0:
    raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                           ', '.join(str(ref) for ref in uninterp))
  try:
    ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise self.InvalidTask('Invalid task: %s' % e)
  self._plan = None  # plan currently being executed (updated by Handlers)
  self._regular_plan = planner_class(
      self._task, clock=clock,
      process_filter=lambda proc: proc.final().get() is False)
  self._finalizing_plan = planner_class(
      self._task, clock=clock,
      process_filter=lambda proc: proc.final().get() is True)
  self._chroot = chroot
  self._sandbox = sandbox
  self._terminal_state = None
  self._ckpt = None
  self._process_map = dict((p.name().get(), p) for p in self._task.processes())
  self._task_processes = {}
  self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
  self._finalization_start = None
  self._preemption_deadline = None
  self._watcher = ProcessMuxer(self._pathspec)
  self._state = RunnerState(processes={})
  self._preserve_env = preserve_env

  # create runner state
  universal_handler = universal_handler or TaskRunnerUniversalHandler
  self._dispatcher = CheckpointDispatcher()
  self._dispatcher.register_handler(universal_handler(self))
  self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
  self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

  # recover checkpointed runner state and update plan
  self._recovery = True
  self._replay_runner_ckpt()

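# Worked example of the synthesized task id above: task name, local wall-clock
# timestamp, and the sub-second fraction rendered as six zero-padded digits
# (note the fraction is microseconds despite the launch_time_ms name). The
# task name here is hypothetical.
import time

launch_time = time.time()
launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
task_id = '%s-%s.%s' % (
    'hello_world',  # task.name() in the runner
    time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
    launch_time_ms)
# e.g. 'hello_world-20240101-120000.347912'
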
def on_initialization(self, header):
  log.debug('_on_initialization: %s' % header)
  ThermosTaskValidator.assert_valid_task(self._runner.task)
  ThermosTaskValidator.assert_valid_ports(self._runner.task, header.ports)
  self._checkpoint(RunnerCkpt(runner_header=header))

def convert(job, metadata=frozenset(), ports=frozenset()):
  """Convert a Pystachio MesosJob to an Aurora Thrift JobConfiguration."""

  owner = Identity(user=getpass.getuser())
  key = JobKey(
      role=assert_valid_field('role', fully_interpolated(job.role())),
      environment=assert_valid_field('environment', fully_interpolated(job.environment())),
      name=assert_valid_field('name', fully_interpolated(job.name())))

  task_raw = job.task()

  MB = 1024 * 1024
  task = TaskConfig()

  def not_empty_or(item, default):
    return default if item is Empty else fully_interpolated(item)

  # job components
  task.production = fully_interpolated(job.production(), bool)
  task.isService = select_service_bit(job)
  task.maxTaskFailures = fully_interpolated(job.max_task_failures())
  task.priority = fully_interpolated(job.priority())
  task.contactEmail = not_empty_or(job.contact(), None)
  task.tier = not_empty_or(job.tier(), None)

  if job.has_partition_policy():
    task.partitionPolicy = PartitionPolicy(
        fully_interpolated(job.partition_policy().reschedule()),
        fully_interpolated(job.partition_policy().delay_secs()))

  # Add metadata to a task, to display in the scheduler UI.
  metadata_set = frozenset()
  if job.has_metadata():
    customized_metadata = job.metadata()
    metadata_set |= frozenset(
        (str(fully_interpolated(key_value_metadata.key())),
         str(fully_interpolated(key_value_metadata.value())))
        for key_value_metadata in customized_metadata)
  metadata_set |= frozenset((str(key), str(value)) for key, value in metadata)
  task.metadata = frozenset(Metadata(key=key, value=value) for key, value in metadata_set)

  # task components
  if not task_raw.has_resources():
    raise InvalidConfig('Task must specify resources!')

  if (fully_interpolated(task_raw.resources().ram()) == 0
      or fully_interpolated(task_raw.resources().disk()) == 0):
    raise InvalidConfig('Must specify ram and disk resources, got ram:%r disk:%r' % (
        fully_interpolated(task_raw.resources().ram()),
        fully_interpolated(task_raw.resources().disk())))

  numCpus = fully_interpolated(task_raw.resources().cpu())
  ramMb = fully_interpolated(task_raw.resources().ram()) / MB
  diskMb = fully_interpolated(task_raw.resources().disk()) / MB
  if numCpus <= 0 or ramMb <= 0 or diskMb <= 0:
    raise InvalidConfig('Task has invalid resources. cpu/ramMb/diskMb must all be positive: '
                        'cpu:%r ramMb:%r diskMb:%r' % (numCpus, ramMb, diskMb))
  numGpus = fully_interpolated(task_raw.resources().gpu())

  task.resources = frozenset(
      [Resource(numCpus=numCpus),
       Resource(ramMb=ramMb),
       Resource(diskMb=diskMb)] +
      [Resource(namedPort=p) for p in ports] +
      ([Resource(numGpus=numGpus)] if numGpus else []))

  task.job = key
  task.owner = owner
  task.taskLinks = {}  # See AURORA-739
  task.constraints = constraints_to_thrift(not_empty_or(job.constraints(), {}))
  task.container = create_container_config(job.container())

  underlying, refs = job.interpolate()

  # need to fake an instance id for the sake of schema checking
  underlying_checked = underlying.bind(mesos={'instance': 31337, 'hostname': ''})
  try:
    ThermosTaskValidator.assert_valid_task(underlying_checked.task())
  except ThermosTaskValidator.InvalidTaskError as e:
    raise InvalidConfig('Task is invalid: %s' % e)

  if not underlying_checked.check().ok():
    raise InvalidConfig('Job not fully specified: %s' % underlying.check().message())

  unbound = []
  for ref in refs:
    if ref in (THERMOS_TASK_ID_REF, MESOS_INSTANCE_REF, MESOS_HOSTNAME_REF) or (
        Ref.subscope(THERMOS_PORT_SCOPE_REF, ref)):
      continue
    unbound.append(ref)

  if unbound:
    raise InvalidConfig('Config contains unbound variables: %s' % ' '.join(map(str, unbound)))

  # set the executor that will be used by the Mesos task. Thermos is the default
  executor = job.executor_config()
  if fully_interpolated(executor.name()) == AURORA_EXECUTOR_NAME:
    task.executorConfig = ExecutorConfig(
        name=AURORA_EXECUTOR_NAME,
        data=filter_aliased_fields(underlying).json_dumps())
  else:
    task.executorConfig = ExecutorConfig(
        name=fully_interpolated(executor.name()),
        data=fully_interpolated(executor.data()))

  return JobConfiguration(
      key=key,
      owner=owner,
      cronSchedule=not_empty_or(job.cron_schedule(), None),
      cronCollisionPolicy=select_cron_policy(job.cron_collision_policy()),
      taskConfig=task,
      instanceCount=fully_interpolated(job.instances()))

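# Sketch of the resource-set construction above, assuming the Resource thrift
# union from gen.apache.aurora.api.ttypes; values and port names are
# hypothetical. ram/disk are specified in bytes and carried as MiB on the wire.
from gen.apache.aurora.api.ttypes import Resource

MB = 1024 * 1024
numCpus = 1.0
ramMb = (1024 * MB) / MB    # 1 GiB requested -> 1024 MiB
diskMb = (4096 * MB) / MB   # 4 GiB requested -> 4096 MiB
numGpus = 0
ports = frozenset(['http'])
resources = frozenset(
    [Resource(numCpus=numCpus), Resource(ramMb=ramMb), Resource(diskMb=diskMb)] +
    [Resource(namedPort=p) for p in ports] +
    ([Resource(numGpus=numGpus)] if numGpus else []))
# -> four Resource entries; the GPU entry is added only when numGpus is non-zero
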
def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
             task_id=None, portmap=None, user=None, chroot=False, clock=time,
             universal_handler=None, planner_class=TaskPlanner, hostname=None,
             process_logger_destination=None, process_logger_mode=None,
             rotate_log_size_mb=None, rotate_log_backups=None,
             preserve_env=False, mesos_containerizer_path=None,
             container_sandbox=None):
  """
    required:
      task (config.Task) = the task to run
      checkpoint_root (path) = the checkpoint root
      sandbox (path) = the sandbox in which the path will be run
                       [if None, cwd will be assumed, but garbage collection will be
                        disabled for this task.]

    optional:
      log_dir (string) = directory to house stdout/stderr logs. If not specified, logs will be
                         written into the sandbox directory under .logs/
      task_id (string) = bind to this task id. if not specified, will synthesize an id based
                         upon task.name()
      portmap (dict) = a map (string => integer) from name to port, e.g. { 'http': 80 }
      user (string) = the user to run the task as. if not current user, requires setuid
                      privileges.
      chroot (boolean) = whether or not to chroot into the sandbox prior to exec.
      clock (time interface) = the clock to use throughout
      universal_handler = checkpoint record handler (only used for testing)
      planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                      planning policy.
      process_logger_destination (string) = The destination of logger to use for all processes.
      process_logger_mode (string) = The mode of logger to use for all processes.
      rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
      rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
      preserve_env (boolean) = whether or not env variables for the runner should be in the
                               env for the task being run
      mesos_containerizer_path = the path to the mesos-containerizer executable that will be
                                 used to isolate the task's filesystem (if using a filesystem
                                 image).
      container_sandbox = the path within the isolated filesystem where the task's sandbox is
                          mounted.
  """
  if not issubclass(planner_class, TaskPlanner):
    raise TypeError('planner_class must be a TaskPlanner.')
  self._clock = clock
  launch_time = self._clock.time()
  launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
  if not task_id:
    self._task_id = '%s-%s.%s' % (
        task.name(),
        time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
        launch_time_ms)
  else:
    self._task_id = task_id
  current_user = TaskRunnerHelper.get_actual_user()
  self._user = user or current_user
  # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
  if self._user != current_user:
    if os.geteuid() != 0:
      raise ValueError('task specifies user as %s, but %s does not have setuid permission!'
                       % (self._user, current_user))
  self._portmap = portmap or {}
  self._launch_time = launch_time
  self._log_dir = log_dir or os.path.join(sandbox, '.logs')
  self._process_logger_destination = process_logger_destination
  self._process_logger_mode = process_logger_mode
  self._rotate_log_size_mb = rotate_log_size_mb
  self._rotate_log_backups = rotate_log_backups
  self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
  self._hostname = hostname or socket.gethostname()
  try:
    ThermosTaskValidator.assert_valid_task(task)
    ThermosTaskValidator.assert_valid_ports(task, self._portmap)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise self.InvalidTask('Invalid task: %s' % e)
  context = ThermosContext(
      task_id=self._task_id,
      ports=self._portmap,
      user=self._user)
  self._task, uninterp = (task % Environment(thermos=context)).interpolate()
  if len(uninterp) > 0:
    raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                           ', '.join(str(ref) for ref in uninterp))
  try:
    ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise self.InvalidTask('Invalid task: %s' % e)
  self._plan = None  # plan currently being executed (updated by Handlers)
  self._regular_plan = planner_class(
      self._task, clock=clock,
      process_filter=lambda proc: proc.final().get() is False)
  self._finalizing_plan = planner_class(
      self._task, clock=clock,
      process_filter=lambda proc: proc.final().get() is True)
  self._chroot = chroot
  self._sandbox = sandbox
  self._container_sandbox = container_sandbox
  self._terminal_state = None
  self._ckpt = None
  self._process_map = dict((p.name().get(), p) for p in self._task.processes())
  self._task_processes = {}
  self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
  self._finalization_start = None
  self._preemption_deadline = None
  self._watcher = ProcessMuxer(self._pathspec)
  self._state = RunnerState(processes={})
  self._preserve_env = preserve_env
  self._mesos_containerizer_path = mesos_containerizer_path

  # create runner state
  universal_handler = universal_handler or TaskRunnerUniversalHandler
  self._dispatcher = CheckpointDispatcher()
  self._dispatcher.register_handler(universal_handler(self))
  self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
  self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

  # recover checkpointed runner state and update plan
  self._recovery = True
  self._replay_runner_ckpt()

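# Sketch of the regular/finalizing plan split above: processes marked
# final=True run only after all regular processes complete (e.g. cleanup).
# Assumes the Thermos config schema from apache.thermos.config.schema;
# process names and cmdlines are hypothetical.
from apache.thermos.config.schema import Process, Task

task = Task(
    name='example',
    processes=[
        Process(name='main', cmdline='./run_server.sh'),
        Process(name='cleanup', cmdline='./flush_logs.sh', final=True)])
regular = [p.name().get() for p in task.processes() if p.final().get() is False]
finalizing = [p.name().get() for p in task.processes() if p.final().get() is True]
# regular == ['main'], finalizing == ['cleanup']
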
def on_initialization(self, header):
  log.debug('_on_initialization: %s', header)
  ThermosTaskValidator.assert_valid_task(self._runner.task)
  ThermosTaskValidator.assert_valid_ports(self._runner.task, header.ports)
  self._checkpoint(RunnerCkpt(runner_header=header))

def convert(job, metadata=frozenset(), ports=frozenset()):
  """Convert a Pystachio MesosJob to an Aurora Thrift JobConfiguration."""

  owner = Identity(user=getpass.getuser())
  key = JobKey(
      role=assert_valid_field('role', fully_interpolated(job.role())),
      environment=assert_valid_field('environment', fully_interpolated(job.environment())),
      name=assert_valid_field('name', fully_interpolated(job.name())))

  task_raw = job.task()

  MB = 1024 * 1024
  task = TaskConfig()

  def not_empty_or(item, default):
    return default if item is Empty else fully_interpolated(item)

  # job components
  task.production = fully_interpolated(job.production(), bool)
  task.isService = select_service_bit(job)
  task.maxTaskFailures = fully_interpolated(job.max_task_failures())
  task.priority = fully_interpolated(job.priority())
  task.contactEmail = not_empty_or(job.contact(), None)
  task.tier = not_empty_or(job.tier(), None)

  # Add metadata to a task, to display in the scheduler UI.
  task.metadata = frozenset(Metadata(key=str(key), value=str(value)) for key, value in metadata)

  # task components
  if not task_raw.has_resources():
    raise InvalidConfig('Task must specify resources!')

  if (fully_interpolated(task_raw.resources().ram()) == 0
      or fully_interpolated(task_raw.resources().disk()) == 0):
    raise InvalidConfig('Must specify ram and disk resources, got ram:%r disk:%r' % (
        fully_interpolated(task_raw.resources().ram()),
        fully_interpolated(task_raw.resources().disk())))

  task.numCpus = fully_interpolated(task_raw.resources().cpu())
  task.ramMb = fully_interpolated(task_raw.resources().ram()) / MB
  task.diskMb = fully_interpolated(task_raw.resources().disk()) / MB
  if task.numCpus <= 0 or task.ramMb <= 0 or task.diskMb <= 0:
    raise InvalidConfig('Task has invalid resources. cpu/ramMb/diskMb must all be positive: '
                        'cpu:%r ramMb:%r diskMb:%r' % (task.numCpus, task.ramMb, task.diskMb))

  task.job = key
  task.owner = owner
  task.requestedPorts = ports
  task.taskLinks = {}  # See AURORA-739
  task.constraints = constraints_to_thrift(not_empty_or(job.constraints(), {}))
  task.container = create_container_config(job.container())

  underlying, refs = job.interpolate()

  # need to fake an instance id for the sake of schema checking
  underlying_checked = underlying.bind(mesos={'instance': 31337, 'hostname': ''})
  try:
    ThermosTaskValidator.assert_valid_task(underlying_checked.task())
  except ThermosTaskValidator.InvalidTaskError as e:
    raise InvalidConfig('Task is invalid: %s' % e)

  if not underlying_checked.check().ok():
    raise InvalidConfig('Job not fully specified: %s' % underlying.check().message())

  unbound = []
  for ref in refs:
    if ref in (THERMOS_TASK_ID_REF, MESOS_INSTANCE_REF, MESOS_HOSTNAME_REF) or (
        Ref.subscope(THERMOS_PORT_SCOPE_REF, ref)):
      continue
    unbound.append(ref)

  if unbound:
    raise InvalidConfig('Config contains unbound variables: %s' % ' '.join(map(str, unbound)))

  task.executorConfig = ExecutorConfig(
      name=AURORA_EXECUTOR_NAME,
      data=filter_aliased_fields(underlying).json_dumps())

  return JobConfiguration(
      key=key,
      owner=owner,
      cronSchedule=not_empty_or(job.cron_schedule(), None),
      cronCollisionPolicy=select_cron_policy(job.cron_collision_policy()),
      taskConfig=task,
      instanceCount=fully_interpolated(job.instances()))

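# Hedged usage sketch for convert(): building a client-side MesosJob and
# turning it into the thrift JobConfiguration sent to the scheduler. The
# schema import path is assumed to be apache.aurora.config.schema.base, and
# all field values below are hypothetical.
from apache.aurora.config.schema.base import MesosJob, Process, Resources, Task

job = MesosJob(
    role='www-data',
    environment='prod',
    name='hello',
    instances=1,
    task=Task(
        name='hello',
        processes=[Process(name='hello', cmdline='echo hello')],
        resources=Resources(cpu=1.0, ram=64 * 1048576, disk=128 * 1048576)))
job_config = convert(job, metadata=[('team', 'web')], ports=frozenset())
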