def __init__(
    self,
    job_run_id,
    name,
    node,
    command_config,
    parent_context=None,
    output_path=None,
    cleanup=False,
    start_time=None,
    end_time=None,
    run_state=SCHEDULED,
    exit_status=None,
    attempts=None,
    action_runner=None,
    retries_remaining=None,
    retries_delay=None,
    machine=None,
    executor=None,
    trigger_downstreams=None,
    triggered_by=None,
    on_upstream_rerun=None,
    trigger_timeout_timestamp=None,
    original_command=None,
):
    """Initialise an action run from its identity, config and saved state.

    Falsy values for `action_runner`, `machine`, `attempts`, `output_path`
    and `original_command` are replaced with defaults: a
    NoActionRunnerFactory, a machine copied from ActionRun.STATE_MACHINE
    positioned at `run_state`, an empty list, a new OutputPath and
    `command_config.command` respectively.
    """
    super().__init__()

    # Identity and timing
    self.job_run_id = maybe_decode(job_run_id)
    self.action_name = maybe_decode(name)
    self.node = node
    self.start_time = start_time
    self.end_time = end_time
    self.exit_status = exit_status

    # Execution machinery
    if not action_runner:
        action_runner = NoActionRunnerFactory()
    self.action_runner = action_runner
    if not machine:
        machine = Machine.from_machine(
            ActionRun.STATE_MACHINE, None, run_state,
        )
    self.machine = machine
    self.is_cleanup = cleanup
    self.executor = executor

    # Command configuration and attempt history
    self.command_config = command_config
    if not original_command:
        original_command = command_config.command
    self.original_command = original_command
    self.attempts = attempts if attempts else []

    # Output location: the action name is appended to the given path
    if not output_path:
        output_path = filehandler.OutputPath()
    self.output_path = output_path
    self.output_path.append(self.action_name)

    self.context = command_context.build_context(self, parent_context)

    # Retry and trigger settings
    self.retries_remaining = retries_remaining
    self.retries_delay = retries_delay
    self.trigger_downstreams = trigger_downstreams
    self.triggered_by = triggered_by
    self.on_upstream_rerun = on_upstream_rerun
    self.trigger_timeout_timestamp = trigger_timeout_timestamp

    # Runtime-only handles; nothing is scheduled or running yet
    self.trigger_timeout_call = None
    self.action_command = None
    self.in_delay = None
def __init__(self, id, command, serializer=None):
    """Create a command with a fresh state machine and output handles.

    When `serializer` is truthy, stdout/stderr are opened through it using
    the STDOUT/STDERR names; otherwise both are set to
    filehandler.NullFileHandle.
    """
    super().__init__()
    self.id = id
    self.command = command
    self.machine = Machine.from_machine(ActionCommand.STATE_MACHINE)

    # Nothing has run yet, so there is no exit status or timing.
    self.exit_status = None
    self.start_time = None
    self.end_time = None

    if not serializer:
        self.stdout = filehandler.NullFileHandle
        self.stderr = filehandler.NullFileHandle
        return

    self.stdout = serializer.open(self.STDOUT)
    self.stderr = serializer.open(self.STDERR)
def __init__(
    self,
    job_run_id,
    name,
    node,
    bare_command=None,
    parent_context=None,
    output_path=None,
    cleanup=False,
    start_time=None,
    end_time=None,
    run_state=SCHEDULED,
    rendered_command=None,
    exit_status=None,
    action_runner=None,
    retries_remaining=None,
    retries_delay=None,
    exit_statuses=None,
    machine=None,
    executor=None,
    cpus=None,
    mem=None,
    constraints=None,
    docker_image=None,
    docker_parameters=None,
    env=None,
    extra_volumes=None,
    mesos_task_id=None,
    trigger_downstreams=None,
    triggered_by=None,
    on_upstream_rerun=None,
):
    """Initialise an action run from its identity, command and saved state.

    Falsy `action_runner`, `machine` and `output_path` are replaced with a
    NoActionRunnerFactory, a machine copied from ActionRun.STATE_MACHINE
    positioned at `run_state`, and a new OutputPath. A `None`
    `exit_statuses` becomes an empty list.
    """
    super().__init__()

    # Identity and timing
    self.job_run_id = maybe_decode(job_run_id)
    self.action_name = maybe_decode(name)
    self.node = node
    self.start_time = start_time
    self.end_time = end_time
    self.exit_status = exit_status

    # Command, both as configured and as rendered
    self.bare_command = maybe_decode(bare_command)
    self.rendered_command = rendered_command

    # Execution machinery
    if not action_runner:
        action_runner = NoActionRunnerFactory()
    self.action_runner = action_runner
    if not machine:
        machine = Machine.from_machine(
            ActionRun.STATE_MACHINE, None, run_state,
        )
    self.machine = machine
    self.is_cleanup = cleanup
    self.executor = executor

    # Resource and container settings, stored as given
    self.cpus = cpus
    self.mem = mem
    self.constraints = constraints
    self.docker_image = docker_image
    self.docker_parameters = docker_parameters
    self.env = env
    self.extra_volumes = extra_volumes
    self.mesos_task_id = mesos_task_id

    # Output location: the run id is appended to the given path
    if not output_path:
        output_path = filehandler.OutputPath()
    self.output_path = output_path
    self.output_path.append(self.id)

    self.context = command_context.build_context(self, parent_context)

    # Retry and trigger settings
    self.retries_remaining = retries_remaining
    self.retries_delay = retries_delay
    self.exit_statuses = [] if exit_statuses is None else exit_statuses
    self.trigger_downstreams = trigger_downstreams
    self.triggered_by = triggered_by
    self.on_upstream_rerun = on_upstream_rerun

    # Runtime-only handles; nothing is scheduled or running yet
    self.action_command = None
    self.in_delay = None
class ActionRun(Observable):
    """Base class for tracking the state of a single run of an Action.

    ActionRun's state machine is observed by a parent JobRun.
    """
    CANCELLED = 'cancelled'
    FAILED = 'failed'
    QUEUED = 'queued'
    RUNNING = 'running'
    SCHEDULED = 'scheduled'
    SKIPPED = 'skipped'
    STARTING = 'starting'
    SUCCEEDED = 'succeeded'
    UNKNOWN = 'unknown'

    default_transitions = dict(fail=FAILED, success=SUCCEEDED)
    STATE_MACHINE = Machine(
        SCHEDULED,
        **{
            CANCELLED: dict(skip=SKIPPED),
            FAILED: dict(skip=SKIPPED),
            RUNNING: dict(fail_unknown=UNKNOWN, **default_transitions),
            STARTING: dict(started=RUNNING, fail=FAILED),
            UNKNOWN: dict(running=RUNNING, **default_transitions),
            QUEUED: dict(
                cancel=CANCELLED,
                start=STARTING,
                schedule=SCHEDULED,
                **default_transitions,
            ),
            SCHEDULED: dict(
                ready=QUEUED,
                queue=QUEUED,
                cancel=CANCELLED,
                start=STARTING,
                **default_transitions,
            ),
        }
    )

    # The set of states that are considered end states. Technically some of
    # these states can be manually transitioned to other states.
    END_STATES = {FAILED, SUCCEEDED, CANCELLED, SKIPPED, UNKNOWN}

    # Failed render command is false to ensure that it will fail when run
    FAILED_RENDER = 'false # Command failed to render correctly. See the Tron error log.'
    NOTIFY_TRIGGER_READY = 'trigger_ready'

    context_class = command_context.ActionRunContext

    # TODO: create a class for ActionRunId, JobRunId, Etc
    def __init__(
        self,
        job_run_id,
        name,
        node,
        bare_command=None,
        parent_context=None,
        output_path=None,
        cleanup=False,
        start_time=None,
        end_time=None,
        run_state=SCHEDULED,
        rendered_command=None,
        exit_status=None,
        action_runner=None,
        retries_remaining=None,
        retries_delay=None,
        exit_statuses=None,
        machine=None,
        executor=None,
        cpus=None,
        mem=None,
        constraints=None,
        docker_image=None,
        docker_parameters=None,
        env=None,
        extra_volumes=None,
        mesos_task_id=None,
        trigger_downstreams=None,
        triggered_by=None,
        on_upstream_rerun=None,
    ):
        """Initialise the run from its identity, command and saved state.

        Falsy `action_runner`, `machine` and `output_path` are replaced with
        a NoActionRunnerFactory, a machine copied from STATE_MACHINE
        positioned at `run_state`, and a new OutputPath. A `None`
        `exit_statuses` becomes an empty list.
        """
        super().__init__()
        self.job_run_id = maybe_decode(job_run_id)
        self.action_name = maybe_decode(name)
        self.node = node
        self.start_time = start_time
        self.end_time = end_time
        self.exit_status = exit_status
        self.bare_command = maybe_decode(bare_command)
        self.rendered_command = rendered_command
        self.action_runner = action_runner or NoActionRunnerFactory()
        self.machine = machine or Machine.from_machine(
            ActionRun.STATE_MACHINE, None, run_state,
        )
        self.is_cleanup = cleanup
        self.executor = executor
        self.cpus = cpus
        self.mem = mem
        self.constraints = constraints
        self.docker_image = docker_image
        self.docker_parameters = docker_parameters
        self.env = env
        self.extra_volumes = extra_volumes
        self.mesos_task_id = mesos_task_id
        self.output_path = output_path or filehandler.OutputPath()
        self.output_path.append(self.id)
        self.context = command_context.build_context(self, parent_context)
        self.retries_remaining = retries_remaining
        self.retries_delay = retries_delay
        self.exit_statuses = exit_statuses
        self.trigger_downstreams = trigger_downstreams
        self.triggered_by = triggered_by
        self.on_upstream_rerun = on_upstream_rerun

        if self.exit_statuses is None:
            self.exit_statuses = []

        self.action_command = None
        self.in_delay = None

    @property
    def state(self):
        """Current state of this run's state machine."""
        return self.machine.state

    @property
    def id(self):
        """Identifier of the form ``<job_run_id>.<action_name>``."""
        return f"{self.job_run_id}.{self.action_name}"

    @classmethod
    def from_state(
        cls,
        state_data,
        parent_context,
        output_path,
        job_run_node,
        cleanup=False,
    ):
        """Restore the state of this ActionRun from a serialized state."""
        pool_repo = node.NodePoolRepository.get_instance()

        # Support state from older version
        if 'id' in state_data:
            job_run_id, action_name = state_data['id'].rsplit('.', 1)
        else:
            job_run_id = state_data['job_run_id']
            action_name = state_data['action_name']

        job_run_node = pool_repo.get_node(
            state_data.get('node_name'),
            job_run_node,
        )

        action_runner_data = state_data.get('action_runner')
        if action_runner_data:
            action_runner = SubprocessActionRunnerFactory(**action_runner_data)
        else:
            action_runner = NoActionRunnerFactory()

        rendered_command = state_data.get('rendered_command')
        run = cls(
            job_run_id=job_run_id,
            name=action_name,
            node=job_run_node,
            parent_context=parent_context,
            output_path=output_path,
            rendered_command=rendered_command,
            bare_command=state_data['command'],
            cleanup=cleanup,
            start_time=state_data['start_time'],
            end_time=state_data['end_time'],
            run_state=state_data['state'],
            exit_status=state_data.get('exit_status'),
            retries_remaining=state_data.get('retries_remaining'),
            retries_delay=state_data.get('retries_delay'),
            exit_statuses=state_data.get('exit_statuses'),
            action_runner=action_runner,
            executor=state_data.get('executor', ExecutorTypes.ssh),
            cpus=state_data.get('cpus'),
            mem=state_data.get('mem'),
            constraints=state_data.get('constraints'),
            docker_image=state_data.get('docker_image'),
            docker_parameters=state_data.get('docker_parameters'),
            env=state_data.get('env'),
            extra_volumes=state_data.get('extra_volumes'),
            mesos_task_id=state_data.get('mesos_task_id'),
            trigger_downstreams=state_data.get('trigger_downstreams'),
            triggered_by=state_data.get('triggered_by'),
            on_upstream_rerun=state_data.get('on_upstream_rerun'),
        )

        # Transition running to fail unknown because exit status was missed
        if run.is_running:
            run._done('fail_unknown')
        if run.is_starting:
            run._exit_unsuccessful(None)
        return run

    def start(self):
        """Start this ActionRun.

        Cancels a pending retry delay, if any. Returns False when the state
        machine does not allow `start`, None when the command failed to
        render (the run is failed with exit status -1), and otherwise
        whatever `submit_command` returns.
        """
        if self.in_delay is not None:
            log.warning(f"{self} cancelling suspend timer")
            self.in_delay.cancel()
            self.in_delay = None

        if not self.machine.check('start'):
            return False

        if not self.exit_statuses:
            log.info(f"{self} starting")
        else:
            log.info(f"{self} restarting, retry {len(self.exit_statuses)}")

        self.start_time = timeutils.current_time()
        self.transition_and_notify('start')

        if not self.is_valid_command:
            log.error(f"{self} invalid command: {self.bare_command}")
            self.fail(-1)
            return

        return self.submit_command()

    def submit_command(self):
        """Hand the command over for execution; implemented by subclasses."""
        raise NotImplementedError()

    def stop(self):
        """Stop the run; implemented by subclasses."""
        raise NotImplementedError()

    def kill(self, final=True):
        """Kill the run; implemented by subclasses."""
        raise NotImplementedError()

    def _done(self, target, exit_status=0):
        """Record the exit status and end time, then transition via `target`.

        Does nothing except log at debug level when the state machine does
        not allow `target` from the current state.
        """
        if self.machine.check(target):
            if self.triggered_by:
                EventBus.clear_subscriptions(self.__hash__())
            self.exit_status = exit_status
            self.end_time = timeutils.current_time()
            log.info(
                f"{self} completed with {target}, transitioned to "
                f"{self.state}, exit status: {exit_status}"
            )
            return self.transition_and_notify(target)
        else:
            log.debug(
                f"{self} cannot transition from {self.state} via {target}"
            )

    def retry(self):
        """Invoked externally (via API) when action needs to be re-tried
        manually.
        """
        # A manual retry always gets at least one attempt.
        if self.retries_remaining is None or self.retries_remaining <= 0:
            self.retries_remaining = 1

        if self.is_done:
            return self._exit_unsuccessful(self.exit_status)
        else:
            log.info(f"{self} getting killed for a retry")
            return self.kill(final=False)

    def start_after_delay(self):
        """Timer callback: reset the state machine and start again."""
        log.info(f"{self} resuming after retry delay")
        self.machine.reset()
        self.in_delay = None
        self.start()

    def restart(self):
        """Used by `fail` when action run has to be re-tried.

        With a `retries_delay` configured, the restart is scheduled on the
        reactor and True is returned; otherwise the machine is reset and the
        result of `start()` is returned.
        """
        if self.retries_delay is not None:
            # total_seconds() rather than .seconds: the latter drops the
            # days component (and sub-second part) of the timedelta.
            self.in_delay = reactor.callLater(
                self.retries_delay.total_seconds(),
                self.start_after_delay,
            )
            log.info(f"{self} delaying for a retry in {self.retries_delay}s")
            return True
        else:
            self.machine.reset()
            return self.start()

    def fail(self, exit_status=None):
        """Fail the run for good; any remaining retries are discarded."""
        if self.retries_remaining:
            self.retries_remaining = -1

        return self._done('fail', exit_status)

    def _exit_unsuccessful(self, exit_status=None):
        """Handle an unsuccessful exit: restart if retries remain, else fail.
        """
        if self.retries_remaining is not None:
            if self.retries_remaining > 0:
                self.retries_remaining -= 1
                self.exit_statuses.append(exit_status)
                return self.restart()
            else:
                log.info(
                    f"Reached maximum number of retries: "
                    f"{len(self.exit_statuses)}"
                )

        return self.fail(exit_status)

    def emit_triggers(self):
        """Publish this run's downstream triggers on the EventBus.

        `trigger_downstreams` must be a bool (a single shortdate trigger is
        published) or a dict of name -> template; anything else is logged as
        an error and nothing is published.
        """
        if isinstance(self.trigger_downstreams, bool):
            shortdate = self.render_template("{shortdate}")
            triggers = [f"shortdate.{shortdate}"]
        elif isinstance(self.trigger_downstreams, dict):
            triggers = [
                f"{k}.{self.render_template(v)}"
                for k, v in self.trigger_downstreams.items()
            ]
        else:
            log.error(f"{self} trigger_downstreams must be true or dict")
            return

        log.info(f"{self} publishing triggers: [{', '.join(triggers)}]")
        # The job id is the job run id without its last dotted component.
        job_id = '.'.join(self.job_run_id.split('.')[:-1])
        for trigger in triggers:
            EventBus.publish(f"{job_id}.{self.action_name}.{trigger}")

    # TODO: subscribe for events and maintain a list of remaining triggers
    def remaining_triggers(self):
        """Rendered `triggered_by` entries the EventBus has not seen yet."""
        return [
            trigger
            for trigger in map(self.render_template, self.triggered_by or [])
            if not EventBus.has_event(trigger)
        ]

    def success(self):
        """Emit downstream triggers (if configured) and mark as succeeded."""
        if self.trigger_downstreams:
            self.emit_triggers()

        return self._done('success')

    def fail_unknown(self):
        """Failed with unknown reason."""
        log.warning(f"{self} lost communication")
        return self.transition_and_notify('fail_unknown')

    def cancel_delay(self):
        """Cancel a pending retry delay and fail the run with status -3.

        Returns True when a delay was cancelled, None when none was pending.
        """
        if self.in_delay is not None:
            self.in_delay.cancel()
            self.in_delay = None
            self.fail(-3)
            return True

    @property
    def state_data(self):
        """This data is used to serialize the state of this action run."""
        rendered_command = self.rendered_command

        if isinstance(self.action_runner, NoActionRunnerFactory):
            action_runner = None
        else:
            action_runner = dict(
                status_path=self.action_runner.status_path,
                exec_path=self.action_runner.exec_path,
            )

        # Freeze command after it's run
        command = rendered_command if rendered_command else self.bare_command
        return {
            'job_run_id': self.job_run_id,
            'action_name': self.action_name,
            'state': self.state,
            'start_time': self.start_time,
            'end_time': self.end_time,
            'command': command,
            'rendered_command': self.rendered_command,
            'node_name': self.node.get_name() if self.node else None,
            'exit_status': self.exit_status,
            'retries_remaining': self.retries_remaining,
            'retries_delay': self.retries_delay,
            'exit_statuses': self.exit_statuses,
            'action_runner': action_runner,
            'executor': self.executor,
            'cpus': self.cpus,
            'mem': self.mem,
            'constraints': self.constraints,
            'docker_image': self.docker_image,
            'docker_parameters': self.docker_parameters,
            'env': self.env,
            'extra_volumes': self.extra_volumes,
            'mesos_task_id': self.mesos_task_id,
            'trigger_downstreams': self.trigger_downstreams,
            'triggered_by': self.triggered_by,
            'on_upstream_rerun': self.on_upstream_rerun,
        }

    def render_template(self, template):
        """Render `template` using the command context."""
        return StringFormatter(self.context).format(template)

    def render_command(self):
        """Render our configured command using the command context."""
        return self.render_template(self.bare_command)

    @property
    def command(self):
        """The rendered command, rendering and caching it on first access.

        If rendering raises, the error is logged and FAILED_RENDER is cached
        and returned instead.
        """
        if self.rendered_command:
            return self.rendered_command

        try:
            self.rendered_command = self.render_command()
        except Exception as e:
            log.error(f"{self} failed rendering command: {e}")
            # Return a command string that will always fail
            self.rendered_command = self.FAILED_RENDER
        return self.rendered_command

    @property
    def is_valid_command(self):
        """Returns True if the bare_command was rendered without any errors.
        This has the side effect of actually rendering the bare_command.
        """
        return self.command != self.FAILED_RENDER

    @property
    def is_done(self):
        """True when the current state is one of END_STATES."""
        return self.state in self.END_STATES

    @property
    def is_complete(self):
        """True when the run succeeded or was skipped."""
        return self.is_succeeded or self.is_skipped

    @property
    def is_broken(self):
        """True when the run failed, was cancelled, or is unknown."""
        return self.is_failed or self.is_cancelled or self.is_unknown

    @property
    def is_active(self):
        """True when the run is starting or running."""
        return self.is_starting or self.is_running

    def cleanup(self):
        """Drop observers and trigger subscriptions, then cancel the run."""
        self.clear_observers()
        if self.triggered_by:
            EventBus.clear_subscriptions(self.__hash__())
        self.cancel()

    def setup_subscriptions(self):
        """Subscribe `trigger_notify` to every rendered `triggered_by`."""
        for trigger_pattern in self.triggered_by or []:
            trigger = self.render_template(trigger_pattern)
            EventBus.subscribe(trigger, self.__hash__(), self.trigger_notify)

    def trigger_notify(self, *_):
        """Notify observers once no triggers remain outstanding."""
        remaining = self.remaining_triggers()
        if not remaining:
            self.notify(ActionRun.NOTIFY_TRIGGER_READY)

    def __getattr__(self, name: str):
        """Support convenience properties for checking if this ActionRun is
        in a specific state (Ex: self.is_running would check if self.state is
        STATE_RUNNING) or for transitioning to a new state (ex: ready).
        """
        # `machine` is looked up below; if it has not been set yet, stop here
        # instead of recursing back into __getattr__ without bound.
        if name == 'machine':
            raise AttributeError(name)

        if name in self.machine.transition_names:
            return lambda: self.transition_and_notify(name)

        if name.startswith('is_'):
            # Strip only the leading prefix, not every 'is_' occurrence.
            state_name = name[len('is_'):]
            if state_name not in self.machine.states:
                raise AttributeError(f"{name} is not a state")
            return self.state == state_name
        else:
            raise AttributeError(name)

    def __str__(self):
        return f"ActionRun: {self.id}"

    def transition_and_notify(self, target):
        """Transition via `target`; on success notify observers of the new
        state and return True (otherwise returns None).
        """
        if self.machine.transition(target):
            self.notify(self.state)
            return True
class ActionRun(Observable):
    """Base class for tracking the state of a single run of an Action.

    ActionRun's state machine is observed by a parent JobRun.
    """
    CANCELLED = 'cancelled'
    FAILED = 'failed'
    QUEUED = 'queued'
    RUNNING = 'running'
    SCHEDULED = 'scheduled'
    SKIPPED = 'skipped'
    STARTING = 'starting'
    SUCCEEDED = 'succeeded'
    WAITING = 'waiting'
    UNKNOWN = 'unknown'

    default_transitions = dict(fail=FAILED, success=SUCCEEDED)
    STATE_MACHINE = Machine(
        SCHEDULED,
        **{
            CANCELLED: dict(skip=SKIPPED),
            FAILED: dict(skip=SKIPPED),
            RUNNING: dict(
                cancel=CANCELLED,
                fail_unknown=UNKNOWN,
                **default_transitions,
            ),
            STARTING: dict(
                started=RUNNING,
                fail=FAILED,
                fail_unknown=UNKNOWN,
                cancel=CANCELLED,
            ),
            UNKNOWN: dict(
                running=RUNNING,
                fail_unknown=UNKNOWN,
                **default_transitions
            ),
            WAITING: dict(
                cancel=CANCELLED,
                start=STARTING,
                **default_transitions,
            ),
            QUEUED: dict(
                ready=WAITING,
                cancel=CANCELLED,
                start=STARTING,
                schedule=SCHEDULED,
                **default_transitions,
            ),
            SCHEDULED: dict(
                ready=WAITING,
                queue=QUEUED,
                cancel=CANCELLED,
                start=STARTING,
                **default_transitions,
            ),
        }
    )

    # The set of states that are considered end states. Technically some of
    # these states can be manually transitioned to other states.
    END_STATES = {FAILED, SUCCEEDED, CANCELLED, SKIPPED, UNKNOWN}

    # Failed render command is false to ensure that it will fail when run
    FAILED_RENDER = 'false # Command failed to render correctly. See the Tron error log.'
    NOTIFY_TRIGGER_READY = 'trigger_ready'

    EXIT_INVALID_COMMAND = -1
    EXIT_NODE_ERROR = -2
    EXIT_STOP_KILL = -3
    EXIT_TRIGGER_TIMEOUT = -4
    EXIT_MESOS_DISABLED = -5

    EXIT_REASONS = {
        EXIT_INVALID_COMMAND: 'Invalid command',
        EXIT_NODE_ERROR: 'Node error',
        EXIT_STOP_KILL: 'Stopped or killed',
        EXIT_TRIGGER_TIMEOUT: 'Timed out waiting for trigger',
        EXIT_MESOS_DISABLED: 'Mesos disabled',
    }

    # This is a list of "alternate locations" that we can look for stdout/stderr in
    # The PR in question is https://github.com/Yelp/Tron/pull/735/files, which changed
    # the format of the stdout/stderr paths
    STDOUT_PATHS = [
        os.path.join(
            '{namespace}.{jobname}',
            '{namespace}.{jobname}.{run_num}',
            '{namespace}.{jobname}.{run_num}.{action}',
        ),  # old style paths (pre-#735 PR)
        os.path.join(
            '{namespace}.{jobname}',
            '{namespace}.{jobname}.{run_num}',
            '{namespace}.{jobname}.{run_num}.{action}',
            '{namespace}.{jobname}.{run_num}.recovery-{namespace}.{jobname}.{run_num}.{action}',
        ),  # old style recovery paths (pre-#735 PR)
        os.path.join(
            '{namespace}',
            '{jobname}',
            '{run_num}',
            '{action}-recovery',
        ),  # new style recovery paths (post-#735 PR)
    ]

    context_class = command_context.ActionRunContext

    # TODO: create a class for ActionRunId, JobRunId, Etc
    def __init__(
        self,
        job_run_id,
        name,
        node,
        command_config,
        parent_context=None,
        output_path=None,
        cleanup=False,
        start_time=None,
        end_time=None,
        run_state=SCHEDULED,
        exit_status=None,
        attempts=None,
        action_runner=None,
        retries_remaining=None,
        retries_delay=None,
        machine=None,
        executor=None,
        trigger_downstreams=None,
        triggered_by=None,
        on_upstream_rerun=None,
        trigger_timeout_timestamp=None,
        original_command=None,
    ):
        """Initialise the run from its identity, config and saved state.

        Falsy `action_runner`, `machine`, `attempts`, `output_path` and
        `original_command` are replaced with a NoActionRunnerFactory, a
        machine copied from STATE_MACHINE positioned at `run_state`, an
        empty list, a new OutputPath and `command_config.command`.
        """
        super().__init__()
        self.job_run_id = maybe_decode(job_run_id)
        self.action_name = maybe_decode(name)
        self.node = node
        self.start_time = start_time
        self.end_time = end_time
        self.exit_status = exit_status
        self.action_runner = action_runner or NoActionRunnerFactory()
        self.machine = machine or Machine.from_machine(
            ActionRun.STATE_MACHINE, None, run_state
        )
        self.is_cleanup = cleanup

        self.executor = executor
        self.command_config = command_config
        self.original_command = original_command or command_config.command
        self.attempts = attempts or []
        self.output_path = output_path or filehandler.OutputPath()
        self.output_path.append(self.action_name)
        self.context = command_context.build_context(self, parent_context)
        self.retries_remaining = retries_remaining
        self.retries_delay = retries_delay
        self.trigger_downstreams = trigger_downstreams
        self.triggered_by = triggered_by
        self.on_upstream_rerun = on_upstream_rerun
        self.trigger_timeout_timestamp = trigger_timeout_timestamp
        self.trigger_timeout_call = None

        self.action_command = None
        self.in_delay = None

    @property
    def state(self):
        """Current state of this run's state machine."""
        return self.machine.state

    @property
    def id(self):
        """Identifier of the form ``<job_run_id>.<action_name>``."""
        return f"{self.job_run_id}.{self.action_name}"

    @property
    def name(self):
        """Alias for `action_name`."""
        return self.action_name

    @property
    def last_attempt(self):
        """The most recent attempt, or None when there are none."""
        if self.attempts:
            return self.attempts[-1]
        return None

    @property
    def exit_statuses(self):
        """Exit statuses of every attempt that has an end time."""
        if self.attempts:
            return [a.exit_status for a in self.attempts if a.end_time]
        return []

    @property
    def command(self):
        """Display command of the last attempt, else the configured command.
        """
        if self.attempts:
            return self.attempts[-1].display_command
        else:
            return self.command_config.command

    @property
    def rendered_command(self):
        """Rendered command of the last attempt, or None without attempts."""
        if self.attempts:
            return self.attempts[-1].rendered_command
        return None

    @classmethod
    def attempts_from_state(cls, state_data, command_config):
        """Build the list of attempts from serialized state.

        Uses the 'attempts' entry when present. Otherwise attempts are
        reconstructed from the older 'exit_statuses' / 'exit_status' /
        'rendered_command' entries, with 'unknown' start and end times.
        """
        attempts = []
        if 'attempts' in state_data:
            attempts = [
                ActionRunAttempt.from_state(a) for a in state_data['attempts']
            ]
        else:
            rendered_command = maybe_decode(state_data.get('rendered_command'))
            # `or []` also covers a stored None, which would otherwise break
            # the concatenation and iteration below.
            exit_statuses = state_data.get('exit_statuses') or []
            # If the action has started, add an attempt for the final try
            if state_data.get('start_time'):
                exit_statuses = exit_statuses + [state_data.get('exit_status')]
            for exit_status in exit_statuses:
                attempts.append(
                    ActionRunAttempt(
                        command_config=command_config,
                        rendered_command=rendered_command,
                        exit_status=exit_status,
                        start_time='unknown',
                        end_time='unknown',
                    )
                )
            if attempts:
                attempts[-1].mesos_task_id = state_data.get('mesos_task_id')
        return attempts

    @classmethod
    def from_state(
        cls,
        state_data,
        parent_context,
        output_path,
        job_run_node,
        action_graph,
        cleanup=False,
    ):
        """Restore the state of this ActionRun from a serialized state."""
        pool_repo = node.NodePoolRepository.get_instance()

        # Support state from older version
        if 'id' in state_data:
            job_run_id, action_name = state_data['id'].rsplit('.', 1)
        else:
            job_run_id = state_data['job_run_id']
            action_name = state_data['action_name']

        job_run_node = pool_repo.get_node(
            state_data.get('node_name'),
            job_run_node,
        )

        action_runner_data = state_data.get('action_runner')
        if action_runner_data:
            action_runner = SubprocessActionRunnerFactory(**action_runner_data)
        else:
            action_runner = NoActionRunnerFactory()

        # An action missing from the graph gets an empty command config;
        # start() refuses to run an action whose configured command is empty.
        action_config = action_graph.action_map.get(action_name)
        if action_config:
            command_config = action_config.command_config
        else:
            command_config = action.ActionCommandConfig(command='')

        attempts = cls.attempts_from_state(state_data, command_config)
        run = cls(
            job_run_id=job_run_id,
            name=action_name,
            node=job_run_node,
            parent_context=parent_context,
            output_path=output_path,
            command_config=command_config,
            original_command=state_data.get('original_command'),
            cleanup=cleanup,
            start_time=state_data['start_time'],
            end_time=state_data['end_time'],
            run_state=state_data['state'],
            exit_status=state_data.get('exit_status'),
            attempts=attempts,
            retries_remaining=state_data.get('retries_remaining'),
            retries_delay=state_data.get('retries_delay'),
            action_runner=action_runner,
            executor=state_data.get('executor', ExecutorTypes.ssh.value),
            trigger_downstreams=state_data.get('trigger_downstreams'),
            triggered_by=state_data.get('triggered_by'),
            on_upstream_rerun=state_data.get('on_upstream_rerun'),
            trigger_timeout_timestamp=state_data.get('trigger_timeout_timestamp'),
        )

        # Transition running to fail unknown because exit status was missed
        # Recovery will look for unknown runs
        if run.is_active:
            run.transition_and_notify('fail_unknown')
        return run

    def start(self, original_command=True):
        """Start this ActionRun.

        Cancels a pending retry delay, if any, and creates a new attempt.
        Returns False when the state machine does not allow `start`, None
        when the run is failed with EXIT_INVALID_COMMAND (the action has no
        configured command, or the command failed to render), and otherwise
        whatever `submit_command` returns.
        """
        if self.in_delay is not None:
            log.warning(f"{self} cancelling suspend timer")
            self.in_delay.cancel()
            self.in_delay = None

        if not self.machine.check('start'):
            return False

        if not self.attempts:
            log.info(f"{self} starting")
        else:
            log.info(f"{self} restarting, retry {len(self.attempts)}")

        new_attempt = self.create_attempt(original_command=original_command)
        self.start_time = new_attempt.start_time
        self.transition_and_notify('start')

        if not self.command_config.command:
            log.error(f"{self} no longer configured in tronfig, cannot run")
            self.fail(self.EXIT_INVALID_COMMAND)
            # Stop here: the run has just been failed, so it must not go on
            # to submit the command.
            return

        if not self.is_valid_command(new_attempt.rendered_command):
            log.error(f"{self} invalid command: {new_attempt.command_config.command}")
            self.fail(self.EXIT_INVALID_COMMAND)
            return

        return self.submit_command(new_attempt)

    def create_attempt(self, original_command=True):
        """Create, record and return a new attempt starting now.

        The attempt gets a copy of the command config; with
        `original_command` truthy its command is replaced by
        `self.original_command` before rendering.
        """
        current_time = timeutils.current_time()
        command_config = self.command_config.copy()
        if original_command:
            command_config.command = self.original_command
        rendered_command = self.render_command(command_config.command)
        new_attempt = ActionRunAttempt(
            command_config=command_config,
            start_time=current_time,
            rendered_command=rendered_command,
        )
        self.attempts.append(new_attempt)
        return new_attempt

    def submit_command(self, attempt):
        """Hand the attempt over for execution; implemented by subclasses."""
        raise NotImplementedError()

    def stop(self):
        """Stop the run; implemented by subclasses."""
        raise NotImplementedError()

    def kill(self, final=True):
        """Kill the run; implemented by subclasses."""
        raise NotImplementedError()

    def recover(self):
        """Recover the run; implemented by subclasses."""
        raise NotImplementedError()

    def _done(self, target, exit_status=0):
        """Record the exit status and end time, then transition via `target`.

        Also clears trigger subscriptions and the trigger timeout, and closes
        the last attempt if it has no end time yet. Does nothing except log
        at debug level when the state machine does not allow `target`.
        """
        if self.machine.check(target):
            if self.triggered_by:
                EventBus.clear_subscriptions(self.__hash__())
            self.clear_trigger_timeout()
            self.exit_status = exit_status
            self.end_time = timeutils.current_time()
            if self.last_attempt is not None and self.last_attempt.end_time is None:
                self.last_attempt.exit(exit_status, self.end_time)
            log.info(
                f"{self} completed with {target}, transitioned to "
                f"{self.state}, exit status: {exit_status}"
            )
            return self.transition_and_notify(target)
        else:
            log.debug(
                f"{self} cannot transition from {self.state} via {target}"
            )

    def retry(self, original_command=True):
        """Invoked externally (via API) when action needs to be re-tried
        manually.
        """
        # Manually retrying means we force the retries to be 1 and
        # Cancel any delay, so the retry is kicked off asap
        if self.retries_remaining is None or self.retries_remaining <= 0:
            self.retries_remaining = 1
        if self.in_delay is not None:
            self.in_delay.cancel()
            self.in_delay = None
        self.retries_delay = None

        if self.is_done:
            self.machine.reset()
            return self._exit_unsuccessful(
                self.exit_status,
                retry_original_command=original_command,
            )
        else:
            log.info(f"{self} getting killed for a retry")
            return self.kill(final=False)

    def start_after_delay(self):
        """Timer callback: reset the state machine and start again."""
        log.info(f"{self} resuming after retry delay")
        self.machine.reset()
        self.in_delay = None
        self.start()

    def restart(self, original_command=True):
        """Used by `fail` when action run has to be re-tried.

        With a `retries_delay` configured, the restart is scheduled on the
        reactor and True is returned; otherwise the machine is reset and the
        result of `start()` is returned.
        """
        if self.retries_delay is not None:
            self.in_delay = reactor.callLater(
                self.retries_delay.total_seconds(), self.start_after_delay
            )
            log.info(f"{self} delaying for a retry in {self.retries_delay}s")
            return True
        else:
            self.machine.reset()
            return self.start(original_command=original_command)

    def fail(self, exit_status=None):
        """Fail the run for good; any remaining retries are discarded."""
        if self.retries_remaining:
            self.retries_remaining = -1

        return self._done('fail', exit_status)

    def _exit_unsuccessful(self, exit_status=None, retry_original_command=True):
        """Handle an unsuccessful exit of the current attempt.

        Ignored (returns None) when the run is already in an end state.
        Otherwise the last attempt is closed; the run is restarted if retries
        remain, else it transitions via `fail_unknown` when `exit_status` is
        None and via `fail` when it is not.
        """
        if self.is_done:
            log.info(
                f'{self} got exit code {exit_status} but already in terminal '
                f'state "{self.state}", not retrying',
            )
            return
        if self.last_attempt is not None:
            self.last_attempt.exit(exit_status)
        if self.retries_remaining is not None:
            if self.retries_remaining > 0:
                self.retries_remaining -= 1
                return self.restart(original_command=retry_original_command)
            else:
                log.info(
                    f"Reached maximum number of retries: {len(self.attempts)}"
                )
        if exit_status is None:
            return self._done('fail_unknown', exit_status)
        else:
            return self._done('fail', exit_status)

    def triggers_to_emit(self) -> List[str]:
        """Rendered downstream triggers for this run.

        `trigger_downstreams` must be a bool (a single shortdate trigger) or
        a dict of name -> template. An empty list is returned when it is
        falsy or, after logging an error, when it has any other type.
        """
        if not self.trigger_downstreams:
            return []

        if isinstance(self.trigger_downstreams, bool):
            templates = ["shortdate.{shortdate}"]
        elif isinstance(self.trigger_downstreams, dict):
            templates = [
                f"{k}.{v}" for k, v in self.trigger_downstreams.items()
            ]
        else:
            log.error(f"{self} trigger_downstreams must be true or dict")
            # Keep the List[str] contract: nothing valid to render.
            return []

        return [self.render_template(trig) for trig in templates]

    def emit_triggers(self):
        """Publish every trigger from `triggers_to_emit` on the EventBus."""
        triggers = self.triggers_to_emit()
        if not triggers:
            return

        log.info(f"{self} publishing triggers: [{', '.join(triggers)}]")
        # The job id is the job run id without its last dotted component.
        job_id = '.'.join(self.job_run_id.split('.')[:-1])
        for trigger in triggers:
            EventBus.publish(f"{job_id}.{self.action_name}.{trigger}")

    # TODO: cache if safe
    @property
    def rendered_triggers(self) -> List[str]:
        """Every `triggered_by` entry rendered with the command context."""
        return [self.render_template(trig) for trig in self.triggered_by or []]

    # TODO: subscribe for events and maintain a list of remaining triggers
    @property
    def remaining_triggers(self):
        """Rendered triggers the EventBus has not seen yet."""
        return [
            trig for trig in self.rendered_triggers
            if not EventBus.has_event(trig)
        ]

    def success(self):
        """Emit downstream triggers (if configured) and mark as succeeded."""
        if self.trigger_downstreams:
            self.emit_triggers()

        return self._done('success')

    def fail_unknown(self):
        """Failed with unknown reason."""
        log.warning(f"{self} failed with no exit code")
        return self._done('fail_unknown', None)

    def cancel_delay(self):
        """Cancel a pending retry delay and fail with EXIT_STOP_KILL.

        Returns True when a delay was cancelled, None when none was pending.
        """
        if self.in_delay is not None:
            self.in_delay.cancel()
            self.in_delay = None
            self.fail(self.EXIT_STOP_KILL)
            return True

    @property
    def state_data(self):
        """This data is used to serialize the state of this action run."""
        if isinstance(self.action_runner, NoActionRunnerFactory):
            action_runner = None
        else:
            action_runner = dict(
                status_path=self.action_runner.status_path,
                exec_path=self.action_runner.exec_path,
            )

        return {
            'job_run_id': self.job_run_id,
            'action_name': self.action_name,
            'state': self.state,
            'original_command': self.original_command,
            'start_time': self.start_time,
            'end_time': self.end_time,
            'node_name': self.node.get_name() if self.node else None,
            'exit_status': self.exit_status,
            'attempts': [a.state_data for a in self.attempts],
            'retries_remaining': self.retries_remaining,
            'retries_delay': self.retries_delay,
            'action_runner': action_runner,
            'executor': self.executor,
            'trigger_downstreams': self.trigger_downstreams,
            'triggered_by': self.triggered_by,
            'on_upstream_rerun': self.on_upstream_rerun,
            'trigger_timeout_timestamp': self.trigger_timeout_timestamp,
        }

    def render_template(self, template):
        """Render `template` using the command context."""
        return StringFormatter(self.context).format(template)

    def render_command(self, command):
        """Render `command` using the command context.

        If rendering raises, the error is logged and FAILED_RENDER is
        returned instead.
        """
        try:
            return self.render_template(command)
        except Exception as e:
            log.error(f"{self} failed rendering command: {e}")
            # Return a command string that will always fail
            return self.FAILED_RENDER

    def is_valid_command(self, command):
        """True unless `command` is the FAILED_RENDER placeholder."""
        return command != self.FAILED_RENDER

    @property
    def is_done(self):
        """True when the current state is one of END_STATES."""
        return self.state in self.END_STATES

    @property
    def is_complete(self):
        """True when the run succeeded or was skipped."""
        return self.is_succeeded or self.is_skipped

    @property
    def is_broken(self):
        """True when the run failed, was cancelled, or is unknown."""
        return self.is_failed or self.is_cancelled or self.is_unknown

    @property
    def is_active(self):
        """True when the run is starting or running."""
        return self.is_starting or self.is_running

    def cleanup(self):
        """Drop observers, trigger subscriptions and the trigger timeout,
        then cancel the run.
        """
        self.clear_observers()
        if self.triggered_by:
            EventBus.clear_subscriptions(self.__hash__())
        self.clear_trigger_timeout()
        self.cancel()

    def clear_trigger_timeout(self):
        """Cancel and forget the pending trigger-timeout call, if any."""
        if self.trigger_timeout_call:
            self.trigger_timeout_call.cancel()
            self.trigger_timeout_call = None

    def setup_subscriptions(self):
        """Subscribe to the remaining triggers and arm the trigger timeout.

        Does nothing when no triggers remain. The timeout fires after at
        least one second; a missing `trigger_timeout_timestamp` is logged as
        an error and no timeout is armed.
        """
        remaining_triggers = self.remaining_triggers
        if not remaining_triggers:
            return

        if self.trigger_timeout_timestamp:
            now = timeutils.current_time().timestamp()
            delay = max(self.trigger_timeout_timestamp - now, 1)
            self.trigger_timeout_call = reactor.callLater(
                delay, self.trigger_timeout_reached
            )
        else:
            log.error(f"{self} has no trigger_timeout_timestamp")

        for trigger in remaining_triggers:
            EventBus.subscribe(trigger, self.__hash__(), self.trigger_notify)

    def trigger_timeout_reached(self):
        """Timer callback: fail with EXIT_TRIGGER_TIMEOUT if triggers are
        still outstanding, otherwise notify that the triggers are ready.
        """
        if self.remaining_triggers:
            self.trigger_timeout_call = None
            log.warning(
                f"{self} reached timeout waiting for: {self.remaining_triggers}"
            )
            self.fail(self.EXIT_TRIGGER_TIMEOUT)
        else:
            self.notify(ActionRun.NOTIFY_TRIGGER_READY)

    def trigger_notify(self, *_):
        """Once no triggers remain, clear the timeout and notify observers."""
        if not self.remaining_triggers:
            self.clear_trigger_timeout()
            self.notify(ActionRun.NOTIFY_TRIGGER_READY)

    @property
    def is_blocked_on_trigger(self):
        """True when the run is not done and still has remaining triggers."""
        return not self.is_done and bool(self.remaining_triggers)

    def clear_end_state(self):
        """Reset exit status and end time on the run and its last attempt."""
        self.exit_status = None
        self.end_time = None
        last_attempt = self.last_attempt
        if last_attempt:
            last_attempt.exit_status = None
            last_attempt.end_time = None

    def __getattr__(self, name: str):
        """Support convenience properties for checking if this ActionRun is
        in a specific state (Ex: self.is_running would check if self.state is
        STATE_RUNNING) or for transitioning to a new state (ex: ready).
        """
        # `machine` is looked up below; if it has not been set yet, stop here
        # instead of recursing back into __getattr__ without bound.
        if name == 'machine':
            raise AttributeError(name)

        if name in self.machine.transition_names:
            return lambda: self.transition_and_notify(name)

        if name.startswith('is_'):
            # Strip only the leading prefix, not every 'is_' occurrence.
            state_name = name[len('is_'):]
            if state_name not in self.machine.states:
                raise AttributeError(f"{name} is not a state")
            return self.state == state_name
        else:
            raise AttributeError(name)

    def __str__(self):
        return f"ActionRun: {self.id}"

    def transition_and_notify(self, target):
        """Transition via `target`; on success notify observers of the new
        state and return True (otherwise returns None).
        """
        if self.machine.transition(target):
            self.notify(self.state)
            return True
def __init__(
    self,
    job_run_id,
    name,
    node,
    bare_command=None,
    parent_context=None,
    output_path=None,
    cleanup=False,
    start_time=None,
    end_time=None,
    run_state=SCHEDULED,
    rendered_command=None,
    exit_status=None,
    action_runner=None,
    retries_remaining=None,
    retries_delay=None,
    exit_statuses=None,
    machine=None,
    executor=None,
    cpus=None,
    mem=None,
    disk=None,
    constraints=None,
    docker_image=None,
    docker_parameters=None,
    env=None,
    extra_volumes=None,
    mesos_task_id=None,
    trigger_downstreams=None,
    triggered_by=None,
    on_upstream_rerun=None,
    trigger_timeout_timestamp=None,
):
    """Store the run's identity, timing, resources, retry and trigger state.

    Defaults applied here:
      * action_runner: a new NoActionRunnerFactory() when falsy.
      * machine: a new Machine built from ActionRun.STATE_MACHINE starting
        in `run_state` when falsy.
      * output_path: a new filehandler.OutputPath() when falsy; in either
        case self.id is appended to it (so a caller-supplied path object
        receives that append call).
      * exit_statuses: an empty list when None.

    job_run_id, name and bare_command are passed through maybe_decode.
    """
    super().__init__()

    # Identity, timing and command.
    self.job_run_id = maybe_decode(job_run_id)
    self.action_name = maybe_decode(name)
    self.node = node
    self.start_time, self.end_time = start_time, end_time
    self.exit_status = exit_status
    self.bare_command = maybe_decode(bare_command)
    self.rendered_command = rendered_command

    # Runner and state machine, falling back to fresh defaults.
    if not action_runner:
        action_runner = NoActionRunnerFactory()
    self.action_runner = action_runner
    if not machine:
        machine = Machine.from_machine(
            ActionRun.STATE_MACHINE, None, run_state
        )
    self.machine = machine
    self.is_cleanup = cleanup

    # Executor and resource settings.
    self.executor = executor
    self.cpus, self.mem, self.disk = cpus, mem, disk
    self.constraints = constraints
    self.docker_image = docker_image
    self.docker_parameters = docker_parameters
    self.env = env
    self.extra_volumes = extra_volumes
    self.mesos_task_id = mesos_task_id

    # Output location and command context; the context is built only
    # after the attributes above are in place.
    if not output_path:
        output_path = filehandler.OutputPath()
    self.output_path = output_path
    self.output_path.append(self.id)
    self.context = command_context.build_context(self, parent_context)

    # Retry bookkeeping.
    self.retries_remaining = retries_remaining
    self.retries_delay = retries_delay
    self.exit_statuses = [] if exit_statuses is None else exit_statuses

    # Trigger wiring; the timeout call is scheduled later, not here.
    self.trigger_downstreams = trigger_downstreams
    self.triggered_by = triggered_by
    self.on_upstream_rerun = on_upstream_rerun
    self.trigger_timeout_timestamp = trigger_timeout_timestamp
    self.trigger_timeout_call = None

    self.action_command = None
    self.in_delay = None
class ActionCommand(Observable):
    """A runnable task that is handed to a node for execution.

    The node reports progress by calling:
        started         (when the command starts)
        exited          (when the command exits)
        write_<channel> (when output is received)
        done            (when the command is finished)

    State flow, as defined by STATE_MACHINE:
        pending --start--> running --exit--> exiting --close--> complete
        pending --exit--> failstart
    """

    PENDING = 'pending'
    RUNNING = 'running'
    EXITING = 'exiting'
    COMPLETE = 'complete'
    FAILSTART = 'failstart'

    STATE_MACHINE = Machine(
        PENDING,
        **{
            PENDING: dict(start=RUNNING, exit=FAILSTART),
            RUNNING: dict(exit=EXITING),
            EXITING: dict(close=COMPLETE),
        }
    )

    STDOUT = '.stdout'
    STDERR = '.stderr'

    def __init__(self, id, command, serializer=None):
        """Create a pending command.

        When `serializer` is truthy, stdout/stderr handles are opened from
        it using the STDOUT/STDERR names; otherwise both are set to
        filehandler.NullFileHandle.
        """
        super().__init__()
        self.id = id
        self.command = command
        self.machine = Machine.from_machine(ActionCommand.STATE_MACHINE)
        self.exit_status = self.start_time = self.end_time = None

        if not serializer:
            self.stdout = self.stderr = filehandler.NullFileHandle
        else:
            self.stdout = serializer.open(self.STDOUT)
            self.stderr = serializer.open(self.STDERR)

    @property
    def state(self):
        """Current state of this command's state machine."""
        return self.machine.state

    def transition_and_notify(self, target):
        """Attempt the `target` transition; on success notify observers with
        the new state and return True, otherwise return None.
        """
        if not self.machine.transition(target):
            return None
        self.notify(self.state)
        return True

    def started(self):
        """Record the start time and move to RUNNING, if 'start' is allowed."""
        if not self.machine.check('start'):
            return None
        self.start_time = timeutils.current_timestamp()
        return self.transition_and_notify('start')

    def exited(self, exit_status):
        """Record the end time and exit status and take the 'exit'
        transition, if it is allowed from the current state.
        """
        if not self.machine.check('exit'):
            return None
        self.end_time = timeutils.current_timestamp()
        self.exit_status = exit_status
        return self.transition_and_notify('exit')

    def write_stderr(self, value):
        """Write `value` to the stderr handle."""
        self.stderr.write(value)

    def write_stdout(self, value):
        """Write `value` to the stdout handle."""
        self.stdout.write(value)

    def done(self):
        """Close both output handles and take the 'close' transition, if it
        is allowed from the current state.
        """
        if not self.machine.check('close'):
            return None
        self.stdout.close()
        self.stderr.close()
        return self.transition_and_notify('close')

    def handle_errback(self, result):
        """Handle an unexpected error while being run. This will likely be
        an internal error. Clean up the state of this ActionCommand and log
        something useful for debugging.

        `result` is stored as the exit status via exited(), then done() is
        called.
        """
        log.error(f"Unknown failure for {self}, {str(result)}")
        self.exited(result)
        self.done()

    @property
    def is_unknown(self):
        """True while no exit status has been recorded."""
        return self.exit_status is None

    @property
    def is_failed(self):
        """True when the recorded exit status is truthy."""
        return bool(self.exit_status)

    @property
    def is_complete(self):
        """True once the state machine has reached COMPLETE."""
        return self.machine.state == ActionCommand.COMPLETE

    @property
    def is_done(self):
        """True when no more work will be done: state is COMPLETE or
        FAILSTART. This does not by itself mean success.
        """
        finished_states = (ActionCommand.COMPLETE, ActionCommand.FAILSTART)
        return self.machine.state in finished_states

    def __repr__(self):
        return f"ActionCommand {self.id} {self.command}: {self.state}"