Exemplo n.º 1
0
    def __init__(
        self,
        job_run_id,
        name,
        node,
        command_config,
        parent_context=None,
        output_path=None,
        cleanup=False,
        start_time=None,
        end_time=None,
        run_state=SCHEDULED,
        exit_status=None,
        attempts=None,
        action_runner=None,
        retries_remaining=None,
        retries_delay=None,
        machine=None,
        executor=None,
        trigger_downstreams=None,
        triggered_by=None,
        on_upstream_rerun=None,
        trigger_timeout_timestamp=None,
        original_command=None,
    ):
        """Store the run's identity, state machine, command and retry/trigger settings.

        Args:
            job_run_id: Passed through ``maybe_decode`` and stored.
            name: Passed through ``maybe_decode`` and stored as ``action_name``.
            node: Stored as-is.
            command_config: Stored as-is; its ``command`` attribute is the
                fallback for ``original_command``.
            parent_context: Forwarded to ``command_context.build_context``.
            output_path: Path object that gets the action name appended; a new
                ``filehandler.OutputPath()`` is used when falsy.
            cleanup: Stored as ``is_cleanup``.
            start_time: Stored as-is.
            end_time: Stored as-is.
            run_state: Passed to ``Machine.from_machine`` when ``machine`` is
                falsy.
            exit_status: Stored as-is.
            attempts: Stored as-is; a new empty list when falsy.
            action_runner: Stored as-is; ``NoActionRunnerFactory()`` when falsy.
            retries_remaining: Stored as-is.
            retries_delay: Stored as-is.
            machine: Used instead of building one from ``run_state`` when truthy.
            executor: Stored as-is.
            trigger_downstreams: Stored as-is.
            triggered_by: Stored as-is.
            on_upstream_rerun: Stored as-is.
            trigger_timeout_timestamp: Stored as-is.
            original_command: Stored as-is; ``command_config.command`` when
                falsy.
        """
        super().__init__()
        self.job_run_id = maybe_decode(job_run_id)
        self.action_name = maybe_decode(name)
        self.node = node
        self.start_time = start_time
        self.end_time = end_time
        self.exit_status = exit_status
        self.action_runner = action_runner or NoActionRunnerFactory()
        # An explicitly supplied machine takes precedence over run_state.
        self.machine = machine or Machine.from_machine(
            ActionRun.STATE_MACHINE, None, run_state
        )
        self.is_cleanup = cleanup

        self.executor = executor
        self.command_config = command_config
        self.original_command = original_command or command_config.command
        self.attempts = attempts or []
        self.output_path = output_path or filehandler.OutputPath()
        # NOTE: a caller-supplied output_path is modified in place here.
        self.output_path.append(self.action_name)
        # The context receives `self`, so it is built after the attributes above.
        self.context = command_context.build_context(self, parent_context)
        self.retries_remaining = retries_remaining
        self.retries_delay = retries_delay
        self.trigger_downstreams = trigger_downstreams
        self.triggered_by = triggered_by
        self.on_upstream_rerun = on_upstream_rerun
        self.trigger_timeout_timestamp = trigger_timeout_timestamp
        self.trigger_timeout_call = None

        self.action_command = None
        self.in_delay = None
Exemplo n.º 2
0
 def __init__(self, id, command, serializer=None):
     """Create a command with a fresh state machine and output handles.

     When ``serializer`` is truthy, stdout and stderr are opened through it;
     otherwise both are set to ``filehandler.NullFileHandle``.
     """
     super().__init__()
     self.id, self.command = id, command
     self.machine = Machine.from_machine(ActionCommand.STATE_MACHINE)
     # Nothing has run yet, so there is no result or timing information.
     self.exit_status = self.start_time = self.end_time = None
     if not serializer:
         self.stdout = self.stderr = filehandler.NullFileHandle
         return
     self.stdout = serializer.open(self.STDOUT)
     self.stderr = serializer.open(self.STDERR)
Exemplo n.º 3
0
 def __init__(self, id, command, serializer=None):
     """Create a command with a fresh state machine and output handles.

     When ``serializer`` is truthy, stdout and stderr are opened through it;
     otherwise both are set to ``filehandler.NullFileHandle``.
     """
     super().__init__()
     self.id, self.command = id, command
     self.machine = Machine.from_machine(ActionCommand.STATE_MACHINE)
     # Nothing has run yet, so there is no result or timing information.
     self.exit_status = self.start_time = self.end_time = None
     if not serializer:
         self.stdout = self.stderr = filehandler.NullFileHandle
         return
     self.stdout = serializer.open(self.STDOUT)
     self.stderr = serializer.open(self.STDERR)
Exemplo n.º 4
0
    def __init__(
        self,
        job_run_id,
        name,
        node,
        bare_command=None,
        parent_context=None,
        output_path=None,
        cleanup=False,
        start_time=None,
        end_time=None,
        run_state=SCHEDULED,
        rendered_command=None,
        exit_status=None,
        action_runner=None,
        retries_remaining=None,
        retries_delay=None,
        exit_statuses=None,
        machine=None,
        executor=None,
        cpus=None,
        mem=None,
        constraints=None,
        docker_image=None,
        docker_parameters=None,
        env=None,
        extra_volumes=None,
        mesos_task_id=None,
        trigger_downstreams=None,
        triggered_by=None,
        on_upstream_rerun=None,
    ):
        """Store the run's identity, state machine, command and resource settings.

        Args:
            job_run_id: Passed through ``maybe_decode`` and stored.
            name: Passed through ``maybe_decode`` and stored as ``action_name``.
            node: Stored as-is.
            bare_command: Passed through ``maybe_decode`` and stored.
            parent_context: Forwarded to ``command_context.build_context``.
            output_path: Path object that gets ``self.id`` appended; a new
                ``filehandler.OutputPath()`` is used when falsy.
            cleanup: Stored as ``is_cleanup``.
            run_state: Passed to ``Machine.from_machine`` when ``machine`` is
                falsy.
            action_runner: Stored as-is; ``NoActionRunnerFactory()`` when falsy.
            exit_statuses: Stored as-is; a new empty list when ``None``.
            machine: Used instead of building one from ``run_state`` when truthy.

        All remaining parameters are stored unchanged on attributes of the
        same name.
        """
        super().__init__()
        self.job_run_id = maybe_decode(job_run_id)
        self.action_name = maybe_decode(name)
        self.node = node
        self.start_time = start_time
        self.end_time = end_time
        self.exit_status = exit_status
        self.bare_command = maybe_decode(bare_command)
        self.rendered_command = rendered_command
        self.action_runner = action_runner or NoActionRunnerFactory()
        # An explicitly supplied machine takes precedence over run_state.
        self.machine = machine or Machine.from_machine(ActionRun.STATE_MACHINE,
                                                       None, run_state)
        self.is_cleanup = cleanup
        self.executor = executor
        self.cpus = cpus
        self.mem = mem
        self.constraints = constraints
        self.docker_image = docker_image
        self.docker_parameters = docker_parameters
        self.env = env
        self.extra_volumes = extra_volumes
        self.mesos_task_id = mesos_task_id
        self.output_path = output_path or filehandler.OutputPath()
        # NOTE: a caller-supplied output_path is modified in place here.
        self.output_path.append(self.id)
        # The context receives `self`, so it is built after the attributes above.
        self.context = command_context.build_context(self, parent_context)
        self.retries_remaining = retries_remaining
        self.retries_delay = retries_delay
        self.exit_statuses = exit_statuses
        self.trigger_downstreams = trigger_downstreams
        self.triggered_by = triggered_by
        self.on_upstream_rerun = on_upstream_rerun

        # Avoid sharing one list between runs: create a fresh one per instance.
        if self.exit_statuses is None:
            self.exit_statuses = []

        self.action_command = None
        self.in_delay = None
Exemplo n.º 5
0
class ActionRun(Observable):
    """Base class for tracking the state of a single run of an Action.

    ActionRun's state machine is observed by a parent JobRun.

    Subclasses must implement ``submit_command``, ``stop`` and ``kill``.
    ``is_<state>`` attributes and transition names (e.g. ``ready``, ``cancel``)
    are resolved dynamically by ``__getattr__`` from the state machine.
    """

    CANCELLED = 'cancelled'
    FAILED = 'failed'
    QUEUED = 'queued'
    RUNNING = 'running'
    SCHEDULED = 'scheduled'
    SKIPPED = 'skipped'
    STARTING = 'starting'
    SUCCEEDED = 'succeeded'
    UNKNOWN = 'unknown'

    # Transitions shared by several states: transition name -> target state.
    default_transitions = dict(fail=FAILED, success=SUCCEEDED)
    # Keyword names are source states; each value maps a transition name to
    # the state it leads to.
    STATE_MACHINE = Machine(
        SCHEDULED, **{
            CANCELLED:
            dict(skip=SKIPPED),
            FAILED:
            dict(skip=SKIPPED),
            RUNNING:
            dict(fail_unknown=UNKNOWN, **default_transitions),
            STARTING:
            dict(started=RUNNING, fail=FAILED),
            UNKNOWN:
            dict(running=RUNNING, **default_transitions),
            QUEUED:
            dict(
                cancel=CANCELLED,
                start=STARTING,
                schedule=SCHEDULED,
                **default_transitions,
            ),
            SCHEDULED:
            dict(
                ready=QUEUED,
                queue=QUEUED,
                cancel=CANCELLED,
                start=STARTING,
                **default_transitions,
            ),
        })

    # The set of states that are considered end states. Technically some of
    # these states can be manually transitioned to other states.
    END_STATES = {FAILED, SUCCEEDED, CANCELLED, SKIPPED, UNKNOWN}

    # Failed render command is false to ensure that it will fail when run
    FAILED_RENDER = 'false # Command failed to render correctly. See the Tron error log.'
    NOTIFY_TRIGGER_READY = 'trigger_ready'

    context_class = command_context.ActionRunContext

    # TODO: create a class for ActionRunId, JobRunId, Etc
    def __init__(
        self,
        job_run_id,
        name,
        node,
        bare_command=None,
        parent_context=None,
        output_path=None,
        cleanup=False,
        start_time=None,
        end_time=None,
        run_state=SCHEDULED,
        rendered_command=None,
        exit_status=None,
        action_runner=None,
        retries_remaining=None,
        retries_delay=None,
        exit_statuses=None,
        machine=None,
        executor=None,
        cpus=None,
        mem=None,
        constraints=None,
        docker_image=None,
        docker_parameters=None,
        env=None,
        extra_volumes=None,
        mesos_task_id=None,
        trigger_downstreams=None,
        triggered_by=None,
        on_upstream_rerun=None,
    ):
        """Store the run's identity, state machine, command and settings.

        ``job_run_id``, ``name`` and ``bare_command`` are passed through
        ``maybe_decode``. ``machine`` (when truthy) takes precedence over
        ``run_state``. ``output_path`` defaults to a new
        ``filehandler.OutputPath()`` and gets ``self.id`` appended.
        ``action_runner`` defaults to ``NoActionRunnerFactory()`` and
        ``exit_statuses`` to a new empty list. All other parameters are
        stored unchanged.
        """
        super().__init__()
        self.job_run_id = maybe_decode(job_run_id)
        self.action_name = maybe_decode(name)
        self.node = node
        self.start_time = start_time
        self.end_time = end_time
        self.exit_status = exit_status
        self.bare_command = maybe_decode(bare_command)
        self.rendered_command = rendered_command
        self.action_runner = action_runner or NoActionRunnerFactory()
        self.machine = machine or Machine.from_machine(ActionRun.STATE_MACHINE,
                                                       None, run_state)
        self.is_cleanup = cleanup
        self.executor = executor
        self.cpus = cpus
        self.mem = mem
        self.constraints = constraints
        self.docker_image = docker_image
        self.docker_parameters = docker_parameters
        self.env = env
        self.extra_volumes = extra_volumes
        self.mesos_task_id = mesos_task_id
        self.output_path = output_path or filehandler.OutputPath()
        # NOTE: a caller-supplied output_path is modified in place here.
        self.output_path.append(self.id)
        self.context = command_context.build_context(self, parent_context)
        self.retries_remaining = retries_remaining
        self.retries_delay = retries_delay
        self.exit_statuses = exit_statuses
        self.trigger_downstreams = trigger_downstreams
        self.triggered_by = triggered_by
        self.on_upstream_rerun = on_upstream_rerun

        if self.exit_statuses is None:
            self.exit_statuses = []

        self.action_command = None
        self.in_delay = None

    @property
    def state(self):
        """Current state of the underlying state machine."""
        return self.machine.state

    @property
    def id(self):
        """Dotted identifier: ``<job_run_id>.<action_name>``."""
        return f"{self.job_run_id}.{self.action_name}"

    @classmethod
    def from_state(
        cls,
        state_data,
        parent_context,
        output_path,
        job_run_node,
        cleanup=False,
    ):
        """Restore the state of this ActionRun from a serialized state.

        ``state_data`` is a dict in the shape produced by ``state_data``
        (or the older shape carrying a single dotted ``'id'``). A run restored
        as running is moved via ``fail_unknown``; a run restored as starting
        goes through ``_exit_unsuccessful``.
        """
        pool_repo = node.NodePoolRepository.get_instance()

        # Support state from older version
        if 'id' in state_data:
            job_run_id, action_name = state_data['id'].rsplit('.', 1)
        else:
            job_run_id = state_data['job_run_id']
            action_name = state_data['action_name']

        job_run_node = pool_repo.get_node(
            state_data.get('node_name'),
            job_run_node,
        )

        action_runner_data = state_data.get('action_runner')
        if action_runner_data:
            action_runner = SubprocessActionRunnerFactory(**action_runner_data)
        else:
            action_runner = NoActionRunnerFactory()

        rendered_command = state_data.get('rendered_command')
        run = cls(
            job_run_id=job_run_id,
            name=action_name,
            node=job_run_node,
            parent_context=parent_context,
            output_path=output_path,
            rendered_command=rendered_command,
            bare_command=state_data['command'],
            cleanup=cleanup,
            start_time=state_data['start_time'],
            end_time=state_data['end_time'],
            run_state=state_data['state'],
            exit_status=state_data.get('exit_status'),
            retries_remaining=state_data.get('retries_remaining'),
            retries_delay=state_data.get('retries_delay'),
            exit_statuses=state_data.get('exit_statuses'),
            action_runner=action_runner,
            executor=state_data.get('executor', ExecutorTypes.ssh),
            cpus=state_data.get('cpus'),
            mem=state_data.get('mem'),
            constraints=state_data.get('constraints'),
            docker_image=state_data.get('docker_image'),
            docker_parameters=state_data.get('docker_parameters'),
            env=state_data.get('env'),
            extra_volumes=state_data.get('extra_volumes'),
            mesos_task_id=state_data.get('mesos_task_id'),
            trigger_downstreams=state_data.get('trigger_downstreams'),
            triggered_by=state_data.get('triggered_by'),
            on_upstream_rerun=state_data.get('on_upstream_rerun'),
        )

        # Transition running to fail unknown because exit status was missed
        if run.is_running:
            run._done('fail_unknown')
        if run.is_starting:
            run._exit_unsuccessful(None)
        return run

    def start(self):
        """Start this ActionRun.

        Returns ``False`` when the ``start`` transition is not allowed, ``None``
        when the command failed to render (the run is failed with -1), and
        otherwise whatever ``submit_command`` returns.
        """
        if self.in_delay is not None:
            log.warning(f"{self} cancelling suspend timer")
            self.in_delay.cancel()
            self.in_delay = None

        if not self.machine.check('start'):
            return False

        if not self.exit_statuses:
            log.info(f"{self} starting")
        else:
            log.info(f"{self} restarting, retry {len(self.exit_statuses)}")

        self.start_time = timeutils.current_time()
        self.transition_and_notify('start')

        # Reading is_valid_command renders the command as a side effect.
        if not self.is_valid_command:
            log.error(f"{self} invalid command: {self.bare_command}")
            self.fail(-1)
            return

        return self.submit_command()

    def submit_command(self):
        """Hand the command to the executor. Must be provided by subclasses."""
        raise NotImplementedError()

    def stop(self):
        """Stop the run. Must be provided by subclasses."""
        raise NotImplementedError()

    def kill(self, final=True):
        """Kill the run. Must be provided by subclasses.

        ``retry`` calls this with ``final=False``.
        """
        raise NotImplementedError()

    def _done(self, target, exit_status=0):
        """Record completion and perform the ``target`` transition.

        Sets ``exit_status`` and ``end_time`` and clears event subscriptions
        when the run has triggers. Returns the result of
        ``transition_and_notify``, or ``None`` when the transition is not
        allowed from the current state.
        """
        if self.machine.check(target):
            if self.triggered_by:
                EventBus.clear_subscriptions(self.__hash__())
            self.exit_status = exit_status
            self.end_time = timeutils.current_time()
            # NOTE(review): this is logged before the transition happens, so
            # {self.state} is still the state being left.
            log.info(f"{self} completed with {target}, transitioned to "
                     f"{self.state}, exit status: {exit_status}")
            return self.transition_and_notify(target)
        else:
            log.debug(
                f"{self} cannot transition from {self.state} via {target}")

    def retry(self):
        """Invoked externally (via API) when action needs to be re-tried
        manually.

        Guarantees at least one retry is available, then either restarts a
        finished run or kills an unfinished one with ``final=False``.
        """
        if self.retries_remaining is None or self.retries_remaining <= 0:
            self.retries_remaining = 1

        if self.is_done:
            return self._exit_unsuccessful(self.exit_status)
        else:
            log.info(f"{self} getting killed for a retry")
            return self.kill(final=False)

    def start_after_delay(self):
        """Timer callback: reset the state machine and start again."""
        log.info(f"{self} resuming after retry delay")
        self.machine.reset()
        self.in_delay = None
        self.start()

    def restart(self):
        """Used by `fail` when action run has to be re-tried.

        With a ``retries_delay`` the restart is scheduled on the reactor and
        ``None`` is returned; otherwise the machine is reset and the result
        of ``start`` is returned.
        """
        if self.retries_delay is not None:
            # NOTE(review): if retries_delay is a datetime.timedelta, `.seconds`
            # drops whole days; confirm whether total_seconds() is intended.
            self.in_delay = reactor.callLater(self.retries_delay.seconds,
                                              self.start_after_delay)
            log.info(f"{self} delaying for a retry in {self.retries_delay}s")
        else:
            self.machine.reset()
            return self.start()

    def fail(self, exit_status=None):
        """Mark the run failed with ``exit_status``."""
        # Any non-zero retry budget is overwritten with -1; presumably so that
        # later retry checks see no budget left (verify against callers).
        if self.retries_remaining:
            self.retries_remaining = -1

        return self._done('fail', exit_status)

    def _exit_unsuccessful(self, exit_status=None):
        """Retry if retries remain, otherwise fail with ``exit_status``."""
        if self.retries_remaining is not None:
            if self.retries_remaining > 0:
                self.retries_remaining -= 1
                self.exit_statuses.append(exit_status)
                return self.restart()
            else:
                log.info(
                    "Reached maximum number of retries: "
                    f"{len(self.exit_statuses)}"
                )
        return self.fail(exit_status)

    def emit_triggers(self):
        """Publish this run's downstream triggers on the EventBus.

        A boolean ``trigger_downstreams`` publishes a single ``shortdate``
        trigger; a dict publishes one trigger per item with a rendered value.
        Any other type is logged as an error and nothing is published.
        """
        if isinstance(self.trigger_downstreams, bool):
            shortdate = self.render_template("{shortdate}")
            triggers = [f"shortdate.{shortdate}"]
        elif isinstance(self.trigger_downstreams, dict):
            triggers = [
                f"{k}.{self.render_template(v)}"
                for k, v in self.trigger_downstreams.items()
            ]
        else:
            log.error(f"{self} trigger_downstreams must be true or dict")
            return

        log.info(f"{self} publishing triggers: [{', '.join(triggers)}]")
        # Drop the last dotted component of job_run_id to get the job id.
        job_id = '.'.join(self.job_run_id.split('.')[:-1])
        for trigger in triggers:
            EventBus.publish(f"{job_id}.{self.action_name}.{trigger}")

    # TODO: subscribe for events and maintain a list of remaining triggers
    def remaining_triggers(self):
        """Return rendered ``triggered_by`` entries the EventBus has not seen."""
        return [
            trigger
            for trigger in map(self.render_template, self.triggered_by or [])
            if not EventBus.has_event(trigger)
        ]

    def success(self):
        """Emit downstream triggers (if configured) and mark the run succeeded."""
        if self.trigger_downstreams:
            self.emit_triggers()
        return self._done('success')

    def fail_unknown(self):
        """Failed with unknown reason."""
        log.warning(f"{self} lost communication")
        return self.transition_and_notify('fail_unknown')

    def cancel_delay(self):
        """Cancel a pending retry delay and fail the run with -3.

        Returns ``True`` when a delay was cancelled, ``None`` otherwise.
        """
        if self.in_delay is not None:
            self.in_delay.cancel()
            self.in_delay = None
            self.fail(-3)
            return True

    @property
    def state_data(self):
        """This data is used to serialize the state of this action run."""
        if isinstance(self.action_runner, NoActionRunnerFactory):
            action_runner = None
        else:
            action_runner = dict(
                status_path=self.action_runner.status_path,
                exec_path=self.action_runner.exec_path,
            )
        # Freeze command after it's run
        command = self.rendered_command or self.bare_command
        return {
            'job_run_id': self.job_run_id,
            'action_name': self.action_name,
            'state': self.state,
            'start_time': self.start_time,
            'end_time': self.end_time,
            'command': command,
            'rendered_command': self.rendered_command,
            'node_name': self.node.get_name() if self.node else None,
            'exit_status': self.exit_status,
            'retries_remaining': self.retries_remaining,
            'retries_delay': self.retries_delay,
            'exit_statuses': self.exit_statuses,
            'action_runner': action_runner,
            'executor': self.executor,
            'cpus': self.cpus,
            'mem': self.mem,
            'constraints': self.constraints,
            'docker_image': self.docker_image,
            'docker_parameters': self.docker_parameters,
            'env': self.env,
            'extra_volumes': self.extra_volumes,
            'mesos_task_id': self.mesos_task_id,
            'trigger_downstreams': self.trigger_downstreams,
            'triggered_by': self.triggered_by,
            'on_upstream_rerun': self.on_upstream_rerun,
        }

    def render_template(self, template):
        """Render an arbitrary template string using the command context."""
        return StringFormatter(self.context).format(template)

    def render_command(self):
        """Render our configured command using the command context."""
        return self.render_template(self.bare_command)

    @property
    def command(self):
        """The rendered command, rendering and caching it on first access.

        If rendering raises, the error is logged and ``FAILED_RENDER`` is
        cached and returned instead.
        """
        if self.rendered_command:
            return self.rendered_command
        try:
            self.rendered_command = self.render_command()
        except Exception as e:
            log.error(f"{self} failed rendering command: {e}")
            # Return a command string that will always fail
            self.rendered_command = self.FAILED_RENDER
        return self.rendered_command

    @property
    def is_valid_command(self):
        """Returns True if the bare_command was rendered without any errors.
        This has the side effect of actually rendering the bare_command.
        """
        return self.command != self.FAILED_RENDER

    @property
    def is_done(self):
        """True when the current state is one of ``END_STATES``."""
        return self.state in self.END_STATES

    @property
    def is_complete(self):
        """True when the run succeeded or was skipped."""
        return self.is_succeeded or self.is_skipped

    @property
    def is_broken(self):
        """True when the run failed, was cancelled, or is in the unknown state."""
        return self.is_failed or self.is_cancelled or self.is_unknown

    @property
    def is_active(self):
        """True when the run is starting or running."""
        return self.is_starting or self.is_running

    def cleanup(self):
        """Detach observers and event subscriptions, then cancel the run."""
        self.clear_observers()
        if self.triggered_by:
            EventBus.clear_subscriptions(self.__hash__())
        self.cancel()

    def setup_subscriptions(self):
        """Subscribe ``trigger_notify`` to every rendered ``triggered_by`` entry."""
        for trigger_pattern in self.triggered_by or []:
            trigger = self.render_template(trigger_pattern)
            EventBus.subscribe(trigger, self.__hash__(), self.trigger_notify)

    def trigger_notify(self, *_):
        """EventBus callback: notify observers once no triggers remain."""
        remaining = self.remaining_triggers()
        if not remaining:
            self.notify(ActionRun.NOTIFY_TRIGGER_READY)

    def __getattr__(self, name: str):
        """Support convenience properties for checking if this ActionRun is in
        a specific state (Ex: self.is_running would check if self.state is
        STATE_RUNNING) or for transitioning to a new state (ex: ready).

        Raises:
            AttributeError: for any other name, for ``is_<x>`` where ``<x>``
                is not a state, and for ``machine`` itself when it has not
                been set yet.
        """
        # __getattr__ only runs when normal lookup fails. If `machine` itself
        # is missing (instance built without __init__, or mid copy/unpickle),
        # reading self.machine below would re-enter this method endlessly.
        if name == 'machine':
            raise AttributeError(name)

        if name in self.machine.transition_names:
            return lambda: self.transition_and_notify(name)

        if name.startswith('is_'):
            # Strip only the prefix; str.replace would also remove any later
            # 'is_' occurrences inside the state name.
            state_name = name[len('is_'):]
            if state_name not in self.machine.states:
                raise AttributeError(f"{name} is not a state")
            return self.state == state_name

        raise AttributeError(name)

    def __str__(self):
        return f"ActionRun: {self.id}"

    def transition_and_notify(self, target):
        """Perform ``target`` and notify observers with the new state.

        Returns ``True`` when the transition happened, ``None`` otherwise.
        """
        if self.machine.transition(target):
            self.notify(self.state)
            return True
Exemplo n.º 6
0
class ActionRun(Observable):
    """Base class for tracking the state of a single run of an Action.

    ActionRun's state machine is observed by a parent JobRun.
    """

    # State names used by the state machine below.
    CANCELLED = 'cancelled'
    FAILED = 'failed'
    QUEUED = 'queued'
    RUNNING = 'running'
    SCHEDULED = 'scheduled'
    SKIPPED = 'skipped'
    STARTING = 'starting'
    SUCCEEDED = 'succeeded'
    WAITING = 'waiting'
    UNKNOWN = 'unknown'

    # Transitions shared by several states: transition name -> target state.
    default_transitions = dict(fail=FAILED, success=SUCCEEDED)
    # Keyword names are source states; each value maps a transition name to
    # the state it leads to. SCHEDULED is passed as the first argument.
    STATE_MACHINE = Machine(
        SCHEDULED,
        **{
            CANCELLED:
                dict(skip=SKIPPED),
            FAILED:
                dict(skip=SKIPPED),
            RUNNING:
                dict(
                    cancel=CANCELLED,
                    fail_unknown=UNKNOWN,
                    **default_transitions,
                ),
            STARTING:
                dict(
                    started=RUNNING,
                    fail=FAILED,
                    fail_unknown=UNKNOWN,
                    cancel=CANCELLED,
                ),
            UNKNOWN:
                dict(
                    running=RUNNING,
                    fail_unknown=UNKNOWN,
                    **default_transitions
                ),
            WAITING:
                dict(
                    cancel=CANCELLED,
                    start=STARTING,
                    **default_transitions,
                ),
            QUEUED:
                dict(
                    ready=WAITING,
                    cancel=CANCELLED,
                    start=STARTING,
                    schedule=SCHEDULED,
                    **default_transitions,
                ),
            SCHEDULED:
                dict(
                    ready=WAITING,
                    queue=QUEUED,
                    cancel=CANCELLED,
                    start=STARTING,
                    **default_transitions,
                ),
        }
    )

    # The set of states that are considered end states. Technically some of
    # these states can be manually transitioned to other states.
    END_STATES = {FAILED, SUCCEEDED, CANCELLED, SKIPPED, UNKNOWN}

    # Failed render command is false to ensure that it will fail when run
    FAILED_RENDER = 'false # Command failed to render correctly. See the Tron error log.'
    # Event name for trigger readiness.
    NOTIFY_TRIGGER_READY = 'trigger_ready'

    # Negative exit statuses assigned by this code itself; human-readable
    # descriptions are in EXIT_REASONS below.
    EXIT_INVALID_COMMAND = -1
    EXIT_NODE_ERROR = -2
    EXIT_STOP_KILL = -3
    EXIT_TRIGGER_TIMEOUT = -4
    EXIT_MESOS_DISABLED = -5

    EXIT_REASONS = {
        EXIT_INVALID_COMMAND: 'Invalid command',
        EXIT_NODE_ERROR: 'Node error',
        EXIT_STOP_KILL: 'Stopped or killed',
        EXIT_TRIGGER_TIMEOUT: 'Timed out waiting for trigger',
        EXIT_MESOS_DISABLED: 'Mesos disabled',
    }

    # This is a list of "alternate locations" that we can look for stdout/stderr in
    # The PR in question is https://github.com/Yelp/Tron/pull/735/files, which changed
    # the format of the stdout/stderr paths
    # Each entry is a template with {namespace}, {jobname}, {run_num} and
    # {action} placeholders.
    STDOUT_PATHS = [
        os.path.join(
            '{namespace}.{jobname}',
            '{namespace}.{jobname}.{run_num}',
            '{namespace}.{jobname}.{run_num}.{action}',
        ),  # old style paths (pre-#735 PR)
        os.path.join(
            '{namespace}.{jobname}',
            '{namespace}.{jobname}.{run_num}',
            '{namespace}.{jobname}.{run_num}.{action}',
            '{namespace}.{jobname}.{run_num}.recovery-{namespace}.{jobname}.{run_num}.{action}',
        ),  # old style recovery paths (pre-#735 PR)
        os.path.join(
            '{namespace}',
            '{jobname}',
            '{run_num}',
            '{action}-recovery',
        ),  # new style recovery paths (post-#735 PR)
    ]

    context_class = command_context.ActionRunContext

    # TODO: create a class for ActionRunId, JobRunId, Etc
    def __init__(
        self,
        job_run_id,
        name,
        node,
        command_config,
        parent_context=None,
        output_path=None,
        cleanup=False,
        start_time=None,
        end_time=None,
        run_state=SCHEDULED,
        exit_status=None,
        attempts=None,
        action_runner=None,
        retries_remaining=None,
        retries_delay=None,
        machine=None,
        executor=None,
        trigger_downstreams=None,
        triggered_by=None,
        on_upstream_rerun=None,
        trigger_timeout_timestamp=None,
        original_command=None,
    ):
        """Store the run's identity, state machine, command and retry/trigger settings.

        Args:
            job_run_id: Passed through ``maybe_decode`` and stored.
            name: Passed through ``maybe_decode`` and stored as ``action_name``.
            command_config: Stored as-is; its ``command`` attribute is the
                fallback for ``original_command``.
            parent_context: Forwarded to ``command_context.build_context``.
            output_path: Path object that gets the action name appended; a new
                ``filehandler.OutputPath()`` is used when falsy.
            cleanup: Stored as ``is_cleanup``.
            run_state: Passed to ``Machine.from_machine`` when ``machine`` is
                falsy.
            attempts: Stored as-is; a new empty list when falsy.
            action_runner: Stored as-is; ``NoActionRunnerFactory()`` when falsy.
            machine: Used instead of building one from ``run_state`` when truthy.
            original_command: Stored as-is; ``command_config.command`` when
                falsy.

        All remaining parameters are stored unchanged on attributes of the
        same name.
        """
        super().__init__()
        self.job_run_id = maybe_decode(job_run_id)
        self.action_name = maybe_decode(name)
        self.node = node
        self.start_time = start_time
        self.end_time = end_time
        self.exit_status = exit_status
        self.action_runner = action_runner or NoActionRunnerFactory()
        # An explicitly supplied machine takes precedence over run_state.
        self.machine = machine or Machine.from_machine(
            ActionRun.STATE_MACHINE, None, run_state
        )
        self.is_cleanup = cleanup

        self.executor = executor
        self.command_config = command_config
        self.original_command = original_command or command_config.command
        self.attempts = attempts or []
        self.output_path = output_path or filehandler.OutputPath()
        # NOTE: a caller-supplied output_path is modified in place here.
        self.output_path.append(self.action_name)
        # The context receives `self`, so it is built after the attributes above.
        self.context = command_context.build_context(self, parent_context)
        self.retries_remaining = retries_remaining
        self.retries_delay = retries_delay
        self.trigger_downstreams = trigger_downstreams
        self.triggered_by = triggered_by
        self.on_upstream_rerun = on_upstream_rerun
        self.trigger_timeout_timestamp = trigger_timeout_timestamp
        self.trigger_timeout_call = None

        self.action_command = None
        self.in_delay = None

    @property
    def state(self):
        """Current state of the underlying state machine."""
        return self.machine.state

    @property
    def id(self):
        """Dotted identifier: ``<job_run_id>.<action_name>``."""
        return f"{self.job_run_id}.{self.action_name}"

    @property
    def name(self):
        """Alias for ``action_name``."""
        return self.action_name

    @property
    def last_attempt(self):
        """The most recent attempt, or ``None`` when there are no attempts."""
        return self.attempts[-1] if self.attempts else None

    @property
    def exit_statuses(self):
        """Exit statuses of every attempt that has a truthy ``end_time``."""
        if not self.attempts:
            return []
        finished = (attempt for attempt in self.attempts if attempt.end_time)
        return [attempt.exit_status for attempt in finished]

    @property
    def command(self):
        """Display command of the latest attempt, else the configured command."""
        if not self.attempts:
            return self.command_config.command
        return self.attempts[-1].display_command

    @property
    def rendered_command(self):
        """Rendered command of the latest attempt, or ``None`` without attempts."""
        return self.attempts[-1].rendered_command if self.attempts else None

    @classmethod
    def attempts_from_state(cls, state_data, command_config):
        """Build the list of attempts for a serialized action run.

        When ``state_data`` has an ``'attempts'`` key, each entry is restored
        with ``ActionRunAttempt.from_state``. Otherwise attempts are
        synthesized from the older flat fields: one per recorded exit status,
        plus one for the final try if the run had started. Synthesized
        attempts carry ``'unknown'`` start/end times and share the same
        rendered command; the last one receives the stored ``mesos_task_id``.

        Args:
            state_data: Serialized state dict of the action run.
            command_config: Command config attached to synthesized attempts.

        Returns:
            A list of ``ActionRunAttempt`` (possibly empty).
        """
        if 'attempts' in state_data:
            return [
                ActionRunAttempt.from_state(a) for a in state_data['attempts']
            ]

        rendered_command = maybe_decode(state_data.get('rendered_command'))
        # `or []` also covers a key that is present but stored as None, which
        # `.get(key, [])` would pass through and then crash on below.
        exit_statuses = list(state_data.get('exit_statuses') or [])
        # If the action has started, add an attempt for the final try
        if state_data.get('start_time'):
            exit_statuses.append(state_data.get('exit_status'))

        attempts = [
            ActionRunAttempt(
                command_config=command_config,
                rendered_command=rendered_command,
                exit_status=exit_status,
                start_time='unknown',
                end_time='unknown',
            )
            for exit_status in exit_statuses
        ]
        if attempts:
            attempts[-1].mesos_task_id = state_data.get('mesos_task_id')
        return attempts

    @classmethod
    def from_state(
        cls,
        state_data,
        parent_context,
        output_path,
        job_run_node,
        action_graph,
        cleanup=False,
    ):
        """Rebuild an ActionRun from its serialized ``state_data`` dict.

        The command config is looked up by action name in
        ``action_graph.action_map``; when the action is not found there, a
        config with an empty command is used. A run restored in an active
        state is moved via ``fail_unknown``.
        """
        repository = node.NodePoolRepository.get_instance()

        # Older serialized state stored a single dotted 'id'.
        if 'id' not in state_data:
            job_run_id = state_data['job_run_id']
            action_name = state_data['action_name']
        else:
            job_run_id, action_name = state_data['id'].rsplit('.', 1)

        restored_node = repository.get_node(
            state_data.get('node_name'),
            job_run_node,
        )

        runner_kwargs = state_data.get('action_runner')
        action_runner = (
            SubprocessActionRunnerFactory(**runner_kwargs)
            if runner_kwargs else NoActionRunnerFactory()
        )

        configured_action = action_graph.action_map.get(action_name)
        command_config = (
            configured_action.command_config
            if configured_action else action.ActionCommandConfig(command='')
        )

        restored_attempts = cls.attempts_from_state(state_data, command_config)
        run = cls(
            job_run_id=job_run_id,
            name=action_name,
            node=restored_node,
            parent_context=parent_context,
            output_path=output_path,
            command_config=command_config,
            original_command=state_data.get('original_command'),
            cleanup=cleanup,
            start_time=state_data['start_time'],
            end_time=state_data['end_time'],
            run_state=state_data['state'],
            exit_status=state_data.get('exit_status'),
            attempts=restored_attempts,
            retries_remaining=state_data.get('retries_remaining'),
            retries_delay=state_data.get('retries_delay'),
            action_runner=action_runner,
            executor=state_data.get('executor', ExecutorTypes.ssh.value),
            trigger_downstreams=state_data.get('trigger_downstreams'),
            triggered_by=state_data.get('triggered_by'),
            on_upstream_rerun=state_data.get('on_upstream_rerun'),
            trigger_timeout_timestamp=state_data.get('trigger_timeout_timestamp'),
        )

        # Transition running to fail unknown because exit status was missed
        # Recovery will look for unknown runs
        if run.is_active:
            run.transition_and_notify('fail_unknown')
        return run

    def start(self, original_command=True):
        """Start this ActionRun.

        Cancels any pending retry delay, records a new attempt, moves the
        state machine through ``start`` and submits the attempt.

        Args:
            original_command: Forwarded to ``create_attempt``.

        Returns:
            ``False`` when the ``start`` transition is not allowed, ``None``
            when the run was failed because its command is missing or
            invalid, otherwise the result of ``submit_command``.
        """
        if self.in_delay is not None:
            log.warning(f"{self} cancelling suspend timer")
            self.in_delay.cancel()
            self.in_delay = None

        if not self.machine.check('start'):
            return False

        if not self.attempts:
            log.info(f"{self} starting")
        else:
            log.info(f"{self} restarting, retry {len(self.attempts)}")

        new_attempt = self.create_attempt(original_command=original_command)
        self.start_time = new_attempt.start_time
        self.transition_and_notify('start')

        if not self.command_config.command:
            log.error(f"{self} no longer configured in tronfig, cannot run")
            self.fail(self.EXIT_INVALID_COMMAND)
            # Stop here: without this return, execution fell through and could
            # still submit the attempt of a run that was just failed.
            return

        if not self.is_valid_command(new_attempt.rendered_command):
            log.error(f"{self} invalid command: {new_attempt.command_config.command}")
            self.fail(self.EXIT_INVALID_COMMAND)
            return

        return self.submit_command(new_attempt)

    def create_attempt(self, original_command=True):
        """Build a new ActionRunAttempt, record it in ``self.attempts`` and
        return it.

        The attempt works on a copy of ``self.command_config``. When
        ``original_command`` is true the copy's command is replaced by
        ``self.original_command`` before it is rendered.
        """
        started_at = timeutils.current_time()
        attempt_config = self.command_config.copy()
        if original_command:
            attempt_config.command = self.original_command

        attempt = ActionRunAttempt(
            command_config=attempt_config,
            start_time=started_at,
            rendered_command=self.render_command(attempt_config.command),
        )
        self.attempts.append(attempt)
        return attempt

    def submit_command(self, attempt):
        """Submit ``attempt`` for execution; must be overridden.

        ``start`` calls this with the attempt it has just created and returns
        the result. This base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def stop(self):
        """Hook for stopping the run; must be overridden.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def kill(self, final=True):
        """Hook for killing the run; must be overridden.

        ``retry`` calls ``kill(final=False)`` on a run that is not done so
        that it can be retried. This base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def recover(self):
        """Hook for recovering the run; must be overridden.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def _done(self, target, exit_status=0):
        """Finish this run through the ``target`` transition.

        When the state machine allows ``target``: trigger subscriptions and
        the trigger timeout are cleared, ``exit_status`` and ``end_time`` are
        recorded, a still-open last attempt is closed with the same values,
        and the transition is performed.

        Returns:
            The result of ``transition_and_notify(target)``, or ``None`` when
            the transition is not allowed from the current state.
        """
        if not self.machine.check(target):
            log.debug(
                f"{self} cannot transition from {self.state} via {target}"
            )
            return None

        if self.triggered_by:
            EventBus.clear_subscriptions(self.__hash__())
        self.clear_trigger_timeout()

        self.exit_status = exit_status
        self.end_time = timeutils.current_time()

        attempt = self.last_attempt
        if attempt is not None and attempt.end_time is None:
            attempt.exit(exit_status, self.end_time)

        # NOTE: this is logged before the transition runs, so {self.state}
        # is still the state the run is leaving.
        log.info(
            f"{self} completed with {target}, transitioned to "
            f"{self.state}, exit status: {exit_status}"
        )
        return self.transition_and_notify(target)

    def retry(self, original_command=True):
        """Manually retry this action run (invoked externally, via the API).

        Guarantees at least one retry is available and drops any retry delay
        so the retry starts as soon as possible. A finished run is reset and
        sent through ``_exit_unsuccessful`` with its recorded exit status; a
        run that is not finished is killed with ``final=False``.
        """
        retries_left = self.retries_remaining
        if retries_left is None or retries_left <= 0:
            self.retries_remaining = 1

        pending_delay = self.in_delay
        if pending_delay is not None:
            pending_delay.cancel()
            self.in_delay = None
        self.retries_delay = None

        if not self.is_done:
            log.info(f"{self} getting killed for a retry")
            return self.kill(final=False)

        self.machine.reset()
        return self._exit_unsuccessful(
            self.exit_status,
            retry_original_command=original_command,
        )

    def start_after_delay(self):
        """Resume the run once the retry delay has elapsed.

        ``restart`` schedules this through ``reactor.callLater``. It resets
        the state machine, forgets the delay handle and calls ``start`` with
        its default arguments.
        """
        log.info(f"{self} resuming after retry delay")
        self.machine.reset()
        self.in_delay = None
        self.start()

    def restart(self, original_command=True):
        """Start another attempt; called by ``_exit_unsuccessful`` when a
        retry is still available.

        Without a retry delay the state machine is reset and ``start`` is
        called immediately, returning its result. With a delay,
        ``start_after_delay`` is scheduled on the reactor, the handle is kept
        in ``self.in_delay`` and ``True`` is returned.
        """
        delay = self.retries_delay
        if delay is None:
            self.machine.reset()
            return self.start(original_command=original_command)

        self.in_delay = reactor.callLater(
            delay.total_seconds(),
            self.start_after_delay,
        )
        log.info(f"{self} delaying for a retry in {self.retries_delay}s")
        return True

    def fail(self, exit_status=None):
        """Mark this run as failed with ``exit_status``.

        A truthy ``retries_remaining`` is overwritten with -1 first
        (presumably so that no automatic retry follows). Returns the result
        of ``_done('fail', exit_status)``.
        """
        if self.retries_remaining:
            self.retries_remaining = -1

        return self._done('fail', exit_status)

    def _exit_unsuccessful(self, exit_status=None, retry_original_command=True):
        """Handle an unsuccessful exit: retry if possible, otherwise fail.

        Nothing happens when the run is already in a terminal state.
        Otherwise the last attempt is closed with ``exit_status``. If retries
        remain, one is consumed and ``restart`` is returned. If not, the run
        finishes as ``fail_unknown`` (no exit status) or ``fail``.
        """
        if self.is_done:
            log.info(
                f'{self} got exit code {exit_status} but already in terminal '
                f'state "{self.state}", not retrying',
            )
            return

        attempt = self.last_attempt
        if attempt is not None:
            attempt.exit(exit_status)

        retries_left = self.retries_remaining
        if retries_left is not None and retries_left > 0:
            self.retries_remaining = retries_left - 1
            return self.restart(original_command=retry_original_command)

        if retries_left is not None:
            # Retries were configured but are now used up.
            log.info(
                "Reached maximum number of retries: {}".format(
                    len(self.attempts),
                )
            )

        target = 'fail_unknown' if exit_status is None else 'fail'
        return self._done(target, exit_status)

    def triggers_to_emit(self) -> List[str]:
        """Return the rendered trigger names this run should publish.

        ``trigger_downstreams`` may be:
          * falsy -- nothing is emitted;
          * a bool -- the single template ``shortdate.{shortdate}`` is used;
          * a dict -- one ``<key>.<value>`` template per item.

        Any other type is logged as an error and yields an empty list.
        Each template is rendered with ``render_template``.
        """
        downstreams = self.trigger_downstreams
        if not downstreams:
            return []

        if isinstance(downstreams, bool):
            templates = ["shortdate.{shortdate}"]
        elif isinstance(downstreams, dict):
            templates = [f"{k}.{v}" for k, v in downstreams.items()]
        else:
            log.error(f"{self} trigger_downstreams must be true or dict")
            # No templates can be derived from an unsupported value; without
            # this return the code below would hit an unbound local.
            return []

        return [self.render_template(trig) for trig in templates]

    def emit_triggers(self):
        """Publish every rendered downstream trigger on the EventBus.

        Events are named ``<job id>.<action name>.<trigger>``, where the job
        id is ``job_run_id`` without its last dot-separated component. Does
        nothing when ``triggers_to_emit`` returns an empty list.
        """
        triggers = self.triggers_to_emit()
        if triggers:
            log.info(f"{self} publishing triggers: [{', '.join(triggers)}]")
            # Everything before the final '.' of the job run id.
            job_id = self.job_run_id.rpartition('.')[0]
            for trigger in triggers:
                EventBus.publish(f"{job_id}.{self.action_name}.{trigger}")

    # TODO: cache if safe
    @property
    def rendered_triggers(self) -> List[str]:
        """The ``triggered_by`` templates rendered with ``render_template``.

        An unset or empty ``triggered_by`` yields an empty list.
        """
        templates = self.triggered_by or []
        return list(map(self.render_template, templates))

    # TODO: subscribe for events and maintain a list of remaining triggers
    @property
    def remaining_triggers(self):
        """Rendered triggers for which the EventBus has no event yet."""
        outstanding = []
        for rendered in self.rendered_triggers:
            if EventBus.has_event(rendered):
                continue
            outstanding.append(rendered)
        return outstanding

    def success(self):
        """Finish this run successfully.

        Downstream triggers are emitted first when ``trigger_downstreams`` is
        set. Returns the result of ``_done('success')``.
        """
        if self.trigger_downstreams:
            self.emit_triggers()
        return self._done('success')

    def fail_unknown(self):
        """Finish this run as failed for an unknown reason.

        Logs a warning and returns the result of
        ``_done('fail_unknown', None)``, i.e. no exit status is recorded.
        """
        log.warning(f"{self} failed with no exit code")
        return self._done('fail_unknown', None)

    def cancel_delay(self):
        """Cancel a pending retry delay and fail the run.

        Returns:
            ``True`` when a delay was pending (it is cancelled and the run
            is failed with ``EXIT_STOP_KILL``), ``None`` otherwise.
        """
        pending = self.in_delay
        if pending is None:
            return None

        pending.cancel()
        self.in_delay = None
        self.fail(self.EXIT_STOP_KILL)
        return True

    @property
    def state_data(self):
        """Dictionary used to serialize the state of this action run.

        ``action_runner`` is ``None`` for a NoActionRunnerFactory, otherwise
        a dict holding the runner's ``status_path`` and ``exec_path``.
        """
        runner = self.action_runner
        if isinstance(runner, NoActionRunnerFactory):
            runner_data = None
        else:
            runner_data = {
                'status_path': runner.status_path,
                'exec_path': runner.exec_path,
            }

        node_name = self.node.get_name() if self.node else None
        attempts_data = [attempt.state_data for attempt in self.attempts]

        return dict(
            job_run_id=self.job_run_id,
            action_name=self.action_name,
            state=self.state,
            original_command=self.original_command,
            start_time=self.start_time,
            end_time=self.end_time,
            node_name=node_name,
            exit_status=self.exit_status,
            attempts=attempts_data,
            retries_remaining=self.retries_remaining,
            retries_delay=self.retries_delay,
            action_runner=runner_data,
            executor=self.executor,
            trigger_downstreams=self.trigger_downstreams,
            triggered_by=self.triggered_by,
            on_upstream_rerun=self.on_upstream_rerun,
            trigger_timeout_timestamp=self.trigger_timeout_timestamp,
        )

    def render_template(self, template):
        """Format ``template`` with a StringFormatter built on this run's
        command context and return the result.

        Errors raised while formatting are not handled here; see
        ``render_command`` for the variant that catches them.
        """
        return StringFormatter(self.context).format(template)

    def render_command(self, command):
        """Render ``command`` using the command context.

        Any exception raised while rendering is logged and replaced by the
        ``FAILED_RENDER`` placeholder, which ``is_valid_command`` rejects.
        """
        try:
            rendered = self.render_template(command)
        except Exception as e:
            log.error(f"{self} failed rendering command: {e}")
            # Hand back a command string that will always fail
            return self.FAILED_RENDER
        else:
            return rendered

    def is_valid_command(self, command):
        """Return True unless ``command`` equals ``FAILED_RENDER``, the
        placeholder ``render_command`` returns when rendering fails.
        """
        return command != self.FAILED_RENDER

    @property
    def is_done(self):
        """True when the current state is one of ``END_STATES``."""
        return self.state in self.END_STATES

    @property
    def is_complete(self):
        """True when the run is succeeded or skipped.

        ``is_succeeded`` / ``is_skipped`` are not defined in this section;
        presumably they are the ``is_<state>`` checks resolved by
        ``__getattr__``.
        """
        return self.is_succeeded or self.is_skipped

    @property
    def is_broken(self):
        """True when the run is failed, cancelled or unknown.

        The individual checks are presumably the ``is_<state>`` lookups
        resolved by ``__getattr__``.
        """
        return self.is_failed or self.is_cancelled or self.is_unknown

    @property
    def is_active(self):
        """True when the run is starting or running.

        The individual checks are presumably the ``is_<state>`` lookups
        resolved by ``__getattr__``.
        """
        return self.is_starting or self.is_running

    def cleanup(self):
        """Detach this run from observers, triggers and timers, then cancel.

        Clears observers, drops EventBus subscriptions registered under this
        object's hash (only when ``triggered_by`` is set), cancels a pending
        trigger timeout and finally calls ``self.cancel()`` -- presumably
        the ``cancel`` state transition resolved by ``__getattr__``.
        """
        self.clear_observers()
        if self.triggered_by:
            EventBus.clear_subscriptions(self.__hash__())
        self.clear_trigger_timeout()
        self.cancel()

    def clear_trigger_timeout(self):
        """Cancel the scheduled trigger timeout, if any, and forget it."""
        pending = self.trigger_timeout_call
        if not pending:
            return
        pending.cancel()
        self.trigger_timeout_call = None

    def setup_subscriptions(self):
        """Subscribe to every trigger this run is still waiting for.

        Does nothing when no triggers remain. Otherwise a timeout call to
        ``trigger_timeout_reached`` is scheduled (at least one second from
        now) when ``trigger_timeout_timestamp`` is set, or an error is logged
        when it is not. Then ``trigger_notify`` is subscribed to each
        remaining trigger under this object's hash.
        """
        pending = self.remaining_triggers
        if not pending:
            return

        deadline = self.trigger_timeout_timestamp
        if not deadline:
            log.error(f"{self} has no trigger_timeout_timestamp")
        else:
            now = timeutils.current_time().timestamp()
            self.trigger_timeout_call = reactor.callLater(
                max(deadline - now, 1),
                self.trigger_timeout_reached,
            )

        subscriber_id = self.__hash__()
        for trigger in pending:
            EventBus.subscribe(trigger, subscriber_id, self.trigger_notify)

    def trigger_timeout_reached(self):
        """Timer callback fired when the trigger timeout expires.

        ``setup_subscriptions`` schedules this method with
        ``reactor.callLater``. If triggers are still outstanding the run is
        failed with ``EXIT_TRIGGER_TIMEOUT``; otherwise observers are told
        the triggers are ready.
        """
        # The delayed call that invoked us has already fired, so drop the
        # handle on both paths. Otherwise a later clear_trigger_timeout()
        # would call cancel() on a call that has already run.
        # NOTE(review): assumes this method is only invoked by that timer.
        self.trigger_timeout_call = None

        if self.remaining_triggers:
            log.warning(
                f"{self} reached timeout waiting for: {self.remaining_triggers}"
            )
            self.fail(self.EXIT_TRIGGER_TIMEOUT)
        else:
            self.notify(ActionRun.NOTIFY_TRIGGER_READY)

    def trigger_notify(self, *_):
        """EventBus callback: once no triggers remain, cancel the timeout
        and notify observers that the triggers are ready.

        Positional arguments passed by the caller are ignored.
        """
        if self.remaining_triggers:
            return
        self.clear_trigger_timeout()
        self.notify(ActionRun.NOTIFY_TRIGGER_READY)

    @property
    def is_blocked_on_trigger(self):
        """True when the run is not done and still has triggers remaining."""
        return not self.is_done and bool(self.remaining_triggers)

    def clear_end_state(self):
        """Reset exit status and end time on the run and, when there is
        one, on its last attempt.
        """
        self.exit_status = self.end_time = None

        attempt = self.last_attempt
        if not attempt:
            return
        attempt.exit_status = attempt.end_time = None

    def __getattr__(self, name: str):
        """Resolve convenience attributes that are not real attributes.

        * A name in ``self.machine.transition_names`` returns a zero-argument
          callable performing ``transition_and_notify(name)`` (ex: ready).
        * ``is_<state>`` returns whether ``self.state`` equals ``<state>``
          (ex: self.is_running).

        Raises:
            AttributeError: for any other name, for ``is_<x>`` where ``<x>``
                is not a state of the machine, and for ``machine`` itself
                when it has not been set.
        """
        # __getattr__ only runs after normal lookup failed, so reaching this
        # point with 'machine' means it is not set (e.g. on an instance whose
        # __init__ has not run). Touching self.machine below would re-enter
        # this method without end.
        if name == 'machine':
            raise AttributeError(name)

        if name in self.machine.transition_names:
            return lambda: self.transition_and_notify(name)

        if name.startswith('is_'):
            # Strip only the leading prefix; str.replace would also remove
            # any later 'is_' inside the name.
            state_name = name[len('is_'):]
            if state_name not in self.machine.states:
                raise AttributeError(f"{name} is not a state")
            return self.state == state_name

        raise AttributeError(name)

    def __str__(self):
        """Return ``"ActionRun: <id>"`` using this run's ``id``."""
        return f"ActionRun: {self.id}"

    def transition_and_notify(self, target):
        """Perform the ``target`` transition and notify observers of the
        resulting state.

        Returns:
            ``True`` when the transition happened, ``None`` otherwise (in
            which case nobody is notified).
        """
        if not self.machine.transition(target):
            return None
        self.notify(self.state)
        return True
Exemplo n.º 7
0
    def __init__(
        self,
        job_run_id,
        name,
        node,
        bare_command=None,
        parent_context=None,
        output_path=None,
        cleanup=False,
        start_time=None,
        end_time=None,
        run_state=SCHEDULED,
        rendered_command=None,
        exit_status=None,
        action_runner=None,
        retries_remaining=None,
        retries_delay=None,
        exit_statuses=None,
        machine=None,
        executor=None,
        cpus=None,
        mem=None,
        disk=None,
        constraints=None,
        docker_image=None,
        docker_parameters=None,
        env=None,
        extra_volumes=None,
        mesos_task_id=None,
        trigger_downstreams=None,
        triggered_by=None,
        on_upstream_rerun=None,
        trigger_timeout_timestamp=None,
    ):
        """Initialise the run from its identity, commands, resource settings
        and retry/trigger configuration.

        Most arguments are stored unchanged under the same attribute name.
        The exceptions visible here:

        * ``job_run_id``, ``name`` and ``bare_command`` pass through
          ``maybe_decode``; ``name`` is stored as ``action_name``.
        * ``cleanup`` is stored as ``is_cleanup``.
        * ``action_runner`` falls back to a ``NoActionRunnerFactory()``.
        * ``machine`` falls back to a machine derived from
          ``ActionRun.STATE_MACHINE`` using ``run_state``.
        * ``output_path`` falls back to a new ``filehandler.OutputPath()``
          and ``self.id`` is appended to it.
        * ``exit_statuses`` of ``None`` becomes an empty list.
        * ``parent_context`` is only used to build ``self.context``.
        """
        super().__init__()
        self.job_run_id = maybe_decode(job_run_id)
        self.action_name = maybe_decode(name)
        self.node = node
        self.start_time = start_time
        self.end_time = end_time
        self.exit_status = exit_status
        self.bare_command = maybe_decode(bare_command)
        self.rendered_command = rendered_command
        self.action_runner = action_runner or NoActionRunnerFactory()
        # run_state is presumably the state the new machine starts in.
        self.machine = machine or Machine.from_machine(
            ActionRun.STATE_MACHINE, None, run_state
        )
        self.is_cleanup = cleanup
        self.executor = executor
        self.cpus = cpus
        self.mem = mem
        self.disk = disk
        self.constraints = constraints
        self.docker_image = docker_image
        self.docker_parameters = docker_parameters
        self.env = env
        self.extra_volumes = extra_volumes
        self.mesos_task_id = mesos_task_id
        self.output_path = output_path or filehandler.OutputPath()
        # self.id is not assigned in this method; it is defined elsewhere on
        # the class. A caller-supplied output_path is presumably modified in
        # place by this append -- TODO confirm.
        self.output_path.append(self.id)
        # build_context receives the partially initialised instance, so it
        # must stay after the attributes set above.
        self.context = command_context.build_context(self, parent_context)
        self.retries_remaining = retries_remaining
        self.retries_delay = retries_delay
        self.exit_statuses = exit_statuses
        self.trigger_downstreams = trigger_downstreams
        self.triggered_by = triggered_by
        self.on_upstream_rerun = on_upstream_rerun
        self.trigger_timeout_timestamp = trigger_timeout_timestamp
        # Handle of the scheduled trigger-timeout call; none scheduled yet.
        self.trigger_timeout_call = None

        if self.exit_statuses is None:
            self.exit_statuses = []

        # Both start out unset; nothing in this method fills them in.
        self.action_command = None
        self.in_delay = None
Exemplo n.º 8
0
class ActionCommand(Observable):
    """A runnable task that is handed to a node for execution.

    The node reports progress by calling:
      started          when the command starts
      exited           when the command exits
      write_<channel>  when output is received
      done             when the command is finished
    """

    PENDING = 'pending'
    RUNNING = 'running'
    EXITING = 'exiting'
    COMPLETE = 'complete'
    FAILSTART = 'failstart'

    # For each state: the transitions available from it and where they lead.
    STATE_MACHINE = Machine(
        PENDING,
        **{
            PENDING: {'start': RUNNING, 'exit': FAILSTART},
            RUNNING: {'exit': EXITING},
            EXITING: {'close': COMPLETE},
        }
    )

    STDOUT = '.stdout'
    STDERR = '.stderr'

    def __init__(self, id, command, serializer=None):
        """Create a pending command.

        Output goes to handles opened from ``serializer`` when one is given,
        otherwise to ``filehandler.NullFileHandle``.
        """
        super().__init__()
        self.id = id
        self.command = command
        self.machine = Machine.from_machine(ActionCommand.STATE_MACHINE)
        self.exit_status = None
        self.start_time = None
        self.end_time = None
        if not serializer:
            self.stdout = filehandler.NullFileHandle
            self.stderr = filehandler.NullFileHandle
        else:
            self.stdout = serializer.open(self.STDOUT)
            self.stderr = serializer.open(self.STDERR)

    @property
    def state(self):
        """Current state of the underlying state machine."""
        return self.machine.state

    def transition_and_notify(self, target):
        """Perform ``target``; on success notify observers of the new state
        and return True, otherwise return None.
        """
        if not self.machine.transition(target):
            return None
        self.notify(self.state)
        return True

    def started(self):
        """Record the start time and perform ``start``, if allowed."""
        if not self.machine.check('start'):
            return None
        self.start_time = timeutils.current_timestamp()
        return self.transition_and_notify('start')

    def exited(self, exit_status):
        """Record end time and exit status and perform ``exit``, if allowed."""
        if not self.machine.check('exit'):
            return None
        self.end_time = timeutils.current_timestamp()
        self.exit_status = exit_status
        return self.transition_and_notify('exit')

    def write_stderr(self, value):
        """Write ``value`` to the stderr handle."""
        self.stderr.write(value)

    def write_stdout(self, value):
        """Write ``value`` to the stdout handle."""
        self.stdout.write(value)

    def done(self):
        """Close both output handles and perform ``close``, if allowed."""
        if not self.machine.check('close'):
            return None
        self.stdout.close()
        self.stderr.close()
        return self.transition_and_notify('close')

    def handle_errback(self, result):
        """Handle an unexpected error while being run. This will likely be
        an internal error. Clean up the state of this ActionCommand and log
        something useful for debugging.

        ``result`` is logged and recorded as the exit status.
        """
        log.error(f"Unknown failure for {self}, {str(result)}")
        self.exited(result)
        self.done()

    @property
    def is_unknown(self):
        """True while no exit status has been recorded."""
        return self.exit_status is None

    @property
    def is_failed(self):
        """True when the recorded exit status is truthy."""
        return bool(self.exit_status)

    @property
    def is_complete(self):
        """Complete implies done and success."""
        return self.machine.state == ActionCommand.COMPLETE

    @property
    def is_done(self):
        """Done implies no more work will be done, but might not be success."""
        terminal_states = (ActionCommand.COMPLETE, ActionCommand.FAILSTART)
        return self.machine.state in terminal_states

    def __repr__(self):
        return f"ActionCommand {self.id} {self.command}: {self.state}"