Exemplo n.º 1
0
def _get_cached(path):
    """
    Look up previously cached content for ``path``.

    The in-memory cache is consulted first. When the disk cache is enabled, it
    is tried next, under a key built from the last "/"-separated component of
    ``path``.

    :param path: key of the memory cache entry
    :return: the cached content, or None when nothing was found (or when the
        disk cache raised an OperationalError)
    """
    # 1/ memory cache
    if path in JUMBO_FIELDS_MEMORY_CACHE:
        return JUMBO_FIELDS_MEMORY_CACHE[path]

    # 2/ disk cache, only when enabled
    if not SIMPLEFLOW_ENABLE_DISK_CACHE:
        return None

    try:
        # NB: this lookup may also happen on activity workers, where it's not
        # that useful. The performance hit should be minimal. To be improved.
        # NB2: the cache is instantiated lazily on each call because cache
        # objects do not survive forks, see DiskCache docs.
        disk_cache = Cache(constants.CACHE_DIR)
        # Use a dedicated key prefix: this cache may be shared with other
        # features of simpleflow at some point.
        disk_key = "jumbo_fields/" + path.rsplit("/", 1)[-1]
        if disk_key in disk_cache:
            message = "diskcache: getting key={} from cache_dir={}".format(
                disk_key, constants.CACHE_DIR
            )
            logger.debug(message)
            return disk_cache[disk_key]
    except OperationalError:
        logger.warning("diskcache: got an OperationalError, skipping cache usage")

    # Nothing found: return None explicitly.
    return None
Exemplo n.º 2
0
def start_worker(domain, task_list, log_level, nb_processes, heartbeat,
                 one_task, process_mode, poll_data):
    """
    Validate the worker options and hand over to ``worker.command.start``.

    ``log_level`` only triggers a deprecation warning; it is not forwarded.

    :raises ValueError: when ``process_mode`` is "kubernetes" and
        ``poll_data`` is set, or when neither ``task_list`` nor ``poll_data``
        is provided
    """
    if log_level:
        logger.warning(
            "Deprecated: --log-level will be removed, use LOG_LEVEL environment variable instead"
        )

    # Refuse a worker that doesn't poll AND doesn't process: it would just be
    # a gate to a scheduling infinite loop.
    kubernetes_with_poll_data = process_mode == "kubernetes" and poll_data
    if kubernetes_with_poll_data:
        raise ValueError("--process-mode=kubernetes and --poll-data options are exclusive")

    # The worker needs at least one source of work.
    if not (task_list or poll_data):
        raise ValueError("Please provide a --task-list or some data via --poll-data")

    worker.command.start(
        domain, task_list, nb_processes, heartbeat, one_task, process_mode, poll_data
    )
Exemplo n.º 3
0
    def run(self, token, task):
        """
        Send a heartbeat for ``task`` every ``self._interval`` seconds.

        The loop ends (returning None) when the heartbeat target no longer
        exists or when the heartbeat response has "cancelRequested" set. Any
        other heartbeat error is logged and re-raised.

        :param token: token passed to ``self.send_heartbeat``
        :param task: task whose ``activity_type.name`` is used in log messages
        """
        initial_parent_pid = os.getppid()

        while True:
            time.sleep(self._interval)

            # The parent PID differs from the one seen at startup (presumably
            # the original parent died): exit immediately with status 1.
            if os.getppid() != initial_parent_pid:
                os._exit(1)

            try:
                message = "heartbeat {} for task {}".format(
                    time.time(), task.activity_type.name
                )
                logger.info(message)
            except Exception:
                # Do not crash for debug
                pass

            try:
                reply = self.send_heartbeat(token)
            except swf.exceptions.DoesNotExistError:
                # Either the task or the workflow execution no longer exists.
                logger.warning(
                    "task {} no longer exists. Stopping heartbeat".format(
                        task.activity_type.name
                    )
                )
                return
            except Exception as exc:
                # Let's crash if it cannot notify the heartbeat failed.
                logger.error(
                    "cannot send heartbeat for task {}: {}".format(
                        task.activity_type.name, exc
                    )
                )
                raise

            if not reply:
                continue
            if reply.get("cancelRequested"):
                return
Exemplo n.º 4
0
    def run(self, x, t=30):
        """
        Submit ``increment(x)``, then ``Delay`` and ``double`` on its result.

        :param x: value passed to ``increment``
        :param t: first argument passed to ``Delay`` (defaults to 30)
        :return: ``result`` of the ``double`` future
        """
        context = self.get_run_context()
        logger.warning("execution context from decider: {}".format(context))
        incremented = self.submit(increment, x)
        delayed = self.submit(Delay, t, incremented)
        doubled = self.submit(double, incremented)

        # NOTE: the result is read for logging before waiting on the futures;
        # this order is kept as is.
        summary = "result of ({x} + 1) * 2 = {result}".format(x=x, result=doubled.result)
        logger.warning(summary)
        futures.wait(delayed, doubled)
        return doubled.result
Exemplo n.º 5
0
    def run(self, x, t=30):
        """
        Chain three submissions: ``increment(x)``, then ``Delay(t, ...)`` and
        ``double(...)``, both fed with the increment future.

        :param x: value passed to ``increment``
        :param t: first argument passed to ``Delay`` (defaults to 30)
        :return: ``result`` of the ``double`` future
        """
        run_context = self.get_run_context()
        logger.warning("execution context from decider: {}".format(run_context))

        plus_one = self.submit(increment, x)
        delay_future = self.submit(Delay, t, plus_one)
        times_two = self.submit(double, plus_one)

        # NOTE: ``times_two.result`` is read here, before the wait below;
        # this order is kept as is.
        logger.warning(
            "result of ({x} + 1) * 2 = {result}".format(result=times_two.result, x=x)
        )
        futures.wait(delay_future, times_two)
        return times_two.result
Exemplo n.º 6
0
def start_decider(workflows, domain, task_list, log_level, nb_processes):
    """
    Start a decider through ``decider.command.start``.

    ``log_level`` only triggers a deprecation warning; ``None`` is forwarded
    in the fourth position instead of it.
    """
    if log_level:
        logger.warning(
            "Deprecated: --log-level will be removed, use LOG_LEVEL environment variable instead"
        )
    # The fourth positional argument is always None, whatever log_level was.
    decider.command.start(workflows, domain, task_list, None, nb_processes)
Exemplo n.º 7
0
def start_decider(workflows, domain, task_list, log_level, nb_processes):
    """
    Warn about the deprecated ``log_level`` option if it is set, then delegate
    to ``decider.command.start``.

    ``log_level`` itself is never forwarded: ``None`` takes the fourth
    positional slot of the delegated call.
    """
    if log_level:
        logger.warning(
            "Deprecated: --log-level will be removed, use LOG_LEVEL environment variable instead"
        )
    # None is passed in the fourth position regardless of log_level.
    decider.command.start(workflows, domain, task_list, None, nb_processes)
Exemplo n.º 8
0
def start(workflows, domain, task_list, log_level=None, nb_processes=None,
          repair_with=None, force_activities=None, is_standalone=False,
          repair_workflow_id=None, repair_run_id=None):
    """
    Build a decider with ``helpers.make_decider`` and start it.

    :param workflows:
    :type workflows: list[str]
    :param domain:
    :type domain:
    :param task_list:
    :type task_list:
    :param log_level: deprecated; only triggers a warning when set
    :type log_level:
    :param nb_processes:
    :type nb_processes:
    :param repair_with:
    :type repair_with: Optional[simpleflow.history.History]
    :param force_activities:
    :type force_activities:
    :param is_standalone: Whether the executor use this task list (and pass it to the workers)
    :type is_standalone: bool
    :param repair_workflow_id: workflow ID to repair
    :type repair_workflow_id: Optional[str]
    :param repair_run_id: run ID to repair
    :type repair_run_id: Optional[str]
    """
    if log_level:
        logger.warning(
            "Deprecated: --log-level will be removed, use LOG_LEVEL environment variable instead"
        )

    # Options forwarded by keyword to helpers.make_decider.
    decider_options = dict(
        repair_with=repair_with,
        force_activities=force_activities,
        is_standalone=is_standalone,
        repair_workflow_id=repair_workflow_id,
        repair_run_id=repair_run_id,
    )
    new_decider = helpers.make_decider(
        workflows, domain, task_list, nb_processes, **decider_options
    )
    new_decider.is_alive = True
    new_decider.start()
Exemplo n.º 9
0
def _set_cached(path, content):
    """
    Store ``content`` for ``path`` in the memory cache and, when enabled, in
    the disk cache.

    The disk entry is keyed on the last "/"-separated component of ``path``
    and expires after ``3 * constants.HOUR``. An OperationalError raised by
    the disk cache is logged and otherwise ignored.

    :param path: key of the memory cache entry
    :param content: value to store
    """
    # 1/ memory cache
    JUMBO_FIELDS_MEMORY_CACHE[path] = content

    # 2/ disk cache, only when enabled
    if not SIMPLEFLOW_ENABLE_DISK_CACHE:
        return

    try:
        disk_cache = Cache(constants.CACHE_DIR)
        disk_key = "jumbo_fields/" + path.rsplit("/", 1)[-1]
        logger.debug(
            "diskcache: setting key={} on cache_dir={}".format(
                disk_key, constants.CACHE_DIR
            )
        )
        disk_cache.set(disk_key, content, expire=3 * constants.HOUR)
    except OperationalError:
        logger.warning(
            "diskcache: got an OperationalError on write, skipping cache write"
        )
Exemplo n.º 10
0
def _set_cached(path, content):
    """
    Cache ``content`` under ``path``: always in memory, and on disk too when
    the disk cache is enabled.

    The disk key is "jumbo_fields/" plus the last "/"-separated component of
    ``path``; the entry expires after ``3 * constants.HOUR``. A disk cache
    OperationalError is logged as a warning and not propagated.

    :param path: key of the memory cache entry
    :param content: value to store
    """
    # 1/ memory cache
    JUMBO_FIELDS_MEMORY_CACHE[path] = content

    # 2/ disk cache
    if not SIMPLEFLOW_ENABLE_DISK_CACHE:
        return

    try:
        store = Cache(constants.CACHE_DIR)
        last_component = path.rsplit("/", 1)[-1]
        store_key = "jumbo_fields/" + last_component
        debug_message = "diskcache: setting key={} on cache_dir={}".format(
            store_key, constants.CACHE_DIR
        )
        logger.debug(debug_message)
        store.set(store_key, content, expire=3 * constants.HOUR)
    except OperationalError:
        logger.warning("diskcache: got an OperationalError on write, skipping cache write")
Exemplo n.º 11
0
def reap_process_tree(pid, wait_timeout=settings.ACTIVITY_SIGTERM_WAIT_SEC):
    """
    TERMinates (and KILLs if necessary) a process and its descendants.

    See also: https://psutil.readthedocs.io/en/latest/#kill-process-tree.

    The processes are first sent SIGTERM and given ``wait_timeout`` to exit.
    Survivors are sent SIGKILL and given ``wait_timeout`` again; anything
    still alive after that is logged as an error and left alone.

    :param pid: Process ID
    :type pid: int
    :param wait_timeout: Wait timeout, applied after SIGTERM and again after SIGKILL
    :type wait_timeout: float
    :raises RuntimeError: if ``pid`` is the current process
    """

    def on_terminate(p):
        logger.info("process: terminated pid={} retcode={}".format(p.pid, p.returncode))

    if pid == os.getpid():
        raise RuntimeError("process: cannot terminate self!")
    parent = psutil.Process(pid)
    procs = parent.children(recursive=True)
    procs.append(parent)
    # Terminate
    for p in procs:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            pass
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout, callback=on_terminate)
    # Kill
    for p in alive:
        try:
            # status() is inside the try as well: the process may have exited
            # since wait_procs() returned.
            logger.warning(
                "process: pid={} status={} did not respond to SIGTERM. Trying SIGKILL".format(
                    p.pid, p.status()
                )
            )
            p.kill()
        except psutil.NoSuchProcess:
            pass
    # Check. The wait must be bounded: without a timeout wait_procs() blocks
    # until every process is gone, so it could hang forever and the "giving
    # up" branch below could never run.
    _, alive = psutil.wait_procs(alive, timeout=wait_timeout)
    for p in alive:
        try:
            status = p.status()
        except psutil.NoSuchProcess:
            # Exited in the meantime: nothing left to report.
            continue
        logger.error(
            "process: pid={} status={} still alive. Giving up!".format(p.pid, status)
        )
Exemplo n.º 12
0
def start(
    workflows,
    domain,
    task_list,
    log_level=None,
    nb_processes=None,
    repair_with=None,
    force_activities=None,
    is_standalone=False,
    repair_workflow_id=None,
    repair_run_id=None,
):
    """
    Create a decider via ``helpers.make_decider``, flag it alive and start it.

    :param workflows:
    :type workflows: list[str]
    :param domain:
    :type domain:
    :param task_list:
    :type task_list:
    :param log_level: deprecated; only triggers a warning when set
    :type log_level:
    :param nb_processes:
    :type nb_processes:
    :param repair_with:
    :type repair_with: Optional[simpleflow.history.History]
    :param force_activities:
    :type force_activities:
    :param is_standalone: Whether the executor use this task list (and pass it to the workers)
    :type is_standalone: bool
    :param repair_workflow_id: workflow ID to repair
    :type repair_workflow_id: Optional[str]
    :param repair_run_id: run ID to repair
    :type repair_run_id: Optional[str]
    """
    if log_level:
        logger.warning(
            "Deprecated: --log-level will be removed, use LOG_LEVEL environment variable instead"
        )

    # Keyword options handed over untouched to helpers.make_decider.
    extra_options = {
        "repair_with": repair_with,
        "force_activities": force_activities,
        "is_standalone": is_standalone,
        "repair_workflow_id": repair_workflow_id,
        "repair_run_id": repair_run_id,
    }
    made_decider = helpers.make_decider(
        workflows, domain, task_list, nb_processes, **extra_options
    )
    made_decider.is_alive = True
    made_decider.start()
Exemplo n.º 13
0
def reap_process_tree(pid, wait_timeout=settings.ACTIVITY_SIGTERM_WAIT_SEC):
    """
    TERMinates (and KILLs if necessary) a process and its descendants.

    See also: https://psutil.readthedocs.io/en/latest/#kill-process-tree.

    SIGTERM is sent first and the processes get ``wait_timeout`` to exit.
    Those still alive are sent SIGKILL and get ``wait_timeout`` once more;
    any remaining survivor is logged as an error and left alone.

    :param pid: Process ID
    :type pid: int
    :param wait_timeout: Wait timeout, applied after SIGTERM and again after SIGKILL
    :type wait_timeout: float
    :raises RuntimeError: if ``pid`` is the current process
    """

    def on_terminate(p):
        logger.info('process: terminated pid={} retcode={}'.format(p.pid, p.returncode))

    if pid == os.getpid():
        raise RuntimeError('process: cannot terminate self!')
    parent = psutil.Process(pid)
    procs = parent.children(recursive=True)
    procs.append(parent)
    # Terminate
    for p in procs:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            pass
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout, callback=on_terminate)
    # Kill
    for p in alive:
        try:
            # status() can raise NoSuchProcess too if the process exited right
            # after wait_procs() returned, so it is covered by the same try.
            logger.warning(
                'process: pid={} status={} did not respond to SIGTERM. Trying SIGKILL'.format(
                    p.pid, p.status()
                )
            )
            p.kill()
        except psutil.NoSuchProcess:
            pass
    # Check. Bound this wait: with no timeout wait_procs() only returns once
    # every process is gone, which can hang forever and makes the error
    # branch below unreachable.
    _, alive = psutil.wait_procs(alive, timeout=wait_timeout)
    for p in alive:
        try:
            status = p.status()
        except psutil.NoSuchProcess:
            # Exited in the meantime: nothing left to report.
            continue
        logger.error('process: pid={} status={} still alive. Giving up!'.format(p.pid, status))
Exemplo n.º 14
0
def start_worker(
    domain,
    task_list,
    log_level,
    nb_processes,
    heartbeat,
    one_task,
    process_mode,
    poll_data,
):
    """
    Check the worker options, then delegate to ``worker.command.start``.

    ``log_level`` is deprecated: it only produces a warning and is not passed
    on to the delegated call.

    :raises ValueError: when ``process_mode`` is "kubernetes" and
        ``poll_data`` is set, or when both ``task_list`` and ``poll_data``
        are missing
    """
    if log_level:
        logger.warning(
            "Deprecated: --log-level will be removed, use LOG_LEVEL environment variable instead"
        )

    # A worker that neither polls nor processes would just be a gate to a
    # scheduling infinite loop, so this combination is rejected.
    exclusive_options_used = process_mode == "kubernetes" and poll_data
    if exclusive_options_used:
        message = "--process-mode=kubernetes and --poll-data options are exclusive"
        raise ValueError(message)

    # At least one source of work is required.
    if not (task_list or poll_data):
        message = "Please provide a --task-list or some data via --poll-data"
        raise ValueError(message)

    worker.command.start(
        domain, task_list, nb_processes, heartbeat, one_task, process_mode, poll_data
    )
Exemplo n.º 15
0
def _get_cached(path):
    """
    Return the cached content for ``path``, or None when there is none.

    Lookup order: memory cache, then (if enabled) disk cache. The disk key is
    "jumbo_fields/" plus the last "/"-separated component of ``path``. An
    OperationalError from the disk cache is logged and treated as a miss.

    :param path: key of the memory cache entry
    :return: the cached content, or None
    """
    # 1/ memory cache
    if path in JUMBO_FIELDS_MEMORY_CACHE:
        return JUMBO_FIELDS_MEMORY_CACHE[path]

    # 2/ disk cache
    if not SIMPLEFLOW_ENABLE_DISK_CACHE:
        return None

    try:
        # NB: this may also run on activity workers, where it's not that
        # useful. The performance hit should be minimal. To be improved later.
        # NB2: instantiate the cache lazily here, cache objects do not survive
        # forks, see DiskCache docs.
        store = Cache(constants.CACHE_DIR)
        # A dedicated key prefix is used because this cache may be shared with
        # other features of simpleflow at some point.
        last_component = path.rsplit("/", 1)[-1]
        store_key = "jumbo_fields/" + last_component
        if store_key in store:
            logger.debug(
                "diskcache: getting key={} from cache_dir={}".format(
                    store_key, constants.CACHE_DIR
                )
            )
            return store[store_key]
    except OperationalError:
        logger.warning(
            "diskcache: got an OperationalError, skipping cache usage"
        )

    # Miss on both caches: return None explicitly.
    return None
Exemplo n.º 16
0
def increment(x):
    """
    Log the activity context, then return ``x + 1``.
    """
    # Here's how you can access the raw context of the activity task if you need
    # it. It gives you access to the response of the PollForActivityTask call to
    # the SWF API. See docs for more info: http://docs.aws.amazon.com/amazonswf/latest/apireference/API_PollForActivityTask.html#API_PollForActivityTask_ResponseSyntax  # NOQA
    activity_context = increment.context
    logger.warning("activity context: {}".format(activity_context))
    incremented = x + 1
    return incremented
Exemplo n.º 17
0
def spawn(poller, token, task, heartbeat=60):
    """
    Spawn a process and wait for it to end, sending heartbeats to SWF.

    On activity timeouts and termination, we reap the worker process and its
    children.

    The child process runs ``process_task(poller, token, task)``. While it is
    alive, this function joins it for at most ``heartbeat`` seconds at a time
    and calls ``poller.heartbeat(token)`` between joins. If the child ends
    with an exit code other than 0, ``poller.fail_with_retry`` is called.

    :param poller: used to send heartbeats and to report a failed worker
    :type poller: ActivityPoller
    :param token: token passed to ``process_task`` and to the poller calls
    :type token: str
    :param task: task passed to ``process_task``; its ``activity_type.name``
        is used in log messages
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    :raises Exception: any heartbeat error other than DoesNotExistError and
        RateLimitExceededError is logged and re-raised
    """
    logger.info(
        "spawning new activity worker pid={} heartbeat={}".format(
            os.getpid(), heartbeat
        )
    )
    # Run the task in a separate process.
    worker = multiprocessing.Process(target=process_task, args=(poller, token, task),)
    worker.start()

    def worker_alive():
        # Liveness is checked through psutil on the PID, not through
        # multiprocessing's own bookkeeping.
        return psutil.pid_exists(worker.pid)

    while worker_alive():
        # Wait for the worker to end, for one heartbeat period at most.
        worker.join(timeout=heartbeat)
        if not worker_alive():
            # Most certainly unneeded: we'll see
            if worker.exitcode is None:
                # race condition, try and re-join
                worker.join(timeout=0)
                if worker.exitcode is None:
                    logger.warning(
                        "process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                            worker.pid
                        )
                    )
            # Any exit code other than 0 (None included) is reported as a
            # failure through fail_with_retry.
            if worker.exitcode != 0:
                poller.fail_with_retry(
                    token,
                    task,
                    reason="process {} died: exit code {}".format(
                        worker.pid, worker.exitcode
                    ),
                )
            return
        # The worker is still running: send a heartbeat.
        try:
            logger.debug("heartbeating for pid={} (token={})".format(worker.pid, token))
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as error:
            # Either the task or the workflow execution no longer exists,
            # let's kill the worker process.
            logger.warning("heartbeat failed: {}".format(error))
            logger.warning("killing (KILL) worker with pid={}".format(worker.pid))
            # NOTE(review): the message above says KILL; confirm what signals
            # reap_process_tree actually sends.
            reap_process_tree(worker.pid)
            return
        except swf.exceptions.RateLimitExceededError as error:
            # ignore rate limit errors: high chances the next heartbeat will be
            # ok anyway, so it would be stupid to break the task for that
            logger.warning(
                'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                    task.activity_type.name, error
                )
            )
            # Back to the top of the loop: join for another heartbeat period.
            continue
        except Exception as error:
            # Let's crash if it cannot notify the heartbeat failed.  The
            # subprocess will become orphan and the heartbeat timeout may
            # eventually trigger on Amazon SWF side.
            logger.error(
                "cannot send heartbeat for task {}: {}".format(
                    task.activity_type.name, error
                )
            )
            raise

        # Task cancelled.
        if response and response.get("cancelRequested"):
            reap_process_tree(worker.pid)
            return
Exemplo n.º 18
0
    def parse_child_workflow_event(self, events, event):
        """Fold a child workflow event into the single entry kept per child workflow.

        See http://docs.aws.amazon.com/amazonswf/latest/apireference/API_HistoryEvent.html

        SWF events concerned:

        - StartChildWorkflowExecutionInitiated: A request was made to start a
          child workflow execution.
        - StartChildWorkflowExecutionFailed: Failed to process
          StartChildWorkflowExecution decision. This happens when the decision
          is not configured properly, for example the workflow type specified
          is not registered.
        - ChildWorkflowExecutionStarted: A child workflow execution was
          successfully started.
        - ChildWorkflowExecutionCompleted: A child workflow execution, started
          by this workflow execution, completed successfully and was closed.
        - ChildWorkflowExecutionFailed: A child workflow execution, started by
          this workflow execution, failed to complete successfully and was
          closed.
        - ChildWorkflowExecutionTimedOut: A child workflow execution, started
          by this workflow execution, timed out and was closed.
        - ChildWorkflowExecutionCanceled: A child workflow execution, started
          by this workflow execution, was canceled and closed.
        - ChildWorkflowExecutionTerminated: A child workflow execution, started
          by this workflow execution, was terminated.

        The branch taken depends on ``event.state``. Entries live in
        ``self._child_workflows`` (keyed by workflow ID); new ones are also
        appended to ``self._tasks``.

        :param events: all events; an event ID minus one is used as list index
        :type events: list[swf.models.event.Event]
        :param event: the event to process
        :type event: swf.models.event.Event
        """
        state = event.state
        child_workflows = self._child_workflows

        def entry_from_initiated_event():
            # Find the entry through the event that initiated the workflow.
            initiated = events[event.initiated_event_id - 1]
            return child_workflows[initiated.workflow_id]

        def count_retry(entry):
            # First failure/timeout sets the counter to 0, later ones add 1.
            entry["retry"] = entry["retry"] + 1 if "retry" in entry else 0

        if state == "start_initiated":
            workflow = {
                "type": "child_workflow",
                "id": event.workflow_id,
                "name": event.workflow_type["name"],
                "version": event.workflow_type["version"],
                "state": state,
                "initiated_event_id": event.id,
                "raw_input": event.raw.get("input"),  # FIXME obsolete; any user out there?
                "input": event.input,
                "child_policy": event.child_policy,
                "control": event.control,
                "tag_list": getattr(event, "tag_list", None),
                "task_list": event.task_list["name"],
                "initiated_event_timestamp": event.timestamp,
                "decision_task_completed_event_id": event.decision_task_completed_event_id,
            }
            if event.workflow_id in child_workflows:
                # May have gotten a start_failed before (or retrying?)
                previous = child_workflows[event.workflow_id]
                if previous["state"] == "start_initiated":
                    # Should not happen anymore
                    logger.warning(
                        "start_initiated again for workflow {} (initiated @{}, we're @{})".format(
                            event.workflow_id,
                            previous["initiated_event_id"],
                            event.id,
                        )
                    )
                previous.update(workflow)
            else:
                child_workflows[event.workflow_id] = workflow
                self._tasks.append(workflow)
        elif state == "start_failed":
            workflow = {
                "type": "child_workflow",
                "id": event.workflow_id,
                "state": state,
                "cause": event.cause,
                "name": event.workflow_type["name"],
                "version": event.workflow_type["version"],
                "control": event.control,
                "start_failed_id": event.id,
                "start_failed_timestamp": event.timestamp,
                "decision_task_completed_event_id": event.decision_task_completed_event_id,
            }
            if event.workflow_id in child_workflows:
                child_workflows[event.workflow_id].update(workflow)
            else:
                child_workflows[event.workflow_id] = workflow
                self._tasks.append(workflow)
        elif state == "started":
            workflow = entry_from_initiated_event()
            workflow.update({
                "state": state,
                "run_id": event.workflow_execution["runId"],
                "workflow_id": event.workflow_execution["workflowId"],
                "started_id": event.id,
                "started_timestamp": event.timestamp,
            })
        elif state == "completed":
            workflow = entry_from_initiated_event()
            workflow.update({
                "state": state,
                "result": getattr(event, "result", None),
                "completed_id": event.id,
                "completed_timestamp": event.timestamp,
            })
        elif state == "failed":
            workflow = entry_from_initiated_event()
            workflow.update({
                "state": state,
                "reason": getattr(event, "reason", None),
                "details": getattr(event, "details", None),
                "failed_id": event.id,
                "failed_timestamp": event.timestamp,
            })
            count_retry(workflow)
        elif state == "timed_out":
            workflow = entry_from_initiated_event()
            # The timeout value is read from the initiating event, using an
            # attribute named after the (lowercased) timeout type.
            initiated_event = events[workflow["initiated_event_id"] - 1]
            timeout_attribute = "{}_timeout".format(event.timeout_type.lower())
            workflow.update({
                "state": state,
                "timeout_type": event.timeout_type,
                "timeout_value": getattr(initiated_event, timeout_attribute, None),
                "timed_out_id": event.id,
                "timed_out_timestamp": event.timestamp,
            })
            count_retry(workflow)
        elif state == "canceled":
            workflow = entry_from_initiated_event()
            workflow.update({
                "state": state,
                "details": getattr(event, "details", None),
                "canceled_id": event.id,
                "canceled_timestamp": event.timestamp,
            })
        elif state == "terminated":
            workflow = entry_from_initiated_event()
            workflow.update({
                "state": state,
                "terminated_id": event.id,
                "terminated_timestamp": event.timestamp,
            })
Exemplo n.º 19
0
    def parse_external_workflow_event(self, events, event):
        """
        Parse an external workflow event (signal or cancel request).

        The branch taken depends on ``event.state``. Signal entries are kept in
        ``self._external_workflows_signaling`` (keyed by the ID of the
        initiating event), cancel requests in
        ``self._external_workflows_canceling`` (keyed by workflow ID).

        :param events: all events; an event ID minus one is used as list index
        :param event: the event to process
        """
        state = event.state
        signaling = self._external_workflows_signaling
        canceling = self._external_workflows_canceling

        def entry_from_initiated_event(registry):
            # Find the entry through the event that initiated the request.
            initiated = events[event.initiated_event_id - 1]
            return registry[initiated.workflow_id]

        if state == "signal_execution_initiated":
            signaling[event.id] = {
                "type": "external_workflow",
                "id": event.workflow_id,
                "run_id": getattr(event, "run_id", None),
                "signal_name": event.signal_name,
                "state": state,
                "initiated_event_id": event.id,
                "input": event.input,
                "control": event.control,
                "initiated_event_timestamp": event.timestamp,
            }
        elif state == "signal_execution_failed":
            workflow = signaling[event.initiated_event_id]
            workflow.update(
                state=state,
                cause=event.cause,
                signal_failed_timestamp=event.timestamp,
            )
            if event.control:
                workflow["control"] = event.control
        elif state == "execution_signaled":
            workflow = signaling[event.initiated_event_id]
            workflow.update(
                state=state,
                run_id=event.workflow_execution["runId"],
                workflow_id=event.workflow_execution["workflowId"],
                signaled_event_id=event.id,
                signaled_timestamp=event.timestamp,
            )
            self._signaled_workflows[workflow["signal_name"]].append(workflow)
        elif state == "request_cancel_execution_initiated":
            workflow = {
                "type": "external_workflow",
                "id": event.workflow_id,
                "run_id": getattr(event, "run_id", None),
                "state": state,
                "control": event.control,
                "initiated_event_id": event.id,
                "initiated_event_timestamp": event.timestamp,
            }
            if event.workflow_id in canceling:
                # A cancel request was already recorded for this workflow.
                logger.warning(
                    "request_cancel_initiated again for workflow {} (initiated @{}, we're @{})".format(
                        event.workflow_id,
                        canceling[event.workflow_id]["initiated_event_id"],
                        event.id,
                    )
                )
                canceling[event.workflow_id].update(workflow)
            else:
                canceling[event.workflow_id] = workflow
        elif state == "request_cancel_execution_failed":
            workflow = entry_from_initiated_event(canceling)
            workflow.update(state=state, cause=event.cause)
            if event.control:
                workflow["control"] = event.control
            workflow["request_cancel_failed_timestamp"] = event.timestamp
        elif state == "execution_cancel_requested":
            workflow = entry_from_initiated_event(canceling)
            workflow.update(
                run_id=event.workflow_execution["runId"],
                workflow_id=event.workflow_execution["workflowId"],
                cancel_requested_event_id=event.id,
                cancel_requested_timestamp=event.timestamp,
            )
Exemplo n.º 20
0
    def parse_external_workflow_event(self, events, event):
        """
        Record an external workflow event (signal or cancel request).

        Dispatches on ``event.state``. Signal entries are stored in
        ``self._external_workflows_signaling`` under the ID of the initiating
        event; cancel requests are stored in
        ``self._external_workflows_canceling`` under the workflow ID.

        :param events: all events; an event ID minus one is used as list index
        :param event: the event to process
        """
        state = event.state
        signal_entries = self._external_workflows_signaling
        cancel_entries = self._external_workflows_canceling

        def find_entry(registry):
            # Go through the initiating event to get the workflow ID.
            initiated = events[event.initiated_event_id - 1]
            return registry[initiated.workflow_id]

        if state == "signal_execution_initiated":
            signal_entries[event.id] = {
                "type": "external_workflow",
                "id": event.workflow_id,
                "run_id": getattr(event, "run_id", None),
                "signal_name": event.signal_name,
                "state": state,
                "initiated_event_id": event.id,
                "input": event.input,
                "control": event.control,
                "initiated_event_timestamp": event.timestamp,
            }
        elif state == "signal_execution_failed":
            entry = signal_entries[event.initiated_event_id]
            entry.update(
                state=state,
                cause=event.cause,
                signal_failed_timestamp=event.timestamp,
            )
            if event.control:
                entry["control"] = event.control
        elif state == "execution_signaled":
            entry = signal_entries[event.initiated_event_id]
            entry.update(
                state=state,
                run_id=event.workflow_execution["runId"],
                workflow_id=event.workflow_execution["workflowId"],
                signaled_event_id=event.id,
                signaled_timestamp=event.timestamp,
            )
            self._signaled_workflows[entry["signal_name"]].append(entry)
        elif state == "request_cancel_execution_initiated":
            entry = {
                "type": "external_workflow",
                "id": event.workflow_id,
                "run_id": getattr(event, "run_id", None),
                "state": state,
                "control": event.control,
                "initiated_event_id": event.id,
                "initiated_event_timestamp": event.timestamp,
            }
            if event.workflow_id in cancel_entries:
                # A cancel request was already recorded for this workflow.
                logger.warning(
                    "request_cancel_initiated again for workflow {} (initiated @{}, we're @{})".format(
                        event.workflow_id,
                        cancel_entries[event.workflow_id]["initiated_event_id"],
                        event.id,
                    )
                )
                cancel_entries[event.workflow_id].update(entry)
            else:
                cancel_entries[event.workflow_id] = entry
        elif state == "request_cancel_execution_failed":
            entry = find_entry(cancel_entries)
            entry.update(state=state, cause=event.cause)
            if event.control:
                entry["control"] = event.control
            entry["request_cancel_failed_timestamp"] = event.timestamp
        elif state == "execution_cancel_requested":
            entry = find_entry(cancel_entries)
            entry.update(
                run_id=event.workflow_execution["runId"],
                workflow_id=event.workflow_execution["workflowId"],
                cancel_requested_event_id=event.id,
                cancel_requested_timestamp=event.timestamp,
            )
Exemplo n.º 21
0
    def parse_child_workflow_event(self, events, event):
        """Fold one child workflow history event into that workflow's entry.

        Every child workflow is represented by a single dict stored in
        ``self._child_workflows`` under its workflow id. The dict is also
        appended to ``self._tasks`` the first time the workflow is seen;
        subsequent events update that same dict in place. Events whose state
        is not listed below are ignored.

        See http://docs.aws.amazon.com/amazonswf/latest/apireference/API_HistoryEvent.html

        SWF events concerned:

        - StartChildWorkflowExecutionInitiated: a request was made to start a
          child workflow execution.
        - StartChildWorkflowExecutionFailed: the StartChildWorkflowExecution
          decision could not be processed, e.g. because the decision is not
          configured properly (workflow type not registered, ...).
        - ChildWorkflowExecutionStarted: the child workflow execution was
          successfully started.
        - ChildWorkflowExecutionCompleted: the child workflow execution
          completed successfully and was closed.
        - ChildWorkflowExecutionFailed: the child workflow execution failed
          to complete successfully and was closed.
        - ChildWorkflowExecutionTimedOut: the child workflow execution timed
          out and was closed.
        - ChildWorkflowExecutionCanceled: the child workflow execution was
          canceled and closed.
        - ChildWorkflowExecutionTerminated: the child workflow execution was
          terminated.

        States handled (``event.state``): ``start_initiated``,
        ``start_failed``, ``started``, ``completed``, ``failed``,
        ``timed_out``, ``canceled`` and ``terminated``.

        :param events: history the event belongs to; the event with id ``n``
            is looked up at ``events[n - 1]``.
        :type events: list[swf.models.event.Event]
        :param event: child workflow event to process.
        :type event: swf.models.event.Event
        """
        state = event.state

        def initiated_entry():
            # Follow-up events only carry the id of the event that initiated
            # the child workflow; resolve it to find the workflow id.
            origin = events[event.initiated_event_id - 1]
            return self._child_workflows[origin.workflow_id]

        def count_attempt(entry):
            # The first failure/timeout sets the counter to 0, every later
            # one increments it.
            entry['retry'] = entry['retry'] + 1 if 'retry' in entry else 0

        if state == 'start_initiated':
            entry = {
                'type': 'child_workflow',
                'id': event.workflow_id,
                'name': event.workflow_type['name'],
                'version': event.workflow_type['version'],
                'state': state,
                'initiated_event_id': event.id,
                'raw_input': event.raw.get('input'),  # FIXME obsolete; any user out there?
                'input': event.input,
                'child_policy': event.child_policy,
                'control': event.control,
                'tag_list': getattr(event, 'tag_list', None),
                'task_list': event.task_list['name'],
                'initiated_event_timestamp': event.timestamp,
                'decision_task_completed_event_id': event.decision_task_completed_event_id,
            }
            known = self._child_workflows
            wf_id = event.workflow_id
            if wf_id in known:
                previous = known[wf_id]
                # May have gotten a start_failed before (or retrying?)
                if previous['state'] == 'start_initiated':
                    # Should not happen anymore
                    logger.warning("start_initiated again for workflow {} (initiated @{}, we're @{})".format(
                        wf_id,
                        previous['initiated_event_id'],
                        event.id
                    ))
                previous.update(entry)
            else:
                known[wf_id] = entry
                self._tasks.append(entry)
            return

        if state == 'start_failed':
            entry = {
                'type': 'child_workflow',
                'id': event.workflow_id,
                'state': state,
                'cause': event.cause,
                'name': event.workflow_type['name'],
                'version': event.workflow_type['version'],
                'control': event.control,
                'start_failed_id': event.id,
                'start_failed_timestamp': event.timestamp,
                'decision_task_completed_event_id': event.decision_task_completed_event_id,
            }
            known = self._child_workflows
            wf_id = event.workflow_id
            if wf_id in known:
                known[wf_id].update(entry)
            else:
                known[wf_id] = entry
                self._tasks.append(entry)
            return

        if state == 'started':
            entry = initiated_entry()
            execution = event.workflow_execution
            entry.update(
                state=state,
                run_id=execution['runId'],
                workflow_id=execution['workflowId'],
                started_id=event.id,
                started_timestamp=event.timestamp,
            )
            return

        if state == 'completed':
            entry = initiated_entry()
            entry.update(
                state=state,
                result=getattr(event, 'result', None),
                completed_id=event.id,
                completed_timestamp=event.timestamp,
            )
            return

        if state == 'failed':
            entry = initiated_entry()
            entry.update(
                state=state,
                reason=getattr(event, 'reason', None),
                details=getattr(event, 'details', None),
                failed_id=event.id,
                failed_timestamp=event.timestamp,
            )
            count_attempt(entry)
            return

        if state == 'timed_out':
            entry = initiated_entry()
            # The timeout value is read from the initiating event, from the
            # attribute named after the timeout type (lowercased).
            initiated = events[entry['initiated_event_id'] - 1]
            timeout_attr = '{}_timeout'.format(event.timeout_type.lower())
            entry.update(
                state=state,
                timeout_type=event.timeout_type,
                timeout_value=getattr(initiated, timeout_attr, None),
                timed_out_id=event.id,
                timed_out_timestamp=event.timestamp,
            )
            count_attempt(entry)
            return

        if state == 'canceled':
            entry = initiated_entry()
            entry.update(
                state=state,
                details=getattr(event, 'details', None),
                canceled_id=event.id,
                canceled_timestamp=event.timestamp,
            )
            return

        if state == 'terminated':
            entry = initiated_entry()
            entry.update(
                state=state,
                terminated_id=event.id,
                terminated_timestamp=event.timestamp,
            )
Exemplo n.º 22
0
def increment(x):
    """Return ``x + 1`` after logging the activity's raw context.

    ``increment.context`` is the raw context of the activity task; it is
    attached to the function outside of this definition. It gives access to
    the response of the PollForActivityTask call to the SWF API.
    """
    # Response syntax reference: http://docs.aws.amazon.com/amazonswf/latest/apireference/API_PollForActivityTask.html#API_PollForActivityTask_ResponseSyntax  # NOQA
    message = "activity context: {}".format(increment.context)
    logger.warning(message)
    return x + 1
Exemplo n.º 23
0
def spawn(poller, token, task, heartbeat=60):
    """
    Run the task in a dedicated child process and watch it until it ends.

    The child is joined for at most ``heartbeat`` seconds at a time; between
    two joins a heartbeat is sent to SWF through ``poller.heartbeat``. The
    child process tree is reaped when SWF answers that the task or workflow
    execution no longer exists, or that cancellation was requested. If the
    child exits with a non-zero (or unknown) exit code, the task is reported
    through ``poller.fail_with_retry``.

    :param poller: poller used to heartbeat and to report failures
    :type poller: ActivityPoller
    :param token: task token passed to the heartbeat and failure calls
    :type token: str
    :param task: activity task to process
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    :raises Exception: any heartbeat error other than "does not exist" and
        "rate limit exceeded" is logged and re-raised.
    """
    logger.info(
        'spawning new activity worker pid={} heartbeat={}'.format(os.getpid(), heartbeat)
    )
    child = multiprocessing.Process(target=process_task, args=(poller, token, task))
    child.start()

    while psutil.pid_exists(child.pid):
        # Wait for the child, but no longer than one heartbeat period.
        child.join(timeout=heartbeat)

        if not psutil.pid_exists(child.pid):
            # Most certainly unneeded: we'll see
            if child.exitcode is None:
                # race condition, try and re-join
                child.join(timeout=0)
                if child.exitcode is None:
                    logger.warning("process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                        child.pid
                    ))
            if child.exitcode != 0:
                failure = 'process {} died: exit code {}'.format(
                    child.pid,
                    child.exitcode,
                )
                poller.fail_with_retry(token, task, reason=failure)
            return

        try:
            logger.debug(
                'heartbeating for pid={} (token={})'.format(child.pid, token)
            )
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as error:
            # Either the task or the workflow execution no longer exists,
            # let's kill the worker process.
            logger.warning('heartbeat failed: {}'.format(error))
            logger.warning('killing (KILL) worker with pid={}'.format(child.pid))
            reap_process_tree(child.pid)
            return
        except swf.exceptions.RateLimitExceededError as error:
            # ignore rate limit errors: high chances the next heartbeat will be
            # ok anyway, so it would be stupid to break the task for that
            logger.warning(
                'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                    task.activity_type.name,
                    error))
            continue
        except Exception as error:
            # Let's crash if it cannot notify the heartbeat failed.  The
            # subprocess will become orphan and the heartbeat timeout may
            # eventually trigger on Amazon SWF side.
            logger.error('cannot send heartbeat for task {}: {}'.format(
                task.activity_type.name,
                error))
            raise
        else:
            # Task cancelled.
            if response and response.get('cancelRequested'):
                reap_process_tree(child.pid)
                return