Exemplo n.º 1
0
 def fail(self, token, task, reason=None, details=None):
     """
     Report the activity as failed; log, rather than propagate, any error.

     :param token: task token forwarded to
         ``swf.actors.ActivityWorker.fail``.
     :param task: task being failed; only its ``activity_type.name`` is
         read, for the error log message.
     :param reason: forwarded as the ``reason`` keyword argument.
     :param details: forwarded as the ``details`` keyword argument.
     :return: the value returned by ``swf.actors.ActivityWorker.fail``,
         or ``None`` when that call raised.
     """
     failure_info = {'reason': reason, 'details': details}
     try:
         outcome = swf.actors.ActivityWorker.fail(self, token, **failure_info)
     except Exception as exc:
         # Deliberately swallowed: the failure is only logged.
         message = 'cannot fail task {}: {}'.format(
             task.activity_type.name, exc)
         logger.error(message)
         return None
     return outcome
Exemplo n.º 2
0
    def run(self, token, task):
        """
        Send a heartbeat for ``task`` every ``self._interval`` seconds.

        The loop ends when ``send_heartbeat`` raises
        ``swf.exceptions.DoesNotExistError`` or when the response carries a
        truthy ``cancelRequested`` entry. Any other exception raised by
        ``send_heartbeat`` is logged and re-raised. If the parent PID
        changes between iterations the process exits with status 1.

        :param token: token passed to ``self.send_heartbeat``.
        :param task: task whose ``activity_type.name`` is used in logs.
        """
        original_parent = os.getppid()

        while True:
            time.sleep(self._interval)

            # Parent PID changed (presumably the parent died): exit at once.
            if original_parent != os.getppid():
                os._exit(1)

            try:
                now = time.time()
                logger.info("heartbeat {} for task {}".format(
                    now, task.activity_type.name))
            except Exception:
                # Do not crash for debug
                pass

            try:
                response = self.send_heartbeat(token)
            except swf.exceptions.DoesNotExistError:
                # Either the task or the workflow execution no longer exists.
                gone_message = "task {} no longer exists. Stopping heartbeat".format(
                    task.activity_type.name)
                logger.warning(gone_message)
                return
            except Exception as exc:
                # Let's crash if it cannot notify the heartbeat failed.
                failure_message = "cannot send heartbeat for task {}: {}".format(
                    task.activity_type.name, exc)
                logger.error(failure_message)
                raise

            cancel_requested = response and response.get("cancelRequested")
            if cancel_requested:
                return
Exemplo n.º 3
0
        def decorated(*args, **kwargs):
            """
            Call ``func``; on one of ``exceptions`` optionally log, then
            either re-raise or delegate to ``handle_with``.
            """
            try:
                result = func(*args, **kwargs)
            except exceptions as exc:
                if log is True:
                    message = "call to {} raised: {}".format(func.__name__, exc)
                    logger.error(message)

                # With a handler, its return value replaces the result.
                if handle_with is not None:
                    return handle_with(exc, *args, **kwargs)
                raise
            return result
Exemplo n.º 4
0
def process_decision(poller, decision_response):
    # type: (DeciderPoller, Response) -> None
    """
    Take a decision for ``decision_response`` and complete it via ``poller``.

    ``format.JUMBO_FIELDS_MEMORY_CACHE`` is cleared before deciding. An
    exception raised while completing the decision is logged and swallowed;
    one raised by ``poller.decide`` propagates.
    """
    execution = decision_response.execution
    workflow_str = "workflow {} ({})".format(
        execution.workflow_id, poller.workflow_name
    )
    logger.debug("process_decision() pid={}".format(os.getpid()))
    logger.info("taking decision for {}".format(workflow_str))
    format.JUMBO_FIELDS_MEMORY_CACHE.clear()
    decisions = poller.decide(decision_response)
    try:
        logger.info("completing decision for {}".format(workflow_str))
        poller.complete_with_retry(decision_response.token, decisions)
    except Exception as exc:
        failure = "cannot complete decision for {}: {}".format(workflow_str, exc)
        logger.error(failure)
Exemplo n.º 5
0
        def decorated(*args, **kwargs):
            """
            Wrapper around ``func``: exceptions matching ``exceptions`` are
            logged when ``log`` is ``True`` and then handed to
            ``handle_with``, or re-raised when no handler was given.
            """
            try:
                return func(*args, **kwargs)
            except exceptions as caught:
                should_log = log is True
                if should_log:
                    logger.error(
                        'call to {} raised: {}'.format(func.__name__, caught)
                    )

                # No handler: let the original exception propagate.
                if handle_with is None:
                    raise

                fallback = handle_with(caught, *args, **kwargs)
                return fallback
Exemplo n.º 6
0
def process_decision(poller, decision_response):
    # type: (DeciderPoller, Response) -> None
    """
    Ask ``poller`` to decide on ``decision_response``, then complete the task.

    The jumbo-fields memory cache of the ``format`` module is emptied
    first. Errors raised while completing are logged, not propagated.
    """
    workflow_id = decision_response.execution.workflow_id
    workflow_name = poller.workflow_name
    workflow_str = "workflow {} ({})".format(workflow_id, workflow_name)

    pid_message = "process_decision() pid={}".format(os.getpid())
    logger.debug(pid_message)
    logger.info("taking decision for {}".format(workflow_str))

    format.JUMBO_FIELDS_MEMORY_CACHE.clear()
    decisions = poller.decide(decision_response)

    try:
        logger.info("completing decision for {}".format(workflow_str))
        poller.complete_with_retry(decision_response.token, decisions)
    except Exception as completion_error:
        logger.error(
            "cannot complete decision for {}: {}".format(
                workflow_str, completion_error
            )
        )
Exemplo n.º 7
0
def activity_rerun(domain, workflow_id, run_id, input, scheduled_id, activity_id):
    """
    Re-run one activity found in the history of a workflow execution.

    Exits the process with status 1 when neither ``scheduled_id`` nor
    ``activity_id`` is given, or when the execution cannot be found.

    :param domain: passed to ``helpers.get_workflow_execution``.
    :param workflow_id: passed to ``helpers.get_workflow_execution``.
    :param run_id: passed to ``helpers.get_workflow_execution``.
    :param input: optional encoded input; when truthy it is decoded with
        ``format.decode`` and given to ``helpers.find_activity``.
    :param scheduled_id: passed to ``helpers.find_activity``.
    :param activity_id: passed to ``helpers.find_activity``.
    """
    # handle params: at least one way to designate the activity is needed
    if not (activity_id or scheduled_id):
        logger.error("Please supply --scheduled-id or --activity-id.")
        sys.exit(1)

    input_override = format.decode(input) if input else None

    # find workflow execution
    try:
        wfe = helpers.get_workflow_execution(domain, workflow_id, run_id)
    except (swf.exceptions.DoesNotExistError, IndexError):
        logger.error("Couldn't find execution, exiting.")
        sys.exit(1)
    found_message = "Found execution: workflowId={} runId={}".format(
        wfe.workflow_id, wfe.run_id)
    logger.info(found_message)

    # now rerun the specified activity
    history = History(wfe.history())
    history.parse()
    found = helpers.find_activity(
        history,
        scheduled_id=scheduled_id,
        activity_id=activity_id,
        input=input_override,
    )
    task, args, kwargs, meta, params = found

    # Point the activity context at the execution it was taken from.
    context = kwargs["context"]
    context.update({
        "workflow_id": wfe.workflow_id,
        "run_id": wfe.run_id,
    })

    logger.debug("Found activity. Last execution:")
    pretty_params = json_dumps(params, pretty=True)
    for chunk in pretty_params.split("\n"):
        logger.debug(chunk)
    if input_override:
        logger.info("NB: input will be overriden with the passed one!")
    rerun_message = "Will re-run: {}(*{}, **{}) [+meta={}]".format(
        task, args, kwargs, meta)
    logger.info(rerun_message)

    # download binaries if needed
    binaries = meta.get("binaries", {})
    download_binaries(binaries)

    # execute the activity task with the correct arguments
    instance = ActivityTask(task, *args, **kwargs)
    result = instance.execute()
    if hasattr(instance, "post_execute"):
        instance.post_execute()
    result_json = json_dumps(result, compact=False)
    logger.info("Result (JSON): {}".format(result_json))
Exemplo n.º 8
0
def reap_process_tree(pid, wait_timeout=settings.ACTIVITY_SIGTERM_WAIT_SEC):
    """
    TERMinates (and KILLs) if necessary a process and its descendants.

    See also: https://psutil.readthedocs.io/en/latest/#kill-process-tree.

    A target that no longer exists is treated as already reaped: this is
    logged and the function returns. Both the wait after SIGTERM and the
    wait after SIGKILL are bounded by ``wait_timeout``, so the function
    cannot hang on a process that survives SIGKILL.

    :param pid: Process ID
    :type pid: int
    :param wait_timeout: Wait timeout
    :type wait_timeout: float
    :raises RuntimeError: if ``pid`` is the current process.
    """

    def on_terminate(p):
        logger.info("process: terminated pid={} retcode={}".format(p.pid, p.returncode))

    def status_of(p):
        # The process may vanish between wait_procs() and this lookup;
        # a log message must not turn that into a crash.
        try:
            return p.status()
        except psutil.NoSuchProcess:
            return "gone"

    if pid == os.getpid():
        raise RuntimeError("process: cannot terminate self!")
    try:
        parent = psutil.Process(pid)
        procs = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        # The target exited before we got here: nothing left to reap.
        logger.info("process: pid={} already gone, nothing to reap".format(pid))
        return
    procs.append(parent)
    # Terminate
    for p in procs:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            pass
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout, callback=on_terminate)
    # Kill
    for p in alive:
        logger.warning(
            "process: pid={} status={} did not respond to SIGTERM. Trying SIGKILL".format(
                p.pid, status_of(p)
            )
        )
        try:
            p.kill()
        except psutil.NoSuchProcess:
            pass
    # Check. Without a timeout wait_procs() only returns once every process
    # is gone, which would make the report below unreachable and could
    # block forever.
    _, alive = psutil.wait_procs(alive, timeout=wait_timeout)
    for p in alive:
        logger.error(
            "process: pid={} status={} still alive. Giving up!".format(
                p.pid, status_of(p)
            )
        )
Exemplo n.º 9
0
def activity_rerun(domain, workflow_id, run_id, input, scheduled_id, activity_id):
    """
    Look up an activity in a workflow execution's history and run it again.

    The process exits with status 1 when neither ``scheduled_id`` nor
    ``activity_id`` is supplied, or when the execution cannot be found.

    :param domain: passed to ``helpers.get_workflow_execution``.
    :param workflow_id: passed to ``helpers.get_workflow_execution``.
    :param run_id: passed to ``helpers.get_workflow_execution``.
    :param input: optional encoded input; decoded with ``format.decode``
        when truthy and used as the activity input override.
    :param scheduled_id: passed to ``helpers.find_activity``.
    :param activity_id: passed to ``helpers.find_activity``.
    """
    # handle params
    if not (activity_id or scheduled_id):
        logger.error("Please supply --scheduled-id or --activity-id.")
        sys.exit(1)

    input_override = format.decode(input) if input else None

    # find workflow execution
    try:
        wfe = helpers.get_workflow_execution(domain, workflow_id, run_id)
    except (swf.exceptions.DoesNotExistError, IndexError):
        logger.error("Couldn't find execution, exiting.")
        sys.exit(1)
    logger.info(
        "Found execution: workflowId={} runId={}".format(wfe.workflow_id, wfe.run_id)
    )

    # now rerun the specified activity
    history = History(wfe.history())
    history.parse()
    lookup = helpers.find_activity(
        history,
        scheduled_id=scheduled_id,
        activity_id=activity_id,
        input=input_override,
    )
    task, args, kwargs, meta, params = lookup

    logger.debug("Found activity. Last execution:")
    params_dump = json_dumps(params, pretty=True)
    for dump_line in params_dump.split("\n"):
        logger.debug(dump_line)
    if input_override:
        logger.info("NB: input will be overriden with the passed one!")
    logger.info(
        "Will re-run: {}(*{}, **{}) [+meta={}]".format(task, args, kwargs, meta)
    )

    # download binaries if needed
    binaries = meta.get("binaries", {})
    download_binaries(binaries)

    # execute the activity task with the correct arguments
    instance = ActivityTask(task, *args, **kwargs)
    result = instance.execute()
    if hasattr(instance, 'post_execute'):
        instance.post_execute()
    serialized = json_dumps(result, compact=False)
    logger.info("Result (JSON): {}".format(serialized))
Exemplo n.º 10
0
def reap_process_tree(pid, wait_timeout=settings.ACTIVITY_SIGTERM_WAIT_SEC):
    """
    TERMinates (and KILLs) if necessary a process and its descendants.

    See also: https://psutil.readthedocs.io/en/latest/#kill-process-tree.

    If the target process no longer exists, this is logged and the function
    returns without error. The waits after SIGTERM and after SIGKILL are
    both limited to ``wait_timeout``, so a process that survives SIGKILL
    is reported instead of blocking the caller indefinitely.

    :param pid: Process ID
    :type pid: int
    :param wait_timeout: Wait timeout
    :type wait_timeout: float
    :raises RuntimeError: if ``pid`` is the current process.
    """

    def on_terminate(p):
        logger.info('process: terminated pid={} retcode={}'.format(p.pid, p.returncode))

    def status_of(p):
        # A process can disappear right after wait_procs() reported it
        # alive; keep the log call from raising in that case.
        try:
            return p.status()
        except psutil.NoSuchProcess:
            return 'gone'

    if pid == os.getpid():
        raise RuntimeError('process: cannot terminate self!')
    try:
        parent = psutil.Process(pid)
        procs = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        # Target already exited: there is nothing to terminate.
        logger.info('process: pid={} already gone, nothing to reap'.format(pid))
        return
    procs.append(parent)
    # Terminate
    for p in procs:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            pass
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout, callback=on_terminate)
    # Kill
    for p in alive:
        logger.warning(
            'process: pid={} status={} did not respond to SIGTERM. Trying SIGKILL'.format(
                p.pid, status_of(p)
            )
        )
        try:
            p.kill()
        except psutil.NoSuchProcess:
            pass
    # Check. A timeout is required here: with none, wait_procs() waits until
    # all processes are gone, so survivors would never be reported.
    _, alive = psutil.wait_procs(alive, timeout=wait_timeout)
    for p in alive:
        logger.error(
            'process: pid={} status={} still alive. Giving up!'.format(p.pid, status_of(p))
        )
Exemplo n.º 11
0
def spawn(poller, token, task, heartbeat=60):
    """
    Run ``process_task`` in a child process and heartbeat until it ends.

    On activity timeouts and termination, the worker process and its
    children are reaped with ``reap_process_tree``.

    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    """
    logger.info(
        "spawning new activity worker pid={} heartbeat={}".format(os.getpid(), heartbeat)
    )
    worker = multiprocessing.Process(target=process_task, args=(poller, token, task))
    worker.start()

    def worker_is_running():
        return psutil.pid_exists(worker.pid)

    def handle_worker_exit():
        # Most certainly unneeded: we'll see
        if worker.exitcode is None:
            # race condition, try and re-join
            worker.join(timeout=0)
            if worker.exitcode is None:
                logger.warning(
                    "process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                        worker.pid
                    )
                )
        # Any exit code other than 0 (including None) fails the task.
        if worker.exitcode != 0:
            death_reason = "process {} died: exit code {}".format(
                worker.pid, worker.exitcode
            )
            poller.fail_with_retry(token, task, reason=death_reason)

    while worker_is_running():
        # Wait at most one heartbeat period for the worker to finish.
        worker.join(timeout=heartbeat)
        if not worker_is_running():
            handle_worker_exit()
            return

        try:
            logger.debug("heartbeating for pid={} (token={})".format(worker.pid, token))
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as exc:
            # Either the task or the workflow execution no longer exists,
            # let's kill the worker process.
            logger.warning("heartbeat failed: {}".format(exc))
            logger.warning("killing (KILL) worker with pid={}".format(worker.pid))
            reap_process_tree(worker.pid)
            return
        except swf.exceptions.RateLimitExceededError as exc:
            # ignore rate limit errors: high chances the next heartbeat will be
            # ok anyway, so it would be stupid to break the task for that
            throttled = 'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                task.activity_type.name, exc
            )
            logger.warning(throttled)
            continue
        except Exception as exc:
            # Let's crash if it cannot notify the heartbeat failed.  The
            # subprocess will become orphan and the heartbeat timeout may
            # eventually trigger on Amazon SWF side.
            logger.error(
                "cannot send heartbeat for task {}: {}".format(task.activity_type.name, exc)
            )
            raise

        # Task cancelled.
        cancel_requested = response and response.get("cancelRequested")
        if cancel_requested:
            reap_process_tree(worker.pid)
            return
Exemplo n.º 12
0
def _log_message_too_long(message):
    """
    Log an error containing ``message``, cut to ``constants.MAX_LOG_FIELD``
    characters (with a truncation notice) when it is longer than that.
    """
    limit = constants.MAX_LOG_FIELD
    if len(message) > limit:
        message = "{} <...truncated to {} chars>".format(message[:limit], limit)
    logger.error("Message too long, will raise: {}".format(message))
Exemplo n.º 13
0
def _log_message_too_long(message):
    """
    Emit an error log for an over-long message.

    The logged text is limited to its first ``constants.MAX_LOG_FIELD``
    characters, followed by a note stating the truncation length.
    """
    max_chars = constants.MAX_LOG_FIELD
    shown = message
    if len(message) > max_chars:
        head = message[:max_chars]
        shown = "{} <...truncated to {} chars>".format(head, max_chars)
    logger.error("Message too long, will raise: {}".format(shown))
Exemplo n.º 14
0
def spawn(poller, token, task, heartbeat=60):
    """
    Start a worker process for the task and heartbeat while it runs.

    On activity timeouts and termination, the worker process and its
    children are reaped.

    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    """
    spawn_message = 'spawning new activity worker pid={} heartbeat={}'.format(
        os.getpid(), heartbeat)
    logger.info(spawn_message)

    worker = multiprocessing.Process(target=process_task, args=(poller, token, task))
    worker.start()

    def worker_alive():
        return psutil.pid_exists(worker.pid)

    while worker_alive():
        # Block for at most one heartbeat period waiting for the worker.
        worker.join(timeout=heartbeat)

        if not worker_alive():
            # Most certainly unneeded: we'll see
            if worker.exitcode is None:
                # race condition, try and re-join
                worker.join(timeout=0)
                if worker.exitcode is None:
                    stale_message = "process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                        worker.pid)
                    logger.warning(stale_message)
            if worker.exitcode != 0:
                died_reason = 'process {} died: exit code {}'.format(
                    worker.pid, worker.exitcode)
                poller.fail_with_retry(token, task, reason=died_reason)
            return

        try:
            logger.debug('heartbeating for pid={} (token={})'.format(worker.pid, token))
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as gone:
            # Either the task or the workflow execution no longer exists,
            # let's kill the worker process.
            logger.warning('heartbeat failed: {}'.format(gone))
            logger.warning('killing (KILL) worker with pid={}'.format(worker.pid))
            reap_process_tree(worker.pid)
            return
        except swf.exceptions.RateLimitExceededError as throttled:
            # ignore rate limit errors: high chances the next heartbeat will be
            # ok anyway, so it would be stupid to break the task for that
            throttle_message = 'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                task.activity_type.name, throttled)
            logger.warning(throttle_message)
            continue
        except Exception as exc:
            # Let's crash if it cannot notify the heartbeat failed.  The
            # subprocess will become orphan and the heartbeat timeout may
            # eventually trigger on Amazon SWF side.
            crash_message = 'cannot send heartbeat for task {}: {}'.format(
                task.activity_type.name, exc)
            logger.error(crash_message)
            raise

        # Task cancelled.
        cancel_requested = response and response.get('cancelRequested')
        if cancel_requested:
            reap_process_tree(worker.pid)
            return