def fail(self, token, task, reason=None, details=None):
    """
    Report the activity as failed; errors are logged, never raised.

    Delegates to ``swf.actors.ActivityWorker.fail``. If that call raises,
    the error is logged and ``None`` is returned.

    :param token: task token forwarded to ``ActivityWorker.fail``
    :type token:
    :param task: task being failed; only used to name the activity in the
        error log
    :type task:
    :param reason: forwarded as ``reason``
    :type reason:
    :param details: forwarded as ``details``
    :type details:
    :return: result of ``ActivityWorker.fail``, or None if it raised
    :rtype:
    """
    try:
        outcome = swf.actors.ActivityWorker.fail(
            self,
            token,
            reason=reason,
            details=details,
        )
    except Exception as err:
        message = 'cannot fail task {}: {}'.format(
            task.activity_type.name,
            err,
        )
        logger.error(message)
    else:
        return outcome
def run(self, token, task):
    """
    Send a heartbeat for ``task`` every ``self._interval`` seconds.

    The loop stops when:

    * the parent pid differs from the one seen at start: hard exit through
      ``os._exit(1)``;
    * the heartbeat raises ``swf.exceptions.DoesNotExistError``: return;
    * the heartbeat raises anything else: logged, then re-raised;
    * the response has a truthy ``cancelRequested``: return.

    :param token: task token passed to ``self.send_heartbeat``
    :param task: task being heartbeated; used for log messages only
    """
    initial_parent = os.getppid()

    while True:
        time.sleep(self._interval)

        # A different parent pid presumably means the original parent died.
        if os.getppid() != initial_parent:
            os._exit(1)

        try:
            beat_message = "heartbeat {} for task {}".format(
                time.time(), task.activity_type.name)
            logger.info(beat_message)
        except Exception:
            # Purely informational: never let this log line break the loop.
            pass

        try:
            response = self.send_heartbeat(token)
        except swf.exceptions.DoesNotExistError:
            # The task or its workflow execution is gone: nothing to report to.
            gone_message = "task {} no longer exists. Stopping heartbeat".format(
                task.activity_type.name)
            logger.warning(gone_message)
            return
        except Exception as error:
            # Unknown failure: log it and let it propagate.
            failure_message = "cannot send heartbeat for task {}: {}".format(
                task.activity_type.name, error)
            logger.error(failure_message)
            raise

        if response and response.get("cancelRequested"):
            return
def decorated(*args, **kwargs):
    """
    Call ``func`` and intercept the configured ``exceptions``.

    A caught exception is logged when ``log`` is exactly ``True``. It is
    then passed to ``handle_with`` together with the original arguments,
    and that handler's result is returned. With no handler configured the
    exception is re-raised.
    """
    try:
        result = func(*args, **kwargs)
    except exceptions as err:
        if log is True:
            message = "call to {} raised: {}".format(func.__name__, err)
            logger.error(message)
        if handle_with is not None:
            return handle_with(err, *args, **kwargs)
        raise
    return result
def process_decision(poller, decision_response):
    # type: (DeciderPoller, Response) -> None
    """
    Take a decision for one decision task and send it back.

    Clears ``format.JUMBO_FIELDS_MEMORY_CACHE``, asks the poller to decide,
    then completes the decision task. A failure while completing is logged
    and swallowed; a failure in ``poller.decide`` propagates.

    :param poller: decider poller providing ``decide`` and
        ``complete_with_retry``
    :param decision_response: response carrying the execution and the token
    """
    execution = decision_response.execution
    workflow_str = "workflow {} ({})".format(
        execution.workflow_id, poller.workflow_name
    )

    logger.debug("process_decision() pid={}".format(os.getpid()))
    logger.info("taking decision for {}".format(workflow_str))

    # Empty the cache before each decision.
    format.JUMBO_FIELDS_MEMORY_CACHE.clear()
    decisions = poller.decide(decision_response)

    try:
        logger.info("completing decision for {}".format(workflow_str))
        poller.complete_with_retry(decision_response.token, decisions)
    except Exception as err:
        failure = "cannot complete decision for {}: {}".format(workflow_str, err)
        logger.error(failure)
def decorated(*args, **kwargs):
    """
    Run ``func``, diverting the configured ``exceptions`` to ``handle_with``.

    The exception is logged only when ``log`` is exactly ``True``. If no
    handler is configured (``handle_with is None``) it is re-raised.
    Otherwise the handler receives the exception followed by the original
    arguments, and its return value becomes the result.
    """
    try:
        outcome = func(*args, **kwargs)
    except exceptions as err:
        if log is True:
            logger.error(
                'call to {} raised: {}'.format(func.__name__, err)
            )
        if handle_with is not None:
            return handle_with(err, *args, **kwargs)
        raise
    else:
        return outcome
def activity_rerun(domain, workflow_id, run_id, input, scheduled_id, activity_id):
    """
    Re-run one activity of an existing workflow execution in this process.

    The activity is looked up in the execution history by ``scheduled_id``
    or ``activity_id``, and at least one of the two is required. The
    workflow id and run id are added to the activity's ``context`` keyword
    argument before it runs. The JSON result is logged and nothing is
    returned.

    Exits with status 1 when neither id is given, or when the execution
    cannot be found.

    :param domain: domain passed to ``helpers.get_workflow_execution``
    :param workflow_id: workflow id of the execution
    :param run_id: run id of the execution
    :param input: optional encoded input; when truthy it is decoded and
        passed to ``helpers.find_activity`` as the input override
    :param scheduled_id: id used to locate the activity in the history
    :param activity_id: id used to locate the activity in the history
    """
    # Validate the parameters.
    if not (activity_id or scheduled_id):
        logger.error("Please supply --scheduled-id or --activity-id.")
        sys.exit(1)

    input_override = format.decode(input) if input else None

    # Locate the workflow execution.
    try:
        execution = helpers.get_workflow_execution(domain, workflow_id, run_id)
    except (swf.exceptions.DoesNotExistError, IndexError):
        logger.error("Couldn't find execution, exiting.")
        sys.exit(1)

    found_message = "Found execution: workflowId={} runId={}".format(
        execution.workflow_id, execution.run_id)
    logger.info(found_message)

    # Locate the activity in the parsed history.
    history = History(execution.history())
    history.parse()
    task, args, kwargs, meta, params = helpers.find_activity(
        history,
        scheduled_id=scheduled_id,
        activity_id=activity_id,
        input=input_override,
    )

    # Add the execution identifiers to the activity's context.
    execution_ids = {
        "workflow_id": execution.workflow_id,
        "run_id": execution.run_id,
    }
    kwargs["context"].update(execution_ids)

    logger.debug("Found activity. Last execution:")
    pretty_params = json_dumps(params, pretty=True)
    for line in pretty_params.split("\n"):
        logger.debug(line)

    if input_override:
        logger.info("NB: input will be overriden with the passed one!")

    rerun_message = "Will re-run: {}(*{}, **{}) [+meta={}]".format(
        task, args, kwargs, meta)
    logger.info(rerun_message)

    # Download binaries if the activity metadata lists any.
    download_binaries(meta.get("binaries", {}))

    # Run the activity task with the recovered arguments.
    activity_task = ActivityTask(task, *args, **kwargs)
    result = activity_task.execute()
    if hasattr(activity_task, "post_execute"):
        activity_task.post_execute()

    logger.info("Result (JSON): {}".format(json_dumps(result, compact=False)))
def reap_process_tree(pid, wait_timeout=settings.ACTIVITY_SIGTERM_WAIT_SEC):
    """
    TERMinates (and KILLs if necessary) a process and its descendants.

    Every process of the tree is terminated and given ``wait_timeout``
    seconds to exit. Survivors are killed and given ``wait_timeout``
    seconds again. Whatever is still alive after that is logged and
    abandoned.

    See also: https://psutil.readthedocs.io/en/latest/#kill-process-tree.

    :param pid: Process ID
    :type pid: int
    :param wait_timeout: Wait timeout in seconds, applied once after
        terminating and once more after killing. ``None`` waits without limit.
    :type wait_timeout: float
    :raises RuntimeError: if ``pid`` is the current process
    :raises psutil.NoSuchProcess: if no process with ``pid`` exists
        (raised by ``psutil.Process``)
    """

    def on_terminate(p):
        logger.info("process: terminated pid={} retcode={}".format(p.pid, p.returncode))

    if pid == os.getpid():
        raise RuntimeError("process: cannot terminate self!")

    parent = psutil.Process(pid)
    procs = parent.children(recursive=True)
    procs.append(parent)

    # Terminate
    for p in procs:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            pass
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout, callback=on_terminate)

    # Kill
    for p in alive:
        try:
            # status() raises if the process exited right after the wait
            # above; in that case there is nothing left to kill.
            status = p.status()
        except psutil.NoSuchProcess:
            continue
        logger.warning(
            "process: pid={} status={} did not respond to SIGTERM. Trying SIGKILL".format(
                p.pid, status
            )
        )
        try:
            p.kill()
        except psutil.NoSuchProcess:
            pass

    # Check. The wait must be bounded: without a timeout wait_procs() blocks
    # until every process is gone, so an unkillable process would hang the
    # caller and the "giving up" report below could never be reached.
    _, alive = psutil.wait_procs(alive, timeout=wait_timeout)
    for p in alive:
        try:
            status = p.status()
        except psutil.NoSuchProcess:
            # Exited between the wait and this report: not a survivor.
            continue
        logger.error(
            "process: pid={} status={} still alive. Giving up!".format(
                p.pid, status
            )
        )
def activity_rerun(domain, workflow_id, run_id, input, scheduled_id, activity_id):
    """
    Re-run one activity of an existing workflow execution in this process.

    The activity is looked up in the execution history by ``scheduled_id``
    or ``activity_id``, and at least one of the two is required. It is then
    executed with the arguments recovered from the history. The JSON result
    is logged and nothing is returned.

    Exits with status 1 when neither id is given, or when the execution
    cannot be found.

    :param domain: domain passed to ``helpers.get_workflow_execution``
    :param workflow_id: workflow id of the execution
    :param run_id: run id of the execution
    :param input: optional encoded input; when truthy it is decoded and
        passed to ``helpers.find_activity`` as the input override
    :param scheduled_id: id used to locate the activity in the history
    :param activity_id: id used to locate the activity in the history
    """
    # Validate the parameters.
    if not (activity_id or scheduled_id):
        logger.error("Please supply --scheduled-id or --activity-id.")
        sys.exit(1)

    input_override = format.decode(input) if input else None

    # Locate the workflow execution.
    try:
        execution = helpers.get_workflow_execution(domain, workflow_id, run_id)
    except (swf.exceptions.DoesNotExistError, IndexError):
        logger.error("Couldn't find execution, exiting.")
        sys.exit(1)

    logger.info(
        "Found execution: workflowId={} runId={}".format(
            execution.workflow_id, execution.run_id
        )
    )

    # Locate the activity in the parsed history.
    history = History(execution.history())
    history.parse()
    found = helpers.find_activity(
        history,
        scheduled_id=scheduled_id,
        activity_id=activity_id,
        input=input_override,
    )
    task, args, kwargs, meta, params = found

    logger.debug("Found activity. Last execution:")
    for line in json_dumps(params, pretty=True).split("\n"):
        logger.debug(line)

    if input_override:
        logger.info("NB: input will be overriden with the passed one!")

    logger.info(
        "Will re-run: {}(*{}, **{}) [+meta={}]".format(task, args, kwargs, meta)
    )

    # Download binaries if the activity metadata lists any.
    binaries = meta.get("binaries", {})
    download_binaries(binaries)

    # Run the activity task with the recovered arguments.
    activity_task = ActivityTask(task, *args, **kwargs)
    result = activity_task.execute()
    if hasattr(activity_task, 'post_execute'):
        activity_task.post_execute()

    result_json = json_dumps(result, compact=False)
    logger.info("Result (JSON): {}".format(result_json))
def reap_process_tree(pid, wait_timeout=settings.ACTIVITY_SIGTERM_WAIT_SEC):
    """
    TERMinates (and KILLs if necessary) a process and its descendants.

    Every process of the tree is terminated and given ``wait_timeout``
    seconds to exit. Survivors are killed and given ``wait_timeout``
    seconds again. Whatever is still alive after that is logged and
    abandoned.

    See also: https://psutil.readthedocs.io/en/latest/#kill-process-tree.

    :param pid: Process ID
    :type pid: int
    :param wait_timeout: Wait timeout in seconds, applied once after
        terminating and once more after killing. ``None`` waits without limit.
    :type wait_timeout: float
    :raises RuntimeError: if ``pid`` is the current process
    :raises psutil.NoSuchProcess: if no process with ``pid`` exists
        (raised by ``psutil.Process``)
    """

    def on_terminate(p):
        logger.info('process: terminated pid={} retcode={}'.format(p.pid, p.returncode))

    if pid == os.getpid():
        raise RuntimeError('process: cannot terminate self!')

    parent = psutil.Process(pid)
    procs = parent.children(recursive=True)
    procs.append(parent)

    # Terminate
    for p in procs:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            pass
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout, callback=on_terminate)

    # Kill
    for p in alive:
        try:
            # status() raises if the process exited right after the wait
            # above; in that case there is nothing left to kill.
            status = p.status()
        except psutil.NoSuchProcess:
            continue
        logger.warning(
            'process: pid={} status={} did not respond to SIGTERM. Trying SIGKILL'.format(
                p.pid, status))
        try:
            p.kill()
        except psutil.NoSuchProcess:
            pass

    # Check. The wait must be bounded: without a timeout wait_procs() blocks
    # until every process is gone, so an unkillable process would hang the
    # caller and the "giving up" report below could never be reached.
    _, alive = psutil.wait_procs(alive, timeout=wait_timeout)
    for p in alive:
        try:
            status = p.status()
        except psutil.NoSuchProcess:
            # Exited between the wait and this report: not a survivor.
            continue
        logger.error(
            'process: pid={} status={} still alive. Giving up!'.format(p.pid, status))
def spawn(poller, token, task, heartbeat=60):
    """
    Run ``process_task`` in a child process and heartbeat until it ends.

    The child is joined for at most ``heartbeat`` seconds at a time, with a
    heartbeat sent through the poller between joins. The function returns
    when the child is gone (failing the task if its exit code is not 0),
    when the heartbeat reports that the task no longer exists, or when
    cancellation is requested. In the last two cases the child and its
    descendants are reaped.

    :param poller: provides ``heartbeat`` and ``fail_with_retry``
    :type poller: ActivityPoller
    :param token: task token
    :type token: str
    :param task: activity task handed to the child process
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    """
    spawn_message = "spawning new activity worker pid={} heartbeat={}".format(
        os.getpid(), heartbeat
    )
    logger.info(spawn_message)

    child = multiprocessing.Process(target=process_task, args=(poller, token, task))
    child.start()

    def child_running():
        return psutil.pid_exists(child.pid)

    while child_running():
        child.join(timeout=heartbeat)

        if not child_running():
            # The pid is gone; make sure multiprocessing has an exit code.
            if child.exitcode is None:
                # Possible race: try one non-blocking join.
                child.join(timeout=0)
                if child.exitcode is None:
                    logger.warning(
                        "process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                            child.pid
                        )
                    )
            if child.exitcode != 0:
                death_reason = "process {} died: exit code {}".format(
                    child.pid, child.exitcode
                )
                poller.fail_with_retry(token, task, reason=death_reason)
            return

        try:
            logger.debug("heartbeating for pid={} (token={})".format(child.pid, token))
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as error:
            # The task or its workflow execution is gone: stop the child.
            logger.warning("heartbeat failed: {}".format(error))
            logger.warning("killing (KILL) worker with pid={}".format(child.pid))
            reap_process_tree(child.pid)
            return
        except swf.exceptions.RateLimitExceededError as error:
            # Throttling is likely transient: keep the task running and try
            # again on the next iteration.
            throttle_message = 'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                task.activity_type.name, error
            )
            logger.warning(throttle_message)
            continue
        except Exception as error:
            # Unknown failure: log and propagate. The child is left running,
            # so the heartbeat timeout may eventually trigger on the Amazon
            # SWF side.
            crash_message = "cannot send heartbeat for task {}: {}".format(
                task.activity_type.name, error
            )
            logger.error(crash_message)
            raise

        # Cancellation was requested for the task.
        if response and response.get("cancelRequested"):
            reap_process_tree(child.pid)
            return
def _log_message_too_long(message):
    """
    Log an error about an over-long message, truncating it for the log.

    Only the first ``constants.MAX_LOG_FIELD`` characters of a longer
    message are logged, followed by a truncation marker. Nothing is raised
    here.

    :param message: the offending message
    """
    limit = constants.MAX_LOG_FIELD
    if len(message) > limit:
        head = message[:constants.MAX_LOG_FIELD]
        message = "{} <...truncated to {} chars>".format(
            head, constants.MAX_LOG_FIELD)
    report = "Message too long, will raise: {}".format(message)
    logger.error(report)
def spawn(poller, token, task, heartbeat=60):
    """
    Start a worker process for ``task`` and heartbeat while it runs.

    Each loop iteration joins the worker for up to ``heartbeat`` seconds:

    * if the worker's pid no longer exists, the task is failed when the
      exit code is not 0, and the function returns;
    * otherwise a heartbeat is sent. The worker's process tree is reaped
      and the function returns if the task no longer exists or
      cancellation was requested.

    :param poller: provides ``heartbeat`` and ``fail_with_retry``
    :type poller: ActivityPoller
    :param token: task token
    :type token: str
    :param task: activity task handed to the worker process
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    """
    logger.info(
        'spawning new activity worker pid={} heartbeat={}'.format(
            os.getpid(), heartbeat
        )
    )

    process = multiprocessing.Process(
        target=process_task,
        args=(poller, token, task),
    )
    process.start()

    def still_running():
        return psutil.pid_exists(process.pid)

    while still_running():
        process.join(timeout=heartbeat)

        if not still_running():
            # The pid is gone; make sure multiprocessing has an exit code.
            if process.exitcode is None:
                # Possible race: try one non-blocking join.
                process.join(timeout=0)
                if process.exitcode is None:
                    stale_message = "process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                        process.pid)
                    logger.warning(stale_message)
            if process.exitcode != 0:
                poller.fail_with_retry(
                    token,
                    task,
                    reason='process {} died: exit code {}'.format(
                        process.pid, process.exitcode),
                )
            return

        try:
            debug_message = 'heartbeating for pid={} (token={})'.format(
                process.pid, token)
            logger.debug(debug_message)
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as error:
            # The task or its workflow execution is gone: stop the worker.
            logger.warning('heartbeat failed: {}'.format(error))
            logger.warning('killing (KILL) worker with pid={}'.format(process.pid))
            reap_process_tree(process.pid)
            return
        except swf.exceptions.RateLimitExceededError as error:
            # Throttling is likely transient: keep the task running and try
            # again on the next iteration.
            logger.warning(
                'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                    task.activity_type.name, error))
            continue
        except Exception as error:
            # Unknown failure: log and propagate. The worker is left running,
            # so the heartbeat timeout may eventually trigger on the Amazon
            # SWF side.
            logger.error(
                'cannot send heartbeat for task {}: {}'.format(
                    task.activity_type.name, error))
            raise

        # Cancellation was requested for the task.
        if response and response.get('cancelRequested'):
            reap_process_tree(process.pid)
            return