Example #1
def test_fail(self):
    retry_delay = 1
    start = time.time()
    with self.assertRaises(AssertionError):
        handle_broker_timeout(fail, retry_delay=retry_delay)
    delta = time.time() - start
    # Ensure we did not sleep or retry: non-broker errors should propagate immediately
    self.assertLess(delta, retry_delay)
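
These tests exercise small helpers (`fail`, `fail_with_timeout`, `foo`, `bar`, `SucceedAfter`) that the excerpt does not include. The sketch below reconstructs them purely from how the tests call them; the bodies are assumptions, and the retryable error is assumed to be the builtin TimeoutError.

# Hypothetical test fixtures, inferred from the assertions in these examples

def foo():
    return 1

def bar(a, b=1):
    return a + b

def fail():
    # A non-broker error: handle_broker_timeout should propagate it immediately
    raise AssertionError('boom')

def fail_with_timeout():
    # A broker-style error: retried until the overall timeout expires
    raise TimeoutError('simulated broker timeout')

class SucceedAfter:
    """Raises TimeoutError for the first `n` calls, then returns None."""

    def __init__(self, n):
        self.remaining = n

    def foo(self):
        if self.remaining > 0:
            self.remaining -= 1
            raise TimeoutError('still failing')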
Example #2
def update_task_name(sender, task_id, *_args, **_kwargs):
    # Although the name was populated by populate_task_info at before_task_publish,
    # it can be inaccurate if the task was a plugin. We can only overwrite it with
    # the accurate name at task_prerun.
    callable_func = current_app.backend.client.hset
    args = (task_id, 'name', sender.name)
    handle_broker_timeout(callable_func=callable_func,
                          args=args,
                          timeout=5 * 60,
                          reraise_on_timeout=False)
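
The `sender` and `task_id` parameters match Celery's task_prerun signal. The handler is presumably registered with `celery.signals.task_prerun.connect`; the registration below is an assumption, though the connect API itself is standard Celery.

from celery.signals import task_prerun

# Hypothetical registration: run update_task_name at task_prerun
task_prerun.connect(update_task_name)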
Example #3
def test_success_not_reached_due_to_timeout(self):
    retry_delay = 0.1
    succeed_after_retries = 5
    timeout = 2 * retry_delay
    obj = SucceedAfter(succeed_after_retries)
    start = time.time()
    with self.assertRaises(TimeoutError):
        handle_broker_timeout(obj.foo,
                              retry_delay=retry_delay,
                              timeout=timeout)
    delta = time.time() - start
    self.assertGreater(delta, timeout)
    self.assertLess(delta, timeout + retry_delay)
Example #4
def test_fail_with_timeout(self):
    timeout = 0.5
    retry_delay = 0.1
    start = time.time()
    with self.assertRaises(TimeoutError):
        handle_broker_timeout(fail_with_timeout,
                              retry_delay=retry_delay,
                              timeout=timeout)
    delta = time.time() - start
    self.assertGreater(delta, timeout)
    # handle_broker_timeout now uses an exponential retry delay, so allow headroom
    self.assertLess(delta, timeout + (retry_delay * 10))
Example #5
def is_result_ready(result: AsyncResult, timeout=15 * 60, retry_delay=1):
    """
    Protect against broker being temporary unreachable and throwing a TimeoutError
    """
    return handle_broker_timeout(result.ready,
                                 timeout=timeout,
                                 retry_delay=retry_delay)
Example #6
def get_task_info_from_result(result, key: str = None):
    try:
        backend = result.app.backend
    except AttributeError:
        backend = current_app.backend

    if key is not None:
        info = handle_broker_timeout(backend.client.hget,
                                     args=(str(result), key))
    else:
        info = handle_broker_timeout(backend.client.get, args=(str(result), ))

    if info is None:
        info = ''
    else:
        info = info.decode()
    return info
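
Together with Example #2, this forms the read path for task metadata: `update_task_name` stores the name with hset, and passing key='name' here reads it back with hget. A hypothetical usage line:

# Hypothetical usage: read back the name stored by update_task_name
task_name = get_task_info_from_result(result, key='name')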
Example #7
def _check_for_failure_in_parents(result, timeout=15 * 60, retry_delay=1):
    failed_parent = revoked_parent = None
    parent = handle_broker_timeout(getattr,
                                   args=(result, 'parent'),
                                   timeout=timeout,
                                   retry_delay=retry_delay)
    while parent and parent != result:

        state = handle_broker_timeout(getattr,
                                      args=(parent, 'state'),
                                      timeout=timeout,
                                      retry_delay=retry_delay)
        if state == FAILURE:
            failed_parent = parent
            break

        if state == REVOKED or RevokedRequests.instance().is_revoked(parent):
            revoked_parent = parent
            break

        result = parent
        parent = handle_broker_timeout(getattr,
                                       args=(parent, 'parent'),
                                       timeout=timeout,
                                       retry_delay=retry_delay)
    else:
        return  # <-- loop finished with no errors in parents

    if revoked_parent:
        raise ChainRevokedException(
            task_id=str(revoked_parent),
            task_name=get_task_name_from_result(revoked_parent))

    #  If we get here, failed_parent holds a failed parent
    parent = handle_broker_timeout(getattr,
                                   args=(failed_parent, 'parent'),
                                   timeout=timeout,
                                   retry_delay=retry_delay)
    # Find the earliest failed parent, now that celery propagates parent failures
    while parent and parent != failed_parent:
        parent_failed = handle_broker_timeout(parent.failed,
                                              timeout=timeout,
                                              retry_delay=retry_delay)
        if not parent_failed:
            break

        failed_parent = parent
        parent = handle_broker_timeout(getattr,
                                       args=(parent, 'parent'),
                                       timeout=timeout,
                                       retry_delay=retry_delay)

    cause = handle_broker_timeout(getattr,
                                  args=(failed_parent, 'result'),
                                  timeout=timeout,
                                  retry_delay=retry_delay)
    cause = cause if isinstance(cause, Exception) else None
    raise ChainInterruptedException(
        task_id=str(failed_parent),
        task_name=get_task_name_from_result(failed_parent),
        cause=cause)
Example #8
def test_succeed_after_retries(self):
    retry_delay = 0.1
    succeed_after_retries = 3
    obj = SucceedAfter(succeed_after_retries)
    start = time.time()
    self.assertIsNone(
        handle_broker_timeout(obj.foo, retry_delay=retry_delay))
    delta = time.time() - start
    self.assertGreater(delta, succeed_after_retries * retry_delay)
    self.assertLess(delta, (succeed_after_retries + 1) * retry_delay)
Example #9
def wait_on_async_results(results,
                          max_wait=None,
                          callbacks: Iterable[WaitLoopCallBack] = tuple(),
                          sleep_between_iterations=0.05,
                          check_task_worker_frequency=600,
                          fail_on_worker_failures=7,
                          log_msg=True,
                          **_kwargs):
    if not results:
        return

    if isinstance(results, AsyncResult):
        results = [results]

    max_sleep = sleep_between_iterations * 20 * 15  # Somewhat arbitrary
    failures = []
    start_time = time.monotonic()
    last_callback_time = {callback.func: start_time for callback in callbacks}
    for result in results:
        logging_name = get_result_logging_name(result)
        if log_msg:
            logger.debug('-> Waiting for %s to complete', logging_name)

        try:
            task_worker_failures = 0
            last_dead_task_worker_check = time.monotonic()
            while not is_result_ready(result):
                if RevokedRequests.instance().is_revoked(result):
                    break
                _check_for_failure_in_parents(result)

                current_time = time.monotonic()
                if max_wait and (current_time - start_time) > max_wait:
                    logging_name = get_result_logging_name(result)
                    raise WaitOnChainTimeoutError(
                        'Result ID %s was not ready in %d seconds' %
                        (logging_name, max_wait))

                # Invoke periodic callbacks whose interval has elapsed
                for callback in callbacks:
                    elapsed = current_time - last_callback_time[callback.func]
                    if elapsed > callback.frequency:
                        callback.func(**callback.kwargs)
                        last_callback_time[callback.func] = current_time

                # Check for dead workers
                if check_task_worker_frequency and fail_on_worker_failures and \
                        (current_time - last_dead_task_worker_check) > check_task_worker_frequency:
                    alive = _is_worker_alive(result=result)
                    last_dead_task_worker_check = current_time
                    if not alive:
                        task_worker_failures += 1
                        logger.warning(
                            f'Task {get_task_name_from_result(result)} appears to be a zombie.'
                            f' Failures: {task_worker_failures}')
                        if task_worker_failures >= fail_on_worker_failures:
                            task_id = str(result)
                            task_name = get_task_name_from_result(result)
                            raise ChainInterruptedByZombieTaskException(
                                task_id=task_id, task_name=task_name)
                    else:
                        task_worker_failures = 0

                time.sleep(sleep_between_iterations)
                # Exponential backoff on the polling interval, capped at max_sleep
                sleep_between_iterations = min(sleep_between_iterations * 1.01,
                                               max_sleep)

            # If failure happened in a chain, raise from the failing task within the chain
            _check_for_failure_in_parents(result)

            result_state = handle_broker_timeout(getattr,
                                                 args=(result, 'state'))
            if result_state == REVOKED:
                #  wait for revoked tasks to actually finish running
                wait_for_running_tasks_from_results([result])
                raise ChainRevokedException(
                    task_id=str(result),
                    task_name=get_task_name_from_result(result))
            if result_state == PENDING:
                # Pending tasks can be in revoke list. State will still be PENDING.
                raise ChainRevokedPreRunException(
                    task_id=str(result),
                    task_name=get_task_name_from_result(result))
            if result_state == FAILURE:
                cause = result.result if isinstance(result.result, Exception) else None
                raise ChainInterruptedException(
                    task_id=str(result),
                    task_name=get_task_name_from_result(result),
                    cause=cause)

        except (ChainRevokedException, ChainInterruptedException) as e:
            failures.append(e)

    if len(failures) == 1:
        raise failures[0]
    elif failures:
        failed_task_ids = [
            e.task_id for e in failures if hasattr(e, 'task_id')
        ]
        multi_exception = MultipleFailuresException(failed_task_ids)
        multi_exception.failures = failures
        raise multi_exception
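
The loop accesses `callback.func`, `callback.frequency`, and `callback.kwargs`, so `WaitLoopCallBack` is presumably a small record type. A minimal sketch under that assumption (the actual definition is not shown in these examples):

from typing import Callable, NamedTuple

class WaitLoopCallBack(NamedTuple):
    func: Callable     # invoked as func(**kwargs)
    frequency: float   # minimum seconds between invocations
    kwargs: dict       # keyword arguments passed on each call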
Example #10
def _is_worker_alive(result: AsyncResult, retries=1):
    task_name = get_result_logging_name(result)
    tries = 0

    # NOTE: We retry to guard against a false negative when the task changes host in the
    # small timing window between getting task state/info and checking for aliveness.
    # Retries for broker issues are handled downstream.
    while tries <= retries:
        state = handle_broker_timeout(lambda r: r.state, args=(result, ))
        if not state:
            logger.debug(
                f'Cannot get state for {task_name}; assuming task is alive')
            return True

        if state in (STARTED, RECEIVED):
            # Query the worker to see if it knows about this task
            info = handle_broker_timeout(lambda r: r.info, args=(result, ))
            try:
                # NOTE: if the task completes after the check for state right above but before the call
                # to handle_broker_timeout(), the type of 'info' is whatever the task returned, not the internal
                # Celery dictionary we want. It can be an exception, or even a dictionary with a random 'hostname'.
                # In the latter case _is_worker_alive() will return False, but since we retry _is_worker_alive() that
                # should be fine -- this timing issue cannot happen twice for the same task.
                hostname = info.get('hostname')
            except AttributeError:
                hostname = None

            if not hostname:
                logger.debug(
                    f'Cannot get run info for {task_name}; assuming task is alive.'
                    f' Info: {info}, Hostname: {hostname}')
                return True

            task_id = result.id
            task_info = get_task(method_args=(task_id, ),
                                 destination=(hostname, ),
                                 timeout=60)
            if task_info and any(task_info.values()):
                return True

            # Try get_active and get_reserved, since we suspect query_task (the api used by get_task above)
            # may be broken sometimes.
            active_tasks = get_active(destination=(hostname, ), timeout=60)
            task_list = active_tasks.get(hostname) if active_tasks else None
            if task_list:
                for task in task_list:
                    this_task_id = task.get('id')
                    if this_task_id == task_id:
                        return True

            reserved_tasks = get_reserved(destination=(hostname, ), timeout=60)
            task_list = reserved_tasks.get(hostname) if reserved_tasks else None
            if task_list:
                for task in task_list:
                    this_task_id = task.get('id')
                    if this_task_id == task_id:
                        return True

            logger.debug(
                f'Task inspection for {task_name} on {hostname} with id '
                f'of {task_id} returned:\n{pformat(task_info)}\n'
                f'Active tasks:\n{pformat(active_tasks)}\n'
                f'Reserved tasks:\n{pformat(reserved_tasks)}')

        elif state in (PENDING, RETRY):
            # Check if task queue is alive
            task_queue = get_task_queue_from_result(result)
            if not task_queue:
                logger.debug(
                    f'Cannot get task queue for {task_name}; assuming task is alive.'
                )
                return True

            queue_seen = was_queue_ready(queue_name=task_queue)
            if not queue_seen:
                logger.debug(
                    f'Queue "{task_queue}" for {task_name} not seen yet; assuming task is alive.'
                )
                return True

            queues = get_active_queues(timeout=60)
            active_queues = {
                queue['name']
                for node in queues.values() for queue in node
            } if queues else set()
            if task_queue in active_queues:
                return True

            logger.debug(
                f'Active queues inspection for {task_name} on queue {task_queue} returned:\n'
                f'{pformat(queues)}\n'
                f'Active queues: {pformat(active_queues)}')

        elif state == SUCCESS:
            return True  # Timing; possible if task state changed after we waited on it but before we got here

        else:
            logger.debug(
                f'Unknown state ({state}) for task {task_name}; assuming task is alive.'
            )
            return True

        tries += 1
        logger.info(
            f'Task {task_name} is not responding to queries. Tries: {tries}')

    return False
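
The helpers `get_task`, `get_active`, `get_reserved`, and `get_active_queues` are not shown here, but the comment about query_task suggests they wrap Celery's remote-control inspect API. A minimal sketch under that assumption; the wrapper signatures are guesses, while `query_task`, `active`, `reserved`, and `active_queues` are real Inspect methods:

from celery import current_app

# Hypothetical thin wrappers around Celery's inspect API
def get_task(method_args, destination, timeout):
    inspect = current_app.control.inspect(destination=list(destination),
                                          timeout=timeout)
    return inspect.query_task(*method_args)

def get_active(destination, timeout):
    return current_app.control.inspect(destination=list(destination),
                                       timeout=timeout).active()

def get_reserved(destination, timeout):
    return current_app.control.inspect(destination=list(destination),
                                       timeout=timeout).reserved()

def get_active_queues(timeout):
    return current_app.control.inspect(timeout=timeout).active_queues()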
Example #11
def test_passing_callable_with_args_and_kwargs(self):
    self.assertEqual(
        handle_broker_timeout(bar, args=(1, ), kwargs={'b': 2}), 3)
Example #12
def test_passing_callable_with_kwargs(self):
    self.assertEqual(handle_broker_timeout(bar, kwargs={'a': 1}), 2)
Example #13
def test_passing_callable_with_args(self):
    self.assertEqual(handle_broker_timeout(bar, args=(1, )), 2)
Example #14
def test_passing_callable(self):
    self.assertEqual(handle_broker_timeout(foo), 1)
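
None of the examples show `handle_broker_timeout` itself. Piecing its contract together from the calls above (a callable plus optional args/kwargs, retry_delay, timeout, reraise_on_timeout, an exponential retry delay per Example #4, and immediate propagation of non-timeout errors per Example #1), a minimal reconstruction might look like this; it is a sketch under those assumptions, not the actual implementation:

import time

def handle_broker_timeout(callable_func, args=(), kwargs=None, timeout=None,
                          retry_delay=1, reraise_on_timeout=True):
    """Call callable_func, retrying TimeoutError with growing delays until timeout."""
    kwargs = kwargs or {}
    deadline = time.monotonic() + timeout if timeout else None
    delay = retry_delay
    while True:
        try:
            return callable_func(*args, **kwargs)
        except TimeoutError:
            if deadline is not None:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    if reraise_on_timeout:
                        raise
                    return None
                time.sleep(min(delay, remaining))
            else:
                time.sleep(delay)
            # Gentle exponential growth, mirroring the backoff in Example #9
            delay *= 1.01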