Пример #1
0
def _handle_success(ns, q, lock):
    """Finish off a job that ran without error.

    In order: record `completed=True` for (`ns.app_name`, `ns.job_id`) via
    `qb.set_state`, call `q.consume()`, call `lock.release()`, and finally
    emit an info-level log line carrying the same fields.

    Args:
        ns: object exposing `app_name` and `job_id` attributes.
        q: queue object; only its `consume()` method is used here.
        lock: lock object; only its `release()` method is used here.
    """
    completed_state = dict(
        app_name=ns.app_name, job_id=ns.job_id, completed=True)
    qb.set_state(**completed_state)

    q.consume()
    lock.release()

    # Re-read ns for the log record rather than reusing the dict above, so
    # the logged fields reflect ns at the time of logging.
    log_fields = dict(
        app_name=ns.app_name, job_id=ns.job_id, completed=True)
    log.info("successfully completed job", extra=log_fields)
Пример #2
0
def _handle_success(ns, q, lock):
    """Mark a job as completed, dequeue it, unlock it and log the outcome.

    The steps run strictly in this order:
      1. `qb.set_state(..., completed=True)` for `ns.app_name`/`ns.job_id`
      2. `q.consume()`
      3. `lock.release()`
      4. an info log line with app_name, job_id and completed=True

    Args:
        ns: object exposing `app_name` and `job_id` attributes.
        q: queue object providing `consume()`.
        lock: lock object providing `release()`.
    """
    state_kwargs = dict(
        app_name=ns.app_name,
        job_id=ns.job_id,
        completed=True,
    )
    qb.set_state(**state_kwargs)
    q.consume()
    lock.release()

    # Build the log context from ns again (after the steps above) instead
    # of sharing the dict that was unpacked into set_state.
    log_context = dict(
        app_name=ns.app_name,
        job_id=ns.job_id,
        completed=True,
    )
    log.info("successfully completed job", extra=log_context)
Пример #3
0
def _send_to_back_of_queue(q, app_name, job_id):
    """Re-add a job via `qb.readd_subtask` and consume its current entry.

    This exists so un-runnable tasks don't hog the front of the queue and
    soak up resources.

    If `exceptions.JobAlreadyQueued` is raised while re-adding (or by any
    later statement in the `try` body), the situation is only logged.

    Args:
        q: queue object; `consume()` is called after a successful re-add.
        app_name: passed to `qb.readd_subtask` and attached to log records.
        job_id: passed to `qb.readd_subtask` and attached to log records.
    """
    log_fields = dict(app_name=app_name, job_id=job_id)
    try:
        # NOTE(review): `_force=True` presumably bypasses a safety check in
        # readd_subtask -- confirm against its definition.
        qb.readd_subtask(app_name, job_id, _force=True)
        q.consume()
        log.info("Job sent to back of queue", extra=log_fields)
    except exceptions.JobAlreadyQueued:
        already_queued_msg = (
            "Job already queued. Cannot send to back of queue.")
        log.info(already_queued_msg, extra=log_fields)
Пример #4
0
def _send_to_back_of_queue(q, app_name, job_id):
    """Requeue `job_id` through `qb.readd_subtask`, then consume it from `q`.

    Rationale: un-runnable tasks should not hog the front of the queue and
    soak up resources.

    An `exceptions.JobAlreadyQueued` raised inside the `try` body is caught
    and reported with an info-level log line; nothing is re-raised.

    Args:
        q: queue object whose `consume()` is called once re-adding succeeded.
        app_name: application name, forwarded to `qb.readd_subtask`.
        job_id: job identifier, forwarded to `qb.readd_subtask`.
    """
    sent_msg = "Job sent to back of queue"
    duplicate_msg = "Job already queued. Cannot send to back of queue."
    context = dict(app_name=app_name, job_id=job_id)

    try:
        qb.readd_subtask(app_name, job_id, _force=True)
        q.consume()
        log.info(sent_msg, extra=context)
    except exceptions.JobAlreadyQueued:
        log.info(duplicate_msg, extra=context)
Пример #5
0
def validate_job_id(app_name, job_id, q, timeout):
    """Report whether `job_id` can be used for `app_name`.

    Returns:
        True when `job_id` is not None and `dt.parse_job_id` accepts it.
        False otherwise, after the cleanup described below.

    Cleanup performed before returning False:
      - `job_id is None`: only an info message mentioning `timeout` is
        logged; the queue is left untouched.
      - `dt.parse_job_id` raises `exceptions.InvalidJobId`: the error is
        logged, `q.consume()` is called, and the job is marked
        `failed=True` through `qb._set_state_unsafe`.
    """
    if job_id is None:
        no_job_msg = 'No jobs found in %d seconds...' % timeout
        log.info(no_job_msg, extra=dict(app_name=app_name))
        return False

    try:
        dt.parse_job_id(app_name, job_id)
    except exceptions.InvalidJobId as err:
        template = (
            "Stolos found an invalid job_id.  Removing it from queue"
            " and marking that job_id as failed.  Error details: %s")
        log.error(
            template % err, extra=dict(app_name=app_name, job_id=job_id))
        q.consume()
        qb._set_state_unsafe(app_name, job_id, failed=True)
        return False
    else:
        return True
Пример #6
0
def validate_job_id(app_name, job_id, q, timeout):
    """Return True if `job_id` is valid for `app_name`, else clean up.

    Two invalid cases are handled, both returning False:

    * `job_id` is None -- an info line reporting `timeout` seconds without
      jobs is logged and nothing else happens.
    * `dt.parse_job_id(app_name, job_id)` raises
      `exceptions.InvalidJobId` -- the error is logged, the job is removed
      with `q.consume()`, and `qb._set_state_unsafe` records it as failed.
    """
    got_a_job = job_id is not None
    if not got_a_job:
        log.info(
            'No jobs found in %d seconds...' % timeout,
            extra=dict(app_name=app_name))
        return False

    try:
        dt.parse_job_id(app_name, job_id)
    except exceptions.InvalidJobId as err:
        error_msg = (
            "Stolos found an invalid job_id.  Removing it from queue"
            " and marking that job_id as failed.  Error details: %s"
        ) % err
        error_fields = dict(app_name=app_name, job_id=job_id)
        log.error(error_msg, extra=error_fields)
        q.consume()
        qb._set_state_unsafe(app_name, job_id, failed=True)
        return False

    return True
Пример #7
0
def main(ns):
    """
    Fetch a job_id from the `app_name` queue and figure out what to do
    with it.

    If the job is runnable, execute it and then queue its children into their
    respective queues.  If it's not runnable, queue its parents into respective
    parent queues and remove the job from its own queue.
    If the job fails, either requeue it or mark it as permanently failed.

    Control flow as implemented here:
      1. `ns.bypass_scheduler` set: call `ns.job_type_func(ns=ns)` directly
         and return without creating a queue.
      2. Otherwise take `ns.job_id` if it was given, or fetch one with
         `q.get(timeout=ns.timeout)` and validate it.
      3. If the lock is exactly `False`, send the job to the back of the
         queue and return.
      4. If `parents_completed(...)` is falsy, return.
      5. Run `ns.job_type_func`.  `exceptions.CodeError` goes to
         `_handle_failure`; any other `Exception` is logged and swallowed;
         a clean run goes to `_handle_success`.
    """
    def _job_extra(**more):
        # Fresh `extra` dict per log call, read from ns at call time.
        return dict(app_name=ns.app_name, job_id=ns.job_id, **more)

    assert ns.app_name in dt.get_task_names()

    if ns.bypass_scheduler:
        log.info(
            "Running a task without scheduling anything"
            " or fetching from a queue",
            extra=_job_extra())
        assert ns.job_id
        ns.job_type_func(ns=ns)
        return

    log.info("Beginning Stolos", extra=dict(**ns.__dict__))
    q = qb.get_qbclient().LockingQueue(ns.app_name)

    if not ns.job_id:
        # No job_id supplied by the caller: take one from the queue.
        ns.job_id = q.get(timeout=ns.timeout)
        job_is_valid = validate_job_id(
            app_name=ns.app_name, job_id=ns.job_id, q=q, timeout=ns.timeout)
        if not job_is_valid:
            return
        try:
            lock = get_lock_if_job_is_runnable(
                app_name=ns.app_name, job_id=ns.job_id)
        except exceptions.NoNodeError:
            q.consume()
            log.exception(
                "Job failed. The job is queued, so why does its state not"
                " exist?  The Queue backend may be in an inconsistent state."
                " Consuming this job",
                extra=_job_extra())
            return
    else:
        lock = _handle_manually_given_job_id(ns)
        # The job did not come from the queue, so later q.consume() calls
        # are replaced by a harmless callable.
        q.consume = object  # do nothing

    log.debug(
        "Stolos got a job_id.", extra=_job_extra(acquired_lock=bool(lock)))

    if lock is False:
        # infinite loop: some jobs will always requeue if lock is unobtainable
        log.info(
            "Could not obtain a lock.  Will requeue and try again later",
            extra=_job_extra())
        _send_to_back_of_queue(q=q, app_name=ns.app_name, job_id=ns.job_id)
        return

    ready_to_run = parents_completed(ns.app_name, ns.job_id, q=q, lock=lock)
    if not ready_to_run:
        return

    log.info("Job starting!", extra=_job_extra())
    try:
        ns.job_type_func(ns=ns)
    except exceptions.CodeError:  # assume error is previously logged
        _handle_failure(ns, q, lock)
    except Exception as err:
        unhandled_template = (
            "Job failed!  Unhandled exception in an application!"
            " Fix ASAP because"
            " it is unclear how to handle this failure.  %s: %s")
        log.exception(
            unhandled_template % (err.__class__.__name__, err),
            extra=_job_extra(failed=True))
    else:
        _handle_success(ns, q, lock)
Пример #8
0
def main(ns):
    """
    Take a single job_id for `ns.app_name` and decide how to process it.

    If the job is runnable, execute it and then queue its children into their
    respective queues.  If it's not runnable, queue its parents into respective
    parent queues and remove the job from its own queue.
    If the job fails, either requeue it or mark it as permanently failed.

    Outcomes visible in this function:
      * `ns.bypass_scheduler`: run `ns.job_type_func(ns=ns)` and return; no
        queue is created.
      * invalid job_id, or `exceptions.NoNodeError` while getting the lock:
        return early (the latter after `q.consume()` and a logged traceback).
      * lock is exactly `False`: `_send_to_back_of_queue` then return.
      * `parents_completed(...)` falsy: return.
      * `exceptions.CodeError` from the job: `_handle_failure`.
      * any other `Exception` from the job: logged with traceback only.
      * no exception: `_handle_success`.
    """
    assert ns.app_name in dt.get_task_names()

    if ns.bypass_scheduler:
        bypass_msg = (
            "Running a task without scheduling anything"
            " or fetching from a queue")
        bypass_fields = dict(app_name=ns.app_name, job_id=ns.job_id)
        log.info(bypass_msg, extra=bypass_fields)
        assert ns.job_id
        ns.job_type_func(ns=ns)
        return

    startup_fields = dict(**ns.__dict__)
    log.info("Beginning Stolos", extra=startup_fields)
    q = qb.get_qbclient().LockingQueue(ns.app_name)

    if ns.job_id:
        lock = _handle_manually_given_job_id(ns)
        # A manually supplied job was never fetched from q, so consuming is
        # turned into a no-op for the rest of this call.
        q.consume = object  # do nothing
    else:
        ns.job_id = q.get(timeout=ns.timeout)
        valid = validate_job_id(
            app_name=ns.app_name,
            job_id=ns.job_id,
            q=q,
            timeout=ns.timeout,
        )
        if not valid:
            return
        try:
            lock = get_lock_if_job_is_runnable(
                app_name=ns.app_name,
                job_id=ns.job_id,
            )
        except exceptions.NoNodeError:
            q.consume()
            missing_state_msg = (
                "Job failed. The job is queued, so why does its state not"
                " exist?  The Queue backend may be in an inconsistent state."
                " Consuming this job")
            log.exception(
                missing_state_msg,
                extra=dict(app_name=ns.app_name, job_id=ns.job_id))
            return

    got_job_fields = dict(
        app_name=ns.app_name, job_id=ns.job_id, acquired_lock=bool(lock))
    log.debug("Stolos got a job_id.", extra=got_job_fields)

    if lock is False:
        # infinite loop: some jobs will always requeue if lock is unobtainable
        requeue_fields = dict(app_name=ns.app_name, job_id=ns.job_id)
        log.info(
            "Could not obtain a lock.  Will requeue and try again later",
            extra=requeue_fields)
        _send_to_back_of_queue(
            q=q,
            app_name=ns.app_name,
            job_id=ns.job_id,
        )
        return

    if not parents_completed(ns.app_name, ns.job_id, q=q, lock=lock):
        return

    starting_fields = dict(app_name=ns.app_name, job_id=ns.job_id)
    log.info("Job starting!", extra=starting_fields)

    try:
        ns.job_type_func(ns=ns)
    except exceptions.CodeError:  # assume error is previously logged
        _handle_failure(ns, q, lock)
        return
    except Exception as err:
        failure_msg = (
            "Job failed!  Unhandled exception in an application!"
            " Fix ASAP because"
            " it is unclear how to handle this failure.  %s: %s"
        ) % (err.__class__.__name__, err)
        failure_fields = dict(
            app_name=ns.app_name, job_id=ns.job_id, failed=True)
        log.exception(failure_msg, extra=failure_fields)
        return

    _handle_success(ns, q, lock)