def _handle_success(ns, q, lock):
    """Record a successful run of the job described by `ns`.

    Marks the job as completed via `qb.set_state`, calls `q.consume()`,
    releases `lock`, and finally logs the success.
    """
    # the same three fields are used for the state update and the log record
    job_info = dict(app_name=ns.app_name, job_id=ns.job_id, completed=True)
    qb.set_state(**job_info)
    q.consume()
    lock.release()
    log.info("successfully completed job", extra=job_info)
def _handle_success(ns, q, lock):
    """Record a successful run of the job described by `ns`.

    Marks the job as completed via `qb.set_state`, calls `q.consume()`,
    releases `lock`, and finally logs the success.
    """
    # the same three fields are used for the state update and the log record
    job_info = dict(app_name=ns.app_name, job_id=ns.job_id, completed=True)
    qb.set_state(**job_info)
    q.consume()
    lock.release()
    log.info("successfully completed job", extra=job_info)
def _send_to_back_of_queue(q, app_name, job_id):
    """Re-add the job via `qb.readd_subtask` and consume the current entry.

    This exists so un-runnable tasks don't hog the front of the queue and
    soak up resources.  If `exceptions.JobAlreadyQueued` is raised, that
    fact is logged instead.
    """
    job_info = dict(app_name=app_name, job_id=job_id)
    try:
        qb.readd_subtask(app_name, job_id, _force=True)
        q.consume()
        log.info("Job sent to back of queue", extra=job_info)
    except exceptions.JobAlreadyQueued:
        log.info(
            "Job already queued. Cannot send to back of queue.",
            extra=job_info)
def _send_to_back_of_queue(q, app_name, job_id):
    """Re-add the job via `qb.readd_subtask` and consume the current entry.

    This exists so un-runnable tasks don't hog the front of the queue and
    soak up resources.  If `exceptions.JobAlreadyQueued` is raised, that
    fact is logged instead.
    """
    job_info = dict(app_name=app_name, job_id=job_id)
    try:
        qb.readd_subtask(app_name, job_id, _force=True)
        q.consume()
        log.info("Job sent to back of queue", extra=job_info)
    except exceptions.JobAlreadyQueued:
        log.info(
            "Job already queued. Cannot send to back of queue.",
            extra=job_info)
def validate_job_id(app_name, job_id, q, timeout):
    """Tell whether `job_id` is usable, cleaning up after a bad one.

    Returns False when `job_id` is None (nothing was found within
    `timeout` seconds) or when `dt.parse_job_id` rejects it with
    `exceptions.InvalidJobId`.  In the latter case the job is also
    consumed from `q` and its state is marked as failed.  Returns True
    otherwise.
    """
    if job_id is None:
        nothing_found_msg = 'No jobs found in %d seconds...' % timeout
        log.info(nothing_found_msg, extra=dict(app_name=app_name))
        return False

    try:
        dt.parse_job_id(app_name, job_id)
    except exceptions.InvalidJobId as err:
        invalid_msg = (
            "Stolos found an invalid job_id. Removing it from queue"
            " and marking that job_id as failed. Error details: %s") % err
        log.error(invalid_msg, extra=dict(app_name=app_name, job_id=job_id))
        # cleanup: drop the queue entry first, then record the failure
        q.consume()
        qb._set_state_unsafe(app_name, job_id, failed=True)
        return False
    else:
        return True
def validate_job_id(app_name, job_id, q, timeout):
    """Tell whether `job_id` is usable, cleaning up after a bad one.

    Returns False when `job_id` is None (nothing was found within
    `timeout` seconds) or when `dt.parse_job_id` rejects it with
    `exceptions.InvalidJobId`.  In the latter case the job is also
    consumed from `q` and its state is marked as failed.  Returns True
    otherwise.
    """
    if job_id is None:
        nothing_found_msg = 'No jobs found in %d seconds...' % timeout
        log.info(nothing_found_msg, extra=dict(app_name=app_name))
        return False

    try:
        dt.parse_job_id(app_name, job_id)
    except exceptions.InvalidJobId as err:
        invalid_msg = (
            "Stolos found an invalid job_id. Removing it from queue"
            " and marking that job_id as failed. Error details: %s") % err
        log.error(invalid_msg, extra=dict(app_name=app_name, job_id=job_id))
        # cleanup: drop the queue entry first, then record the failure
        q.consume()
        qb._set_state_unsafe(app_name, job_id, failed=True)
        return False
    else:
        return True
def main(ns):
    """Fetch one job for `ns.app_name` and decide what to do with it.

    Steps, in order:

    1. With `ns.bypass_scheduler` set, just call `ns.job_type_func` for the
       given `ns.job_id`; no queue is touched.
    2. Otherwise use the manually supplied `ns.job_id`, or pull one from
       the app's queue (waiting up to `ns.timeout`) and validate it.
    3. If no lock could be obtained, send the job to the back of the queue.
    4. If `parents_completed` returns a falsy value, stop.
    5. Run `ns.job_type_func`.  On success hand off to `_handle_success`;
       on `exceptions.CodeError` hand off to `_handle_failure`.  Any other
       exception is only logged.
    """
    def job_info(**more):
        # ns.job_id may be assigned further down, so read it at call time
        return dict(app_name=ns.app_name, job_id=ns.job_id, **more)

    assert ns.app_name in dt.get_task_names()

    if ns.bypass_scheduler:
        log.info(
            "Running a task without scheduling anything"
            " or fetching from a queue",
            extra=job_info())
        assert ns.job_id
        ns.job_type_func(ns=ns)
        return

    log.info("Beginning Stolos", extra=dict(**ns.__dict__))
    q = qb.get_qbclient().LockingQueue(ns.app_name)

    if ns.job_id:
        lock = _handle_manually_given_job_id(ns)
        # nothing was fetched from the queue here: make consume() a no-op
        q.consume = object
    else:
        ns.job_id = q.get(timeout=ns.timeout)
        job_is_valid = validate_job_id(
            app_name=ns.app_name, job_id=ns.job_id, q=q, timeout=ns.timeout)
        if not job_is_valid:
            return
        try:
            lock = get_lock_if_job_is_runnable(
                app_name=ns.app_name, job_id=ns.job_id)
        except exceptions.NoNodeError:
            q.consume()
            log.exception(
                "Job failed. The job is queued, so why does its state not"
                " exist? The Queue backend may be in an inconsistent state."
                " Consuming this job",
                extra=job_info())
            return

    log.debug("Stolos got a job_id.", extra=job_info(acquired_lock=bool(lock)))

    if lock is False:
        # potential infinite loop: a job whose lock can never be obtained
        # will be requeued every time
        log.info(
            "Could not obtain a lock. "
            "Will requeue and try again later",
            extra=job_info())
        _send_to_back_of_queue(q=q, app_name=ns.app_name, job_id=ns.job_id)
        return

    if not parents_completed(ns.app_name, ns.job_id, q=q, lock=lock):
        return

    log.info("Job starting!", extra=job_info())
    try:
        ns.job_type_func(ns=ns)
    except exceptions.CodeError:
        # assume the error was already logged
        _handle_failure(ns, q, lock)
    except Exception as err:
        unhandled_msg = (
            "Job failed! Unhandled exception in an application!"
            " Fix ASAP because"
            " it is unclear how to handle this failure. %s: %s"
        ) % (err.__class__.__name__, err)
        log.exception(unhandled_msg, extra=job_info(failed=True))
    else:
        _handle_success(ns, q, lock)
def main(ns):
    """Fetch one job for `ns.app_name` and decide what to do with it.

    Steps, in order:

    1. With `ns.bypass_scheduler` set, just call `ns.job_type_func` for the
       given `ns.job_id`; no queue is touched.
    2. Otherwise use the manually supplied `ns.job_id`, or pull one from
       the app's queue (waiting up to `ns.timeout`) and validate it.
    3. If no lock could be obtained, send the job to the back of the queue.
    4. If `parents_completed` returns a falsy value, stop.
    5. Run `ns.job_type_func`.  On success hand off to `_handle_success`;
       on `exceptions.CodeError` hand off to `_handle_failure`.  Any other
       exception is only logged.
    """
    def job_info(**more):
        # ns.job_id may be assigned further down, so read it at call time
        return dict(app_name=ns.app_name, job_id=ns.job_id, **more)

    assert ns.app_name in dt.get_task_names()

    if ns.bypass_scheduler:
        log.info(
            "Running a task without scheduling anything"
            " or fetching from a queue",
            extra=job_info())
        assert ns.job_id
        ns.job_type_func(ns=ns)
        return

    log.info("Beginning Stolos", extra=dict(**ns.__dict__))
    q = qb.get_qbclient().LockingQueue(ns.app_name)

    if ns.job_id:
        lock = _handle_manually_given_job_id(ns)
        # nothing was fetched from the queue here: make consume() a no-op
        q.consume = object
    else:
        ns.job_id = q.get(timeout=ns.timeout)
        job_is_valid = validate_job_id(
            app_name=ns.app_name, job_id=ns.job_id, q=q, timeout=ns.timeout)
        if not job_is_valid:
            return
        try:
            lock = get_lock_if_job_is_runnable(
                app_name=ns.app_name, job_id=ns.job_id)
        except exceptions.NoNodeError:
            q.consume()
            log.exception(
                "Job failed. The job is queued, so why does its state not"
                " exist? The Queue backend may be in an inconsistent state."
                " Consuming this job",
                extra=job_info())
            return

    log.debug("Stolos got a job_id.", extra=job_info(acquired_lock=bool(lock)))

    if lock is False:
        # potential infinite loop: a job whose lock can never be obtained
        # will be requeued every time
        log.info(
            "Could not obtain a lock. "
            "Will requeue and try again later",
            extra=job_info())
        _send_to_back_of_queue(q=q, app_name=ns.app_name, job_id=ns.job_id)
        return

    if not parents_completed(ns.app_name, ns.job_id, q=q, lock=lock):
        return

    log.info("Job starting!", extra=job_info())
    try:
        ns.job_type_func(ns=ns)
    except exceptions.CodeError:
        # assume the error was already logged
        _handle_failure(ns, q, lock)
    except Exception as err:
        unhandled_msg = (
            "Job failed! Unhandled exception in an application!"
            " Fix ASAP because"
            " it is unclear how to handle this failure. %s: %s"
        ) % (err.__class__.__name__, err)
        log.exception(unhandled_msg, extra=job_info(failed=True))
    else:
        _handle_success(ns, q, lock)