def task_failed(event):
    state.event(event)
    uuid = event['uuid']
    exc = event.get('exception', "")
    if isinstance(exc, str):
        match = TASK_FAILED_RE.search(exc)
        if match:
            short_error = match.group(1)
            es_url = "%s/job_status-current/job/%s" % (app.conf['JOBS_ES_URL'], uuid)
            r = requests.get(es_url)
            if r.status_code != 200:
                logging.error("Failed to query for task UUID %s: %s" % (uuid, r.content))
                return
            res = r.json()
            job_status = res['_source']
            job_status['status'] = 'job-failed'
            job_status['error'] = exc
            job_status['short_error'] = short_error
            job_status['traceback'] = event.get('traceback', "")
            time_end = datetime.utcnow().isoformat() + 'Z'
            job_status.setdefault('job', {}).setdefault('job_info', {})['time_end'] = time_end
            log_job_status(job_status)
    log_task_event('task-failed', event, uuid=event['uuid'])
def worker_offline(event):
    set_redis_pool()
    global POOL
    rd = StrictRedis(connection_pool=POOL)
    state.event(event)
    if ORCH_HOST_RE.search(event['hostname']):
        return
    rd.delete(WORKER_STATUS_KEY_TMPL % event['hostname'])
    time_end = datetime.utcnow().isoformat() + 'Z'
    query = {
        "query": {
            "filtered": {
                "query": {
                    "bool": {
                        "must": [
                            {"term": {"celery_hostname": event['hostname']}},
                            #{"term": {"status": 'job-started'}}
                        ]
                    }
                }
            }
        }
    }
    job_status_jsons = []
    #logging.error("query:\n%s" % json.dumps(query, indent=2))
    es_url = "%s/job_status-current/_search?search_type=scan&scroll=60m&size=100" % app.conf['JOBS_ES_URL']
    try:
        r = requests.post(es_url, data=json.dumps(query))
        r.raise_for_status()
        scan_result = r.json()
        scroll_id = scan_result['_scroll_id']
        while True:
            r = requests.post('%s/_search/scroll?scroll=60m' % app.conf['JOBS_ES_URL'], data=scroll_id)
            res = r.json()
            scroll_id = res['_scroll_id']
            if len(res['hits']['hits']) == 0:
                break
            for hit in res['hits']['hits']:
                job_status_jsons.append(hit['_source'])
        #logging.error("job_status_jsons:\n%s" % job_status_jsons)
        uuids = []
        for job_status_json in job_status_jsons:
            # skip if real-time job status is no longer job-started
            if rd.get(JOB_STATUS_KEY_TMPL % job_status_json['uuid']) != "job-started":
                continue
            job_status_json['status'] = 'job-offline'
            job_status_json['error'] = 'Received worker-offline event during job execution.'
            job_status_json['short_error'] = 'worker-offline'
            job_status_json.setdefault('job', {}).setdefault('job_info', {})['time_end'] = time_end
            log_job_status(job_status_json)
            uuids.append(job_status_json['uuid'])
        log_worker_event('worker-offline', event, uuid=uuids)
    except Exception, e:
        logging.error("Got exception trying to update task events for " +
                      "offline worker %s: %s\n%s" % (event['hostname'], str(e), traceback.format_exc()))
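# The two Celery event handlers above (task_failed, worker_offline) are meant to be fed by an
# event receiver. The wiring below is a hedged sketch, not taken from this source: it only shows
# how handlers like these are typically registered with celery.events.Receiver; the handler names
# and `app` are assumed from the snippets above.
def process_events():
    with app.connection() as connection:
        recv = app.events.Receiver(connection, handlers={
            'task-failed': task_failed,
            'worker-offline': worker_offline,
            '*': state.event,  # keep the in-memory State current for all other event types
        })
        recv.capture(limit=None, timeout=None, wakeup=True)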
def offline_jobs(event):
    """Set job status to job-offline."""
    time_end = datetime.utcnow().isoformat() + "Z"
    query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"celery_hostname": event["hostname"]}},
                    {"term": {"status": "job-started"}},
                ]
            }
        }
    }
    logger.info("offline jobs query: %s" % json.dumps(query))
    uuids = []
    try:
        job_status_jsons = mozart_es.query(index="job_status-current", body=query)
        logger.info("Got {} jobs for {}.".format(len(job_status_jsons), event["hostname"]))
        for job_status in job_status_jsons:
            job_status_json = job_status['_source']
            uuid = job_status_json["uuid"]

            # offline the job only if it hasn't been picked up by another worker
            cur_job_status = get_val_via_socket(JOB_STATUS_KEY_TMPL % uuid)
            cur_job_worker = get_val_via_socket(TASK_WORKER_KEY_TMPL % uuid)
            logger.info("cur_job_status: {}".format(cur_job_status))
            logger.info("cur_job_worker: {}".format(cur_job_worker))

            if cur_job_status == "job-started" and cur_job_worker == event["hostname"]:
                job_status_json["status"] = "job-offline"
                job_status_json["error"] = "Received worker-offline event during job execution."
                job_status_json["short_error"] = "worker-offline"
                job_status_json.setdefault("job", {}).setdefault("job_info", {})["time_end"] = time_end
                log_job_status(job_status_json)
                logger.info("Offlined job with UUID %s" % uuid)
                uuids.append(uuid)
            else:
                logger.info("Not offlining job with UUID %s since real-time job status doesn't match" % uuid)
    except Exception as e:
        logger.warn("Got exception trying to update task events for offline worker %s: %s\n%s" % (
            event["hostname"], str(e), traceback.format_exc()))
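# get_val_via_socket() is referenced above but not defined in this excerpt. A minimal sketch,
# assuming it reads the real-time status key from a local Redis instance; the socket path and
# decode handling are assumptions, and the actual helper may differ.
from redis import StrictRedis

def get_val_via_socket(key):
    """Return the decoded value stored at `key` in local Redis, or None if unset."""
    rd = StrictRedis(unix_socket_path="/var/run/redis/redis.sock")  # assumed socket path
    val = rd.get(key)
    return val.decode() if isinstance(val, bytes) else val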
def fail_job(event, uuid, exc, short_error):
    """Set job status to job-failed."""
    query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"uuid": uuid}}
                ]
            }
        }
    }
    search_url = "%s/job_status-current/_search" % app.conf["JOBS_ES_URL"]
    headers = {"Content-Type": "application/json"}
    r = requests.post(search_url, data=json.dumps(query), headers=headers)
    if r.status_code != 200:
        logger.error("Failed to query for task UUID %s: %s" % (uuid, r.content))
        return
    result = r.json()

    total = result["hits"]["total"]["value"]
    if total == 0:
        logger.error("Failed to query for task UUID %s: %s" % (uuid, r.content))
        return

    res = result["hits"]["hits"][0]
    job_status = res["_source"]
    job_status["status"] = "job-failed"
    job_status["error"] = exc
    job_status["short_error"] = short_error
    job_status["traceback"] = event.get("traceback", "")
    time_end = datetime.utcnow().isoformat() + "Z"
    job_status.setdefault("job", {}).setdefault("job_info", {})["time_end"] = time_end
    log_job_status(job_status)
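# fail_job() factors out the ES update that the older task_failed() handler above did inline.
# The glue below is a hedged sketch of how a task-failed event handler could delegate to it;
# TASK_FAILED_RE here is an assumed pattern for pulling a short error out of the exception
# repr, not the project's actual regex.
import re

TASK_FAILED_RE = re.compile(r"^Task .* raised .*?:\s*(.*)$", re.M)  # assumed pattern

def task_failed(event):
    exc = event.get("exception", "")
    match = TASK_FAILED_RE.search(exc) if isinstance(exc, str) else None
    short_error = match.group(1) if match else str(exc)
    fail_job(event, event["uuid"], exc, short_error)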
def resubmit_jobs():
    # random sleep to prevent from getting ElasticSearch errors:
    # 429 Client Error: Too Many Requests
    time.sleep(randint(1, 5))

    # can call submit_job
    # iterate through job ids and query to get the job json
    with open('_context.json') as f:
        ctx = json.load(f)
    retry_count_max = ctx['retry_count_max']

    for job_id in ctx['retry_job_id']:
        try:
            # get job json from ES
            rand_sleep()
            response = query_ES(job_id)
            if response.status_code != 200:
                print("Failed to query ES. Got status code %d:\n%s" %
                      (response.status_code, json.dumps(response.json(), indent=2)))
            response.raise_for_status()
            resp_json = response.json()

            # check retry_remaining_count
            job_json = resp_json["hits"]["hits"][0]["_source"]["job"]
            if 'retry_count' in job_json:
                if job_json['retry_count'] < retry_count_max:
                    job_json['retry_count'] = int(job_json['retry_count']) + 1
                elif job_json['retry_count'] == retry_count_max:
                    print "Job reached retry_count_max limit. Cannot retry again."
                    continue
            else:
                job_json['retry_count'] = 1
            job_json["job_info"]["dedup"] = False

            # clean up job execution info
            for i in ('duration', 'execute_node', 'facts', 'job_dir', 'job_url',
                      'metrics', 'pid', 'public_ip', 'status', 'stderr',
                      'stdout', 'time_end', 'time_queued', 'time_start'):
                if i in job_json.get('job_info', {}):
                    del job_json['job_info'][i]

            # set queue time
            job_json['job_info']['time_queued'] = datetime.utcnow().isoformat() + 'Z'

            # use priority from context to reset priority
            priority = ctx['job_priority']
            job_json['priority'] = priority

            # revoke original job
            rand_sleep()
            try:
                app.control.revoke(job_json['job_id'], terminate=True)
                print "revoked original job: %s" % job_json['job_id']
            except Exception, e:
                print "Got error issuing revoke on job %s: %s" % (job_json['job_id'], traceback.format_exc())
                print "Continuing."

            # generate celery task id
            job_json['task_id'] = uuid()

            # delete old job status
            rand_sleep()
            try:
                r = requests.delete("%s/%s/job/_query?q=_id:%s" % (es_url, query_idx, job_json['job_id']))
                r.raise_for_status()
                print "deleted original job status: %s" % job_json['job_id']
            except Exception, e:
                print "Got error deleting job status %s: %s" % (job_json['job_id'], traceback.format_exc())
                print "Continuing."

            # log queued status
            rand_sleep()
            job_status_json = {
                'uuid': job_json['task_id'],
                'job_id': job_json['job_id'],
                'payload_id': job_json['job_info']['job_payload']['payload_task_id'],
                'status': 'job-queued',
                'job': job_json
            }
            log_job_status(job_status_json)

            # submit job
            queue = job_json['job_info']['job_queue']
            res = run_job.apply_async((job_json,), queue=queue,
                                      time_limit=None,
                                      soft_time_limit=None,
                                      priority=priority,
                                      task_id=job_json['task_id'])
        except Exception, e:
            print "Got error resubmitting job %s: %s" % (job_id, traceback.format_exc())
            print "Continuing."
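# rand_sleep() is used throughout the retry logic above but is not shown. A plausible sketch,
# assuming it only adds random jitter between ElasticSearch calls to avoid 429 Too Many
# Requests errors; the default bounds are guesses.
import time
from random import randint

def rand_sleep(sleep_min=1, sleep_max=10):
    time.sleep(randint(sleep_min, sleep_max))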
def submit_job(j):
    """Submit HySDS job."""

    # get task_id and orchestrator queue
    task_id = submit_job.request.id
    orch_queue = submit_job.request.delivery_info.get('exchange', 'unknown')

    # get container image name and url
    image_name = j.get('container_image_name', None)
    image_url = j.get('container_image_url', None)
    image_mapping = j.get('container_mappings', None)

    # get hard/soft time limits
    time_limit = j.get('time_limit', None)
    soft_time_limit = j.get('soft_time_limit', None)

    # job dedup enabled?
    dedup = j.get('enable_dedup', True)

    # get priority
    priority = j.get('priority', None)
    if priority is None:
        priority = submit_job.request.delivery_info.get('priority')
        if priority is None:
            priority = 0

    # get tag
    tag = j.get('tag', None)

    # get username
    username = j.get('username', None)

    # default job json
    job = {
        'job_id': task_id,
        'name': task_id,
        'job_info': j,
    }

    # set job type
    if 'job_type' in j:
        match = JOB_TYPE_RE.search(j['job_type'])
        job['type'] = match.group(1) if match else j['job_type']

    # default context
    context = j.get('context', {})

    # get orchestrator configuration
    orch_cfg_file = os.environ.get('HYSDS_ORCHESTRATOR_CFG', None)
    if orch_cfg_file is None:
        error = "Environment variable HYSDS_ORCHESTRATOR_CFG is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))

    #logger.info("HYSDS_ORCHESTRATOR_CFG:%s" % orch_cfg_file)
    if not os.path.exists(orch_cfg_file):
        error = "Orchestrator configuration %s doesn't exist." % orch_cfg_file
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))

    with open(orch_cfg_file) as f:
        orch_cfg = json.load(f)

    # get job creators directory
    job_creators_dir = os.environ.get('HYSDS_JOB_CREATORS_DIR', None)
    if job_creators_dir is None:
        error = "Environment variable HYSDS_JOB_CREATORS_DIR is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))

    #logger.info("HYSDS_JOB_CREATORS_DIR:%s" % job_creators_dir)

    # parse job configurations
    job_cfgs = {}
    for cfg in orch_cfg['configs']:
        job_cfgs[cfg['job_type']] = cfg['job_creators']

    # check that we have info to create jobs
    if 'job_type' not in j:
        error = "Invalid job spec. No 'job_type' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))

    job_type = j['job_type']
    job_queue = j.get('job_queue', None)

    if 'payload' not in j:
        error = "Invalid job spec. No 'payload' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))

    payload = j['payload']
    #logger.info("got job_type: %s" % job_type)
    #logger.info("payload: %s" % payload)

    # set payload hash
    if j.get('payload_hash', None) is None:
        j['payload_hash'] = get_payload_hash(payload)
    payload_hash = j['payload_hash']

    # do dedup
    if dedup is True:
        dj = query_dedup_job(payload_hash)
        if isinstance(dj, dict):
            dedup_msg = "orchestrator found duplicate job %s with status %s" % (
                dj['_id'], dj['status'])
            job_status_json = {
                'uuid': job['job_id'],
                'job_id': job['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'dedup_job': dj['_id'],
                'status': 'job-deduped',
                'job': job,
                'context': context,
                'dedup_msg': dedup_msg
            }
            log_job_status(job_status_json)
            return [task_id]

    # if no explicit job or data type defined in orchestrator, add catch-all
    if job_type not in job_cfgs:
        # first check if data product type; if not then assume job type
        match = DATA_TYPE_RE.search(job_type)
        if match:
            return queue_dataset_evaluation(payload)
        else:
            match = JOB_TYPE_RE.search(job_type)
            jt = match.group(1) if match else job_type
            job_cfgs[job_type] = [{
                "job_name": j.get('job_name', jt).replace(":", "__"),
                "function": "utils.get_job_json",
                "job_queues": [jt if job_queue is None else job_queue]
            }]

    # get job json and queue jobs
    results = []
    for jc in job_cfgs[job_type]:
        func = get_function(jc['function'], add_to_sys_path=job_creators_dir)
        argspec = getargspec(func)
        try:
            if len(argspec.args) > 1 and 'job_type' in argspec.args:
                match = JOB_TYPE_RE.search(job_type)
                jt = match.group(1) if match else job_type
                job = func(payload, jt)
            else:
                job = func(payload)
        except Exception as e:
            error = "Job creator function %s failed to generate job JSON." % jc['function']
            error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
            job_status_json = {
                'uuid': job['job_id'],
                'job_id': job['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'status': 'job-failed',
                'job': {
                    'job_id': task_id,
                    'name': task_id,
                    'job_info': j
                },
                'context': context,
                'error': error_info,
                'short_error': get_short_error(error_info),
                'traceback': traceback.format_exc()
            }
            log_job_status(job_status_json)
            raise (OrchestratorExecutionError(error, job_status_json))
        #logger.info("job: %s" % job)

        # set context
        job.setdefault('context', {}).update(context)

        # override hard/soft time limits
        time_limit = jc.get('time_limit', time_limit)
        soft_time_limit = jc.get('soft_time_limit', soft_time_limit)

        # queue jobs
        for queue in jc['job_queues']:
            # copy job
            job_json = copy.deepcopy(job)

            # set job id
            if 'name' in job:
                job_json['job_id'] = get_job_id(job['name'])
            else:
                job_json['job_id'] = get_job_id(jc['job_name'])
            job_json['name'] = job_json['job_id']

            # set container image name and url
            if image_name is not None:
                job_json['container_image_name'] = image_name
            if image_url is not None:
                job_json['container_image_url'] = image_url
            if image_mapping is not None:
                job_json['container_mappings'] = image_mapping

            # set priority
            job_json['priority'] = priority

            # set tag
            if 'tag' not in job_json and tag is not None:
                job_json['tag'] = tag

            # set username
            if 'username' not in job_json and username is not None:
                job_json['username'] = username

            # set job_info
            time_queued = datetime.utcnow()
            job_json['job_info'] = {
                'id': job_json['job_id'],
                'job_queue': queue,
                'time_queued': time_queued.isoformat() + 'Z',
                'time_limit': time_limit,
                'soft_time_limit': soft_time_limit,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'job_payload': {
                    'job_type': job_type,
                    'payload_task_id': task_id,
                }
            }

            # generate celery task id
            job_json['task_id'] = uuid()

            # log queued status
            job_status_json = {
                'uuid': job_json['task_id'],
                'job_id': job_json['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'status': 'job-queued',
                'job': job_json
            }
            log_job_status(job_status_json)

            # submit job
            res = run_job.apply_async((job_json,), queue=queue,
                                      time_limit=time_limit,
                                      soft_time_limit=soft_time_limit,
                                      priority=priority,
                                      task_id=job_json['task_id'])

            # append result
            results.append(job_json['task_id'])

    return results
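# get_short_error() is called above to populate the `short_error` field of every job status
# document, but its definition is not included here. A hedged sketch, assuming it simply
# truncates long error strings to a display-friendly length (the cutoff and ellipsis style
# are assumptions):
def get_short_error(e, max_len=35):
    e = str(e)
    if len(e) <= max_len:
        return e
    return "%s.....%s" % (e[:max_len - 10], e[-5:])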
def submit_job(j):
    """Submit HySDS job."""

    # get task_id and orchestrator queue
    task_id = submit_job.request.id
    orch_queue = submit_job.request.delivery_info.get("exchange", "unknown")

    # get container image name and url
    image_name = j.get("container_image_name", None)
    image_url = j.get("container_image_url", None)
    image_mapping = j.get("container_mappings", None)

    # get container runtime options
    runtime_options = j.get("runtime_options", None)

    # get hard/soft time limits
    time_limit = j.get("time_limit", None)
    soft_time_limit = j.get("soft_time_limit", None)

    # job dedup enabled?
    dedup = j.get("enable_dedup", True)

    # get priority
    priority = j.get("priority", None)
    if priority is None:
        priority = submit_job.request.delivery_info.get("priority")
        if priority is None:
            priority = 0

    # get tag
    tag = j.get("tag", None)

    # get username
    username = j.get("username", None)

    # default job json
    job = {
        "job_id": task_id,
        "name": task_id,
        "job_info": j,
    }

    # set job type
    if "job_type" in j:
        match = JOB_TYPE_RE.search(j["job_type"])
        job["type"] = match.group(1) if match else j["job_type"]

    # default context
    context = j.get("context", {})

    # get orchestrator configuration
    orch_cfg_file = os.environ.get("HYSDS_ORCHESTRATOR_CFG", None)
    if orch_cfg_file is None:
        error = "Environment variable HYSDS_ORCHESTRATOR_CFG is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    # logger.info("HYSDS_ORCHESTRATOR_CFG:%s" % orch_cfg_file)
    if not os.path.exists(orch_cfg_file):
        error = "Orchestrator configuration %s doesn't exist." % orch_cfg_file
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    with open(orch_cfg_file) as f:
        orch_cfg = json.load(f)

    # get job creators directory
    job_creators_dir = os.environ.get("HYSDS_JOB_CREATORS_DIR", None)
    if job_creators_dir is None:
        error = "Environment variable HYSDS_JOB_CREATORS_DIR is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    # logger.info("HYSDS_JOB_CREATORS_DIR:%s" % job_creators_dir)

    # parse job configurations
    job_cfgs = {}
    for cfg in orch_cfg["configs"]:
        job_cfgs[cfg["job_type"]] = cfg["job_creators"]

    # check that we have info to create jobs
    if "job_type" not in j:
        error = "Invalid job spec. No 'job_type' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    job_type = j["job_type"]
    job_queue = j.get("job_queue", None)

    if "payload" not in j:
        error = "Invalid job spec. No 'payload' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    payload = j["payload"]
    # logger.info("got job_type: %s" % job_type)
    # logger.info("payload: %s" % payload)

    # set payload hash
    if j.get("payload_hash", None) is None:
        j["payload_hash"] = get_payload_hash(payload)
    payload_hash = j["payload_hash"]

    # do dedup
    if dedup is True:
        try:
            dj = query_dedup_job(payload_hash)
        except NoDedupJobFoundException as e:
            logger.info(str(e))
            dj = None
        if isinstance(dj, dict):
            dedup_msg = "orchestrator found duplicate job %s with status %s" % (
                dj["_id"],
                dj["status"],
            )
            job_status_json = {
                "uuid": job["job_id"],
                "job_id": job["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "dedup_job": dj["_id"],
                "status": "job-deduped",
                "job": job,
                "context": context,
                "dedup_msg": dedup_msg,
            }
            log_job_status(job_status_json)
            return [task_id]

    # if no explicit job or data type defined in orchestrator, add catch-all
    if job_type not in job_cfgs:
        # first check if data product type; if not then assume job type
        match = DATA_TYPE_RE.search(job_type)
        if match:
            return queue_dataset_evaluation(payload)
        else:
            match = JOB_TYPE_RE.search(job_type)
            jt = match.group(1) if match else job_type
            job_cfgs[job_type] = [
                {
                    "job_name": j.get("job_name", jt).replace(":", "__"),
                    "function": "utils.get_job_json",
                    "job_queues": [jt if job_queue is None else job_queue],
                }
            ]

    # get job json and queue jobs
    results = []
    for jc in job_cfgs[job_type]:
        func = get_function(jc["function"], add_to_sys_path=job_creators_dir)
        argspec = getargspec(func)
        try:
            if len(argspec.args) > 1 and "job_type" in argspec.args:
                match = JOB_TYPE_RE.search(job_type)
                jt = match.group(1) if match else job_type
                job = func(payload, jt)
            else:
                job = func(payload)
        except Exception as e:
            error = (
                "Job creator function %s failed to generate job JSON." % jc["function"]
            )
            error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
            job_status_json = {
                "uuid": job["job_id"],
                "job_id": job["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "status": "job-failed",
                "job": {"job_id": task_id, "name": task_id, "job_info": j},
                "context": context,
                "error": error_info,
                "short_error": get_short_error(error_info),
                "traceback": traceback.format_exc(),
            }
            log_job_status(job_status_json)
            raise OrchestratorExecutionError(error, job_status_json)
        # logger.info("job: %s" % job)

        # set context
        job.setdefault("context", {}).update(context)

        # override hard/soft time limits and ensure gap
        soft_time_limit, time_limit = ensure_hard_time_limit_gap(
            jc.get("soft_time_limit", soft_time_limit), jc.get("time_limit", time_limit)
        )

        # queue jobs
        for queue in jc["job_queues"]:
            # copy job
            job_json = copy.deepcopy(job)

            # set job id
            if "name" in job:
                job_json["job_id"] = get_job_id(job["name"])
            else:
                job_json["job_id"] = get_job_id(jc["job_name"])
            job_json["name"] = job_json["job_id"]

            # set container image name, url, mappings, and runtime options
            if image_name is not None:
                job_json["container_image_name"] = image_name
            if image_url is not None:
                job_json["container_image_url"] = image_url
            if image_mapping is not None:
                job_json["container_mappings"] = image_mapping
            if runtime_options is not None:
                job_json["runtime_options"] = runtime_options

            # set priority
            job_json["priority"] = priority

            # set tag
            if "tag" not in job_json and tag is not None:
                job_json["tag"] = tag

            # set username
            if "username" not in job_json and username is not None:
                job_json["username"] = username

            # set job_info
            time_queued = datetime.utcnow()
            job_json["job_info"] = {
                "id": job_json["job_id"],
                "job_queue": queue,
                "time_queued": time_queued.isoformat() + "Z",
                "time_limit": time_limit,
                "soft_time_limit": soft_time_limit,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "job_payload": {
                    "job_type": job_type,
                    "payload_task_id": task_id,
                },
            }

            # generate celery task id
            job_json["task_id"] = uuid()

            # log queued status
            job_status_json = {
                "uuid": job_json["task_id"],
                "job_id": job_json["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "status": "job-queued",
                "job": job_json,
            }
            log_job_status(job_status_json)

            # submit job
            res = run_job.apply_async(
                (job_json,),
                queue=queue,
                time_limit=time_limit,
                soft_time_limit=soft_time_limit,
                priority=priority,
                task_id=job_json["task_id"],
            )

            # append result
            results.append(job_json["task_id"])

    return results
def resubmit_jobs():
    es_url = app.conf["JOBS_ES_URL"]

    # random sleep to prevent from getting ElasticSearch errors:
    # 429 Client Error: Too Many Requests
    time.sleep(randint(1, 5))

    # can call submit_job
    # iterate through job ids and query to get the job json
    with open('_context.json') as f:
        ctx = json.load(f)

    increment_by = None
    new_priority = None
    if "job_priority_increment" in ctx:
        increment_by = ctx["job_priority_increment"]
    else:
        new_priority = ctx["new_job_priority"]

    retry_count_max = ctx['retry_count_max']
    retry_job_ids = ctx['retry_job_id'] if isinstance(ctx['retry_job_id'], list) else [ctx['retry_job_id']]

    for job_id in retry_job_ids:
        print("Retrying job: {}".format(job_id))
        try:
            # get job json from ES
            rand_sleep()
            response = query_ES(job_id)
            if response.status_code != 200:
                print("Failed to query ES. Got status code %d:\n%s" %
                      (response.status_code, json.dumps(response.json(), indent=2)))
            response.raise_for_status()
            resp_json = response.json()
            job_json = resp_json["hits"]["hits"][0]["_source"]["job"]

            # don't retry a retry
            if job_json['type'].startswith('job-lw-mozart-retry'):
                print "Cannot retry retry job %s. Skipping" % job_id
                continue

            # check retry_remaining_count
            if 'retry_count' in job_json:
                if job_json['retry_count'] < retry_count_max:
                    job_json['retry_count'] = int(job_json['retry_count']) + 1
                else:
                    print "For job {}, retry_count now is {}, retry_count_max limit of {} reached. Cannot retry again."\
                        .format(job_id, job_json['retry_count'], retry_count_max)
                    continue
            else:
                job_json['retry_count'] = 1
            job_json["job_info"]["dedup"] = False

            # clean up job execution info
            for i in ('duration', 'execute_node', 'facts', 'job_dir', 'job_url',
                      'metrics', 'pid', 'public_ip', 'status', 'stderr',
                      'stdout', 'time_end', 'time_queued', 'time_start'):
                if i in job_json.get('job_info', {}):
                    del job_json['job_info'][i]

            # set queue time
            job_json['job_info']['time_queued'] = datetime.utcnow().isoformat() + 'Z'

            # reset priority
            old_priority = job_json['priority']
            job_json['priority'] = get_new_job_priority(old_priority=old_priority,
                                                        increment_by=increment_by,
                                                        new_priority=new_priority)

            # revoke original job
            rand_sleep()
            try:
                app.control.revoke(job_json['job_id'], terminate=True)
                print "revoked original job: %s" % job_json['job_id']
            except Exception, e:
                print "Got error issuing revoke on job %s: %s" % (job_json['job_id'], traceback.format_exc())
                print "Continuing."

            # generate celery task id
            job_json['task_id'] = uuid()

            # delete old job status
            rand_sleep()
            try:
                r = requests.delete("%s/%s/job/_query?q=_id:%s" % (es_url, query_idx, job_json['job_id']))
                r.raise_for_status()
                print "deleted original job status: %s" % job_json['job_id']
            except Exception, e:
                print "Got error deleting job status %s: %s" % (job_json['job_id'], traceback.format_exc())
                print "Continuing."

            # log queued status
            rand_sleep()
            job_status_json = {'uuid': job_json['task_id'],
                               'job_id': job_json['job_id'],
                               'payload_id': job_json['job_info']['job_payload']['payload_task_id'],
                               'status': 'job-queued',
                               'job': job_json}
            log_job_status(job_status_json)

            # submit job
            queue = job_json['job_info']['job_queue']
            res = run_job.apply_async((job_json,), queue=queue,
                                      time_limit=job_json['job_info']['time_limit'],
                                      soft_time_limit=job_json['job_info']['soft_time_limit'],
                                      priority=job_json['priority'],
                                      task_id=job_json['task_id'])
        except Exception, e:
            print "Got error resubmitting job %s: %s" % (job_id, traceback.format_exc())
            print "Continuing."
def resubmit_jobs(context):
    """
    logic to resubmit the job
    :param context: contents from _context.json
    """
    # iterate through job ids and query to get the job json
    increment_by = None
    new_priority = None
    if "job_priority_increment" in context:
        increment_by = context["job_priority_increment"]
    else:
        new_priority = context["new_job_priority"]

    retry_count_max = context['retry_count_max']
    if isinstance(context['retry_job_id'], list):
        retry_job_ids = context['retry_job_id']
    else:
        retry_job_ids = [context['retry_job_id']]

    for job_id in retry_job_ids:
        print("Validating retry job: {}".format(job_id))
        try:
            doc = query_es(job_id)
            if doc['hits']['total']['value'] == 0:
                print('job id %s not found in Elasticsearch. Continuing.' % job_id)
                continue
            doc = doc["hits"]["hits"][0]

            job_json = doc["_source"]["job"]
            task_id = doc["_source"]["uuid"]
            index = doc["_index"]
            _id = doc["_id"]

            # don't retry a retry
            if job_json['type'].startswith('job-lw-mozart-retry'):
                print("Cannot retry retry job %s. Skipping" % job_id)
                continue

            # check retry_remaining_count
            if 'retry_count' in job_json:
                if job_json['retry_count'] < retry_count_max:
                    job_json['retry_count'] = int(job_json['retry_count']) + 1
                else:
                    print("For job {}, retry_count now is {}, retry_count_max limit of {} reached. "
                          "Cannot retry again.".format(job_id, job_json['retry_count'], retry_count_max))
                    continue
            else:
                job_json['retry_count'] = 1
            job_json["job_info"]["dedup"] = False

            # clean up job execution info
            for i in ('duration', 'execute_node', 'facts', 'job_dir', 'job_url',
                      'metrics', 'pid', 'public_ip', 'status', 'stderr',
                      'stdout', 'time_end', 'time_queued', 'time_start'):
                if i in job_json.get('job_info', {}):
                    del job_json['job_info'][i]

            # set queue time
            job_json['job_info']['time_queued'] = datetime.utcnow().isoformat() + 'Z'

            # reset priority
            old_priority = job_json['priority']
            job_json['priority'] = get_new_job_priority(old_priority=old_priority,
                                                        increment_by=increment_by,
                                                        new_priority=new_priority)

            # get state
            task = app.AsyncResult(task_id)
            state = task.state

            # revoke
            job_id = job_json['job_id']
            try:
                revoke(task_id, state)
                print("revoked original job: %s (%s)" % (job_id, task_id))
            except:
                print("Got error issuing revoke on job %s (%s): %s" % (job_id, task_id, traceback.format_exc()))
                print("Continuing.")

            # generate celery task id
            new_task_id = uuid()
            job_json['task_id'] = new_task_id

            # delete old job status
            delete_by_id(index, _id)

            # log queued status
            job_status_json = {
                'uuid': new_task_id,
                'job_id': job_id,
                'payload_id': job_json['job_info']['job_payload']['payload_task_id'],
                'status': 'job-queued',
                'job': job_json
            }
            log_job_status(job_status_json)

            # submit job
            queue = job_json['job_info']['job_queue']
            run_job.apply_async((job_json,), queue=queue,
                                time_limit=job_json['job_info']['time_limit'],
                                soft_time_limit=job_json['job_info']['soft_time_limit'],
                                priority=job_json['priority'],
                                task_id=new_task_id)
        except Exception as ex:
            print("[ERROR] Exception occurred {0}:{1} {2}".format(
                type(ex), ex, traceback.format_exc()), file=sys.stderr)
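# get_new_job_priority() is shared by both retry implementations above but is not included in this
# excerpt. The sketch below is consistent with the _context.json fields they read
# (job_priority_increment vs. new_job_priority); clamping to Celery's 0-9 priority range is an
# assumption.
def get_new_job_priority(old_priority, increment_by, new_priority):
    if increment_by is not None:
        priority = int(old_priority or 0) + int(increment_by)
    else:
        priority = int(new_priority)
    return max(0, min(priority, 9))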