Example #1
def task_failed(event):
    state.event(event)
    uuid = event['uuid']
    exc = event.get('exception', "")
    if isinstance(exc, str):
        match = TASK_FAILED_RE.search(exc)
        if match:
            short_error = match.group(1)
            es_url = "%s/job_status-current/job/%s" % (
                app.conf['JOBS_ES_URL'], uuid)
            r = requests.get(es_url)
            if r.status_code != 200:
                logging.error("Failed to query for task UUID %s: %s" %
                              (uuid, r.content))
                return
            res = r.json()
            job_status = res['_source']
            job_status['status'] = 'job-failed'
            job_status['error'] = exc
            job_status['short_error'] = short_error
            job_status['traceback'] = event.get('traceback', "")
            time_end = datetime.utcnow().isoformat() + 'Z'
            job_status.setdefault('job', {}).setdefault('job_info', {})['time_end'] = time_end
            log_job_status(job_status)
    log_task_event('task-failed', event, uuid=event['uuid'])
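The task_failed handler above (and worker_offline in the next example) are Celery event callbacks; the calls to state.event(event) imply an app.events.State() object tracking cluster state. Below is a minimal sketch of how such handlers are typically wired into a Celery event receiver, assuming the same app and state objects the handlers themselves use; the monitor_events name is a placeholder, not part of the original module.

def monitor_events(app, state):
    # Sketch: register the handlers with a Celery event receiver.
    # 'task-failed' and 'worker-offline' are standard Celery event types;
    # '*' keeps the in-memory State object current for all other events.
    with app.connection() as connection:
        recv = app.events.Receiver(connection, handlers={
            'task-failed': task_failed,        # handler defined above
            'worker-offline': worker_offline,  # handler in the next example
            '*': state.event,
        })
        recv.capture(limit=None, timeout=None, wakeup=True)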
Example #2
def worker_offline(event):
    set_redis_pool()
    global POOL
    rd = StrictRedis(connection_pool=POOL)
    state.event(event)
    if ORCH_HOST_RE.search(event['hostname']): return
    rd.delete(WORKER_STATUS_KEY_TMPL % event['hostname'])
    time_end = datetime.utcnow().isoformat() + 'Z'
    query = {
        "query" : {
            "filtered" : {
                "query" : {
                    "bool": {
                        "must": [
                            { "term": { "celery_hostname": event['hostname'] } },
                            #{ "term": { "status": 'job-started' } }
                        ]
                    }
                }
            }
        }
    }
    job_status_jsons = []
    #logging.error("query:\n%s" % json.dumps(query, indent=2))
    es_url = "%s/job_status-current/_search?search_type=scan&scroll=60m&size=100" % app.conf['JOBS_ES_URL']
    try:
        r = requests.post(es_url, data=json.dumps(query))
        r.raise_for_status()
        scan_result = r.json()
        scroll_id = scan_result['_scroll_id']
        while True:
            r = requests.post('%s/_search/scroll?scroll=60m' % app.conf['JOBS_ES_URL'], data=scroll_id)
            res = r.json()
            scroll_id = res['_scroll_id']
            if len(res['hits']['hits']) == 0: break
            for hit in res['hits']['hits']:
                job_status_jsons.append(hit['_source'])
        #logging.error("job_status_jsons:\n%s" % job_status_jsons)
        uuids = []
        for job_status_json in job_status_jsons:
            # skip this job unless its real-time status is still job-started
            if rd.get(JOB_STATUS_KEY_TMPL % job_status_json['uuid']) != "job-started":
                continue
            job_status_json['status'] = 'job-offline'
            job_status_json['error'] = 'Received worker-offline event during job execution.'
            job_status_json['short_error'] = 'worker-offline'
            job_status_json.setdefault('job', {}).setdefault('job_info', {})['time_end'] = time_end
            log_job_status(job_status_json)
            uuids.append(job_status_json['uuid'])
        log_worker_event('worker-offline', event, uuid=uuids)
    except Exception as e:
        logging.error("Got exception trying to update task events for " + \
                      "offline worker %s: %s\n%s" % (event['hostname'], str(e),
                                                     traceback.format_exc()))
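Note that the "filtered" query wrapper used above is an Elasticsearch 1.x/2.x construct that was removed in Elasticsearch 5.0; the later examples hit the same index with a plain bool query. A roughly equivalent body for newer Elasticsearch versions, keeping the same celery_hostname term, would be:

query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"celery_hostname": event['hostname']}},
                # {"term": {"status": "job-started"}}  # optional, as commented out above
            ]
        }
    }
}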
Example #3
def offline_jobs(event):
    """Set job status to job-offline."""

    time_end = datetime.utcnow().isoformat() + "Z"
    query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"celery_hostname": event["hostname"]}},
                    {"term": {"status": "job-started"}},
                ]
            }
        }
    }
    logger.info("offline jobs query: %s" % json.dumps(query))
    uuids = []

    try:
        job_status_jsons = mozart_es.query(index="job_status-current", body=query)
        logger.info("Got {} jobs for {}.".format(len(job_status_jsons), event["hostname"]))

        for job_status in job_status_jsons:
            job_status_json = job_status['_source']
            uuid = job_status_json["uuid"]

            # offline the job only if it hasn't been picked up by another worker
            cur_job_status = get_val_via_socket(JOB_STATUS_KEY_TMPL % uuid)
            cur_job_worker = get_val_via_socket(TASK_WORKER_KEY_TMPL % uuid)
            logger.info("cur_job_status: {}".format(cur_job_status))
            logger.info("cur_job_worker: {}".format(cur_job_worker))

            if cur_job_status == "job-started" and cur_job_worker == event["hostname"]:
                job_status_json["status"] = "job-offline"
                job_status_json["error"] = "Received worker-offline event during job execution."
                job_status_json["short_error"] = "worker-offline"
                job_status_json.setdefault("job", {}).setdefault("job_info", {})["time_end"] = time_end
                log_job_status(job_status_json)
                logger.info("Offlined job with UUID %s" % uuid)
                uuids.append(uuid)
            else:
                logger.info("Not offlining job with UUID %s since real-time job status doesn't match" % uuid)
    except Exception as e:
        logger.warn("Got exception trying to update task events for offline worker %s: %s\n%s" % (
            event["hostname"], str(e), traceback.format_exc()))
Example #4
def fail_job(event, uuid, exc, short_error):
    """Set job status to job-failed."""

    query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"uuid": uuid}}
                ]
            }
        }
    }
    search_url = "%s/job_status-current/_search" % app.conf["JOBS_ES_URL"]

    headers = {"Content-Type": "application/json"}
    r = requests.post(search_url, data=json.dumps(query), headers=headers)

    if r.status_code != 200:
        logger.error("Failed to query for task UUID %s: %s" % (uuid, r.content))
        return

    result = r.json()
    total = result["hits"]["total"]["value"]
    if total == 0:
        logger.error("Failed to query for task UUID %s: %s" % (uuid, r.content))
        return

    res = result["hits"]["hits"][0]
    job_status = res["_source"]
    job_status["status"] = "job-failed"
    job_status["error"] = exc
    job_status["short_error"] = short_error
    job_status["traceback"] = event.get("traceback", "")

    time_end = datetime.utcnow().isoformat() + "Z"
    job_status.setdefault("job", {}).setdefault("job_info", {})["time_end"] = time_end
    log_job_status(job_status)
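This version reads result["hits"]["total"]["value"], which assumes the Elasticsearch 7+ response format where hits.total is an object with value and relation fields; in 6.x and earlier it is a plain integer. A small helper along these lines (hypothetical, not part of HySDS) can absorb both shapes:

def get_total_hits(result):
    # Return the hit count from an ES search response, accepting both the
    # pre-7.x integer form and the 7.x {"value": ..., "relation": ...} form.
    total = result["hits"]["total"]
    return total["value"] if isinstance(total, dict) else total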
Example #5
def resubmit_jobs():
    # random sleep to prevent from getting ElasticSearch errors:
    # 429 Client Error: Too Many Requests
    time.sleep(randint(1, 5))
    # can call submit_job

    #iterate through job ids and query to get the job json
    with open('_context.json') as f:
        ctx = json.load(f)

    retry_count_max = ctx['retry_count_max']
    for job_id in ctx['retry_job_id']:
        try:
            ## get job json for ES
            rand_sleep()
            response = query_ES(job_id)
            if response.status_code != 200:
                print("Failed to query ES. Got status code %d:\n%s" %
                      (response.status_code,
                       json.dumps(response.json(), indent=2)))
            response.raise_for_status()
            resp_json = response.json()

            #check retry_remaining_count
            job_json = resp_json["hits"]["hits"][0]["_source"]["job"]

            if 'retry_count' in job_json:
                if job_json['retry_count'] < retry_count_max:
                    job_json['retry_count'] = int(job_json['retry_count']) + 1
                elif job_json['retry_count'] == retry_count_max:
                    print "Job reached retry_count_max limit. Cannot retry again."
                    continue
            else:
                job_json['retry_count'] = 1

            job_json["job_info"]["dedup"] = False
            # clean up job execution info
            for i in ('duration', 'execute_node', 'facts', 'job_dir',
                      'job_url', 'metrics', 'pid', 'public_ip', 'status',
                      'stderr', 'stdout', 'time_end', 'time_queued',
                      'time_start'):
                if i in job_json.get('job_info', {}):
                    del job_json['job_info'][i]

            # set queue time
            job_json['job_info']['time_queued'] = datetime.utcnow().isoformat(
            ) + 'Z'

            # use priority from context
            priority = ctx['job_priority']

            # reset priority
            job_json['priority'] = priority

            # revoke original job
            rand_sleep()
            try:
                app.control.revoke(job_json['job_id'], terminate=True)
                print "revoked original job: %s" % job_json['job_id']
            except Exception, e:
                print "Got error issuing revoke on job %s: %s" % (
                    job_json['job_id'], traceback.format_exc())
                print "Continuing."

            # generate celery task id
            job_json['task_id'] = uuid()

            # delete old job status
            rand_sleep()
            try:
                r = requests.delete("%s/%s/job/_query?q=_id:%s" %
                                    (es_url, query_idx, job_json['job_id']))
                r.raise_for_status()
                print "deleted original job status: %s" % job_json['job_id']
            except Exception, e:
                print "Got error deleting job status %s: %s" % (
                    ctx_json['retry_job_id'], traceback.format_exc())
                print "Continuing."

            # log queued status
            rand_sleep()
            job_status_json = {
                'uuid': job_json['task_id'],
                'job_id': job_json['job_id'],
                'payload_id': job_json['job_info']['job_payload']['payload_task_id'],
                'status': 'job-queued',
                'job': job_json
            }
            log_job_status(job_status_json)

            # submit job
            queue = job_json['job_info']['job_queue']
            res = run_job.apply_async((job_json, ),
                                      queue=queue,
                                      time_limit=None,
                                      soft_time_limit=None,
                                      priority=priority,
                                      task_id=job_json['task_id'])
        except Exception, e:
            # assumed handler closing the try block above: log and move on to the next job id
            print "Got exception trying to resubmit job %s: %s\n%s" % (
                job_id, str(e), traceback.format_exc())
            print "Continuing."
Example #6
def submit_job(j):
    """Submit HySDS job."""

    # get task_id and orchestrator queue
    task_id = submit_job.request.id
    orch_queue = submit_job.request.delivery_info.get('exchange', 'unknown')

    # get container image name and url
    image_name = j.get('container_image_name', None)
    image_url = j.get('container_image_url', None)
    image_mapping = j.get('container_mappings', None)

    # get hard/soft time limits
    time_limit = j.get('time_limit', None)
    soft_time_limit = j.get('soft_time_limit', None)

    # job dedup enabled?
    dedup = j.get('enable_dedup', True)

    # get priority
    priority = j.get('priority', None)
    if priority is None:
        priority = submit_job.request.delivery_info.get('priority')
        if priority is None: priority = 0

    # get tag
    tag = j.get('tag', None)

    # get username
    username = j.get('username', None)

    # default job json
    job = {
        'job_id': task_id,
        'name': task_id,
        'job_info': j,
    }

    # set job type
    if 'job_type' in j:
        match = JOB_TYPE_RE.search(j['job_type'])
        job['type'] = match.group(1) if match else j['job_type']

    # default context
    context = j.get('context', {})

    # get orchestrator configuration
    orch_cfg_file = os.environ.get('HYSDS_ORCHESTRATOR_CFG', None)
    if orch_cfg_file is None:
        error = "Environment variable HYSDS_ORCHESTRATOR_CFG is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))

    #logger.info("HYSDS_ORCHESTRATOR_CFG:%s" % orch_cfg_file)
    if not os.path.exists(orch_cfg_file):
        error = "Orchestrator configuration %s doesn't exist." % orch_cfg_file
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))

    with open(orch_cfg_file) as f:
        orch_cfg = json.load(f)

    # get job creators directory
    job_creators_dir = os.environ.get('HYSDS_JOB_CREATORS_DIR', None)
    if job_creators_dir is None:
        error = "Environment variable HYSDS_JOB_CREATORS_DIR is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))
    #logger.info("HYSDS_JOB_CREATORS_DIR:%s" % job_creators_dir)

    # parse job configurations
    job_cfgs = {}
    for cfg in orch_cfg['configs']:
        job_cfgs[cfg['job_type']] = cfg['job_creators']

    # check that we have info to create jobs
    if 'job_type' not in j:
        error = "Invalid job spec. No 'job_type' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))
    job_type = j['job_type']
    job_queue = j.get('job_queue', None)

    if 'payload' not in j:
        error = "Invalid job spec. No 'payload' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise (OrchestratorExecutionError(error, job_status_json))
    payload = j['payload']
    #logger.info("got job_type: %s" % job_type)
    #logger.info("payload: %s" % payload)

    # set payload hash
    if j.get('payload_hash', None) is None:
        j['payload_hash'] = get_payload_hash(payload)
    payload_hash = j['payload_hash']

    # do dedup
    if dedup is True:
        dj = query_dedup_job(payload_hash)
        if isinstance(dj, dict):
            dedup_msg = "orchestrator found duplicate job %s with status %s" % (
                dj['_id'], dj['status'])
            job_status_json = {
                'uuid': job['job_id'],
                'job_id': job['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'dedup_job': dj['_id'],
                'status': 'job-deduped',
                'job': job,
                'context': context,
                'dedup_msg': dedup_msg
            }
            log_job_status(job_status_json)
            return [task_id]

    # if no explicit job or data type defined in orchestrator, add catch-all
    if job_type not in job_cfgs:
        # first check if data product type; if not then assume job type
        match = DATA_TYPE_RE.search(job_type)
        if match:
            return queue_dataset_evaluation(payload)
        else:
            match = JOB_TYPE_RE.search(job_type)
            jt = match.group(1) if match else job_type
            job_cfgs[job_type] = [{
                "job_name": j.get('job_name', jt).replace(":", "__"),
                "function": "utils.get_job_json",
                "job_queues": [jt if job_queue is None else job_queue]
            }]

    # get job json and queue jobs
    results = []
    for jc in job_cfgs[job_type]:
        func = get_function(jc['function'], add_to_sys_path=job_creators_dir)
        argspec = getargspec(func)
        try:
            if len(argspec.args) > 1 and 'job_type' in argspec.args:
                match = JOB_TYPE_RE.search(job_type)
                jt = match.group(1) if match else job_type
                job = func(payload, jt)
            else:
                job = func(payload)
        except Exception as e:
            error = "Job creator function %s failed to generate job JSON." % jc[
                'function']
            error_info = ERROR_TMPL.substitute(orch_queue=orch_queue,
                                               error=error)
            job_status_json = {
                'uuid': job['job_id'],
                'job_id': job['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'status': 'job-failed',
                'job': {
                    'job_id': task_id,
                    'name': task_id,
                    'job_info': j
                },
                'context': context,
                'error': error_info,
                'short_error': get_short_error(error_info),
                'traceback': traceback.format_exc()
            }
            log_job_status(job_status_json)
            raise (OrchestratorExecutionError(error, job_status_json))
        #logger.info("job: %s" % job)

        # set context
        job.setdefault('context', {}).update(context)

        # override hard/soft time limits
        time_limit = jc.get('time_limit', time_limit)
        soft_time_limit = jc.get('soft_time_limit', soft_time_limit)

        # queue jobs
        for queue in jc['job_queues']:
            # copy job
            job_json = copy.deepcopy(job)

            # set job id
            if 'name' in job:
                job_json['job_id'] = get_job_id(job['name'])
            else:
                job_json['job_id'] = get_job_id(jc['job_name'])
                job_json['name'] = job_json['job_id']

            # set container image name and url
            if image_name is not None:
                job_json['container_image_name'] = image_name
            if image_url is not None:
                job_json['container_image_url'] = image_url
            if image_mapping is not None:
                job_json['container_mappings'] = image_mapping

            # set priority
            job_json['priority'] = priority

            # set tag
            if 'tag' not in job_json and tag is not None:
                job_json['tag'] = tag

            # set username
            if 'username' not in job_json and username is not None:
                job_json['username'] = username

            # set job_info
            time_queued = datetime.utcnow()
            job_json['job_info'] = {
                'id': job_json['job_id'],
                'job_queue': queue,
                'time_queued': time_queued.isoformat() + 'Z',
                'time_limit': time_limit,
                'soft_time_limit': soft_time_limit,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'job_payload': {
                    'job_type': job_type,
                    'payload_task_id': task_id,
                }
            }

            # generate celery task id
            job_json['task_id'] = uuid()

            # log queued status
            job_status_json = {
                'uuid': job_json['task_id'],
                'job_id': job_json['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'status': 'job-queued',
                'job': job_json
            }
            log_job_status(job_status_json)

            # submit job
            res = run_job.apply_async((job_json, ),
                                      queue=queue,
                                      time_limit=time_limit,
                                      soft_time_limit=soft_time_limit,
                                      priority=priority,
                                      task_id=job_json['task_id'])

            # append result
            results.append(job_json['task_id'])

    return results
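submit_job reads the file pointed to by HYSDS_ORCHESTRATOR_CFG as JSON and expects a top-level "configs" list mapping each "job_type" to its "job_creators", each creator carrying "job_name", "function", "job_queues", and optional "time_limit"/"soft_time_limit" overrides. The shape below is inferred purely from how the function consumes the config; the job type, creator module, and queue name are made up for illustration.

# Illustrative orchestrator configuration, shaped the way submit_job reads it.
orch_cfg = {
    "configs": [
        {
            "job_type": "job:example_ingest",   # hypothetical job type
            "job_creators": [
                {
                    "job_name": "example_ingest",
                    "function": "example_creators.get_job_json",  # resolved via get_function()
                    "job_queues": ["example_queue"],
                    "soft_time_limit": 3600,    # optional per-creator overrides
                    "time_limit": 3900
                }
            ]
        }
    ]
}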
Example #7
def submit_job(j):
    """Submit HySDS job."""

    # get task_id and orchestrator queue
    task_id = submit_job.request.id
    orch_queue = submit_job.request.delivery_info.get("exchange", "unknown")

    # get container image name and url
    image_name = j.get("container_image_name", None)
    image_url = j.get("container_image_url", None)
    image_mapping = j.get("container_mappings", None)

    # get container runtime options
    runtime_options = j.get("runtime_options", None)

    # get hard/soft time limits
    time_limit = j.get("time_limit", None)
    soft_time_limit = j.get("soft_time_limit", None)

    # job dedup enabled?
    dedup = j.get("enable_dedup", True)

    # get priority
    priority = j.get("priority", None)
    if priority is None:
        priority = submit_job.request.delivery_info.get("priority")
        if priority is None:
            priority = 0

    # get tag
    tag = j.get("tag", None)

    # get username
    username = j.get("username", None)

    # default job json
    job = {
        "job_id": task_id,
        "name": task_id,
        "job_info": j,
    }

    # set job type
    if "job_type" in j:
        match = JOB_TYPE_RE.search(j["job_type"])
        job["type"] = match.group(1) if match else j["job_type"]

    # default context
    context = j.get("context", {})

    # get orchestrator configuration
    orch_cfg_file = os.environ.get("HYSDS_ORCHESTRATOR_CFG", None)
    if orch_cfg_file is None:
        error = "Environment variable HYSDS_ORCHESTRATOR_CFG is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    # logger.info("HYSDS_ORCHESTRATOR_CFG:%s" % orch_cfg_file)
    if not os.path.exists(orch_cfg_file):
        error = "Orchestrator configuration %s doesn't exist." % orch_cfg_file
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    with open(orch_cfg_file) as f:
        orch_cfg = json.load(f)

    # get job creators directory
    job_creators_dir = os.environ.get("HYSDS_JOB_CREATORS_DIR", None)
    if job_creators_dir is None:
        error = "Environment variable HYSDS_JOB_CREATORS_DIR is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    # logger.info("HYSDS_JOB_CREATORS_DIR:%s" % job_creators_dir)

    # parse job configurations
    job_cfgs = {}
    for cfg in orch_cfg["configs"]:
        job_cfgs[cfg["job_type"]] = cfg["job_creators"]

    # check that we have info to create jobs
    if "job_type" not in j:
        error = "Invalid job spec. No 'job_type' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    job_type = j["job_type"]
    job_queue = j.get("job_queue", None)

    if "payload" not in j:
        error = "Invalid job spec. No 'payload' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    payload = j["payload"]
    # logger.info("got job_type: %s" % job_type)
    # logger.info("payload: %s" % payload)

    # set payload hash
    if j.get("payload_hash", None) is None:
        j["payload_hash"] = get_payload_hash(payload)
    payload_hash = j["payload_hash"]

    # do dedup
    if dedup is True:
        try:
            dj = query_dedup_job(payload_hash)
        except NoDedupJobFoundException as e:
            logger.info(str(e))
            dj = None
        if isinstance(dj, dict):
            dedup_msg = "orchestrator found duplicate job %s with status %s" % (
                dj["_id"],
                dj["status"],
            )
            job_status_json = {
                "uuid": job["job_id"],
                "job_id": job["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "dedup_job": dj["_id"],
                "status": "job-deduped",
                "job": job,
                "context": context,
                "dedup_msg": dedup_msg,
            }
            log_job_status(job_status_json)
            return [task_id]

    # if no explicit job or data type defined in orchestrator, add catch-all
    if job_type not in job_cfgs:
        # first check if data product type; if not then assume job type
        match = DATA_TYPE_RE.search(job_type)
        if match:
            return queue_dataset_evaluation(payload)
        else:
            match = JOB_TYPE_RE.search(job_type)
            jt = match.group(1) if match else job_type
            job_cfgs[job_type] = [
                {
                    "job_name": j.get("job_name", jt).replace(":", "__"),
                    "function": "utils.get_job_json",
                    "job_queues": [jt if job_queue is None else job_queue],
                }
            ]

    # get job json and queue jobs
    results = []
    for jc in job_cfgs[job_type]:
        func = get_function(jc["function"], add_to_sys_path=job_creators_dir)
        argspec = getargspec(func)
        try:
            if len(argspec.args) > 1 and "job_type" in argspec.args:
                match = JOB_TYPE_RE.search(job_type)
                jt = match.group(1) if match else job_type
                job = func(payload, jt)
            else:
                job = func(payload)
        except Exception as e:
            error = (
                "Job creator function %s failed to generate job JSON." % jc["function"]
            )
            error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
            job_status_json = {
                "uuid": job["job_id"],
                "job_id": job["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "status": "job-failed",
                "job": {"job_id": task_id, "name": task_id, "job_info": j},
                "context": context,
                "error": error_info,
                "short_error": get_short_error(error_info),
                "traceback": traceback.format_exc(),
            }
            log_job_status(job_status_json)
            raise OrchestratorExecutionError(error, job_status_json)
        # logger.info("job: %s" % job)

        # set context
        job.setdefault("context", {}).update(context)

        # override hard/soft time limits and ensure gap
        soft_time_limit, time_limit = ensure_hard_time_limit_gap(
            jc.get("soft_time_limit", soft_time_limit), jc.get("time_limit", time_limit)
        )

        # queue jobs
        for queue in jc["job_queues"]:
            # copy job
            job_json = copy.deepcopy(job)

            # set job id
            if "name" in job:
                job_json["job_id"] = get_job_id(job["name"])
            else:
                job_json["job_id"] = get_job_id(jc["job_name"])
                job_json["name"] = job_json["job_id"]

            # set container image name, url, mappings, and runtime options
            if image_name is not None:
                job_json["container_image_name"] = image_name
            if image_url is not None:
                job_json["container_image_url"] = image_url
            if image_mapping is not None:
                job_json["container_mappings"] = image_mapping
            if runtime_options is not None:
                job_json["runtime_options"] = runtime_options

            # set priority
            job_json["priority"] = priority

            # set tag
            if "tag" not in job_json and tag is not None:
                job_json["tag"] = tag

            # set username
            if "username" not in job_json and username is not None:
                job_json["username"] = username

            # set job_info
            time_queued = datetime.utcnow()
            job_json["job_info"] = {
                "id": job_json["job_id"],
                "job_queue": queue,
                "time_queued": time_queued.isoformat() + "Z",
                "time_limit": time_limit,
                "soft_time_limit": soft_time_limit,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "job_payload": {
                    "job_type": job_type,
                    "payload_task_id": task_id,
                },
            }

            # generate celery task id
            job_json["task_id"] = uuid()

            # log queued status
            job_status_json = {
                "uuid": job_json["task_id"],
                "job_id": job_json["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "status": "job-queued",
                "job": job_json,
            }
            log_job_status(job_status_json)

            # submit job
            res = run_job.apply_async(
                (job_json,),
                queue=queue,
                time_limit=time_limit,
                soft_time_limit=soft_time_limit,
                priority=priority,
                task_id=job_json["task_id"],
            )

            # append result
            results.append(job_json["task_id"])

    return results
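Compared to the previous version, this one routes the per-creator limits through ensure_hard_time_limit_gap, whose implementation is not shown here. Judging only from the call site, it takes (soft_time_limit, time_limit) and returns the pair with the hard limit guaranteed to exceed the soft limit; the sketch below is an assumption about that contract, and the 60-second padding in particular is made up.

def ensure_hard_time_limit_gap(soft_time_limit, time_limit, gap=60):
    # Sketch only: if a soft limit is set, make sure the hard limit leaves
    # room above it. The real HySDS helper may differ.
    if soft_time_limit is not None:
        if time_limit is None or time_limit <= soft_time_limit:
            time_limit = soft_time_limit + gap
    return soft_time_limit, time_limit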
Example #8
def resubmit_jobs():
    es_url = app.conf["JOBS_ES_URL"]
    # random sleep to prevent from getting ElasticSearch errors:
    # 429 Client Error: Too Many Requests
    time.sleep(randint(1,5))
    # can call submit_job

    # iterate through job ids and query to get the job json
    with open('_context.json') as f:
        ctx = json.load(f)

    increment_by = None
    new_priority = None
    if "job_priority_increment" in ctx:
        increment_by = ctx["job_priority_increment"]
    else:
        new_priority = ctx["new_job_priority"]

    retry_count_max = ctx['retry_count_max']
    retry_job_ids = ctx['retry_job_id'] if isinstance(ctx['retry_job_id'], list) else [ctx['retry_job_id']]
    for job_id in retry_job_ids:
        print("Retrying job: {}".format(job_id))
        try:
            # get job json for ES
            rand_sleep()
            response = query_ES(job_id)
            if response.status_code != 200:
                print("Failed to query ES. Got status code %d:\n%s" % (response.status_code, json.dumps(response.json(),
                                                                                                        indent=2)))
            response.raise_for_status()
            resp_json = response.json()
            job_json = resp_json["hits"]["hits"][0]["_source"]["job"]

            # don't retry a retry
            if job_json['type'].startswith('job-lw-mozart-retry'):
                print "Cannot retry retry job %s. Skipping" % job_id
                continue

            # check retry_remaining_count
            if 'retry_count' in job_json:
                if job_json['retry_count'] < retry_count_max:
                    job_json['retry_count'] = int(job_json['retry_count']) + 1
                else:
                    print "For job {}, retry_count now is {}, retry_count_max limit of {} reached. Cannot retry again."\
                        .format(job_id, job_json['retry_count'], retry_count_max)
                    continue
            else:
                job_json['retry_count'] = 1
            job_json["job_info"]["dedup"] = False
            # clean up job execution info
            for i in ('duration', 'execute_node', 'facts', 'job_dir', 'job_url',
                      'metrics', 'pid', 'public_ip', 'status', 'stderr',
                      'stdout', 'time_end', 'time_queued', 'time_start'):
                if i in job_json.get('job_info', {}):
                    del job_json['job_info'][i]

            # set queue time
            job_json['job_info']['time_queued'] = datetime.utcnow().isoformat() + 'Z'

            # reset priority
            old_priority = job_json['priority']
            job_json['priority'] = get_new_job_priority(old_priority=old_priority, increment_by=increment_by,
                                                        new_priority=new_priority)

            # revoke original job
            rand_sleep()
            try:
                app.control.revoke(job_json['job_id'], terminate=True)
                print "revoked original job: %s" % job_json['job_id']
            except Exception, e:
                print "Got error issuing revoke on job %s: %s" % (job_json['job_id'], traceback.format_exc())
                print "Continuing."

            # generate celery task id
            job_json['task_id'] = uuid()

            # delete old job status
            rand_sleep()
            try:
                r = requests.delete("%s/%s/job/_query?q=_id:%s" % (es_url, query_idx, job_json['job_id']))
                r.raise_for_status()
                print "deleted original job status: %s" % job_json['job_id']
            except Exception, e:
                print "Got error deleting job status %s: %s" % (job_json['job_id'], traceback.format_exc())
                print "Continuing."

            # log queued status
            rand_sleep()
            job_status_json = {'uuid': job_json['task_id'],
                               'job_id': job_json['job_id'],
                               'payload_id': job_json['job_info']['job_payload']['payload_task_id'],
                               'status': 'job-queued',
                               'job': job_json }
            log_job_status(job_status_json)

            # submit job
            queue = job_json['job_info']['job_queue']
            res = run_job.apply_async((job_json,), queue=queue,
                                      time_limit=job_json['job_info']['time_limit'],
                                      soft_time_limit=job_json['job_info']['soft_time_limit'],
                                      priority=job_json['priority'],
                                      task_id=job_json['task_id'])
        except Exception, e:
            # assumed handler closing the try block above: log and move on to the next job id
            print "Got exception trying to resubmit job %s: %s\n%s" % (
                job_id, str(e), traceback.format_exc())
            print "Continuing."
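The priority reset above delegates to get_new_job_priority, which is not shown. From the surrounding context it either bumps the old priority by job_priority_increment or returns the explicit new_job_priority; the sketch below assumes that behavior, and the clamping to RabbitMQ's usual 0-9 priority range is an added assumption.

def get_new_job_priority(old_priority, increment_by, new_priority):
    # Sketch of the helper implied by the call site above.
    if increment_by is not None:
        priority = old_priority + increment_by
    else:
        priority = new_priority
    # assumed: keep the result within the 0-9 priority range used with RabbitMQ
    return max(0, min(9, priority))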
Example #9
def resubmit_jobs(context):
    """
    logic to resubmit the job
    :param context: contents from _context.json
    """

    # iterate through job ids and query to get the job json
    increment_by = None
    new_priority = None
    if "job_priority_increment" in context:
        increment_by = context["job_priority_increment"]
    else:
        new_priority = context["new_job_priority"]

    retry_count_max = context['retry_count_max']

    if isinstance(context['retry_job_id'], list):
        retry_job_ids = context['retry_job_id']
    else:
        retry_job_ids = [context['retry_job_id']]

    for job_id in retry_job_ids:
        print(("Validating retry job: {}".format(job_id)))
        try:
            doc = query_es(job_id)
            if doc['hits']['total']['value'] == 0:
                print('job id %s not found in Elasticsearch. Continuing.' %
                      job_id)
                continue
            doc = doc["hits"]["hits"][0]

            job_json = doc["_source"]["job"]
            task_id = doc["_source"]["uuid"]
            index = doc["_index"]
            _id = doc["_id"]

            # don't retry a retry
            if job_json['type'].startswith('job-lw-mozart-retry'):
                print("Cannot retry retry job %s. Skipping" % job_id)
                continue

            # check retry_remaining_count
            if 'retry_count' in job_json:
                if job_json['retry_count'] < retry_count_max:
                    job_json['retry_count'] = int(job_json['retry_count']) + 1
                else:
                    print(
                        "For job {}, retry_count now is {}, retry_count_max limit of {} reached. Cannot retry again."
                        .format(job_id, job_json['retry_count'],
                                retry_count_max))
                    continue
            else:
                job_json['retry_count'] = 1
            job_json["job_info"]["dedup"] = False

            # clean up job execution info
            for i in ('duration', 'execute_node', 'facts', 'job_dir',
                      'job_url', 'metrics', 'pid', 'public_ip', 'status',
                      'stderr', 'stdout', 'time_end', 'time_queued',
                      'time_start'):
                if i in job_json.get('job_info', {}):
                    del job_json['job_info'][i]

            # set queue time
            job_json['job_info']['time_queued'] = datetime.utcnow().isoformat(
            ) + 'Z'

            # reset priority
            old_priority = job_json['priority']
            job_json['priority'] = get_new_job_priority(
                old_priority=old_priority,
                increment_by=increment_by,
                new_priority=new_priority)

            # get state
            task = app.AsyncResult(task_id)
            state = task.state

            # revoke
            job_id = job_json['job_id']
            try:
                revoke(task_id, state)
                print("revoked original job: %s (%s)" % (job_id, task_id))
            except:
                print("Got error issuing revoke on job %s (%s): %s" %
                      (job_id, task_id, traceback.format_exc()))
                print("Continuing.")

            # generate celery task id
            new_task_id = uuid()
            job_json['task_id'] = new_task_id

            # delete old job status
            delete_by_id(index, _id)

            # log queued status
            job_status_json = {
                'uuid': new_task_id,
                'job_id': job_id,
                'payload_id': job_json['job_info']['job_payload']['payload_task_id'],
                'status': 'job-queued',
                'job': job_json
            }
            log_job_status(job_status_json)

            # submit job
            queue = job_json['job_info']['job_queue']
            run_job.apply_async(
                (job_json, ),
                queue=queue,
                time_limit=job_json['job_info']['time_limit'],
                soft_time_limit=job_json['job_info']['soft_time_limit'],
                priority=job_json['priority'],
                task_id=new_task_id)
        except Exception as ex:
            print("[ERROR] Exception occurred {0}:{1} {2}".format(
                type(ex), ex, traceback.format_exc()),
                  file=sys.stderr)
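Unlike the earlier versions, this resubmit_jobs takes the parsed context as a parameter instead of opening _context.json itself, so a caller is expected to load the file first. A minimal driver, assuming the same _context.json layout the earlier examples read (retry_job_id, retry_count_max, and either new_job_priority or job_priority_increment):

if __name__ == "__main__":
    import json

    with open("_context.json") as f:
        context = json.load(f)
    resubmit_jobs(context)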