def get_module(m):
    """Import module and return."""
    try:
        return import_module(m)
    except ImportError:
        logger.error('Failed to import module "%s".' % m)
        raise


def download_file(url, path, cache=False):
    """Download file/dir for input."""
    params = get_download_params(url)
    if cache:
        url_hash = hashlib.md5(url.encode()).hexdigest()
        ### app.conf.ROOT_WORK_DIR = os.environ['HYSDS_ROOT_WORK_DIR']
        ### logger.info("****** in utils.py:download_file(), app.conf.ROOT_WORK_DIR: %s" % app.conf.ROOT_WORK_DIR)
        # get it from env variable set in the top level shell script (e.g., celery_worker.sh)
        root_cache_dir = os.environ['HYSDS_ROOT_CACHE_DIR']
        logger.info("****** in utils.py:download_file(), root_cache_dir: %s" % root_cache_dir)
        ### hash_dir = os.path.join(app.conf.ROOT_WORK_DIR,
        hash_dir = os.path.join(root_cache_dir, 'cache', *url_hash[0:4])
        cache_dir = os.path.join(hash_dir, url_hash)
        makedirs(cache_dir)
        signal_file = os.path.join(cache_dir, '.localized')
        if os.path.exists(signal_file):
            logger.info("cache hit for {} at {}".format(url, cache_dir))
        else:
            logger.info("cache miss for {}".format(url))
            try:
                osaka.main.get(url, cache_dir, params=params)
            except Exception as e:
                shutil.rmtree(cache_dir)
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to download {} to cache {}: {}\n{}".format(
                        url, cache_dir, str(e), tb))
            with atomic_write(signal_file, overwrite=True) as f:
                f.write("%sZ\n" % datetime.utcnow().isoformat())
        for i in os.listdir(cache_dir):
            if i == '.localized':
                continue
            cached_obj = os.path.join(cache_dir, i)
            if os.path.isdir(cached_obj):
                dst = os.path.join(path, i) if os.path.isdir(path) else path
                try:
                    os.symlink(cached_obj, dst)
                except:
                    logger.error("Failed to soft link {} to {}".format(
                        cached_obj, dst))
                    raise
            else:
                try:
                    os.symlink(cached_obj, path)
                except:
                    logger.error("Failed to soft link {} to {}".format(
                        cached_obj, path))
                    raise
    else:
        return osaka.main.get(url, path, params=params)


def evaluate_user_rules_job(job_id, alias=STATUS_ALIAS):
    """
    Process all user rules in ES database and check if this job ID matches.
    If so, submit jobs. Otherwise do nothing.
    """
    time.sleep(10)  # sleep 10 seconds to allow ES documents to be indexed
    ensure_job_indexed(job_id, alias)  # ensure job is indexed

    # get all enabled user rules
    query = {
        "query": {
            "term": {
                "enabled": True
            }
        }
    }
    rules = mozart_es.query(index=USER_RULES_JOB_INDEX, body=query)
    logger.info("Total %d enabled rules to check." % len(rules))

    for rule in rules:
        time.sleep(1)  # sleep between queries
        rule = rule['_source']  # extracting _source from the rule itself
        logger.info('rule: %s' % json.dumps(rule, indent=2))

        try:
            updated_query = update_query(job_id, rule)  # check for matching rules
            rule['query'] = updated_query
            rule['query_string'] = json.dumps(updated_query)
        except (RuntimeError, Exception) as e:
            logger.error("unable to update user_rule's query, skipping")
            logger.error(e)
            continue

        rule_name = rule['rule_name']
        final_qs = rule['query_string']
        logger.info("updated query: %s" % json.dumps(final_qs, indent=2))

        # check for matching rules
        try:
            result = mozart_es.es.search(index=alias, body=final_qs)
            if result['hits']['total']['value'] == 0:
                logger.info("Rule '%s' didn't match for %s" % (rule_name, job_id))
                continue
        except ElasticsearchException as e:
            logger.error("Failed to query ES")
            logger.error(e)
            continue

        doc_res = result['hits']['hits'][0]
        logger.info("Rule '%s' successfully matched for %s" % (rule_name, job_id))

        # submit trigger task
        queue_job_trigger(doc_res, rule)
        logger.info("Trigger task submitted for %s: %s" % (job_id, rule['job_type']))
    return True


def download_file(url, path, cache=False):
    """Download file/dir for input."""
    params = get_download_params(url)
    if cache:
        url_hash = hashlib.md5(url.encode()).hexdigest()
        hash_dir = os.path.join(app.conf.ROOT_WORK_DIR, "cache", *url_hash[0:4])
        cache_dir = os.path.join(hash_dir, url_hash)
        makedirs(cache_dir)
        signal_file = os.path.join(cache_dir, ".localized")
        if os.path.exists(signal_file):
            logger.info("cache hit for {} at {}".format(url, cache_dir))
        else:
            logger.info("cache miss for {}".format(url))
            try:
                osaka.main.get(url, cache_dir, params=params)
            except Exception as e:
                shutil.rmtree(cache_dir)
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to download {} to cache {}: {}\n{}".format(
                        url, cache_dir, str(e), tb))
            with atomic_write(signal_file, overwrite=True) as f:
                f.write("%sZ\n" % datetime.utcnow().isoformat())
        for i in os.listdir(cache_dir):
            if i == ".localized":
                continue
            cached_obj = os.path.join(cache_dir, i)
            if os.path.isdir(cached_obj):
                dst = os.path.join(path, i) if os.path.isdir(path) else path
                try:
                    os.symlink(cached_obj, dst)
                except:
                    logger.error("Failed to soft link {} to {}".format(
                        cached_obj, dst))
                    raise
            else:
                try:
                    os.symlink(cached_obj, path)
                except:
                    logger.error("Failed to soft link {} to {}".format(
                        cached_obj, path))
                    raise
    else:
        return osaka.main.get(url, path, params=params)


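# Illustration only (not part of hysds.utils): a minimal sketch of how download_file()
# derives the cache location for a URL when cache=True. The root work dir passed in
# below is a hypothetical example value standing in for app.conf.ROOT_WORK_DIR.
def _example_cache_dir(url, root_work_dir="/data/work"):
    """Return the cache directory download_file() would use for `url`."""
    import hashlib
    import os
    url_hash = hashlib.md5(url.encode()).hexdigest()
    # the first four hex characters become nested subdirectories under <root>/cache/
    hash_dir = os.path.join(root_work_dir, "cache", *url_hash[0:4])
    return os.path.join(hash_dir, url_hash)

# e.g. _example_cache_dir("http://example.com/granule.zip")
# -> "/data/work/cache/<h0>/<h1>/<h2>/<h3>/<full md5 hash>"; the ".localized" signal file
#    written inside this directory marks a completed download.

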
def get_func(f):
    """Import function and return."""
    if "." in f:
        mod_name, func_name = f.rsplit(".", 1)
        mod = get_module(mod_name)
        try:
            return getattr(mod, func_name)
        except AttributeError:
            logger.error('Failed to get function "%s" from module "%s".' %
                         (func_name, mod_name))
            raise
    else:
        try:
            return eval(f)
        except NameError:
            logger.error('Failed to get function "%s".' % (f))
            raise


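# Usage sketch (illustration only): get_func() resolves a dotted path through get_module()
# and getattr(), and falls back to eval() for bare names such as builtins.
def _example_get_func():
    join = get_func("os.path.join")   # imports os.path, returns os.path.join
    builtin_len = get_func("len")     # no dot: resolved via eval() -> builtin len
    return join("a", "b"), builtin_len([1, 2, 3])

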
def ensure_dataset_indexed(objectid, system_version, alias):
    """Ensure dataset is indexed."""
    query = {
        "query": {
            "bool": {
                "must": [
                    {'term': {'_id': objectid}},
                    {'term': {'system_version.keyword': system_version}}
                ]
            }
        }
    }
    logger.info("ensure_dataset_indexed query: %s" % json.dumps(query))

    try:
        count = grq_es.get_count(index=alias, body=query)
        if count == 0:
            error_message = "Failed to find indexed dataset: %s (%s)" % (
                objectid, system_version)
            logger.error(error_message)
            raise RuntimeError(error_message)
        logger.info("Found indexed dataset: %s (%s)" % (objectid, system_version))
    except ElasticsearchException as e:
        logger.error("Unable to execute query")
        logger.error(e)


def fail_job(event, uuid, exc, short_error):
    """Set job status to job-failed."""
    query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"uuid": uuid}}
                ]
            }
        }
    }
    search_url = "%s/job_status-current/_search" % app.conf["JOBS_ES_URL"]
    headers = {"Content-Type": "application/json"}
    r = requests.post(search_url, data=json.dumps(query), headers=headers)

    if r.status_code != 200:
        logger.error("Failed to query for task UUID %s: %s" % (uuid, r.content))
        return

    result = r.json()
    total = result["hits"]["total"]["value"]
    if total == 0:
        logger.error("Failed to query for task UUID %s: %s" % (uuid, r.content))
        return

    res = result["hits"]["hits"][0]
    job_status = res["_source"]
    job_status["status"] = "job-failed"
    job_status["error"] = exc
    job_status["short_error"] = short_error
    job_status["traceback"] = event.get("traceback", "")
    time_end = datetime.utcnow().isoformat() + "Z"
    job_status.setdefault("job", {}).setdefault("job_info", {})["time_end"] = time_end
    log_job_status(job_status)


def triage(job, ctx):
    """Triage failed job's context and job json as well as _run.sh."""

    # set time_start if not defined (job failed prior to setting it)
    if "time_start" not in job["job_info"]:
        job["job_info"]["time_start"] = "{}Z".format(
            datetime.utcnow().isoformat("T"))

    # default triage id
    default_triage_id_format = "triaged_job-{job_id}_task-{job[task_id]}"
    default_triage_id_regex = "triaged_job-(?P<job_id>.+)_task-(?P<task_id>[-\\w])"

    # if exit code of job command is zero, don't triage anything
    exit_code = job["job_info"]["status"]
    if exit_code == 0:
        logger.info("Job exited with exit code %s. No need to triage." % exit_code)
        return True

    # disable triage
    if ctx.get("_triage_disabled", False):
        logger.info("Flag _triage_disabled set to True. Not performing triage.")
        return True

    # Check if custom triage id format was provided
    if "_triage_id_format" in ctx:
        triage_id_format = ctx["_triage_id_format"]
    else:
        triage_id_format = default_triage_id_format

    # get job info
    job_dir = job["job_info"]["job_dir"]
    job_id = job["job_info"]["id"]
    logger.info("job id: {}".format(job_id))

    # Check if the job_id is a triaged dataset. If so, let's parse out the job_id
    logger.info("Checking to see if the job_id matches the regex: {}".format(
        default_triage_id_regex))
    match = re.search(default_triage_id_regex, job_id)
    if match:
        logger.info("job_id matches the triage dataset regex. Parsing out job_id")
        parsed_job_id = match.groupdict()["job_id"]
        logger.info("extracted job_id: {}".format(parsed_job_id))
    else:
        logger.info("job_id does not match the triage dataset regex: {}".format(
            default_triage_id_regex))
        parsed_job_id = job_id

    # create triage dataset
    # Attempt to first use triage id format from user, but if there is any problem use the default id format instead
    try:
        triage_id = triage_id_format.format(job_id=parsed_job_id, job=job,
                                            job_context=ctx)
    except Exception as e:
        logger.warning(
            "Failed to apply custom triage id format because of {}: {}. "
            "Falling back to default triage id".format(e.__class__.__name__, e))
        triage_id = default_triage_id_format.format(job_id=parsed_job_id, job=job,
                                                    job_context=ctx)
    triage_dir = os.path.join(job_dir, triage_id)
    makedirs(triage_dir)

    # create dataset json
    ds_file = os.path.join(triage_dir, "{}.dataset.json".format(triage_id))
    ds = {
        "version": "v{}".format(hysds.__version__),
        "label": "triage for job {}".format(parsed_job_id),
    }
    if "cmd_start" in job["job_info"]:
        ds["starttime"] = job["job_info"]["cmd_start"]
    if "cmd_end" in job["job_info"]:
        ds["endtime"] = job["job_info"]["cmd_end"]
    with open(ds_file, "w") as f:
        json.dump(ds, f, sort_keys=True, indent=2)

    # create met json
    met_file = os.path.join(triage_dir, "{}.met.json".format(triage_id))
    with open(met_file, "w") as f:
        json.dump(job["job_info"], f, sort_keys=True, indent=2)

    # triage job-related files
    for f in glob(os.path.join(job_dir, "_*")):
        if os.path.isdir(f):
            shutil.copytree(f, os.path.join(triage_dir, os.path.basename(f)))
        else:
            shutil.copy(f, triage_dir)

    # triage log files
    for f in glob(os.path.join(job_dir, "*.log")):
        if os.path.isdir(f):
            shutil.copytree(f, os.path.join(triage_dir, os.path.basename(f)))
        else:
            shutil.copy(f, triage_dir)

    # triage additional globs
    for g in ctx.get("_triage_additional_globs", []):
        for f in glob(os.path.join(job_dir, g)):
            f = os.path.normpath(f)
            dst = os.path.join(triage_dir, os.path.basename(f))
            if os.path.exists(dst):
                dst = "{}.{}Z".format(dst, datetime.utcnow().isoformat("T"))
            try:
                if os.path.isdir(f):
                    shutil.copytree(f, dst)
                else:
                    shutil.copy(f, dst)
            except Exception as e:
                tb = traceback.format_exc()
                logger.error("Skipping copying of {}. Got exception: {}\n{}".format(
                    f, str(e), tb))

    # publish
    prod_json = publish_dataset(triage_dir, ds_file, job, ctx)

    # write published triage to file
    pub_triage_file = os.path.join(job_dir, "_triaged.json")
    with open(pub_triage_file, "w") as f:
        json.dump(prod_json, f, indent=2, sort_keys=True)

    # signal run_job() to continue
    return True


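# Illustration only: how a "_triage_id_format" supplied in the job context is applied by
# triage() above. The format string may reference job_id, job, and job_context; the job
# values below are hypothetical.
def _example_triage_id():
    fmt = "triaged_job-{job_id}_task-{job[task_id]}"  # the default format
    job = {"task_id": "abc-123"}
    return fmt.format(job_id="my_job-20200101", job=job, job_context={})
    # -> "triaged_job-my_job-20200101_task-abc-123"

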
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

from aws_requests_auth.boto_utils import BotoAWSRequestsAuth
from elasticsearch import RequestsHttpConnection

from hysds.celery import app
from hysds.log_utils import logger

try:
    from hysds_commons.elasticsearch_utils import ElasticsearchUtility
except (ImportError, ModuleNotFoundError):
    logger.error('Cannot import hysds_commons.elasticsearch_utils')

MOZART_ES = None
GRQ_ES = None


def get_mozart_es():
    global MOZART_ES
    if MOZART_ES is None:
        MOZART_ES = ElasticsearchUtility(app.conf.JOBS_ES_URL, logger)
    return MOZART_ES


def get_grq_es():
    global GRQ_ES
    if GRQ_ES is None:
        # completion assumed to mirror get_mozart_es(), using GRQ_ES_URL from app.conf
        GRQ_ES = ElasticsearchUtility(app.conf.GRQ_ES_URL, logger)
    return GRQ_ES


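# Usage sketch (illustration only): the getters lazily build one ElasticsearchUtility per
# cluster and cache it at module level, so repeated calls return the same client. The
# module-level mozart_es/grq_es names used by the rule-evaluation functions below are
# assumed to be obtained this way.
mozart_es = get_mozart_es()  # cached; a second call returns the same instance
grq_es = get_grq_es()

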
def evaluate_user_rules_dataset(objectid, system_version, alias=DATASET_ALIAS,
                                job_queue=JOBS_PROCESSED_QUEUE):
    """
    Process all user rules in ES database and check if this objectid matches.
    If so, submit jobs. Otherwise do nothing.
    """
    time.sleep(10)  # sleep for 10 seconds; let any documents finish indexing in ES
    ensure_dataset_indexed(objectid, system_version, alias)  # ensure dataset is indexed

    # get all enabled user rules
    query = {"query": {"term": {"enabled": True}}}
    rules = mozart_es.query(index=USER_RULES_DATASET_INDEX, body=query)
    logger.info("Total %d enabled rules to check." % len(rules))

    for document in rules:
        time.sleep(1)  # sleep between queries
        rule = document['_source']
        logger.info("rule: %s" % json.dumps(rule, indent=2))

        try:
            updated_query = update_query(objectid, system_version, rule)
            rule['query'] = updated_query
            rule['query_string'] = json.dumps(updated_query)
        except (RuntimeError, Exception) as e:
            logger.error("unable to update user_rule's query, skipping")
            logger.error(e)
            continue

        rule_name = rule['rule_name']
        job_type = rule['job_type']  # set clean descriptive job name
        final_qs = rule['query_string']
        logger.info("updated query: %s" % json.dumps(final_qs, indent=2))

        # check for matching rules
        try:
            result = grq_es.es.search(index=alias, body=final_qs)
            if result['hits']['total']['value'] == 0:
                logger.info("Rule '%s' didn't match for %s (%s)" %
                            (rule_name, objectid, system_version))
                continue
            doc_res = result['hits']['hits'][0]
            logger.info("Rule '%s' successfully matched for %s (%s)" %
                        (rule_name, objectid, system_version))
        except (ElasticsearchException, Exception) as e:
            logger.error("Failed to query ES")
            logger.error(e)
            continue

        if job_type.startswith('hysds-io-'):
            job_type = job_type.replace('hysds-io-', '', 1)
        job_name = "%s-%s" % (job_type, objectid)

        queue_dataset_trigger(doc_res, rule, job_name)  # submit trigger task
        logger.info("Trigger task submitted for %s (%s): %s" %
                    (objectid, system_version, job_type))
    return True


def evaluate_user_rules_dataset(objectid, system_version,
                                es_url=app.conf.GRQ_ES_URL,
                                alias=app.conf.DATASET_ALIAS,
                                user_rules_idx=app.conf.USER_RULES_DATASET_INDEX,
                                job_queue=app.conf.JOBS_PROCESSED_QUEUE):
    """Process all user rules in ES database and check if this objectid matches.
    If so, submit jobs. Otherwise do nothing."""

    # sleep for 10 seconds; let any documents finish indexing in ES
    time.sleep(10)

    # ensure dataset is indexed
    ensure_dataset_indexed(objectid, system_version, es_url, alias)

    # get all enabled user rules
    query = {"query": {"term": {"enabled": True}}}
    r = requests.post(
        '%s/%s/.percolator/_search?search_type=scan&scroll=10m&size=100' %
        (es_url, user_rules_idx), data=json.dumps(query))
    r.raise_for_status()
    scan_result = r.json()
    count = scan_result['hits']['total']
    scroll_id = scan_result['_scroll_id']
    rules = []
    while True:
        r = requests.post('%s/_search/scroll?scroll=10m' % es_url, data=scroll_id)
        res = r.json()
        scroll_id = res['_scroll_id']
        if len(res['hits']['hits']) == 0:
            break
        for hit in res['hits']['hits']:
            rules.append(hit['_source'])
    logger.info("Got %d enabled rules to check." % len(rules))

    # process rules
    for rule in rules:
        # sleep between queries
        time.sleep(1)

        # check for matching rules
        update_query(objectid, system_version, rule)
        final_qs = rule['query_string']
        try:
            r = requests.post('%s/%s/_search' % (es_url, alias), data=final_qs)
            r.raise_for_status()
        except:
            logger.error("Failed to query ES. Got status code %d:\n%s" %
                         (r.status_code, traceback.format_exc()))
            continue
        result = r.json()

        if result['hits']['total'] == 0:
            logger.info("Rule '%s' didn't match for %s (%s)" %
                        (rule['rule_name'], objectid, system_version))
            continue
        else:
            doc_res = result['hits']['hits'][0]
            logger.info("Rule '%s' successfully matched for %s (%s)" %
                        (rule['rule_name'], objectid, system_version))
            #logger.info("doc_res: %s" % json.dumps(doc_res, indent=2))

            # set clean descriptive job name
            job_type = rule['job_type']
            if job_type.startswith('hysds-io-'):
                job_type = job_type.replace('hysds-io-', '', 1)
            job_name = "%s-%s" % (job_type, objectid)

            # submit trigger task
            queue_dataset_trigger(doc_res, rule, es_url, job_name)
            logger.info("Trigger task submitted for %s (%s): %s" %
                        (objectid, system_version, rule['job_type']))
    return True


def evaluate_user_rules_job(job_id, es_url=app.conf.JOBS_ES_URL,
                            alias=app.conf.STATUS_ALIAS,
                            user_rules_idx=app.conf.USER_RULES_JOB_INDEX,
                            job_queue=app.conf.JOBS_PROCESSED_QUEUE):
    """Process all user rules in ES database and check if this job ID matches.
    If so, submit jobs. Otherwise do nothing."""

    # sleep 10 seconds to allow ES documents to be indexed
    time.sleep(10)

    # ensure job is indexed
    ensure_job_indexed(job_id, es_url, alias)

    # get all enabled user rules
    query = {"query": {"term": {"enabled": True}}}
    r = requests.post(
        '%s/%s/.percolator/_search?search_type=scan&scroll=10m&size=100' %
        (es_url, user_rules_idx), data=json.dumps(query))
    r.raise_for_status()
    scan_result = r.json()
    count = scan_result['hits']['total']
    scroll_id = scan_result['_scroll_id']
    rules = []
    while True:
        r = requests.post('%s/_search/scroll?scroll=10m' % es_url, data=scroll_id)
        res = r.json()
        scroll_id = res['_scroll_id']
        if len(res['hits']['hits']) == 0:
            break
        for hit in res['hits']['hits']:
            rules.append(hit['_source'])
    logger.info("Got %d enabled rules to check." % len(rules))

    # process rules
    for rule in rules:
        # sleep between queries
        time.sleep(1)

        # check for matching rules
        update_query(job_id, rule)
        final_qs = rule['query_string']
        try:
            r = requests.post('%s/job_status-current/job/_search' % es_url,
                              data=final_qs)
            r.raise_for_status()
        except:
            logger.error("Failed to query ES. Got status code %d:\n%s" %
                         (r.status_code, traceback.format_exc()))
            continue
        result = r.json()

        if result['hits']['total'] == 0:
            logger.info("Rule '%s' didn't match for %s" %
                        (rule['rule_name'], job_id))
            continue
        else:
            doc_res = result['hits']['hits'][0]
            logger.info("Rule '%s' successfully matched for %s" %
                        (rule['rule_name'], job_id))
            #logger.info("doc_res: %s" % json.dumps(doc_res, indent=2))

            # submit trigger task
            queue_job_trigger(doc_res, rule, es_url)
            logger.info("Trigger task submitted for %s: %s" %
                        (job_id, rule['job_type']))
    return True