def update_query(_id, system_version, rule):
    """
    takes the rule's query_string and adds system version and dataset's id to
    "filter" in "bool"
    :param _id: ES's _id
    :param system_version: string/int, system_version field in ES document
    :param rule: dict
    :return: dict
    """
    updated_query = json.loads(rule['query_string'])
    filts = [
        updated_query,
        {
            'term': {
                'system_version.keyword': system_version
            }
        }
    ]

    # will add _id if query all False
    if rule.get('query_all', False) is False:
        filts.append({"term": {"_id": _id}})

    final_query = {"query": {"bool": {"must": filts}}}
    logger.info("Final query: %s" % json.dumps(final_query, indent=2))
    return final_query

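# Example usage (illustrative): the rule dict below is a hypothetical stand-in
# for a document from the user-rules index, and the _id / system_version values
# are made up. With query_all False, update_query() ANDs the rule's own query
# with terms on system_version.keyword and the dataset's _id.
example_rule = {
    "rule_name": "match-l2-products",
    "query_string": '{"term": {"dataset.keyword": "L2_PRODUCT"}}',
    "query_all": False,
}
# update_query("dataset-20200101T000000", "v2.0", example_rule) is expected to
# return:
#   {"query": {"bool": {"must": [
#       {"term": {"dataset.keyword": "L2_PRODUCT"}},
#       {"term": {"system_version.keyword": "v2.0"}},
#       {"term": {"_id": "dataset-20200101T000000"}}]}}}
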
def ensure_dataset_indexed(objectid, system_version, alias):
    """Ensure dataset is indexed."""
    query = {
        "query": {
            "bool": {
                "must": [{
                    'term': {
                        '_id': objectid
                    }
                }, {
                    'term': {
                        'system_version.keyword': system_version
                    }
                }]
            }
        }
    }
    logger.info("ensure_dataset_indexed query: %s" % json.dumps(query))

    try:
        count = grq_es.get_count(index=alias, body=query)
        if count == 0:
            error_message = "Failed to find indexed dataset: %s (%s)" % (
                objectid, system_version)
            logger.error(error_message)
            raise RuntimeError(error_message)
        logger.info("Found indexed dataset: %s (%s)" % (objectid, system_version))
    except ElasticsearchException as e:
        logger.error("Unable to execute query")
        logger.error(e)

def error_handler(uuid):
    """Error handler function."""
    result = AsyncResult(uuid)
    exc = result.get(propagate=False)
    logger.info("Task %s raised exception: %s\n%s" % (uuid, exc, result.traceback))

def get_download_params(url):
    """Set osaka download params."""
    params = {}

    # set profile
    for prof in app.conf.get("BUCKET_PROFILES", []):
        if "profile_name" in params:
            break
        if prof.get("bucket_patterns", None) is None:
            params["profile_name"] = prof["profile"]
            break
        else:
            if isinstance(prof["bucket_patterns"], list):
                bucket_patterns = prof["bucket_patterns"]
            else:
                bucket_patterns = [prof["bucket_patterns"]]
            # iterate over the normalized list so a single string pattern
            # isn't iterated character by character
            for bucket_pattern in bucket_patterns:
                regex = re.compile(bucket_pattern)
                match = regex.search(url)
                if match:
                    logger.info("{} matched '{}' for profile {}.".format(
                        url, bucket_pattern, prof["profile"]))
                    params["profile_name"] = prof["profile"]
                    break

    return params

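# Illustrative BUCKET_PROFILES configuration (assumed shape; profile and bucket
# names are made up). Entries with bucket_patterns hold regexes matched against
# the URL; an entry whose bucket_patterns is None acts as the catch-all.
example_bucket_profiles = [
    {"profile": "restricted", "bucket_patterns": [r"s3://.*restricted-bucket.*"]},
    {"profile": "default", "bucket_patterns": None},
]
# If app.conf["BUCKET_PROFILES"] held example_bucket_profiles, then
# get_download_params("s3://restricted-bucket/obj") would return
# {"profile_name": "restricted"}, while a non-matching URL would fall through
# to the catch-all and return {"profile_name": "default"}.
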
def publish_dataset(path, url, params=None, force=False):
    '''
    Publish a dataset to the given url
    @param path - path of dataset to publish
    @param url - url to publish to
    '''

    # set osaka params
    if params is None:
        params = {}

    # force remove previous dataset if it exists?
    if force:
        try:
            unpublish_dataset(url, params=params)
        except:
            pass

    # upload datasets
    for root, dirs, files in os.walk(path):
        for file in files:
            abs_path = os.path.join(root, file)
            rel_path = os.path.relpath(abs_path, path)
            dest_url = os.path.join(url, rel_path)
            logger.info("Uploading %s to %s." % (abs_path, dest_url))
            osaka.main.put(abs_path, dest_url, params=params, noclobber=True)

def update_query(job_id, rule):
    """
    takes the rule's query_string and adds the job's id to "filter" in "bool"
    :param job_id: ES's _id
    :param rule: dict
    :return: dict
    """
    updated_query = json.loads(rule['query_string'])
    filts = [updated_query]

    if rule.get('query_all', False) is False:
        filts.append({
            "term": {
                "_id": job_id
            }
        })

    final_query = {
        "query": {
            "bool": {
                "must": filts
            }
        }
    }
    logger.info("Final query: %s" % json.dumps(final_query, indent=2))
    return final_query

def download_file(url, path, cache=False):
    """Download file/dir for input."""
    params = get_download_params(url)
    if cache:
        url_hash = hashlib.md5(url).hexdigest()
        hash_dir = os.path.join(app.conf.ROOT_WORK_DIR, 'cache', *url_hash[0:4])
        cache_dir = os.path.join(hash_dir, url_hash)
        makedirs(cache_dir)
        signal_file = os.path.join(cache_dir, '.localized')
        if os.path.exists(signal_file):
            logger.info("cache hit for {} at {}".format(url, cache_dir))
        else:
            logger.info("cache miss for {}".format(url))
            try:
                osaka.main.get(url, cache_dir, params=params)
            except Exception, e:
                shutil.rmtree(cache_dir)
                tb = traceback.format_exc()
                raise(RuntimeError("Failed to download %s to cache %s: %s\n%s" %
                                   (url, cache_dir, str(e), tb)))
            with atomic_write(signal_file, overwrite=True) as f:
                f.write("%sZ\n" % datetime.utcnow().isoformat())
        for i in os.listdir(cache_dir):
            if i == '.localized':
                continue
            cached_obj = os.path.join(cache_dir, i)
            if os.path.isdir(cached_obj):
                if os.path.isdir(path):
                    shutil.copytree(cached_obj, os.path.join(path, i))
                else:
                    shutil.copytree(cached_obj, path)
            else:
                shutil.copy2(cached_obj, path)
    else:
        # non-cached case: download directly to the requested path
        return osaka.main.get(url, path, params=params)

def query_dedup_job(dedup_key, filter_id=None, states=None):
    """
    Return job IDs with matching dedup key defined in states
    'job-queued', 'job-started', 'job-completed', by default.
    """

    # get states
    if states is None:
        states = ['job-queued', 'job-started', 'job-completed']

    # build query
    query = {
        "sort": [{"job.job_info.time_queued": {"order": "asc"}}],
        "size": 1,
        "fields": ["_id", "status"],
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": {
                            "term": {
                                "payload_hash": dedup_key
                            }
                        }
                    }
                }
            }
        }
    }
    for state in states:
        query['query']['filtered']['filter']['bool'].setdefault('should', []).append({
            "term": {
                "status": state
            }
        })
    if filter_id is not None:
        query['query']['filtered']['filter']['bool']['must_not'] = {
            "term": {
                "uuid": filter_id
            }
        }

    es_url = "%s/job_status-current/_search" % app.conf['JOBS_ES_URL']
    r = requests.post(es_url, data=json.dumps(query))
    if r.status_code != 200:
        if r.status_code == 404:
            pass
        else:
            r.raise_for_status()
    hits = []
    j = r.json()
    if j.get('hits', {}).get('total', 0) == 0:
        return None
    else:
        hit = j['hits']['hits'][0]
        logger.info("Found duplicate job: %s" %
                    json.dumps(hit, indent=2, sort_keys=True))
        return {'_id': hit['_id'],
                'status': hit['fields']['status'][0],
                'query_timestamp': datetime.utcnow().isoformat()}

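# Illustrative deduplication check (the payload hash and task uuid below are
# made up); assumes app.conf['JOBS_ES_URL'] points at a reachable Mozart ES and
# this module's logger is configured.
dup = query_dedup_job("0123456789abcdef0123456789abcdef",
                      filter_id="my-own-task-uuid")
if dup is None:
    pass  # no queued/started/completed job shares this payload hash
else:
    # dup looks like {"_id": ..., "status": ..., "query_timestamp": ...}
    logger.info("duplicate job %s in state %s" % (dup['_id'], dup['status']))
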
def get_job_status(id):
    """Get job status."""
    es_url = "%s/job_status-current/job/%s" % (app.conf['JOBS_ES_URL'], id)
    r = requests.get(es_url, params={'fields': 'status'})
    logger.info("get_job_status status: %s" % r.status_code)
    result = r.json()
    logger.info("get_job_status result: %s" % json.dumps(result, indent=2))
    return result['fields']['status'][0] if result['found'] else None

def verify_dataset(dataset):
    """Verify dataset JSON fields."""
    if "version" not in dataset:
        raise RuntimeError("Failed to find required field: version")
    for field in ("label", "location", "starttime", "endtime", "creation_timestamp"):
        if field not in dataset:
            logger.info("Optional field not found: %s" % field)

def verify_dataset(dataset):
    """Verify dataset JSON fields."""
    if 'version' not in dataset:
        raise RuntimeError("Failed to find required field: version")
    for field in ('label', 'location', 'starttime', 'endtime', 'creation_timestamp'):
        if field not in dataset:
            logger.info("Optional field not found: %s" % field)

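# A minimal dataset dict that passes verify_dataset(): only "version" is
# required; missing optional fields merely log an informational message.
# All values below are illustrative.
example_dataset = {
    "version": "v1.0",
    "label": "example product",                           # optional
    "starttime": "2020-01-01T00:00:00Z",                  # optional
    "endtime": "2020-01-01T00:10:00Z",                    # optional
    "location": {"type": "Polygon", "coordinates": []},   # optional
    "creation_timestamp": "2020-01-02T00:00:00Z",         # optional
}
verify_dataset(example_dataset)  # raises RuntimeError only if "version" is missing
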
def copy_mount(path, mnt_dir):
    """Copy path to a directory to be used for mounting into container.
    Return this path."""
    if not os.path.exists(mnt_dir):
        os.makedirs(mnt_dir, 0o777)
    mnt_path = os.path.join(mnt_dir, os.path.basename(path))
    if os.path.isdir(path):
        shutil.copytree(path, mnt_path)
    else:
        shutil.copy(path, mnt_path)
    logger.info("Copied container mount {} to {}.".format(path, mnt_path))
    return os.path.join(mnt_dir, os.path.basename(path))

def get_job_status(_id):
    """Get job status."""
    es_url = "%s/job_status-current/_doc/%s" % (app.conf["JOBS_ES_URL"], _id)
    r = requests.get(es_url, params={"_source": "status"})
    logger.info("get_job_status status: %s" % r.status_code)
    result = r.json()
    logger.info("get_job_status result: %s" % json.dumps(result, indent=2))
    return result["_source"]["status"] if result["found"] else None

def ensure_job_indexed(job_id, alias):
    """Ensure job is indexed."""
    query = {
        "query": {
            "term": {
                "_id": job_id
            }
        }
    }
    logger.info("ensure_job_indexed: %s" % json.dumps(query))

    count = mozart_es.get_count(index=alias, body=query)
    if count == 0:
        raise RuntimeError("Failed to find indexed job: {}".format(job_id))

def ensure_job_indexed(job_id, es_url, alias):
    """Ensure job is indexed."""
    query = {
        "query": {
            "bool": {
                "must": [{
                    'term': {
                        '_id': job_id
                    }
                }]
            }
        },
        "fields": [],
    }
    logger.info("ensure_job_indexed query: %s" % json.dumps(query, indent=2))
    if es_url.endswith('/'):
        search_url = '%s%s/_search' % (es_url, alias)
    else:
        search_url = '%s/%s/_search' % (es_url, alias)
    logger.info("ensure_job_indexed url: %s" % search_url)
    r = requests.post(search_url, data=json.dumps(query))
    logger.info("ensure_job_indexed status: %s" % r.status_code)
    r.raise_for_status()
    result = r.json()
    logger.info("ensure_job_indexed result: %s" % json.dumps(result, indent=2))
    total = result['hits']['total']
    if total == 0:
        raise RuntimeError("Failed to find indexed job: %s" % job_id)

def update_context_file(localize_url, file_name):
    """Overwrite the localize_urls entry of the local _context.json with the
    given url/local-path pair."""
    logger.info("update_context_file: %s, %s" % (localize_url, file_name))
    ctx_file = "_context.json"
    localized_url_array = []
    url_dict = {}
    url_dict["local_path"] = file_name
    url_dict["url"] = localize_url
    localized_url_array.append(url_dict)
    with open(ctx_file) as f:
        ctx = json.load(f)
    ctx["localize_urls"] = localized_url_array
    with open(ctx_file, 'w') as f:
        json.dump(ctx, f, indent=2, sort_keys=True)

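# Illustrative run of update_context_file() in a throwaway directory (the URL
# and file name are made up; assumes this module's logger is configured). It
# rewrites localize_urls in _context.json to a single url/local-path entry.
import json
import os
import tempfile

demo_dir = tempfile.mkdtemp()
os.chdir(demo_dir)
with open("_context.json", "w") as f:
    json.dump({"job_id": "example-job", "localize_urls": []}, f)

update_context_file("s3://bucket/path/file.h5", "file.h5")

with open("_context.json") as f:
    print(json.load(f)["localize_urls"])
# -> [{'local_path': 'file.h5', 'url': 's3://bucket/path/file.h5'}]
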
def publish_dataset(path, url, params=None, force=False, publ_ctx_file=None,
                    publ_ctx_url=None):
    """
    Publish a dataset to the given url
    @param path - path of dataset to publish
    @param url - url to publish to
    @param force - unpublish dataset first if exists
    @param publ_ctx_file - publish context file
    @param publ_ctx_url - url to publish context file to
    """

    # set osaka params
    if params is None:
        params = {}

    # force remove previous dataset if it exists?
    if force:
        try:
            unpublish_dataset(url, params=params)
        except:
            pass

    # write publish context file
    if publ_ctx_file is not None and publ_ctx_url is not None:
        try:
            osaka.main.put(publ_ctx_file, publ_ctx_url, params=params,
                           noclobber=True)
        except osaka.utils.NoClobberException as e:
            raise NoClobberPublishContextException(
                "Failed to clobber {} when noclobber is True.".format(publ_ctx_url))

    # upload datasets
    for root, dirs, files in os.walk(path):
        for file in files:
            abs_path = os.path.join(root, file)
            rel_path = os.path.relpath(abs_path, path)
            dest_url = os.path.join(url, rel_path)
            logger.info("Uploading %s to %s." % (abs_path, dest_url))
            osaka.main.put(abs_path, dest_url, params=params, noclobber=True)

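# Illustrative publish call (paths, URLs, and the osaka params are made up and
# not meant to run against a real bucket). Per the function above, the publish
# context file is uploaded before any dataset files when both publ_ctx_*
# arguments are supplied.
publish_dataset(
    "/data/work/jobs/example-job/example-product",
    "s3://datasets-bucket/products/example-product",
    params={"profile_name": "default"},
    force=False,
    publ_ctx_file="/data/work/jobs/example-job/_publ.context.json",
    publ_ctx_url="s3://datasets-bucket/products/example-product/_publ.context.json",
)
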
def download_file(url, path, cache=False):
    """Download file/dir for input."""
    params = get_download_params(url)
    if cache:
        url_hash = hashlib.md5(url.encode()).hexdigest()
        hash_dir = os.path.join(app.conf.ROOT_WORK_DIR, "cache", *url_hash[0:4])
        cache_dir = os.path.join(hash_dir, url_hash)
        makedirs(cache_dir)
        signal_file = os.path.join(cache_dir, ".localized")
        if os.path.exists(signal_file):
            logger.info("cache hit for {} at {}".format(url, cache_dir))
        else:
            logger.info("cache miss for {}".format(url))
            try:
                osaka.main.get(url, cache_dir, params=params)
            except Exception as e:
                shutil.rmtree(cache_dir)
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to download {} to cache {}: {}\n{}".format(
                        url, cache_dir, str(e), tb))
            with atomic_write(signal_file, overwrite=True) as f:
                f.write("%sZ\n" % datetime.utcnow().isoformat())
        for i in os.listdir(cache_dir):
            if i == ".localized":
                continue
            cached_obj = os.path.join(cache_dir, i)
            if os.path.isdir(cached_obj):
                dst = os.path.join(path, i) if os.path.isdir(path) else path
                try:
                    os.symlink(cached_obj, dst)
                except:
                    logger.error("Failed to soft link {} to {}".format(cached_obj, dst))
                    raise
            else:
                try:
                    os.symlink(cached_obj, path)
                except:
                    logger.error("Failed to soft link {} to {}".format(cached_obj, path))
                    raise
    else:
        return osaka.main.get(url, path, params=params)

def find_dataset_json(work_dir):
    """Search for *.dataset.json files."""
    dataset_re = re.compile(r'^(.*)\.dataset\.json$')
    for root, dirs, files in os.walk(work_dir, followlinks=True):
        files.sort()
        dirs.sort()
        for file in files:
            match = dataset_re.search(file)
            if match:
                dataset_file = os.path.join(root, file)
                prod_dir = os.path.join(os.path.dirname(root), match.group(1))
                if prod_dir != root:
                    logger.info("%s exists in directory %s. Should be in %s. Not uploading."
                                % (dataset_file, root, prod_dir))
                elif not os.path.exists(prod_dir):
                    logger.info("Couldn't find product directory %s for dataset.json %s. Not uploading."
                                % (prod_dir, dataset_file))
                else:
                    yield (dataset_file, prod_dir)

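# Illustrative iteration over a job work directory (the path is made up;
# assumes this module's logger is configured): each yielded pair is a
# *.dataset.json metadata file and the product directory it describes.
for dataset_file, prod_dir in find_dataset_json("/data/work/jobs/example-job"):
    logger.info("would publish %s (metadata: %s)" % (prod_dir, dataset_file))
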
def evaluate_user_rules_job(job_id, es_url=app.conf.JOBS_ES_URL,
                            alias=app.conf.STATUS_ALIAS,
                            user_rules_idx=app.conf.USER_RULES_JOB_INDEX,
                            job_queue=app.conf.JOBS_PROCESSED_QUEUE):
    """Process all user rules in ES database and check if this job ID matches.
    If so, submit jobs. Otherwise do nothing."""

    # sleep 10 seconds to allow ES documents to be indexed
    time.sleep(10)

    # get all enabled user rules
    query = {
        "query": {
            "term": {
                "enabled": True
            }
        }
    }
    r = requests.post('%s/%s/.percolator/_search?search_type=scan&scroll=10m&size=100'
                      % (es_url, user_rules_idx), data=json.dumps(query))
    r.raise_for_status()
    scan_result = r.json()
    count = scan_result['hits']['total']
    scroll_id = scan_result['_scroll_id']
    rules = []
    while True:
        r = requests.post('%s/_search/scroll?scroll=10m' % es_url, data=scroll_id)
        res = r.json()
        scroll_id = res['_scroll_id']
        if len(res['hits']['hits']) == 0:
            break
        for hit in res['hits']['hits']:
            rules.append(hit['_source'])
    logger.info("Got %d enabled rules to check." % len(rules))

    # process rules
    for rule in rules:

        # sleep between queries
        time.sleep(1)

        # check for matching rules
        update_query(job_id, rule)
        final_qs = rule['query_string']
        r = requests.post('%s/job_status-current/job/_search' % es_url, data=final_qs)
        r.raise_for_status()
        result = r.json()
        if result['hits']['total'] == 0:
            logger.info("Rule '%s' didn't match for %s" % (rule['rule_name'], job_id))
            continue
        else:
            doc_res = result['hits']['hits'][0]
            logger.info("Rule '%s' successfully matched for %s" % (rule['rule_name'], job_id))
            #logger.info("doc_res: %s" % json.dumps(doc_res, indent=2))

        # submit trigger task
        queue_job_trigger(doc_res, rule, es_url)
        logger.info("Trigger task submitted for %s: %s" % (job_id, rule['job_type']))

    return True

def get_acquisition_data_from_slc(slc_id):
    """Return the acquisition document whose identifier matches the given SLC id."""
    uu = getConf()
    es_url = uu['rest_url']
    es_index = "grq_*_*acquisition*"
    query = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "metadata.identifier.raw": slc_id
                    }
                }]
            }
        },
        "partial_fields": {
            "partial": {
                "exclude": "city",
            }
        }
    }

    logger.info(query)

    if es_url.endswith('/'):
        search_url = '%s%s/_search' % (es_url, es_index)
    else:
        search_url = '%s/%s/_search' % (es_url, es_index)
    r = requests.post(search_url, data=json.dumps(query))

    if r.status_code != 200:
        logger.info("Failed to query %s:\n%s" % (es_url, r.text))
        logger.info("query: %s" % json.dumps(query, indent=2))
        logger.info("returned: %s" % r.text)
        r.raise_for_status()

    result = r.json()
    logger.info(result['hits']['total'])
    return result['hits']['hits'][0]

def get_singularity_cmd(params, cmd_line_list):
    """Build the singularity command line by prepending the base singularity
    invocation to the given command line list.

    Sample singularity command line:
        singularity_cmd = ["/nasa/singularity/3.2.0/bin/singularity", "exec",
                           "--no-home", "--home", "/home/ops",
                           "--bind", "/nobackupp14/lpan/work/cache/container-hello_world_master-2019-06-19-82a52bf2bb3b.simg:/container-hello_world_master-2019-06-19-82a52bf2bb3b.simg",
                           "--pwd", "/container-hello_world_master-2019-06-19-82a52bf2bb3b.simg",
                           "/nobackupp14/lpan/work/cache/container-hello_world_master-2019-06-19-82a52bf2bb3b.simg",
                           "/home/ops/verdi/ops/hello_world/run_hello_world.sh"]
    """

    # build base command
    singularity_cmd = get_base_singularity_cmd(params)

    # append the job command line
    singularity_cmd.extend([str(i) for i in cmd_line_list])
    logger.info("singularity_cmd: %s" % singularity_cmd)
    return singularity_cmd

def update_query(objectid, system_version, rule):
    """Update final query."""

    # build query
    query = rule['query']

    # filters
    filts = [{'term': {'system_version.raw': system_version}}]

    # query all?
    if rule.get('query_all', False) is False:
        filts.append({'ids': {'values': [objectid]}})

    # build final query
    if 'filtered' in query:
        final_query = copy.deepcopy(query)
        if 'and' in query['filtered']['filter']:
            final_query['filtered']['filter']['and'].extend(filts)
        else:
            filts.append(final_query['filtered']['filter'])
            final_query['filtered']['filter'] = {
                'and': filts,
            }
    else:
        final_query = {
            'filtered': {
                'query': query,
                'filter': {
                    'and': filts,
                }
            }
        }
    final_query = {"query": final_query}
    logger.info("Final query: %s" % json.dumps(final_query, indent=2))
    rule['query'] = final_query
    rule['query_string'] = json.dumps(final_query)

def localize_file(url, path, cache):
    """Localize urls for job inputs. Track metrics."""

    # get job info
    job_dir = os.getcwd()  # job['job_info']['job_dir']

    # localize urls
    if path is None:
        path = '%s/' % job_dir
    else:
        if path.startswith('/'):
            pass
        else:
            path = os.path.join(job_dir, path)
    if os.path.isdir(path) or path.endswith('/'):
        path = os.path.join(path, os.path.basename(url))
    dir_path = os.path.dirname(path)
    logger.info(dir_path)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    loc_t1 = datetime.utcnow()
    try:
        download_file(url, path, cache=cache)
    except Exception, e:
        tb = traceback.format_exc()
        raise(RuntimeError("Failed to download %s: %s\n%s" % (url, str(e), tb)))

def check_slc_status(slc_id, index_suffix=None):
    """Return True if the given SLC id exists in the dataset index, False otherwise."""
    result = get_dataset(slc_id, index_suffix)
    total = result['hits']['total']
    logger.info("check_slc_status : total : %s" % total)
    if total > 0:
        logger.info("check_slc_status : returning True")
        return True
    logger.info("check_slc_status : returning False")
    return False

def download_file(url, path, cache=False):
    """Download file/dir for input."""
    params = get_download_params(url)
    if cache:
        url_hash = hashlib.md5(url.encode()).hexdigest()
        # get the cache root from the env variable set in the top level shell
        # script (e.g., celery_worker.sh) instead of app.conf.ROOT_WORK_DIR
        root_cache_dir = os.environ['HYSDS_ROOT_CACHE_DIR']
        logger.info("download_file(): root_cache_dir: %s" % root_cache_dir)
        hash_dir = os.path.join(root_cache_dir, 'cache', *url_hash[0:4])
        cache_dir = os.path.join(hash_dir, url_hash)
        makedirs(cache_dir)
        signal_file = os.path.join(cache_dir, '.localized')
        if os.path.exists(signal_file):
            logger.info("cache hit for {} at {}".format(url, cache_dir))
        else:
            logger.info("cache miss for {}".format(url))
            try:
                osaka.main.get(url, cache_dir, params=params)
            except Exception as e:
                shutil.rmtree(cache_dir)
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to download {} to cache {}: {}\n{}".format(
                        url, cache_dir, str(e), tb))
            with atomic_write(signal_file, overwrite=True) as f:
                f.write("%sZ\n" % datetime.utcnow().isoformat())
        for i in os.listdir(cache_dir):
            if i == '.localized':
                continue
            cached_obj = os.path.join(cache_dir, i)
            if os.path.isdir(cached_obj):
                dst = os.path.join(path, i) if os.path.isdir(path) else path
                try:
                    os.symlink(cached_obj, dst)
                except:
                    logger.error("Failed to soft link {} to {}".format(cached_obj, dst))
                    raise
            else:
                try:
                    os.symlink(cached_obj, path)
                except:
                    logger.error("Failed to soft link {} to {}".format(cached_obj, path))
                    raise
    else:
        return osaka.main.get(url, path, params=params)

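# Illustration of the cache layout used above (the URL and root path are made
# up); HYSDS_ROOT_CACHE_DIR is expected to be exported by the worker's startup
# script (e.g., celery_worker.sh).
import hashlib
import os

example_url = "s3://bucket/path/granule.h5"
example_hash = hashlib.md5(example_url.encode()).hexdigest()
example_cache_dir = os.path.join("/data/work", "cache",
                                 *example_hash[0:4], example_hash)
print(example_cache_dir)
# e.g. /data/work/cache/<h0>/<h1>/<h2>/<h3>/<md5-of-url> -- on a cache hit,
# download_file() symlinks the cached object(s) into the requested path.
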
def get_remote_dav(url):
    """Get remote dir/file."""
    lpath = "./%s" % os.path.basename(url)
    if not url.endswith("/"):
        url += "/"
    parsed_url = urlparse(url)
    rpath = parsed_url.path
    r = requests.request("PROPFIND", url, verify=False)
    if r.status_code not in (200, 207):  # handle multistatus (207) as well
        logger.info("Got status code %d trying to read %s" % (r.status_code, url))
        logger.info("Content:\n%s" % r.text)
        r.raise_for_status()
    tree = parse(StringIO(r.content))
    makedirs(lpath)
    for elem in tree.findall("{DAV:}response"):
        collection = elem.find(
            "{DAV:}propstat/{DAV:}prop/{DAV:}resourcetype/{DAV:}collection")
        if collection is not None:
            continue
        href = elem.find("{DAV:}href").text
        rel_path = os.path.relpath(href, rpath)
        file_url = os.path.join(url, rel_path)
        local_path = os.path.join(lpath, rel_path)
        local_dir = os.path.dirname(local_path)
        makedirs(local_dir)
        resp = requests.request("GET", file_url, verify=False, stream=True)
        if resp.status_code != 200:
            logger.info("Got status code %d trying to read %s" % (resp.status_code, file_url))
            logger.info("Content:\n%s" % resp.text)
            resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
    return os.path.abspath(lpath)

def ensure_dataset_indexed(objectid, system_version, es_url, alias):
    """Ensure dataset is indexed."""
    query = {
        "query": {
            "bool": {
                "must": [{
                    'term': {
                        '_id': objectid
                    }
                }, {
                    'term': {
                        'system_version.raw': system_version
                    }
                }]
            }
        },
        "fields": [],
    }
    logger.info("ensure_dataset_indexed query: %s" % json.dumps(query, indent=2))
    if es_url.endswith('/'):
        search_url = '%s%s/_search' % (es_url, alias)
    else:
        search_url = '%s/%s/_search' % (es_url, alias)
    logger.info("ensure_dataset_indexed url: %s" % search_url)
    r = requests.post(search_url, data=json.dumps(query))
    logger.info("ensure_dataset_indexed status: %s" % r.status_code)
    r.raise_for_status()
    result = r.json()
    logger.info("ensure_dataset_indexed result: %s" % json.dumps(result, indent=2))
    total = result['hits']['total']
    if total == 0:
        raise RuntimeError("Failed to find indexed dataset: {} ({})".format(
            objectid, system_version))

def publish_datasets(job, ctx):
    """Perform dataset publishing if job exited with zero status code."""

    # if exit code of job command is non-zero, don't publish anything
    exit_code = job["job_info"]["status"]
    if exit_code != 0:
        logger.info("Job exited with exit code %s. Bypassing dataset publishing." % exit_code)
        return True

    # if job command never ran, don't publish anything
    pid = job["job_info"]["pid"]
    if pid == 0:
        logger.info("Job command never ran. Bypassing dataset publishing.")
        return True

    # get job info
    job_dir = job["job_info"]["job_dir"]

    # find and publish
    published_prods = []
    for dataset_file, prod_dir in find_dataset_json(job_dir):

        # skip if marked as localized input
        signal_file = os.path.join(prod_dir, ".localized")
        if os.path.exists(signal_file):
            logger.info("Skipping publish of %s. Marked as localized input." % prod_dir)
            continue

        # publish
        prod_json = publish_dataset(prod_dir, dataset_file, job, ctx)

        # save json for published product
        published_prods.append(prod_json)

    # write published products to file
    pub_prods_file = os.path.join(job_dir, "_datasets.json")
    with open(pub_prods_file, "w") as f:
        json.dump(published_prods, f, indent=2, sort_keys=True)

    # signal run_job() to continue
    return True

def evaluate_user_rules_dataset(objectid, system_version,
                                es_url=app.conf.GRQ_ES_URL,
                                alias=app.conf.DATASET_ALIAS,
                                user_rules_idx=app.conf.USER_RULES_DATASET_INDEX,
                                job_queue=app.conf.JOBS_PROCESSED_QUEUE):
    """Process all user rules in ES database and check if this objectid matches.
    If so, submit jobs. Otherwise do nothing."""

    # sleep for 10 seconds; let any documents finish indexing in ES
    time.sleep(10)

    # get all enabled user rules
    query = {"query": {"term": {"enabled": True}}}
    r = requests.post(
        '%s/%s/.percolator/_search?search_type=scan&scroll=10m&size=100'
        % (es_url, user_rules_idx), data=json.dumps(query))
    r.raise_for_status()
    scan_result = r.json()
    count = scan_result['hits']['total']
    scroll_id = scan_result['_scroll_id']
    rules = []
    while True:
        r = requests.post('%s/_search/scroll?scroll=10m' % es_url, data=scroll_id)
        res = r.json()
        scroll_id = res['_scroll_id']
        if len(res['hits']['hits']) == 0:
            break
        for hit in res['hits']['hits']:
            rules.append(hit['_source'])
    logger.info("Got %d enabled rules to check." % len(rules))

    # process rules
    for rule in rules:

        # sleep between queries
        time.sleep(1)

        # check for matching rules
        update_query(objectid, system_version, rule)
        final_qs = rule['query_string']
        r = requests.post('%s/%s/_search' % (es_url, alias), data=final_qs)
        r.raise_for_status()
        result = r.json()
        if result['hits']['total'] == 0:
            logger.info("Rule '%s' didn't match for %s (%s)"
                        % (rule['rule_name'], objectid, system_version))
            continue
        else:
            doc_res = result['hits']['hits'][0]
            logger.info("Rule '%s' successfully matched for %s (%s)"
                        % (rule['rule_name'], objectid, system_version))
            #logger.info("doc_res: %s" % json.dumps(doc_res, indent=2))

        # set clean descriptive job name
        job_type = rule['job_type']
        if job_type.startswith('hysds-io-'):
            job_type = job_type.replace('hysds-io-', '', 1)
        job_name = "%s-%s" % (job_type, objectid)

        # submit trigger task
        queue_dataset_trigger(doc_res, rule, es_url, job_name)
        logger.info("Trigger task submitted for %s (%s): %s"
                    % (objectid, system_version, rule['job_type']))

    return True