Example #1
def update_query(_id, system_version, rule):
    """
    takes the rule's query_string and adds the system_version term (and the dataset's _id unless query_all is set) to the "must" clause of a "bool" query
    :param _id: ES's _id
    :param system_version: string/int, system_version field in ES document
    :param rule: dict
    :return: dict
    """
    updated_query = json.loads(rule['query_string'])
    filts = [
        updated_query, {
            'term': {
                'system_version.keyword': system_version
            }
        }
    ]

    # add the _id term unless the rule applies to all datasets (query_all)
    if rule.get('query_all', False) is False:
        filts.append({"term": {"_id": _id}})

    final_query = {"query": {"bool": {"must": filts}}}

    logger.info("Final query: %s" % json.dumps(final_query, indent=2))
    return final_query
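
To make the shape of the result concrete, here is a minimal usage sketch; the rule contents below are hypothetical:

# Hypothetical rule; only the keys read by update_query() are populated.
rule = {"query_string": json.dumps({"match_all": {}}), "query_all": False}
final = update_query("dataset-1234", "v2.0", rule)
# final == {"query": {"bool": {"must": [
#     {"match_all": {}},
#     {"term": {"system_version.keyword": "v2.0"}},
#     {"term": {"_id": "dataset-1234"}}]}}}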
Example #2
def ensure_dataset_indexed(objectid, system_version, alias):
    """Ensure dataset is indexed."""
    query = {
        "query": {
            "bool": {
                "must": [{
                    'term': {
                        '_id': objectid
                    }
                }, {
                    'term': {
                        'system_version.keyword': system_version
                    }
                }]
            }
        }
    }
    logger.info("ensure_dataset_indexed query: %s" % json.dumps(query))

    try:
        count = grq_es.get_count(index=alias, body=query)
        if count == 0:
            error_message = "Failed to find indexed dataset: %s (%s)" % (
                objectid, system_version)
            logger.error(error_message)
            raise RuntimeError(error_message)
        logger.info("Found indexed dataset: %s (%s)" %
                    (objectid, system_version))

    except ElasticsearchException as e:
        logger.error("Unable to execute query")
        logger.error(e)
Example #3
File: utils.py Project: hysds/hysds
def error_handler(uuid):
    """Error handler function."""

    result = AsyncResult(uuid)
    exc = result.get(propagate=False)
    logger.info("Task %s raised exception: %s\n%s" %
                (uuid, exc, result.traceback))
Example #4
File: utils.py Project: hysds/hysds
def get_download_params(url):
    """Set osaka download params."""

    params = {}

    # set profile
    for prof in app.conf.get("BUCKET_PROFILES", []):
        if "profile_name" in params:
            break
        if prof.get("bucket_patterns", None) is None:
            params["profile_name"] = prof["profile"]
            break
        else:
            if isinstance(prof["bucket_patterns"], list):
                bucket_patterns = prof["bucket_patterns"]
            else:
                bucket_patterns = [prof["bucket_patterns"]]
            for bucket_pattern in bucket_patterns:
                regex = re.compile(bucket_pattern)
                match = regex.search(url)
                if match:
                    logger.info("{} matched '{}' for profile {}.".format(
                        url, bucket_pattern, prof["profile"]))
                    params["profile_name"] = prof["profile"]
                    break

    return params
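
For illustration only, a hedged sketch of the kind of BUCKET_PROFILES entries this loop expects; the bucket names and profile names below are made up:

# Hypothetical BUCKET_PROFILES entries; only the keys read above are shown.
bucket_profiles = [
    {"profile": "restricted", "bucket_patterns": [r"s3://restricted-.*"]},
    {"profile": "default", "bucket_patterns": None},  # no patterns -> catch-all
]
# With app.conf["BUCKET_PROFILES"] set to the list above:
#   get_download_params("s3://restricted-bucket/path/file.h5")
#   -> {"profile_name": "restricted"}
#   get_download_params("s3://public-bucket/path/file.h5")
#   -> {"profile_name": "default"}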
Example #5
def publish_dataset(path, url, params=None, force=False):
    '''
    Publish a dataset to the given url
    @param path - path of dataset to publish
    @param url - url to publish to
    '''

    # set osaka params
    if params is None: params = {}

    # force remove previous dataset if it exists?
    if force:
        try:
            unpublish_dataset(url, params=params)
        except:
            pass

    # upload datasets
    for root, dirs, files in os.walk(path):
        for file in files:
            abs_path = os.path.join(root, file)
            rel_path = os.path.relpath(abs_path, path)
            dest_url = os.path.join(url, rel_path)
            logger.info("Uploading %s to %s." % (abs_path, dest_url))
            osaka.main.put(abs_path, dest_url, params=params, noclobber=True)
Example #6
def update_query(job_id, rule):
    """
    takes the rule's query_string and adds the job's _id to the "must" clause of a "bool" query unless query_all is set
    :param job_id: ES's _id
    :param rule: dict
    :return: dict
    """
    updated_query = json.loads(rule['query_string'])
    filts = [updated_query]

    if rule.get('query_all', False) is False:
        filts.append({
            "term": {
                "_id": job_id
            }
        })

    final_query = {
        "query": {
            "bool": {
                "must": filts
            }
        }
    }

    logger.info("Final query: %s" % json.dumps(final_query, indent=2))
    return final_query
Example #7
def download_file(url, path, cache=False):
    """Download file/dir for input."""

    params = get_download_params(url)
    if cache:
        url_hash = hashlib.md5(url.encode()).hexdigest()
        hash_dir = os.path.join(app.conf.ROOT_WORK_DIR, 'cache',
                                *url_hash[0:4])
        cache_dir = os.path.join(hash_dir, url_hash)
        makedirs(cache_dir)
        signal_file = os.path.join(cache_dir, '.localized')
        if os.path.exists(signal_file):
            logger.info("cache hit for {} at {}".format(url, cache_dir))
        else:
            logger.info("cache miss for {}".format(url))
            try:
                osaka.main.get(url, cache_dir, params=params)
            except Exception as e:
                shutil.rmtree(cache_dir)
                tb = traceback.format_exc()
                raise RuntimeError("Failed to download %s to cache %s: %s\n%s" %
                                   (url, cache_dir, str(e), tb))
            with atomic_write(signal_file, overwrite=True) as f:
                f.write("%sZ\n" % datetime.utcnow().isoformat())
        for i in os.listdir(cache_dir):
            if i == '.localized': continue
            cached_obj = os.path.join(cache_dir, i)
            if os.path.isdir(cached_obj):
                if os.path.isdir(path):
                    shutil.copytree(cached_obj, os.path.join(path, i))
                else:
                    shutil.copytree(cached_obj, path)
            else:
                shutil.copy2(cached_obj, path)
Example #8
def query_dedup_job(dedup_key, filter_id=None, states=None):
    """
    Return job IDs with matching dedup key defined in states
    'job-queued', 'job-started', 'job-completed', by default.
    """

    # get states
    if states is None:
        states = ['job-queued', 'job-started', 'job-completed']

    # build query
    query = {
        "sort": [{"job.job_info.time_queued": {"order": "asc"}}],
        "size": 1,
        "fields": ["_id", "status"],
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": {
                            "term": {
                                "payload_hash": dedup_key
                            }
                        }
                    }
                }
            }
        }
    }
    for state in states:
        query['query']['filtered']['filter']['bool'].setdefault('should', []).append({
            "term": {
                "status": state
            }
        })
    if filter_id is not None:
        query['query']['filtered']['filter']['bool']['must_not'] = {
            "term": {
                "uuid": filter_id
            }
        }
    es_url = "%s/job_status-current/_search" % app.conf['JOBS_ES_URL']
    r = requests.post(es_url, data=json.dumps(query))
    if r.status_code != 200:
        if r.status_code == 404:
            pass
        else:
            r.raise_for_status()
    hits = []
    j = r.json()
    if j.get('hits', {}).get('total', 0) == 0:
        return None
    else:
        hit = j['hits']['hits'][0]
        logger.info("Found duplicate job: %s" %
                    json.dumps(hit, indent=2, sort_keys=True))
        return {'_id': hit['_id'],
                'status': hit['fields']['status'][0],
                'query_timestamp': datetime.utcnow().isoformat()}
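
A hedged usage sketch; the dedup key and returned values are illustrative only:

# Hypothetical dedup key (payload hash) for illustration.
dup = query_dedup_job("a7f3e9c2d4b1...", filter_id=None,
                      states=["job-queued", "job-started"])
if dup is None:
    print("no matching job in the given states")
else:
    print("duplicate job %s with status %s" % (dup["_id"], dup["status"]))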
Example #9
def get_job_status(id):
    """Get job status."""

    es_url = "%s/job_status-current/job/%s" % (app.conf['JOBS_ES_URL'], id)
    r = requests.get(es_url, params={'fields': 'status'})
    logger.info("get_job_status status: %s" % r.status_code)
    result = r.json()
    logger.info("get_job_status result: %s" % json.dumps(result, indent=2))
    return result['fields']['status'][0] if result['found'] else None
Example #10
def verify_dataset(dataset):
    """Verify dataset JSON fields."""

    if "version" not in dataset:
        raise RuntimeError("Failed to find required field: version")
    for field in ("label", "location", "starttime", "endtime",
                  "creation_timestamp"):
        if field not in dataset:
            logger.info("Optional field not found: %s" % field)
Example #11
def verify_dataset(dataset):
    """Verify dataset JSON fields."""

    if 'version' not in dataset:
        raise RuntimeError("Failed to find required field: version")
    for field in ('label', 'location', 'starttime', 'endtime',
                  'creation_timestamp'):
        if field not in dataset:
            logger.info("Optional field not found: %s" % field)
Example #12
def copy_mount(path, mnt_dir):
    """Copy path to a directory to be used for mounting into container. Return this path."""

    if not os.path.exists(mnt_dir):
        os.makedirs(mnt_dir, 0o777)
    mnt_path = os.path.join(mnt_dir, os.path.basename(path))
    if os.path.isdir(path): shutil.copytree(path, mnt_path)
    else: shutil.copy(path, mnt_path)
    logger.info("Copied container mount {} to {}.".format(path, mnt_path))
    return os.path.join(mnt_dir, os.path.basename(path))
Example #13
File: utils.py Project: hysds/hysds
def get_job_status(_id):
    """Get job status."""

    es_url = "%s/job_status-current/_doc/%s" % (app.conf["JOBS_ES_URL"], _id)
    r = requests.get(es_url, params={"_source": "status"})

    logger.info("get_job_status status: %s" % r.status_code)
    result = r.json()

    logger.info("get_job_status result: %s" % json.dumps(result, indent=2))
    return result["_source"]["status"] if result["found"] else None
Example #14
def ensure_job_indexed(job_id, alias):
    """Ensure job is indexed."""
    query = {
        "query": {
            "term": {
                "_id": job_id
            }
        }
    }
    logger.info("ensure_job_indexed: %s" % json.dumps(query))
    count = mozart_es.get_count(index=alias, body=query)
    if count == 0:
        raise RuntimeError("Failed to find indexed job: {}".format(job_id))
Example #15
def ensure_job_indexed(job_id, es_url, alias):
    """Ensure job is indexed."""

    query = {
        "query": {
            "bool": {
                "must": [{
                    'term': {
                        '_id': job_id
                    }
                }]
            }
        },
        "fields": [],
    }
    logger.info("ensure_job_indexed query: %s" % json.dumps(query, indent=2))
    if es_url.endswith('/'):
        search_url = '%s%s/_search' % (es_url, alias)
    else:
        search_url = '%s/%s/_search' % (es_url, alias)
    logger.info("ensure_job_indexed url: %s" % search_url)
    r = requests.post(search_url, data=json.dumps(query))
    logger.info("ensure_job_indexed status: %s" % r.status_code)
    r.raise_for_status()
    result = r.json()
    logger.info("ensure_job_indexed result: %s" % json.dumps(result, indent=2))
    total = result['hits']['total']
    if total == 0:
        raise RuntimeError("Failed to find indexed job: %s" % job_id)
Example #16
def update_context_file(localize_url, file_name):
    logger.info("update_context_file :%s,  %s" % (localize_url, file_name))
    ctx_file = "_context.json"
    localized_url_array = []
    url_dict = {}
    url_dict["local_path"] = file_name
    url_dict["url"] = localize_url

    localized_url_array.append(url_dict)
    with open(ctx_file) as f:
        ctx = json.load(f)
    ctx["localize_urls"] = localized_url_array

    with open(ctx_file, 'w') as f:
        json.dump(ctx, f, indent=2, sort_keys=True)
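
As a sketch of the effect, assuming logger is configured and using a hypothetical URL and file name:

import json

# Hypothetical setup: seed a minimal _context.json, then update it.
with open("_context.json", "w") as f:
    json.dump({"localize_urls": []}, f)

update_context_file("s3://bucket/inputs/input.h5", "input.h5")

with open("_context.json") as f:
    print(json.load(f)["localize_urls"])
# [{'local_path': 'input.h5', 'url': 's3://bucket/inputs/input.h5'}]
# Note: any existing localize_urls entries are replaced, not appended to.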
Example #17
def publish_dataset(path,
                    url,
                    params=None,
                    force=False,
                    publ_ctx_file=None,
                    publ_ctx_url=None):
    """
    Publish a dataset to the given url
    @param path - path of dataset to publish
    @param url - url to publish to
    @param force - unpublish dataset first if exists
    @param publ_ctx_file - publish context file
    @param publ_ctx_url - url to publish context file to
    """

    # set osaka params
    if params is None:
        params = {}

    # force remove previous dataset if it exists?
    if force:
        try:
            unpublish_dataset(url, params=params)
        except:
            pass

    # write publish context file
    if publ_ctx_file is not None and publ_ctx_url is not None:
        try:
            osaka.main.put(publ_ctx_file,
                           publ_ctx_url,
                           params=params,
                           noclobber=True)
        except osaka.utils.NoClobberException as e:
            raise NoClobberPublishContextException(
                "Failed to clobber {} when noclobber is True.".format(
                    publ_ctx_url))

    # upload datasets
    for root, dirs, files in os.walk(path):
        for file in files:
            abs_path = os.path.join(root, file)
            rel_path = os.path.relpath(abs_path, path)
            dest_url = os.path.join(url, rel_path)
            logger.info("Uploading %s to %s." % (abs_path, dest_url))
            osaka.main.put(abs_path, dest_url, params=params, noclobber=True)
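
A hedged usage sketch (paths and URLs below are made up); the noclobber upload of the context file raises NoClobberPublishContextException if one already exists at publ_ctx_url:

# Hypothetical call; values are illustrative.
publish_dataset("/data/work/jobs/example/PRODUCT-A",
                "s3://bucket/products/PRODUCT-A",
                force=False,
                publ_ctx_file="_publish.context.json",
                publ_ctx_url="s3://bucket/products/PRODUCT-A/_publish.context.json")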
Example #18
File: utils.py Project: hysds/hysds
def download_file(url, path, cache=False):
    """Download file/dir for input."""

    params = get_download_params(url)
    if cache:
        url_hash = hashlib.md5(url.encode()).hexdigest()
        hash_dir = os.path.join(app.conf.ROOT_WORK_DIR, "cache",
                                *url_hash[0:4])
        cache_dir = os.path.join(hash_dir, url_hash)
        makedirs(cache_dir)
        signal_file = os.path.join(cache_dir, ".localized")
        if os.path.exists(signal_file):
            logger.info("cache hit for {} at {}".format(url, cache_dir))
        else:
            logger.info("cache miss for {}".format(url))
            try:
                osaka.main.get(url, cache_dir, params=params)
            except Exception as e:
                shutil.rmtree(cache_dir)
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to download {} to cache {}: {}\n{}".format(
                        url, cache_dir, str(e), tb))
            with atomic_write(signal_file, overwrite=True) as f:
                f.write("%sZ\n" % datetime.utcnow().isoformat())
        for i in os.listdir(cache_dir):
            if i == ".localized":
                continue
            cached_obj = os.path.join(cache_dir, i)
            if os.path.isdir(cached_obj):
                dst = os.path.join(path, i) if os.path.isdir(path) else path
                try:
                    os.symlink(cached_obj, dst)
                except:
                    logger.error("Failed to soft link {} to {}".format(
                        cached_obj, dst))
                    raise
            else:
                try:
                    os.symlink(cached_obj, path)
                except:
                    logger.error("Failed to soft link {} to {}".format(
                        cached_obj, path))
                    raise
    else:
        return osaka.main.get(url, path, params=params)
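
To make the cache layout concrete, a minimal sketch of the path scheme used above; "/data/work" is a stand-in for app.conf.ROOT_WORK_DIR:

import hashlib
import os

url = "s3://bucket/inputs/input.h5"
url_hash = hashlib.md5(url.encode()).hexdigest()
cache_dir = os.path.join("/data/work", "cache", *url_hash[0:4], url_hash)
# -> /data/work/cache/<h0>/<h1>/<h2>/<h3>/<full md5 hash>
#    (the first four hex chars fan the cache out into a directory tree)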
Example #19
def find_dataset_json(work_dir):
    """Search for *.dataset.json files."""

    dataset_re = re.compile(r'^(.*)\.dataset\.json$')
    for root, dirs, files in os.walk(work_dir, followlinks=True):
        files.sort()
        dirs.sort()
        for file in files:
            match = dataset_re.search(file)
            if match:
                dataset_file = os.path.join(root, file)
                prod_dir = os.path.join(os.path.dirname(root), match.group(1))
                if prod_dir != root:
                    logger.info("%s exists in directory %s. Should be in %s. Not uploading."
                                % (dataset_file, root, prod_dir))
                elif not os.path.exists(prod_dir):
                    logger.info("Couldn't find product directory %s for dataset.json %s. Not uploading."
                                % (prod_dir, dataset_file))
                else:
                    yield (dataset_file, prod_dir)
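
A minimal iteration sketch, assuming a hypothetical work directory laid out as work_dir/PRODUCT-A/PRODUCT-A.dataset.json and so on:

for dataset_file, prod_dir in find_dataset_json("/data/work/jobs/example"):
    print("found %s for product dir %s" % (dataset_file, prod_dir))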
Example #20
def evaluate_user_rules_job(job_id, es_url=app.conf.JOBS_ES_URL,
                            alias=app.conf.STATUS_ALIAS,
                            user_rules_idx=app.conf.USER_RULES_JOB_INDEX,
                            job_queue=app.conf.JOBS_PROCESSED_QUEUE):
    """Process all user rules in ES database and check if this job ID matches.
       If so, submit jobs. Otherwise do nothing."""

    # sleep 10 seconds to allow ES documents to be indexed
    time.sleep(10)

    # get all enabled user rules
    query = { "query": { "term": { "enabled": True } } }
    r = requests.post('%s/%s/.percolator/_search?search_type=scan&scroll=10m&size=100' %
                      (es_url, user_rules_idx), data=json.dumps(query))
    r.raise_for_status()
    scan_result = r.json()
    count = scan_result['hits']['total']
    scroll_id = scan_result['_scroll_id']
    rules = []
    while True:
        r = requests.post('%s/_search/scroll?scroll=10m' % es_url, data=scroll_id)
        res = r.json()
        scroll_id = res['_scroll_id']
        if len(res['hits']['hits']) == 0: break
        for hit in res['hits']['hits']:
            rules.append(hit['_source'])
    logger.info("Got %d enabled rules to check." % len(rules))

    # process rules
    for rule in rules:
        # sleep between queries
        time.sleep(1)

        # check for matching rules
        update_query(job_id, rule)
        final_qs = rule['query_string']
        r = requests.post('%s/job_status-current/job/_search' % es_url, data=final_qs)
        r.raise_for_status()
        result = r.json()
        if result['hits']['total'] == 0:
            logger.info("Rule '%s' didn't match for %s" % (rule['rule_name'], job_id))
            continue
        doc_res = result['hits']['hits'][0]
        logger.info("Rule '%s' successfully matched for %s" % (rule['rule_name'], job_id))
        #logger.info("doc_res: %s" % json.dumps(doc_res, indent=2))

        # submit trigger task
        queue_job_trigger(doc_res, rule, es_url)
        logger.info("Trigger task submitted for %s: %s" % (job_id, rule['job_type']))

    return True
Example #21
def get_acquisition_data_from_slc(slc_id):
    uu = getConf()
    es_url = uu['rest_url']
    es_index = "grq_*_*acquisition*"
    query = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "metadata.identifier.raw": slc_id
                    }
                }]
            }
        },
        "partial_fields": {
            "partial": {
                "exclude": "city",
            }
        }
    }

    logger.info(query)

    if es_url.endswith('/'):
        search_url = '%s%s/_search' % (es_url, es_index)
    else:
        search_url = '%s/%s/_search' % (es_url, es_index)
    r = requests.post(search_url, data=json.dumps(query))

    if r.status_code != 200:
        logger.info("Failed to query %s:\n%s" % (es_url, r.text))
        logger.info("query: %s" % json.dumps(query, indent=2))
        logger.info("returned: %s" % r.text)
        r.raise_for_status()

    result = r.json()
    logger.info(result['hits']['total'])
    return result['hits']['hits'][0]
Example #22
def get_singularity_cmd(params, cmd_line_list):
    """
    Build the singularity invocation and append the job command line to it.

    sample singularity command line:
    singularity_cmd = ["/nasa/singularity/3.2.0/bin/singularity", "exec", "--no-home", "--home", "/home/ops", "--bind", "/nobackupp14/lpan/work/cache/container-hello_world_master-2019-06-19-82a52bf2bb3b.simg:/container-hello_world_master-2019-06-19-82a52bf2bb3b.simg", "--pwd", "/container-hello_world_master-2019-06-19-82a52bf2bb3b.simg", "/nobackupp14/lpan/work/cache/container-hello_world_master-2019-06-19-82a52bf2bb3b.simg", "/home/ops/verdi/ops/hello_world/run_hello_world.sh"]
    """

    # build command
    singularity_cmd = get_base_singularity_cmd(params)

    # set command
    singularity_cmd.extend([str(i) for i in cmd_line_list])
    logger.info("XXXXXX singularity_cmd: %s" % singularity_cmd)

    return singularity_cmd
Example #23
def update_query(objectid, system_version, rule):
    """Update final query."""

    # build query
    query = rule['query']

    # filters
    filts = [{'term': {'system_version.raw': system_version}}]

    # query all?
    if rule.get('query_all', False) is False:
        filts.append({'ids': {'values': [objectid]}})

    # build final query
    if 'filtered' in query:
        final_query = copy.deepcopy(query)
        if 'and' in query['filtered']['filter']:
            final_query['filtered']['filter']['and'].extend(filts)
        else:
            filts.append(final_query['filtered']['filter'])
            final_query['filtered']['filter'] = {
                'and': filts,
            }
    else:
        final_query = {
            'filtered': {
                'query': query,
                'filter': {
                    'and': filts,
                }
            }
        }
    final_query = {"query": final_query}
    logger.info("Final query: %s" % json.dumps(final_query, indent=2))
    rule['query'] = final_query
    rule['query_string'] = json.dumps(final_query)
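
For a rule whose query is a plain (non-filtered) query, a hedged sketch of the wrapping this produces; the rule below is hypothetical:

rule = {"query": {"match_all": {}}, "query_all": True}
update_query("dataset-1234", "v2.0", rule)
# rule["query"] == {"query": {"filtered": {
#     "query": {"match_all": {}},
#     "filter": {"and": [{"term": {"system_version.raw": "v2.0"}}]}}}}
# rule["query_string"] holds the same structure serialized with json.dumps().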
Example #24
def localize_file(url, path, cache):
    """Localize urls for job inputs. Track metrics."""

    # get job info
    job_dir = os.getcwd()  #job['job_info']['job_dir']

    # localize urls
    if path is None:
        path = '%s/' % job_dir
    elif not path.startswith('/'):
        path = os.path.join(job_dir, path)
    if os.path.isdir(path) or path.endswith('/'):
        path = os.path.join(path, os.path.basename(url))
    dir_path = os.path.dirname(path)
    logger.info(dir_path)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    loc_t1 = datetime.utcnow()
    try:
        download_file(url, path, cache=cache)
    except Exception as e:
        tb = traceback.format_exc()
        raise RuntimeError("Failed to download %s: %s\n%s" %
                           (url, str(e), tb))
Example #25
def check_slc_status(slc_id, index_suffix=None):

    result = get_dataset(slc_id, index_suffix)
    total = result['hits']['total']
    logger.info("check_slc_status : total : %s" % total)
    if total > 0:
        logger.info("check_slc_status : returning True")
        return True

    logger.info("check_slc_status : returning False")
    return False
Example #26
def download_file(url, path, cache=False):
    """Download file/dir for input."""

    params = get_download_params(url)
    if cache:
        url_hash = hashlib.md5(url.encode()).hexdigest()

        # get the cache root from an env variable set in the top level shell
        # script (e.g., celery_worker.sh)
        root_cache_dir = os.environ['HYSDS_ROOT_CACHE_DIR']
        logger.info("root_cache_dir: %s" % root_cache_dir)
        hash_dir = os.path.join(root_cache_dir, 'cache', *url_hash[0:4])
        cache_dir = os.path.join(hash_dir, url_hash)
        makedirs(cache_dir)
        signal_file = os.path.join(cache_dir, '.localized')
        if os.path.exists(signal_file):
            logger.info("cache hit for {} at {}".format(url, cache_dir))
        else:
            logger.info("cache miss for {}".format(url))
            try:
                osaka.main.get(url, cache_dir, params=params)
            except Exception as e:
                shutil.rmtree(cache_dir)
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to download {} to cache {}: {}\n{}".format(
                        url, cache_dir, str(e), tb))
            with atomic_write(signal_file, overwrite=True) as f:
                f.write("%sZ\n" % datetime.utcnow().isoformat())
        for i in os.listdir(cache_dir):
            if i == '.localized':
                continue
            cached_obj = os.path.join(cache_dir, i)
            if os.path.isdir(cached_obj):
                dst = os.path.join(path, i) if os.path.isdir(path) else path
                try:
                    os.symlink(cached_obj, dst)
                except:
                    logger.error("Failed to soft link {} to {}".format(
                        cached_obj, dst))
                    raise
            else:
                try:
                    os.symlink(cached_obj, path)
                except:
                    logger.error("Failed to soft link {} to {}".format(
                        cached_obj, path))
                    raise
    else:
        return osaka.main.get(url, path, params=params)
Example #27
def get_remote_dav(url):
    """Get remote dir/file."""

    lpath = "./%s" % os.path.basename(url)
    if not url.endswith("/"):
        url += "/"
    parsed_url = urlparse(url)
    rpath = parsed_url.path
    r = requests.request("PROPFIND", url, verify=False)
    if r.status_code not in (200, 207):  # handle multistatus (207) as well
        logger.info("Got status code %d trying to read %s" %
                    (r.status_code, url))
        logger.info("Content:\n%s" % r.text)
        r.raise_for_status()
    tree = parse(StringIO(r.content))
    makedirs(lpath)
    for elem in tree.findall("{DAV:}response"):
        collection = elem.find(
            "{DAV:}propstat/{DAV:}prop/{DAV:}resourcetype/{DAV:}collection")
        if collection is not None:
            continue
        href = elem.find("{DAV:}href").text
        rel_path = os.path.relpath(href, rpath)
        file_url = os.path.join(url, rel_path)
        local_path = os.path.join(lpath, rel_path)
        local_dir = os.path.dirname(local_path)
        makedirs(local_dir)
        resp = requests.request("GET", file_url, verify=False, stream=True)
        if resp.status_code != 200:
            logger.info("Got status code %d trying to read %s" %
                        (resp.status_code, file_url))
            logger.info("Content:\n%s" % resp.text)
        resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
    return os.path.abspath(lpath)
Example #28
def ensure_dataset_indexed(objectid, system_version, es_url, alias):
    """Ensure dataset is indexed."""

    query = {
        "query": {
            "bool": {
                "must": [{
                    'term': {
                        '_id': objectid
                    }
                }, {
                    'term': {
                        'system_version.raw': system_version
                    }
                }]
            }
        },
        "fields": [],
    }
    logger.info("ensure_dataset_indexed query: %s" %
                json.dumps(query, indent=2))
    if es_url.endswith('/'):
        search_url = '%s%s/_search' % (es_url, alias)
    else:
        search_url = '%s/%s/_search' % (es_url, alias)
    logger.info("ensure_dataset_indexed url: %s" % search_url)
    r = requests.post(search_url, data=json.dumps(query))
    logger.info("ensure_dataset_indexed status: %s" % r.status_code)
    r.raise_for_status()
    result = r.json()
    logger.info("ensure_dataset_indexed result: %s" %
                json.dumps(result, indent=2))
    total = result['hits']['total']
    if total == 0:
        raise RuntimeError("Failed to find indexed dataset: {} ({})".format(
            objectid, system_version))
Example #29
File: utils.py Project: hysds/hysds
def publish_datasets(job, ctx):
    """Perform dataset publishing if job exited with zero status code."""

    # if exit code of job command is non-zero, don't publish anything
    exit_code = job["job_info"]["status"]
    if exit_code != 0:
        logger.info(
            "Job exited with exit code %s. Bypassing dataset publishing." %
            exit_code)
        return True

    # if job command never ran, don't publish anything
    pid = job["job_info"]["pid"]
    if pid == 0:
        logger.info("Job command never ran. Bypassing dataset publishing.")
        return True

    # get job info
    job_dir = job["job_info"]["job_dir"]

    # find and publish
    published_prods = []
    for dataset_file, prod_dir in find_dataset_json(job_dir):

        # skip if marked as localized input
        signal_file = os.path.join(prod_dir, ".localized")
        if os.path.exists(signal_file):
            logger.info("Skipping publish of %s. Marked as localized input." %
                        prod_dir)
            continue

        # publish
        prod_json = publish_dataset(prod_dir, dataset_file, job, ctx)

        # save json for published product
        published_prods.append(prod_json)

    # write published products to file
    pub_prods_file = os.path.join(job_dir, "_datasets.json")
    with open(pub_prods_file, "w") as f:
        json.dump(published_prods, f, indent=2, sort_keys=True)

    # signal run_job() to continue
    return True
Example #30
def evaluate_user_rules_dataset(
        objectid,
        system_version,
        es_url=app.conf.GRQ_ES_URL,
        alias=app.conf.DATASET_ALIAS,
        user_rules_idx=app.conf.USER_RULES_DATASET_INDEX,
        job_queue=app.conf.JOBS_PROCESSED_QUEUE):
    """Process all user rules in ES database and check if this objectid matches.
       If so, submit jobs. Otherwise do nothing."""

    # sleep for 10 seconds; let any documents finish indexing in ES
    time.sleep(10)

    # get all enabled user rules
    query = {"query": {"term": {"enabled": True}}}
    r = requests.post(
        '%s/%s/.percolator/_search?search_type=scan&scroll=10m&size=100' %
        (es_url, user_rules_idx),
        data=json.dumps(query))
    r.raise_for_status()
    scan_result = r.json()
    count = scan_result['hits']['total']
    scroll_id = scan_result['_scroll_id']
    rules = []
    while True:
        r = requests.post('%s/_search/scroll?scroll=10m' % es_url,
                          data=scroll_id)
        res = r.json()
        scroll_id = res['_scroll_id']
        if len(res['hits']['hits']) == 0: break
        for hit in res['hits']['hits']:
            rules.append(hit['_source'])
    logger.info("Got %d enabled rules to check." % len(rules))

    # process rules
    for rule in rules:
        # sleep between queries
        time.sleep(1)

        # check for matching rules
        update_query(objectid, system_version, rule)
        final_qs = rule['query_string']
        r = requests.post('%s/%s/_search' % (es_url, alias), data=final_qs)
        r.raise_for_status()
        result = r.json()
        if result['hits']['total'] == 0:
            logger.info("Rule '%s' didn't match for %s (%s)" %
                        (rule['rule_name'], objectid, system_version))
            continue
        else:
            doc_res = result['hits']['hits'][0]
        logger.info("Rule '%s' successfully matched for %s (%s)" %
                    (rule['rule_name'], objectid, system_version))
        #logger.info("doc_res: %s" % json.dumps(doc_res, indent=2))

        # set clean descriptive job name
        job_type = rule['job_type']
        if job_type.startswith('hysds-io-'):
            job_type = job_type.replace('hysds-io-', '', 1)
        job_name = "%s-%s" % (job_type, objectid)

        # submit trigger task
        queue_dataset_trigger(doc_res, rule, es_url, job_name)
        logger.info("Trigger task submitted for %s (%s): %s" %
                    (objectid, system_version, rule['job_type']))

    return True