def run_extractor(dsets_file, prod_path, ctx):
    """Run extractor configured in datasets JSON config."""

    logging.info("datasets: %s" % dsets_file)
    logging.info("prod_path: %s" % prod_path)
    # get settings
    settings_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'settings.json')
    with open(settings_file) as f:
        settings = json.load(f)

    # recognize
    r = Recognizer(dsets_file, prod_path, os.path.basename(prod_path),
                   settings["EXTRACT_VERSION"])
    objectid = r.getId()

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match:
            extractor = match.group(1)
    logging.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(prod_path, '%s.met.json' % \
                                 os.path.basename(prod_path))
    dataset_file = os.path.join(prod_path, '%s.dataset.json' % \
                                 os.path.basename(prod_path))

    # load metadata
    metadata = {}
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)

    m = {}
    # run extractor
    if extractor is None:
        logging.info("No metadata extraction configured.")
    else:
        logging.info("Running metadata extractor %s on %s" % \
                    (extractor, prod_path))
        # the extractor is expected to emit metadata JSON on stdout (see ingest() below);
        # parse it so the optional dataset fields can be pulled from it
        m = json.loads(check_output([extractor, prod_path]))
        if os.path.exists(metadata_file):
            with open(metadata_file) as f:
                metadata.update(json.load(f))

    # set data_product_name
    metadata['data_product_name'] = objectid

    # set download url from context
    localize_urls = ctx.get('localize_urls', [])
    if len(localize_urls) > 0:
        metadata['download_url'] = localize_urls[0]['url']

    # write it out to file
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)
    logging.info("Wrote metadata to %s" % metadata_file)

    # Build datasets and add in "optional" fields, if not already created by extractor
    if not os.path.exists(dataset_file):
        datasets = {"version": settings["EXTRACT_VERSION"]}
        for key in ["location", "starttime", "endtime", "label"]:
            if key in m:
                datasets[key] = m[key]
        # write it out to file
        with open(dataset_file, 'w') as f:
            json.dump(datasets, f, indent=2)
        logging.info("Wrote dataset to %s" % dataset_file)
# Example #2
def run_extractor(dsets_file, prod_path, url, ctx, md5_hash):
    """Run extractor configured in datasets JSON config."""

    logging.info("datasets: %s" % dsets_file)
    logging.info("prod_path: %s" % prod_path)
    # get settings
    settings = {}
    try:
        settings_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'settings.json')
        with open(settings_file) as f:
            settings = json.load(f)
    except (OSError, ValueError):
        # fall back to defaults if settings.json is missing or unparseable
        settings['DATASETS_CFG'] = "/home/ops/verdi/etc/datasets.json"
        settings["INCOMING_VERSION"] = "v0.1"
        settings["EXTRACT_VERSION"] = "v0.1"
        settings["ACQ_TO_DSET_MAP"] = {"acquisition-S1-IW_SLC": "S1-IW_SLC"}

    # recognize
    r = Recognizer(dsets_file, prod_path, os.path.basename(prod_path),
                   settings["EXTRACT_VERSION"])
    objectid = r.getId()

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match:
            extractor = match.group(1)
    logging.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(prod_path, '%s.met.json' % \
                                 os.path.basename(prod_path))
    dataset_file = os.path.join(prod_path, '%s.dataset.json' % \
                                 os.path.basename(prod_path))

    # write the md5 hash to a <product>.zip.md5 sidecar file alongside the product
    with open(os.path.join(prod_path, '%s.zip.md5' % os.path.basename(prod_path)), 'w') as md5_file:
        md5_file.write(md5_hash)

    # load metadata
    metadata = {}
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)

    m = {}
    # run extractor
    if extractor is None:
        logging.info("No metadata extraction configured.")
    else:
        logging.info("Running metadata extractor %s on %s" % \
                    (extractor, prod_path))
        try:
            # the extractor is expected to emit metadata JSON on stdout; parse it so
            # the optional dataset fields can be pulled from it below
            m = json.loads(check_output([extractor, prod_path]))
        except CalledProcessError as e:
            err_msg = str(e)
            root_dir = os.getcwd()
            logging.info("root_dir with getcwd() : %s" % root_dir)
            # job work dirs conventionally end in a '...Z' timestamp suffix; if the
            # current dir doesn't, assume we're one level below the job dir
            if not root_dir.endswith("Z"):
                root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

            logging.info("root_dir final : %s" % root_dir)
            prov_log = os.path.join(root_dir, 'create_prov_es.log')
            split_log = os.path.join(root_dir, 'split_swath_products.log')
            logging.info("%s\n%s" % (prov_log, split_log))
            if os.path.isfile(prov_log):
                prov_err = get_log_err(prov_log)
                if prov_err:
                    err_msg = prov_err
            elif os.path.isfile(split_log):
                split_err = get_log_err(split_log)
                if split_err:
                    err_msg = split_err
            else:
                logging.info("%s file NOT found" % split_log)

            raise RuntimeError("command '{}' returned with error (code {}): {}".format(
                e.cmd, e.returncode, err_msg))

        if os.path.exists(metadata_file):
            with open(metadata_file) as f:
                metadata.update(json.load(f))

    # set data_product_name
    metadata['data_product_name'] = objectid 

    # set download url from context
    metadata['download_url'] = url

    # add md5 hash in metadata
    metadata['md5_hash'] = md5_hash

    # write it out to file
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)
    logging.info("Wrote metadata to %s" % metadata_file)

    # Build datasets and add in "optional" fields, if not already created by extractor
    if not os.path.exists(dataset_file):
        datasets = {"version":settings["EXTRACT_VERSION"]}
        for key in ["location","starttime","endtime","label"]:
            if key in m:
                datasets[key] = m[key]
        # write it out to file
        with open(dataset_file, 'w') as f:
            json.dump(datasets, f, indent=2)
        logging.info("Wrote dataset to %s" % dataset_file)
# Example #3
def ingest(objectid,
           dsets_file,
           grq_update_url,
           dataset_processed_queue,
           prod_path,
           job_path,
           dry_run=False,
           force=False):
    """Run dataset ingest."""
    logger.info("#" * 80)
    logger.info("datasets: %s" % dsets_file)
    logger.info("grq_update_url: %s" % grq_update_url)
    logger.info("dataset_processed_queue: %s" % dataset_processed_queue)
    logger.info("prod_path: %s" % prod_path)
    logger.info("job_path: %s" % job_path)
    logger.info("dry_run: %s" % dry_run)
    logger.info("force: %s" % force)

    # get default job path
    if job_path is None:
        job_path = os.getcwd()

    # detect job info
    job = {}
    job_json = os.path.join(job_path, '_job.json')
    if os.path.exists(job_json):
        with open(job_json) as f:
            try:
                job = json.load(f)
            except Exception as e:
                logger.warn("Failed to read job json:\n{}".format(str(e)))
    task_id = job.get('task_id', None)
    payload_id = job.get('job_info', {}).get('job_payload',
                                             {}).get('payload_task_id', None)
    payload_hash = job.get('job_info', {}).get('payload_hash', None)
    logger.info("task_id: %s" % task_id)
    logger.info("payload_id: %s" % payload_id)
    logger.info("payload_hash: %s" % payload_hash)

    # get dataset
    if os.path.isdir(prod_path):
        local_prod_path = prod_path
    else:
        local_prod_path = get_remote_dav(prod_path)
    if not os.path.isdir(local_prod_path):
        raise RuntimeError("Failed to find local dataset directory: %s" %
                           local_prod_path)

    # write publish context
    publ_ctx_name = "_publish.context.json"
    publ_ctx_dir = mkdtemp(prefix=".pub_context", dir=job_path)
    publ_ctx_file = os.path.join(publ_ctx_dir, publ_ctx_name)
    with open(publ_ctx_file, 'w') as f:
        json.dump(
            {
                'payload_id': payload_id,
                'payload_hash': payload_hash,
                'task_id': task_id
            },
            f,
            indent=2,
            sort_keys=True)
    publ_ctx_url = None

    # dataset name
    pname = os.path.basename(local_prod_path)

    # dataset file
    dataset_file = os.path.join(local_prod_path, '%s.dataset.json' % pname)

    # get dataset json
    with open(dataset_file) as f:
        dataset = json.load(f)
    logger.info("Loaded dataset JSON from file: %s" % dataset_file)

    # check minimum requirements for dataset JSON
    logger.info("Verifying dataset JSON...")
    verify_dataset(dataset)
    logger.info("Dataset JSON verfication succeeded.")

    # get version
    version = dataset['version']

    # recognize
    r = Recognizer(dsets_file, local_prod_path, objectid, version)

    # get ipath
    ipath = r.currentIpath

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match:
            extractor = match.group(1)
    logger.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(local_prod_path, '%s.met.json' % pname)

    # metadata seed file
    seed_file = os.path.join(local_prod_path, 'met.json')

    # metadata file already here
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)
        logger.info("Loaded metadata from existing file: %s" % metadata_file)
    else:
        if extractor is None:
            logger.info(
                "No metadata extraction configured. Setting empty metadata.")
            metadata = {}
        else:
            logger.info("Running metadata extractor %s on %s" %
                        (extractor, local_prod_path))
            m = check_output([extractor, local_prod_path])
            logger.info("Output: %s" % m.decode())

            # generate json to update metadata and urls
            metadata = json.loads(m)

            # set data_product_name
            metadata['data_product_name'] = objectid

            # merge with seed metadata
            if os.path.exists(seed_file):
                with open(seed_file) as f:
                    seed = json.load(f)
                metadata.update(seed)
                logger.info("Loaded seed metadata from file: %s" % seed_file)

            # write it out to file
            with open(metadata_file, 'w') as f:
                json.dump(metadata, f, indent=2)
            logger.info("Wrote metadata to %s" % metadata_file)

            # delete seed file
            if os.path.exists(seed_file):
                os.unlink(seed_file)
                logger.info("Deleted seed file %s." % seed_file)

    # add context
    context_file = os.path.join(local_prod_path, '%s.context.json' % pname)
    if os.path.exists(context_file):
        with open(context_file) as f:
            context = json.load(f)
        logger.info("Loaded context from existing file: %s" % context_file)
    else:
        context = {}
    metadata['context'] = context

    # set metadata and dataset groups in recognizer
    r.setDataset(dataset)
    r.setMetadata(metadata)

    # get level
    level = r.getLevel()

    # get type
    dtype = r.getType()

    # set product metrics
    prod_metrics = {'ipath': ipath, 'path': local_prod_path}

    # publish dataset
    if r.publishConfigured():
        logger.info("Dataset publish is configured.")

        # get publish path
        pub_path_url = r.getPublishPath()

        # get publish urls
        pub_urls = [i for i in r.getPublishUrls()]

        # get S3 profile name and api keys for dataset publishing
        s3_secret_key, s3_access_key = r.getS3Keys()
        s3_profile = r.getS3Profile()

        # set osaka params
        osaka_params = {}

        # S3 profile takes precedence over explicit api keys
        if s3_profile is not None:
            osaka_params['profile_name'] = s3_profile
        else:
            if s3_secret_key is not None and s3_access_key is not None:
                osaka_params['aws_access_key_id'] = s3_access_key
                osaka_params['aws_secret_access_key'] = s3_secret_key

        # get pub host and path
        logger.info("Configured pub host & path: %s" % (pub_path_url))

        # check scheme
        if not osaka.main.supported(pub_path_url):
            raise RuntimeError("Scheme %s is currently not supported." %
                               urlparse(pub_path_url).scheme)

        # upload dataset to repo; track disk usage and start/end times of transfer
        prod_dir_usage = get_disk_usage(local_prod_path)
        tx_t1 = datetime.utcnow()
        if dry_run:
            logger.info("Would've published %s to %s" %
                        (local_prod_path, pub_path_url))
        else:
            publ_ctx_url = os.path.join(pub_path_url, publ_ctx_name)
            orig_publ_ctx_file = publ_ctx_file + '.orig'
            try:
                publish_dataset(local_prod_path,
                                pub_path_url,
                                params=osaka_params,
                                force=force,
                                publ_ctx_file=publ_ctx_file,
                                publ_ctx_url=publ_ctx_url)
            except NoClobberPublishContextException as e:
                logger.warn(
                    "A publish context file was found at {}. Retrieving.".
                    format(publ_ctx_url))
                osaka.main.get(publ_ctx_url,
                               orig_publ_ctx_file,
                               params=osaka_params)
                with open(orig_publ_ctx_file) as f:
                    orig_publ_ctx = json.load(f)
                logger.warn("original publish context: {}".format(
                    json.dumps(orig_publ_ctx, indent=2, sort_keys=True)))
                orig_payload_id = orig_publ_ctx.get('payload_id', None)
                orig_payload_hash = orig_publ_ctx.get('payload_hash', None)
                orig_task_id = orig_publ_ctx.get('task_id', None)
                logger.warn("orig payload_id: {}".format(orig_payload_id))
                logger.warn("orig payload_hash: {}".format(orig_payload_hash))
                logger.warn("orig task_id: {}".format(orig_payload_id))

                if orig_payload_id is None:
                    raise

                # overwrite if this job is a retry of the previous job
                if payload_id is not None and payload_id == orig_payload_id:
                    msg = "This job is a retry of a previous job that resulted " + \
                          "in an orphaned dataset. Forcing publish."
                    logger.warn(msg)
                    log_custom_event(
                        'orphaned_dataset-retry_previous_failed', 'clobber', {
                            'orphan_info': {
                                'payload_id': payload_id,
                                'payload_hash': payload_hash,
                                'task_id': task_id,
                                'orig_payload_id': orig_payload_id,
                                'orig_payload_hash': orig_payload_hash,
                                'orig_task_id': orig_task_id,
                                'dataset_id': objectid,
                                'dataset_url': pub_path_url,
                                'msg': msg
                            }
                        })
                else:
                    job_status = get_job_status(orig_payload_id)
                    logger.warn("orig job status: {}".format(job_status))

                    # overwrite if previous job failed
                    if job_status == "job-failed":
                        msg = "Detected previous job failure that resulted in an " + \
                              "orphaned dataset. Forcing publish."
                        logger.warn(msg)
                        log_custom_event(
                            'orphaned_dataset-job_failed', 'clobber', {
                                'orphan_info': {
                                    'payload_id': payload_id,
                                    'payload_hash': payload_hash,
                                    'task_id': task_id,
                                    'orig_payload_id': orig_payload_id,
                                    'orig_payload_hash': orig_payload_hash,
                                    'orig_task_id': orig_task_id,
                                    'orig_status': job_status,
                                    'dataset_id': objectid,
                                    'dataset_url': pub_path_url,
                                    'msg': msg
                                }
                            })
                    else:
                        # overwrite if dataset doesn't exist in grq
                        if not dataset_exists(objectid):
                            msg = "Detected orphaned dataset without ES doc. Forcing publish."
                            logger.warn(msg)
                            log_custom_event(
                                'orphaned_dataset-no_es_doc', 'clobber', {
                                    'orphan_info': {
                                        'payload_id': payload_id,
                                        'payload_hash': payload_hash,
                                        'task_id': task_id,
                                        'dataset_id': objectid,
                                        'dataset_url': pub_path_url,
                                        'msg': msg
                                    }
                                })
                        else:
                            raise
                publish_dataset(local_prod_path,
                                pub_path_url,
                                params=osaka_params,
                                force=True,
                                publ_ctx_file=publ_ctx_file,
                                publ_ctx_url=publ_ctx_url)
            except osaka.utils.NoClobberException as e:
                if dataset_exists(objectid):
                    try:
                        osaka.main.rmall(publ_ctx_url, params=osaka_params)
                    except Exception:
                        logger.warn(
                            "Failed to clean up publish context {} after attempting to clobber valid dataset."
                            .format(publ_ctx_url))
                    raise
                else:
                    msg = "Detected orphaned dataset without ES doc. Forcing publish."
                    logger.warn(msg)
                    log_custom_event(
                        'orphaned_dataset-no_es_doc', 'clobber', {
                            'orphan_info': {
                                'payload_id': payload_id,
                                'payload_hash': payload_hash,
                                'task_id': task_id,
                                'dataset_id': objectid,
                                'dataset_url': pub_path_url,
                                'msg': msg
                            }
                        })
                    publish_dataset(local_prod_path,
                                    pub_path_url,
                                    params=osaka_params,
                                    force=True,
                                    publ_ctx_file=publ_ctx_file,
                                    publ_ctx_url=publ_ctx_url)
        tx_t2 = datetime.utcnow()
        tx_dur = (tx_t2 - tx_t1).total_seconds()

        # save dataset metrics on size and transfer
        prod_metrics.update({
            'url': urlparse(pub_path_url).path,
            'disk_usage': prod_dir_usage,
            'time_start': tx_t1.isoformat() + 'Z',
            'time_end': tx_t2.isoformat() + 'Z',
            'duration': tx_dur,
            'transfer_rate': prod_dir_usage / tx_dur
        })
    else:
        logger.info("Dataset publish is not configured.")
        pub_urls = []

    # publish browse
    if r.browseConfigured():
        logger.info("Browse publish is configured.")

        # get browse path and urls
        browse_path = r.getBrowsePath()
        browse_urls = r.getBrowseUrls()

        # get S3 profile name and api keys for browse image publishing
        s3_secret_key_browse, s3_access_key_browse = r.getS3Keys("browse")
        s3_profile_browse = r.getS3Profile("browse")

        # set osaka params for browse
        osaka_params_browse = {}

        # S3 profile takes precedence over explicit api keys
        if s3_profile_browse is not None:
            osaka_params_browse['profile_name'] = s3_profile_browse
        else:
            if s3_secret_key_browse is not None and s3_access_key_browse is not None:
                osaka_params_browse['aws_access_key_id'] = s3_access_key_browse
                osaka_params_browse[
                    'aws_secret_access_key'] = s3_secret_key_browse

        # add metadata for all browse images and upload to browse location
        imgs_metadata = []
        imgs = glob('%s/*browse.png' % local_prod_path)
        for img in imgs:
            img_metadata = {'img': os.path.basename(img)}
            small_img = img.replace('browse.png', 'browse_small.png')
            if os.path.exists(small_img):
                small_img_basename = os.path.basename(small_img)
                if browse_path is not None:
                    this_browse_path = os.path.join(browse_path,
                                                    small_img_basename)
                    if dry_run:
                        logger.info("Would've uploaded %s to %s" %
                                    (small_img, browse_path))
                    else:
                        logger.info("Uploading %s to %s" %
                                    (small_img, browse_path))
                        osaka.main.put(small_img,
                                       this_browse_path,
                                       params=osaka_params_browse,
                                       noclobber=False)
            else:
                small_img_basename = None
            img_metadata['small_img'] = small_img_basename
            tooltip_match = BROWSE_RE.search(img_metadata['img'])
            if tooltip_match:
                img_metadata['tooltip'] = tooltip_match.group(1)
            else:
                img_metadata['tooltip'] = ""
            imgs_metadata.append(img_metadata)

        # sort browse images
        browse_sort_order = r.getBrowseSortOrder()
        if isinstance(browse_sort_order, list) and len(browse_sort_order) > 0:
            bso_regexes = [re.compile(i) for i in browse_sort_order]
            sorter = {}
            unrecognized = []
            for img in imgs_metadata:
                matched = None
                for i, bso_re in enumerate(bso_regexes):
                    if bso_re.search(img['img']):
                        matched = img
                        sorter[i] = matched
                        break
                if matched is None:
                    unrecognized.append(img)
            imgs_metadata = [sorter[i] for i in sorted(sorter)]
            imgs_metadata.extend(unrecognized)
    else:
        logger.info("Browse publish is not configured.")
        browse_urls = []
        imgs_metadata = []

    # set update json
    update_json = {
        'id': objectid,
        'objectid': objectid,
        'metadata': metadata,
        'dataset': ipath.split('/')[1],
        'ipath': ipath,
        'system_version': version,
        'dataset_level': level,
        'dataset_type': dtype,
        'urls': pub_urls,
        'browse_urls': browse_urls,
        'images': imgs_metadata,
        'prov': context.get('_prov', {}),
    }
    update_json.update(dataset)
    #logger.info("update_json: %s" % pformat(update_json))

    # custom index specified?
    index = r.getIndex()
    if index is not None:
        update_json['index'] = index

    # update GRQ
    if isinstance(update_json['metadata'],
                  dict) and len(update_json['metadata']) > 0:
        #logger.info("update_json: %s" % pformat(update_json))
        if dry_run:
            logger.info("Would've indexed doc at %s: %s" %
                        (grq_update_url,
                         json.dumps(update_json, indent=2, sort_keys=True)))
        else:
            res = index_dataset(grq_update_url, update_json)
            logger.info("res: %s" % res)
            update_json['grq_index_result'] = res

    # finish if dry run
    if dry_run:
        try:
            shutil.rmtree(publ_ctx_dir)
        except Exception:
            pass
        return (prod_metrics, update_json)

    # create PROV-ES JSON file for publish processStep
    prod_prov_es_file = os.path.join(
        local_prod_path, '%s.prov_es.json' % os.path.basename(local_prod_path))
    pub_prov_es_bn = "publish.prov_es.json"
    if os.path.exists(prod_prov_es_file):
        pub_prov_es_file = os.path.join(local_prod_path, pub_prov_es_bn)
        prov_es_info = {}
        with open(prod_prov_es_file) as f:
            try:
                prov_es_info = json.load(f)
            except Exception as e:
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to load PROV-ES from {}: {}\n{}".format(
                        prod_prov_es_file, str(e), tb))
        log_publish_prov_es(prov_es_info, pub_prov_es_file, local_prod_path,
                            pub_urls, prod_metrics, objectid)
        # upload publish PROV-ES file
        osaka.main.put(pub_prov_es_file,
                       os.path.join(pub_path_url, pub_prov_es_bn),
                       params=osaka_params,
                       noclobber=False)

    # cleanup publish context
    if publ_ctx_url is not None:
        try:
            osaka.main.rmall(publ_ctx_url, params=osaka_params)
        except Exception:
            logger.warn(
                "Failed to clean up publish context at {} on successful publish."
                .format(publ_ctx_url))
    try:
        shutil.rmtree(publ_ctx_dir)
    except Exception:
        pass

    # queue dataset on the dataset-processed queue
    queue_dataset(ipath, update_json, dataset_processed_queue)

    # return dataset metrics and dataset json
    return (prod_metrics, update_json)
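
# Usage sketch for ingest() above. The id, paths, update url, and queue name are
# hypothetical placeholders. With dry_run=True the function only logs what it would
# publish and index, skips queuing, and still returns (prod_metrics, update_json).
def _example_ingest_dry_run():
    prod_metrics, update_json = ingest(
        objectid="PRODUCT-001",                           # dataset id (illustrative)
        dsets_file="/home/ops/verdi/etc/datasets.json",
        grq_update_url="http://grq-host/update",          # placeholder update endpoint
        dataset_processed_queue="dataset_processed",      # placeholder queue name
        prod_path="/data/work/jobs/PRODUCT-001",          # local dataset dir or remote DAV url
        job_path=None,                                    # defaults to os.getcwd()
        dry_run=True)
    return prod_metrics, update_json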