Example #1
def add_metadata(product_dir, metadata_file):
    """Add metadata to json file."""

    with open(metadata_file) as f:
        metadata = json.load(f)

    # get datasets config
    uu = UrlUtils()
    dsets_file = uu.datasets_cfg
    r = Recognizer(dsets_file, product_dir, product_dir, 'v0.1')

    # add dataset type and level if not already set
    metadata.setdefault('dataset_type', r.getType())
    metadata.setdefault('dataset_level', r.getLevel())

    # overwrite metadata json file
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2, sort_keys=True)
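A minimal usage sketch for the helper above; the paths are hypothetical, and the surrounding module is assumed to already import json, UrlUtils, and Recognizer.

import os

# hypothetical product directory; the metadata file follows the
# <dir>/<dir>.met.json convention used by the other examples
product_dir = "/data/work/S1-IW_SLC__1SDV_20200101"
metadata_file = os.path.join(
    product_dir, "%s.met.json" % os.path.basename(product_dir))
add_metadata(product_dir, metadata_file)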
Example #2
    # get dataset json
    with open(dataset_file) as f:
        dataset = json.load(f)
    logger.info("Loaded dataset JSON from file: %s" % dataset_file)

    # check minimum requirements for dataset JSON
    logger.info("Verifying dataset JSON...")
    verify_dataset(dataset)
    logger.info("Dataset JSON verfication succeeded.")

    # get version
    version = dataset['version']

    # recognize
    r = Recognizer(dsets_file, local_prod_path, objectid, version)

    # get ipath
    ipath = r.currentIpath

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match: extractor = match.group(1)
    logger.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(local_prod_path, '%s.met.json' % pname)

    # metadata seed file
Example #3
def run_extractor(dsets_file, prod_path, ctx):
    """Run extractor configured in datasets JSON config."""

    logging.info("datasets: %s" % dsets_file)
    logging.info("prod_path: %s" % prod_path)
    # get settings
    settings_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'settings.json')
    with open(settings_file) as f:
        settings = json.load(f)

    # recognize
    r = Recognizer(dsets_file, prod_path, os.path.basename(prod_path),
                   settings["EXTRACT_VERSION"])
    objectid = r.getId()

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match: extractor = match.group(1)
    logging.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(prod_path, '%s.met.json' % \
                                 os.path.basename(prod_path))
    dataset_file = os.path.join(prod_path, '%s.dataset.json' % \
                                 os.path.basename(prod_path))

    # load metadata
    metadata = {}
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)

    m = {}
    # run extractor
    if extractor is None:
        logging.info("No metadata extraction configured.")
    else:
        logging.info("Running metadata extractor %s on %s" % \
                    (extractor, prod_path))
        # the extractor is expected to print its metadata as JSON on stdout
        m = json.loads(check_output([extractor, prod_path]))
        if os.path.exists(metadata_file):
            with open(metadata_file) as f:
                metadata.update(json.load(f))

    # set data_product_name
    metadata['data_product_name'] = objectid

    # set download url from context
    localize_urls = ctx.get('localize_urls', [])
    if len(localize_urls) > 0:
        metadata['download_url'] = localize_urls[0]['url']

    # write it out to file
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)
    logging.info("Wrote metadata to %s" % metadata_file)

    # Build datasets and add in "optional" fields, if not already created by extractor
    if not os.path.exists(dataset_file):
        datasets = {"version": settings["EXTRACT_VERSION"]}
        for key in ["location", "starttime", "endtime", "label"]:
            if key in m:
                datasets[key] = m[key]
        # write it out to file
        with open(dataset_file, 'w') as f:
            json.dump(datasets, f, indent=2)
        logging.info("Wrote dataset to %s" % dataset_file)
Example #4
def run_extractor(dsets_file, prod_path, url, ctx):
    """Run extractor configured in datasets JSON config."""

    logging.info("datasets: %s" % dsets_file)
    logging.info("prod_path: %s" % prod_path)
    # get settings; fall back to defaults if settings.json is missing or unreadable
    settings = {}
    try:
        settings_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'settings.json')
        with open(settings_file) as f:
            settings = json.load(f)
    except Exception:
        settings['DATASETS_CFG'] = "/home/ops/verdi/etc/datasets.json"
        settings["INCOMING_VERSION"] = "v0.1"
        settings["EXTRACT_VERSION"] = "v0.1"
        settings["ACQ_TO_DSET_MAP"] = {"acquisition-S1-IW_SLC": "S1-IW_SLC"}

    # recognize
    r = Recognizer(dsets_file, prod_path, os.path.basename(prod_path),
                   settings["EXTRACT_VERSION"])
    objectid = r.getId()

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match: extractor = match.group(1)
    logging.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(prod_path, '%s.met.json' % \
                                 os.path.basename(prod_path))
    dataset_file = os.path.join(prod_path, '%s.dataset.json' % \
                                 os.path.basename(prod_path))

    # load metadata
    metadata = {}
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)

    m = {}
    # run extractor
    if extractor is None:
        logging.info("No metadata extraction configured.")
    else:
        logging.info("Running metadata extractor %s on %s" % \
                    (extractor, prod_path))
        try:
            # the extractor is expected to print its metadata as JSON on stdout
            m = json.loads(check_output([extractor, prod_path]))
        except CalledProcessError as e:
            raise RuntimeError(
                "command '{}' returned with error (code {}): {}".format(
                    e.cmd, e.returncode, e.output))

        if os.path.exists(metadata_file):
            with open(metadata_file) as f:
                metadata.update(json.load(f))

    # set data_product_name
    metadata['data_product_name'] = objectid

    # set download url from context
    metadata['download_url'] = url

    # write it out to file
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)
    logging.info("Wrote metadata to %s" % metadata_file)

    # Build datasets and add in "optional" fields, if not already created by extractor
    if not os.path.exists(dataset_file):
        datasets = {"version": settings["EXTRACT_VERSION"]}
        for key in ["location", "starttime", "endtime", "label"]:
            if key in m:
                datasets[key] = m[key]
        # write it out to file
        with open(dataset_file, 'w') as f:
            json.dump(datasets, f, indent=2)
        logging.info("Wrote dataset to %s" % dataset_file)
Example #5
def run_extractor(dsets_file, prod_path, url, ctx, md5_hash):
    """Run extractor configured in datasets JSON config."""

    logging.info("datasets: %s" % dsets_file)
    logging.info("prod_path: %s" % prod_path)
    # get settings; fall back to defaults if settings.json is missing or unreadable
    settings = {}
    try:
        settings_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'settings.json')
        with open(settings_file) as f:
            settings = json.load(f)
    except Exception:
        settings['DATASETS_CFG'] = "/home/ops/verdi/etc/datasets.json"
        settings["INCOMING_VERSION"] = "v0.1"
        settings["EXTRACT_VERSION"] = "v0.1"
        settings["ACQ_TO_DSET_MAP"] = {"acquisition-S1-IW_SLC": "S1-IW_SLC"}

    # recognize
    r = Recognizer(dsets_file, prod_path, os.path.basename(prod_path),
                   settings["EXTRACT_VERSION"])
    objectid = r.getId()

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match: extractor = match.group(1)
    logging.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(prod_path, '%s.met.json' % \
                                 os.path.basename(prod_path))
    dataset_file = os.path.join(prod_path, '%s.dataset.json' % \
                                 os.path.basename(prod_path))

    with open(
            os.path.join(prod_path,
                         '%s.zip.md5' % os.path.basename(prod_path)),
            'w') as md5_file:
        md5_file.write(md5_hash)  # record the (verified) md5 hash in a .zip.md5 sidecar file

    # load metadata
    metadata = {}
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)

    m = {}
    # run extractor
    if extractor is None:
        logging.info("No metadata extraction configured.")
    else:
        logging.info("Running metadata extractor %s on %s" % \
                    (extractor, prod_path))
        try:
            # the extractor is expected to print its metadata as JSON on stdout
            m = json.loads(check_output([extractor, prod_path]))
        except CalledProcessError as e:
            err_msg = str(e)
            root_dir = os.getcwd()
            logging.info("root_dir with getcwd() : %s" % root_dir)
            if not root_dir.endswith("Z"):
                root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

            logging.info("root_dir final : %s" % root_dir)
            prov_log = os.path.join(root_dir, 'create_prov_es.log')
            split_log = os.path.join(root_dir, 'split_swath_products.log')
            logging.info("%s\n%s" % (prov_log, split_log))
            if os.path.isfile(prov_log):
                prov_err = get_log_err(prov_log)
                if prov_err:
                    err_msg = prov_err
            elif os.path.isfile(split_log):
                split_err = get_log_err(split_log)
                if split_err:
                    err_msg = split_err
            else:
                logging.info("%s file NOT Found" % split_log)
            raise RuntimeError(
                "command '{}' return with error (code {}): {}".format(
                    e.cmd, e.returncode, err_msg))

        if os.path.exists(metadata_file):
            with open(metadata_file) as f:
                metadata.update(json.load(f))

    # set data_product_name
    metadata['data_product_name'] = objectid

    # set download url from context
    metadata['download_url'] = url

    # add md5 hash in metadata
    metadata['md5_hash'] = md5_hash

    # write it out to file
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)
    logging.info("Wrote metadata to %s" % metadata_file)

    # Build datasets and add in "optional" fields, if not already created by extractor
    if not os.path.exists(dataset_file):
        datasets = {"version": settings["EXTRACT_VERSION"]}
        for key in ["location", "starttime", "endtime", "label"]:
            if key in m:
                datasets[key] = m[key]
        # write it out to file
        with open(dataset_file, 'w') as f:
            json.dump(datasets, f, indent=2)
        logging.info("Wrote dataset to %s" % dataset_file)
Example #6
def ingest(
    objectid,
    dsets_file,
    grq_update_url,
    dataset_processed_queue,
    prod_path,
    job_path,
    dry_run=False,
    force=False,
):
    """Run dataset ingest."""
    logger.info("#" * 80)
    logger.info("datasets: %s" % dsets_file)
    logger.info("grq_update_url: %s" % grq_update_url)
    logger.info("dataset_processed_queue: %s" % dataset_processed_queue)
    logger.info("prod_path: %s" % prod_path)
    logger.info("job_path: %s" % job_path)
    logger.info("dry_run: %s" % dry_run)
    logger.info("force: %s" % force)

    # get default job path
    if job_path is None:
        job_path = os.getcwd()

    # detect job info
    job = {}
    job_json = os.path.join(job_path, "_job.json")
    if os.path.exists(job_json):
        with open(job_json) as f:
            try:
                job = json.load(f)
            except Exception as e:
                logger.warn("Failed to read job json:\n{}".format(str(e)))
    task_id = job.get("task_id", None)
    payload_id = (job.get("job_info", {}).get("job_payload",
                                              {}).get("payload_task_id", None))
    payload_hash = job.get("job_info", {}).get("payload_hash", None)
    logger.info("task_id: %s" % task_id)
    logger.info("payload_id: %s" % payload_id)
    logger.info("payload_hash: %s" % payload_hash)

    # get dataset
    if os.path.isdir(prod_path):
        local_prod_path = prod_path
    else:
        local_prod_path = get_remote_dav(prod_path)
    if not os.path.isdir(local_prod_path):
        raise RuntimeError("Failed to find local dataset directory: %s" %
                           local_prod_path)

    # write publish context
    publ_ctx_name = "_publish.context.json"
    publ_ctx_dir = mkdtemp(prefix=".pub_context", dir=job_path)
    publ_ctx_file = os.path.join(publ_ctx_dir, publ_ctx_name)
    with open(publ_ctx_file, "w") as f:
        json.dump(
            {
                "payload_id": payload_id,
                "payload_hash": payload_hash,
                "task_id": task_id,
            },
            f,
            indent=2,
            sort_keys=True,
        )
    publ_ctx_url = None

    # dataset name
    pname = os.path.basename(local_prod_path)

    # dataset file
    dataset_file = os.path.join(local_prod_path, "%s.dataset.json" % pname)

    # get dataset json
    with open(dataset_file) as f:
        dataset = json.load(f)
    logger.info("Loaded dataset JSON from file: %s" % dataset_file)

    # check minimum requirements for dataset JSON
    logger.info("Verifying dataset JSON...")
    verify_dataset(dataset)
    logger.info("Dataset JSON verfication succeeded.")

    # get version
    version = dataset["version"]

    # recognize
    r = Recognizer(dsets_file, local_prod_path, objectid, version)

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match:
            extractor = match.group(1)
    logger.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(local_prod_path, "%s.met.json" % pname)

    # metadata seed file
    seed_file = os.path.join(local_prod_path, "met.json")

    # metadata file already here
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)
        logger.info("Loaded metadata from existing file: %s" % metadata_file)
    else:
        if extractor is None:
            logger.info(
                "No metadata extraction configured. Setting empty metadata.")
            metadata = {}
        else:
            logger.info("Running metadata extractor %s on %s" %
                        (extractor, local_prod_path))
            m = check_output([extractor, local_prod_path])
            logger.info("Output: %s" % m.decode())

            # generate json to update metadata and urls
            metadata = json.loads(m)

            # set data_product_name
            metadata["data_product_name"] = objectid

            # merge with seed metadata
            if os.path.exists(seed_file):
                with open(seed_file) as f:
                    seed = json.load(f)
                metadata.update(seed)
                logger.info("Loaded seed metadata from file: %s" % seed_file)

            # write it out to file
            with open(metadata_file, "w") as f:
                json.dump(metadata, f, indent=2)
            logger.info("Wrote metadata to %s" % metadata_file)

            # delete seed file
            if os.path.exists(seed_file):
                os.unlink(seed_file)
                logger.info("Deleted seed file %s." % seed_file)

    # read context
    context_file = os.path.join(local_prod_path, "%s.context.json" % pname)
    if os.path.exists(context_file):
        with open(context_file) as f:
            context = json.load(f)
        logger.info("Loaded context from existing file: %s" % context_file)
    else:
        context = {}

    # set metadata and dataset groups in recognizer
    r.setDataset(dataset)
    r.setMetadata(metadata)

    # get ipath
    ipath = r.getIpath()

    # get level
    level = r.getLevel()

    # get type
    dtype = r.getType()

    # set product metrics
    prod_metrics = {"ipath": ipath, "path": local_prod_path}

    # publish dataset
    if r.publishConfigured():
        logger.info("Dataset publish is configured.")

        # get publish path
        pub_path_url = r.getPublishPath()

        # get publish urls
        pub_urls = [i for i in r.getPublishUrls()]

        # get S3 profile name and api keys for dataset publishing
        s3_secret_key, s3_access_key = r.getS3Keys()
        s3_profile = r.getS3Profile()

        # set osaka params
        osaka_params = {}

        # S3 profile takes precedence over explicit api keys
        if s3_profile is not None:
            osaka_params["profile_name"] = s3_profile
        else:
            if s3_secret_key is not None and s3_access_key is not None:
                osaka_params["aws_access_key_id"] = s3_access_key
                osaka_params["aws_secret_access_key"] = s3_secret_key

        # get pub host and path
        logger.info("Configured pub host & path: %s" % (pub_path_url))

        # check scheme
        if not osaka.main.supported(pub_path_url):
            raise RuntimeError("Scheme %s is currently not supported." %
                               urlparse(pub_path_url).scheme)

        # upload dataset to repo; track disk usage and start/end times of transfer
        prod_dir_usage = get_disk_usage(local_prod_path)
        tx_t1 = datetime.utcnow()
        if dry_run:
            logger.info("Would've published %s to %s" %
                        (local_prod_path, pub_path_url))
        else:
            publ_ctx_url = os.path.join(pub_path_url, publ_ctx_name)
            orig_publ_ctx_file = publ_ctx_file + ".orig"
            try:
                publish_dataset(
                    local_prod_path,
                    pub_path_url,
                    params=osaka_params,
                    force=force,
                    publ_ctx_file=publ_ctx_file,
                    publ_ctx_url=publ_ctx_url,
                )
            except NoClobberPublishContextException as e:
                logger.warn(
                    "A publish context file was found at {}. Retrieving.".
                    format(publ_ctx_url))
                osaka.main.get(publ_ctx_url,
                               orig_publ_ctx_file,
                               params=osaka_params)
                with open(orig_publ_ctx_file) as f:
                    orig_publ_ctx = json.load(f)
                logger.warn("original publish context: {}".format(
                    json.dumps(orig_publ_ctx, indent=2, sort_keys=True)))
                orig_payload_id = orig_publ_ctx.get("payload_id", None)
                orig_payload_hash = orig_publ_ctx.get("payload_hash", None)
                orig_task_id = orig_publ_ctx.get("task_id", None)
                logger.warn("orig payload_id: {}".format(orig_payload_id))
                logger.warn("orig payload_hash: {}".format(orig_payload_hash))
                logger.warn("orig task_id: {}".format(orig_payload_id))

                if orig_payload_id is None:
                    raise

                # overwrite if this job is a retry of the previous job
                if payload_id is not None and payload_id == orig_payload_id:
                    msg = (
                        "This job is a retry of a previous job that resulted "
                        + "in an orphaned dataset. Forcing publish.")
                    logger.warn(msg)
                    log_custom_event(
                        "orphaned_dataset-retry_previous_failed",
                        "clobber",
                        {
                            "orphan_info": {
                                "payload_id": payload_id,
                                "payload_hash": payload_hash,
                                "task_id": task_id,
                                "orig_payload_id": orig_payload_id,
                                "orig_payload_hash": orig_payload_hash,
                                "orig_task_id": orig_task_id,
                                "dataset_id": objectid,
                                "dataset_url": pub_path_url,
                                "msg": msg,
                            }
                        },
                    )
                else:
                    job_status = get_job_status(orig_payload_id)
                    logger.warn("orig job status: {}".format(job_status))

                    # overwrite if previous job failed
                    if job_status == "job-failed":
                        msg = (
                            "Detected previous job failure that resulted in an "
                            + "orphaned dataset. Forcing publish.")
                        logger.warn(msg)
                        log_custom_event(
                            "orphaned_dataset-job_failed",
                            "clobber",
                            {
                                "orphan_info": {
                                    "payload_id": payload_id,
                                    "payload_hash": payload_hash,
                                    "task_id": task_id,
                                    "orig_payload_id": orig_payload_id,
                                    "orig_payload_hash": orig_payload_hash,
                                    "orig_task_id": orig_task_id,
                                    "orig_status": job_status,
                                    "dataset_id": objectid,
                                    "dataset_url": pub_path_url,
                                    "msg": msg,
                                }
                            },
                        )
                    else:
                        # overwrite if dataset doesn't exist in grq
                        if not dataset_exists(objectid):
                            msg = "Detected orphaned dataset without ES doc. Forcing publish."
                            logger.warn(msg)
                            log_custom_event(
                                "orphaned_dataset-no_es_doc",
                                "clobber",
                                {
                                    "orphan_info": {
                                        "payload_id": payload_id,
                                        "payload_hash": payload_hash,
                                        "task_id": task_id,
                                        "dataset_id": objectid,
                                        "dataset_url": pub_path_url,
                                        "msg": msg,
                                    }
                                },
                            )
                        else:
                            raise
                publish_dataset(
                    local_prod_path,
                    pub_path_url,
                    params=osaka_params,
                    force=True,
                    publ_ctx_file=publ_ctx_file,
                    publ_ctx_url=publ_ctx_url,
                )
            except osaka.utils.NoClobberException as e:
                if dataset_exists(objectid):
                    try:
                        osaka.main.rmall(publ_ctx_url, params=osaka_params)
                    except Exception:
                        logger.warn(
                            "Failed to clean up publish context {} after attempting to clobber valid dataset."
                            .format(publ_ctx_url))
                    raise
                else:
                    msg = "Detected orphaned dataset without ES doc. Forcing publish."
                    logger.warn(msg)
                    log_custom_event(
                        "orphaned_dataset-no_es_doc",
                        "clobber",
                        {
                            "orphan_info": {
                                "payload_id": payload_id,
                                "payload_hash": payload_hash,
                                "task_id": task_id,
                                "dataset_id": objectid,
                                "dataset_url": pub_path_url,
                                "msg": msg,
                            }
                        },
                    )
                    publish_dataset(
                        local_prod_path,
                        pub_path_url,
                        params=osaka_params,
                        force=True,
                        publ_ctx_file=publ_ctx_file,
                        publ_ctx_url=publ_ctx_url,
                    )
        tx_t2 = datetime.utcnow()
        tx_dur = (tx_t2 - tx_t1).total_seconds()

        # save dataset metrics on size and transfer
        prod_metrics.update({
            "url": urlparse(pub_path_url).path,
            "disk_usage": prod_dir_usage,
            "time_start": tx_t1.isoformat() + "Z",
            "time_end": tx_t2.isoformat() + "Z",
            "duration": tx_dur,
            "transfer_rate": prod_dir_usage / tx_dur,
        })
    else:
        logger.info("Dataset publish is not configured.")
        pub_urls = []

    # publish browse
    if r.browseConfigured():
        logger.info("Browse publish is configured.")

        # get browse path and urls
        browse_path = r.getBrowsePath()
        browse_urls = r.getBrowseUrls()

        # get S3 profile name and api keys for browse image publishing
        s3_secret_key_browse, s3_access_key_browse = r.getS3Keys("browse")
        s3_profile_browse = r.getS3Profile("browse")

        # set osaka params for browse
        osaka_params_browse = {}

        # S3 profile takes precedence over explicit api keys
        if s3_profile_browse is not None:
            osaka_params_browse["profile_name"] = s3_profile_browse
        else:
            if s3_secret_key_browse is not None and s3_access_key_browse is not None:
                osaka_params_browse["aws_access_key_id"] = s3_access_key_browse
                osaka_params_browse[
                    "aws_secret_access_key"] = s3_secret_key_browse

        # add metadata for all browse images and upload to browse location
        imgs_metadata = []
        imgs = glob("%s/*browse.png" % local_prod_path)
        for img in imgs:
            img_metadata = {"img": os.path.basename(img)}
            small_img = img.replace("browse.png", "browse_small.png")
            if os.path.exists(small_img):
                small_img_basename = os.path.basename(small_img)
                if browse_path is not None:
                    this_browse_path = os.path.join(browse_path,
                                                    small_img_basename)
                    if dry_run:
                        logger.info("Would've uploaded %s to %s" %
                                    (small_img, browse_path))
                    else:
                        logger.info("Uploading %s to %s" %
                                    (small_img, browse_path))
                        osaka.main.put(
                            small_img,
                            this_browse_path,
                            params=osaka_params_browse,
                            noclobber=False,
                        )
            else:
                small_img_basename = None
            img_metadata["small_img"] = small_img_basename
            tooltip_match = BROWSE_RE.search(img_metadata["img"])
            if tooltip_match:
                img_metadata["tooltip"] = tooltip_match.group(1)
            else:
                img_metadata["tooltip"] = ""
            imgs_metadata.append(img_metadata)

        # sort browse images
        browse_sort_order = r.getBrowseSortOrder()
        if isinstance(browse_sort_order, list) and len(browse_sort_order) > 0:
            bso_regexes = [re.compile(i) for i in browse_sort_order]
            sorter = {}
            unrecognized = []
            for img in imgs_metadata:
                matched = None
                for i, bso_re in enumerate(bso_regexes):
                    if bso_re.search(img["img"]):
                        matched = img
                        sorter[i] = matched
                        break
                if matched is None:
                    unrecognized.append(img)
            imgs_metadata = [sorter[i] for i in sorted(sorter)]
            imgs_metadata.extend(unrecognized)
    else:
        logger.info("Browse publish is not configured.")
        browse_urls = []
        imgs_metadata = []

    # set update json
    update_json = {
        "id": objectid,
        "objectid": objectid,
        "metadata": metadata,
        "dataset": ipath.split("/")[1],
        "ipath": ipath,
        "system_version": version,
        "dataset_level": level,
        "dataset_type": dtype,
        "urls": pub_urls,
        "browse_urls": browse_urls,
        "images": imgs_metadata,
        "prov": context.get("_prov", {}),
    }
    update_json.update(dataset)
    # logger.info("update_json: %s" % pformat(update_json))

    # custom index specified?
    index = r.getIndex()
    if index is not None:
        update_json["index"] = index

    # update GRQ
    if dry_run:
        update_json["grq_index_result"] = {"index": index}
        logger.info("Would've indexed doc at %s: %s" %
                    (grq_update_url,
                     json.dumps(update_json, indent=2, sort_keys=True)))
    else:
        res = index_dataset(grq_update_url, update_json)
        logger.info("res: %s" % res)
        update_json["grq_index_result"] = res

    # finish if dry run
    if dry_run:
        try:
            shutil.rmtree(publ_ctx_dir)
        except Exception:
            pass
        return (prod_metrics, update_json)

    # create PROV-ES JSON file for publish processStep
    prod_prov_es_file = os.path.join(
        local_prod_path, "%s.prov_es.json" % os.path.basename(local_prod_path))
    pub_prov_es_bn = "publish.prov_es.json"
    if os.path.exists(prod_prov_es_file):
        pub_prov_es_file = os.path.join(local_prod_path, pub_prov_es_bn)
        prov_es_info = {}
        with open(prod_prov_es_file) as f:
            try:
                prov_es_info = json.load(f)
            except Exception as e:
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to load PROV-ES from {}: {}\n{}".format(
                        prod_prov_es_file, str(e), tb))
        log_publish_prov_es(
            prov_es_info,
            pub_prov_es_file,
            local_prod_path,
            pub_urls,
            prod_metrics,
            objectid,
        )
        # upload publish PROV-ES file
        osaka.main.put(
            pub_prov_es_file,
            os.path.join(pub_path_url, pub_prov_es_bn),
            params=osaka_params,
            noclobber=False,
        )

    # cleanup publish context
    if publ_ctx_url is not None:
        try:
            osaka.main.rmall(publ_ctx_url, params=osaka_params)
        except Exception:
            logger.warn(
                "Failed to clean up publish context at {} on successful publish."
                .format(publ_ctx_url))
    try:
        shutil.rmtree(publ_ctx_dir)
    except Exception:
        pass

    # queue data dataset
    queue_dataset(ipath, update_json, dataset_processed_queue)

    # return dataset metrics and dataset json
    return (prod_metrics, update_json)
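A dry-run sketch of the ingest entry point above; the object id, GRQ URL, and queue name are hypothetical, and prod_path is assumed to already contain the <name>.dataset.json (and optionally <name>.met.json) files expected above.

import os

prod_path = os.path.abspath("S1-IW_SLC__1SDV_20200101")     # hypothetical local dataset directory
objectid = os.path.basename(prod_path)

prod_metrics, update_json = ingest(
    objectid,
    "/home/ops/verdi/etc/datasets.json",                    # datasets config (default path from earlier examples)
    "http://grq-host:8878/api/v0.1",                        # hypothetical GRQ update URL
    "dataset_processed",                                    # hypothetical processed-dataset queue name
    prod_path,
    None,                                                   # job_path; defaults to os.getcwd() inside ingest
    dry_run=True,
)
print(update_json["id"], prod_metrics.get("ipath"))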
Example #7
def ingest(objectid,
           dsets_file,
           grq_update_url,
           dataset_processed_queue,
           prod_path,
           job_path,
           dry_run=False,
           force=False):
    """Run dataset ingest."""
    logger.info("#" * 80)
    logger.info("datasets: %s" % dsets_file)
    logger.info("grq_update_url: %s" % grq_update_url)
    logger.info("dataset_processed_queue: %s" % dataset_processed_queue)
    logger.info("prod_path: %s" % prod_path)
    logger.info("job_path: %s" % job_path)
    logger.info("dry_run: %s" % dry_run)
    logger.info("force: %s" % force)

    # get dataset
    if os.path.isdir(prod_path):
        local_prod_path = prod_path
    else:
        local_prod_path = get_remote_dav(prod_path)
    if not os.path.isdir(local_prod_path):
        raise RuntimeError("Failed to find local dataset directory: %s" %
                           local_prod_path)

    # dataset name
    pname = os.path.basename(local_prod_path)

    # dataset file
    dataset_file = os.path.join(local_prod_path, '%s.dataset.json' % pname)

    # get dataset json
    with open(dataset_file) as f:
        dataset = json.load(f)
    logger.info("Loaded dataset JSON from file: %s" % dataset_file)

    # check minimum requirements for dataset JSON
    logger.info("Verifying dataset JSON...")
    verify_dataset(dataset)
    logger.info("Dataset JSON verfication succeeded.")

    # get version
    version = dataset['version']

    # recognize
    r = Recognizer(dsets_file, local_prod_path, objectid, version)

    # get ipath
    ipath = r.currentIpath

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match: extractor = match.group(1)
    logger.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(local_prod_path, '%s.met.json' % pname)

    # metadata seed file
    seed_file = os.path.join(local_prod_path, 'met.json')

    # metadata file already here
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)
        logger.info("Loaded metadata from existing file: %s" % metadata_file)
    else:
        if extractor is None:
            logger.info(
                "No metadata extraction configured. Setting empty metadata.")
            metadata = {}
        else:
            logger.info("Running metadata extractor %s on %s" %
                        (extractor, local_prod_path))
            m = check_output([extractor, local_prod_path])
            logger.info("Output: %s" % m)

            # generate json to update metadata and urls
            metadata = json.loads(m)

            # set data_product_name
            metadata['data_product_name'] = objectid

            # merge with seed metadata
            if os.path.exists(seed_file):
                with open(seed_file) as f:
                    seed = json.load(f)
                metadata.update(seed)
                logger.info("Loaded seed metadata from file: %s" % seed_file)

            # write it out to file
            with open(metadata_file, 'w') as f:
                json.dump(metadata, f, indent=2)
            logger.info("Wrote metadata to %s" % metadata_file)

            # delete seed file
            if os.path.exists(seed_file):
                os.unlink(seed_file)
                logger.info("Deleted seed file %s." % seed_file)

    # add context
    context_file = os.path.join(local_prod_path, '%s.context.json' % pname)
    if os.path.exists(context_file):
        with open(context_file) as f:
            context = json.load(f)
        logger.info("Loaded context from existing file: %s" % context_file)
    else:
        context = {}
    metadata['context'] = context

    # set metadata and dataset groups in recognizer
    r.setDataset(dataset)
    r.setMetadata(metadata)

    # get level
    level = r.getLevel()

    # get type
    dtype = r.getType()

    # get publish path
    pub_path_url = r.getPublishPath()

    # get publish urls
    pub_urls = [i for i in r.getPublishUrls()]

    # get S3 profile name and api keys for dataset publishing
    s3_secret_key, s3_access_key = r.getS3Keys()
    s3_profile = r.getS3Profile()

    # set osaka params
    osaka_params = {}

    # S3 profile takes precedence over explicit api keys
    if s3_profile is not None:
        osaka_params['profile_name'] = s3_profile
    else:
        if s3_secret_key is not None and s3_access_key is not None:
            osaka_params['aws_access_key_id'] = s3_access_key
            osaka_params['aws_secret_access_key'] = s3_secret_key

    # get browse path and urls
    browse_path = r.getBrowsePath()
    browse_urls = r.getBrowseUrls()

    # get S3 profile name and api keys for browse image publishing
    s3_secret_key_browse, s3_access_key_browse = r.getS3Keys("browse")
    s3_profile_browse = r.getS3Profile("browse")

    # set osaka params for browse
    osaka_params_browse = {}

    # S3 profile takes precedence over explicit api keys
    if s3_profile_browse is not None:
        osaka_params_browse['profile_name'] = s3_profile_browse
    else:
        if s3_secret_key_browse is not None and s3_access_key_browse is not None:
            osaka_params_browse['aws_access_key_id'] = s3_access_key_browse
            osaka_params_browse['aws_secret_access_key'] = s3_secret_key_browse

    # get pub host and path
    logger.info("Configured pub host & path: %s" % (pub_path_url))

    # check scheme
    if not osaka.main.supported(pub_path_url):
        raise RuntimeError("Scheme %s is currently not supported." %
                           urlparse(pub_path_url).scheme)

    # upload dataset to repo; track disk usage and start/end times of transfer
    prod_dir_usage = get_disk_usage(local_prod_path)
    tx_t1 = datetime.utcnow()
    if dry_run:
        logger.info("Would've published %s to %s" %
                    (local_prod_path, pub_path_url))
    else:
        publish_dataset(local_prod_path,
                        pub_path_url,
                        params=osaka_params,
                        force=force)
    tx_t2 = datetime.utcnow()

    # add metadata for all browse images and upload to browse location
    imgs_metadata = []
    imgs = glob('%s/*browse.png' % local_prod_path)
    for img in imgs:
        img_metadata = {'img': os.path.basename(img)}
        small_img = img.replace('browse.png', 'browse_small.png')
        if os.path.exists(small_img):
            small_img_basename = os.path.basename(small_img)
            if browse_path is not None:
                this_browse_path = os.path.join(browse_path,
                                                small_img_basename)
                if dry_run:
                    logger.info("Would've uploaded %s to %s" %
                                (small_img, browse_path))
                else:
                    logger.info("Uploading %s to %s" %
                                (small_img, browse_path))
                    osaka.main.put(small_img,
                                   this_browse_path,
                                   params=osaka_params_browse,
                                   noclobber=False)
        else:
            small_img_basename = None
        img_metadata['small_img'] = small_img_basename
        tooltip_match = BROWSE_RE.search(img_metadata['img'])
        if tooltip_match: img_metadata['tooltip'] = tooltip_match.group(1)
        else: img_metadata['tooltip'] = ""
        imgs_metadata.append(img_metadata)

    # sort browse images
    browse_sort_order = r.getBrowseSortOrder()
    if isinstance(browse_sort_order, list) and len(browse_sort_order) > 0:
        bso_regexes = [re.compile(i) for i in browse_sort_order]
        sorter = {}
        unrecognized = []
        for img in imgs_metadata:
            matched = None
            for i, bso_re in enumerate(bso_regexes):
                if bso_re.search(img['img']):
                    matched = img
                    sorter[i] = matched
                    break
            if matched is None: unrecognized.append(img)
        imgs_metadata = [sorter[i] for i in sorted(sorter)]
        imgs_metadata.extend(unrecognized)

    # save dataset metrics on size and transfer
    tx_dur = (tx_t2 - tx_t1).total_seconds()
    prod_metrics = {
        'ipath': ipath,
        'url': urlparse(pub_path_url).path,
        'path': local_prod_path,
        'disk_usage': prod_dir_usage,
        'time_start': tx_t1.isoformat() + 'Z',
        'time_end': tx_t2.isoformat() + 'Z',
        'duration': tx_dur,
        'transfer_rate': prod_dir_usage / tx_dur
    }

    # set update json
    ipath = r.currentIpath
    update_json = {
        'id': objectid,
        'objectid': objectid,
        'metadata': metadata,
        'urls': pub_urls,
        'browse_urls': browse_urls,
        'images': imgs_metadata,
        'dataset': ipath.split('/')[1],
        'ipath': ipath,
        'system_version': version,
        'dataset_level': level,
        'dataset_type': dtype,
    }
    update_json.update(dataset)
    #logger.info("update_json: %s" % pformat(update_json))

    # custom index specified?
    index = r.getIndex()
    if index is not None: update_json['index'] = index

    # update GRQ
    if isinstance(update_json['metadata'],
                  dict) and len(update_json['metadata']) > 0:
        #logger.info("update_json: %s" % pformat(update_json))
        if dry_run:
            logger.info("Would've indexed doc at %s: %s" %
                        (grq_update_url,
                         json.dumps(update_json, indent=2, sort_keys=True)))
        else:
            res = index_dataset(grq_update_url, update_json)
            logger.info("res: %s" % res)
            update_json['grq_index_result'] = res

    # finish if dry run
    if dry_run: return (prod_metrics, update_json)

    # create PROV-ES JSON file for publish processStep
    prod_prov_es_file = os.path.join(
        local_prod_path, '%s.prov_es.json' % os.path.basename(local_prod_path))
    pub_prov_es_bn = "publish.prov_es.json"
    if os.path.exists(prod_prov_es_file):
        pub_prov_es_file = os.path.join(local_prod_path, pub_prov_es_bn)
        prov_es_info = {}
        with open(prod_prov_es_file) as f:
            try:
                prov_es_info = json.load(f)
            except Exception as e:
                tb = traceback.format_exc()
                raise RuntimeError("Failed to load PROV-ES from %s: %s\n%s" %
                                   (prod_prov_es_file, str(e), tb))
        log_publish_prov_es(prov_es_info, pub_prov_es_file, local_prod_path,
                            pub_urls, prod_metrics, objectid)
        # upload publish PROV-ES file
        osaka.main.put(pub_prov_es_file,
                       os.path.join(pub_path_url, pub_prov_es_bn),
                       params=osaka_params,
                       noclobber=False)