def run_extractor(dsets_file, prod_path, ctx):
    """Run extractor configured in datasets JSON config."""

    logging.info("datasets: %s" % dsets_file)
    logging.info("prod_path: %s" % prod_path)

    # get settings
    settings_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'settings.json')
    settings = json.load(open(settings_file))

    # recognize
    r = Recognizer(dsets_file, prod_path, os.path.basename(prod_path),
                   settings["EXTRACT_VERSION"])
    objectid = r.getId()

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match:
            extractor = match.group(1)
    logging.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(prod_path, '%s.met.json' %
                                 os.path.basename(prod_path))
    dataset_file = os.path.join(prod_path, '%s.dataset.json' %
                                os.path.basename(prod_path))

    # load metadata
    metadata = {}
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)
    m = {}

    # run extractor
    if extractor is None:
        logging.info("No metadata extraction configured.")
    else:
        logging.info("Running metadata extractor %s on %s" %
                     (extractor, prod_path))
        m = check_output([extractor, prod_path])
        if os.path.exists(metadata_file):
            with open(metadata_file) as f:
                metadata.update(json.load(f))

    # set data_product_name
    metadata['data_product_name'] = objectid

    # set download url from context
    localize_urls = ctx.get('localize_urls', [])
    if len(localize_urls) > 0:
        metadata['download_url'] = localize_urls[0]['url']

    # write it out to file
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)
    logging.info("Wrote metadata to %s" % metadata_file)

    # Build datasets and add in "optional" fields, if not already created by extractor
    if not os.path.exists(dataset_file):
        datasets = {"version": settings["EXTRACT_VERSION"]}
        for key in ["location", "starttime", "endtime", "label"]:
            if key in m:
                datasets[key] = m[key]
        # write it out to file
        with open(dataset_file, 'w') as f:
            json.dump(datasets, f, indent=2)
        logging.info("Wrote dataset to %s" % dataset_file)
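
# Illustrative call pattern for the context-driven run_extractor variant above.
# This is a hedged sketch only; the context file name and paths below are
# hypothetical placeholders, not values defined in this module:
#
#   ctx = json.load(open("_context.json"))                # job context with localize_urls
#   run_extractor("/home/ops/verdi/etc/datasets.json",    # datasets JSON config
#                 "/tmp/PRODUCT-20200101",                # local product directory
#                 ctx)
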
def run_extractor(dsets_file, prod_path, url, ctx, md5_hash):
    """Run extractor configured in datasets JSON config."""

    logging.info("datasets: %s" % dsets_file)
    logging.info("prod_path: %s" % prod_path)

    # get settings; fall back to defaults if settings.json cannot be read
    settings = {}
    try:
        settings_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                     'settings.json')
        settings = json.load(open(settings_file))
    except Exception:
        settings['DATASETS_CFG'] = "/home/ops/verdi/etc/datasets.json"
        settings["INCOMING_VERSION"] = "v0.1"
        settings["EXTRACT_VERSION"] = "v0.1"
        settings["ACQ_TO_DSET_MAP"] = {"acquisition-S1-IW_SLC": "S1-IW_SLC"}

    # recognize
    r = Recognizer(dsets_file, prod_path, os.path.basename(prod_path),
                   settings["EXTRACT_VERSION"])
    objectid = r.getId()

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match:
            extractor = match.group(1)
    logging.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(prod_path, '%s.met.json' %
                                 os.path.basename(prod_path))
    dataset_file = os.path.join(prod_path, '%s.dataset.json' %
                                os.path.basename(prod_path))

    # write md5 hash to <product>.zip.md5 since the product passed verification
    with open(os.path.join(prod_path, '%s.zip.md5' % os.path.basename(prod_path)),
              'w') as md5_file:
        md5_file.write(md5_hash)

    # load metadata
    metadata = {}
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)
    m = {}

    # run extractor
    if extractor is None:
        logging.info("No metadata extraction configured.")
    else:
        logging.info("Running metadata extractor %s on %s" %
                     (extractor, prod_path))
        try:
            m = check_output([extractor, prod_path])
        except CalledProcessError as e:
            # try to pull a more specific error message out of known PGE logs
            err_msg = str(e)
            root_dir = os.getcwd()
            logging.info("root_dir with getcwd() : %s" % root_dir)
            if not root_dir.endswith("Z"):
                root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
            logging.info("root_dir final : %s" % root_dir)
            prov_log = os.path.join(root_dir, 'create_prov_es.log')
            split_log = os.path.join(root_dir, 'split_swath_products.log')
            logging.info("%s\n%s" % (prov_log, split_log))
            if os.path.isfile(prov_log):
                prov_err = get_log_err(prov_log)
                if prov_err:
                    err_msg = prov_err
            elif os.path.isfile(split_log):
                split_err = get_log_err(split_log)
                if split_err:
                    err_msg = split_err
            else:
                logging.info("%s file NOT found" % split_log)
            raise RuntimeError("command '{}' returned with error (code {}): {}".format(
                e.cmd, e.returncode, err_msg))
        if os.path.exists(metadata_file):
            with open(metadata_file) as f:
                metadata.update(json.load(f))

    # set data_product_name
    metadata['data_product_name'] = objectid

    # set download url from context
    metadata['download_url'] = url

    # add md5 hash to metadata
    metadata['md5_hash'] = md5_hash

    # write it out to file
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)
    logging.info("Wrote metadata to %s" % metadata_file)

    # Build datasets and add in "optional" fields, if not already created by extractor
    if not os.path.exists(dataset_file):
        datasets = {"version": settings["EXTRACT_VERSION"]}
        for key in ["location", "starttime", "endtime", "label"]:
            if key in m:
                datasets[key] = m[key]
        # write it out to file
        with open(dataset_file, 'w') as f:
            json.dump(datasets, f, indent=2)
        logging.info("Wrote dataset to %s" % dataset_file)
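
# Illustrative call pattern for the md5-aware run_extractor variant above. This is a
# hedged sketch; the URL, hash, and paths are hypothetical placeholders, and ctx is
# accepted by the signature but not referenced in this variant's body:
#
#   run_extractor("/home/ops/verdi/etc/datasets.json",      # datasets JSON config
#                 "/tmp/PRODUCT-20200101",                  # local product directory
#                 "http://example.com/PRODUCT-20200101.zip",  # download URL recorded in metadata
#                 ctx,                                      # job context (unused here)
#                 "d41d8cd98f00b204e9800998ecf8427e")       # md5 hash written to <product>.zip.md5
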
def ingest(objectid, dsets_file, grq_update_url, dataset_processed_queue,
           prod_path, job_path, dry_run=False, force=False):
    """Run dataset ingest."""

    logger.info("#" * 80)
    logger.info("datasets: %s" % dsets_file)
    logger.info("grq_update_url: %s" % grq_update_url)
    logger.info("dataset_processed_queue: %s" % dataset_processed_queue)
    logger.info("prod_path: %s" % prod_path)
    logger.info("job_path: %s" % job_path)
    logger.info("dry_run: %s" % dry_run)
    logger.info("force: %s" % force)

    # get default job path
    if job_path is None:
        job_path = os.getcwd()

    # detect job info
    job = {}
    job_json = os.path.join(job_path, '_job.json')
    if os.path.exists(job_json):
        with open(job_json) as f:
            try:
                job = json.load(f)
            except Exception as e:
                logger.warn("Failed to read job json:\n{}".format(str(e)))
    task_id = job.get('task_id', None)
    payload_id = job.get('job_info', {}).get('job_payload', {}).get(
        'payload_task_id', None)
    payload_hash = job.get('job_info', {}).get('payload_hash', None)
    logger.info("task_id: %s" % task_id)
    logger.info("payload_id: %s" % payload_id)
    logger.info("payload_hash: %s" % payload_hash)

    # get dataset
    if os.path.isdir(prod_path):
        local_prod_path = prod_path
    else:
        local_prod_path = get_remote_dav(prod_path)
    if not os.path.isdir(local_prod_path):
        raise RuntimeError("Failed to find local dataset directory: %s" %
                           local_prod_path)

    # write publish context
    publ_ctx_name = "_publish.context.json"
    publ_ctx_dir = mkdtemp(prefix=".pub_context", dir=job_path)
    publ_ctx_file = os.path.join(publ_ctx_dir, publ_ctx_name)
    with open(publ_ctx_file, 'w') as f:
        json.dump({
            'payload_id': payload_id,
            'payload_hash': payload_hash,
            'task_id': task_id
        }, f, indent=2, sort_keys=True)
    publ_ctx_url = None

    # dataset name
    pname = os.path.basename(local_prod_path)

    # dataset file
    dataset_file = os.path.join(local_prod_path, '%s.dataset.json' % pname)

    # get dataset json
    with open(dataset_file) as f:
        dataset = json.load(f)
    logger.info("Loaded dataset JSON from file: %s" % dataset_file)

    # check minimum requirements for dataset JSON
    logger.info("Verifying dataset JSON...")
    verify_dataset(dataset)
    logger.info("Dataset JSON verification succeeded.")

    # get version
    version = dataset['version']

    # recognize
    r = Recognizer(dsets_file, local_prod_path, objectid, version)

    # get ipath
    ipath = r.currentIpath

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match:
            extractor = match.group(1)
    logger.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(local_prod_path, '%s.met.json' % pname)

    # metadata seed file
    seed_file = os.path.join(local_prod_path, 'met.json')

    # metadata file already here?
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)
        logger.info("Loaded metadata from existing file: %s" % metadata_file)
    else:
        if extractor is None:
            logger.info("No metadata extraction configured. Setting empty metadata.")
Setting empty metadata.") metadata = {} else: logger.info("Running metadata extractor %s on %s" % (extractor, local_prod_path)) m = check_output([extractor, local_prod_path]) logger.info("Output: %s" % m.decode()) # generate json to update metadata and urls metadata = json.loads(m) # set data_product_name metadata['data_product_name'] = objectid # merge with seed metadata if os.path.exists(seed_file): with open(seed_file) as f: seed = json.load(f) metadata.update(seed) logger.info("Loaded seed metadata from file: %s" % seed_file) # write it out to file with open(metadata_file, 'w') as f: json.dump(metadata, f, indent=2) logger.info("Wrote metadata to %s" % metadata_file) # delete seed file if os.path.exists(seed_file): os.unlink(seed_file) logger.info("Deleted seed file %s." % seed_file) # add context context_file = os.path.join(local_prod_path, '%s.context.json' % pname) if os.path.exists(context_file): with open(context_file) as f: context = json.load(f) logger.info("Loaded context from existing file: %s" % context_file) else: context = {} metadata['context'] = context # set metadata and dataset groups in recognizer r.setDataset(dataset) r.setMetadata(metadata) # get level level = r.getLevel() # get type dtype = r.getType() # set product metrics prod_metrics = {'ipath': ipath, 'path': local_prod_path} # publish dataset if r.publishConfigured(): logger.info("Dataset publish is configured.") # get publish path pub_path_url = r.getPublishPath() # get publish urls pub_urls = [i for i in r.getPublishUrls()] # get S3 profile name and api keys for dataset publishing s3_secret_key, s3_access_key = r.getS3Keys() s3_profile = r.getS3Profile() # set osaka params osaka_params = {} # S3 profile takes precedence over explicit api keys if s3_profile is not None: osaka_params['profile_name'] = s3_profile else: if s3_secret_key is not None and s3_access_key is not None: osaka_params['aws_access_key_id'] = s3_access_key osaka_params['aws_secret_access_key'] = s3_secret_key # get pub host and path logger.info("Configured pub host & path: %s" % (pub_path_url)) # check scheme if not osaka.main.supported(pub_path_url): raise RuntimeError("Scheme %s is currently not supported." % urlparse(pub_path_url).scheme) # upload dataset to repo; track disk usage and start/end times of transfer prod_dir_usage = get_disk_usage(local_prod_path) tx_t1 = datetime.utcnow() if dry_run: logger.info("Would've published %s to %s" % (local_prod_path, pub_path_url)) else: publ_ctx_url = os.path.join(pub_path_url, publ_ctx_name) orig_publ_ctx_file = publ_ctx_file + '.orig' try: publish_dataset(local_prod_path, pub_path_url, params=osaka_params, force=force, publ_ctx_file=publ_ctx_file, publ_ctx_url=publ_ctx_url) except NoClobberPublishContextException as e: logger.warn( "A publish context file was found at {}. Retrieving.". 
                osaka.main.get(publ_ctx_url, orig_publ_ctx_file,
                               params=osaka_params)
                with open(orig_publ_ctx_file) as f:
                    orig_publ_ctx = json.load(f)
                logger.warn("original publish context: {}".format(
                    json.dumps(orig_publ_ctx, indent=2, sort_keys=True)))
                orig_payload_id = orig_publ_ctx.get('payload_id', None)
                orig_payload_hash = orig_publ_ctx.get('payload_hash', None)
                orig_task_id = orig_publ_ctx.get('task_id', None)
                logger.warn("orig payload_id: {}".format(orig_payload_id))
                logger.warn("orig payload_hash: {}".format(orig_payload_hash))
                logger.warn("orig task_id: {}".format(orig_task_id))
                if orig_payload_id is None:
                    raise

                # overwrite if this job is a retry of the previous job
                if payload_id is not None and payload_id == orig_payload_id:
                    msg = "This job is a retry of a previous job that resulted " + \
                          "in an orphaned dataset. Forcing publish."
                    logger.warn(msg)
                    log_custom_event(
                        'orphaned_dataset-retry_previous_failed', 'clobber', {
                            'orphan_info': {
                                'payload_id': payload_id,
                                'payload_hash': payload_hash,
                                'task_id': task_id,
                                'orig_payload_id': orig_payload_id,
                                'orig_payload_hash': orig_payload_hash,
                                'orig_task_id': orig_task_id,
                                'dataset_id': objectid,
                                'dataset_url': pub_path_url,
                                'msg': msg
                            }
                        })
                else:
                    job_status = get_job_status(orig_payload_id)
                    logger.warn("orig job status: {}".format(job_status))

                    # overwrite if previous job failed
                    if job_status == "job-failed":
                        msg = "Detected previous job failure that resulted in an " + \
                              "orphaned dataset. Forcing publish."
                        logger.warn(msg)
                        log_custom_event(
                            'orphaned_dataset-job_failed', 'clobber', {
                                'orphan_info': {
                                    'payload_id': payload_id,
                                    'payload_hash': payload_hash,
                                    'task_id': task_id,
                                    'orig_payload_id': orig_payload_id,
                                    'orig_payload_hash': orig_payload_hash,
                                    'orig_task_id': orig_task_id,
                                    'orig_status': job_status,
                                    'dataset_id': objectid,
                                    'dataset_url': pub_path_url,
                                    'msg': msg
                                }
                            })
                    else:
                        # overwrite if dataset doesn't exist in grq
                        if not dataset_exists(objectid):
                            msg = "Detected orphaned dataset without ES doc. Forcing publish."
                            logger.warn(msg)
                            log_custom_event(
                                'orphaned_dataset-no_es_doc', 'clobber', {
                                    'orphan_info': {
                                        'payload_id': payload_id,
                                        'payload_hash': payload_hash,
                                        'task_id': task_id,
                                        'dataset_id': objectid,
                                        'dataset_url': pub_path_url,
                                        'msg': msg
                                    }
                                })
                        else:
                            raise
                publish_dataset(local_prod_path, pub_path_url, params=osaka_params,
                                force=True, publ_ctx_file=publ_ctx_file,
                                publ_ctx_url=publ_ctx_url)
            except osaka.utils.NoClobberException as e:
                if dataset_exists(objectid):
                    try:
                        osaka.main.rmall(publ_ctx_url, params=osaka_params)
                    except Exception:
                        logger.warn("Failed to clean up publish context {} after "
                                    "attempting to clobber valid dataset.".format(
                                        publ_ctx_url))
                    raise
                else:
                    msg = "Detected orphaned dataset without ES doc. Forcing publish."
                    logger.warn(msg)
                    log_custom_event(
                        'orphaned_dataset-no_es_doc', 'clobber', {
                            'orphan_info': {
                                'payload_id': payload_id,
                                'payload_hash': payload_hash,
                                'task_id': task_id,
                                'dataset_id': objectid,
                                'dataset_url': pub_path_url,
                                'msg': msg
                            }
                        })
                    publish_dataset(local_prod_path, pub_path_url, params=osaka_params,
                                    force=True, publ_ctx_file=publ_ctx_file,
                                    publ_ctx_url=publ_ctx_url)
        tx_t2 = datetime.utcnow()
        tx_dur = (tx_t2 - tx_t1).total_seconds()

        # save dataset metrics on size and transfer
        prod_metrics.update({
            'url': urlparse(pub_path_url).path,
            'disk_usage': prod_dir_usage,
            'time_start': tx_t1.isoformat() + 'Z',
            'time_end': tx_t2.isoformat() + 'Z',
            'duration': tx_dur,
            'transfer_rate': prod_dir_usage / tx_dur
        })
    else:
        logger.info("Dataset publish is not configured.")
        pub_urls = []

    # publish browse
    if r.browseConfigured():
        logger.info("Browse publish is configured.")

        # get browse path and urls
        browse_path = r.getBrowsePath()
        browse_urls = r.getBrowseUrls()

        # get S3 profile name and api keys for browse image publishing
        s3_secret_key_browse, s3_access_key_browse = r.getS3Keys("browse")
        s3_profile_browse = r.getS3Profile("browse")

        # set osaka params for browse
        osaka_params_browse = {}

        # S3 profile takes precedence over explicit api keys
        if s3_profile_browse is not None:
            osaka_params_browse['profile_name'] = s3_profile_browse
        else:
            if s3_secret_key_browse is not None and s3_access_key_browse is not None:
                osaka_params_browse['aws_access_key_id'] = s3_access_key_browse
                osaka_params_browse['aws_secret_access_key'] = s3_secret_key_browse

        # add metadata for all browse images and upload to browse location
        imgs_metadata = []
        imgs = glob('%s/*browse.png' % local_prod_path)
        for img in imgs:
            img_metadata = {'img': os.path.basename(img)}
            small_img = img.replace('browse.png', 'browse_small.png')
            if os.path.exists(small_img):
                small_img_basename = os.path.basename(small_img)
                if browse_path is not None:
                    this_browse_path = os.path.join(browse_path,
                                                    small_img_basename)
                    if dry_run:
                        logger.info("Would've uploaded %s to %s" %
                                    (small_img, browse_path))
                    else:
                        logger.info("Uploading %s to %s" %
                                    (small_img, browse_path))
                        osaka.main.put(small_img, this_browse_path,
                                       params=osaka_params_browse,
                                       noclobber=False)
            else:
                small_img_basename = None
            img_metadata['small_img'] = small_img_basename
            tooltip_match = BROWSE_RE.search(img_metadata['img'])
            if tooltip_match:
                img_metadata['tooltip'] = tooltip_match.group(1)
            else:
                img_metadata['tooltip'] = ""
            imgs_metadata.append(img_metadata)

        # sort browse images
        browse_sort_order = r.getBrowseSortOrder()
        if isinstance(browse_sort_order, list) and len(browse_sort_order) > 0:
            bso_regexes = [re.compile(i) for i in browse_sort_order]
            sorter = {}
            unrecognized = []
            for img in imgs_metadata:
                matched = None
                for i, bso_re in enumerate(bso_regexes):
                    if bso_re.search(img['img']):
                        matched = img
                        sorter[i] = matched
                        break
                if matched is None:
                    unrecognized.append(img)
            imgs_metadata = [sorter[i] for i in sorted(sorter)]
            imgs_metadata.extend(unrecognized)
    else:
        logger.info("Browse publish is not configured.")
        browse_urls = []
        imgs_metadata = []

    # set update json
    update_json = {
        'id': objectid,
        'objectid': objectid,
        'metadata': metadata,
        'dataset': ipath.split('/')[1],
        'ipath': ipath,
        'system_version': version,
        'dataset_level': level,
        'dataset_type': dtype,
        'urls': pub_urls,
        'browse_urls': browse_urls,
        'images': imgs_metadata,
        'prov': context.get('_prov', {}),
    }
    update_json.update(dataset)
    #logger.info("update_json: %s" % pformat(update_json))

    # custom index specified?
    index = r.getIndex()
    if index is not None:
        update_json['index'] = index

    # update GRQ
    if isinstance(update_json['metadata'], dict) and len(update_json['metadata']) > 0:
        #logger.info("update_json: %s" % pformat(update_json))
        if dry_run:
            logger.info("Would've indexed doc at %s: %s" %
                        (grq_update_url,
                         json.dumps(update_json, indent=2, sort_keys=True)))
        else:
            res = index_dataset(grq_update_url, update_json)
            logger.info("res: %s" % res)
            update_json['grq_index_result'] = res

    # finish if dry run
    if dry_run:
        try:
            shutil.rmtree(publ_ctx_dir)
        except Exception:
            pass
        return (prod_metrics, update_json)

    # create PROV-ES JSON file for publish processStep
    prod_prov_es_file = os.path.join(
        local_prod_path, '%s.prov_es.json' % os.path.basename(local_prod_path))
    pub_prov_es_bn = "publish.prov_es.json"
    if os.path.exists(prod_prov_es_file):
        pub_prov_es_file = os.path.join(local_prod_path, pub_prov_es_bn)
        prov_es_info = {}
        with open(prod_prov_es_file) as f:
            try:
                prov_es_info = json.load(f)
            except Exception as e:
                tb = traceback.format_exc()
                raise RuntimeError("Failed to load PROV-ES from {}: {}\n{}".format(
                    prod_prov_es_file, str(e), tb))
        log_publish_prov_es(prov_es_info, pub_prov_es_file, local_prod_path,
                            pub_urls, prod_metrics, objectid)

        # upload publish PROV-ES file
        osaka.main.put(pub_prov_es_file,
                       os.path.join(pub_path_url, pub_prov_es_bn),
                       params=osaka_params, noclobber=False)

    # cleanup publish context
    if publ_ctx_url is not None:
        try:
            osaka.main.rmall(publ_ctx_url, params=osaka_params)
        except Exception:
            logger.warn("Failed to clean up publish context at {} on "
                        "successful publish.".format(publ_ctx_url))
    try:
        shutil.rmtree(publ_ctx_dir)
    except Exception:
        pass

    # queue dataset
    queue_dataset(ipath, update_json, dataset_processed_queue)

    # return dataset metrics and dataset json
    return (prod_metrics, update_json)
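
# Illustrative call pattern for ingest() above. This is a hedged sketch only; the GRQ
# endpoint, queue name, and paths below are hypothetical placeholders:
#
#   prod_metrics, update_json = ingest(
#       "PRODUCT-20200101",                        # objectid
#       "/home/ops/verdi/etc/datasets.json",       # datasets JSON config
#       "http://grq.example.com/api/grq_update",   # grq_update_url
#       "dataset_processed",                       # dataset_processed_queue
#       "/tmp/PRODUCT-20200101",                   # local dir or remote DAV product path
#       None,                                      # job_path: None defaults to os.getcwd()
#       dry_run=True)                              # exercise the flow without publishing or indexing
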