def extract_tdb(group, params=None):
    record = {}
    for data_file in group:
        material = {}
        calphad = {}
        # Attempt to read the file
        try:
            calphad_db = pycalphad.Database(data_file)
            composition = ""
            for element in calphad_db.elements:
                if element.isalnum():
                    element = element.lower()
                    element = element[0].upper() + element[1:]
                    composition += element
            phases = list(calphad_db.phases.keys())
            if composition:
                material['composition'] = composition
            if phases:
                calphad['phases'] = phases
        except Exception:
            pass
        else:
            # Add to record
            if material:
                record = mdf_toolbox.dict_merge(record, {"material": material})
            if calphad:
                record = mdf_toolbox.dict_merge(record, {"calphad": calphad})
    return record
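
# Illustrative sketch (not from the source): the rough shape of a record this extractor
# would build for a hypothetical Al-Ni thermodynamic database (.tdb) file. The element
# symbols and phase names below are made up for illustration.
example_tdb_record = {
    "material": {"composition": "AlNi"},
    "calphad": {"phases": ["LIQUID", "FCC_A1", "BCC_B2"]}
}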
def extract_crystal_structure(group, params=None):
    """Extractor for the crystal_structure block.

    Will also populate material block.

    Arguments:
        group (list of str): The paths to grouped files.
        params (dict): N/A

    Returns:
        dict: The record extracted.
    """
    record = {}
    for data_file in group:
        material = {}
        crystal_structure = {}
        # Attempt to read the file
        try:
            # Read with ASE
            ase_res = ase.io.read(data_file)
            # Check data read, validate crystal structure
            if not ase_res or not all(ase_res.get_pbc()):
                raise ValueError("No valid data")
            else:
                # Convert ASE Atoms to Pymatgen Structure
                pmg_s = ase_to_pmg.get_structure(ase_res)
        # ASE failed to read file
        except Exception:
            try:
                # Read with Pymatgen
                pmg_s = pymatgen.Structure.from_file(data_file)
            except Exception:
                # Can't read file
                continue

        # Extract material block
        material["composition"] = pmg_s.formula.replace(" ", "")

        # Extract crystal_structure block
        crystal_structure["space_group_number"] = pmg_s.get_space_group_info()[1]
        crystal_structure["number_of_atoms"] = float(pmg_s.composition.num_atoms)
        crystal_structure["volume"] = float(pmg_s.volume)
        crystal_structure["stoichiometry"] = pmg_s.composition.anonymized_formula

        # Add to record
        record = mdf_toolbox.dict_merge(record, {
            "material": material,
            "crystal_structure": crystal_structure
        })
    return record
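
# Illustrative sketch (not from the source): the rough shape of a record this extractor
# would produce for a single structure file. All values below are hypothetical (a
# rocksalt-like NaCl primitive cell), shown only to document the output format.
example_structure_record = {
    "material": {"composition": "NaCl"},
    "crystal_structure": {
        "space_group_number": 225,
        "number_of_atoms": 2.0,
        "volume": 44.9,
        "stoichiometry": "AB"
    }
}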
def get_translations(self):
    return dict_merge(
        super().get_translations(),
        {
            "dft": {
                "Converged": ("converged", bool),
                "XC_Functional": ("exchange_correlation_functional", str),
                "Cutoff_Energy_eV": ("cutoff_energy", float)
            },
            "crystal_structure": {
                "Space_group_number": ("space_group_number", int),
                "Number_of_atoms_in_unit_cell": ("number_of_atoms", float),
                "Unit_cell_volume_AA_3": ("volume", float)
            },
        })
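
# Illustrative sketch (not from the source): one way a translation map of the shape
# returned above (source key -> (target key, type)) could be applied to raw data.
# `apply_translations` and `raw` are hypothetical; the real consumer of
# get_translations() is not shown in this snippet.
def apply_translations(raw, translations):
    out = {}
    for block, fields in translations.items():
        for source_key, (target_key, cast) in fields.items():
            if source_key in raw:
                # Place the casted value under its target name in the target block
                out.setdefault(block, {})[target_key] = cast(raw[source_key])
    return out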
def update_action_status(table_name, action_id, updates, overwrite=False):
    """Update action entry in status database.

    Arguments:
        table_name (str): The name of the table to update.
        action_id (dict): The ID for the action.
        updates (dict): The updates to apply to the action status.
        overwrite (bool): When False, will merge the updates into the existing status,
                overwriting only existing values.
                When True, will delete the existing status entirely and replace it
                with the updates.
                Default False.

    Returns:
        dict: The updated action status. Raises exception on any failure.
    """
    # Verify old status exists and save it
    old_status = read_action_status(table_name, action_id)
    # Merge updates into old_status if not overwriting
    if not overwrite:
        # dict_merge(base, addition) returns base keys unchanged, addition keys added
        full_updates = mdf_toolbox.dict_merge(updates, old_status)
    else:
        full_updates = updates

    # TODO: Validate updates
    update_errors = []
    if update_errors:
        raise err.InvalidRequest(*update_errors)

    # Update in DB (.put_item() overwrites)
    table = get_dmo_table(table_name)
    try:
        table.put_item(Item=full_updates)
    except Exception as e:
        logger.error("Error updating status for '{}': {}".format(action_id, str(e)))
        raise err.ServiceError(str(e))

    logger.debug("{}: Action status updated: {}".format(action_id, updates))
    return full_updates
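
# Illustrative sketch (not from the source): with mdf_toolbox.dict_merge(base, addition),
# values already present in `base` win on conflict, so passing `updates` as the base
# makes the new values take precedence over the saved status while untouched fields are
# preserved. The status data below is hypothetical.
old_status = {"status": "PENDING", "details": {"step": 1, "note": "queued"}}
updates = {"status": "ACTIVE", "details": {"step": 2}}
merged = mdf_toolbox.dict_merge(updates, old_status)
# merged == {"status": "ACTIVE", "details": {"step": 2, "note": "queued"}}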
def parse(group, **params):
    record = {}
    for data_file in group:
        material = {}
        crystal_structure = {}
        # Attempt to read the file
        try:
            # Read with ASE
            ase_res = ase.io.read(data_file)
            # Check data read, validate crystal structure
            if not ase_res or not all(ase_res.get_pbc()):
                raise ValueError("No valid data")
            else:
                # Convert ASE Atoms to Pymatgen Structure
                pmg_s = ase_to_pmg.get_structure(ase_res)
        # ASE failed to read file
        except Exception:
            try:
                # Read with Pymatgen
                pmg_s = pymatgen.Structure.from_file(data_file)
            except Exception:
                # Can't read file
                continue

        # Parse material block
        material["composition"] = pmg_s.formula.replace(" ", "")

        # Parse crystal_structure block
        crystal_structure["space_group_number"] = pmg_s.get_space_group_info()[1]
        crystal_structure["number_of_atoms"] = float(pmg_s.composition.num_atoms)
        crystal_structure["volume"] = float(pmg_s.volume)
        crystal_structure["stoichiometry"] = pmg_s.composition.anonymized_formula

        # Add to record
        record = mdf_toolbox.dict_merge(record, {
            "material": material,
            "crystal_structure": crystal_structure
        })
    return record
from mdf_toolbox import dict_merge

from .acl_config import DEFAULT_ACLS
from .base_config import BASE_CONFIG
from .catalog_config import KNOWN_CATALOGS
from .keys import KEYS
from .schemas import INPUT_SCHEMA, OUTPUT_SCHEMA

# Config setup
CONFIG = {
    "INPUT_SCHEMA": INPUT_SCHEMA,
    "OUTPUT_SCHEMA": OUTPUT_SCHEMA,
    "DEFAULT_ACLS": DEFAULT_ACLS,
    "KNOWN_CATALOGS": KNOWN_CATALOGS
}
CONFIG = dict_merge(BASE_CONFIG, CONFIG)
CONFIG = dict_merge(KEYS, CONFIG)
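
# Illustrative sketch (not from the source): dict_merge keeps the values of its first
# (base) argument on conflict, so each successive dict_merge(layer, config) call lets
# that layer override what is already in config. The names and values below are
# hypothetical, chosen only to show the precedence order of the layering above.
defaults = {"TIMEOUT": 30, "DEBUG": False}
keys = {"TIMEOUT": 60}
config = {"DEBUG": True}
config = dict_merge(defaults, config)   # {"TIMEOUT": 30, "DEBUG": False}
config = dict_merge(keys, config)       # {"TIMEOUT": 60, "DEBUG": False}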
def _ex_search(self, limit=None, info=False, retries=3):
    """Execute a search and return the results, up to the ``SEARCH_LIMIT``.

    Uses the query currently in this SearchHelper.

    Arguments:
        limit (int): Maximum number of entries to return. **Default**: ``10`` for
                basic queries, and ``10000`` for advanced.
        info (bool): If ``False``, search will return a list of the results.
                If ``True``, search will return a tuple containing the results list
                and other information about the query.
                **Default:** ``False``.
        retries (int): The number of times to retry a Search query if it fails.
                **Default:** 3.

    Returns:
        If ``info`` is ``False``, *list*: The search results.
        If ``info`` is ``True``, *tuple*: The search results,
        and a dictionary of query information.
    """
    # Make sure there is query information present
    if not self.initialized:
        raise ValueError('No query has been set.')

    # Create Search-ready query
    if limit is not None:
        self.__query["limit"] = limit
    query = _validate_query(self.__query)

    tries = 0
    errors = []
    while True:
        # Try searching until success or `retries` number of failures
        # Raise exception after `retries` failures
        try:
            search_res = self.__search_client.post_search(self.index, query)
        except globus_sdk.SearchAPIError as e:
            if tries >= retries:
                raise
            else:
                errors.append(repr(e))
        except Exception as e:
            if tries >= retries:
                raise
            else:
                errors.append(repr(e))
        else:
            break
        tries += 1

    # Remove the wrapping on each entry from Globus Search
    res = mdf_toolbox.gmeta_pop(search_res, info=info)
    # Add more information to output if requested
    if info:
        # Add everything from the query itself
        info_dict = mdf_toolbox.dict_merge(res[1], query)
        # But rename "q" to "query" for clarity
        info_dict["query"] = info_dict.pop("q")
        # Add other useful/interesting parameters
        info_dict["index_uuid"] = self.index
        info_dict["retries"] = tries
        info_dict["errors"] = errors
        # Remake tuple because tuples don't support assignment
        res = (res[0], info_dict)
    return res
def group_tree(root, config):
    """Run group_files on files in tree appropriately."""
    files = []
    dirs = []
    if root == "/dev/null":
        return []
    for node in os.listdir(root):
        node_path = os.path.join(root, node)
        if node == "mdf.json":
            with open(node_path) as f:
                try:
                    new_config = json.load(f)
                    logger.debug("Config updating: \n{}".format(new_config))
                except Exception as e:
                    logger.warning("Error reading config file '{}': {}".format(
                        node_path, str(e)))
                else:
                    config = mdf_toolbox.dict_merge(new_config, config)
        elif os.path.isfile(node_path):
            files.append(node_path)
        elif os.path.isdir(node_path):
            dirs.append(node_path)
        else:
            logger.debug("Ignoring non-file, non-dir node '{}'".format(node_path))

    # Group the files
    # list "groups" is list of dict, each dict contains actual file list + extractor info/config
    groups = []
    # Group by dir overrides other grouping
    if config.get("group_by_dir"):
        groups.append({"files": files, "extractors": [], "params": {}})
    else:
        for format_rules in config.get("known_formats", {}).values():
            format_name_list = format_rules["files"]
            format_groups = {}
            # Check each file for rule matching
            # Match to appropriate group (with same pre/post pattern)
            # eg a_[match]_b groups with a_[other match]_b but not c_[other match]_d
            for f in files:
                fname = os.path.basename(f).lower().strip()
                for format_name in format_name_list:
                    if format_name in fname:
                        pre_post_pattern = fname.replace(format_name, "")
                        if not format_groups.get(pre_post_pattern):
                            format_groups[pre_post_pattern] = []
                        format_groups[pre_post_pattern].append(f)
                        break
            # Remove grouped files from the file list and add groups to the group list
            for g in format_groups.values():
                for f in g:
                    files.remove(f)
                group_info = {
                    "files": g,
                    "extractors": format_rules["extractors"],
                    "params": format_rules["params"]
                }
                groups.append(group_info)

    # NOTE: Keep this grouping last!
    # Default grouping: Each file is a group
    groups.extend([{
        "files": [f],
        "extractors": [],
        "params": {}
    } for f in files])

    for d in dirs:
        groups.extend(group_tree(d, config))

    return groups
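
# Illustrative sketch (not from the source): the grouping key is the lowercased file name
# with the matched format name removed, so files that differ only in the format-specific
# part share a key and are grouped together. The format names and file names below are
# hypothetical.
format_name_list = ["outcar", "incar"]
for fname in ["a_outcar_b", "a_incar_b", "c_outcar_d"]:
    for format_name in format_name_list:
        if format_name in fname:
            print(fname, "->", fname.replace(format_name, ""))
            break
# a_outcar_b -> a__b   (groups with a_incar_b)
# a_incar_b  -> a__b
# c_outcar_d -> c__d   (separate group)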
def run_extractors(input_queue, output_queue, queue_done, extract_params):
    """Extract data files.

    Returns:
        list of dict: The metadata extracted from the file.
            Will be empty if no selected extractor can extract data.
    """
    source_id = extract_params.get("dataset", {}).get("mdf", {}).get("source_id", "unknown")
    try:
        # Extract each group from the queue
        # Exit loop when queue_done is True and no groups remain
        while True:
            # Fetch group from queue
            try:
                group_info = input_queue.get(timeout=5)
            # No group fetched
            except Empty:
                # Queue is permanently depleted, stop processing
                if queue_done.value:
                    break
                # Queue is still active, try again
                else:
                    continue

            # Process fetched group
            single_record = {}
            multi_records = []
            for extractor_name in (group_info["extractors"] or ALL_EXTRACTORS.keys()):
                try:
                    specific_params = mdf_toolbox.dict_merge(
                        extract_params or {}, group_info["params"])
                    extractor_res = ALL_EXTRACTORS[extractor_name](
                        group=group_info["files"], params=specific_params)
                except Exception as e:
                    logger.warning(("{} Extractor {} failed with "
                                    "exception {}").format(source_id, extractor_name,
                                                           repr(e)))
                else:
                    # If a list of one record was returned, treat as single record
                    # Eliminates [{}] from cluttering feedstock
                    # Filters one-record results from extractors that always return lists
                    if isinstance(extractor_res, list) and len(extractor_res) == 1:
                        extractor_res = extractor_res[0]
                    # Only process actual results
                    if extractor_res:
                        # If a single record was returned, merge with others
                        if isinstance(extractor_res, dict):
                            single_record = mdf_toolbox.dict_merge(single_record,
                                                                   extractor_res)
                        # If multiple records were returned, add to list
                        elif isinstance(extractor_res, list):
                            # Only add records with data
                            multi_records.extend([rec for rec in extractor_res if rec])
                        # Else, panic
                        else:
                            raise TypeError(("Extractor '{p}' returned "
                                             "type '{t}'!").format(p=extractor_name,
                                                                   t=type(extractor_res)))
                        logger.debug("{}: {} extracted {}".format(
                            source_id, extractor_name, group_info["files"]))
                    elif SUPER_DEBUG:
                        logger.debug("{}: {} could not extract {}".format(
                            source_id, extractor_name, group_info))

            # Merge the single_record into all multi_records if both exist
            if single_record and multi_records:
                records = [mdf_toolbox.dict_merge(r, single_record)
                           for r in multi_records if r]
            # Else, if single_record exists, make it a list
            elif single_record:
                records = [single_record]
            # Otherwise, use the list of records if it exists
            elif multi_records:
                records = multi_records
            # If nothing exists, make a blank list
            else:
                records = []

            # Push records to output queue
            # Get the file info
            try:
                file_info = _extract_file_info(group=group_info["files"],
                                               params=extract_params)
            except Exception as e:
                logger.warning("{}: File info extractor failed: {}".format(
                    source_id, repr(e)))
            for record in records:
                # TODO: Should files be handled differently?
                record = mdf_toolbox.dict_merge(record, file_info)
                output_queue.put(json.dumps(record))

    except Exception as e:
        logger.error("{}: Extractor error: {}".format(source_id, str(e)))
    # Log all exceptions!
    except BaseException as e:
        logger.error("{}: Extractor BaseException: {}".format(source_id, str(e)))
    return
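
# Illustrative sketch (not from the source): when a group yields both a shared
# single_record and per-record multi_records, the shared metadata is merged into each
# record, with each record's own values taking precedence (the record is the dict_merge
# base). The data below is hypothetical.
single_record = {"material": {"composition": "Al2O3"}}
multi_records = [{"dft": {"converged": True}}, {"dft": {"converged": False}}]
records = [mdf_toolbox.dict_merge(r, single_record) for r in multi_records]
# Each record now also carries {"material": {"composition": "Al2O3"}}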
import os

from mdf_toolbox import dict_merge

from .base import BASE_CONFIG
from .catalogs import KNOWN_CATALOGS
from .dev import DEV
from .keys import KEYS
from .prod import PROD
from .schemas import INPUT_SCHEMA, OUTPUT_SCHEMA
from .staging import STAGING

# Config setup
CONFIG = {
    "INPUT_SCHEMA": INPUT_SCHEMA,
    "OUTPUT_SCHEMA": OUTPUT_SCHEMA,
    "DEFAULT_ACLS": DEFAULT_ACLS,
    "KNOWN_CATALOGS": KNOWN_CATALOGS
}
CONFIG = dict_merge(BASE_CONFIG, CONFIG)
CONFIG = dict_merge(KEYS, CONFIG)

# Server-specific config will overwrite previous base values if any
server = os.environ.get("FLASK_ENV")
if server == "prod":
    CONFIG = dict_merge(PROD, CONFIG)
elif server == "staging":
    CONFIG = dict_merge(STAGING, CONFIG)
elif server == "dev":
    CONFIG = dict_merge(DEV, CONFIG)
else:
    raise EnvironmentError(
        "FLASK_ENV not correctly set! FLASK_ENV must be 'prod', 'staging',"
        " or 'dev' to use any part of this Action Provider.")
import os

from mdf_toolbox import dict_merge

from mdf_connect_server.config import (DEFAULT, DEV, GLOBUS_HTTP_HOSTS,
                                       GROUPINGS, KEYS, PROD)

CONFIG = {}
CONFIG = dict_merge(DEFAULT, CONFIG)
CONFIG = dict_merge(KEYS, CONFIG)

server = os.environ.get("FLASK_ENV")
if server == "production":
    CONFIG = dict_merge(PROD, CONFIG)
elif server == "development":
    CONFIG = dict_merge(DEV, CONFIG)
else:
    raise EnvironmentError(
        "FLASK_ENV not correctly set! FLASK_ENV must be 'production'"
        " or 'development', even for processing only.")

CONFIG["GLOBUS_HTTP_HOSTS"] = GLOBUS_HTTP_HOSTS
CONFIG["GROUPING_RULES"] = GROUPINGS

# Add credentials
CONFIG["GLOBUS_CREDS"] = {
    "client_id": CONFIG["API_CLIENT_ID"],
    "client_secret": CONFIG["API_CLIENT_SECRET"]
}

# Make required dirs
os.makedirs(CONFIG["LOCAL_PATH"], exist_ok=True)
os.makedirs(CONFIG["FEEDSTOCK_PATH"], exist_ok=True)
def test_dict_merge():
    base = {
        "base_key": "base",
        "both_key": "base",
        "level2": {
            "base_key": "base",
            "both_key": "base",
            "level3": {
                "base_key": "base",
                "both_key": "base",
                "mismatch_key": "string"
            }
        }
    }
    add = {
        "both_key": "add",
        "add_key": "add",
        "level2": {
            "both_key": "add",
            "add_key": "add",
            "level3": {
                "both_key": "add",
                "add_key": "add",
                "mismatch_key": 10,
                "level4": {
                    "add_key": "add"
                }
            }
        }
    }
    merged = {
        "base_key": "base",
        "both_key": "base",
        "add_key": "add",
        "level2": {
            "base_key": "base",
            "both_key": "base",
            "add_key": "add",
            "level3": {
                "base_key": "base",
                "both_key": "base",
                "add_key": "add",
                "mismatch_key": "string",
                "level4": {
                    "add_key": "add"
                }
            }
        }
    }
    b_list = {"list_field": ["base"]}
    a_list = {"list_field": ["add"]}
    m_list = {"list_field": ["base", "add"]}
    a_list_bad = {"list_field": "foo"}

    # Proper use
    old_base = deepcopy(base)
    old_add = deepcopy(add)
    assert mdf_toolbox.dict_merge(base, add) == merged
    # Originals should be unchanged
    assert base == old_base
    assert add == old_add

    # Test list appending
    # No appending
    assert mdf_toolbox.dict_merge(b_list, a_list, append_lists=False) == b_list
    # With appending
    assert mdf_toolbox.dict_merge(b_list, a_list, append_lists=True) == m_list
    # With mismatched data types
    assert mdf_toolbox.dict_merge(b_list, a_list_bad, append_lists=False) == b_list
    assert mdf_toolbox.dict_merge(b_list, a_list_bad, append_lists=True) == b_list

    assert mdf_toolbox.dict_merge({}, {}) == {}

    # Check errors
    with pytest.raises(TypeError):
        mdf_toolbox.dict_merge(1, {})
    with pytest.raises(TypeError):
        mdf_toolbox.dict_merge({}, "a")
    with pytest.raises(TypeError):
        mdf_toolbox.dict_merge([], [])
def submission_driver(metadata, sub_conf, source_id, access_token, user_id): """The driver function for MOC. Modifies the status database as steps are completed. Arguments: metadata (dict): The JSON passed to /submit. sub_conf (dict): Submission configuration information. source_id (str): The source name of this submission. access_token (str): The Globus Auth access token for the submitting user. user_id (str): The Globus ID of the submitting user. """ # Setup utils.update_status(source_id, "sub_start", "P", except_on_fail=True) utils.modify_status_entry(source_id, { "pid": os.getpid(), "hibernating": False }, except_on_fail=True) try: # Connect auth # CAAC required for user auth later mdf_conf_client = globus_sdk.ConfidentialAppAuthClient( CONFIG["API_CLIENT_ID"], CONFIG["API_CLIENT_SECRET"]) mdf_creds = mdf_toolbox.dict_merge(CONFIG["GLOBUS_CREDS"], {"services": ["transfer"]}) mdf_clients = mdf_toolbox.confidential_login(**mdf_creds) mdf_transfer_client = mdf_clients["transfer"] # User auth # When coming from curation, the access token (from the curator) is not used access_token = access_token.replace("Bearer ", "") dependent_grant = mdf_conf_client.oauth2_get_dependent_tokens( access_token) # Get specifically Transfer's access token for grant in dependent_grant.data: if grant["resource_server"] == "transfer.api.globus.org": user_transfer_token = grant["access_token"] user_transfer_authorizer = globus_sdk.AccessTokenAuthorizer( user_transfer_token) user_transfer_client = globus_sdk.TransferClient( authorizer=user_transfer_authorizer) except Exception as e: utils.update_status(source_id, "sub_start", "F", text=repr(e), except_on_fail=True) utils.complete_submission(source_id) return # Cancel the previous version(s) source_info = utils.split_source_id(source_id) scan_res = utils.scan_table(table_name="status", fields=["source_id", "active"], filters=[("source_id", "^", source_info["source_name"]), ("source_id", "<", source_id)]) if not scan_res["success"]: utils.update_status(source_id, "sub_start", "F", text=scan_res["error"], except_on_fail=True) utils.complete_submission(source_id) return old_source_ids = [ oldsub["source_id"] for oldsub in scan_res["results"] if oldsub["active"] ] if old_source_ids: utils.update_status( source_id, "sub_start", "M", text=("The following submissions will be cancelled: {}".format( old_source_ids)), except_on_fail=True) utils.update_status(source_id, "old_cancel", "P", except_on_fail=True) for old_source_id in old_source_ids: cancel_res = utils.cancel_submission(old_source_id, wait=True) if not cancel_res["stopped"]: utils.update_status( source_id, "sub_start", "F", text=cancel_res.get( "error", ("Unable to cancel previous " "submission '{}'").format(old_source_id)), except_on_fail=True) utils.complete_submission(source_id) return if cancel_res["success"]: logger.info("{}: Cancelled source_id {}".format( source_id, old_source_id)) else: logger.debug("{}: Stopped source_id {}".format( source_id, old_source_id)) utils.update_status(source_id, "old_cancel", "S", except_on_fail=True) else: utils.update_status(source_id, "sub_start", "S", except_on_fail=True) utils.update_status(source_id, "old_cancel", "N", except_on_fail=True) # NOTE: Cancellation point if utils.read_table("status", source_id).get("status", {}).get("cancelled"): logger.debug("{}: Cancel signal acknowledged".format(source_id)) utils.complete_submission(source_id) return local_path = os.path.join(CONFIG["LOCAL_PATH"], source_id) + "/" feedstock_file = os.path.join(CONFIG["FEEDSTOCK_PATH"], 
source_id + ".json") curation_state_file = os.path.join(CONFIG["CURATION_DATA"], source_id + ".json") service_data = os.path.join(CONFIG["SERVICE_DATA"], source_id) + "/" os.makedirs(service_data, exist_ok=True) num_files = 0 # Curation skip point if type(sub_conf["curation"]) is not str: # If we're extracting, download data locally, then set canon source to local # This allows non-Globus sources (because to download to Connect's EP) if not sub_conf["no_extract"]: utils.update_status(source_id, "data_download", "P", except_on_fail=True) try: # Download from user for dl_res in utils.download_data( user_transfer_client, sub_conf["data_sources"], CONFIG["LOCAL_EP"], local_path, admin_client=mdf_transfer_client, user_id=user_id): if not dl_res["success"]: msg = "During data download: " + dl_res["error"] utils.update_status(source_id, "data_download", "T", text=msg, except_on_fail=True) if not dl_res["success"]: raise ValueError(dl_res["error"]) num_files = dl_res["total_files"] except Exception as e: utils.update_status(source_id, "data_download", "F", text=repr(e), except_on_fail=True) utils.complete_submission(source_id) return utils.update_status( source_id, "data_download", "M", text=( "{} files will be grouped and extracted (from {} archives)" .format(num_files, dl_res["num_extracted"])), except_on_fail=True) canon_data_sources = [ "globus://{}{}".format(CONFIG["LOCAL_EP"], local_path) ] # If we're not extracting, set canon source to only source # Also create local dir with no data to "extract" for dataset entry else: utils.update_status(source_id, "data_download", "N", except_on_fail=True) os.makedirs(local_path) canon_data_sources = sub_conf["data_sources"] # Move data from canon source(s) to canon dest (if different) utils.update_status(source_id, "data_transfer", "P", except_on_fail=True) # If not extracting, set up user TC for backup use if sub_conf["no_extract"]: backup_user_id = user_id backup_user_client = user_transfer_client else: backup_user_id = None backup_user_client = None for data_source in canon_data_sources: if data_source != sub_conf["canon_destination"]: logger.debug("Data transfer: '{}' to '{}'".format( data_source, sub_conf["canon_destination"])) try: for backup_res in utils.backup_data( mdf_transfer_client, data_source, sub_conf["canon_destination"], acl=sub_conf["storage_acl"], data_client=backup_user_client, data_user=backup_user_id): if not backup_res["success"]: msg = ("During data download: {}".format( backup_res.get("error", "Unknown error"))) utils.update_status(source_id, "data_transfer", "T", text=msg, except_on_fail=True) if not backup_res["success"]: raise ValueError(backup_res.get("error")) elif not backup_res[ sub_conf["canon_destination"]]["success"]: raise ValueError( backup_res[sub_conf["canon_destination"]]["error"]) except Exception as e: err_text = ( "Transfer from '{}' to primary/canon destination '{}' failed: {}" .format(data_source, sub_conf["canon_destination"], str(e))) utils.update_status(source_id, "data_transfer", "F", text=err_text, except_on_fail=True) return utils.update_status(source_id, "data_transfer", "S", except_on_fail=True) # Add file info data sub_conf["index"]["file"] = { "globus_host": sub_conf["canon_destination"], "http_host": utils.lookup_http_host(sub_conf["canon_destination"]), "local_path": local_path, } extract_params = { "dataset": metadata, "extractors": sub_conf["index"], "service_data": service_data, "feedstock_file": feedstock_file, "group_config": mdf_toolbox.dict_merge(sub_conf["extraction_config"], 
CONFIG["GROUPING_RULES"]), "validation_info": { "project_blocks": sub_conf.get("project_blocks", []), "required_fields": sub_conf.get("required_fields", []), "allowed_nulls": CONFIG["SCHEMA_NULLS"], "base_acl": sub_conf["acl"] } } # NOTE: Cancellation point if utils.read_table("status", source_id).get("status", {}).get("cancelled"): logger.debug("{}: Cancel signal acknowledged".format(source_id)) utils.complete_submission(source_id) return # Extract data utils.update_status(source_id, "extracting", "P", except_on_fail=True) try: extract_res = start_extractors(local_path, extract_params) if not extract_res["success"]: utils.update_status(source_id, "extracting", "F", text=extract_res["error"], except_on_fail=True) return dataset = extract_res["dataset"] num_records = extract_res["num_records"] num_groups = extract_res["num_groups"] extensions = extract_res["extensions"] except Exception as e: utils.update_status(source_id, "extracting", "F", text=repr(e), except_on_fail=True) utils.complete_submission(source_id) return else: utils.modify_status_entry(source_id, {"extensions": extensions}) # If nothing in dataset, panic if not dataset: utils.update_status(source_id, "extracting", "F", text="Could not process dataset entry", except_on_fail=True) utils.complete_submission(source_id) return # If not extracting, show status as skipped # Also check if records were extracted inappropriately, flag error in log elif sub_conf.get("no_extract"): if num_records != 0: logger.error( "{}: Records extracted with no_extract flag ({} records)" .format(source_id, num_records)) utils.update_status(source_id, "extracting", "N", except_on_fail=True) else: utils.update_status( source_id, "extracting", "M", text=("{} metadata records extracted out of {} file groups" .format(num_records, num_groups)), except_on_fail=True) logger.debug("{}: {} entries extracted".format( source_id, num_records + 1)) # NOTE: Cancellation point if utils.read_table("status", source_id).get("status", {}).get("cancelled"): logger.debug("{}: Cancel signal acknowledged".format(source_id)) utils.complete_submission(source_id) return ################### # Curation step # ################### # Trigger curation if required if sub_conf.get("curation"): utils.update_status(source_id, "curation", "P", except_on_fail=True) # Create curation task in curation table with open(feedstock_file) as f: # Discard dataset entry f.readline() # Save first few records # Append the json-loaded form of records # The number of records should be at most the default number, # and less if less are present curation_records = [] [ curation_records.append(json.loads(f.readline())) for i in range( min(CONFIG["NUM_CURATION_RECORDS"], num_records)) ] curation_dataset = deepcopy(dataset) # Numbers can be extracted into Decimal by DynamoDB, which causes JSON errors curation_dataset["mdf"].pop("scroll_id", None) curation_dataset["mdf"].pop("version", None) curation_task = { "source_id": source_id, "allowed_curators": sub_conf.get("permission_groups", sub_conf["acl"]), "dataset": json.dumps(dataset), "sample_records": json.dumps(curation_records), "submission_info": sub_conf, "extraction_summary": ("{} records were extracted out of {} groups from {} files". 
format(num_records, num_groups, num_files)), "curation_start_date": str(datetime.today()) } # If no allowed curators or public allowed, set to public if (not curation_task["allowed_curators"] or "public" in curation_task["allowed_curators"]): curation_task["allowed_curators"] = ["public"] # Create task in database create_res = utils.create_curation_task(curation_task) if not create_res["success"]: utils.update_status(source_id, "curation", "F", text=create_res.get( "error", "Unable to create curation task"), except_on_fail=True) return # Save state os.makedirs(CONFIG["CURATION_DATA"], exist_ok=True) with open(curation_state_file, 'w') as save_file: state_data = { "source_id": source_id, "sub_conf": sub_conf, "dataset": dataset } json.dump(state_data, save_file) logger.debug("{}: Saved state for curation".format(source_id)) # Trigger hibernation utils.modify_status_entry(source_id, {"hibernating": True}, except_on_fail=True) return else: utils.update_status(source_id, "curation", "N", except_on_fail=True) # Returning from curation # Submission accepted elif sub_conf["curation"].startswith("Accept"): # Save curation message curation_message = sub_conf["curation"] # Load state with open(curation_state_file) as save_file: state_data = json.load(save_file) # Verify source_ids match if state_data["source_id"] != source_id: logger.error("State data incorrect: '{}' is not '{}'".format( state_data["source_id"], source_id)) utils.update_status(source_id, "curation", "F", text="Submission corrupted", except_on_fail=True) return # Load state variables back sub_conf = state_data["sub_conf"] dataset = state_data["dataset"] logger.debug("{}: Loaded state from curation".format(source_id)) # Delete state file try: os.remove(curation_state_file) except FileNotFoundError: utils.update_status( source_id, "curation", "F", text="Unable to cleanly load curation information", except_on_fail=True) return # Delete curation task delete_res = utils.delete_from_table("curation", source_id) if not delete_res["success"]: utils.update_status(source_id, "curation", "F", text=delete_res.get("error", "Curation cleanup failed"), except_on_fail=True) return utils.update_status(source_id, "curation", "M", text=curation_message, except_on_fail=True) # Submission rejected elif sub_conf["curation"].startswith("Reject"): # Delete state file try: os.remove(curation_state_file) except FileNotFoundError: logger.error( "{}: Unable to delete curation state file '{}'".format( source_id, curation_state_file)) # Delete curation task delete_res = utils.delete_from_table("curation", source_id) if not delete_res["success"]: logger.error( "{}: Unable to delete rejected curation from database: {}". 
format(source_id, delete_res.get("error"))) utils.update_status(source_id, "curation", "F", text=sub_conf["curation"], except_on_fail=True) return # Curation invalid else: utils.update_status(source_id, "curation", "F", text="Unknown curation state: '{}'".format( sub_conf["curation"]), except_on_fail=True) return ################### # Post-curation # ################### # Integrations service_res = {} # NOTE: Cancellation point if utils.read_table("status", source_id).get("status", {}).get("cancelled"): logger.debug("{}: Cancel signal acknowledged".format(source_id)) utils.complete_submission(source_id) return # MDF Search (mandatory) utils.update_status(source_id, "ingest_search", "P", except_on_fail=True) search_config = sub_conf["services"].get("mdf_search", {}) try: search_args = { "feedstock_file": feedstock_file, "source_id": source_id, "index": search_config.get("index", CONFIG["INGEST_INDEX"]), "delete_existing": True, "batch_size": CONFIG["SEARCH_BATCH_SIZE"] } search_res = utils.search_ingest(**search_args) if not search_res["success"]: utils.update_status(source_id, "ingest_search", "F", text="; ".join(search_res["errors"]), except_on_fail=True) return except Exception as e: utils.update_status(source_id, "ingest_search", "F", text=repr(e), except_on_fail=True) utils.complete_submission(source_id) return else: # Handle errors if len(search_res["errors"]) > 0: utils.update_status( source_id, "ingest_search", "F", text=( "{} batches of records failed to ingest (up to {} records " "total)").format(len(search_res["errors"]), (len(search_res["errors"]) * CONFIG["SEARCH_BATCH_SIZE"])), except_on_fail=True) utils.complete_submission(source_id) return utils.update_status(source_id, "ingest_search", "S", except_on_fail=True) os.remove(feedstock_file) service_res["mdf_search"] = "This dataset was ingested to MDF Search." 
# Move files to data_destinations if sub_conf.get("data_destinations"): utils.update_status(source_id, "ingest_backup", "P", except_on_fail=True) try: for backup_res in utils.backup_data( mdf_transfer_client, storage_loc=sub_conf["canon_destination"], backup_locs=sub_conf["data_destinations"], acl=sub_conf["storage_acl"]): if not backup_res["success"]: msg = "During data backup: " + backup_res.get( "error", "Unknown error") utils.update_status(source_id, "ingest_backup", "T", text=msg, except_on_fail=True) if not backup_res["success"]: raise ValueError(backup_res.get("error")) except Exception as e: err_msg = "Destination backup failed: {}".format(str(e)) utils.update_status(source_id, "ingest_backup", "F", text=err_msg, except_on_fail=True) return # On any complete failure, fail submission if not all([val["success"] is True for val in backup_res.values()]): err_msg = "; ".join([ "'{}' failed: {}".format(k, v["error"]) for k, v in backup_res.items() if v["success"] is not True ]) utils.update_status(source_id, "ingest_backup", "F", text=err_msg, except_on_fail=True) return # On an error with a successful Transfer, notify user but continue elif not all([val["error"] == "" for val in backup_res.values()]): err_msg = "; ".join([ "on '{}': {}".format(k, v["error"]) for k, v in backup_res.items() if v["error"] ]) utils.update_status(source_id, "ingest_backup", "R", text=err_msg, except_on_fail=True) else: utils.update_status(source_id, "ingest_backup", "S", except_on_fail=True) else: utils.update_status(source_id, "ingest_backup", "N", except_on_fail=True) # MDF Publish if sub_conf["services"].get("mdf_publish"): publish_conf = sub_conf["services"]["mdf_publish"] # Data already moved to canon dest as a requirement of success so far # Mint DOI try: # Create DOI and add to dataset DC dataset["dc"]["identifier"] = { "identifier": utils.make_dc_doi(test=publish_conf["doi_test"]), "identifierType": "DOI" } # Add publication dates and publisher dataset["dc"]["publisher"] = "Materials Data Facility" dataset["dc"]["publicationYear"] = datetime.now().year if not dataset["dc"].get("dates"): dataset["dc"]["dates"] = [] dataset["dc"]["dates"].append({ "date": str(datetime.now().date()), "dateType": "Accepted" }) landing_page = CONFIG["DATASET_LANDING_PAGE"].format(source_id) mdf_publish_res = utils.datacite_mint_doi( dataset["dc"], test=publish_conf["doi_test"], url=landing_page) except Exception as e: logger.error("DOI minting exception: {}".format(repr(e))) utils.update_status(source_id, "ingest_publish", "F", text="DOI minting failed", except_on_fail=True) return else: if not mdf_publish_res["success"]: logger.error("DOI minting failed: {}".format( mdf_publish_res["error"])) utils.update_status(source_id, "ingest_publish", "F", text="Unable to mint DOI for publication", except_on_fail=True) return utils.update_status( source_id, "ingest_publish", "L", text=("Dataset published though MDF Publish with DOI '{}'".format( dataset["dc"]["identifier"]["identifier"])), link=landing_page, except_on_fail=True) service_res["mdf_publish"] = landing_page else: utils.update_status(source_id, "ingest_publish", "N", except_on_fail=True) # Citrine (skip if not extracted) if sub_conf["services"].get("citrine") and not sub_conf.get("no_extract"): utils.update_status(source_id, "ingest_citrine", "P", except_on_fail=True) # Get old Citrine dataset version, if exists scan_res = utils.scan_table(table_name="status", fields=["source_id", "citrine_id"], filters=[("source_name", "==", source_info["source_name"]), ("citrine_id", 
"!=", None)]) if not scan_res["success"]: logger.error("Status scan failed: {}".format(scan_res["error"])) old_cit_subs = scan_res.get("results", []) if len(old_cit_subs) == 0: old_citrine_id = None elif len(old_cit_subs) == 1: old_citrine_id = old_cit_subs[0]["citrine_id"] else: old_citrine_id = max([sub["citrine_id"] for sub in old_cit_subs]) try: # Check for PIFs to ingest cit_path = os.path.join(service_data, "citrine") if len(os.listdir(cit_path)) > 0: cit_res = utils.citrine_upload( cit_path, CONFIG["CITRINATION_API_KEY"], dataset, old_citrine_id, public=sub_conf["services"]["citrine"].get("public", True)) else: cit_res = { "success": False, "error": "No PIFs were generated from this dataset", "success_count": 0, "failure_count": 0 } except Exception as e: utils.update_status(source_id, "ingest_citrine", "R", text=str(e), except_on_fail=True) else: if not cit_res["success"]: if cit_res.get("error"): text = cit_res["error"] elif cit_res.get("failure_count"): text = "All {} PIFs failed to upload".format( cit_res["failure_count"]) elif cit_res.get("failure_count") == 0: text = "No PIFs were found" logger.warning("{}: PIFs not found!".format(source_id)) else: text = "An error prevented PIF uploading" utils.update_status(source_id, "ingest_citrine", "R", text=text, except_on_fail=True) else: text = "{}/{} PIFs uploaded successfully".format( cit_res["success_count"], cit_res["success_count"] + cit_res["failure_count"]) link = CONFIG["CITRINATION_LINK"].format( cit_ds_id=cit_res["cit_ds_id"]) utils.update_status(source_id, "ingest_citrine", "L", text=text, link=link, except_on_fail=True) stat_res_2 = utils.modify_status_entry( source_id, {"citrine_id": cit_res["cit_ds_id"]}) if not stat_res_2["success"]: raise ValueError(str(stat_res_2)) service_res["citrine"] = link else: utils.update_status(source_id, "ingest_citrine", "N", except_on_fail=True) # MRR if sub_conf["services"].get("mrr"): utils.update_status(source_id, "ingest_mrr", "P", except_on_fail=True) try: if (isinstance(sub_conf["services"]["mrr"], dict) and sub_conf["services"]["mrr"].get("test")): mrr_title = "TEST_" + dataset["dc"]["titles"][0]["title"] else: mrr_title = dataset["dc"]["titles"][0]["title"] mrr_contributors = "" for author in dataset["dc"]["creators"]: mrr_contributors += CONFIG["MRR_CONTRIBUTOR"].format( name=(author.get("givenName", "") + " " + author.get("familyName", "")), affiliation=author.get("affiliation", "")) mrr_description = "" for desc in dataset["dc"].get("descriptions", []): mrr_description += desc["description"] + " " # Must add at least one subject to MRR entry mrr_subjects = "<subject>MDF Dataset</subject>" for subj in dataset["dc"].get("subjects", []): mrr_subjects += "<subject>" + subj["subject"] + "</subject>" mrr_entry = { "title": dataset["dc"]["titles"][0]["title"], "template": CONFIG["MRR_SCHEMA"], "xml_content": CONFIG["MRR_TEMPLATE"].format( title=mrr_title, publisher=dataset["dc"]["publisher"], contributors=mrr_contributors, contact_name=dataset["dc"]["creators"][0]["creatorName"], description=mrr_description, subjects=mrr_subjects, landing_page=CONFIG["DATASET_LANDING_PAGE"].format( source_id)) } except Exception as e: utils.update_status(source_id, "ingest_mrr", "R", text="Unable to create MRR metadata:" + repr(e), except_on_fail=True) else: try: mrr_res_raw = requests.post(CONFIG["MRR_URL"], auth=(CONFIG["MRR_USERNAME"], CONFIG["MRR_PASSWORD"]), data=mrr_entry) try: mrr_res = mrr_res_raw.json() except json.JSONDecodeError: raise ValueError("Invalid MRR response: {}".format( 
mrr_res_raw.content)) if mrr_res_raw.status_code not in [201, 202]: raise ValueError( "MRR ingest failed with error code {}: '{}'".format( mrr_res_raw.status_code, mrr_res)) except Exception as e: utils.update_status(source_id, "ingest_mrr", "R", text="Unable to submit MRR entry: " + repr(e), except_on_fail=True) else: try: mrr_id = mrr_res.get("id") if not mrr_id: raise ValueError("MRR entry has no ID") except Exception: utils.update_status(source_id, "ingest_mrr", "R", text=mrr_res.get( "message", "Unknown MRR failure"), except_on_fail=True) else: text = "Dataset successfully registered with the MRR" mrr_link = CONFIG["MRR_LINK"].format(mrr_id) utils.update_status(source_id, "ingest_mrr", "L", text=text, link=mrr_link, except_on_fail=True) service_res["mrr"] = mrr_link else: utils.update_status(source_id, "ingest_mrr", "N", except_on_fail=True) # Dataset update, start cleanup utils.update_status(source_id, "ingest_cleanup", "P", except_on_fail=True) dataset["services"] = service_res ds_update = utils.update_search_entries(search_config.get( "index", CONFIG["INGEST_INDEX"]), entries=[dataset], overwrite=False) if not ds_update["success"]: utils.update_status(source_id, "ingest_cleanup", "F", text=ds_update.get("error", "Unable to update dataset"), except_on_fail=True) utils.complete_submission(source_id) return # Cleanup try: fin_res = utils.complete_submission(source_id, cleanup=CONFIG["FINAL_CLEANUP"]) except Exception as e: utils.update_status(source_id, "ingest_cleanup", "F", text=repr(e), except_on_fail=True) return if not fin_res["success"]: utils.update_status(source_id, "ingest_cleanup", "F", text=fin_res["error"], except_on_fail=True) return utils.update_status(source_id, "ingest_cleanup", "S", except_on_fail=True) logger.debug("{}: Ingest complete".format(source_id)) return {"success": True, "source_id": source_id}