import mdf_toolbox
import pycalphad


def extract_tdb(group, params=None):
    record = {}

    for data_file in group:
        material = {}
        calphad = {}
        # Attempt to read the file
        try:
            calphad_db = pycalphad.Database(data_file)
            composition = ""
            for element in calphad_db.elements:
                if element.isalnum():
                    # Normalize the element label to a capitalized symbol (e.g. "FE" -> "Fe")
                    composition += element.capitalize()

            phases = list(calphad_db.phases.keys())

            if composition:
                material['composition'] = composition
            if phases:
                calphad['phases'] = phases

        except Exception:
            pass
        else:
            # Add to record
            if material:
                record = mdf_toolbox.dict_merge(record, {"material": material})
            if calphad:
                record = mdf_toolbox.dict_merge(record, {"calphad": calphad})

    return record
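
A minimal sketch of how the record above accumulates across a file group: each file's "material" and "calphad" blocks are merged in with mdf_toolbox.dict_merge, and the base argument's values win on key conflicts, so the first successfully-read file's blocks are kept (toy data, no pycalphad required):

import mdf_toolbox

record = {}
# Hypothetical per-file blocks, as extract_tdb would build them
per_file_blocks = [
    ({"composition": "AlNi"}, {"phases": ["FCC_A1"]}),
    ({"composition": "AlNi"}, {"phases": ["BCC_A2"]}),
]
for material, calphad in per_file_blocks:
    record = mdf_toolbox.dict_merge(record, {"material": material})
    record = mdf_toolbox.dict_merge(record, {"calphad": calphad})

# The base record keeps its existing values, so the first file's phases survive:
# {'material': {'composition': 'AlNi'}, 'calphad': {'phases': ['FCC_A1']}}
print(record)
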
def extract_crystal_structure(group, params=None):
    """Extractor for the crystal_structure block.
    Will also populate the material block.

    Arguments:
    group (list of str): The paths to grouped files.
    params (dict): N/A

    Returns:
    dict: The record extracted.
    """
    record = {}

    for data_file in group:
        material = {}
        crystal_structure = {}
        # Attempt to read the file
        try:
            # Read with ASE
            ase_res = ase.io.read(data_file)
            # Check data read, validate crystal structure
            if not ase_res or not all(ase_res.get_pbc()):
                raise ValueError("No valid data")
            else:
                # Convert ASE Atoms to Pymatgen Structure
                pmg_s = ase_to_pmg.get_structure(ase_res)
        # ASE failed to read file
        except Exception:
            try:
                # Read with Pymatgen
                pmg_s = pymatgen.Structure.from_file(data_file)
            except Exception:
                # Can't read file
                continue

        # Extract material block
        material["composition"] = pmg_s.formula.replace(" ", "")
        # Extract crystal_structure block
        crystal_structure["space_group_number"] = pmg_s.get_space_group_info(
        )[1]
        crystal_structure["number_of_atoms"] = float(
            pmg_s.composition.num_atoms)
        crystal_structure["volume"] = float(pmg_s.volume)
        crystal_structure[
            "stoichiometry"] = pmg_s.composition.anonymized_formula

        # Add to record
        record = mdf_toolbox.dict_merge(record, {
            "material": material,
            "crystal_structure": crystal_structure
        })
    return record
Example #3
    def get_translations(self):
        return dict_merge(
            super().get_translations(), {
                "dft": {
                    "Converged": ("converged", bool),
                    "XC_Functional": ("exchange_correlation_functional", str),
                    "Cutoff_Energy_eV": ("cutoff_energy", float)
                },
                "crystal_structure": {
                    "Space_group_number": ("space_group_number", int),
                    "Number_of_atoms_in_unit_cell": ("number_of_atoms", float),
                    "Unit_cell_volume_AA_3": ("volume", float)
                },
            })
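
The translations mapping above pairs each source field name with a (target_field, type) tuple, grouped by metadata block. A small hypothetical helper (apply_translations and the sample values are illustrative, not part of the real extractor) showing how such a mapping might be applied to raw key/value data:

def apply_translations(raw, translations):
    """Map raw {source_key: value} data into typed target fields, block by block."""
    out = {}
    for block, fields in translations.items():
        block_out = {}
        for source_key, (target_key, caster) in fields.items():
            if source_key in raw:
                block_out[target_key] = caster(raw[source_key])
        if block_out:
            out[block] = block_out
    return out

translations = {
    "dft": {
        "Converged": ("converged", bool),
        "Cutoff_Energy_eV": ("cutoff_energy", float)
    },
    "crystal_structure": {
        "Space_group_number": ("space_group_number", int)
    }
}
raw = {"Converged": 1, "Cutoff_Energy_eV": "520", "Space_group_number": "225"}
print(apply_translations(raw, translations))
# {'dft': {'converged': True, 'cutoff_energy': 520.0},
#  'crystal_structure': {'space_group_number': 225}}
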
Example #4
def update_action_status(table_name, action_id, updates, overwrite=False):
    """Update action entry in status database.

    Arguments:
        table_name (str): The name of the table to update.
        action_id (dict): The ID for the action.
        updates (dict): The updates to apply to the action status.
        overwrite (bool): When False, will merge the updates into the existing status,
                overwriting existing values only for the fields present in the updates.
                When True, will delete the existing status entirely and replace it
                with the updates.
                Default False.

    Returns:
        dict: The updated action status.

    Raises exception on any failure.
    """
    # Verify old status exists and save it
    old_status = read_action_status(table_name, action_id)

    # Merge updates into old_status if not overwriting
    if not overwrite:
        # dict_merge(base, addition) returns base keys unchanged, addition keys added
        full_updates = mdf_toolbox.dict_merge(updates, old_status)
    else:
        full_updates = updates

    # TODO: Validate updates
    update_errors = []
    if update_errors:
        raise err.InvalidRequest(*update_errors)

    # Update in DB (.put_item() overwrites)
    table = get_dmo_table(table_name)
    try:
        table.put_item(Item=full_updates)
    except Exception as e:
        logger.error("Error updating status for '{}': {}".format(
            action_id, str(e)))
        raise err.ServiceError(str(e))

    logger.debug("{}: Action status updated: {}".format(action_id, updates))
    return full_updates
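
A minimal sketch of the non-overwrite path above: because dict_merge(base, addition) keeps the base's values on conflict, passing the updates as the base lets them replace the matching fields of the old status while untouched fields survive (toy data, no database):

import mdf_toolbox

old_status = {"status": "ACTIVE", "details": {"step": 1, "note": "started"}}
updates = {"status": "SUCCEEDED", "details": {"step": 2}}

full_updates = mdf_toolbox.dict_merge(updates, old_status)
# {'status': 'SUCCEEDED', 'details': {'step': 2, 'note': 'started'}}
print(full_updates)
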
Example #5
def parse(group, **params):
    record = {}

    for data_file in group:
        material = {}
        crystal_structure = {}
        # Attempt to read the file
        try:
            # Read with ASE
            ase_res = ase.io.read(data_file)
            # Check data read, validate crystal structure
            if not ase_res or not all(ase_res.get_pbc()):
                raise ValueError("No valid data")
            else:
                # Convert ASE Atoms to Pymatgen Structure
                pmg_s = ase_to_pmg.get_structure(ase_res)
        # ASE failed to read file
        except Exception:
            try:
                # Read with Pymatgen
                pmg_s = pymatgen.Structure.from_file(data_file)
            except Exception:
                # Can't read file
                continue

        # Parse material block
        material["composition"] = pmg_s.formula.replace(" ", "") 
        # Parse crystal_structure block
        crystal_structure["space_group_number"] = pmg_s.get_space_group_info()[1]
        crystal_structure["number_of_atoms"] = float(pmg_s.composition.num_atoms)
        crystal_structure["volume"] = float(pmg_s.volume)
        crystal_structure["stoichiometry"] = pmg_s.composition.anonymized_formula

        # Add to record
        record = mdf_toolbox.dict_merge(record, {
                                                "material": material,
                                                "crystal_structure": crystal_structure
                                            })
    return record
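
Both crystal-structure extractors above use the same fallback order: try ASE first, and only fall back to pymatgen if ASE fails or returns a non-periodic structure. A library-free sketch of that pattern, with hypothetical reader callables standing in for ase.io.read and pymatgen.Structure.from_file:

def read_structure(path, readers):
    """Try each reader callable in order; return the first non-empty result, or None."""
    for reader in readers:
        try:
            result = reader(path)
            if not result:
                raise ValueError("No valid data")
            return result
        except Exception:
            continue  # This reader failed; try the next one
    return None

# Usage with hypothetical readers (ASE first, then pymatgen):
# structure = read_structure("POSCAR", [read_with_ase, read_with_pymatgen])
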
from mdf_toolbox import dict_merge

from .acl_config import DEFAULT_ACLS
from .base_config import BASE_CONFIG
from .catalog_config import KNOWN_CATALOGS
from .keys import KEYS
from .schemas import INPUT_SCHEMA, OUTPUT_SCHEMA

# Config setup
CONFIG = {
    "INPUT_SCHEMA": INPUT_SCHEMA,
    "OUTPUT_SCHEMA": OUTPUT_SCHEMA,
    "DEFAULT_ACLS": DEFAULT_ACLS,
    "KNOWN_CATALOGS": KNOWN_CATALOGS
}
CONFIG = dict_merge(BASE_CONFIG, CONFIG)
CONFIG = dict_merge(KEYS, CONFIG)
Example #7
    def _ex_search(self, limit=None, info=False, retries=3):
        """Execute a search and return the results, up to the ``SEARCH_LIMIT``.

        Uses the query currently in this SearchHelper.

        Arguments:
            limit (int): Maximum number of entries to return. **Default**: ``10`` for basic
                queries, and ``10000`` for advanced.
            info (bool): If ``False``, search will return a list of the results.
                    If ``True``, search will return a tuple containing the results list
                    and other information about the query.
                    **Default:** ``False``.
            retries (int): The number of times to retry a Search query if it fails.
                           **Default:** 3.

        Returns:
            If ``info`` is ``False``, *list*: The search results.
            If ``info`` is ``True``, *tuple*: The search results,
            and a dictionary of query information.
        """
        # Make sure there is query information present
        if not self.initialized:
            raise ValueError('No query has been set.')

        # Create Search-ready query
        if limit is not None:
            self.__query["limit"] = limit
        query = _validate_query(self.__query)

        tries = 0
        errors = []
        while True:
            # Try searching until success or `retries` number of failures
            # Raise exception after `retries` failures
            try:
                search_res = self.__search_client.post_search(
                    self.index, query)
            except globus_sdk.SearchAPIError as e:
                if tries >= retries:
                    raise
                else:
                    errors.append(repr(e))
            except Exception as e:
                if tries >= retries:
                    raise
                else:
                    errors.append(repr(e))
            else:
                break
            tries += 1

        # Remove the wrapping on each entry from Globus search
        res = mdf_toolbox.gmeta_pop(search_res, info=info)

        # Add more information to output if requested
        if info:
            # Add everything from the query itself
            info_dict = mdf_toolbox.dict_merge(res[1], query)
            # But rename "q" to "query" for clarity
            info_dict["query"] = info_dict.pop("q")
            # Add other useful/interesting parameters
            info_dict["index_uuid"] = self.index
            info_dict["retries"] = tries
            info_dict["errors"] = errors
            # Remake the tuple because tuples don't support assignment
            res = (res[0], info_dict)
        return res
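
A small sketch (plain dicts, no Globus Search calls) of how the info tuple above is assembled: the query that was sent is merged into the info returned by gmeta_pop, with the info's own values taking precedence, and the raw "q" key is renamed to "query":

import mdf_toolbox

search_info = {"total_query_matches": 2}  # stand-in for res[1]
query = {"q": "mdf.source_name:oqmd", "limit": 10, "advanced": True}

info_dict = mdf_toolbox.dict_merge(search_info, query)
info_dict["query"] = info_dict.pop("q")
# -> contains total_query_matches, limit, advanced, and "query" (renamed from "q")
print(info_dict)
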
Example #8
def group_tree(root, config):
    """Run group_files on files in tree appropriately."""
    files = []
    dirs = []
    if root == "/dev/null":
        return []
    for node in os.listdir(root):
        node_path = os.path.join(root, node)
        if node == "mdf.json":
            with open(node_path) as f:
                try:
                    new_config = json.load(f)
                    logger.debug("Config updating: \n{}".format(new_config))
                except Exception as e:
                    logger.warning("Error reading config file '{}': {}".format(
                        node_path, str(e)))
                else:
                    config = mdf_toolbox.dict_merge(new_config, config)
        elif os.path.isfile(node_path):
            files.append(node_path)
        elif os.path.isdir(node_path):
            dirs.append(node_path)
        else:
            logger.debug(
                "Ignoring non-file, non-dir node '{}'".format(node_path))

    # Group the files
    # list "groups" is list of dict, each dict contains actual file list + extractor info/config
    groups = []
    # Group by dir overrides other grouping
    if config.get("group_by_dir"):
        groups.append({"files": files, "extractors": [], "params": {}})
    else:
        for format_rules in config.get("known_formats", {}).values():
            format_name_list = format_rules["files"]
            format_groups = {}
            # Check each file for rule matching
            # Match to appropriate group (with same pre/post pattern)
    #   e.g. a_[match]_b groups with a_[other match]_b, but not with c_[other match]_d
            for f in files:
                fname = os.path.basename(f).lower().strip()
                for format_name in format_name_list:
                    if format_name in fname:
                        pre_post_pattern = fname.replace(format_name, "")
                        if not format_groups.get(pre_post_pattern):
                            format_groups[pre_post_pattern] = []
                        format_groups[pre_post_pattern].append(f)
                        break
            # Remove grouped files from the file list and add groups to the group list
            for g in format_groups.values():
                for f in g:
                    files.remove(f)
                group_info = {
                    "files": g,
                    "extractors": format_rules["extractors"],
                    "params": format_rules["params"]
                }
                groups.append(group_info)

        # NOTE: Keep this grouping last!
        # Default grouping: Each file is a group
        groups.extend([{
            "files": [f],
            "extractors": [],
            "params": {}
        } for f in files])

    for d in dirs:
        groups.extend(group_tree(d, config))

    return groups
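
To illustrate the pre/post-pattern grouping above: files whose names contain a known format string are grouped together when the rest of the filename (the text around the match) is identical. A standalone sketch with hypothetical filenames and format names:

import os

files = ["/data/a_OUTCAR_b", "/data/a_INCAR_b", "/data/c_OUTCAR_d"]
format_name_list = ["outcar", "incar"]  # hypothetical "known format" names

format_groups = {}
for f in files:
    fname = os.path.basename(f).lower().strip()
    for format_name in format_name_list:
        if format_name in fname:
            pre_post_pattern = fname.replace(format_name, "")
            format_groups.setdefault(pre_post_pattern, []).append(f)
            break

# {'a__b': ['/data/a_OUTCAR_b', '/data/a_INCAR_b'],
#  'c__d': ['/data/c_OUTCAR_d']}
print(format_groups)
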
def run_extractors(input_queue, output_queue, queue_done, extract_params):
    """Extract data files.

    Returns:
    list of dict: The metadata extracted from the file.
                  Will be empty if no selected extractor can extract data.
    """
    source_id = extract_params.get("dataset",
                                   {}).get("mdf",
                                           {}).get("source_id", "unknown")
    try:
        # Extract each group from the queue
        # Exit loop when queue_done is True and no groups remain
        while True:
            # Fetch group from queue
            try:
                group_info = input_queue.get(timeout=5)
            # No group fetched
            except Empty:
                # Queue is permanently depleted, stop processing
                if queue_done.value:
                    break
                # Queue is still active, try again
                else:
                    continue

            # Process fetched group
            single_record = {}
            multi_records = []
            for extractor_name in (group_info["extractors"]
                                   or ALL_EXTRACTORS.keys()):
                try:
                    specific_params = mdf_toolbox.dict_merge(
                        extract_params or {}, group_info["params"])
                    extractor_res = ALL_EXTRACTORS[extractor_name](
                        group=group_info["files"], params=specific_params)
                except Exception as e:
                    logger.warning(
                        ("{} Extractor {} failed with "
                         "exception {}").format(source_id, extractor_name,
                                                repr(e)))
                else:
                    # If a list of one record was returned, treat as single record
                    # Eliminates [{}] from cluttering feedstock
                    # Filters one-record results from extractors that always return lists
                    if isinstance(extractor_res,
                                  list) and len(extractor_res) == 1:
                        extractor_res = extractor_res[0]
                    # Only process actual results
                    if extractor_res:
                        # If a single record was returned, merge with others
                        if isinstance(extractor_res, dict):
                            single_record = mdf_toolbox.dict_merge(
                                single_record, extractor_res)
                        # If multiple records were returned, add to list
                        elif isinstance(extractor_res, list):
                            # Only add records with data
                            multi_records.extend(
                                rec for rec in extractor_res if rec)
                        # Else, panic
                        else:
                            raise TypeError(
                                ("Extractor '{p}' returned "
                                 "type '{t}'!").format(p=extractor_name,
                                                       t=type(extractor_res)))
                        logger.debug("{}: {} extractd {}".format(
                            source_id, extractor_name, group_info["files"]))
                    elif SUPER_DEBUG:
                        logger.debug("{}: {} could not extract {}".format(
                            source_id, extractor_name, group_info))
            # Merge the single_record into all multi_records if both exist
            if single_record and multi_records:
                records = [
                    mdf_toolbox.dict_merge(r, single_record)
                    for r in multi_records if r
                ]
            # Else, if single_record exists, make it a list
            elif single_record:
                records = [single_record]
            # Otherwise, use the list of records if it exists
            elif multi_records:
                records = multi_records
            # If nothing exists, make a blank list
            else:
                records = []

            # Push records to output queue
            # Get the file info
            try:
                file_info = _extract_file_info(group=group_info["files"],
                                               params=extract_params)
            except Exception as e:
                logger.warning("{}: File info extractor failed: {}".format(
                    source_id, repr(e)))
                # Fall back to an empty file block so records can still be merged below
                file_info = {}
            for record in records:
                # TODO: Should files be handled differently?
                record = mdf_toolbox.dict_merge(record, file_info)
                output_queue.put(json.dumps(record))
    except Exception as e:
        logger.error("{}: Extractor error: {}".format(source_id, str(e)))
    # Log all exceptions!
    except BaseException as e:
        logger.error("{}: Extractor BaseException: {}".format(
            source_id, str(e)))
    return
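
A minimal sketch of the record assembly at the end of the loop above: when a file group produces both a shared single_record and a list of per-entry records, the shared block is merged into every entry, with each entry's own values taking precedence (toy data):

import mdf_toolbox

single_record = {"material": {"composition": "Fe2O3"}}
multi_records = [{"dft": {"converged": True}}, {"dft": {"converged": False}}]

records = [
    mdf_toolbox.dict_merge(r, single_record) for r in multi_records if r
]
# [{'dft': {'converged': True}, 'material': {'composition': 'Fe2O3'}},
#  {'dft': {'converged': False}, 'material': {'composition': 'Fe2O3'}}]
print(records)
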
Example #10
import os

from mdf_toolbox import dict_merge

from .base import BASE_CONFIG
from .catalogs import KNOWN_CATALOGS
from .dev import DEV
from .keys import KEYS
from .prod import PROD
from .schemas import INPUT_SCHEMA, OUTPUT_SCHEMA
from .staging import STAGING

# Config setup
CONFIG = {
    "INPUT_SCHEMA": INPUT_SCHEMA,
    "OUTPUT_SCHEMA": OUTPUT_SCHEMA,
    "DEFAULT_ACLS": DEFAULT_ACLS,
    "KNOWN_CATALOGS": KNOWN_CATALOGS
}
CONFIG = dict_merge(BASE_CONFIG, CONFIG)
CONFIG = dict_merge(KEYS, CONFIG)

# Server-specific config will overwrite previous base values if any
server = os.environ.get("FLASK_ENV")
if server == "prod":
    CONFIG = dict_merge(PROD, CONFIG)
elif server == "staging":
    CONFIG = dict_merge(STAGING, CONFIG)
elif server == "dev":
    CONFIG = dict_merge(DEV, CONFIG)
else:
    raise EnvironmentError(
        "FLASK_ENV not correctly set! FLASK_ENV must be 'prod', 'staging',"
        " or 'dev' to use any part of this Action Provider.")
Example #11
import os

from mdf_toolbox import dict_merge

from mdf_connect_server.config import (DEFAULT, DEV, GLOBUS_HTTP_HOSTS,
                                       GROUPINGS, KEYS, PROD)

CONFIG = {}
CONFIG = dict_merge(DEFAULT, CONFIG)
CONFIG = dict_merge(KEYS, CONFIG)

server = os.environ.get("FLASK_ENV")
if server == "production":
    CONFIG = dict_merge(PROD, CONFIG)
elif server == "development":
    CONFIG = dict_merge(DEV, CONFIG)
else:
    raise EnvironmentError(
        "FLASK_ENV not correctly set! FLASK_ENV must be 'production'"
        " or 'development', even for processing only.")
CONFIG["GLOBUS_HTTP_HOSTS"] = GLOBUS_HTTP_HOSTS
CONFIG["GROUPING_RULES"] = GROUPINGS
# Add credentials
CONFIG["GLOBUS_CREDS"] = {
    "client_id": CONFIG["API_CLIENT_ID"],
    "client_secret": CONFIG["API_CLIENT_SECRET"]
}

# Make required dirs
os.makedirs(CONFIG["LOCAL_PATH"], exist_ok=True)
os.makedirs(CONFIG["FEEDSTOCK_PATH"], exist_ok=True)
Example #12
def test_dict_merge():
    base = {
        "base_key": "base",
        "both_key": "base",
        "level2": {
            "base_key": "base",
            "both_key": "base",
            "level3": {
                "base_key": "base",
                "both_key": "base",
                "mismatch_key": "string"
            }
        }
    }
    add = {
        "both_key": "add",
        "add_key": "add",
        "level2": {
            "both_key": "add",
            "add_key": "add",
            "level3": {
                "both_key": "add",
                "add_key": "add",
                "mismatch_key": 10,
                "level4": {
                    "add_key": "add"
                }
            }
        }
    }
    merged = {
        "base_key": "base",
        "both_key": "base",
        "add_key": "add",
        "level2": {
            "base_key": "base",
            "both_key": "base",
            "add_key": "add",
            "level3": {
                "base_key": "base",
                "both_key": "base",
                "add_key": "add",
                "mismatch_key": "string",
                "level4": {
                    "add_key": "add"
                }
            }
        }
    }
    b_list = {"list_field": ["base"]}
    a_list = {"list_field": ["add"]}
    m_list = {"list_field": ["base", "add"]}
    a_list_bad = {"list_field": "foo"}
    # Proper use
    old_base = deepcopy(base)
    old_add = deepcopy(add)
    assert mdf_toolbox.dict_merge(base, add) == merged
    # Originals should be unchanged
    assert base == old_base
    assert add == old_add

    # Test list appending
    # No appending
    assert mdf_toolbox.dict_merge(b_list, a_list, append_lists=False) == b_list
    # With appending
    assert mdf_toolbox.dict_merge(b_list, a_list, append_lists=True) == m_list
    # With mismatched data types
    assert mdf_toolbox.dict_merge(b_list, a_list_bad,
                                  append_lists=False) == b_list
    assert mdf_toolbox.dict_merge(b_list, a_list_bad,
                                  append_lists=True) == b_list

    assert mdf_toolbox.dict_merge({}, {}) == {}

    # Check errors
    with pytest.raises(TypeError):
        mdf_toolbox.dict_merge(1, {})
    with pytest.raises(TypeError):
        mdf_toolbox.dict_merge({}, "a")
    with pytest.raises(TypeError):
        mdf_toolbox.dict_merge([], [])
def submission_driver(metadata, sub_conf, source_id, access_token, user_id):
    """The driver function for MOC.
    Modifies the status database as steps are completed.

    Arguments:
    metadata (dict): The JSON passed to /submit.
    sub_conf (dict): Submission configuration information.
    source_id (str): The source name of this submission.
    access_token (str): The Globus Auth access token for the submitting user.
    user_id (str): The Globus ID of the submitting user.
    """
    # Setup
    utils.update_status(source_id, "sub_start", "P", except_on_fail=True)
    utils.modify_status_entry(source_id, {
        "pid": os.getpid(),
        "hibernating": False
    },
                              except_on_fail=True)
    try:
        # Connect auth
        # CAAC required for user auth later
        mdf_conf_client = globus_sdk.ConfidentialAppAuthClient(
            CONFIG["API_CLIENT_ID"], CONFIG["API_CLIENT_SECRET"])
        mdf_creds = mdf_toolbox.dict_merge(CONFIG["GLOBUS_CREDS"],
                                           {"services": ["transfer"]})
        mdf_clients = mdf_toolbox.confidential_login(**mdf_creds)
        mdf_transfer_client = mdf_clients["transfer"]

        # User auth
        # When coming from curation, the access token (from the curator) is not used
        access_token = access_token.replace("Bearer ", "")
        dependent_grant = mdf_conf_client.oauth2_get_dependent_tokens(
            access_token)
        # Get specifically Transfer's access token
        for grant in dependent_grant.data:
            if grant["resource_server"] == "transfer.api.globus.org":
                user_transfer_token = grant["access_token"]
        user_transfer_authorizer = globus_sdk.AccessTokenAuthorizer(
            user_transfer_token)
        user_transfer_client = globus_sdk.TransferClient(
            authorizer=user_transfer_authorizer)
    except Exception as e:
        utils.update_status(source_id,
                            "sub_start",
                            "F",
                            text=repr(e),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return

    # Cancel the previous version(s)
    source_info = utils.split_source_id(source_id)
    scan_res = utils.scan_table(table_name="status",
                                fields=["source_id", "active"],
                                filters=[("source_id", "^",
                                          source_info["source_name"]),
                                         ("source_id", "<", source_id)])
    if not scan_res["success"]:
        utils.update_status(source_id,
                            "sub_start",
                            "F",
                            text=scan_res["error"],
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return

    old_source_ids = [
        oldsub["source_id"] for oldsub in scan_res["results"]
        if oldsub["active"]
    ]
    if old_source_ids:
        utils.update_status(
            source_id,
            "sub_start",
            "M",
            text=("The following submissions will be cancelled: {}".format(
                old_source_ids)),
            except_on_fail=True)
        utils.update_status(source_id, "old_cancel", "P", except_on_fail=True)

        for old_source_id in old_source_ids:
            cancel_res = utils.cancel_submission(old_source_id, wait=True)
            if not cancel_res["stopped"]:
                utils.update_status(
                    source_id,
                    "sub_start",
                    "F",
                    text=cancel_res.get(
                        "error", ("Unable to cancel previous "
                                  "submission '{}'").format(old_source_id)),
                    except_on_fail=True)
                utils.complete_submission(source_id)
                return
            if cancel_res["success"]:
                logger.info("{}: Cancelled source_id {}".format(
                    source_id, old_source_id))
            else:
                logger.debug("{}: Stopped source_id {}".format(
                    source_id, old_source_id))
        utils.update_status(source_id, "old_cancel", "S", except_on_fail=True)
    else:
        utils.update_status(source_id, "sub_start", "S", except_on_fail=True)
        utils.update_status(source_id, "old_cancel", "N", except_on_fail=True)

    # NOTE: Cancellation point
    if utils.read_table("status", source_id).get("status",
                                                 {}).get("cancelled"):
        logger.debug("{}: Cancel signal acknowledged".format(source_id))
        utils.complete_submission(source_id)
        return

    local_path = os.path.join(CONFIG["LOCAL_PATH"], source_id) + "/"
    feedstock_file = os.path.join(CONFIG["FEEDSTOCK_PATH"],
                                  source_id + ".json")
    curation_state_file = os.path.join(CONFIG["CURATION_DATA"],
                                       source_id + ".json")
    service_data = os.path.join(CONFIG["SERVICE_DATA"], source_id) + "/"
    os.makedirs(service_data, exist_ok=True)
    num_files = 0
    # Curation skip point
    if type(sub_conf["curation"]) is not str:
        # If we're extracting, download data locally, then set canon source to local
        # This allows non-Globus sources (because the data is downloaded to Connect's EP)
        if not sub_conf["no_extract"]:
            utils.update_status(source_id,
                                "data_download",
                                "P",
                                except_on_fail=True)
            try:
                # Download from user
                for dl_res in utils.download_data(
                        user_transfer_client,
                        sub_conf["data_sources"],
                        CONFIG["LOCAL_EP"],
                        local_path,
                        admin_client=mdf_transfer_client,
                        user_id=user_id):
                    if not dl_res["success"]:
                        msg = "During data download: " + dl_res["error"]
                        utils.update_status(source_id,
                                            "data_download",
                                            "T",
                                            text=msg,
                                            except_on_fail=True)
                if not dl_res["success"]:
                    raise ValueError(dl_res["error"])
                num_files = dl_res["total_files"]

            except Exception as e:
                utils.update_status(source_id,
                                    "data_download",
                                    "F",
                                    text=repr(e),
                                    except_on_fail=True)
                utils.complete_submission(source_id)
                return

            utils.update_status(
                source_id,
                "data_download",
                "M",
                text=(
                    "{} files will be grouped and extracted (from {} archives)"
                    .format(num_files, dl_res["num_extracted"])),
                except_on_fail=True)
            canon_data_sources = [
                "globus://{}{}".format(CONFIG["LOCAL_EP"], local_path)
            ]

        # If we're not extracting, set canon source to only source
        # Also create local dir with no data to "extract" for dataset entry
        else:
            utils.update_status(source_id,
                                "data_download",
                                "N",
                                except_on_fail=True)
            os.makedirs(local_path)
            canon_data_sources = sub_conf["data_sources"]

        # Move data from canon source(s) to canon dest (if different)
        utils.update_status(source_id,
                            "data_transfer",
                            "P",
                            except_on_fail=True)
        # If not extracting, set up user TC for backup use
        if sub_conf["no_extract"]:
            backup_user_id = user_id
            backup_user_client = user_transfer_client
        else:
            backup_user_id = None
            backup_user_client = None
        for data_source in canon_data_sources:
            if data_source != sub_conf["canon_destination"]:
                logger.debug("Data transfer: '{}' to '{}'".format(
                    data_source, sub_conf["canon_destination"]))
                try:
                    for backup_res in utils.backup_data(
                            mdf_transfer_client,
                            data_source,
                            sub_conf["canon_destination"],
                            acl=sub_conf["storage_acl"],
                            data_client=backup_user_client,
                            data_user=backup_user_id):
                        if not backup_res["success"]:
                            msg = ("During data download: {}".format(
                                backup_res.get("error", "Unknown error")))
                            utils.update_status(source_id,
                                                "data_transfer",
                                                "T",
                                                text=msg,
                                                except_on_fail=True)
                    if not backup_res["success"]:
                        raise ValueError(backup_res.get("error"))
                    elif not backup_res[
                            sub_conf["canon_destination"]]["success"]:
                        raise ValueError(
                            backup_res[sub_conf["canon_destination"]]["error"])
                except Exception as e:
                    err_text = (
                        "Transfer from '{}' to primary/canon destination '{}' failed: {}"
                        .format(data_source, sub_conf["canon_destination"],
                                str(e)))
                    utils.update_status(source_id,
                                        "data_transfer",
                                        "F",
                                        text=err_text,
                                        except_on_fail=True)
                    return
        utils.update_status(source_id,
                            "data_transfer",
                            "S",
                            except_on_fail=True)

        # Add file info data
        sub_conf["index"]["file"] = {
            "globus_host": sub_conf["canon_destination"],
            "http_host": utils.lookup_http_host(sub_conf["canon_destination"]),
            "local_path": local_path,
        }
        extract_params = {
            "dataset":
            metadata,
            "extractors":
            sub_conf["index"],
            "service_data":
            service_data,
            "feedstock_file":
            feedstock_file,
            "group_config":
            mdf_toolbox.dict_merge(sub_conf["extraction_config"],
                                   CONFIG["GROUPING_RULES"]),
            "validation_info": {
                "project_blocks": sub_conf.get("project_blocks", []),
                "required_fields": sub_conf.get("required_fields", []),
                "allowed_nulls": CONFIG["SCHEMA_NULLS"],
                "base_acl": sub_conf["acl"]
            }
        }

        # NOTE: Cancellation point
        if utils.read_table("status", source_id).get("status",
                                                     {}).get("cancelled"):
            logger.debug("{}: Cancel signal acknowledged".format(source_id))
            utils.complete_submission(source_id)
            return

        # Extract data
        utils.update_status(source_id, "extracting", "P", except_on_fail=True)
        try:
            extract_res = start_extractors(local_path, extract_params)
            if not extract_res["success"]:
                utils.update_status(source_id,
                                    "extracting",
                                    "F",
                                    text=extract_res["error"],
                                    except_on_fail=True)
                return
            dataset = extract_res["dataset"]
            num_records = extract_res["num_records"]
            num_groups = extract_res["num_groups"]
            extensions = extract_res["extensions"]
        except Exception as e:
            utils.update_status(source_id,
                                "extracting",
                                "F",
                                text=repr(e),
                                except_on_fail=True)
            utils.complete_submission(source_id)
            return
        else:
            utils.modify_status_entry(source_id, {"extensions": extensions})
            # If nothing in dataset, panic
            if not dataset:
                utils.update_status(source_id,
                                    "extracting",
                                    "F",
                                    text="Could not process dataset entry",
                                    except_on_fail=True)
                utils.complete_submission(source_id)
                return
            # If not extracting, show status as skipped
            # Also check if records were extracted inappropriately, flag error in log
            elif sub_conf.get("no_extract"):
                if num_records != 0:
                    logger.error(
                        "{}: Records extracted with no_extract flag ({} records)"
                        .format(source_id, num_records))
                utils.update_status(source_id,
                                    "extracting",
                                    "N",
                                    except_on_fail=True)
            else:
                utils.update_status(
                    source_id,
                    "extracting",
                    "M",
                    text=("{} metadata records extracted out of {} file groups"
                          .format(num_records, num_groups)),
                    except_on_fail=True)
            logger.debug("{}: {} entries extracted".format(
                source_id, num_records + 1))

        # NOTE: Cancellation point
        if utils.read_table("status", source_id).get("status",
                                                     {}).get("cancelled"):
            logger.debug("{}: Cancel signal acknowledged".format(source_id))
            utils.complete_submission(source_id)
            return

        ###################
        #  Curation step  #
        ###################
        # Trigger curation if required
        if sub_conf.get("curation"):
            utils.update_status(source_id,
                                "curation",
                                "P",
                                except_on_fail=True)
            # Create curation task in curation table
            with open(feedstock_file) as f:
                # Discard dataset entry
                f.readline()
                # Save first few records
                # Append the json-loaded form of records
                # The number of records should be at most the default number,
                # and fewer if fewer are present
                curation_records = []
                for _ in range(
                        min(CONFIG["NUM_CURATION_RECORDS"], num_records)):
                    curation_records.append(json.loads(f.readline()))
            curation_dataset = deepcopy(dataset)
            # Numbers can be converted into Decimal by DynamoDB, which causes JSON errors
            curation_dataset["mdf"].pop("scroll_id", None)
            curation_dataset["mdf"].pop("version", None)
            curation_task = {
                "source_id":
                source_id,
                "allowed_curators":
                sub_conf.get("permission_groups", sub_conf["acl"]),
                "dataset":
                json.dumps(dataset),
                "sample_records":
                json.dumps(curation_records),
                "submission_info":
                sub_conf,
                "extraction_summary":
                ("{} records were extracted out of {} groups from {} files".
                 format(num_records, num_groups, num_files)),
                "curation_start_date":
                str(datetime.today())
            }
            # If no allowed curators or public allowed, set to public
            if (not curation_task["allowed_curators"]
                    or "public" in curation_task["allowed_curators"]):
                curation_task["allowed_curators"] = ["public"]

            # Create task in database
            create_res = utils.create_curation_task(curation_task)
            if not create_res["success"]:
                utils.update_status(source_id,
                                    "curation",
                                    "F",
                                    text=create_res.get(
                                        "error",
                                        "Unable to create curation task"),
                                    except_on_fail=True)
                return

            # Save state
            os.makedirs(CONFIG["CURATION_DATA"], exist_ok=True)
            with open(curation_state_file, 'w') as save_file:
                state_data = {
                    "source_id": source_id,
                    "sub_conf": sub_conf,
                    "dataset": dataset
                }
                json.dump(state_data, save_file)
                logger.debug("{}: Saved state for curation".format(source_id))

            # Trigger hibernation
            utils.modify_status_entry(source_id, {"hibernating": True},
                                      except_on_fail=True)
            return
        else:
            utils.update_status(source_id,
                                "curation",
                                "N",
                                except_on_fail=True)

    # Returning from curation
    # Submission accepted
    elif sub_conf["curation"].startswith("Accept"):
        # Save curation message
        curation_message = sub_conf["curation"]
        # Load state
        with open(curation_state_file) as save_file:
            state_data = json.load(save_file)
            # Verify source_ids match
            if state_data["source_id"] != source_id:
                logger.error("State data incorrect: '{}' is not '{}'".format(
                    state_data["source_id"], source_id))
                utils.update_status(source_id,
                                    "curation",
                                    "F",
                                    text="Submission corrupted",
                                    except_on_fail=True)
                return
            # Load state variables back
            sub_conf = state_data["sub_conf"]
            dataset = state_data["dataset"]
        logger.debug("{}: Loaded state from curation".format(source_id))
        # Delete state file
        try:
            os.remove(curation_state_file)
        except FileNotFoundError:
            utils.update_status(
                source_id,
                "curation",
                "F",
                text="Unable to cleanly load curation information",
                except_on_fail=True)
            return

        # Delete curation task
        delete_res = utils.delete_from_table("curation", source_id)
        if not delete_res["success"]:
            utils.update_status(source_id,
                                "curation",
                                "F",
                                text=delete_res.get("error",
                                                    "Curation cleanup failed"),
                                except_on_fail=True)
            return
        utils.update_status(source_id,
                            "curation",
                            "M",
                            text=curation_message,
                            except_on_fail=True)
    # Submission rejected
    elif sub_conf["curation"].startswith("Reject"):
        # Delete state file
        try:
            os.remove(curation_state_file)
        except FileNotFoundError:
            logger.error(
                "{}: Unable to delete curation state file '{}'".format(
                    source_id, curation_state_file))
        # Delete curation task
        delete_res = utils.delete_from_table("curation", source_id)
        if not delete_res["success"]:
            logger.error(
                "{}: Unable to delete rejected curation from database: {}".
                format(source_id, delete_res.get("error")))

        utils.update_status(source_id,
                            "curation",
                            "F",
                            text=sub_conf["curation"],
                            except_on_fail=True)
        return
    # Curation invalid
    else:
        utils.update_status(source_id,
                            "curation",
                            "F",
                            text="Unknown curation state: '{}'".format(
                                sub_conf["curation"]),
                            except_on_fail=True)
        return

    ###################
    #  Post-curation  #
    ###################

    # Integrations
    service_res = {}

    # NOTE: Cancellation point
    if utils.read_table("status", source_id).get("status",
                                                 {}).get("cancelled"):
        logger.debug("{}: Cancel signal acknowledged".format(source_id))
        utils.complete_submission(source_id)
        return

    # MDF Search (mandatory)
    utils.update_status(source_id, "ingest_search", "P", except_on_fail=True)
    search_config = sub_conf["services"].get("mdf_search", {})
    try:
        search_args = {
            "feedstock_file": feedstock_file,
            "source_id": source_id,
            "index": search_config.get("index", CONFIG["INGEST_INDEX"]),
            "delete_existing": True,
            "batch_size": CONFIG["SEARCH_BATCH_SIZE"]
        }
        search_res = utils.search_ingest(**search_args)
        if not search_res["success"]:
            utils.update_status(source_id,
                                "ingest_search",
                                "F",
                                text="; ".join(search_res["errors"]),
                                except_on_fail=True)
            return
    except Exception as e:
        utils.update_status(source_id,
                            "ingest_search",
                            "F",
                            text=repr(e),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return
    else:
        # Handle errors
        if len(search_res["errors"]) > 0:
            utils.update_status(
                source_id,
                "ingest_search",
                "F",
                text=(
                    "{} batches of records failed to ingest (up to {} records "
                    "total)").format(len(search_res["errors"]),
                                     (len(search_res["errors"]) *
                                      CONFIG["SEARCH_BATCH_SIZE"])),
                except_on_fail=True)
            utils.complete_submission(source_id)
            return

        utils.update_status(source_id,
                            "ingest_search",
                            "S",
                            except_on_fail=True)
        os.remove(feedstock_file)
        service_res["mdf_search"] = "This dataset was ingested to MDF Search."

    # Move files to data_destinations
    if sub_conf.get("data_destinations"):
        utils.update_status(source_id,
                            "ingest_backup",
                            "P",
                            except_on_fail=True)
        try:
            for backup_res in utils.backup_data(
                    mdf_transfer_client,
                    storage_loc=sub_conf["canon_destination"],
                    backup_locs=sub_conf["data_destinations"],
                    acl=sub_conf["storage_acl"]):
                if not backup_res["success"]:
                    msg = "During data backup: " + backup_res.get(
                        "error", "Unknown error")
                    utils.update_status(source_id,
                                        "ingest_backup",
                                        "T",
                                        text=msg,
                                        except_on_fail=True)
            if not backup_res["success"]:
                raise ValueError(backup_res.get("error"))
        except Exception as e:
            err_msg = "Destination backup failed: {}".format(str(e))
            utils.update_status(source_id,
                                "ingest_backup",
                                "F",
                                text=err_msg,
                                except_on_fail=True)
            return
        # On any complete failure, fail submission
        if not all([val["success"] is True for val in backup_res.values()]):
            err_msg = "; ".join([
                "'{}' failed: {}".format(k, v["error"])
                for k, v in backup_res.items() if v["success"] is not True
            ])
            utils.update_status(source_id,
                                "ingest_backup",
                                "F",
                                text=err_msg,
                                except_on_fail=True)
            return
        # On an error with a successful Transfer, notify user but continue
        elif not all([val["error"] == "" for val in backup_res.values()]):
            err_msg = "; ".join([
                "on '{}': {}".format(k, v["error"])
                for k, v in backup_res.items() if v["error"]
            ])
            utils.update_status(source_id,
                                "ingest_backup",
                                "R",
                                text=err_msg,
                                except_on_fail=True)
        else:
            utils.update_status(source_id,
                                "ingest_backup",
                                "S",
                                except_on_fail=True)
    else:
        utils.update_status(source_id,
                            "ingest_backup",
                            "N",
                            except_on_fail=True)

    # MDF Publish
    if sub_conf["services"].get("mdf_publish"):
        publish_conf = sub_conf["services"]["mdf_publish"]

        # Data already moved to canon dest as a requirement of success so far

        # Mint DOI
        try:
            # Create DOI and add to dataset DC
            dataset["dc"]["identifier"] = {
                "identifier": utils.make_dc_doi(test=publish_conf["doi_test"]),
                "identifierType": "DOI"
            }
            # Add publication dates and publisher
            dataset["dc"]["publisher"] = "Materials Data Facility"
            dataset["dc"]["publicationYear"] = datetime.now().year
            if not dataset["dc"].get("dates"):
                dataset["dc"]["dates"] = []
            dataset["dc"]["dates"].append({
                "date": str(datetime.now().date()),
                "dateType": "Accepted"
            })
            landing_page = CONFIG["DATASET_LANDING_PAGE"].format(source_id)
            mdf_publish_res = utils.datacite_mint_doi(
                dataset["dc"], test=publish_conf["doi_test"], url=landing_page)
        except Exception as e:
            logger.error("DOI minting exception: {}".format(repr(e)))
            utils.update_status(source_id,
                                "ingest_publish",
                                "F",
                                text="DOI minting failed",
                                except_on_fail=True)
            return
        else:
            if not mdf_publish_res["success"]:
                logger.error("DOI minting failed: {}".format(
                    mdf_publish_res["error"]))
                utils.update_status(source_id,
                                    "ingest_publish",
                                    "F",
                                    text="Unable to mint DOI for publication",
                                    except_on_fail=True)
                return

        utils.update_status(
            source_id,
            "ingest_publish",
            "L",
            text=("Dataset published though MDF Publish with DOI '{}'".format(
                dataset["dc"]["identifier"]["identifier"])),
            link=landing_page,
            except_on_fail=True)
        service_res["mdf_publish"] = landing_page

    else:
        utils.update_status(source_id,
                            "ingest_publish",
                            "N",
                            except_on_fail=True)

    # Citrine (skip if not extracted)
    if sub_conf["services"].get("citrine") and not sub_conf.get("no_extract"):
        utils.update_status(source_id,
                            "ingest_citrine",
                            "P",
                            except_on_fail=True)

        # Get old Citrine dataset version, if exists
        scan_res = utils.scan_table(table_name="status",
                                    fields=["source_id", "citrine_id"],
                                    filters=[("source_name", "==",
                                              source_info["source_name"]),
                                             ("citrine_id", "!=", None)])
        if not scan_res["success"]:
            logger.error("Status scan failed: {}".format(scan_res["error"]))
        old_cit_subs = scan_res.get("results", [])
        if len(old_cit_subs) == 0:
            old_citrine_id = None
        elif len(old_cit_subs) == 1:
            old_citrine_id = old_cit_subs[0]["citrine_id"]
        else:
            old_citrine_id = max([sub["citrine_id"] for sub in old_cit_subs])

        try:
            # Check for PIFs to ingest
            cit_path = os.path.join(service_data, "citrine")
            if len(os.listdir(cit_path)) > 0:
                cit_res = utils.citrine_upload(
                    cit_path,
                    CONFIG["CITRINATION_API_KEY"],
                    dataset,
                    old_citrine_id,
                    public=sub_conf["services"]["citrine"].get("public", True))
            else:
                cit_res = {
                    "success": False,
                    "error": "No PIFs were generated from this dataset",
                    "success_count": 0,
                    "failure_count": 0
                }
        except Exception as e:
            utils.update_status(source_id,
                                "ingest_citrine",
                                "R",
                                text=str(e),
                                except_on_fail=True)
        else:
            if not cit_res["success"]:
                if cit_res.get("error"):
                    text = cit_res["error"]
                elif cit_res.get("failure_count"):
                    text = "All {} PIFs failed to upload".format(
                        cit_res["failure_count"])
                elif cit_res.get("failure_count") == 0:
                    text = "No PIFs were found"
                    logger.warning("{}: PIFs not found!".format(source_id))
                else:
                    text = "An error prevented PIF uploading"
                utils.update_status(source_id,
                                    "ingest_citrine",
                                    "R",
                                    text=text,
                                    except_on_fail=True)
            else:
                text = "{}/{} PIFs uploaded successfully".format(
                    cit_res["success_count"],
                    cit_res["success_count"] + cit_res["failure_count"])
                link = CONFIG["CITRINATION_LINK"].format(
                    cit_ds_id=cit_res["cit_ds_id"])
                utils.update_status(source_id,
                                    "ingest_citrine",
                                    "L",
                                    text=text,
                                    link=link,
                                    except_on_fail=True)
                stat_res_2 = utils.modify_status_entry(
                    source_id, {"citrine_id": cit_res["cit_ds_id"]})
                if not stat_res_2["success"]:
                    raise ValueError(str(stat_res_2))
                service_res["citrine"] = link
    else:
        utils.update_status(source_id,
                            "ingest_citrine",
                            "N",
                            except_on_fail=True)

    # MRR
    if sub_conf["services"].get("mrr"):
        utils.update_status(source_id, "ingest_mrr", "P", except_on_fail=True)
        try:
            if (isinstance(sub_conf["services"]["mrr"], dict)
                    and sub_conf["services"]["mrr"].get("test")):
                mrr_title = "TEST_" + dataset["dc"]["titles"][0]["title"]
            else:
                mrr_title = dataset["dc"]["titles"][0]["title"]
            mrr_contributors = ""
            for author in dataset["dc"]["creators"]:
                mrr_contributors += CONFIG["MRR_CONTRIBUTOR"].format(
                    name=(author.get("givenName", "") + " " +
                          author.get("familyName", "")),
                    affiliation=author.get("affiliation", ""))
            mrr_description = ""
            for desc in dataset["dc"].get("descriptions", []):
                mrr_description += desc["description"] + " "
            # Must add at least one subject to MRR entry
            mrr_subjects = "<subject>MDF Dataset</subject>"
            for subj in dataset["dc"].get("subjects", []):
                mrr_subjects += "<subject>" + subj["subject"] + "</subject>"
            mrr_entry = {
                "title":
                dataset["dc"]["titles"][0]["title"],
                "template":
                CONFIG["MRR_SCHEMA"],
                "xml_content":
                CONFIG["MRR_TEMPLATE"].format(
                    title=mrr_title,
                    publisher=dataset["dc"]["publisher"],
                    contributors=mrr_contributors,
                    contact_name=dataset["dc"]["creators"][0]["creatorName"],
                    description=mrr_description,
                    subjects=mrr_subjects,
                    landing_page=CONFIG["DATASET_LANDING_PAGE"].format(
                        source_id))
            }
        except Exception as e:
            utils.update_status(source_id,
                                "ingest_mrr",
                                "R",
                                text="Unable to create MRR metadata:" +
                                repr(e),
                                except_on_fail=True)
        else:
            try:
                mrr_res_raw = requests.post(CONFIG["MRR_URL"],
                                            auth=(CONFIG["MRR_USERNAME"],
                                                  CONFIG["MRR_PASSWORD"]),
                                            data=mrr_entry)
                try:
                    mrr_res = mrr_res_raw.json()
                except json.JSONDecodeError:
                    raise ValueError("Invalid MRR response: {}".format(
                        mrr_res_raw.content))

                if mrr_res_raw.status_code not in [201, 202]:
                    raise ValueError(
                        "MRR ingest failed with error code {}: '{}'".format(
                            mrr_res_raw.status_code, mrr_res))
            except Exception as e:
                utils.update_status(source_id,
                                    "ingest_mrr",
                                    "R",
                                    text="Unable to submit MRR entry: " +
                                    repr(e),
                                    except_on_fail=True)
            else:
                try:
                    mrr_id = mrr_res.get("id")
                    if not mrr_id:
                        raise ValueError("MRR entry has no ID")
                except Exception:
                    utils.update_status(source_id,
                                        "ingest_mrr",
                                        "R",
                                        text=mrr_res.get(
                                            "message", "Unknown MRR failure"),
                                        except_on_fail=True)
                else:
                    text = "Dataset successfully registered with the MRR"
                    mrr_link = CONFIG["MRR_LINK"].format(mrr_id)
                    utils.update_status(source_id,
                                        "ingest_mrr",
                                        "L",
                                        text=text,
                                        link=mrr_link,
                                        except_on_fail=True)
                    service_res["mrr"] = mrr_link
    else:
        utils.update_status(source_id, "ingest_mrr", "N", except_on_fail=True)

    # Dataset update, start cleanup
    utils.update_status(source_id, "ingest_cleanup", "P", except_on_fail=True)

    dataset["services"] = service_res
    ds_update = utils.update_search_entries(search_config.get(
        "index", CONFIG["INGEST_INDEX"]),
                                            entries=[dataset],
                                            overwrite=False)
    if not ds_update["success"]:
        utils.update_status(source_id,
                            "ingest_cleanup",
                            "F",
                            text=ds_update.get("error",
                                               "Unable to update dataset"),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return

    # Cleanup
    try:
        fin_res = utils.complete_submission(source_id,
                                            cleanup=CONFIG["FINAL_CLEANUP"])
    except Exception as e:
        utils.update_status(source_id,
                            "ingest_cleanup",
                            "F",
                            text=repr(e),
                            except_on_fail=True)
        return
    if not fin_res["success"]:
        utils.update_status(source_id,
                            "ingest_cleanup",
                            "F",
                            text=fin_res["error"],
                            except_on_fail=True)
        return
    utils.update_status(source_id, "ingest_cleanup", "S", except_on_fail=True)

    logger.debug("{}: Ingest complete".format(source_id))
    return {"success": True, "source_id": source_id}