def citrine_upload(citrine_data,
                   api_key,
                   mdf_dataset,
                   previous_id=None,
                   public=True):
    """Upload a directory of PIF files to a Citrination dataset.

    Arguments:
    citrine_data (str): Path to the directory of PIFs to upload.
    api_key (str): A Citrination API key.
    mdf_dataset (dict): The MDF dataset entry, used for the title and description.
    previous_id (int): The ID of an existing Citrination dataset to revise.
                       Default None, which creates a new dataset.
    public (bool): Should the dataset be made public after upload? Default True.

    Returns:
    dict: The success flag, the Citrination dataset ID, and upload counts.
    """
    import os
    from citrination_client import CitrinationClient

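    # The .data attribute exposes the dataset/file client (the DataClient in
    # citrination_client 4.x), which provides create_dataset,
    # create_dataset_version, update_dataset, and upload.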
    cit_client = CitrinationClient(api_key).data
    source_id = mdf_dataset.get("mdf", {}).get("source_id", "NO_ID")
    try:
        cit_title = mdf_dataset["dc"]["titles"][0]["title"]
    except (KeyError, IndexError, TypeError):
        cit_title = "Untitled"
    try:
        cit_desc = " ".join([
            desc["description"] for desc in mdf_dataset["dc"]["descriptions"]
        ])
        if not cit_desc:
            # Treat an empty description the same as a missing one
            raise KeyError
    except (KeyError, IndexError, TypeError):
        cit_desc = None

    # Create new version if dataset previously created
    if previous_id:
        try:
            rev_res = cit_client.create_dataset_version(previous_id)
            assert rev_res.number > 1
        except Exception:
            # Revision failed; fall through to creating a new dataset below
            previous_id = "INVALID"
        else:
            cit_ds_id = previous_id
            # Keep the dataset private until the upload completes
            cit_client.update_dataset(cit_ds_id,
                                      name=cit_title,
                                      description=cit_desc,
                                      public=False)
    # Create new dataset if not created
    if not previous_id or previous_id == "INVALID":
        try:
            cit_ds_id = cit_client.create_dataset(name=cit_title,
                                                  description=cit_desc,
                                                  public=False).id
            assert cit_ds_id > 0
        except Exception as e:
            print("{}: Citrine dataset creation failed: {}".format(
                source_id, repr(e)))
            if previous_id == "INVALID":
                return {
                    "success": False,
                    "error": "Unable to create revision or new dataset in Citrine"
                }
            else:
                return {
                    "success": False,
                    "error": ("Unable to create Citrine dataset, "
                              "possibly due to duplicate entry")
                }

    success = 0
    failed = 0
    for path, _, files in os.walk(os.path.abspath(citrine_data)):
        for pif in files:
            up_res = cit_client.upload(cit_ds_id, os.path.join(path, pif))
            if up_res.successful():
                success += 1
            else:
                print("{}: Citrine upload failure: {}".format(
                    source_id, str(up_res)))
                failed += 1

    # Apply the requested visibility now that all uploads have finished
    cit_client.update_dataset(cit_ds_id, public=public)

    return {
        "success": bool(success),
        "cit_ds_id": cit_ds_id,
        "success_count": success,
        "failure_count": failed
    }
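

# A minimal usage sketch for citrine_upload. The directory path, environment
# variable, and dataset metadata below are hypothetical; it assumes the
# citrination_client package is installed and the API key is valid.
if __name__ == "__main__":
    import os

    demo_dataset = {
        "mdf": {"source_id": "demo_dataset_v1"},
        "dc": {
            "titles": [{"title": "Demo Dataset"}],
            "descriptions": [{"description": "A hypothetical demo upload."}]
        }
    }
    res = citrine_upload(citrine_data="/tmp/demo_pifs",
                         api_key=os.environ["CITRINATION_API_KEY"],
                         mdf_dataset=demo_dataset,
                         public=False)
    print("Dataset {}: {} uploaded, {} failed".format(
        res.get("cit_ds_id"), res.get("success_count", 0),
        res.get("failure_count", 0)))
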
def begin_convert(mdf_dataset, status_id):
    """Pull, back up, and convert metadata.

    Arguments:
    mdf_dataset (dict): The dataset submission, in the MDF schema.
    status_id (str): The ID of the status entry to update.
    """
    # Relies on module-level imports (os, json, shutil, tempfile, requests,
    # toolbox, CitrinationClient, IngesterManager, cit_utils, omniparser,
    # the Flask app object) and on helper functions defined in this module.
    # Setup
    creds = {
        "app_name": "MDF Open Connect",
        "client_id": app.config["API_CLIENT_ID"],
        "client_secret": app.config["API_CLIENT_SECRET"],
        "services": ["transfer", "publish"]
    }
    clients = toolbox.confidential_login(creds)
    mdf_transfer_client = clients["transfer"]
    globus_publish_client = clients["publish"]

    # Download data locally, back up on MDF resources
    dl_res = download_and_backup(mdf_transfer_client,
                                 mdf_dataset.pop("data", {}), status_id)
    if dl_res["success"]:
        local_path = dl_res["local_path"]
        backup_path = dl_res["backup_path"]
    else:
        raise IOError("No data downloaded")
    # TODO: Update status - data downloaded
    print("DEBUG: Data downloaded")

    print("DEBUG: Conversions started")
    # Pop indexing args
    parse_params = mdf_dataset.pop("index", {})
    add_services = mdf_dataset.pop("services", [])

    # TODO: Stream data into files instead of holding feedstock in memory
    feedstock = [mdf_dataset]

    # tags = [sub["subject"] for sub in mdf_dataset.get("dc", {}).get("subjects", [])]
    # key_info = get_key_matches(tags or None)

    # List of all files, for bag
    all_files = []

    # Citrination setup
    cit_manager = IngesterManager()
    cit_client = CitrinationClient(app.config["CITRINATION_API_KEY"])
    # Get title and description
    try:
        cit_title = mdf_dataset["dc"]["titles"][0]["title"]
    except (KeyError, IndexError):
        cit_title = "Untitled"
    try:
        cit_desc = " ".join([
            desc["description"] for desc in mdf_dataset["dc"]["descriptions"]
        ])
        if not cit_desc:
            raise KeyError
    except (KeyError, IndexError):
        cit_desc = None
    # share=0 keeps the Citrine dataset private during ingest
    # (see the TODO near the end about enabling public ingest)
    cit_ds = cit_client.create_data_set(name=cit_title,
                                        description=cit_desc,
                                        share=0).json()
    cit_ds_id = cit_ds["id"]
    print("DEBUG: Citrine dataset ID:", cit_ds_id)

    for path, _, files in os.walk(os.path.abspath(local_path)):
        # Separate files into groups, process group as unit
        for group in group_files(files):
            # Get all file metadata
            group_file_md = [
                get_file_metadata(file_path=os.path.join(path, filename),
                                  backup_path=os.path.join(
                                      backup_path,
                                      path.replace(os.path.abspath(local_path),
                                                   ""), filename))
                for filename in group
            ]
            all_files.extend(group_file_md)

            group_paths = [os.path.join(path, filename) for filename in group]

            # MDF parsing
            mdf_res = omniparser.omniparse(group_paths, parse_params)

            # Citrine parsing
            cit_pifs = cit_manager.run_extensions(
                group_paths,
                include=None,
                exclude=[],
                args={"quality_report": False})
            if not isinstance(cit_pifs, list):
                cit_pifs = [cit_pifs]
            cit_full = []
            if cit_pifs:
                cit_res = []
                # Add UIDs
                cit_pifs = cit_utils.set_uids(cit_pifs)
                for pif in cit_pifs:
                    # Get PIF URL
                    pif_land_page = {
                        "mdf": {
                            "landing_page": cit_utils.get_url(pif, cit_ds_id)
                        }
                    } if cit_ds_id else {}
                    # Make PIF into feedstock and save
                    cit_res.append(
                        toolbox.dict_merge(pif_to_feedstock(pif),
                                           pif_land_page))
                    # Add DataCite metadata
                    pif = add_dc(pif, mdf_dataset.get("dc", {}))

                    cit_full.append(pif)
            else:  # No PIFs parsed
                # TODO: Send failed datatype to Citrine for logging
                # Pad cit_res to the same length as mdf_res for "merging"
                cit_res = [{} for _ in range(len(mdf_res))]

            # If MDF parser failed to parse group, pad mdf_res to match PIF count
            if not mdf_res:
                mdf_res = [{} for _ in range(len(cit_res))]

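            # toolbox.dict_merge performs a recursive merge; illustrative
            # example (the base dict's values win on conflict):
            #   dict_merge({"a": {"x": 1}}, {"a": {"y": 2}, "b": 3})
            #       -> {"a": {"x": 1, "y": 2}, "b": 3}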
            # If only one mdf record was parsed, merge all PIFs into that record
            if len(mdf_res) == 1:
                merged_cit = {}
                for cr in cit_res:
                    # Reassign so the merge result is kept even if dict_merge
                    # returns a new dict instead of mutating its first argument
                    merged_cit = toolbox.dict_merge(merged_cit, cr)
                mdf_records = [toolbox.dict_merge(mdf_res[0], merged_cit)]
            # If the same number of MDF records and Citrine PIFs were parsed, merge in order
            elif len(mdf_res) == len(cit_res):
                mdf_records = [
                    toolbox.dict_merge(r_mdf, r_cit)
                    for r_mdf, r_cit in zip(mdf_res, cit_res)
                ]
            # Otherwise, keep the MDF records only
            else:
                print("DEBUG: Record mismatch:\nMDF parsed", len(mdf_res),
                      "records", "\nCitrine parsed", len(cit_res), "records"
                      "\nPIFs discarded")
                # TODO: Update status/log - Citrine records discarded
                mdf_records = mdf_res

            # Filter out null records, save the rest
            if not mdf_records:
                print("DEBUG: No MDF records in group:", group)
            for record in mdf_records:
                if record:
                    feedstock.append(
                        toolbox.dict_merge(record, {"files": group_file_md}))

            # Upload PIFs to Citrine
            for full_pif in cit_full:
                with tempfile.NamedTemporaryFile(mode="w+") as pif_file:
                    pif_dump(full_pif, pif_file)
                    pif_file.seek(0)
                    up_res = json.loads(
                        cit_client.upload(cit_ds_id, pif_file.name))
                    if up_res["success"]:
                        print("DEBUG: Citrine upload success")
                    else:
                        print("DEBUG: Citrine upload failure, error",
                              up_res.get("status"))

    # TODO: Update status - indexing success
    print("DEBUG: Indexing success")

    # Pass feedstock to /ingest
    with tempfile.TemporaryFile(mode="w+") as stock:
        for entry in feedstock:
            json.dump(entry, stock)
            stock.write("\n")
        stock.seek(0)
        ingest_res = requests.post(app.config["INGEST_URL"],
                                   data={"status_id": status_id},
                                   files={'file': stock})
    if not ingest_res.json().get("success"):
        # TODO: Update status? Ingest failed
        # TODO: Fail everything, delete Citrine dataset, etc.
        raise ValueError("In convert - Ingest failed" + str(ingest_res.json()))

    # Additional service integrations

    # Finalize Citrine dataset
    # TODO: Turn on public dataset ingest (share=1)
    if "citrine" in add_services:
        try:
            cit_client.update_data_set(cit_ds_id, share=0)
        except Exception as e:
            # TODO: Update status, notify Citrine - Citrine ds failure
            print("DEBUG: Citrination dataset not updated:", repr(e))

    # Globus Publish
    # TODO: Test after Publish API is fixed
    if "globus_publish" in add_services:
        try:
            fin_res = globus_publish_data(globus_publish_client,
                                          mdf_transfer_client, mdf_dataset,
                                          local_path)
        except Exception as e:
            # TODO: Update status - Publish failed
            print("Publish ERROR:", repr(e))
        else:
            # TODO: Update status - Publish success
            print("DEBUG: Publish success:", fin_res)

    # Remove local data
    shutil.rmtree(local_path)
    # TODO: Update status - everything done
    return {"success": True, "status_id": status_id}