Example No. 1
def add_citation(collection, token, production=True):
    """Add in example citation text in the description field"""
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        description = record["descriptions"]
        cite_exists = False
        for d in description:
            descr_text = d["description"]
            if descr_text.startswith("<br>Cite this record as:"):
                cite_exists = True
        if cite_exists == False:
            record_doi = record["identifier"]["identifier"]
            headers = {"Accept": "text/x-bibliography; style=apa"}
            citation_link = "https://doi.org/"
            citation = requests.get(citation_link + record_doi, headers=headers).text
            doi_url = "https://doi.org/" + record_doi
            if doi_url in citation:
                # Check that we have a citation and not a server error,
                # otherwise wait till next time
                n_txt = citation_text(citation, doi_url, record_doi)
                description.append({"descriptionType": "Other", "description": n_txt})
                response = caltechdata_edit(
                    token, k, {"descriptions": description}, {}, {}, production
                )
                print(response)
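The citation text above comes from DOI content negotiation, a generally supported feature of doi.org. A minimal standalone sketch of just that lookup (the DOI below is only a placeholder):

import requests

def fetch_apa_citation(doi):
    """Return an APA-style citation for a DOI via content negotiation, or None."""
    headers = {"Accept": "text/x-bibliography; style=apa"}
    r = requests.get("https://doi.org/" + doi, headers=headers, timeout=30)
    if r.status_code != 200:
        return None  # server error or unregistered DOI; try again later
    r.encoding = "utf-8"
    return r.text

print(fetch_apa_citation("10.22002/D1.1098"))  # placeholder DOI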
Example No. 2
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage
    information"""
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage
            # All dates are the date added to CaltechDATA, which is
            # the appropriate 'publication' date even if content was available
            # earlier
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
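A possible driver for build_usage(); both collection names here are assumptions rather than values taken from the original script:

from py_dataset import dataset

caltechdata_collection = "caltechdata.ds"  # assumed local copy of CaltechDATA metadata
usage_collection = "usage.ds"              # assumed name for the usage collection

build_usage(caltechdata_collection, usage_collection)
print(dataset.count(usage_collection), "records in", usage_collection)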
Example No. 3
def test_join(t, collection_name):
    key = "test_join1"
    obj1 = {"one": 1}
    obj2 = {"two": 2}
    if dataset.status(collection_name) == False:
        t.error("Failed, collection status is False,", collection_name)
        return
    ok = dataset.has_key(collection_name, key)
    err = ''
    if ok == True:
        ok = dataset.update(collection_name, key, obj1)
    else:
        ok = dataset.create(collection_name, key, obj1)
    if ok == False:
        err = dataset.error_message()
        t.error(
            f'Failed, could not add record for test ({collection_name}, {key}, {obj1}), {err}'
        )
        return
    if dataset.join(collection_name, key, obj2, overwrite=False) == False:
        err = dataset.error_message()
        t.error(
            f'Failed, join for {collection_name}, {key}, {obj2}, overwrite = False -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    if obj_result.get('one') != 1:
        t.error(f'Failed to join append key {key}, {obj_result}')
    if obj_result.get("two") != 2:
        t.error(f'Failed to join append key {key}, {obj_result}')
    obj2['one'] = 3
    obj2['two'] = 3
    obj2['three'] = 3
    if dataset.join(collection_name, key, obj2, overwrite=True) == False:
        err = dataset.error_message()
        t.error(
            f'Failed to join overwrite {collection_name}, {key}, {obj2}, overwrite = True -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    for k in obj_result:
        if k != '_Key' and obj_result[k] != 3:
            t.error('Failed to update value in join overwrite', k, obj_result)
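The test functions in these examples (test_join above, test_basic and test_keys below) expect a small harness object `t` with error() and print() methods. That harness is not included here, so the following stand-in is an assumption, just enough to exercise the tests:

from py_dataset import dataset

class TestHarness:
    """Hypothetical stand-in for the `t` object used by the test functions."""
    def __init__(self):
        self.failures = 0
    def error(self, *args):
        self.failures += 1
        print("FAIL:", *args)
    def print(self, *args):
        print(*args)

t = TestHarness()
c_name = "test_scratch.ds"  # assumed scratch collection
if not dataset.status(c_name):
    dataset.init(c_name)
test_join(t, c_name)
print(t.failures, "failure(s)")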
Example No. 4
def submit_report(month_collection,
                  keys,
                  token,
                  production,
                  prefix=None,
                  org="Caltech_Library"):
    for k in keys:
        datasets, err = dataset.read(month_collection, k, clean_object=True)
        if err != "":
            print(err)
        datasets = datasets["report-datasets"]
        dates = datasets[0]["performance"][0]["period"]
        if prefix != None:
            filtered = []
            for d in datasets:
                rec_prefix = d["dataset-id"][0]["value"].split("/")[0]
                if rec_prefix in prefix:
                    filtered.append(d)
            datasets = filtered
        # Build report structure
        today = date.today().isoformat()
        report = {
            "report-header": {
                "report-name": "dataset report",
                "report-id": "DSR",
                "release": "rd1",
                "report-filters": [],
                "report-attributes": [],
                "exceptions": [],
                "created-by": org,
                "created": today,
                "reporting-period": {
                    "begin-date": dates["begin-date"],
                    "end-date": dates["end-date"],
                },
            },
            "report-datasets": datasets,
        }
        if production:
            url = "https://api.datacite.org/reports/"
        else:
            url = "https://api.test.datacite.org/reports/"
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": "Bearer %s" % token,
        }
        r = requests.post(url, headers=headers, json=report)
        if r.status_code != 201:
            print(r.text)
            print(report)
        else:
            print(r.json()["report"]["id"])
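A hedged sketch of how submit_report() might be called; the month collection name, the environment variable, and the prefix list are assumptions:

import os
from py_dataset import dataset

month_collection = "usage_month.ds"       # assumed collection keyed by "YYYY-MM"
keys = dataset.keys(month_collection)
token = os.environ["DATACITE_TOKEN"]      # assumed variable holding the usage hub token
submit_report(month_collection, keys, token, production=False, prefix=["10.22002"])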
Example No. 5
def make_link_history(collection, resolver, url, note):
    """Make an entry in our link history collection"""
    now = datetime.today().isoformat()
    # Run checks on both resolver and final URL
    try:
        target = requests.get(url)
    except requests.exceptions.ConnectionError:
        target = requests.Response()
        target.status_code = 404
        target.url = ''
    if target.status_code != 200:
        print(
            f"Target URL {url} returns Error status code {target.status_code}")
    if links_differ(target.url, url):
        print(f"Target URL '{url}' redirects to '{target.url}'")
    try:
        get = requests.get(f"https://resolver.library.caltech.edu/{resolver}")
    except requests.exceptions.ConnectionError:
        get = requests.Response()
        get.status_code = 404
        get.url = ''
    if links_differ(get.url, url):
        print(f"Mismatch between expected url '{url}' and actual '{get.url}'")
    if get.status_code != 200:
        print(
            f"Resolver URL ({resolver}) '{get.url}' returns Error status code {get.status_code}"
        )
    entry = {
        "expected-url": url,
        "url": get.url,
        "modified": now,
        "code": get.status_code,
        "note": note,
    }
    # If existing, push into history
    if dataset.has_key(collection, resolver):
        existing, err = dataset.read(collection, resolver)
        if err != "":
            print(err)
            exit()
        if save_history(existing, url, get):
            past_history = existing.pop("history")
            past_history.append(existing)
            entry["history"] = past_history
            if not dataset.update(collection, resolver, entry):
                print(dataset.error_message())
                exit()
    else:
        entry["history"] = []
        if not dataset.create(collection, resolver, entry):
            print(dataset.error_message())
            exit()
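make_link_history() calls two helpers, links_differ() and save_history(), that are not shown in these examples. As an illustration only, links_differ() could plausibly be a URL comparison that ignores scheme and trailing-slash differences:

from urllib.parse import urlparse

def links_differ(url_a, url_b):
    """Guess at the missing helper: True when two URLs point at different locations."""
    def normalize(u):
        parts = urlparse(u)
        return (parts.netloc.lower(), parts.path.rstrip("/"), parts.query)
    return normalize(url_a) != normalize(url_b)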
Example No. 6
def update_datacite_metadata(collection, token, access):
    """Access contains username, password, and prefix for DataCite"""
    keys = dataset.keys(collection)
    for a in access:

        username = a["username"]
        password = a["password"]
        prefix = a["prefix"]

        # Initialize the MDS client.
        d = DataCiteMDSClient(
            username=username,
            password=password,
            prefix=prefix,
            url="https://mds.datacite.org",
        )

        for k in keys:
            print(k)
            metadata, err = dataset.read(collection, k)
            if err != "":
                print(err)
                exit()
            # Get rid of Key from dataset
            metadata.pop("_Key")

            if "identifier" in metadata:
                record_doi = metadata["identifier"]["identifier"]

                # Handle records with 4.3 metadata elements
                if "schemaVersion" in metadata:
                    metadata.pop("schemaVersion")
                if "types" in metadata:
                    metadata.pop("types")

                if record_doi.split("/")[0] == prefix:
                    result = schema40.validate(metadata)
                    # Debugging if this fails
                    if result == False:
                        print(metadata)
                        v = schema40.validator
                        errors = sorted(v.iter_errors(metadata),
                                        key=lambda e: e.path)
                        for error in errors:
                            print(error.message)
                        exit()

                    xml = schema40.tostring(metadata)

                    response = d.metadata_post(xml)
                    print(response)
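The `access` argument above is a list of credential dictionaries, one per DOI prefix. A minimal caller sketch with placeholder environment variable names (note that `token` is not used in the snippet shown):

import os

access = [
    {
        "username": os.environ["DATACITE_USER"],  # placeholder variable names
        "password": os.environ["DATACITE_PW"],
        "prefix": "10.22002",
    }
]
update_datacite_metadata("caltechdata.ds", token=None, access=access)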
Example No. 7
def get_multiple_links(input_collection, output_collection):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                count = idvs.count(idv["relatedIdentifier"])
                if count > 1:
                    print("DUPE")
                    print(k)
                    print(idv["relatedIdentifier"])
Example No. 8
def match_codemeta():
    collection = "github_records.ds"
    keys = dataset.keys(collection)
    for k in keys:
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        if "completed" not in existing:
            print("Processing new record ", k)
            if dataset.attachments(collection, k) != "":
                dataset.detach(collection, k)

                # Update CaltechDATA
                token = os.environ["TINDTOK"]

                infile = open("codemeta.json", "r")
                try:
                    meta = json.load(infile)
                except:
                    print("Invalid json file - Skipping forever ", k)
                else:
                    standardized = codemeta_to_datacite(meta)

                    # Check that all records have a GitHub subject tag
                    add = True
                    for s in standardized["subjects"]:
                        if s["subject"] == "Github":
                            add = False
                        if s["subject"] == "GitHub":
                            add = False
                    if add == True:
                        standardized["subjects"].append({"subject": "GitHub"})
                    response = caltechdata_edit(token, k, standardized, {}, {}, True)
                    print(response)
                os.system("rm codemeta.json")

            existing["completed"] = "True"
            if not dataset.update(collection, k, existing):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
Example No. 9
def get_history(collection, caltechdata_collection, caltechdata_keys):
    """Harvest the history of records from CaltechDATA."""

    keys_to_update = []
    if os.path.exists("historyupdate"):
        with open("historyupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(caltechdata_keys, redirect_stdout=True):
        existing, err = dataset.read(caltechdata_collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        record_update = datetime.fromisoformat(existing["updated"]).date()
        if record_update > update:
            keys_to_update.append(k)

    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    base_url = "https://data.caltech.edu/records/"

    for k in progressbar(keys_to_update):
        url = base_url + str(k) + "/revisions"
        response = requests.get(url)
        revisions = response.json()
        for num, metadata in enumerate(revisions):
            key = f"{k}-{num}"
            if dataset.has_key(collection, key) == False:
                dataset.create(collection, key, metadata)

    # Save date in file
    today = date.today().isoformat()
    with open("historyupdate", "w") as outfile:
        outfile.write(today)
Example No. 10
def add_files(collection):
    #Run through all elements in collection
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != '':
            print(err)
            exit()
        url = record['url_links']
        print('Processing file from ', url)
        #Make a dummy file to represent results from kallisto
        files = ['example_file' + k]
        for f in files:
            with open(f, "w") as file:
                file.write(" 0 1 0 " + k)
        #Now attach file to collection
        err = dataset.attach(collection, k, files)
        if err != '':
            print(err)
            exit()
        #Cleanup local disk
        for f in files:
            os.remove(f)
Example No. 11
def migrate_attachment(c_name, key):
    obj, err = dataset.read(c_name, key)
    obj_path = dataset.path(c_name, key).replace(key + ".json", "")
    tarball = os.path.join(obj_path, key + ".tar")
    if os.path.exists(tarball):
        tar = tarfile.open(tarball)
        tar.extractall()
        tar.close()
        files = os.listdir()
        # Prune _Attachment from object and resave
        if "_Attachments" in obj:
            del obj["_Attachments"]
            err = dataset.update(c_name, key, obj)
            if err != "":
                print(f"Can't remove _Attachments metadata, {err}")
                sys.exit(1)
        for fname in files:
            print(".", end="")
            reattach(c_name, key, "v0.0.0", fname)
            os.remove(fname)
        # NOTE: if all re-attached then we need to remove tarball too
        os.remove(tarball)
        sys.stdout.flush()
Example No. 12
def get_crossref_refs(prefix, done=False, new=True):
    # New=True will download everything from scratch and delete any existing records

    collection = "crossref_refs.ds"

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix)

    collected = dataset.has_key(collection, "captured")

    cursor = ""
    count = 0
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettiness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)

        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same cursor back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None

    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]

        # Check Deleted
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on delete: {err}")
            cursor = records["message"]["next-cursor"]

        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]

    if done:
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
Example No. 13
def send_simple_message(token, matched):
    matched_key = matched[0]
    matched_dois = matched[1]
    # Use raw api call to get email
    api_url = "https://data.caltech.edu/api/record/"
    r = requests.get(api_url + matched_key)
    r_data = r.json()
    if "message" in r_data:
        raise AssertionError("id " + matched_key + " expected http status 200, got " +
                             str(r.status_code) + " " + str(r_data["message"]))
    if "metadata" not in r_data:
        raise AssertionError(
            "expected a metadata property in response, got " + str(r_data))
    metadata = r_data["metadata"]
    email = ""
    name = ""
    if "contributors" in metadata:
        for c in metadata["contributors"]:
            if c["contributorType"] == "ContactPerson":
                if "contributorEmail" in c:
                    email = c["contributorEmail"]
                    name = c["contributorName"]
    if email == "":
        print("Missing email for record ", matched_key)
    else:
        # Use dataset version to get datacite metadata
        metadata, err = dataset.read("caltechdata.ds", matched_key)
        if err != "":
            print(f"Unexpected error on read: {err}")
            exit()
        title = metadata["titles"][0]["title"]
        doi = metadata["identifier"]["identifier"]
        headers = {"Accept": "text/bibliography;style=apa"}
        citation_block = ""
        for matched in matched_dois:
            citation = requests.get(matched, headers=headers)
            citation.encoding = "utf-8"
            citation = citation.text
            citation = su.unescape(citation)
            citation_block = citation_block + "<p>" + citation + "</p>"
        # Send email
        return requests.post(
            "https://api.mailgun.net/v3/notices.caltechlibrary.org/messages",
            auth=("api", token),
            files=[("inline", open("CaltechDATA_Logo_cropped.png", "rb"))],
            data={
                "from":
                "CaltechDATA Notices <*****@*****.**>",
                "to":
                name + " <" + email + ">, Tom Morrell <*****@*****.**>",
                "subject":
                "Your CaltechDATA Work has been cited!",
                "html":
                '<html> <center> <img src="cid:CaltechDATA_Logo_cropped.png"\
                      alt="CaltechDATA Logo" width="249" height="69"> </center> \
                      <p> Dear ' + name + ', </p>\
                      <p>Your CaltechDATA work "' + title + '" has been cited\
                      in:</p>' + citation_block + '<p>The\
                      citation(s) are now listed in your CaltechDATA record at \
                      <a href="https://doi.org/' + doi + '">' + doi +
                '</a>.</p>\
                      <p> Best, </p><p>CaltechDATA Alerting Service</p><hr>\
                      <p> Is this incorrect?  Let us know at\
                      <a href="mailto:[email protected]?Subject=Issue%20with%20citation%20link%20between%20'
                + doi + "%20and%20" + ",".join(matched_dois) +
                '">[email protected]</a></p>\
                      <P> This email was sent by the Caltech Library, \
                      1200 East California Blvd., MC 1-43, Pasadena, CA 91125, USA </p> </html>',
            },
        )
Example No. 14
import sys
from datetime import datetime
from py_dataset import dataset

#
# Loop through the keys, fetch each record, and build a normalized copy with
# _Key, _State ("deposit"), _Updated, and _Created fields added.
#

c_name = "people.ds"
keys = dataset.keys(c_name)
#print(f"DEBUG Keys: {keys}")
for key in keys:
    print(f"Fixing key {key}")
    data, err = dataset.read(c_name, key)
    if err != "":
        print(f"Error read {c_name} -> {key}, {err}")
        sys.exit(1)
    # Make fieldname lower case
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    obj = {
        "_Key": key,
        "_State": "deposit",
        "_Updated": f"{dt}",
        "_Created": f"{dt}"
    }
    for field in data:
        fkey = field.lower()
        if not ' ' in fkey:
            obj[fkey] = data[field]
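The snippet ends after building `obj`; presumably the normalized object is written back into people.ds. A hedged sketch of that missing final step, inside the same loop:

    # Assumed final step of the loop body: save the normalized object
    if not dataset.update(c_name, key, obj):
        print(f"Error updating {c_name} -> {key}, {dataset.error_message()}")
        sys.exit(1)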
Example No. 15
import_coll = "imported.ds"
os.system("rm -rf imported.ds")
dataset.init(import_coll)

os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"
err = dataset.import_gsheet(import_coll, sheet, 'Sheet1', 1, 'A:CZ')
if err != '':
    print(err)

keys = dataset.keys(import_coll)

coauthors = []

count = 0
for key in progressbar(keys, redirect_stdout=True):
    record, err = dataset.read(import_coll, key)
    if err != "":
        print(err)
    count = 0
    if 'identifiers' in record:
        identifiers = record['identifiers']
    else:
        identifiers = []
    print(key)
    print(record)
    affiliations = record['affiliations']
    authors = record['authors'].split(';')
    link = record['link']
    year = record['year']
    for a in authors:
        #If none of the words in remove_words appears, we have an author
Example No. 16
archive_path = 'https://wayback.archive-it.org/9060/'

err = dataset.import_gsheet(collection,
                            sheet_id,
                            sheet_name,
                            1,
                            cell_range,
                            overwrite=True)
if err != '':
    print(f"Unexpected error on importing gsheet to {collection}, {err}")
    exit()

keys = dataset.keys(collection)

for key in keys:
    inputv, err = dataset.read(collection, key)
    if err != "":
        print(f"Unexpected error for {key} in {collection}, {err}")
        exit()
    #If we haven't assigned a doi for this resource before
    if 'doi' not in inputv:
        #Confirm that archiving is successful
        if 'archive_complete' in inputv:
            if inputv['archive_complete'] == 'Yes':
                metadata = {}
                metadata['titles'] = [{'title': inputv['title']}]
                authors = []
                alist = inputv['author'].split(';')
                if 'affiliation' in inputv:
                    aff_list = inputv['affiliation'].split(';')
                else:
Example No. 17
def update_datacite_media(username, password, collection, prefix):
    keys = dataset.keys(collection)

    if path.exists("mediaupdate"):
        with open("mediaupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(keys, redirect_stdout=True):
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        atlas = False
        subjects = existing["subjects"]
        for subject in subjects:
            if (subject["subject"].strip() ==
                    "Atlas of Bacterial and Archaeal Cell Structure"):
                atlas = True
        record_update = datetime.fromisoformat(existing["updated"]).date()
        # Subtraction to get window to grab records that were updated between runs
        if record_update > update - timedelta(days=2):
            if "electronic_location_and_access" in existing:
                doi = existing["identifier"]["identifier"]
                record_prefix = doi.split("/")[0]
                if record_prefix == prefix:
                    delete_datacite_media(username, password, doi)
                    for file_met in existing["electronic_location_and_access"]:
                        url = "https://mds.datacite.org/media/" + doi
                        headers = {
                            "Content-Type": "application/txt;charset=UTF-8"
                        }
                        extension = file_met["electronic_name"][0].split(
                            ".")[-1]
                        filename = file_met["electronic_name"][0].split(".")[0]
                        data = {}
                        if extension == "nc":
                            data = ("application/x-netcdf=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "mp4":
                            if atlas:
                                data = (
                                    "video/mp4=" +
                                    "https://www.cellstructureatlas.org/videos/"
                                    + filename + ".mp4")
                            else:
                                data = (
                                    "video/mp4=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "mj2":
                            data = ("video/mj2=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "avi":
                            data = ("video/avi=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "mov":
                            data = ("video/quicktime=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "gz":
                            data = ("application/gzip=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "zip":
                            data = ("application/zip=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "h5ad":
                            data = ("application/octet-stream=" +
                                    file_met["uniform_resource_identifier"])
                        if data != {}:
                            print(doi)
                            print(data)
                            r = requests.post(
                                url,
                                data=data.encode("utf-8"),
                                auth=(username, password),
                                headers=headers,
                            )
                            print(r)
Example No. 18
def add_thesis_doi(data_collection, thesis_collection, token, production=True):
    """Add thesis DOIs to CaltechDATA records"""

    # Search across CaltechTHESIS DOIs
    dot_paths = ["._Key", ".doi", ".official_url", ".related_url"]
    labels = ["eprint_id", "doi", "official_url", "related_url"]
    keys = dataset.keys(thesis_collection)
    all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels)
    dois = []
    for metadata in progressbar(all_metadata, redirect_stdout=True):
        if "doi" in metadata:
            record_doi = metadata["doi"].strip()
            if "related_url" in metadata and "items" in metadata["related_url"]:
                items = metadata["related_url"]["items"]
                for item in items:
                    if "url" in item:
                        url = item["url"].strip()
                    if "type" in item:
                        itype = item["type"].strip().lower()
                    if itype == "doi":
                        if idutils.is_doi(url):
                            doi = "10." + url.split("10.")[1]
                            prefix = doi.split("/")[0]
                            if prefix == "10.22002":
                                dois.append([doi, record_doi])
                        else:
                            print("Ignoring non-DOI")
                            print(metadata["eprint_id"])
                            print(url.split("10."))
    for doi_link in dois:
        cd_doi = doi_link[0]
        thesis_doi = doi_link[1]
        print("Checking " + cd_doi)
        if "D1" in cd_doi:
            record_number = cd_doi.split("D1.")[1]
        if "d1" in cd_doi:
            record_number = cd_doi.split("d1.")[1]
        record, err = dataset.read(data_collection, record_number)
        if err != "":
            print(err)
            exit()

        done = False
        if "relatedIdentifiers" in record:
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == thesis_doi:
                    done = True
            if done == False:
                identifiers = record["relatedIdentifiers"]
                identifiers.append(
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                )
                new_metadata = {"relatedIdentifiers": identifiers}
        else:
            new_metadata = {
                "relatedIdentifiers": [
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                ]
            }
        if done == False:
            print("Adding " + thesis_doi + " to " + cd_doi)
            response = caltechdata_edit(
                token, record_number, new_metadata, {}, {}, True
            )
            print(response)
Example No. 19
def add_usage(collection, token, usage_collection, production=True):
    """Add in usage text in the description field"""
    keys = dataset.keys(collection)
    biggest_views = 0
    biggest_views_record = ""
    biggest_downloads = 0
    biggest_downloads_record = ""
    total_views = 0
    total_downloads = 0
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        usage, err = dataset.read(usage_collection, k)
        views = usage["grand-total-unique-investigations"]
        downloads = usage["grand-total-unique-requests"]
        if views > biggest_views:
            biggest_views = views
            biggest_views_record = k
        if downloads > biggest_downloads:
            biggest_downloads = downloads
            biggest_downloads_record = k
        total_views += views
        total_downloads += downloads
        date = datetime.fromisoformat(usage["dataset-dates"][0]["value"])
        now = datetime.today()
        first = date.strftime("%B %d, %Y")
        last = now.strftime("%B %d, %Y")
        if views > 1:
            u_txt = (
                "<br>Unique Views: "
                + str(views)
                + "<br>Unique Downloads: "
                + str(downloads)
                + "<br> between "
                + first
                + " and "
                + last
                + '<br><a href="https://data.caltech.edu/stats"'
                + ">More info on how stats are collected</a><br>"
            )
            description = record["descriptions"]
            use_exists = False
            for d in description:
                descr_text = d["description"]
                # We always update an existing listing
                if descr_text.startswith("<br>Unique Views:"):
                    d["description"] = u_txt
                    use_exists = True
            # Otherwise we add a new one
            if use_exists == False:
                description.append({"descriptionType": "Other", "description": u_txt})
            response = caltechdata_edit(
                token, k, {"descriptions": description}, {}, {}, production
            )
            print(response)
    print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}")
    print(f"Most views {biggest_views} for record {biggest_views_record}")
    print(f"Total downloads {total_downloads}")
    print(f"Total views {total_views}")
Example No. 20
def get_usage(usage_collection, mapping, token):
    """Collect usage into a usage object for items in CaltechDATA"""

    # Find time periods
    datev, err = dataset.read(usage_collection, "end-date")
    new_start = datetime.fromtimestamp(datev["end-date"])
    now = datetime.now().timestamp()
    # minutes in range
    minutes_diff = math.ceil(
        (datetime.fromtimestamp(now) - new_start).total_seconds() / 60.0
    )

    # Get number of visitors since last harvest
    stats_url_base = "https://stats.tind.io/index.php?module=API&method=Live.getCounters&idSite=1161&format=JSON"

    token_s = "&token_auth=" + token

    stats_url = f"{stats_url_base}{token_s}&lastMinutes={minutes_diff}"
    response = requests.get(stats_url)
    if response.status_code != 200:
        print(response.text)
        print(stats_url)
    visitors = response.json()[0]["visits"]

    print(visitors)
    visit_url_base = "https://stats.tind.io/index.php?module=API&method=Live.getLastVisitsDetails&idSite=1161&format=json&filter_limit=1000"

    print("Getting usage")
    usage = []
    # We will page through visitors in chunks of 1000
    chunks = math.ceil(int(visitors) / 1000)
    if chunks > 1:
        url = visit_url_base + token_s + "&filter_limit=1000"
        process_visits(url, mapping)
        for c in progressbar(range(chunks)):
            url = f"{visit_url_base}{token_s}&filter_limit=1000&filter_offset={c*1000}"
            usage += process_visits(url, mapping)
    else:
        url = f"{visit_url_base}{token_s}&filter_limit={visitors}"
        usage = process_visits(url, mapping)

    print("Writing usage")
    for use in progressbar(usage):
        date = use["date"]
        if "downloads" in use and "views" in use:
            records = use["views"].union(use["downloads"])
        elif "views" in use:
            records = use["views"]
        else:
            records = use["downloads"]
        for rec in records:
            data, err = dataset.read(usage_collection, rec)
            if err == "":
                # We only track usage from live records
                instance = {"instance": [], "period": date}
                if "views" in use:
                    if rec in use["views"]:
                        instance["instance"].append(
                            {
                                "access-method": "regular",
                                "count": 1,
                                "metric-type": "unique-dataset-investigations",
                            }
                        )
                        # print(data,rec)
                        data["grand-total-unique-investigations"] += 1
                if "downloads" in use:
                    if rec in use["downloads"]:
                        instance["instance"].append(
                            {
                                "access-method": "regular",
                                "count": 1,
                                "metric-type": "unique-dataset-requests",
                            }
                        )
                        data["grand-total-unique-requests"] += 1
                data["performance"].append(instance)
                dataset.update(usage_collection, rec, data)

    dataset.update(usage_collection, "end-date", {"end-date": now})
Example No. 21
def fix_multiple_links(input_collection, token):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            new = []
            dupes = []
            replace = False
            record_doi = record["identifier"]["identifier"]
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == record_doi:
                    # Having a related identifier that is the same as the record
                    # doi doesn't make any sense
                    replace = True
                    dupes.append(identifier)
                else:
                    count = idvs.count(identifier)
                    if count > 1:
                        replace = True
                        if identifier not in dupes:
                            # We need to save the first duplicate
                            new.append(idv)
                            # Add to list of those already saved
                            dupes.append(identifier)
                        else:
                            # This will be deleted
                            dupes.append(identifier)
                    else:
                        # Save all unique ids
                        new.append(idv)
            if replace == True:
                print("Duplicate links found in record ", k)
                print("Will delete these links", dupes)
                response = input("Do you approve this change? Y or N")
                new_metadata = {"relatedIdentifiers": new}
                if response == "Y":
                    response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                    print(response)
        if "alternateIdentifiers" in record:
            idtypes = []
            alt_ids = []
            repeat = False
            for idv in record["alternateIdentifiers"]:
                if idv["alternateIdentifierType"] not in idtypes:
                    # If we haven't seen id type before, save it
                    alt_ids.append(idv)
                    idtypes.append(idv["alternateIdentifierType"])
                else:
                    repeat = True
                    print("Will Delete Repeated ID ", idv["alternateIdentifier"])
            if repeat == True:
                new_metadata = {"alternateIdentifiers": alt_ids}
                response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                print(response)
Example No. 22
def match_cd_refs():
    token = os.environ["TINDTOK"]

    matches = []
    collection = "caltechdata.ds"
    keys = dataset.keys(collection)
    if "mediaupdate" in keys:
        keys.remove("mediaupdate")

    # Get event data results
    event_data = "crossref_refs.ds"
    event_keys = dataset.keys(event_data)
    event_keys.remove("captured")
    f_name = "match_cd_refs"
    dot_paths = [".obj_id", ".id", ".subj_id"]
    labels = ["obj_id", "id", "subj_id"]
    print("Getting Event Data Records")
    if dataset.has_frame(event_data, f_name):
        if not dataset.frame_reframe(event_data, f_name, event_keys):
            err = dataset.error_message()
            print(f"Failed to reframe {f_name} in {event_data}, {err}")
            exit()
    elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels):
        err = dataset.error_message()
        print(f"Failed to create frame {f_name} in {event_data}, {err}")
        exit()
    grid = dataset.frame_grid(event_data, f_name)
    df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"])
    grouped = df.groupby(["obj_id"])
    groups = grouped.groups
    # Look at all CaltechDATA records
    for k in keys:
        # Collect matched new links for the record
        record_matches = []
        print(k)
        metadata, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        doi = "https://doi.org/" + metadata["identifier"]["identifier"]
        if doi in groups:
            hits = grouped.get_group(doi)
            for index, h in hits.iterrows():
                # Trigger for whether we already have this link
                new = True
                if "relatedIdentifiers" in metadata:
                    for m in metadata["relatedIdentifiers"]:
                        if m["relatedIdentifier"] in h["subj_id"]:
                            new = False
                if new == True:
                    match = h["subj_id"]
                    print(match)
                    print(h["obj_id"])
                    inputv = input("Do you approve this link?  Type Y or N: ")
                    if inputv == "Y":
                        record_matches.append(match)
        # If we have to update record
        if len(record_matches) > 0:
            ids = []
            if "relatedIdentifiers" in metadata:
                for m in metadata["relatedIdentifiers"]:
                    ids.append(m)
            matches.append([k, record_matches])
            # Now collect identifiers for record
            for match in record_matches:
                split = match.split("doi.org/")
                new_id = {
                    "relatedIdentifier": split[1],
                    "relatedIdentifierType": "DOI",
                    "relationType": "IsCitedBy",
                }
                ids.append(new_id)
            newmetadata = {"relatedIdentifiers": ids}
            response = caltechdata_edit(token, k, newmetadata, {}, {}, True)
            print(response)
    return matches
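match_cd_refs() uses py_dataset frames to pull selected fields from every record into a grid. A small standalone sketch of that pattern against a toy collection (the collection and field names are assumptions):

from py_dataset import dataset

c_name = "toy.ds"            # assumed scratch collection with a "title" field
f_name = "title_frame"
keys = dataset.keys(c_name)
dot_paths = ["._Key", ".title"]
labels = ["id", "title"]
if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
    print(dataset.error_message())
else:
    for row in dataset.frame_grid(c_name, f_name):  # rows follow the label order
        print(row)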
Example No. 23
from ames.harvesters import get_caltechfeed, get_records

if __name__ == "__main__":

    import_coll = "imported.ds"
    sheet = "1ZI3-XvQ_3rLcKrF-4FBa2tEInIdQfOnGJ9L_NmhmoGs"
    os.system("rm -rf imported.ds")
    dataset.init(import_coll)
    err = dataset.import_gsheet(import_coll, sheet, "CaltechPEOPLE", 4, "A:AA")
    if err != "":
        print(err)

    people_list = dataset.keys(import_coll)
    people = []
    for p in people_list:
        record, err = dataset.read(import_coll, p)
        people.append(record)

    # Profiles collection from feeds
    profile_ds = "profiles.ds"
    keys = dataset.keys(profile_ds)
    labels = ["orcid", "creator_id"]
    dot_paths = [".orcid", ".creator_id"]

    all_metadata = get_records(dot_paths, "profile", profile_ds, keys, labels)
    for profile in all_metadata:
        if "creator_id" in profile:
            idv = profile["creator_id"]
        else:
            print("ERROR", profile)
        for person in people:
Example No. 24
def test_basic(t, collection_name):
    '''test_basic(collection_name) runs tests on basic CRUD ops'''
    # Setup a test record
    key = "2488"
    value = {
        "title":
        "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
        "formats": ["epub", "kindle", "plain text"],
        "authors": [{
            "given": "Jules",
            "family": "Verne"
        }],
        "url": "https://www.gutenberg.org/ebooks/2488"
    }

    # We should have an empty collection, we will create our test record.
    if dataset.create(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f'create({collection_name}, {key}, {value}) failed, {err}')
        return

    # Check to see that we have only one record
    key_count = dataset.count(collection_name)
    if key_count != 1:
        t.error(f"Failed, expected count to be 1, got {key_count}")

    # Do a minimal test to see if the record looks like it has content
    keyList = dataset.keys(collection_name)
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"expected {rec[k]}, got {v}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with list v, got {v}")

    # Test updating record
    value["verified"] = True
    if dataset.update(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f"update({collection_name}, {key}, {value}) failed, {err}")
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"expected {rec[k]}, got {v} for key {k}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with a list for v, got {v}")

    # Test path to record
    expected_s = "/".join(
        [collection_name, "pairtree", "24", "88", (key + ".json")])
    expected_l = len(expected_s)
    p = dataset.path(collection_name, key)
    if len(p) != expected_l:
        t.error("Failed, expected length", expected_l, "got", len(p))
    if p != expected_s:
        t.error("Failed, expected", expected_s, "got", p)

    # Test listing records
    l = dataset.list(collection_name, [key])
    if len(l) != 1:
        t.error(
            f"list({collection_name}, [{key}]) failed, list should return an array of one record, got",
            l)
        return

    # test deleting a record
    if dataset.delete(collection_name, key) == False:
        err = dataset.error_message()
        t.error("Failed, could not delete record", key, ", ", err)
Example No. 25
def test_keys(t, collection_name):
    '''test_keys(collection_name) test getting, filter and sorting keys'''
    # Test count after delete
    key_list = dataset.keys(collection_name)
    cnt = dataset.count(collection_name)
    if cnt != 0:
        t.error("Failed, expected zero records, got", cnt, key_list)

    #
    # Generate multiple records for collection for testing keys
    #
    test_records = {
        "gutenberg:21489": {
            "title": "The Secret of the Island",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "http://www.gutenberg.org/ebooks/21489",
            "categories": "fiction, novel"
        },
        "gutenberg:2488": {
            "title":
            "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "https://www.gutenberg.org/ebooks/2488",
            "categories": "fiction, novel"
        },
        "gutenberg:21839": {
            "title": "Sense and Sensibility",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jane",
                "family": "Austin"
            }],
            "url": "http://www.gutenberg.org/ebooks/21839",
            "categories": "fiction, novel"
        },
        "gutenberg:3186": {
            "title": "The Mysterious Stranger, and Other Stories",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Mark",
                "family": "Twain"
            }],
            "url": "http://www.gutenberg.org/ebooks/3186",
            "categories": "fiction, short story"
        },
        "hathi:uc1321060001561131": {
            "title":
            "A year of American travel - Narrative of personal experience",
            "formats": ["pdf"],
            "authors": [{
                "given": "Jessie Benton",
                "family": "Fremont"
            }],
            "url":
            "https://babel.hathitrust.org/cgi/pt?id=uc1.32106000561131;view=1up;seq=9",
            "categories": "non-fiction, memoir"
        }
    }
    test_count = len(test_records)

    for k in test_records:
        v = test_records[k]
        if dataset.create(collection_name, k, v) == False:
            err = dataset.error_message()
            t.error("Failed, could not add", k, "to", collection_name, ', ',
                    err)

    # Test keys, filtering keys and sorting keys
    all_keys = dataset.keys(collection_name)
    if len(all_keys) != test_count:
        t.error("Expected", test_count, "all_keys back, got", all_keys)

    #dataset.verbose_on()
    filter_expr = '(eq .categories "non-fiction, memoir")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 1:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected one key for",
            filter_expr, "got", filtered_keys)

    filter_expr = '(contains .categories "novel")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 3:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected three keys for",
            filter_expr, "got", filtered_keys)

    sort_expr = '+.title'
    filter_expr = '(contains .categories "novel")'
    sorted_keys = dataset.key_sort(collection_name, filtered_keys, sort_expr)
    if len(sorted_keys) != 3:
        t.error(
            f"key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected three keys for",
            filter_expr, "got", sorted_keys)
    expected_keys = ["gutenberg:21839", "gutenberg:21489", "gutenberg:2488"]
    for i, k in enumerate(expected_keys):
        if i < len(sorted_keys) and sorted_keys[i] != k:
            obj1, _ = dataset.read(collection_name, k)
            obj2, _ = dataset.read(collection_name, sorted_keys[i])
            t.error(
                f'key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected {k} (title "{obj1["title"]}") got {sorted_keys[i]} (title "{obj2["title"]}")'
            )
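test_keys() begins by checking that the collection is empty ("Test count after delete"), so it appears intended to run after test_basic() on the same collection. A hedged sketch of that sequencing, reusing the hypothetical TestHarness from earlier:

from py_dataset import dataset

t = TestHarness()
c_name = "test_scratch.ds"   # assumed scratch collection
if not dataset.status(c_name):
    dataset.init(c_name)
test_basic(t, c_name)   # creates, reads, updates, lists, then deletes key "2488"
test_keys(t, c_name)    # expects an empty collection, then loads its own fixtures
print(t.failures, "failure(s)")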
Example No. 26
def get_wos_refs(new=True):
    # New=True will download everything from scratch and delete any existing records

    collection = "all_wos.ds"

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()

    # Get access token from WOS, set as an environment variable via source token.bash
    token = os.environ["WOSTOK"]

    headers = {"X-ApiKey": token, "Content-type": "application/json"}

    # Run query to get scope of records

    base_url = "https://api.clarivate.com/api/wos/?databaseId=WOK"

    collected = dataset.has_key(collection, "captured")

    if collected == True:
        date = dataset.read(collection, "captured")
        date = date[0]["captured"]
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = current - date
        base_url = base_url + "&loadTimeSpan=" + str(diff.days) + "D"

    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        err = dataset.update(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on create: {err}")

    query = "OG=(California Institute of Technology)"
    query = urllib.parse.quote_plus(query)
    url = base_url + "&usrQuery=" + query + "&count=100&firstRecord=1"

    response = requests.get(url, headers=headers)
    response = response.json()
    record_count = response["QueryResult"]["RecordsFound"]
    print(record_count, " Records from WOS")
    query_id = response["QueryResult"]["QueryID"]
    try:
        records = response["Data"]["Records"]["records"]["REC"]
    except:
        print(response)
    write_records(records, collection)
    # We have saved the first 100 records
    record_start = 101
    record_count = record_count - 100

    query_url = "https://api.clarivate.com/api/wos/query/"

    while record_count > 0:
        print(record_count)
        print(len(records), "records")
        if record_count > 100:
            url = (
                query_url
                + str(query_id)
                + "?count=100&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            try:
                records = response["Records"]["records"]["REC"]
            except:
                print(response)
            write_records(records, collection)
            record_start = record_start + 100
            record_count = record_count - 100
        else:
            url = (
                query_url
                + str(query_id)
                + "?count="
                + str(record_count)
                + "&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            records = response["Records"]["records"]["REC"]
            write_records(records, collection)
            record_count = 0

    print("Downloaded all records ")
    if err != "":
        print(f"{c_name}, {err}")

harvest = False

if harvest == True:
    username = os.environ["USER"]
    password = os.environ["PW"]
    returnc = ep_full(
        c_name, "https://caltechcampuspubs.library.caltech.edu/", username, password
    )
    print(returnc)

keys = dataset.keys(c_name)
for key in keys:
    existing, err = dataset.read(c_name, key)
    # print(existing)
    new = {
        "_access": {"metadata_restricted": False, "files_restricted": False},
        "_owners": [1],
        "_created_by": 1,
        "_default_preview": "previewer one",
        "access_right": "open",
        "resource_type": {"type": "publication", "subtype": "publication-other"},
    }
    new["recid"] = existing["eprint_id"]
    new["titles"] = [{"title": existing["title"], "type": "MainTitle"}]
    crea = []
    if "creators" in existing:
        for creator in existing["creators"]["items"]:
            cre = {
Example No. 28
def aggregate_usage(usage_collection, month_collection):
    keys = dataset.keys(usage_collection)
    keys.remove("end-date")
    for k in progressbar(keys):
        record, err = dataset.read(usage_collection, k)
        if err != "":
            print(err)
        use = {}
        views = {}
        for usage in record["performance"]:
            split = usage["period"].split("-")
            month = split[0] + "-" + split[1]
            for u in usage["instance"]:
                metric = u["metric-type"]
                if metric == "unique-dataset-requests":
                    if month in use:
                        use[month] += u["count"]
                    else:
                        use[month] = u["count"]
                if metric == "unique-dataset-investigations":
                    if month in views:
                        views[month] += u["count"]
                    else:
                        views[month] = u["count"]
        # Strip non-counter stuff
        record.pop("_Key")
        record.pop("grand-total-unique-requests")
        record.pop("grand-total-unique-investigations")
        # go across months
        for view in views:
            split = view.split("-")
            date_obj = datetime(int(split[0]), int(split[1]), 1)
            d_range = get_month_day_range(date_obj)
            performance = [
                {
                    "period": {
                        "begin-date": d_range[0].date().isoformat(),
                        "end-date": d_range[1].date().isoformat(),
                    },
                    "instance": [],
                }
            ]
            v = views[view]
            performance[0]["instance"].append(
                {
                    "count": v,
                    "metric-type": "unique-dataset-investigations",
                    "access-method": "regular",
                }
            )
            # Handle when we have both views and uses in a given month
            if view in use:
                u = use[view]
                performance[0]["instance"].append(
                    {
                        "count": u,
                        "metric-type": "unique-dataset-requests",
                        "access-method": "regular",
                    }
                )
            existing, err = dataset.read(month_collection, view)
            if err != "":
                print(err)
            record["performance"] = performance
            existing["report-datasets"].append(record)
            if not dataset.update(month_collection, view, existing):
                err = dataset.error_message()
                print(err)
        for use_date in use:
            # We only have use-only records left to handle
            if use_date not in views:
                u = use[use_date]
                split = use_date.split("-")
                date_obj = datetime(int(split[0]), int(split[1]), 1)
                d_range = get_month_day_range(date_obj)
                performance = [
                    {
                        "period": {
                            "begin-date": d_range[0].date().isoformat(),
                            "end-date": d_range[1].date().isoformat(),
                        },
                        "instance": [
                            {
                                "count": u,
                                "metric-type": "unique-dataset-requests",
                                "access-method": "regular",
                            }
                        ],
                    }
                ]
                existing, err = dataset.read(month_collection, use_date)
                if err != "":
                    print(err)
                record["performance"] = performance
                existing["report-datasets"].append(record)
                if not dataset.update(month_collection, use_date, existing):
                    err = dataset.error_message()
                    print(err)
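aggregate_usage() depends on a get_month_day_range() helper that is not included in these examples. A plausible (assumed) implementation using the standard library:

import calendar

def get_month_day_range(date_obj):
    """Guess at the missing helper: (first, last) datetimes of date_obj's month."""
    first = date_obj.replace(day=1)
    last = date_obj.replace(day=calendar.monthrange(date_obj.year, date_obj.month)[1])
    return first, last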