Example #1
def file_mapping(source_collection):
    """Return a dictionary that maps /tindfiles/serve urls to records."""

    mapping = {}

    dot_paths = [".electronic_location_and_access", "._Key"]
    keys = dataset.keys(source_collection)
    metadata = get_records(dot_paths, "files", source_collection, keys)

    for record in metadata:
        # Handle history records where the key is the item and revision
        k = record["_Key"]
        if "-" in k:
            rec_id = k.split("-")[0]
        else:
            rec_id = k

        # Ignore embargoed records
        if "electronic_location_and_access" in record:
            for filev in record["electronic_location_and_access"]:
                url = filev["uniform_resource_identifier"]
                # name = filev['electronic_name'][0]
                if url not in mapping:
                    mapping[url] = rec_id

    return mapping
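A minimal sketch of how the returned mapping might be consumed; the collection name matches the driver script later in this listing, while the file URL below is a placeholder:

# Build the URL -> record-id mapping from a local dataset collection.
mapping = file_mapping("caltechdata.ds")

# Resolve a served-file URL (placeholder) back to the record it belongs to.
url = "https://data.caltech.edu/tindfiles/serve/example-token/"
rec_id = mapping.get(url)
print(rec_id if rec_id else "no record found for " + url)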
Example #2
def get_subset(collection):
    #Demo pulling out a subset of records from collection
    #Using pandas data frame
    #Get all files with cell type Xenotransplanted microglia
    keys = dataset.keys(collection)
    dot_paths = [".cell_source", ".species", ".tissue", "._Key"]
    (grid, err) = dataset.grid(collection, keys, dot_paths)
    if err != "":
        print(err)
        exit()
    df = pd.DataFrame(np.array(grid),
                      columns=["source", "species", "tissue", "key"])
    grouped = df.groupby(["source"])
    print(grouped.groups.keys())
    records = grouped.get_group('Xenotransplanted microglia')
    for index, r in records.iterrows():
        print('getting files for ', r['key'])
        err = dataset.detach(collection, r['key'], [])
        if err != '':
            print(err)

    #Example doing the same thing with frames
    labels = ["source", "species", "tissue", "key"]
    f, err = dataset.frame(collection, 'frame_name', keys, dot_paths, labels)
    if err != "":
        print(err)
    records = dataset.frame_objects(collection, 'frame_name')
    for record in records:
        if record['source'] == 'Xenotransplanted microglia':
            print('getting files for ', record['key'])
            err = dataset.detach(collection, record['key'], [])
            if err != '':
                print(err)
Example #3
def add_citation(collection, token, production=True):
    """Add in example citation text in the description field"""
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        description = record["descriptions"]
        cite_exists = False
        for d in description:
            descr_text = d["description"]
            if descr_text.startswith("<br>Cite this record as:"):
                cite_exists = True
        if cite_exists == False:
            record_doi = record["identifier"]["identifier"]
            headers = {"Accept": "text/x-bibliography; style=apa"}
            citation_link = "https://doi.org/"
            citation = requests.get(citation_link + record_doi, headers=headers).text
            doi_url = "https://doi.org/" + record_doi
            if doi_url in citation:
                # Check that we have a citation and not a server error,
                # otherwise wait till next time
                n_txt = citation_text(citation, doi_url, record_doi)
                description.append({"descriptionType": "Other", "description": n_txt})
                response = caltechdata_edit(
                    token, k, {"descriptions": description}, {}, {}, production
                )
                print(response)
Example #4
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage
    information"""
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage
            # All dates are the date added to CaltechDATA, which is
            # the appropriate 'publication' date even if content was available
            # earlier
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
Example #5
def test_issue12(t, c_name):
    src = '''[
{"id": "1", "c1": 1, "c2": 2, "c3": 3 },
{"id": "2", "c1": 2, "c2": 2, "c3": 3 },
{"id": "3", "c1": 3, "c2": 3, "c3": 3 },
{"id": "4", "c1": 1, "c2": 1, "c3": 1 },
{"id": "5", "c1": 6, "c2": 6, "c3": 6 }
]'''
    #dataset.verbose_on() # DEBUG
    #dataset.use_strict_dotpath(True) # DEBUG
    if dataset.status(c_name) == False:
        if not dataset.init(c_name):
            err = dataset.error_message()
            t.error(f'failed to create {c_name}')
            return
    objects = json.loads(src)
    for obj in objects:
        key = obj['id']
        if dataset.has_key(c_name, key):
            dataset.update(c_name, key, obj)
        else:
            dataset.create(c_name, key, obj)
    f_names = dataset.frames(c_name)
    for f_name in f_names:
        ok = dataset.delete_frame(c_name, f_name)
        if ok == False:
            err = dataset.error_message()
            t.error(f'Failed to delete {f_name} from {c_name} -> "{err}"')
            return
        if dataset.has_frame(c_name, f_name) == True:
            t.error(
                f'Failed to delete frame {f_name} from {c_name}, frame still exists'
            )
            return
    f_name = 'issue12'
    dot_paths = [".c1", "c3"]
    labels = [".col1", ".col3"]
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(f'failed to create {f_name} from {c_name}, {err}')
    if not dataset.has_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected frame {f_name} to exist, {err}')
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) == 0:
        err = dataset.error_message()
        t.error(f'expected keys in {f_name}, got zero, {err}')
        return
    f_objects = dataset.frame_objects(c_name, f_name)
    if len(f_objects) == 0:
        err = dataset.error_message()
        t.error(f'expected objects in {f_name}, got zero, {err}')
        return
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected to delete {f_name} in {c_name}, {err}')
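A condensed sketch of the frame lifecycle the test above exercises, using only calls that appear in it; the collection name and dot path are assumptions:

from py_dataset import dataset

c_name = "mydata.ds"  # placeholder collection whose records carry a .c1 field
keys = dataset.keys(c_name)
if not dataset.frame_create(c_name, "c1_frame", keys, [".c1"], ["c1"]):
    print(dataset.error_message())
else:
    # Iterate the framed objects, then drop the frame when done.
    for obj in dataset.frame_objects(c_name, "c1_frame"):
        print(obj)
    if not dataset.delete_frame(c_name, "c1_frame"):
        print(dataset.error_message())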
Example #6
def update_datacite_metadata(collection, token, access):
    """Access contains username, password, and prefix for DataCite"""
    keys = dataset.keys(collection)
    for a in access:

        username = a["username"]
        password = a["password"]
        prefix = a["prefix"]

        # Initialize the MDS client.
        d = DataCiteMDSClient(
            username=username,
            password=password,
            prefix=prefix,
            url="https://mds.datacite.org",
        )

        for k in keys:
            print(k)
            metadata, err = dataset.read(collection, k)
            if err != "":
                print(err)
                exit()
            # Get rid of Key from dataset
            metadata.pop("_Key")

            if "identifier" in metadata:
                record_doi = metadata["identifier"]["identifier"]

                # Handle records with 4.3 metadata elements
                if "schemaVersion" in metadata:
                    metadata.pop("schemaVersion")
                if "types" in metadata:
                    metadata.pop("types")

                if record_doi.split("/")[0] == prefix:
                    result = schema40.validate(metadata)
                    # Debugging if this fails
                    if result == False:
                        print(metadata)
                        v = schema40.validator
                        errors = sorted(v.iter_errors(metadata),
                                        key=lambda e: e.path)
                        for error in errors:
                            print(error.message)
                        exit()

                    xml = schema40.tostring(metadata)

                    response = d.metadata_post(xml)
                    print(response)
Example #7
def get_multiple_links(input_collection, output_collection):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                count = idvs.count(idv["relatedIdentifier"])
                if count > 1:
                    print("DUPE")
                    print(k)
                    print(idv["relatedIdentifier"])
Example #8
def match_codemeta():
    collection = "github_records.ds"
    keys = dataset.keys(collection)
    for k in keys:
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        if "completed" not in existing:
            print("Processing new record ", k)
            if dataset.attachments(collection, k) != "":
                dataset.detach(collection, k)

                # Update CaltechDATA
                token = os.environ["TINDTOK"]

                infile = open("codemeta.json", "r")
                try:
                    meta = json.load(infile)
                except ValueError:
                    print("Invalid json file - Skipping forever ", k)
                else:
                    standardized = codemeta_to_datacite(meta)

                    # Check that all records have a GitHub subject tag
                    add = True
                    for s in standardized["subjects"]:
                        if s["subject"] == "Github":
                            add = False
                        if s["subject"] == "GitHub":
                            add = False
                    if add == True:
                        standardized["subjects"].append({"subject": "GitHub"})
                    response = caltechdata_edit(token, k, standardized, {}, {}, True)
                    print(response)
                os.system("rm codemeta.json")

            existing["completed"] = "True"
            if not dataset.update(collection, k, existing):
                err = dataset.error_message()
                print(f"Unexpected error on read: {err}")
Example #9
def add_files(collection):
    #Run through all elements in collection
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != '':
            print(err)
            exit()
        url = record['url_links']
        print('Processing file from ', url)
        #Make a dummy file to represent results from kallisto
        files = ['example_file' + k]
        for f in files:
            with open(f, "w") as file:
                file.write(" 0 1 0 " + k)
        #Now attach file to collection
        err = dataset.attach(collection, k, files)
        if err != '':
            print(err)
            exit()
        #Cleanup local disk
        for f in files:
            os.remove(f)
Example #10
    )
    parser.add_argument("-recid", help="Eprints recid")
    parser.add_argument("-start_recid", help="Eprints recid to start at")
    parser.add_argument(
        "-test",
        help=
        "Uses feeds data and writes report of what would be changed, but makes no changes. Provide output file name",
    )
    parser.add_argument("-username", help="Eprints username")
    parser.add_argument("-password", help="Eprints password")

    args = parser.parse_args()

    if args.test:
        source = get_caltechfeed(args.repository)
        keys = dataset.keys(source)
        fout = open("../" + args.test, "w", newline="\n", encoding="utf-8-sig")
        file_out = csv.writer(fout)
    else:
        if args.repository in ["authors", "thesis", "caltechcampuspubs"]:
            source = "https://"
        else:
            source = "http://"
        if args.username:
            source = source + args.username + ":" + args.password + "@"
        source = source + args.repository + ".library.caltech.edu"
        keys = get_eprint_keys(source)
        file_out = None
    if args.start_recid:
        keys = [k for k in keys if int(k) >= int(args.start_recid)]
    if args.update_type == "resolver":
Example #11
def test_attachments(t, collection_name):
    t.print("Testing attach, attachments, detach and prune")
    # Generate two files to attach.
    with open('a1.txt', 'w') as text_file:
        text_file.write('This is file a1')
    with open('a2.txt', 'w') as text_file:
        text_file.write('This is file a2')
    filenames = ['a1.txt', 'a2.txt']

    if dataset.status(collection_name) == False:
        t.error("Failed,", collection_name, "missing")
        return
    keys = dataset.keys(collection_name)
    if len(keys) < 1:
        t.error("Failed,", collection_name, "should have keys")
        return

    key = keys[0]
    if dataset.attach(collection_name, key, filenames) == False:
        err = dataset.error_message()
        t.error("Failed, to attach files for", collection_name, key, filenames,
                ', ', err)
        return

    l = dataset.attachments(collection_name, key)
    if len(l) != 2:
        t.error("Failed, expected two attachments for", collection_name, key,
                "got", l)
        return

    #Check that attachments aren't impacted by update
    if dataset.update(collection_name, key, {"testing": "update"}) == False:
        err = dataset.error_message()
        t.error("Failed, to update record", collection_name, key, err)
        return
    l = dataset.attachments(collection_name, key)
    if len(l) != 2:
        t.error("Failed, expected two attachments after update for",
                collection_name, key, "got", l)
        return

    if os.path.exists(filenames[0]):
        os.remove(filenames[0])
    if os.path.exists(filenames[1]):
        os.remove(filenames[1])

    # First try detaching one file.
    if dataset.detach(collection_name, key, [filenames[1]]) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for", collection_name, key,
                filenames[1], ', ', err)
    if os.path.exists(filenames[1]):
        os.remove(filenames[1])
    else:
        t.error("Failed to detch", filenames[1], "from", collection_name, key)

    # Test detach with explicit filenames
    if dataset.detach(collection_name, key, filenames) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for", collection_name, key, filenames,
                ', ', err)

    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)
        else:
            t.error("Failed, expected", fname, "to be detached from",
                    collection_name, key)

    # Test detaching all files
    if dataset.detach(collection_name, key, []) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for (detaching all)", collection_name,
                key, ', ', err)
    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)
        else:
            t.error("Failed, expected", fname, "for detaching all from",
                    collection_name, key)

    if dataset.prune(collection_name, key, [filenames[0]]) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for prune", collection_name, key,
                [filenames[0]], ', ', err)
    l = dataset.attachments(collection_name, key)
    if len(l) != 1:
        t.error("Failed, expected one file after prune for", collection_name,
                key, [filenames[0]], "got", l)

    if dataset.prune(collection_name, key, []) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for prune (all)", collection_name, key,
                ', ', err)
    l = dataset.attachments(collection_name, key)
    if len(l) != 0:
        t.error("Failed, expected zero files after prune for", collection_name,
                key, "got", l)
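A condensed sketch of the attach/attachments/detach/prune cycle tested above, following the same boolean-return convention; the collection and key are placeholders:

from py_dataset import dataset

c_name, key = "mydata.ds", "2488"  # placeholders for an existing collection and record
with open("report.txt", "w") as f:
    f.write("example attachment")
if not dataset.attach(c_name, key, ["report.txt"]):
    print(dataset.error_message())
print(dataset.attachments(c_name, key))      # list what is attached
dataset.detach(c_name, key, ["report.txt"])  # copy the file back to disk
dataset.prune(c_name, key, ["report.txt"])   # remove it from the collection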
Example #12
args = parser.parse_args()

name = args.data_collection[0]
sheet = args.input_sheet[0]
output_sheet = args.output_sheet[0]

import_coll = "imported.ds"
os.system("rm -rf imported.ds")
dataset.init(import_coll)

os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"
err = dataset.import_gsheet(import_coll, sheet, 'Sheet1', 1, 'A:CZ')
if err != '':
    print(err)

keys = dataset.keys(import_coll)

coauthors = []

count = 0
for key in progressbar(keys, redirect_stdout=True):
    record, err = dataset.read(name, key)
    if err != "":
        print(err)
    count = 0
    if 'identifiers' in record:
        identifiers = record['identifiers']
    else:
        identifiers = []
    print(key)
    print(record)
Example #13
def test_sync_csv(t, c_name):
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return

    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    csv_name = c_name.strip(".ds") + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)

    # Import CSV into collection
    if dataset.import_csv(c_name, csv_name, True) == False:
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if dataset.has_key(c_name, key) == False:
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key('five') == False, got True")
    if dataset.create(c_name, "five", {"value": 5}) == False:
        err = dataset.error_message()
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return

    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if dataset.frame_create(c_name, frame_name, keys, ["._Key", ".value"],
                            ["_Key", "value"]) == False:
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return

    #NOTE: Tests for sync_send_csv and sync_receive_csv
    if dataset.sync_send_csv(c_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
        if 'five' not in src:
            t.error(f"expected 'five' in src, got {src}")

    # Now remove "five" from collection
    if dataset.delete(c_name, "five") == False:
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key(five) == False, got True")
        return
    if dataset.sync_recieve_csv(c_name, frame_name, csv_name, False) == False:
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if dataset.has_key(c_name, "five") == False:
        t.error(f"expected has_key(five) == True, got False")
        return
Example #14
def update_datacite_media(username, password, collection, prefix):
    keys = dataset.keys(collection)

    if path.exists("mediaupdate"):
        with open("mediaupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(keys, redirect_stdout=True):
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        atlas = False
        subjects = existing["subjects"]
        for subject in subjects:
            if (subject["subject"].strip() ==
                    "Atlas of Bacterial and Archaeal Cell Structure"):
                atlas = True
        record_update = datetime.fromisoformat(existing["updated"]).date()
        # Subtraction to get window to grab records that were updated between runs
        if record_update > update - timedelta(days=2):
            if "electronic_location_and_access" in existing:
                doi = existing["identifier"]["identifier"]
                record_prefix = doi.split("/")[0]
                if record_prefix == prefix:
                    delete_datacite_media(username, password, doi)
                    for file_met in existing["electronic_location_and_access"]:
                        url = "https://mds.datacite.org/media/" + doi
                        headers = {
                            "Content-Type": "application/txt;charset=UTF-8"
                        }
                        extension = file_met["electronic_name"][0].split(
                            ".")[-1]
                        filename = file_met["electronic_name"][0].split(".")[0]
                        data = {}
                        if extension == "nc":
                            data = ("application/x-netcdf=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "mp4":
                            if atlas:
                                data = (
                                    "video/mp4=" +
                                    "https://www.cellstructureatlas.org/videos/"
                                    + filename + ".mp4")
                            else:
                                data = (
                                    "video/mp4=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "mj2":
                            data = ("video/mj2=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "avi":
                            data = ("video/avi=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "mov":
                            data = ("video/quicktime=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "gz":
                            data = ("application/gzip=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "zip":
                            data = ("application/zip=" +
                                    file_met["uniform_resource_identifier"])
                        elif extension == "h5ad":
                            data = ("application/octet-stream=" +
                                    file_met["uniform_resource_identifier"])
                        if data != {}:
                            print(doi)
                            print(data)
                            r = requests.post(
                                url,
                                data=data.encode("utf-8"),
                                auth=(username, password),
                                headers=headers,
                            )
                            print(r)
Example #15
        description="caltechdata_backup queries the caltechDATA (Invenio 3) API\
    returns data and adds to dataset structure on disk")

    collection = "caltechdata.ds"
    if os.path.isdir(collection) == False:
        err = dataset.init(collection)
        if err != "":
            print(f"Failed on create {err}")
            exit()

    args = parser.parse_args()

    api_url = "https://data.caltech.edu/api/records/"

    # Get the existing records
    current = dataset.keys(collection)
    req = requests.get(api_url)
    data = req.json()

    temp = 'temp'
    if os.path.isdir(temp) == False:
        os.mkdir(temp)
    os.chdir(temp)
    collection = '../' + collection

    read_records(data["hits"]["hits"], current, collection)
    # if we have more pages of data
    while "next" in data["links"]:
        req = requests.get(data["links"]["next"])
        data = req.json()
Example #16
def add_usage(collection, token, usage_collection, production=True):
    """Add in usage text in the description field"""
    keys = dataset.keys(collection)
    biggest_views = 0
    biggest_views_record = ""
    biggest_downloads = 0
    biggest_downloads_record = ""
    total_views = 0
    total_downloads = 0
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        usage, err = dataset.read(usage_collection, k)
        views = usage["grand-total-unique-investigations"]
        downloads = usage["grand-total-unique-requests"]
        if views > biggest_views:
            biggest_views = views
            biggest_views_record = k
        if downloads > biggest_downloads:
            biggest_downloads = downloads
            biggest_downloads_record = k
        total_views += views
        total_downloads += downloads
        date = datetime.fromisoformat(usage["dataset-dates"][0]["value"])
        now = datetime.today()
        first = date.strftime("%B %d, %Y")
        last = now.strftime("%B %d, %Y")
        if views > 1:
            u_txt = (
                "<br>Unique Views: "
                + str(views)
                + "<br>Unique Downloads: "
                + str(downloads)
                + "<br> between "
                + first
                + " and "
                + last
                + '<br><a href="https://data.caltech.edu/stats"'
                + ">More info on how stats are collected</a><br>"
            )
            description = record["descriptions"]
            use_exists = False
            for d in description:
                descr_text = d["description"]
                # We always update an existing listing
                if descr_text.startswith("<br>Unique Views:"):
                    d["description"] = u_txt
                    use_exists = True
            # Otherwise we add a new one
            if use_exists == False:
                description.append({"descriptionType": "Other", "description": u_txt})
            response = caltechdata_edit(
                token, k, {"descriptions": description}, {}, {}, production
            )
            print(response)
    print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}")
    print(f"Most views {biggest_views} for record {biggest_views_record}")
    print(f"Total downloads {total_downloads}")
    print(f"Total views {total_views}")
Example #17
def test_keys(t, collection_name):
    '''test_keys(collection_name) test getting, filter and sorting keys'''
    # Test count after delete
    key_list = dataset.keys(collection_name)
    cnt = dataset.count(collection_name)
    if cnt != 0:
        t.error("Failed, expected zero records, got", cnt, key_list)

    #
    # Generate multiple records for collection for testing keys
    #
    test_records = {
        "gutenberg:21489": {
            "title": "The Secret of the Island",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "http://www.gutenberg.org/ebooks/21489",
            "categories": "fiction, novel"
        },
        "gutenberg:2488": {
            "title":
            "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "https://www.gutenberg.org/ebooks/2488",
            "categories": "fiction, novel"
        },
        "gutenberg:21839": {
            "title": "Sense and Sensibility",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jane",
                "family": "Austin"
            }],
            "url": "http://www.gutenberg.org/ebooks/21839",
            "categories": "fiction, novel"
        },
        "gutenberg:3186": {
            "title": "The Mysterious Stranger, and Other Stories",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Mark",
                "family": "Twain"
            }],
            "url": "http://www.gutenberg.org/ebooks/3186",
            "categories": "fiction, short story"
        },
        "hathi:uc1321060001561131": {
            "title":
            "A year of American travel - Narrative of personal experience",
            "formats": ["pdf"],
            "authors": [{
                "given": "Jessie Benton",
                "family": "Fremont"
            }],
            "url":
            "https://babel.hathitrust.org/cgi/pt?id=uc1.32106000561131;view=1up;seq=9",
            "categories": "non-fiction, memoir"
        }
    }
    test_count = len(test_records)

    for k in test_records:
        v = test_records[k]
        if dataset.create(collection_name, k, v) == False:
            err = dataset.error_message()
            t.error("Failed, could not add", k, "to", collection_name, ', ',
                    err)

    # Test keys, filtering keys and sorting keys
    all_keys = dataset.keys(collection_name)
    if len(all_keys) != test_count:
        t.error("Expected", test_count, "all_keys back, got", keys)

    #dataset.verbose_on()
    filter_expr = '(eq .categories "non-fiction, memoir")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 1:
        t.error(
            f"key_filter({collection_name}, {keys}, {filter_expre}), Expected one key for",
            filter_expr, "got", filtered_keys)

    filter_expr = '(contains .categories "novel")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 3:
        t.error(
            f"key_filter({collection_name}, {keys}, {filter_expr}), Expected three keys for",
            filter_expr, "got", filtered_keys)

    sort_expr = '+.title'
    filter_expr = '(contains .categories "novel")'
    sorted_keys = dataset.key_sort(collection_name, filtered_keys, sort_expr)
    if len(sorted_keys) != 3:
        t.error(
            f"key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected three keys for",
            filter_expr, "got", sorted_keys)
    expected_keys = ["gutenberg:21839", "gutenberg:21489", "gutenberg:2488"]
    for i, k in enumerate(expected_keys):
        if i < len(sorted_keys) and sorted_keys[i] != k:
            obj1, _ = dataset.read(collection_name, k)
            obj2, _ = dataset.read(collection_name, sorted_keys[i])
            t.error(
                f'key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected {k} (title "{obj1["title"]}") got {sorted_keys[i]} (title "{obj2["title"]}")'
            )
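A condensed sketch of the key filtering and sorting shown in the test, assuming an existing collection (placeholder name) whose records carry .categories and .title fields:

from py_dataset import dataset

c_name = "books.ds"  # placeholder
all_keys = dataset.keys(c_name)
novels = dataset.key_filter(c_name, all_keys, '(contains .categories "novel")')
by_title = dataset.key_sort(c_name, novels, '+.title')
for key in by_title:
    obj, err = dataset.read(c_name, key)
    if err == "":
        print(key, obj["title"])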
Example #18
def fix_multiple_links(input_collection, token):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            new = []
            dupes = []
            replace = False
            record_doi = record["identifier"]["identifier"]
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == record_doi:
                    # Having a related identifier that is the same as the record
                    # doi doesn't make any sense
                    replace = True
                    dupes.append(identifier)
                else:
                    count = idvs.count(identifier)
                    if count > 1:
                        replace = True
                        if identifier not in dupes:
                            # We need to save the first duplicate
                            new.append(idv)
                            # Add to list of those already saved
                            dupes.append(identifier)
                        else:
                            # This will be deleted
                            dupes.append(identifier)
                    else:
                        # Save all unique ids
                        new.append(idv)
            if replace == True:
                print("Duplicate links found in record ", k)
                print("Will delete these links", dupes)
                response = input("Do you approve this change? Y or N")
                new_metadata = {"relatedIdentifiers": new}
                if response == "Y":
                    response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                    print(response)
        if "alternateIdentifiers" in record:
            idtypes = []
            alt_ids = []
            repeat = False
            for idv in record["alternateIdentifiers"]:
                if idv["alternateIdentifierType"] not in idtypes:
                    # If we haven't seen id type before, save it
                    alt_ids.append(idv)
                    idtypes.append(idv["alternateIdentifierType"])
                else:
                    repeat = True
                    print("Will Delete Repeated ID ", idv["alternateIdentifier"])
            if repeat == True:
                new_metadata = {"alternateIdentifiers": alt_ids}
                response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                print(response)
Example #19
def match_cd_refs():
    token = os.environ["TINDTOK"]

    matches = []
    collection = "caltechdata.ds"
    keys = dataset.keys(collection)
    if "mediaupdate" in keys:
        keys.remove("mediaupdate")

    # Get event data results
    event_data = "crossref_refs.ds"
    event_keys = dataset.keys(event_data)
    event_keys.remove("captured")
    f_name = "match_cd_refs"
    dot_paths = [".obj_id", ".id", ".subj_id"]
    labels = ["obj_id", "id", "subj_id"]
    print("Getting Event Data Records")
    if dataset.has_frame(event_data, f_name):
        if not dataset.frame_reframe(event_data, f_name, event_keys):
            err = dataset.error_message()
            print(f"Failed to reframe {f_name} in {event_data}, {err}")
            exit()
    elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels):
        err = dataset.error_message()
        print(f"Failed to create frame {f_name} in {event_data}, {err}")
        exit()
    grid = dataset.frame_grid(event_data, f_name)
    df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"])
    grouped = df.groupby(["obj_id"])
    groups = grouped.groups
    # Look at all CaltechDATA records
    for k in keys:
        # Collect matched new links for the record
        record_matches = []
        print(k)
        metadata, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        doi = "https://doi.org/" + metadata["identifier"]["identifier"]
        if doi in groups:
            hits = grouped.get_group(doi)
            for index, h in hits.iterrows():
                # Trigger for whether we already have this link
                new = True
                if "relatedIdentifiers" in metadata:
                    for m in metadata["relatedIdentifiers"]:
                        if m["relatedIdentifier"] in h["subj_id"]:
                            new = False
                if new == True:
                    match = h["subj_id"]
                    print(match)
                    print(h["obj_id"])
                    inputv = input("Do you approve this link?  Type Y or N: ")
                    if inputv == "Y":
                        record_matches.append(match)
        # If we have to update record
        if len(record_matches) > 0:
            ids = []
            if "relatedIdentifiers" in metadata:
                for m in metadata["relatedIdentifiers"]:
                    ids.append(m)
            matches.append([k, record_matches])
            # Now collect identifiers for record
            for match in record_matches:
                split = match.split("doi.org/")
                new_id = {
                    "relatedIdentifier": split[1],
                    "relatedIdentifierType": "DOI",
                    "relationType": "IsCitedBy",
                }
                ids.append(new_id)
            newmetadata = {"relatedIdentifiers": ids}
            response = caltechdata_edit(token, k, newmetadata, {}, {}, True)
            print(response)
    return matches
Example #20
import os
from py_dataset import dataset
from ames.harvesters import get_caltechfeed, get_records

if __name__ == "__main__":

    import_coll = "imported.ds"
    sheet = "1ZI3-XvQ_3rLcKrF-4FBa2tEInIdQfOnGJ9L_NmhmoGs"
    os.system("rm -rf imported.ds")
    dataset.init(import_coll)
    err = dataset.import_gsheet(import_coll, sheet, "CaltechPEOPLE", 4, "A:AA")
    if err != "":
        print(err)

    people_list = dataset.keys(import_coll)
    people = []
    for p in people_list:
        record, err = dataset.read(import_coll, p)
        people.append(record)

    # Profiles collection from feeds
    profile_ds = "profiles.ds"
    keys = dataset.keys(profile_ds)
    labels = ["orcid", "creator_id"]
    dot_paths = [".orcid", ".creator_id"]

    all_metadata = get_records(dot_paths, "profile", profile_ds, keys, labels)
    for profile in all_metadata:
        if "creator_id" in profile:
            idv = profile["creator_id"]
        else:
Example #21
def agent_report(file_name, repo, aspace):
    dot_paths = [
        "._Key",
        ".directory_info",
        ".ORCID",
        ".sort_name",
        ".ArchivesSpace_ID",
        ".family",
        ".given",
    ]
    labels = ["id", "directory_info", "orcid", "name", "as", "family", "given"]
    source = get_caltechfeed("people")
    keys = dataset.keys(source)
    keys.remove("captured")

    all_metadata = get_records(dot_paths, "p_list", source, keys, labels)

    all_metadata.sort(key=lambda all_metadata: all_metadata["id"])

    fname = file_name.split(".")[0]
    fcaltechpeople = fname + "_caltechpeople.csv"
    fmatched = fname + "_matched.csv"
    fnew_caltechpeople = fname + "_newcaltechpeople.csv"
    fnew_aspace = fname + "_newaspace.csv"

    caltechpeople = csv.writer(open(fcaltechpeople, "w"))
    matched = csv.writer(open(fmatched, "w"))
    new_caltechpeople = csv.writer(open(fnew_caltechpeople, "w"))
    new_aspace = csv.writer(open(fnew_aspace, "w"))

    to_match = {}
    gen_match = {}
    already_matched = {}

    aspace_url = "https://collections.archives.caltech.edu/agents/people/"
    feeds_url = "https://feeds.library.caltech.edu/people/"

    for metadata in all_metadata:
        if "as" in metadata:
            if metadata["as"] != "":
                already_matched[metadata["as"]] = metadata
            else:
                to_match[metadata["name"]] = metadata
                gen_match[metadata["family"]] = metadata
    print(f"{len(already_matched)} agents already in CaltechPEOPLE")

    print(f"Requesting agents")
    for agent in progressbar(aspace.agents):
        if agent.agent_type == "agent_person":
            primary_name = agent.display_name.primary_name
            name = agent.display_name.sort_name
            published = agent.publish
            uid = int(agent.uri.split("/")[-1])
            if uid not in already_matched:
                if name in to_match:
                    person = to_match[name]
                    matched.writerow([
                        person["name"],
                        uid,
                        aspace_url + str(uid),
                        person["id"],
                        feeds_url + person["id"],
                        published,
                    ])
                    to_match.pop(name)
                else:
                    new_caltechpeople.writerow(
                        [name, uid, aspace_url + str(uid), published])
            else:
                metadata = already_matched[uid]
                caltechpeople.writerow([
                    metadata["name"],
                    metadata["as"],
                    aspace_url + str(metadata["as"]),
                    metadata["id"],
                    feeds_url + metadata["id"],
                    published,
                ])

    for name in to_match:
        new_aspace.writerow(
            [name, to_match[name]["id"], feeds_url + to_match[name]["id"]])
Example #22
def test_issue43(t, collection_name, csv_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if os.path.exists(csv_name):
        os.remove(csv_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'Failed, need a {collection_name} to run test, {err}')
        return
    table = {
        "r1": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        },
        "r2": {
            "c1": "one",
            "c3": "three",
            "c4": "four"
        },
        "r3": {
            "c1": "one",
            "c2": "two",
            "c4": "four"
        },
        "r4": {
            "c1": "one",
            "c2": "two",
            "c3": "three"
        },
        "r5": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        }
    }
    for key in table:
        row = table[key]
        if dataset.create(collection_name, key, row) == False:
            err = dataset.error_message()
            t.error(f"Can't add test row {key} to {collection_name}, {err}")
            return

    dataset.use_strict_dotpath(False)
    # Setup frame
    frame_name = 'f1'
    keys = dataset.keys(collection_name)
    if dataset.frame_create(collection_name, frame_name, keys,
                            ["._Key", ".c1", ".c2", ".c3", ".c4"],
                            ["_Key", "c1", "c2", "c3", "c4"]) == False:
        err = dataset.error_message()
        t.error(err)
        return
    if dataset.export_csv(collection_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'export_csv({collection_name}, {frame_name}, {csv_name} should have emitted warnings, not error, {err}'
        )
        return
    with open(csv_name, mode='r', encoding='utf-8') as f:
        rows = f.read()

    for row in rows.split('\n'):
        if len(row) > 0:
            cells = row.split(',')
            if len(cells) < 5:
                t.error(f'row error {csv_name} for {cells}')
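A condensed sketch of the frame-to-CSV export exercised above, assuming the collection and frame created in the test already exist (names are placeholders otherwise):

from py_dataset import dataset

c_name, frame_name = "issue43.ds", "f1"  # placeholders
if not dataset.export_csv(c_name, frame_name, "issue43.csv"):
    print(dataset.error_message())
else:
    with open("issue43.csv", encoding="utf-8") as f:
        print(f.read())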
Example #23
#!/usr/bin/env python3

import sys
from datetime import datetime
from py_dataset import dataset

#
# Loop through the keys, fetch the record and append a _Key: "deposit" to
# each object.
#

c_name = "people.ds"
keys = dataset.keys(c_name)
#print(f"DEBUG Keys: {keys}")
for key in keys:
    print(f"Fixing key {key}")
    data, err = dataset.read(c_name, key)
    if err != "":
        print(f"Error read {c_name} -> {key}, {err}")
        sys.exit(1)
    # Make fieldname lower case
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    obj = {
        "_Key": key,
        "_State": "deposit",
        "_Updated": f"{dt}",
        "_Created": f"{dt}"
    }
    for field in data:
        fkey = field.lower()
        if not ' ' in fkey:
Example #24
    os.mkdir("data")
os.chdir("data")

production = True
collection = "caltechdata.ds"

files = True

if files:
    get_caltechdata(collection, production)
    mapping = file_mapping(collection)

history = False

if history:
    keys = dataset.keys(collection)
    h_collection = "caltechdata_history.ds"
    get_history(h_collection, collection, keys)
    mapping = file_mapping(h_collection)

update = True

usage_collection = "caltechdata_usage.ds"
if update:
    token = os.environ["MATTOK"]
    build_usage(collection, usage_collection)
    get_usage(usage_collection, mapping, token)
    token = os.environ["TINDTOK"]
    add_usage(collection, token, usage_collection, production)

aggregate = True
Example #25
def add_thesis_doi(data_collection, thesis_collection, token, production=True):
    """Add in theis DOI to CaltechDATA records"""

    # Search across CaltechTHESIS DOIs
    dot_paths = ["._Key", ".doi", ".official_url", ".related_url"]
    labels = ["eprint_id", "doi", "official_url", "related_url"]
    keys = dataset.keys(thesis_collection)
    all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels)
    dois = []
    for metadata in progressbar(all_metadata, redirect_stdout=True):
        if "doi" in metadata:
            record_doi = metadata["doi"].strip()
            if "related_url" in metadata and "items" in metadata["related_url"]:
                items = metadata["related_url"]["items"]
                for item in items:
                    url = ""
                    itype = ""
                    if "url" in item:
                        url = item["url"].strip()
                    if "type" in item:
                        itype = item["type"].strip().lower()
                    if itype == "doi":
                        if idutils.is_doi(url):
                            doi = "10." + url.split("10.")[1]
                            prefix = doi.split("/")[0]
                            if prefix == "10.22002":
                                dois.append([doi, record_doi])
                        else:
                            print("Ignoring non-DOI")
                            print(metadata["eprint_id"])
                            print(url.split("10."))
    for doi_link in dois:
        cd_doi = doi_link[0]
        thesis_doi = doi_link[1]
        print("Checking " + cd_doi)
        if "D1" in cd_doi:
            record_number = cd_doi.split("D1.")[1]
        if "d1" in cd_doi:
            record_number = cd_doi.split("d1.")[1]
        record, err = dataset.read(data_collection, record_number)
        if err != "":
            print(err)
            exit()

        done = False
        if "relatedIdentifiers" in record:
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == thesis_doi:
                    done = True
            if done == False:
                identifiers = record["relatedIdentifiers"]
                identifiers.append(
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                )
                new_metadata = {"relatedIdentifiers": identifiers}
        else:
            new_metadata = {
                "relatedIdentifiers": [
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                ]
            }
        if done == False:
            print("Adding " + thesis_doi + " to " + cd_doi)
            response = caltechdata_edit(
                token, record_number, new_metadata, {}, {}, True
            )
            print(response)
Example #26
def test_basic(t, collection_name):
    '''test_basic(collection_name) runs tests on basic CRUD ops'''
    # Setup a test record
    key = "2488"
    value = {
        "title":
        "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
        "formats": ["epub", "kindle", "plain text"],
        "authors": [{
            "given": "Jules",
            "family": "Verne"
        }],
        "url": "https://www.gutenberg.org/ebooks/2488"
    }

    # We should have an empty collection, we will create our test record.
    if dataset.create(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f'create({collection_name}, {key}, {value}) failed, {err}')
        return

    # Check to see that we have only one record
    key_count = dataset.count(collection_name)
    if key_count != 1:
        t.error(f"Failed, expected count to be 1, got {key_count}")

    # Do a minimal test to see if the record looks like it has content
    keyList = dataset.keys(collection_name)
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"epxected {rec[k]} got {v}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with list v, got {v}")

    # Test updating record
    value["verified"] = True
    if dataset.update(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f"update({collection_name}, {key}, {value}) failed, {err}")
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error("expected {rec[k]} got {v} for key {k}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error("Failed, expected {k} with a list for v, got {v}")

    # Test path to record
    expected_s = "/".join(
        [collection_name, "pairtree", "24", "88", (key + ".json")])
    expected_l = len(expected_s)
    p = dataset.path(collection_name, key)
    if len(p) != expected_l:
        t.error("Failed, expected length", expected_l, "got", len(p))
    if p != expected_s:
        t.error("Failed, expected", expected_s, "got", p)

    # Test listing records
    l = dataset.list(collection_name, [key])
    if len(l) != 1:
        t.error(
            f"list({collection_name}, [{key}]) failed, list should return an array of one record, got",
            l)
        return

    # test deleting a record
    if dataset.delete(collection_name, key) == False:
        err = dataset.error_message()
        t.error("Failed, could not delete record", key, ", ", err)
Example #27
def release_files(source, base_url, outfile=None):
    if source.split(".")[-1] == "ds":
        # This generates report
        dot_paths = [
            ".eprint_id",
            ".documents",
            ".date",
            ".eprint_status",
            ".creators.items[0].name.family",
            ".thesis_type",
            ".full_text_status",
        ]
        labels = [
            "eprint_id",
            "documents",
            "date",
            "status",
            "family",
            "type",
            "full_text",
        ]
        keys = dataset.keys(source)
        all_metadata = get_records(dot_paths, "official", source, keys, labels)
        all_metadata.sort(key=lambda all_metadata: all_metadata["family"])
        all_metadata.sort(key=lambda all_metadata: all_metadata["date"])
        for meta in all_metadata:
            year = meta["date"].split("-")[0]
            if is_in_range("2004-2005", year):
                if thesis_match(meta):
                    files = []
                    fnames = []
                    count = 0
                    for document in meta["documents"]:
                        count = count + 1
                        if document["security"] == "validuser":
                            files.append(count)
                            fnames.append(document["main"])
                    if len(files) > 0:
                        eprint_id = meta["eprint_id"]
                        print(eprint_id)
                        outfile.writerow(
                            [
                                year,
                                meta["family"],
                                eprint_id,
                                meta["status"],
                                meta["full_text"],
                                files,
                                fnames,
                            ]
                        )
                        mixed = False
                        for filen in files:
                            new = "public"
                            # Doc status
                            url = (
                                base_url
                                + "/rest/eprint/"
                                + str(eprint_id)
                                + "/full_text_status.txt"
                            )
                            response = requests.get(url)
                            eprint_status = response.text
                            if eprint_status == "restricted":
                                response = requests.put(url, data=new, headers=headers)
                                print(response)
                            elif eprint_status == "mixed":
                                print("mixed, skipping")
                                mixed = True
                            elif eprint_status != "public":
                                print(eprint_status)
                                print(url)
                                exit()
                            url = (
                                base_url
                                + "/rest/eprint/"
                                + str(eprint_id)
                                + "/documents/"
                                + str(filen)
                                + "/security.txt"
                            )
                            headers = {"content-type": "text/plain"}
                            response = requests.get(url)
                            live_status = response.text
                            if not mixed:
                                if live_status == "validuser":
                                    response = requests.put(
                                        url, data=new, headers=headers
                                    )
                                    print(response)
                                elif live_status != "public":
                                    print(live_status)
                                    print(url)
                                    exit()
Example #28
        os.remove(tarball)
        sys.stdout.flush()


if len(sys.argv) == 1:
    app = os.path.basename(sys.argv[0])
    print(f"USAGE: {app} DATASET_NAME", end="\n\n")
    print(
        "Converts attachments in a dataset from tarballs to v0.0.62 attachment scheme",
        end="\n\n")
    sys.exit(0)

if not os.path.exists("tmp-attachment-migration"):
    os.mkdir("tmp-attachment-migration")
os.chdir("tmp-attachment-migration")
print(f"Working directory for migration is {os.getcwd()}")
for c_name in sys.argv[1:]:
    keys = dataset.keys(os.path.join("..", c_name))
    if isinstance(keys[0], int):
        keys.sort(key=int)
    else:
        keys.sort()
    tot = len(keys)
    print(f"Ready to process {tot} objects")
    for i, key in enumerate(keys):
        if (i > 0) and (i % 500) == 0:
            print(f"\n{i} of {tot} processed")
        migrate_attachment(os.path.join("..", c_name), key)
    print()
    print(f"Procssing {c_name} complete")
Example #29
def aggregate_usage(usage_collection, month_collection):
    keys = dataset.keys(usage_collection)
    keys.remove("end-date")
    for k in progressbar(keys):
        record, err = dataset.read(usage_collection, k)
        if err != "":
            print(err)
        use = {}
        views = {}
        for usage in record["performance"]:
            split = usage["period"].split("-")
            month = split[0] + "-" + split[1]
            for u in usage["instance"]:
                metric = u["metric-type"]
                if metric == "unique-dataset-requests":
                    if month in use:
                        use[month] += u["count"]
                    else:
                        use[month] = u["count"]
                if metric == "unique-dataset-investigations":
                    if month in views:
                        views[month] += u["count"]
                    else:
                        views[month] = u["count"]
        # Strip non-counter stuff
        record.pop("_Key")
        record.pop("grand-total-unique-requests")
        record.pop("grand-total-unique-investigations")
        # go across months
        for view in views:
            split = view.split("-")
            date_obj = datetime(int(split[0]), int(split[1]), 1)
            d_range = get_month_day_range(date_obj)
            performance = [
                {
                    "period": {
                        "begin-date": d_range[0].date().isoformat(),
                        "end-date": d_range[1].date().isoformat(),
                    },
                    "instance": [],
                }
            ]
            v = views[view]
            performance[0]["instance"].append(
                {
                    "count": v,
                    "metric-type": "unique-dataset-investigations",
                    "access-method": "regular",
                }
            )
            # Handle when we have both views and uses in a given month
            if view in use:
                u = use[view]
                performance[0]["instance"].append(
                    {
                        "count": u,
                        "metric-type": "unique-dataset-requests",
                        "access-method": "regular",
                    }
                )
            existing, err = dataset.read(month_collection, view)
            if err != "":
                print(err)
            record["performance"] = performance
            existing["report-datasets"].append(record)
            if not dataset.update(month_collection, view, existing):
                err = dataset.error_message()
                print(err)
        for use_date in use:
            # We only have use-only records left to handle
            if use_date not in views:
                u = use[use_date]
                split = use_date.split("-")
                date_obj = datetime(int(split[0]), int(split[1]), 1)
                d_range = get_month_day_range(date_obj)
                performance = [
                    {
                        "period": {
                            "begin-date": d_range[0].date().isoformat(),
                            "end-date": d_range[1].date().isoformat(),
                        },
                        "instance": [
                            {
                                "count": u,
                                "metric-type": "unique-dataset-requests",
                                "access-method": "regular",
                            }
                        ],
                    }
                ]
                existing, err = dataset.read(month_collection, use_date)
                if err != "":
                    print(err)
                record["performance"] = performance
                existing["report-datasets"].append(record)
                if not dataset.update(month_collection, use_date, existing):
                    err = dataset.error_message()
                    print(err)