Example #1
def edit_ctd(idv, metadata=None, filenames=None, production=False):
    """Edit an entry at CaltechDATA.
    metadata can be an empty dict ({}) and filenames a list of files
    associated with the existing idv.
    filenames is a list of strings with the full path to each file for upload.
    """
    # Avoid mutable default arguments, which are shared across calls
    if metadata is None:
        metadata = {}
    if filenames is None:
        filenames = []

    # upload metadata and supporting data; `token` is expected to be
    # defined at module scope
    caltechdata_edit(ids=idv,
                     token=token,
                     metadata=metadata,
                     files=filenames,
                     production=production)
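
# A minimal usage sketch (record id and file path are hypothetical; assumes
# `token` is defined at module scope, as the function body above expects):
if __name__ == "__main__":
    edit_ctd("12345",
             metadata={"version": "1.1"},
             filenames=["/path/to/data.csv"],
             production=False)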
Example #2
def add_citation(collection, token, production=True):
    """Add in example citation text in the description field"""
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        description = record["descriptions"]
        cite_exists = False
        for d in description:
            descr_text = d["description"]
            if descr_text.startswith("<br>Cite this record as:"):
                cite_exists = True
        if not cite_exists:
            record_doi = record["identifier"]["identifier"]
            headers = {"Accept": "text/x-bibliography; style=apa"}
            doi_url = "https://doi.org/" + record_doi
            citation = requests.get(doi_url, headers=headers).text
            if doi_url in citation:
                # Check that we have a citation and not a server error,
                # otherwise wait till next time
                n_txt = citation_text(citation, doi_url, record_doi)
                description.append({"descriptionType": "Other", "description": n_txt})
                response = caltechdata_edit(
                    token, k, {"descriptions": description}, {}, {}, production
                )
                print(response)
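
# citation_text is defined elsewhere in the source repository; a plausible
# sketch (an assumption, not the actual implementation) that yields the
# "<br>Cite this record as:" prefix the startswith() check above relies on:
def citation_text_sketch(citation, doi_url, record_doi):
    # Turn the bare DOI URL inside the APA citation into a hyperlink
    linked = citation.strip().replace(
        doi_url, f'<a href="{doi_url}">{doi_url}</a>'
    )
    return "<br>Cite this record as:<br>" + linked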
Example #3
def update_citation(record, rid, token, production=True):
    """Update example citation text in the description field"""
    description = record["descriptions"]
    for d in description:
        descr_text = d["description"]
        if descr_text.startswith("<br>Cite this record as:"):
            record_doi = record["identifier"]["identifier"]
            headers = {"Accept": "text/x-bibliography; style=apa"}
            citation = requests.get(
                "https://doi.org/" + record_doi, headers=headers
            ).text
            doi_url = "https://doi.org/" + record_doi.lower()
            if doi_url in citation.lower():
                # Check that we have a citation and not a server error,
                # otherwise wait till next time
                d["description"] = citation_text(citation, doi_url, record_doi)
    response = caltechdata_edit(
        token, rid, {"descriptions": description}, {}, {}, production
    )
    print(response)
Example #4
def match_codemeta():
    collection = "github_records.ds"
    keys = dataset.keys(collection)
    for k in keys:
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        if "completed" not in existing:
            print("Processing new record ", k)
            if dataset.attachments(collection, k) != "":
                dataset.detach(collection, k)

                # Update CaltechDATA
                token = os.environ["TINDTOK"]

                with open("codemeta.json", "r") as infile:
                    try:
                        meta = json.load(infile)
                    except json.JSONDecodeError:
                        print("Invalid json file - Skipping forever ", k)
                    else:
                        standardized = codemeta_to_datacite(meta)

                        # Check that all records have a GitHub subject tag
                        add = True
                        for s in standardized["subjects"]:
                            if s["subject"] in ("Github", "GitHub"):
                                add = False
                        if add:
                            standardized["subjects"].append({"subject": "GitHub"})
                        response = caltechdata_edit(token, k, standardized, {}, {}, True)
                        print(response)
                os.remove("codemeta.json")

            existing["completed"] = "True"
            if not dataset.update(collection, k, existing):
                err = dataset.error_message()
                print(f"Unexpected error on read: {err}")
Example #5
    prefix=prefix,
    #test_mode=True
)

doi_end = subprocess.check_output(['./gen-cool-doi'], universal_newlines=True)
identifier = str(prefix) + '/' + str(doi_end)

metadata['identifier'] = {'identifier': identifier, 'identifierType': 'DOI'}

assert schema40.validate(metadata)
# Debugging if this fails:
# errors = sorted(schema40.validator.iter_errors(metadata), key=lambda e: e.path)
# for error in errors:
#     print(error.message)

xml = schema40.tostring(metadata)
d.metadata_post(xml)
d.doi_post(identifier, url)
print('DOI minted: ' + identifier)

metadata = {}

metadata['relatedIdentifiers'] = [{
    "relatedIdentifier": url,
    'relatedIdentifierType': 'URL',
    'relationType': 'IsSourceOf'
}]

caltechdata_edit(token, idv, metadata)
Example #6
                            build_relation(client, version)
                        ]
                    files = [outfile]
                    response = caltechdata_write(metadata, token, files, True)
                    print(response)
                    new_id = response.split('/')[4].split('.')[0]
                    with open(history_file, 'a') as outf:
                        outf.write(version + ',' + new_id)
                    #Cleanup
                    os.remove(outfile)
                else:
                    #Minor release - just edit the same CaltechDATA record
                    with open(args.json_file[0], 'r') as metaf:
                        metadata = json.load(metaf)
                    for d in metadata['dates']:
                        if d['dateType'] == 'Updated':
                            d['date'] = datetime.date.today().isoformat()
                    metadata['version'] = version
                    metadata['relatedIdentifiers'].append(
                        build_relation(client, version))
                    files = [outfile]
                    response = caltechdata_edit(token, archived_ids[mj],
                                                metadata, files, ['tgz'], True)
                    print(response)
                    #Not strictly necessary, but will prevent multiple edits
                    new_id = response.split('/')[4].split('.')[0]
                    with open(history_file, 'a') as outf:
                        outf.write(version + ',' + new_id)
                    #Cleanup
                    os.remove(outfile)
Example #7
def add_thesis_doi(data_collection, thesis_collection, token, production=True):
    """Add in theis DOI to CaltechDATA records"""

    # Search across CaltechTHESIS DOIs
    dot_paths = ["._Key", ".doi", ".official_url", ".related_url"]
    labels = ["eprint_id", "doi", "official_url", "related_url"]
    keys = dataset.keys(thesis_collection)
    all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels)
    dois = []
    for metadata in progressbar(all_metadata, redirect_stdout=True):
        if "doi" in metadata:
            record_doi = metadata["doi"].strip()
            if "related_url" in metadata and "items" in metadata["related_url"]:
                items = metadata["related_url"]["items"]
                for item in items:
                    # Use defaults so a missing url/type doesn't leak a
                    # value from a previous iteration (or raise NameError)
                    url = item.get("url", "").strip()
                    itype = item.get("type", "").strip().lower()
                    if itype == "doi":
                        if idutils.is_doi(url):
                            doi = "10." + url.split("10.")[1]
                            prefix = doi.split("/")[0]
                            if prefix == "10.22002":
                                dois.append([doi, record_doi])
                        else:
                            print("Ignoring non-DOI")
                            print(metadata["eprint_id"])
                            print(url.split("10."))
    for doi_link in dois:
        cd_doi = doi_link[0]
        thesis_doi = doi_link[1]
        print("Checking " + cd_doi)
        if "D1" in cd_doi:
            record_number = cd_doi.split("D1.")[1]
        if "d1" in cd_doi:
            record_number = cd_doi.split("d1.")[1]
        record, err = dataset.read(data_collection, record_number)
        if err != "":
            print(err)
            exit()

        done = False
        if "relatedIdentifiers" in record:
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == thesis_doi:
                    done = True
            if not done:
                identifiers = record["relatedIdentifiers"]
                identifiers.append(
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                )
                new_metadata = {"relatedIdentifiers": identifiers}
        else:
            new_metadata = {
                "relatedIdentifiers": [
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                ]
            }
        if not done:
            print("Adding " + thesis_doi + " to " + cd_doi)
            response = caltechdata_edit(
                token, record_number, new_metadata, {}, {}, True
            )
            print(response)
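
# A quick check of the DOI-extraction logic in add_thesis_doi above,
# using a hypothetical related_url value:
url = "https://doi.org/10.22002/D1.1234"
doi = "10." + url.split("10.")[1]        # -> "10.22002/D1.1234"
assert doi.split("/")[0] == "10.22002"   # the prefix filter used above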
Example #8
def add_usage(collection, token, usage_collection, production=True):
    """Add in usage text in the description field"""
    keys = dataset.keys(collection)
    biggest_views = 0
    biggest_views_record = ""
    biggest_downloads = 0
    biggest_downloads_record = ""
    total_views = 0
    total_downloads = 0
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        usage, err = dataset.read(usage_collection, k)
        if err != "":
            print(err)
            exit()
        views = usage["grand-total-unique-investigations"]
        downloads = usage["grand-total-unique-requests"]
        if views > biggest_views:
            biggest_views = views
            biggest_views_record = k
        if downloads > biggest_downloads:
            biggest_downloads = downloads
            biggest_downloads_record = k
        total_views += views
        total_downloads += downloads
        date = datetime.fromisoformat(usage["dataset-dates"][0]["value"])
        now = datetime.today()
        first = date.strftime("%B %d, %Y")
        last = now.strftime("%B %d, %Y")
        if views > 1:
            u_txt = (
                "<br>Unique Views: "
                + str(views)
                + "<br>Unique Downloads: "
                + str(downloads)
                + "<br> between "
                + first
                + " and "
                + last
                + '<br><a href="https://data.caltech.edu/stats"'
                + ">More info on how stats are collected</a><br>"
            )
            description = record["descriptions"]
            use_exists = False
            for d in description:
                descr_text = d["description"]
                # We always update an existing listing
                if descr_text.startswith("<br>Unique Views:"):
                    d["description"] = u_txt
                    use_exists = True
            # Otherwise we add a new one
            if not use_exists:
                description.append({"descriptionType": "Other", "description": u_txt})
            response = caltechdata_edit(
                token, k, {"descriptions": description}, {}, {}, production
            )
            print(response)
    print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}")
    print(f"Most views {biggest_views} for record {biggest_views_record}")
    print(f"Total downloads {total_downloads}")
    print(f"Total views {total_views}")
Example #9
def fix_multiple_links(input_collection, token):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            new = []
            dupes = []
            replace = False
            record_doi = record["identifier"]["identifier"]
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == record_doi:
                    # Having a related identifier that is the same as the record
                    # doi doesn't make any sense
                    replace = True
                    dupes.append(identifier)
                else:
                    count = idvs.count(identifier)
                    if count > 1:
                        replace = True
                        if identifier not in dupes:
                            # We need to save the first duplicate
                            new.append(idv)
                            # Add to list of those already saved
                            dupes.append(identifier)
                        else:
                            # This will be deleted
                            dupes.append(identifier)
                    else:
                        # Save all unique ids
                        new.append(idv)
            if replace:
                print("Duplicate links found in record ", k)
                print("Will delete these links", dupes)
                response = input("Do you approve this change? Y or N: ")
                new_metadata = {"relatedIdentifiers": new}
                if response == "Y":
                    response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                    print(response)
        if "alternateIdentifiers" in record:
            idtypes = []
            alt_ids = []
            repeat = False
            for idv in record["alternateIdentifiers"]:
                if idv["alternateIdentifierType"] not in idtypes:
                    # If we haven't seen id type before, save it
                    alt_ids.append(idv)
                    idtypes.append(idv["alternateIdentifierType"])
                else:
                    repeat = True
                    print("Will Delete Repeated ID ", idv["alternateIdentifier"])
            if repeat:
                new_metadata = {"alternateIdentifiers": alt_ids}
                response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                print(response)
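
# The duplicate-removal pass above keeps the first occurrence of each
# repeated relatedIdentifier and drops self-references; a condensed
# equivalent (a sketch, not the original code):
def dedup_related(identifiers, record_doi):
    seen, kept = set(), []
    for idv in identifiers:
        rid = idv["relatedIdentifier"]
        if rid == record_doi or rid in seen:
            continue  # drop self-references and repeats
        seen.add(rid)
        kept.append(idv)
    return kept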
Example #10
def match_cd_refs():
    token = os.environ["TINDTOK"]

    matches = []
    collection = "caltechdata.ds"
    keys = dataset.keys(collection)
    if "mediaupdate" in keys:
        keys.remove("mediaupdate")

    # Get event data results
    event_data = "crossref_refs.ds"
    event_keys = dataset.keys(event_data)
    event_keys.remove("captured")
    f_name = "match_cd_refs"
    dot_paths = [".obj_id", ".id", ".subj_id"]
    labels = ["obj_id", "id", "subj_id"]
    print("Getting Event Data Records")
    if dataset.has_frame(event_data, f_name):
        if not dataset.frame_reframe(event_data, f_name, event_keys):
            err = dataset.error_message()
            print(f"Failed to reframe {f_name} in {event_data}, {err}")
            exit()
    elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels):
        err = dataset.error_message()
        print(f"Failed to create frame {f_name} in {event_data}, {err}")
        exit()
    grid = dataset.frame_grid(event_data, f_name)
    df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"])
    grouped = df.groupby(["obj_id"])
    groups = grouped.groups
    # Look at all CaltechDATA records
    for k in keys:
        # Collect matched new links for the record
        record_matches = []
        print(k)
        metadata, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        doi = "https://doi.org/" + metadata["identifier"]["identifier"]
        if doi in groups:
            hits = grouped.get_group(doi)
            for index, h in hits.iterrows():
                # Trigger for whether we already have this link
                new = True
                if "relatedIdentifiers" in metadata:
                    for m in metadata["relatedIdentifiers"]:
                        if m["relatedIdentifier"] in h["subj_id"]:
                            new = False
                if new:
                    match = h["subj_id"]
                    print(match)
                    print(h["obj_id"])
                    inputv = input("Do you approve this link?  Type Y or N: ")
                    if inputv == "Y":
                        record_matches.append(match)
        # If we need to update the record
        if len(record_matches) > 0:
            ids = []
            if "relatedIdentifiers" in metadata:
                for m in metadata["relatedIdentifiers"]:
                    ids.append(m)
            matches.append([k, record_matches])
            # Now collect identifiers for record
            for match in record_matches:
                split = match.split("doi.org/")
                new_id = {
                    "relatedIdentifier": split[1],
                    "relatedIdentifierType": "DOI",
                    "relationType": "IsCitedBy",
                }
                ids.append(new_id)
            newmetadata = {"relatedIdentifiers": ids}
            response = caltechdata_edit(token, k, newmetadata, {}, {}, True)
            print(response)
    return matches
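
# The subj_id parsing above, demonstrated on a hypothetical event-data id:
match = "https://doi.org/10.1234/example.doi"
assert match.split("doi.org/")[1] == "10.1234/example.doi"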