def edit_ctd(idv, metadata=None, filenames=None, production=False):
    """Edit an entry at CaltechDATA.

    metadata can be an empty dict ({}); filenames is a list of strings
    with the full path to each file to upload for the existing idv.
    """
    # Avoid mutable default arguments
    metadata = metadata or {}
    filenames = filenames or []
    # Upload supporting data; `token` is expected to be defined at module scope
    caltechdata_edit(
        ids=idv,
        token=token,
        metadata=metadata,
        files=filenames,
        production=production,
    )
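# Usage sketch (hypothetical id and file path; assumes `token` is defined at
# module scope, e.g. token = os.environ["TINDTOK"]):
#
#     edit_ctd("12345", metadata={"version": "1.1"},
#              filenames=["/tmp/updated_data.csv"], production=False)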
def add_citation(collection, token, production=True):
    """Add example citation text to the description field"""
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        description = record["descriptions"]
        cite_exists = False
        for d in description:
            descr_text = d["description"]
            if descr_text.startswith("<br>Cite this record as:"):
                cite_exists = True
        if not cite_exists:
            record_doi = record["identifier"]["identifier"]
            headers = {"Accept": "text/x-bibliography; style=apa"}
            citation_link = "https://doi.org/"
            citation = requests.get(citation_link + record_doi, headers=headers).text
            doi_url = "https://doi.org/" + record_doi
            if doi_url in citation:
                # Check that we have a citation and not a server error;
                # otherwise wait until next time
                n_txt = citation_text(citation, doi_url, record_doi)
                description.append({"descriptionType": "Other", "description": n_txt})
                response = caltechdata_edit(
                    token, k, {"descriptions": description}, {}, {}, production
                )
                print(response)
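# Usage sketch (the collection name is illustrative; assumes a dataset
# collection on disk and a TIND token in the environment):
#
#     token = os.environ["TINDTOK"]
#     add_citation("caltechdata.ds", token, production=True)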
def update_citation(record, rid, token, production=True):
    """Update example citation text in the description field"""
    description = record["descriptions"]
    for d in description:
        descr_text = d["description"]
        if descr_text.startswith("<br>Cite this record as:"):
            record_doi = record["identifier"]["identifier"]
            headers = {"Accept": "text/x-bibliography; style=apa"}
            citation_link = "https://doi.org/"
            citation = requests.get(citation_link + record_doi, headers=headers).text
            doi_url = "https://doi.org/" + record_doi.lower()
            if doi_url in citation.lower():
                # Check that we have a citation and not a server error;
                # otherwise wait until next time
                d["description"] = citation_text(citation, doi_url, record_doi)
                response = caltechdata_edit(
                    token, rid, {"descriptions": description}, {}, {}, production
                )
                print(response)
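# Usage sketch (record is a metadata dict already read from a dataset
# collection and rid is its record id; names are illustrative):
#
#     record, err = dataset.read("caltechdata.ds", rid)
#     update_citation(record, rid, token, production=True)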
def match_codemeta():
    collection = "github_records.ds"
    keys = dataset.keys(collection)
    for k in keys:
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        if "completed" not in existing:
            print("Processing new record ", k)
            if dataset.attachments(collection, k) != "":
                dataset.detach(collection, k)
                # Update CaltechDATA
                token = os.environ["TINDTOK"]
                with open("codemeta.json", "r") as infile:
                    try:
                        meta = json.load(infile)
                    except json.JSONDecodeError:
                        print("Invalid json file - Skipping forever ", k)
                    else:
                        standardized = codemeta_to_datacite(meta)
                        # Check that all records have a GitHub subject tag
                        add = True
                        for s in standardized["subjects"]:
                            if s["subject"] in ("Github", "GitHub"):
                                add = False
                        if add:
                            standardized["subjects"].append({"subject": "GitHub"})
                        response = caltechdata_edit(token, k, standardized, {}, {}, True)
                        print(response)
                os.remove("codemeta.json")
            existing["completed"] = "True"
            if not dataset.update(collection, k, existing):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
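# Usage sketch (expects a github_records.ds collection with codemeta.json
# attachments and TINDTOK in the environment):
#
#     match_codemeta()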
    prefix=prefix,
    # test_mode=True
)
doi_end = subprocess.check_output(['./gen-cool-doi'], universal_newlines=True)
identifier = str(prefix) + '/' + str(doi_end)
metadata['identifier'] = {'identifier': identifier, 'identifierType': 'DOI'}
assert schema40.validate(metadata)
# Debugging if this fails
# v = schema40.validator.validate(metadata)
# errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
# for error in errors:
#     print(error.message)
xml = schema40.tostring(metadata)
d.metadata_post(xml)
d.doi_post(identifier, url)
print('DOI minted: ' + identifier)
metadata = {}
metadata['relatedIdentifiers'] = [
    {
        'relatedIdentifier': url,
        'relatedIdentifierType': 'URL',
        'relationType': 'IsSourceOf',
    }
]
caltechdata_edit(token, idv, metadata)
        build_relation(client, version)
    ]
    files = [outfile]
    response = caltechdata_write(metadata, token, files, True)
    print(response)
    new_id = response.split('/')[4].split('.')[0]
    outf = open(history_file, 'a')
    outf.write(version + ',' + new_id)
    outf.close()
    # Cleanup
    os.remove(outfile)
else:
    # Minor release - just edit the same CaltechDATA record
    metaf = open(args.json_file[0], 'r')
    metadata = json.load(metaf)
    for d in metadata['dates']:
        if d['dateType'] == 'Updated':
            d['date'] = datetime.date.today().isoformat()
    metadata['version'] = version
    metadata['relatedIdentifiers'].append(build_relation(client, version))
    files = [outfile]
    response = caltechdata_edit(token, archived_ids[mj], metadata, files,
                                ['tgz'], True)
    print(response)
    # Not strictly necessary, but will prevent multiple edits
    new_id = response.split('/')[4].split('.')[0]
    outf = open(history_file, 'a')
    outf.write(version + ',' + new_id)
    outf.close()
    # Cleanup
    os.remove(outfile)
def add_thesis_doi(data_collection, thesis_collection, token, production=True):
    """Add thesis DOIs to CaltechDATA records"""
    # Search across CaltechTHESIS DOIs
    dot_paths = ["._Key", ".doi", ".official_url", ".related_url"]
    labels = ["eprint_id", "doi", "official_url", "related_url"]
    keys = dataset.keys(thesis_collection)
    all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels)
    dois = []
    for metadata in progressbar(all_metadata, redirect_stdout=True):
        if "doi" in metadata:
            record_doi = metadata["doi"].strip()
            if "related_url" in metadata and "items" in metadata["related_url"]:
                items = metadata["related_url"]["items"]
                for item in items:
                    if "url" in item:
                        url = item["url"].strip()
                        if "type" in item:
                            itype = item["type"].strip().lower()
                            if itype == "doi":
                                if idutils.is_doi(url):
                                    doi = "10." + url.split("10.")[1]
                                    prefix = doi.split("/")[0]
                                    if prefix == "10.22002":
                                        dois.append([doi, record_doi])
                                else:
                                    print("Ignoring non-DOI")
                                    print(metadata["eprint_id"])
                                    print(url.split("10."))
    for doi_link in dois:
        cd_doi = doi_link[0]
        thesis_doi = doi_link[1]
        print("Checking " + cd_doi)
        if "D1" in cd_doi:
            record_number = cd_doi.split("D1.")[1]
        if "d1" in cd_doi:
            record_number = cd_doi.split("d1.")[1]
        record, err = dataset.read(data_collection, record_number)
        if err != "":
            print(err)
            exit()
        done = False
        if "relatedIdentifiers" in record:
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == thesis_doi:
                    done = True
            if not done:
                identifiers = record["relatedIdentifiers"]
                identifiers.append(
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                )
                new_metadata = {"relatedIdentifiers": identifiers}
        else:
            new_metadata = {
                "relatedIdentifiers": [
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                ]
            }
        if not done:
            print("Adding " + thesis_doi + " to " + cd_doi)
            response = caltechdata_edit(
                token, record_number, new_metadata, {}, {}, True
            )
            print(response)
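# Usage sketch (collection names are illustrative; both dataset collections
# must exist on disk):
#
#     add_thesis_doi("caltechdata.ds", "caltechthesis.ds", token, production=True)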
def add_usage(collection, token, usage_collection, production=True):
    """Add usage text to the description field"""
    keys = dataset.keys(collection)
    biggest_views = 0
    biggest_views_record = ""
    biggest_downloads = 0
    biggest_downloads_record = ""
    total_views = 0
    total_downloads = 0
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        usage, err = dataset.read(usage_collection, k)
        views = usage["grand-total-unique-investigations"]
        downloads = usage["grand-total-unique-requests"]
        if views > biggest_views:
            biggest_views = views
            biggest_views_record = k
        if downloads > biggest_downloads:
            biggest_downloads = downloads
            biggest_downloads_record = k
        total_views += views
        total_downloads += downloads
        date = datetime.fromisoformat(usage["dataset-dates"][0]["value"])
        now = datetime.today()
        first = date.strftime("%B %d, %Y")
        last = now.strftime("%B %d, %Y")
        if views > 1:
            u_txt = (
                "<br>Unique Views: "
                + str(views)
                + "<br>Unique Downloads: "
                + str(downloads)
                + "<br> between "
                + first
                + " and "
                + last
                + '<br><a href="https://data.caltech.edu/stats"'
                + ">More info on how stats are collected</a><br>"
            )
            description = record["descriptions"]
            use_exists = False
            for d in description:
                descr_text = d["description"]
                # We always update an existing listing
                if descr_text.startswith("<br>Unique Views:"):
                    d["description"] = u_txt
                    use_exists = True
            # Otherwise we add a new one
            if not use_exists:
                description.append({"descriptionType": "Other", "description": u_txt})
            response = caltechdata_edit(
                token, k, {"descriptions": description}, {}, {}, production
            )
            print(response)
    print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}")
    print(f"Most views {biggest_views} for record {biggest_views_record}")
    print(f"Total downloads {total_downloads}")
    print(f"Total views {total_views}")
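# Usage sketch (assumes the usage collection is keyed by the same record ids
# as the main collection; names are illustrative):
#
#     add_usage("caltechdata.ds", token, "usage.ds", production=True)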
def fix_multiple_links(input_collection, token):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            new = []
            dupes = []
            replace = False
            record_doi = record["identifier"]["identifier"]
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == record_doi:
                    # Having a related identifier that is the same as the
                    # record doi doesn't make any sense
                    replace = True
                    dupes.append(identifier)
                else:
                    count = idvs.count(identifier)
                    if count > 1:
                        replace = True
                        if identifier not in dupes:
                            # We need to save the first duplicate
                            new.append(idv)
                            # Add to list of those already saved
                            dupes.append(identifier)
                        else:
                            # This will be deleted
                            dupes.append(identifier)
                    else:
                        # Save all unique ids
                        new.append(idv)
            if replace:
                print("Duplicate links found in record ", k)
                print("Will delete these links", dupes)
                response = input("Do you approve this change? Y or N")
                new_metadata = {"relatedIdentifiers": new}
                if response == "Y":
                    response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                    print(response)
        if "alternateIdentifiers" in record:
            idtypes = []
            alt_ids = []
            repeat = False
            for idv in record["alternateIdentifiers"]:
                if idv["alternateIdentifierType"] not in idtypes:
                    # If we haven't seen this id type before, save it
                    alt_ids.append(idv)
                    idtypes.append(idv["alternateIdentifierType"])
                else:
                    repeat = True
                    print("Will Delete Repeated ID ", idv["alternateIdentifier"])
            if repeat:
                new_metadata = {"alternateIdentifiers": alt_ids}
                response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                print(response)
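# Usage sketch (interactive; prompts before deleting duplicate links):
#
#     fix_multiple_links("caltechdata.ds", token)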
def match_cd_refs():
    token = os.environ["TINDTOK"]
    matches = []
    collection = "caltechdata.ds"
    keys = dataset.keys(collection)
    if "mediaupdate" in keys:
        keys.remove("mediaupdate")
    # Get event data results
    event_data = "crossref_refs.ds"
    event_keys = dataset.keys(event_data)
    event_keys.remove("captured")
    f_name = "match_cd_refs"
    dot_paths = [".obj_id", ".id", ".subj_id"]
    labels = ["obj_id", "id", "subj_id"]
    print("Getting Event Data Records")
    if dataset.has_frame(event_data, f_name):
        if not dataset.frame_reframe(event_data, f_name, event_keys):
            err = dataset.error_message()
            print(f"Failed to reframe {f_name} in {event_data}, {err}")
            exit()
    elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels):
        err = dataset.error_message()
        print(f"Failed to create frame {f_name} in {event_data}, {err}")
        exit()
    grid = dataset.frame_grid(event_data, f_name)
    df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"])
    grouped = df.groupby(["obj_id"])
    groups = grouped.groups
    # Look at all CaltechDATA records
    for k in keys:
        # Collect matched new links for the record
        record_matches = []
        print(k)
        metadata, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        doi = "https://doi.org/" + metadata["identifier"]["identifier"]
        if doi in groups:
            hits = grouped.get_group(doi)
            for index, h in hits.iterrows():
                # Trigger for whether we already have this link
                new = True
                if "relatedIdentifiers" in metadata:
                    for m in metadata["relatedIdentifiers"]:
                        if m["relatedIdentifier"] in h["subj_id"]:
                            new = False
                if new:
                    match = h["subj_id"]
                    print(match)
                    print(h["obj_id"])
                    inputv = input("Do you approve this link? Type Y or N: ")
                    if inputv == "Y":
                        record_matches.append(match)
        # If we have to update the record
        if len(record_matches) > 0:
            ids = []
            if "relatedIdentifiers" in metadata:
                for m in metadata["relatedIdentifiers"]:
                    ids.append(m)
            matches.append([k, record_matches])
            # Now collect identifiers for the record
            for match in record_matches:
                split = match.split("doi.org/")
                new_id = {
                    "relatedIdentifier": split[1],
                    "relatedIdentifierType": "DOI",
                    "relationType": "IsCitedBy",
                }
                ids.append(new_id)
            newmetadata = {"relatedIdentifiers": ids}
            response = caltechdata_edit(token, k, newmetadata, {}, {}, True)
            print(response)
    return matches
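# Usage sketch (expects caltechdata.ds and crossref_refs.ds collections in
# the working directory and TINDTOK in the environment; prompts interactively
# before adding each citation link):
#
#     matches = match_cd_refs()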