def make_link_history(collection, resolver, url, note):
    """Make an entry in our link history collection"""
    now = datetime.today().isoformat()
    # Run checks on both resolver and final URL
    try:
        target = requests.get(url)
    except requests.exceptions.ConnectionError:
        target = requests.Response()
        target.status_code = 404
        target.url = ''
    if target.status_code != 200:
        print(f"Target URL {url} returns Error status code {target.status_code}")
    if links_differ(target.url, url):
        print(f"Target URL '{url}' redirects to '{target.url}'")
    try:
        get = requests.get(f"https://resolver.library.caltech.edu/{resolver}")
    except requests.exceptions.ConnectionError:
        get = requests.Response()
        get.status_code = 404
        get.url = ''
    if links_differ(get.url, url):
        print(f"Mismatch between expected url '{url}' and actual '{get.url}'")
    if get.status_code != 200:
        print(
            f"Resolver URL ({resolver}) '{get.url}' returns Error status code {get.status_code}"
        )
    entry = {
        "expected-url": url,
        "url": get.url,
        "modified": now,
        "code": get.status_code,
        "note": note,
    }
    # If the record exists, push the previous state into its history
    if dataset.has_key(collection, resolver):
        existing, err = dataset.read(collection, resolver)
        if err != "":
            print(err)
            exit()
        if save_history(existing, url, get):
            past_history = existing.pop("history")
            past_history.append(existing)
            entry["history"] = past_history
            if not dataset.update(collection, resolver, entry):
                print(dataset.error_message())
                exit()
    else:
        entry["history"] = []
        if not dataset.create(collection, resolver, entry):
            print(dataset.error_message())
            exit()
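# Hedged usage sketch (not from the original script): make_link_history() assumes
# `os`, `dataset`, `requests`, and `datetime` are imported and that links_differ()
# and save_history() are defined elsewhere. The collection name, resolver id, URL,
# and note below are illustrative values only.
link_collection = "resolver_links.ds"  # hypothetical collection name
if not os.path.isdir(link_collection):
    dataset.init(link_collection)
make_link_history(
    link_collection,
    "CaltechAUTHORS:20200101-123456789",           # hypothetical resolver key
    "https://authors.library.caltech.edu/12345/",  # hypothetical expected URL
    "nightly link check",
)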
def test_join(t, collection_name):
    key = "test_join1"
    obj1 = {"one": 1}
    obj2 = {"two": 2}
    if dataset.status(collection_name) == False:
        t.error("Failed, collection status is False,", collection_name)
        return
    ok = dataset.has_key(collection_name, key)
    err = ''
    if ok == True:
        ok = dataset.update(collection_name, key, obj1)
    else:
        ok = dataset.create(collection_name, key, obj1)
    if ok == False:
        err = dataset.error_message()
        t.error(
            f'Failed, could not add record for test ({collection_name}, {key}, {obj1}), {err}'
        )
        return
    if dataset.join(collection_name, key, obj2, overwrite=False) == False:
        err = dataset.error_message()
        t.error(
            f'Failed, join for {collection_name}, {key}, {obj2}, overwrite = False -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    if obj_result.get('one') != 1:
        t.error(f'Failed to join append key {key}, {obj_result}')
    if obj_result.get("two") != 2:
        t.error(f'Failed to join append key {key}, {obj_result}')
    obj2['one'] = 3
    obj2['two'] = 3
    obj2['three'] = 3
    if dataset.join(collection_name, key, obj2, overwrite=True) == False:
        err = dataset.error_message()
        t.error(
            f'Failed to join overwrite {collection_name}, {key}, {obj2}, overwrite = True -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    for k in obj_result:
        if k != '_Key' and obj_result[k] != 3:
            t.error('Failed to update value in join overwrite', k, obj_result)
def get_history(collection, caltechdata_collection, caltechdata_keys):
    """Harvest the history of records from CaltechDATA."""
    keys_to_update = []
    if os.path.exists("historyupdate"):
        with open("historyupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(caltechdata_keys, redirect_stdout=True):
        existing, err = dataset.read(caltechdata_collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        record_update = datetime.fromisoformat(existing["updated"]).date()
        if record_update > update:
            keys_to_update.append(k)
    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    base_url = "https://data.caltech.edu/records/"
    for k in progressbar(keys_to_update):
        url = base_url + str(k) + "/revisions"
        response = requests.get(url)
        revisions = response.json()
        for num, metadata in enumerate(revisions):
            key = f"{k}-{num}"
            if dataset.has_key(collection, key) == False:
                dataset.create(collection, key, metadata)
    # Save date in file
    today = date.today().isoformat()
    with open("historyupdate", "w") as outfile:
        outfile.write(today)
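# Hedged usage sketch (illustrative, not part of the original script): the key list
# for get_history() is assumed to come from an existing CaltechDATA dataset
# collection; "caltechdata.ds" and "caltechdata_history.ds" are hypothetical names.
caltechdata_collection = "caltechdata.ds"
caltechdata_keys = dataset.keys(caltechdata_collection)
get_history("caltechdata_history.ds", caltechdata_collection, caltechdata_keys)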
def get_crossref_refs(prefix, done=False, new=True):
    # New=True will download everything from scratch and delete any existing records
    collection = "crossref_refs.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix)
    collected = dataset.has_key(collection, "captured")
    cursor = ""
    count = 0
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettiness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)
        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same cursor back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None
    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]
        # Check Deleted
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on delete: {err}")
            cursor = records["message"]["next-cursor"]
        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]
    if done:
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
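# Hedged usage sketch (illustrative): harvest Crossref Event Data for a DOI prefix
# and, because done=True, stamp the collection with today's capture date at the end.
# The prefix below is an example value only; substitute the registrant prefix you
# actually want to track.
get_crossref_refs("10.22002", done=True, new=False)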
if os.path.exists("boutique.ds") == False: dataset.init("boutique.ds") # crawl docs_dir and ingest files into data collection. for path, folders, files in os.walk(docs_dir): #log.print(f"Processing {path}") for filename in files: if filename.endswith(".md"): f_name = os.path.join(path, filename) log.print(f"Ingesting {f_name}") metadata = frontmatter(f_name) with open(f_name) as f: src = f.read() if "id" in metadata: key = str(metadata["id"]) if dataset.has_key(c_name, key): err = dataset.update(c_name, key, { "metadata": metadata, "content": f_name, "src": src }) else: err = dataset.create(c_name, key, { "metadata": metadata, "content": f_name, "src": src }) if err != "": log.fatal(err) else: log.print(f"Warning, no front matter for {f_name}")
def get_wos_refs(new=True):
    # New=True will download everything from scratch and delete any existing records
    collection = "all_wos.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()
    # Get access token from WOS, set as environment variable with source token.bash
    token = os.environ["WOSTOK"]
    headers = {"X-ApiKey": token, "Content-type": "application/json"}
    # Run query to get scope of records
    base_url = "https://api.clarivate.com/api/wos/?databaseId=WOK"
    collected = dataset.has_key(collection, "captured")
    if collected == True:
        date = dataset.read(collection, "captured")
        date = date[0]["captured"]
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = current - date
        base_url = base_url + "&loadTimeSpan=" + str(diff.days) + "D"
    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        err = dataset.update(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on create: {err}")
    query = "OG=(California Institute of Technology)"
    query = urllib.parse.quote_plus(query)
    url = base_url + "&usrQuery=" + query + "&count=100&firstRecord=1"
    response = requests.get(url, headers=headers)
    response = response.json()
    record_count = response["QueryResult"]["RecordsFound"]
    print(record_count, " Records from WOS")
    query_id = response["QueryResult"]["QueryID"]
    try:
        records = response["Data"]["Records"]["records"]["REC"]
    except:
        print(response)
    write_records(records, collection)
    # We have saved the first 100 records
    record_start = 101
    record_count = record_count - 100
    query_url = "https://api.clarivate.com/api/wos/query/"
    while record_count > 0:
        print(record_count)
        print(len(records), "records")
        if record_count > 100:
            url = (
                query_url + str(query_id) + "?count=100&firstRecord=" + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            try:
                records = response["Records"]["records"]["REC"]
            except:
                print(response)
            write_records(records, collection)
            record_start = record_start + 100
            record_count = record_count - 100
        else:
            url = (
                query_url
                + str(query_id)
                + "?count="
                + str(record_count)
                + "&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            records = response["Records"]["records"]["REC"]
            write_records(records, collection)
            record_count = 0
    print("Downloaded all records ")
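# Hedged usage sketch (illustrative): get_wos_refs() expects the WOSTOK environment
# variable to hold a Web of Science API key (e.g. exported via `source token.bash`)
# before it is called.
if "WOSTOK" not in os.environ:
    raise SystemExit("Set WOSTOK before harvesting Web of Science records")
get_wos_refs(new=True)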
def test_sync_csv(t, c_name):
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return
    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    csv_name = c_name.strip(".ds") + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)
    # Import CSV into collection
    if dataset.import_csv(c_name, csv_name, True) == False:
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if dataset.has_key(c_name, key) == False:
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five") == True:
        t.error("expected has_key('five') == False, got True")
    if dataset.create(c_name, "five", {"value": 5}) == False:
        err = dataset.error_message()
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return
    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if dataset.frame_create(c_name, frame_name, keys, ["._Key", ".value"],
                            ["_Key", "value"]) == False:
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return
    # NOTE: Tests for sync_send_csv and sync_receive_csv
    if dataset.sync_send_csv(c_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
    if 'five' not in src:
        t.error(f"expected 'five' in src, got {src}")
    # Now remove "five" from collection
    if dataset.delete(c_name, "five") == False:
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five") == True:
        t.error("expected has_key(five) == False, got True")
        return
    if dataset.sync_recieve_csv(c_name, frame_name, csv_name, False) == False:
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if dataset.has_key(c_name, "five") == False:
        t.error("expected has_key(five) == True, got False")
        return
def get_cd_github(new=True):
    collection = "github_records.ds"
    if new == True:
        os.system("rm -rf " + collection)
    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    url = "https://data.caltech.edu/api/records"
    response = requests.get(url + "/?size=1000&q=subjects:GitHub")
    hits = response.json()
    for h in hits["hits"]["hits"]:
        rid = str(h["id"])
        record = h["metadata"]
        result = dataset.has_key(collection, rid)
        if result == False:
            dataset.create(collection, rid, record)
            print("Downloading files for ", rid)
            codemeta = False
            for erecord in record["electronic_location_and_access"]:
                f = download_file(erecord, rid)
                # We're just looking for the zip file
                if f.split(".")[-1] == "zip":
                    zip_files = subprocess.check_output(
                        ["unzip", "-l", f.rstrip()],
                        universal_newlines=True).splitlines()
                    i = 4  # Ignore header
                    line = zip_files[i]
                    while line[0] != "-":
                        split = line.split("/")
                        fname = split[1]
                        if fname == "codemeta.json":
                            sp = line.split(" ")[-1]
                            os.system("unzip -j " + f.rstrip() + " " + sp + " -d .")
                            codemeta = True
                        i = i + 1
                        line = zip_files[i]
                    # Will only identify codemeta files in root of repo
                # Trash downloaded files - extracted codemeta.json not impacted
                print("Trash " + f)
                os.system("rm " + f)
            if codemeta == True:
                print(collection, rid)
                response = dataset.attach(collection, rid, ["codemeta.json"])
                print("Attachment ", response)
                os.system("rm codemeta.json")
                print("Trash codemeta.json")