def test_setup(t, collection_name):
    """Create a fresh dataset collection for a test run.

    Removes any leftover collection from a prior run, then initializes a new
    one, reporting failures through the test harness.

    Args:
        t: test harness object providing .error() for reporting failures.
        collection_name: path of the dataset collection to (re)create.
    """
    # Start from a clean slate: remove any collection left by a prior run.
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if not dataset.init(collection_name):
        err = dataset.error_message()
        # BUG FIX: this message lacked the f-prefix, so {collection_name}
        # and {err} were printed literally instead of interpolated.
        t.error(f"init({collection_name}) failed, {err}")
        return
def get_caltechdata(collection, production=True, datacite=False):
    """Harvest all records from CaltechDATA. Always creates collection from scratch.

    Args:
        collection: path of the dataset collection to (re)build.
        production: when True harvest data.caltech.edu; otherwise the
            TIND sandbox instance.
        datacite: when True store the plain DataCite metadata instead of the
            enriched (decustomized) CaltechDATA metadata.
    """
    # Delete existing collection so we always rebuild from scratch.
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()
    # Idiom fix: test truthiness directly rather than `== True`.
    if production:
        url = "https://data.caltech.edu/api/records"
    else:
        url = "https://cd-sandbox.tind.io/api/records"
    # size=9000 is assumed to exceed the total record count — TODO confirm.
    response = requests.get(url + "/?size=9000")
    hits = response.json()
    print(hits)
    for h in progressbar(hits["hits"]["hits"]):
        rid = str(h["id"])
        if not datacite:
            # Get enriched metadata records (including files).
            metadata = decustomize_schema(h["metadata"], True, True, True)
            metadata["updated"] = h["updated"]
        else:
            # Get just DataCite metadata.
            metadata = decustomize_schema(h["metadata"])
        if not dataset.create(collection, rid, metadata):
            err = dataset.error_message()
            print(err)
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage information"""
    # First run only: create the usage collection and seed the stats window.
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        # (1485907200 is a Unix timestamp; presumably 2017-02-01 UTC — confirm).
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        # Only add skeleton records for keys not already in the usage collection.
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                # NOTE(review): the "Updated" date is stored in a variable
                # named `submitted` and used as the fallback before "Issued" —
                # looks intentional but the naming is misleading; confirm.
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
                # NOTE(review): if none of Submitted/Updated/Issued is present,
                # rdate stays None and rdate.split() below raises — assumed
                # upstream data always carries one of these; TODO confirm.
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage
            # All dates are the date added to CaltechDATA, which is
            # the apropriate 'publication' date even if content was available
            # earlier
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
def test_issue12(t, c_name):
    """Regression test for issue 12: frame lifecycle on a small collection.

    Loads five records, clears any pre-existing frames, then creates a frame
    from dot paths .c1 / c3, checks its keys and objects, and deletes it.
    """
    src = '''[
{"id": "1", "c1": 1, "c2": 2, "c3": 3 },
{"id": "2", "c1": 2, "c2": 2, "c3": 3 },
{"id": "3", "c1": 3, "c2": 3, "c3": 3 },
{"id": "4", "c1": 1, "c2": 1, "c3": 1 },
{"id": "5", "c1": 6, "c2": 6, "c3": 6 }
]'''
    #dataset.verbose_on() # DEBUG
    #dataset.use_strict_dotpath(True) # DEBUG
    if dataset.status(c_name) == False:
        if not dataset.init(c_name):
            err = dataset.error_message()
            # BUG FIX: err was fetched but never reported; include it.
            t.error(f'failed to create {c_name}, {err}')
            return
    objects = json.loads(src)
    # Upsert each record keyed by its "id".
    for obj in objects:
        key = obj['id']
        if dataset.has_key(c_name, key):
            dataset.update(c_name, key, obj)
        else:
            dataset.create(c_name, key, obj)
    # Remove any frames left over from earlier runs.
    f_names = dataset.frames(c_name)
    for f_name in f_names:
        ok = dataset.delete_frame(c_name, f_name)
        if ok == False:
            err = dataset.error_message()
            t.error(f'Failed to delete {f_name} from {c_name} -> "{err}"')
            return
        if dataset.has_frame(c_name, f_name) == True:
            t.error(
                f'Failed to delete frame {c_name} from {c_name}, frame still exists'
            )
            return
    f_name = 'issue12'
    # NOTE: "c3" (no leading dot) is deliberate — the issue under test.
    dot_paths = [".c1", "c3"]
    labels = [".col1", ".col3"]
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(f'failed to create {f_name} from {c_name}, {err}')
        # BUG FIX: without this return the has_frame check below reports a
        # second, misleading failure for the same root cause.
        return
    if not dataset.has_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected frame {f_name} to exists, {err}')
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) == 0:
        err = dataset.error_message()
        t.error(f'expected keys in {f_name}, got zero, {err}')
        return
    f_objects = dataset.frame_objects(c_name, f_name)
    if len(f_objects) == 0:
        err = dataset.error_message()
        t.error(f'expected objects in {f_name}, got zero, {err}')
        return
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected to delete {f_name} in {c_name}, {err}')
def test_check_repair(t, collection_name):
    """Verify dataset.check() detects a broken collection and repair() fixes it.

    Creates a collection, breaks it by removing collection.json, confirms
    check() fails, then confirms repair() recreates collection.json.
    """
    t.print("Testing status on", collection_name)
    # Make sure we have a left over collection to check and repair
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if dataset.status(collection_name) == True:
        dataset.close(collection_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'init({collection_name}) failed, {err}')
        return
    if dataset.status(collection_name) == False:
        t.error(
            f"Failed, expected dataset.status() == True, got False for {collection_name}"
        )
        return
    if dataset.has_key(collection_name, 'one') == False:
        if dataset.create(collection_name, 'one', {"one": 1}) == False:
            err = dataset.error_message()
            # BUG FIX: {"one": 1} inside the f-string was parsed as a format
            # expression with spec " 1" and raised ValueError at runtime;
            # braces must be doubled to appear literally.
            t.error(
                f'create({collection_name}, "one", {{"one": 1}}) failed, {err}')
    t.print(f"Testing check on {collection_name}")
    # Check our collection
    if not (dataset.check(collection_name) == True):
        err = dataset.error_message()
        # BUG FIX: this message was missing its f-prefix, so the
        # placeholders were printed literally.
        t.error(
            f"Failed, (before break) expected check True, got False for {collection_name} (err: {err})"
        )
        return
    # Break and recheck our collection
    print(f"Removing {collection_name}/collection.json to cause a fail")
    if os.path.exists(collection_name + "/collection.json"):
        os.remove(collection_name + "/collection.json")
    print(f"Testing check on (broken) {collection_name}")
    if not (dataset.check(collection_name) == False):
        err = dataset.error_message()
        t.error(
            f"Failed, (after break) expected check False got True for {collection_name} (err: {err})"
        )
    else:
        t.print(f"Should have see error output for broken {collection_name}")
    # Repair our collection
    t.print("Testing repair on", collection_name)
    if dataset.repair(collection_name) == False:
        err = dataset.error_message()
        t.error("Failed, expected repair to return True, got, ", err)
    if os.path.exists(os.path.join(collection_name,
                                   "collection.json")) == False:
        t.error(
            f"Failed, expected recreated {collection_name}/collection.json")
def test_frame(t, c_name):
    """Smoke-test frame_create / frame_reframe / frames / delete_frame.

    Builds a four-record collection with mixed/missing fields, creates one
    frame over all records, reframes it, lists frames, and deletes it.
    """
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    # Idiom fix throughout: test truthiness directly instead of `== False`.
    if not dataset.init(c_name):
        err = dataset.error_message()
        t.error(err)
        return
    data = [{
        "id": "A",
        "one": "one",
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "one": "ONE",
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = ["._Key", ".one", ".two", ".three", ".four"]
    labels = ["_Key", "one", "two", "three", "four"]
    for row in data:
        key = row['id']
        keys.append(key)
        dataset.create(c_name, key, row)
    f_name = 'f1'
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(err)
    if not dataset.frame_reframe(c_name, f_name):
        err = dataset.error_message()
        t.error(err)
    # Renamed ambiguous `l` to a descriptive name.
    frame_names = dataset.frames(c_name)
    if len(frame_names) != 1 or frame_names[0] != 'f1':
        t.error(f"expected one frame name, f1, got {frame_names}")
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
def build_aggregate(collection):
    """Build a collection for usage by month. Always creates collection from scratch"""
    # Wipe any previous copy so the collection is rebuilt from nothing.
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()
    # Enumerate month-start periods from January 2017 through today,
    # formatted as "YYYY-MM", and seed one empty usage record per month.
    first_month = datetime.fromisoformat("2017-01-01")
    last_day = datetime.today().date().isoformat()
    months = pd.date_range(first_month, last_day,
                           freq="MS").strftime("%Y-%m").to_list()
    for period in months:
        created = dataset.create(collection, period, {"report-datasets": []})
        if not created:
            print(dataset.error_message())
def get_history(collection, caltechdata_collection, caltechdata_keys):
    """Harvest the history of records from CaltechDATA."""
    # Collect only keys whose record changed since the last harvest, using
    # the date stored in the local "historyupdate" file as the watermark.
    keys_to_update = []
    if os.path.exists("historyupdate"):
        with open("historyupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(caltechdata_keys, redirect_stdout=True):
        existing, err = dataset.read(caltechdata_collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        record_update = datetime.fromisoformat(existing["updated"]).date()
        if record_update > update:
            keys_to_update.append(k)
    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    # Fetch every revision of each changed record; each revision is stored
    # under the key "<record id>-<revision index>".
    base_url = "https://data.caltech.edu/records/"
    for k in progressbar(keys_to_update):
        url = base_url + str(k) + "/revisions"
        response = requests.get(url)
        revisions = response.json()
        for num, metadata in enumerate(revisions):
            key = f"{k}-{num}"
            # Revisions are immutable, so existing keys are skipped.
            if dataset.has_key(collection, key) == False:
                dataset.create(collection, key, metadata)
    # Save date in file so the next run only harvests newer changes.
    today = date.today().isoformat()
    with open("historyupdate", "w") as outfile:
        outfile.write(today)
def build_collection(collection):
    """Create a dataset collection from scratch and load it from a Google Sheet.

    Args:
        collection: path of the dataset collection to (re)build.
    """
    # We start from scratch with a new dataset collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    # Idiom fix: init() returns a boolean; test truthiness directly
    # instead of `ok == False`.
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()
    # Sync metadata from google sheet
    gsheet_id = '1er6yYk-7jcySyX7bqADIC_TrfSDTxwAwDh_hXSxSdoU'
    gsheet_name = 'data4tom'
    # The column for key values, starting at 1, in this case SRR
    id_col = 14
    # Range of cells to import. This is basically all, can modify to exclude
    # portions of gsheet
    cell_range = "A1:ZZ"
    err = dataset.import_gsheet(collection, gsheet_id, gsheet_name, id_col,
                                cell_range)
    if err != '':
        print(err)
        exit()
# Command-line interface: source collection, input/output Google Sheets.
parser.add_argument('data_collection', nargs=1, help=\
    'file name for the dataset collection with harvested data')
parser.add_argument('input_sheet', nargs=1, help=\
    'Input Google Sheet ID with author citations')
parser.add_argument('output_sheet', nargs=1, help='Output Google Sheet ID')
# NOTE(review): -limited is parsed here but not used in this visible span;
# presumably consumed further down — confirm.
parser.add_argument('-limited', action='store_true', help=\
    'Save only the first three authors')
args = parser.parse_args()
name = args.data_collection[0]
sheet = args.input_sheet[0]
output_sheet = args.output_sheet[0]
# Scratch collection for the imported sheet; rebuilt on every run.
import_coll = "imported.ds"
os.system("rm -rf imported.ds")
dataset.init(import_coll)
os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"
err = dataset.import_gsheet(import_coll, sheet, 'Sheet1', 1, 'A:CZ')
if err != '':
    print(err)
keys = dataset.keys(import_coll)
coauthors = []
count = 0
# Walk the imported keys, reading the matching record from the harvested
# data collection (loop body continues beyond this span).
for key in progressbar(keys, redirect_stdout=True):
    record, err = dataset.read(name, key)
    if err != "":
        print(err)
def test_sync_csv(t, c_name):
    """Test CSV import and the sync_send_csv / sync_recieve_csv round trip.

    Imports a three-row CSV, adds a record, pushes it out to the CSV via a
    frame, deletes it locally, then pulls it back from the CSV.
    """
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return
    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    # NOTE(review): strip(".ds") removes any of the characters '.', 'd', 's'
    # from both ends, not the suffix — works for these test names but is
    # fragile; left unchanged to preserve filenames.
    csv_name = c_name.strip(".ds") + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)
    # Import CSV into collection
    if dataset.import_csv(c_name, csv_name, True) == False:
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if dataset.has_key(c_name, key) == False:
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key('five') == False, got True")
    if dataset.create(c_name, "five", {"value": 5}) == False:
        err = dataset.error_message()
        # BUG FIX: {"value": 5} inside the f-string was parsed as the format
        # expression '"value"' with spec '5', mangling the message; braces
        # must be doubled to show the dict literally.
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return
    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if dataset.frame_create(c_name, frame_name, keys, ["._Key", ".value"],
                            ["_Key", "value"]) == False:
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return
    # NOTE: Tests for sync_send_csv and sync_receive_csv
    if dataset.sync_send_csv(c_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
    if 'five' not in src:
        t.error(f"expected 'five' in src, got {src}")
    # Now remove "five" from collection
    if dataset.delete(c_name, "five") == False:
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key(five) == False, got True")
        return
    # NOTE(review): sync_recieve_csv (sic) is the spelling of the py_dataset
    # API function — do not "fix" the call.
    if dataset.sync_recieve_csv(c_name, frame_name, csv_name, False) == False:
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if dataset.has_key(c_name, "five") == False:
        t.error(f"expected has_key(five) == True, got False")
        return
action="store_true",
    help="Get resolver links from DataCite")
args = parser.parse_args()
# S3 Setup
session = boto3.Session(profile_name="resolver")
current_region = session.region_name
bucket = "resolver.library.caltech.edu"
s3 = session.resource("s3")
collection = "link_history.ds"
# First run: seed the S3 index record and create the local collection.
if os.path.isdir(collection) == False:
    make_s3_record(s3, bucket, "index.html",
                   "https://libguides.caltech.edu/CODA")
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()
# Get the links that already exist
links = dataset.keys(collection)
if args.update:
    # Everything will get updated
    links = []
# Get DOI links (client id list continues beyond this span)
if args.dois:
    client_ids = [
        "tind.caltech",
        "caltech.library",
        "caltech.ipacdoi",
# Continue building the eprint export command line, then run it.
cmd.append("-password")
cmd.append(eprint_password)
cmd.append("-export")
cmd.append("all")
p = run(cmd)
exit_code = p.returncode
if exit_code != 0:
    print(f"ERROR: {' '.join(cmd)}, exit code {exit_code}")
    sys.exit(1)
c_name = "campuspubs.ds"
ok = dataset.status(c_name)
if ok == False:
    # NOTE(review): this treats dataset.init()'s return as an error string;
    # recent py_dataset returns a boolean — confirm which API version is used.
    err = dataset.init(c_name)
    if err != "":
        print(f"{c_name}, {err}")
# Harvest is disabled by default; flip to True to re-pull from EPrints.
harvest = False
if harvest == True:
    username = os.environ["USER"]
    password = os.environ["PW"]
    returnc = ep_full(
        c_name,
        "https://caltechcampuspubs.library.caltech.edu/",
        username,
        password
    )
    print(returnc)
keys = dataset.keys(c_name)
# Per-record processing (loop body continues beyond this span).
for key in keys:
# Continue building the eprint export command line, then run it.
cmd.append("-password")
cmd.append(eprint_password)
cmd.append("-export")
cmd.append("all")
p = run(cmd)
exit_code = p.returncode
if exit_code != 0:
    print(f"ERROR: {' '.join(cmd)}, exit code {exit_code}")
    sys.exit(1)
c_name = "oh.ds"
ok = dataset.status(c_name)
if ok == False:
    # NOTE(review): init()'s return is treated as an error string here;
    # recent py_dataset returns a boolean — confirm the API version.
    err = dataset.init(c_name, layout="pairtree")
    if err != "":
        print(f"{c_name}, {err}")
# Harvest is disabled by default; flip to True to re-pull from EPrints.
harvest = False
if harvest == True:
    username = os.environ["USER"]
    password = os.environ["PW"]
    returnc = ep_full(
        c_name,
        "http://oralhistories.library.caltech.edu/",
        username,
        password
    )
    print(returnc)
keys = dataset.keys(c_name)
# Per-record processing (loop body continues beyond this span).
for key in keys:
def test_issue43(t, collection_name, csv_name):
    """Regression test for issue 43: export_csv must tolerate records with
    missing columns (emitting warnings, not a hard error), and every exported
    row must still carry all five columns."""
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if os.path.exists(csv_name):
        os.remove(csv_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'Failed, need a {collection_name} to run test, {err}')
        return
    # r2-r4 each omit one of c1..c4 on purpose.
    table = {
        "r1": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        },
        "r2": {
            "c1": "one",
            "c3": "three",
            "c4": "four"
        },
        "r3": {
            "c1": "one",
            "c2": "two",
            "c4": "four"
        },
        "r4": {
            "c1": "one",
            "c2": "two",
            "c3": "three"
        },
        "r5": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        }
    }
    for key in table:
        row = table[key]
        if dataset.create(collection_name, key, row) == False:
            err = dataset.error_message()
            t.error(f"Can't add test row {key} to {collection_name}, {err}")
            return
    # Missing dot paths must warn rather than fail the export.
    dataset.use_strict_dotpath(False)
    # Setup frame
    frame_name = 'f1'
    keys = dataset.keys(collection_name)
    if dataset.frame_create(collection_name, frame_name, keys,
                            ["._Key", ".c1", ".c2", ".c3", ".c4"],
                            ["_Key", "c1", "c2", "c3", "c4"]) == False:
        err = dataset.error_message()
        t.error(err)
        return
    if dataset.export_csv(collection_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'export_csv({collection_name}, {frame_name}, {csv_name} should have emitted warnings, not error, {err}'
        )
        return
    # Every non-empty exported row must have five cells (missing values
    # become empty cells, not dropped columns).
    with open(csv_name, mode='r', encoding='utf-8') as f:
        rows = f.read()
    for row in rows.split('\n'):
        if len(row) > 0:
            cells = row.split(',')
            if len(cells) < 5:
                t.error(f'row error {csv_name} for {cells}')
def test_frame_objects(t, c_name):
    """Exercise frames over records with nested objects and `[:]` dot paths.

    Builds a collection with nested nameIdentifiers structures, creates a
    frame using whole-object and per-element dot paths, and verifies the
    keys and objects the frame returns.
    """
    if dataset.status(c_name) == True:
        dataset.close(c_name)
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}), {err}')
        return
    data = [{
        "id": "A",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }, {
            "nameIdentifier": "H-XXXX-XXXX",
            "nameIdentifierScheme": "ResearcherID",
            "schemeURI": "http://www.researcherid.com/rid/"
        }],
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }],
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = [
        "._Key", ".nameIdentifiers", ".nameIdentifiers[:].nameIdentifier",
        ".two", ".three", ".four"
    ]
    labels = [
        "id", "nameIdentifiers", "nameIdentifier", "two", "three", "four"
    ]
    for row in data:
        key = row['id']
        keys.append(key)
        # BUG FIX: create() returns a boolean, but its result was assigned to
        # an unused `err` variable; check it and report failures.
        if not dataset.create(c_name, key, row):
            err = dataset.error_message()
            t.error(f'create({c_name}, {key}) failed, {err}')
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(
            f'frame_create({c_name}, {f_name}, {keys}, {dot_paths}, {labels}), {err}'
        )
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) != len(keys):
        t.error(f'expected {len(keys)}, got {len(f_keys)}')
    if dataset.frame_refresh(c_name, f_name) == False:
        err = dataset.error_message()
        # BUG FIX: the message named frame_reframe but the call above is
        # frame_refresh; report the function actually called.
        t.error(f'frame_refresh({c_name}, {f_name}), {err}')
    frame_names = dataset.frames(c_name)
    if len(frame_names) != 1 or frame_names[0] != 'f1':
        t.error(f"expected one frame name, f1, got {frame_names}")
    object_result = dataset.frame_objects(c_name, f_name)
    if len(object_result) != 4:
        t.error(
            f'Did not get correct number of objects back, expected 4 got {len(object_result)}, {object_result}'
        )
    # Records A and D carry nameIdentifiers; both count checks expect 2.
    count_nameId = 0
    count_nameIdObj = 0
    for obj in object_result:
        if 'id' not in obj:
            t.error('Did not get id in object')
        if 'nameIdentifiers' in obj:
            count_nameId += 1
            for idv in obj['nameIdentifiers']:
                if 'nameIdentifier' not in idv:
                    t.error('Missing part of object')
        if 'nameIdentifier' in obj:
            count_nameIdObj += 1
            if "0000-000X-XXXX-XXXX" not in obj['nameIdentifier']:
                t.error('Missing object in complex dot path')
    if count_nameId != 2:
        t.error(
            f"Incorrect number of nameIdentifiers elements, expected 2, got {count_nameId}"
        )
    if count_nameIdObj != 2:
        t.error(
            f"Incorrect number of nameIdentifier elements, expected 2, got {count_nameIdObj}"
        )
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
def get_crossref_refs(prefix, done=False, new=True):
    """Harvest Crossref Event Data references for a DOI prefix into a local
    dataset collection, then reconcile deletions and edits since the last
    capture date. `done=True` records today's date as the new watermark."""
    # New=True will download everything from scratch and delete any existing records
    collection = "crossref_refs.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix)
    # "captured" holds the date of the last successful harvest; when present,
    # only events collected since then are requested.
    collected = dataset.has_key(collection, "captured")
    cursor = ""
    count = 0
    # Page through the event stream via the API's next-cursor until exhausted.
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettyness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)
        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same curser back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None
    # Reconciliation passes only make sense when a previous capture exists.
    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]
        # Check Deleted
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on read: {err}")
            # NOTE(review): unlike the harvest loop above there is no guard
            # against a repeated cursor here — assumed the API eventually
            # returns a None next-cursor; confirm.
            cursor = records["message"]["next-cursor"]
        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]
    # Record today's date as the new capture watermark.
    if done:
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
import sys
import json
from py_dataset import dataset
from py_sitetools import mkpage, frontmatter, version_no, Logger

# NOTE(review): `os` is used below but not imported in this span —
# presumably imported earlier in the file; confirm.
log = Logger(os.getpid())

# Minimal configuration
docs_dir = "docs"
site_dir = "htdocs"
c_name = "boutique.ds"
index_tmpl = "templates/index.tmpl"

# Create our boutique.ds if required
if os.path.exists("boutique.ds") == False:
    dataset.init("boutique.ds")

# crawl docs_dir and ingest files into data collection.
for path, folders, files in os.walk(docs_dir):
    #log.print(f"Processing {path}")
    for filename in files:
        if filename.endswith(".md"):
            f_name = os.path.join(path, filename)
            log.print(f"Ingesting {f_name}")
            # Front matter supplies the record key via its "id" field.
            metadata = frontmatter(f_name)
            with open(f_name) as f:
                src = f.read()
            if "id" in metadata:
                key = str(metadata["id"])
                if dataset.has_key(c_name, key):
                    err = dataset.update(c_name, key, {
import urllib #Get access token from WOS sed as environment variable with source token.bash token = os.environ['WOSTOK'] headers = {'X-ApiKey': token, 'Content-type': 'application/json'} #Get input name = input("Enter a WOS author search term (e.g. Mooley K):") caltech = input("Restrict to Caltech-affiliated papers? Y or N:") sheet = input("Enter the google sheet ID:") #Set up collection collection = name.split()[0] + '.ds' subprocess.run(['rm', '-rf', collection]) dataset.init(collection) base_url = 'https://api.clarivate.com/api/wos/?databaseId=WOK' url = base_url + '&count=100&firstRecord=1' if caltech == 'Y': query = 'AU=(' + name + ') AND OG=(California Institute of Technology)' else: query = 'AU=(' + name + ')' query = urllib.parse.quote_plus(query) url = url + '&usrQuery=' + query print(url) response = requests.get(url, headers=headers) response = response.json() record_count = response['QueryResult']['RecordsFound'] print(record_count, " Records from WOS")
def get_wos_refs(new=True):
    """Harvest all Caltech-affiliated records from Web of Science into the
    all_wos.ds collection, paging 100 records at a time."""
    # New=True will download everything from scratch and delete any existing records
    collection = "all_wos.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()
    # Get access token for WOS, set as an environment variable via `source token.bash`.
    token = os.environ["WOSTOK"]
    headers = {"X-ApiKey": token, "Content-type": "application/json"}
    # Run query to get scope of records
    base_url = "https://api.clarivate.com/api/wos/?databaseId=WOK"
    # When a previous capture date exists, restrict the query to the days
    # elapsed since then via loadTimeSpan.
    collected = dataset.has_key(collection, "captured")
    if collected == True:
        date = dataset.read(collection, "captured")
        date = date[0]["captured"]
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = current - date
        base_url = base_url + "&loadTimeSpan=" + str(diff.days) + "D"
    # Record today's date as the new capture watermark.
    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        # NOTE(review): update()/create() returns are treated as error strings
        # here; recent py_dataset returns booleans — confirm the API version.
        err = dataset.update(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on create: {err}")
    query = "OG=(California Institute of Technology)"
    query = urllib.parse.quote_plus(query)
    url = base_url + "&usrQuery=" + query + "&count=100&firstRecord=1"
    response = requests.get(url, headers=headers)
    response = response.json()
    record_count = response["QueryResult"]["RecordsFound"]
    print(record_count, " Records from WOS")
    query_id = response["QueryResult"]["QueryID"]
    # NOTE(review): bare except — if "Data" is missing, `records` is
    # undefined and write_records below raises NameError; confirm intended.
    try:
        records = response["Data"]["Records"]["records"]["REC"]
    except:
        print(response)
    write_records(records, collection)
    # We have saved the first 100 records
    record_start = 101
    record_count = record_count - 100
    # Page through the remaining results 100 at a time via the query API.
    query_url = "https://api.clarivate.com/api/wos/query/"
    while record_count > 0:
        print(record_count)
        print(len(records), "records")
        if record_count > 100:
            url = (
                query_url
                + str(query_id)
                + "?count=100&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            try:
                records = response["Records"]["records"]["REC"]
            except:
                print(response)
            write_records(records, collection)
            record_start = record_start + 100
            record_count = record_count - 100
        else:
            # Final partial page.
            url = (
                query_url
                + str(query_id)
                + "?count="
                + str(record_count)
                + "&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            records = response["Records"]["records"]["REC"]
            write_records(records, collection)
            record_count = 0
    print("Downloaded all records ")
# Tail of a cleanup loop started before this span: remove each listed file.
for f in files:
    if f != None:
        os.remove(f)
### Need to handle old files

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="caltechdata_backup queries the caltechDATA (Invenio 3) API\
 returns data and adds to dataset structure on disk")
    collection = "caltechdata.ds"
    if os.path.isdir(collection) == False:
        # NOTE(review): init()'s return is treated as an error string here;
        # recent py_dataset returns a boolean — confirm the API version.
        err = dataset.init(collection)
        if err != "":
            print(f"Failed on create {err}")
            exit()
    args = parser.parse_args()
    api_url = "https://data.caltech.edu/api/records/"
    # Get the existing records
    current = dataset.keys(collection)
    req = requests.get(api_url)
    data = req.json()
    # Scratch directory for downloads (block body continues beyond this span).
    temp = 'temp'
    if os.path.isdir(temp) == False:
def get_cd_github(new=True):
    """Harvest GitHub-tagged records from CaltechDATA and, when a repository
    zip contains a root-level codemeta.json, attach it to the record."""
    collection = "github_records.ds"
    if new == True:
        # NOTE(review): os.system with string concatenation — fine for a
        # fixed local name, but shutil.rmtree would be safer; confirm.
        os.system("rm -rf " + collection)
    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    url = "https://data.caltech.edu/api/records"
    response = requests.get(url + "/?size=1000&q=subjects:GitHub")
    hits = response.json()
    for h in hits["hits"]["hits"]:
        rid = str(h["id"])
        record = h["metadata"]
        # Only process records not already in the collection.
        result = dataset.has_key(collection, rid)
        if result == False:
            dataset.create(collection, rid, record)
            print("Downloading files for ", rid)
            codemeta = False
            for erecord in record["electronic_location_and_access"]:
                f = download_file(erecord, rid)
                # We're just looking for the zip file
                if f.split(".")[-1] == "zip":
                    # Parse `unzip -l` listing; entries start after a 4-line
                    # header and end at the summary dashes line.
                    zip_files = subprocess.check_output(
                        ["unzip", "-l", f.rstrip()],
                        universal_newlines=True).splitlines()
                    i = 4  # Ignore header
                    line = zip_files[i]
                    while line[0] != "-":
                        split = line.split("/")
                        fname = split[1]
                        if fname == "codemeta.json":
                            sp = line.split(" ")[-1]
                            # NOTE(review): shell command built from archive
                            # member names — assumed trusted CaltechDATA
                            # content; subprocess.run with a list would be
                            # safer. Confirm.
                            os.system("unzip -j " + f.rstrip() + " " + sp +
                                      " -d .")
                            codemeta = True
                        i = i + 1
                        line = zip_files[i]
                    # Will only identify codemeta files in root of repo
                    # Trash downloaded files - extracted codemeta.json not impacted
                    print("Trash " + f)
                    os.system("rm " + f)
            if codemeta == True:
                print(collection, rid)
                response = dataset.attach(collection, rid, ["codemeta.json"])
                print("Attachment ", response)
                os.system("rm codemeta.json")
                print("Trash codemeta.json")