def main():
    """Report resources and accessions modified yesterday to a Google Sheet.

    For each filter ('resources', 'accessions') the matching records from
    repos 2-5 are collected and appended to the corresponding tab of the
    report sheet. Uses the Prod ArchivesSpace instance.
    """
    asf.setServer('Prod')

    # Look for records whose mtime falls on yesterday's date.
    # (Unused start/end timestamp locals from an earlier version removed.)
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y-%m-%d"))

    sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'

    # One entry per report tab: which sheet range to write and which
    # record type to query.
    data_data = [
        {'range': 'resource-changes!A:Z', 'filter': 'resources'},
        {'range': 'accession-changes!A:Z', 'filter': 'accessions'},
    ]

    for d in data_data:
        print('processing ' + d['filter'])
        the_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:
            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='mtime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                print(row)
                the_modifieds.append(row)

            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))

        the_sheet.appendData(the_modifieds)

    quit()
def main():
    """Remove the ead_location URL from a list of resources.

    Reads bibids from a CSV, looks up each repo/asid pair, saves before and
    after JSON copies of the resource, deletes its 'ead_location' key (if
    present), and posts the result back to ArchivesSpace (Prod).
    """
    asf.setServer("Prod")

    output_folder = "output/resource_remove_links"

    the_lookup_csv = "id_lookup_prod.csv"

    bibid_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-161-remove-links/acfa-161-remove-links.txt"

    # Read a list of bibids (csv)
    the_bibids = []
    with open(bibid_file) as ids:
        for row in csv.reader(ids):
            the_bibids.append(row[0])

    for b in the_bibids:
        try:
            repo, asid = asf.lookupByBibID(b, the_lookup_csv)
            print("Processing " + str(b) + "...")

            out_path_old = (output_folder + "/" + str(repo) + "_" +
                            str(asid) + "_old.json")
            out_path_new = (output_folder + "/" + str(repo) + "_" +
                            str(asid) + "_new.json")

            x = asf.getResource(repo, asid)

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(x)

            x_dict = json.loads(x)

            # BUG FIX: the original printed x_dict["ead_location"]
            # unconditionally before testing membership, so a missing key
            # raised KeyError and the "No URL to delete!" branch never ran.
            if "ead_location" in x_dict:
                print(x_dict["ead_location"])
                del x_dict["ead_location"]
            else:
                print("No URL to delete!")

            y = json.dumps(x_dict)

            post = asf.postResource(repo, asid, y)
            print(post)

            # Save copy of new object
            print("Saving data to " + out_path_new + "....")
            with open(out_path_new, "w+") as f:
                f.write(y)

        except Exception:
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # are not swallowed.
            print("Error: Could not process " + str(b))
            print(sys.exc_info())
            # raise

    quit()
def main():
    """Normalize <extref> attributes under one resource.

    Walks every item/file-level archival object under the configured
    resource, rewrites plain type/href attributes to xlink:type/xlink:href
    via regex, and posts any changed object back to the server. A copy of
    each original object is saved before posting.
    """
    # Test functions here.
    from pprint import pprint

    server = 'Test'
    asf.setServer(server)

    # The resource to scan
    the_resource = (4, 6288)

    # A place to put output of saved json objects (optional)
    output_folder = 'output/replace_extrefs'

    # Retrieve all archival objects under the given resource.
    response = asf.getResponse('/repositories/' + str(the_resource[0]) +
                               '/resources/' + str(the_resource[1]) +
                               '/ordered_records')
    records = json.loads(response)['uris']

    # Select only the ones that are items or files, and add to a list
    the_refs = [r['ref'] for r in records if r['level'] in ['item', 'file']]

    changed_count = 0
    for a_ref in the_refs:
        parts = a_ref.split('/')
        repo, asid = parts[2], parts[4]

        original_json = asf.getArchivalObject(repo, asid)
        out_path = output_folder + '/' + str(repo) + '_' + str(asid) + '.json'

        # The regex substitution; subn also reports how many hits were made.
        new_json, hits = re.subn(
            r'<extref\s+type=\\"simple\\"\s+href=',
            r'<extref xlink:type=\"simple\" xlink:href=',
            original_json,
            flags=re.DOTALL)

        if hits > 0:
            # There is a change: save a copy of the unmodified object first.
            print('Saving data to ' + out_path + '....')
            with open(out_path, "w+") as f:
                f.write(original_json)

            changed_count += 1
            print('Posting ' + str(repo) + '_' + str(asid) + ' to ' + server)
            result = asf.postArchivalObject(repo, asid, new_json)
            print(result)
            print(' ')

    print('Total replacements: ' + str(changed_count))
def main():
    """Apply extref URL replacements to resources listed on a sheet.

    Each input row supplies a bibid, repo, ref, and old/new extref URLs.
    The resource's notes are passed through replace_notes() and the updated
    resource is posted back; one result row per record is written to the
    Output tab.
    """
    # SERVER = "Test" # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    LOOKUP = '/Users/dwh2128/Documents/git/dcps-utils/archivesspace/as_reports/id_lookup_prod.csv'

    sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI'
    read_sheet = dataSheet(sheet_id, 'TEST!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'Output!A:Z')

    the_data = read_sheet.getData()
    the_data.pop(0)  # discard the header row

    the_output = []
    for a_row in the_data:
        bibid = a_row[0]
        repo = a_row[1]
        ref = a_row[2]
        extref_old = a_row[3]
        extref_new = a_row[5]

        the_res = json.loads(asf.getResourceByBibID(bibid, LOOKUP))

        asid = the_res['uri'].split('/')[4]
        print("repo: " + str(repo) + "; asid: " + str(asid))

        the_notes = json.dumps(the_res['notes'])
        print(" ")

        the_new_notes = replace_notes(
            the_notes,
            [
                # fix problem of leading space in href
                {'find': 'xlink:href=\\" http',
                 'replace': 'xlink:href=\\"http'},
                # replace old url with new one
                {'find': extref_old, 'replace': extref_new},
            ])

        the_res['notes'] = json.loads(the_new_notes)

        post_result = asf.postResource(repo, asid, json.dumps(the_res))

        out_row = [SERVER, repo, asid, ref, extref_old, extref_new,
                   str(post_result)]
        print(out_row)
        the_output.append(out_row)

    # write_sheet.clear()
    write_sheet.appendData(the_output)

    quit()
def main():
    """Report unpublished resources.

    Collects unpublished resources from the configured repos, derives the
    repo code (or 'nnc-ua' for UA call numbers) and asid for each row, and
    prints totals. Sheet writes are left commented out, as in the original.
    """
    asf.setServer('Prod')

    # the_repos=[2,3,4,5]
    the_repos = [2]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_sheet = dataSheet('198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY',
                          'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # Last column is the full record JSON; pull it out to read the
            # user_defined call number.
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except (KeyError, TypeError):
                # Narrowed from a bare except: no user_defined/string_1.
                call_no = ''
            # get the repo and asid from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            # (Stray expression-comma after the first insert removed.)
            row.insert(0, asid)
            row.insert(0, repo_id)
            # UA call numbers belong to nnc-ua regardless of repo id.
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)
            print(row)

        print('Repo ' + str(r) + ': ' + str(len(x)))

    print('Total unpublished: ' + str(len(the_unpublished)))

    # the_sheet.clear()
    # the_sheet.appendData([the_fields])
    # the_sheet.appendData(the_unpublished)

    quit()
def main():
    """Fix extref URLs in archival-object notes listed on a sheet.

    For each input row, fetches the archival object by ref, repairs hrefs
    with a leading space, swaps the old extref URL for the new one, posts
    the object back, and logs one result row per record to the Output tab.
    """
    SERVER = "Prod"  # test
    # SERVER = "Prod"
    asf.setServer(SERVER)

    sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI'
    read_sheet = dataSheet(sheet_id, 'TEST!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'Output!A:Z')

    the_data = read_sheet.getData()
    the_data.pop(0)  # discard the header row
    # print(the_refs)

    the_output = []
    for a_row in the_data:
        repo = a_row[1]
        ref = a_row[2]
        extref_old = a_row[3]
        extref_new = a_row[5]

        the_ao = json.loads(asf.getArchivalObjectByRef(repo, ref))

        asid = the_ao['uri'].split('/')[4]
        print("asid: " + str(asid))

        the_notes = json.dumps(the_ao['notes'])
        # fix problem of leading space in href
        the_new_notes = the_notes.replace('xlink:href=\\" http',
                                          'xlink:href=\\"http')
        # replace old url with new one
        the_new_notes = the_new_notes.replace(extref_old, extref_new)
        print(the_new_notes)

        the_ao['notes'] = json.loads(the_new_notes)
        pprint(the_ao)

        post_result = asf.postArchivalObject(repo, asid, json.dumps(the_ao))

        out_row = [SERVER, repo, asid, ref, extref_old, extref_new,
                   str(post_result)]
        print(out_row)
        the_output.append(out_row)

    # write_sheet.clear()
    write_sheet.appendData(the_output)

    quit()
def main():
    """Export EAD XML for a list of bibids for QC review.

    Bibids that cannot be found in the lookup CSV are collected and
    reported at the end; successfully looked-up records have their EAD
    written to the output folder.
    """
    # set to Prod | Dev | Test
    asf.setServer('Prod')

    bibid_file = "ead_bibids_20190520.txt"
    lookup_file = "id_lookup_prod_20190522.csv"
    outfile_loc = "ead_as_qc_reports/ead_as_qc_xml_PROD1"

    with open(bibid_file) as f:
        the_bibids = [line.rstrip('\n') for line in f]

    the_errors = []
    the_processed = []

    for a_bibid in the_bibids:
        print('Processing bibid: ' + a_bibid)
        if a_bibid:
            try:
                the_lookup = asf.lookupByBibID(a_bibid, lookup_file)
                the_repo = the_lookup[0]
                the_asid = the_lookup[1]
                the_processed.append(a_bibid)
            except Exception:
                # Narrowed from a bare except: can't find in lookup.
                the_repo = 0
                the_asid = 0
                the_errors.append(a_bibid)

        # Only export when the bibid was non-empty and the lookup succeeded.
        if (a_bibid and the_asid != 0):
            the_ead = asf.getEAD(the_repo, the_asid)
            the_filepath = outfile_loc + '/' + a_bibid + '_ead.xml'
            with open(the_filepath, "w") as myfile:
                myfile.write(the_ead)

    # Report results
    print('Processed ' + str(len(the_processed)) + ' records.')
    if len(the_errors) > 0:
        print('*** Warning: ' + str(len(the_errors)) +
              ' errors. Could not process id ' + ', '.join(the_errors) +
              ' ***')
def main():
    """Suppress unused values of the extent_extent_type enumeration.

    Reads a usage report (TSV), suppresses every value marked 'Not used.',
    and saves before/after snapshots of the enumeration JSON for auditing.
    """
    server = 'Prod'
    asf.setServer(server)

    enum_num = 14  # extent_extent_type enumeration
    extent_data = asf.getEnumeration(enum_num)

    extent_usage_csv = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-111-extents-cleanup/extent-values-prod3.tsv'
    output_folder = 'output/enumerations'

    # Paths for reporting before/after data
    out_path_old = output_folder + '/' + str(enum_num) + 'PROD_old.json'
    out_path_new = output_folder + '/' + str(enum_num) + 'PROD_new.json'

    # Snapshot the enumeration before making any changes.
    print('Saving data to ' + out_path_old + '....')
    with open(out_path_old, "w+") as snapshot:
        snapshot.write(extent_data)

    # Load the usage list from the tab-delimited report.
    csv.register_dialect('my_dialect', delimiter='\t', quoting=csv.QUOTE_NONE)
    with open(extent_usage_csv) as the_csv_data:
        usage_rows = list(csv.reader(the_csv_data, 'my_dialect'))

    # Ids of extent values the report flags as unused.
    unused_extents = [row[0] for row in usage_rows if row[2] == 'Not used.']

    for value_id in unused_extents:
        print('suppressing ' + str(value_id))
        # mode='suppress' to suppress, mode='unsuppress' to unsuppress
        result = asf.suppressEnumerationValue(value_id, mode='suppress')
        print(result)

    # Snapshot the enumeration again after the suppressions.
    extent_data_new = asf.getEnumeration(enum_num)
    print('Saving data to ' + out_path_new + '....')
    with open(out_path_new, "w+") as snapshot:
        snapshot.write(extent_data_new)
def main():
    """Write OCLC 035 identifiers into resources' user_defined fields.

    Reads pipe-delimited rows of (bibid, string_2, string_3), looks up each
    resource, sets user_defined string_2/string_3, and posts it back.
    """
    asf.setServer("Prod")

    lookup_csv = "id_lookup_prod.csv"
    id_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-226-oclc/035s_20200915.txt"

    # Read a list of bibids and oclc strings
    the_data = []
    with open(id_file) as ids:
        for row in csv.reader(ids, delimiter="|"):
            the_data.append([row[0], row[1], row[2]])

    for a_row in the_data:
        bibid = a_row[0]
        print(bibid)
        str_2 = a_row[1]
        str_3 = a_row[2]

        try:
            repo, asid = asf.lookupByBibID(bibid, lookup_csv)

            x = asf.getResource(repo, asid)
            y = json.loads(x)

            # Preserve any existing user_defined fields.
            user_defnd = y["user_defined"] if "user_defined" in y else {}
            user_defnd["string_2"] = str_2
            user_defnd["string_3"] = str_3
            print(user_defnd)

            y["user_defined"] = user_defnd

            z = json.dumps(y)
            post = asf.postResource(repo, asid, z)
            print(post)

        except Exception as e:
            # BUG FIX: `e + "..."` raised TypeError (can't concatenate an
            # exception and a str); convert the exception to str first.
            print(str(e) + ": Could not lookup " + str(bibid))
def main():
    """Fix the begin date of archival objects listed on a sheet.

    Reads refs from the input column, runs each object through
    fix_begin_date(), posts the corrected object back to repo 2, and writes
    an audit row (old date, new date, post result) to the output tab.
    """
    # SERVER = "Test" # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    sheet_id = '1OABHEJF1jqA1vlbW5yTENry5W7YqKlag5nJDJ9ouCzg'
    # read_sheet = dataSheet(sheet_id, 'Test!A:Z') # Test
    read_sheet = dataSheet(sheet_id, 'Prod!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'output!A:Z')

    the_refs = read_sheet.getDataColumns()[0]
    # print(the_refs)

    the_output = []
    for r in the_refs:
        the_ao = json.loads(asf.getArchivalObjectByRef(2, r))
        asid = the_ao['uri'].split('/')[4]
        old_date = str(the_ao['dates'][0]['begin'])
        new_ao = fix_begin_date(2, the_ao)
        new_date = str(new_ao['dates'][0]['begin'])
        print("asid: " + str(asid))
        x = asf.postArchivalObject(2, asid, json.dumps(new_ao))
        out_row = [SERVER, r, asid, old_date, new_date, str(x)]
        # print(out_row)
        the_output.append(out_row)

    write_sheet.clear()
    write_sheet.appendData(the_output)

    quit()
    # CLEANUP: unreachable ad-hoc debug code that followed this quit()
    # (a second fix_begin_date call and a pprint) has been removed.
def main():
    """Harvest all resource records, archive their JSON, and report to a sheet.

    Downloads every resource from the configured repos, writes each record's
    JSON to a dated folder, extracts report fields (via dpath) plus truncated
    scope/bioghist notes, zips and rotates the JSON archive, and writes the
    resource and collection-management report tabs plus a run log.
    """
    # set to True to use test sheet and test json folder location.
    debug = False

    asf.setServer("Prod")

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and
    # still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ""  # set later
    today_str = str((date.today()).strftime("%Y%m%d"))

    if debug:
        print("[Running script in debug mode...]")
        parent_folder = "/cul/cul0/ldpd/archivesspace/test/resources"  # test folder
        sheet_id = "1wFyLN_Ea7ExCZSMuksB8MTrS9DjsUkwsmaPBujL7x0U"  # test sheet
        the_repos = [4]  # to test
    else:
        parent_folder = "/cul/cul0/ldpd/archivesspace/resources"
        sheet_id = "1T3EpIZmnh3Gk-VAIGtvavTQUIpS7AluyKQ8-sJsS8vg"
        the_repos = [2, 3, 4, 5, 6]

    output_folder = parent_folder + "/" + today_str

    the_sheets = {
        "resources": dataSheet(sheet_id, "Resources!A:Z"),
        "cm": dataSheet(sheet_id, "Collection Management!A:Z"),
        "log": dataSheet(sheet_id, "log!A:Z"),
    }

    # Set number of chars to truncate the scope and bioghist notes.
    trunc_len = 400

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["bibid", "/id_0"],
        ["title", "/title"],
        ["published", "/publish"],
        ["create_time", "/create_time"],
        ["system_mtime", "/system_mtime"],
        ["created_by", "/created_by"],
        ["last_modified_by", "/last_modified_by"],
        ["ead_location", "/ead_location"],
        ["ext_number", "/extents/0/number"],
        ["ext_portion", "/extents/0/portion"],
        ["ext_type", "/extents/0/extent_type"],
        # ["integer_1", "/user_defined/integer_1"],
        # ["integer_2", "/user_defined/integer_2"],
        # ["integer_3", "/user_defined/integer_3"],
        ["local call no.", "/user_defined/string_1"],
        ["other ctrl no. 1", "/user_defined/string_2"],
        ["other ctrl no. 2", "/user_defined/string_3"],
        ["other ctrl no. 3", "/user_defined/string_4"],
        # ["enum_1", "/user_defined/enum_1"],
        # ["enum_2", "/user_defined/enum_2"],
        ["description status", "/user_defined/enum_3"],
        ["collecting area", "/user_defined/enum_4"],
        # NOTE(review): no leading slash here, unlike the others; dpath
        # accepts both forms, so behavior is unchanged.
        ["level", "level"]
        # (Scope and bioghist notes are added in separately below.)
    ]

    # Get the collection management records for use in report.
    the_cms = []
    fields = [
        "id",
        "parent_id",
        "title",
        "system_mtime",
        "processing_priority",
        "processing_status",
    ]

    print(" ")
    print("*** Retrieve Collection Management Data ***")
    print(" ")

    for r in the_repos:
        print("Getting collection management records for repo: " + str(r) +
              "...")
        cm = asf.getCollectionManagements(r, filter="resource", fields=fields)
        for c in cm:
            row = [c[f] for f in fields]
            the_cms.append(row)

    # a data set of collection managment records to post to sheet below.
    the_cms.insert(0, fields)

    print(" ")
    print("*** Retrieve Resource Data ***")
    print(" ")

    # Get the list of resources for each repo and add to the_ids
    the_ids = []
    for r in the_repos:
        print("Getting ids for repo: " + str(r) + "...")
        asids = json.loads(
            asf.getResponse("/repositories/" + str(r) +
                            "/resources?all_ids=true"))
        print(str(len(asids)) + " records found in repo " + str(r) + ".")
        for i in asids:
            the_ids.append([r, i])

    # Construct the head row
    the_heads = [x[0] for x in the_fields]
    the_heads.insert(0, "asid")
    the_heads.insert(0, "repo")
    the_heads.append("scope note")
    the_heads.append("scopenote length")
    the_heads.append("bioghist note")
    the_heads.append("biognote length")

    the_output = [the_heads]

    # Fetch the resources from the ids
    print("Downloading resources...")

    if not os.path.exists(output_folder):
        print("Creating directory " + output_folder + "...")
        os.makedirs(output_folder)

    for repo, asid in the_ids:
        # print("Processsing " + str(repo) + ":" + str(asid) + "...")
        the_row = [repo, asid]
        res_json = asf.getResource(repo, asid)
        res_dict = json.loads(res_json)

        out_path = output_folder + "/" + str(repo) + "_" + str(asid) + ".json"

        # Write the JSON to file.
        with open(out_path, "w+") as f:
            f.write(res_json)

        # Use dpath to extract values from dict and compose into rows.
        for af in the_fields:
            try:
                d = str(dpath.util.get(res_dict, af[1]))
            except Exception:
                # Narrowed from a bare except; a missing path yields an
                # empty report cell.
                d = ""
            the_row.append(d)

        # Process scope and bioghist notes
        the_notes = dpath.util.values(res_dict, "notes/*", afilter=None)

        the_scope_notes = []
        the_biog_notes = []
        for a_note in the_notes:
            # Narrowed from bare excepts: notes without a 'type' key (or
            # non-dict entries) are simply skipped.
            try:
                if a_note["type"] == "scopecontent":
                    the_scope_notes.append(a_note)
            except Exception:
                pass
            try:
                if a_note["type"] == "bioghist":
                    the_biog_notes.append(a_note)
            except Exception:
                pass

        if the_scope_notes:
            # If there are scope notes, grab all the text and concatenate.
            # Then get the total length in chars.
            scope_note_texts = [
                s["subnotes"][0]["content"] for s in the_scope_notes
            ]
            the_scope_text = " ".join(scope_note_texts)
            scope_note_len = len(the_scope_text)
            scope_note_short = truncate_str(the_scope_text, length=trunc_len)
        else:
            scope_note_short = ""
            scope_note_len = 0

        if the_biog_notes:
            # If there are bioghist notes, grab all the text and concatenate.
            # Then get the total length in chars.
            biog_note_texts = [
                s["subnotes"][0]["content"] for s in the_biog_notes
            ]
            the_biog_text = " ".join(biog_note_texts)
            biog_note_len = len(the_biog_text)
            biog_note_short = truncate_str(the_biog_text, length=trunc_len)
        else:
            biog_note_short = ""
            biog_note_len = 0

        the_row.append(scope_note_short)
        the_row.append(str(scope_note_len))
        the_row.append(biog_note_short)
        the_row.append(str(biog_note_len))

        the_output.append(the_row)

    # Zip up the JSON files for storage.
    zip_out = make_archive(today_str,
                           "zip",
                           root_dir=parent_folder,
                           base_dir=today_str)
    print(zip_out)

    # Zip is saved in working dir; move to correct location.
    print("Saving zip file " + str(today_str) + ".zip to " + parent_folder)

    # Test if file already exists.
    if os.path.exists(parent_folder + "/" + str(today_str) + ".zip"):
        print("File " + parent_folder + "/" + str(today_str) +
              ".zip exists already. Replacing with new zip file...")
        os.remove(parent_folder + "/" + str(today_str) + ".zip")

    move(zip_out, parent_folder)

    # Remove the json folder once zip is in place.
    rmtree(parent_folder + "/" + today_str)

    util.file_cleanup(parent_folder, 60)

    # Write output to Google sheet.
    print(" ")
    print("*** Writing Data to Report ***")
    print(" ")

    the_sheets["cm"].clear()
    the_sheets["cm"].appendData(the_cms)
    digester.post_digest(
        script_name,
        "Total collection management records: " + str(len(the_cms) - 1))

    the_sheets["resources"].clear()
    the_sheets["resources"].appendData(the_output)
    digester.post_digest(
        script_name,
        "Total number of resource records: " + str(len(the_output) - 1))

    ########################
    ###    FINISH UP     ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + my_name + ". Start: " + start_time +
               ". Finished: " + end_time + " (duration: " + my_duration +
               ").")

    the_sheets["log"].appendData([[the_log]])

    print(" ")
    print(the_log)
    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        the_sheets["resources"].url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)
# Requires pytest. Checks basic connectivity and read functions from sample data sheet. # Run all tests with 'pytest --disable-pytest-warnings'. # If in virtual environment, use 'python -m pytest --disable-pytest-warnings'. import ASFunctions as asf import json # import logging asf.setServer('Prod') def test_get_resource_prod(): x = json.loads(asf.getResource(2, 5907)) assert x[ 'id_0'] == '4078601', "Prod: BIBID for resource 2:5907 should be 4078601" asf.setServer('Test') def test_get_resource_test(): x = json.loads(asf.getResource(2, 5907)) assert x[ 'id_0'] == '4078601', "Test: BIBID for resource 2:5907 should be 4078601" asf.setServer('Dev') def test_get_resource_dev(): x = json.loads(asf.getResource(2, 5907)) assert x[
def main():
    """Delete notes whose first subnote contains an extref.

    For each (repo, refid) row in the input CSV: saves a copy of the
    archival object, filters out notes whose first subnote content contains
    'extref', posts the result back, and reports before/after note counts
    to a Google Sheet.
    """
    # Set to Test | Dev | Prod
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'notes!A:Z')

    id_file = 'replace_notes.csv'
    output_folder = 'output/notes'

    # Read a list of repo and object ids (csv)
    the_ids = []
    with open(id_file) as ids:
        for row in csv.reader(ids):
            the_ids.append([row[0], row[1]])

    the_before_afters = []

    the_heads = [
        'repo', 'asid', 'uid', 'title', 'note_cnt1', 'note_cnt2', 'status'
    ]
    the_before_afters.append(the_heads)

    for an_obj in the_ids:
        out_path = (output_folder + '/' + an_obj[0] + '_' + an_obj[1] +
                    '_old.json')

        # read from API
        print('getting data for ' + str(an_obj[0]) + ', ' + str(an_obj[1]))
        try:
            x = asf.getArchivalObjectByRef(an_obj[0], an_obj[1])

            # Save copy of existing object
            print('Saving data to ' + out_path + '....')
            with open(out_path, "w+") as f:
                f.write(x)

            x = json.loads(x)
            asid = str(
                x['uri'].split('/')[-1])  # get the asid from the uri string.
            title = x['title']
            repo = str(an_obj[0])

            y = x

            my_notes_init = y['notes']
            my_notes_new = []
            # NOTE(review): preserved as-is from the original — only the
            # FIRST note is checked for 'subnotes' before filtering, notes
            # without 'subnotes' are dropped, and if the first note lacks
            # 'subnotes' ALL notes are removed. Confirm this is intended.
            if len(my_notes_init) > 0:
                if 'subnotes' in my_notes_init[0]:
                    for a_note in my_notes_init:
                        if 'subnotes' in a_note:
                            if 'extref' in a_note['subnotes'][0]['content']:
                                pass
                            else:
                                my_notes_new.append(a_note)

            if len(my_notes_new) == len(my_notes_init):
                the_status = "[no change]"
            else:
                the_status = "[deleted note]"

            y['notes'] = my_notes_new
            note_cnt1 = len(my_notes_init)
            note_cnt2 = len(y['notes'])

            the_before_afters.append([
                an_obj[0], asid, an_obj[1], title, note_cnt1, note_cnt2,
                the_status
            ])

            # convert dict back to json for posting.
            z = json.dumps(y)

            # Post the fixed object back to API.
            # (Comment these out for testing.)
            resp = asf.postArchivalObject(repo, asid, z)
            print(resp)

        except Exception:
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # propagate.
            print('Could not retrieve record ' + str(an_obj[1]))

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")

    quit()
# Script to add authorities or make other changes to subjects. See ACFA-287. import ASFunctions as asf import json from pprint import pprint from sheetFeeder import dataSheet import os.path SERVER = 'Prod' asf.setServer(SERVER) my_name = __file__ # pprint(asf.getSubject(11453)) # quit() # This makes sure the script can be run from any working directory and still find related files. my_path = os.path.dirname(__file__) sheet_id = '1b-dFdOaWD7AEqzhK0uuGXkonum6wX8Zcriq8-G4l33Q' # list_sheet = dataSheet(sheet_id, 'Test!A:Z') # test list_sheet = dataSheet(sheet_id, 'batch!A:Z') report_sheet = dataSheet(sheet_id, 'output!A:Z') def add_authority(server, asid, uri, source=None): # function to (1) query subject and determine if it already has # an authority uri, (2) if not, add in the provided URI,
def main():
    """Strip '-staging' from digital-object file URIs.

    For each (repo, refid) row in the input CSV: saves a copy of the
    digital object, rewrites its first file_uri via regex, posts it back if
    changed, and reports before/after values to a Google Sheet.
    """
    # Set to Test | Dev | Prod
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'daos-prod!A:Z')

    id_file = 'replace_daos.csv'
    output_folder = 'output/daos/prod'

    # Read a list of repo and object ids (csv)
    the_ids = []
    with open(id_file) as ids:
        for row in csv.reader(ids):
            the_ids.append([row[0], row[1]])

    the_before_afters = []

    the_heads = ['repo', 'asid', 'uid', 'title', 'before', 'after']
    the_before_afters.append(the_heads)

    for an_obj in the_ids:
        out_path = (output_folder + '/' + an_obj[0] + '_' + an_obj[1] +
                    '_old.json')

        # read from API
        try:
            x = asf.getDigitalObjectFromParent(an_obj[0], an_obj[1])

            # Save copy of existing object
            print('Saving data to ' + out_path + '....')
            with open(out_path, "w+") as f:
                f.write(x)

            x = json.loads(x)

            the_old_field_data = x['file_versions'][0]['file_uri']
            asid = str(
                x['uri'].split('/')[-1])  # get the asid from the uri string.
            title = x['title']
            repo = str(an_obj[0])

            y = x

            # NOTE(review): the trailing apostrophe before $ means this only
            # matches URIs that literally end in ' — preserved as-is from
            # the original; confirm it is intended.
            y['file_versions'][0]['file_uri'] = re.sub(
                r"^(.*)-staging(.*)'$", r'\1\2',
                x['file_versions'][0]['file_uri'])

            if y['file_versions'][0]['file_uri'] == the_old_field_data:
                the_new_field_data = "[no change]"
            else:
                the_new_field_data = y['file_versions'][0]['file_uri']

            the_before_afters.append([
                an_obj[0], asid, an_obj[1], title, the_old_field_data,
                the_new_field_data
            ])

            # convert dict back to json for posting.
            z = json.dumps(y)

            # Post the fixed object back to API.
            # (Comment these out for testing.)
            if the_new_field_data != "[no change]":
                resp = asf.postDigitalObject(repo, asid, z)
                print(resp)
            else:
                print('No update: skipping record.')

        except Exception:
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # propagate.
            print('Could not retrieve record ' + str(an_obj[1]))

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")

    quit()
def main():
    """Replace '&amp;' with '&' in archival-object title/display_string.

    For each (repo, refid) row in the input CSV: saves a copy of the
    object, runs the field replacement via regex_dict(), posts the object
    back, and reports before/after values to a Google Sheet.
    """
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'ampersands!A:Z')

    id_file = 'archival_objects.csv'
    output_folder = 'output/archival_objects'

    # Read a list of repo and object ids (csv); with-block replaces
    # unmanaged open/close.
    the_ids = []
    with open(id_file) as ids:
        for row in csv.reader(ids):
            the_ids.append([row[0], row[1]])

    # Search/replace patterns
    the_search_pattern = '&amp;'
    the_replace_pattern = '&'

    the_before_afters = []

    # the fields to perform regex replace on.
    the_fields = ['title', 'display_string']

    the_heads = ['repo', 'asid', 'uid', 'before', 'after']
    the_before_afters.append(the_heads)

    for an_obj in the_ids:
        out_path = (output_folder + '/' + an_obj[0] + '_' + an_obj[1] +
                    '_old.json')

        # read from API
        x = asf.getArchivalObjectByRef(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')
        with open(out_path, "w+") as f:
            f.write(x)

        x = json.loads(x)
        asid = str(
            x['uri'].split('/')[-1])  # get the asid from the uri string.
        repo = str(an_obj[0])

        the_initial_values = [
            str('{' + f + '_old:} ' + x[f]) for f in the_fields
        ]
        the_initial_values = "\n".join(the_initial_values)
        # print(the_initial_values)

        # TODO: function modifies x as well as y. Harmless but messy.
        y = regex_dict(x, the_fields, the_search_pattern, the_replace_pattern)

        the_new_values = [
            str('{' + f + '_new:} ' + y[f] + ' ') for f in the_fields
        ]
        the_new_values = "\n".join(the_new_values)

        the_before_afters.append(
            [repo, asid, an_obj[1], the_initial_values, the_new_values])

        # convert dict back to json for posting.
        z = json.dumps(y)

        # Post the fixed object back to API.
        # (Comment out these lines to test output without replacing.)
        post = asf.postArchivalObject(repo, asid, z)
        print(post)

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")
# Automated reporting of ArchivesSpace accessions info. import ASFunctions as asf import json from pprint import pprint from sheetFeeder import dataSheet from operator import itemgetter import datetime import re import os.path import dateutil.parser import digester # for generating composite digest of report info. # set Prod | Dev | Test target_server = 'Prod' # Prod | Dev | Test asf.setServer(target_server) DEBUG = False # mode = 'Prod' # Prod or Test MY_NAME = __file__ SCRIPT_NAME = os.path.basename(MY_NAME) # This makes sure the script can be run from any working directory and still find related files. MY_PATH = os.path.dirname(__file__) # File to use to lookup bibids LOOKUP_CSV = os.path.join(MY_PATH, "id_lookup_prod.csv") def main():
def main():
    """Publish or unpublish a list of archival objects.

    For each (repo, refid) row in the input CSV: saves a copy of the
    object, sets its 'publish' flag to publish_value, posts it back only
    when the value actually changed, and (optionally) reports before/after
    values to a Google Sheet.
    """
    # Set value to switch to, publish (True) or unpublish (False)
    publish_value = False
    # Report changes to a spreadsheet?
    report_results = True

    asf.setServer('Prod')

    # A GSheet to post report to
    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'aos_unpub3!A:Z')

    # A CSV of format <repo>,<refid>
    id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-141-unpublish-archival-objects/unpublish_aos_series_IIIA_PROD_p7.csv'

    # A folder to put json objects for auditing purposes
    output_folder = 'output/unpubs3'

    # Read a list of repo and object ids (csv); with-block replaces
    # unmanaged open/close.
    the_ids = []
    with open(id_file) as ids:
        for row in csv.reader(ids):
            the_ids.append([row[0], row[1]])

    the_before_afters = []

    the_heads = ['repo', 'asid', 'uid', 'title', 'before', 'after']
    the_before_afters.append(the_heads)

    for an_obj in the_ids:
        out_path = (output_folder + '/' + an_obj[0] + '_' + an_obj[1] +
                    '_old.json')

        # read from API
        x = asf.getArchivalObjectByRef(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')
        with open(out_path, "w+") as f:
            f.write(x)

        x = json.loads(x)
        asid = str(
            x['uri'].split('/')[-1])  # get the asid from the uri string.
        repo = str(an_obj[0])
        title = x['title']

        y = x

        old_value = x['publish']
        y['publish'] = publish_value
        new_value = y['publish']
        if new_value == old_value:
            new_value = '[no change]'

        the_before_afters.append(
            [repo, asid, an_obj[1], title, old_value, new_value])

        # convert dict back to json for posting.
        z = json.dumps(y)

        if new_value != "[no change]":
            resp = asf.postArchivalObject(repo, asid, z)
            print(resp)
        else:
            print('No update: skipping record.')

    # Report changes to Google Sheet
    if report_results:  # idiom: was `== True`
        print('Writing before/after info to sheet...')
        the_report_sheet.clear()
        the_report_sheet.appendData(the_before_afters)

    print("Done!")
def main():
    """Add an accessrestrict note to archival objects that lack one.

    For each (repo, refid) row in the input CSV: saves a copy of the
    object; if it has no accessrestrict note, appends one built from the
    configured access type ('vetted' or 'unvetted') and posts it back.
    """
    asf.setServer('Prod')

    id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-147-hrw-access-restrictions/acfa-147-aos_UNVETTED.csv'
    output_folder = 'output/archival_objects_accessrestrict'

    # Read a list of repo and object ids (csv); with-block replaces
    # unmanaged open/close.
    the_ids = []
    with open(id_file) as ids:
        for row in csv.reader(ids):
            the_ids.append([row[0], row[1]])

    access_types = {
        'unvetted': {
            'vocab': 'TEMPORARILY UNAVAILABLE',
            'text': '[Unvetted]'
        },
        'vetted': {
            'vocab': 'AVAILABLE',
            'text': '[Vetted, open]'
        }
    }

    # Set to 'vetted' or 'unvetted'
    the_type = 'unvetted'

    for an_obj in the_ids:
        out_path = (output_folder + '/' + an_obj[0] + '_' + an_obj[1] +
                    '_old.json')

        # read from API
        x = asf.getArchivalObjectByRef(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')
        with open(out_path, "w+") as f:
            f.write(x)

        y = json.loads(x)
        asid = str(
            y['uri'].split('/')[-1])  # get the asid from the uri string.
        repo = str(an_obj[0])

        print('Processing ' + str(repo) + ' - ' + str(asid) + '...')

        the_notes = y['notes']

        # Test if there is already an accessrestrict.
        # ROBUSTNESS FIX: .get() guards against notes without a 'type' key,
        # which would have raised KeyError in the original loop.
        has_accrestrict = any(
            an_item.get('type') == 'accessrestrict' for an_item in the_notes)

        if not has_accrestrict:  # idiom: was `== False`
            print('Adding access restrict note ...')
            the_access_note = {
                'jsonmodel_type': 'note_multipart',
                'publish': True,
                'rights_restriction': {
                    'local_access_restriction_type':
                    [access_types[the_type]['vocab']]
                },
                'subnotes': [{
                    'content': access_types[the_type]['text'],
                    'jsonmodel_type': 'note_text',
                    'publish': True
                }],
                'type': 'accessrestrict'
            }

            y['notes'].append(the_access_note)

            z = json.dumps(y)
            # print(z)
            post = asf.postArchivalObject(repo, asid, z)
            print(post)
        else:
            print('Already has access restrict note. Skipping!')

    print("Done!")
def main():
    """Publish (or unpublish) digital objects found via their parents.

    For each (repo, refid) row in the input CSV: saves a copy of the
    digital object, sets its 'publish' flag to publish_value, posts it back
    only when the value changed, and reports before/after values to a
    Google Sheet. (The original's commented-out try/except wrapper is left
    out; errors propagate.)
    """
    # Set to Test | Dev | Prod
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'daos-publish!A:Z')

    # Set value to switch to, publish (True) or unpublish (False)
    publish_value = True

    # id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-162/acfa-162-mitchell.csv'
    id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-162/acfa-162-kay.csv'
    output_folder = 'output/daos-publish'

    # Read a list of repo and object ids (csv); with-block replaces
    # unmanaged open/close.
    the_ids = []
    with open(id_file) as ids:
        for row in csv.reader(ids):
            the_ids.append([row[0], row[1]])

    the_before_afters = []

    the_heads = ['repo', 'asid', 'uid', 'title', 'before', 'after']
    the_before_afters.append(the_heads)

    for an_obj in the_ids:
        out_path = (output_folder + '/' + an_obj[0] + '_' + an_obj[1] +
                    '_old.json')

        # read from API
        x = asf.getDigitalObjectFromParent(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')
        with open(out_path, "w+") as f:
            f.write(x)

        x = json.loads(x)

        # the_old_field_data = x['file_versions'][0]['file_uri']
        the_old_field_data = x['publish']
        asid = str(
            x['uri'].split('/')[-1])  # get the asid from the uri string.
        title = x['title']
        repo = str(an_obj[0])

        y = x

        # Here set the desired value
        y['publish'] = publish_value

        if y['publish'] == the_old_field_data:
            the_new_field_data = "[no change]"
        else:
            the_new_field_data = y['publish']

        the_before_afters.append([
            an_obj[0], asid, an_obj[1], title, the_old_field_data,
            the_new_field_data
        ])

        # convert dict back to json for posting.
        z = json.dumps(y)

        # Post the fixed object back to API.
        # (Comment these out for testing.)
        if the_new_field_data != "[no change]":
            resp = asf.postDigitalObject(repo, asid, z)
            print(resp)
        else:
            print('No update: skipping record.')

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")

    quit()
script_name = os.path.basename(my_name) # This makes sure the script can be run from any working directory and still find related files. my_path = os.path.dirname(__file__) sheet_id = "1pZk2tPMuZDOd1veOBSJNRk2fprA6p3Qb3WKZDtZay88" the_sheet = dataSheet(sheet_id, "subjects!A:Z") # the_sheet = dataSheet(sheet_id, "test!A:Z") # test now1 = datetime.datetime.now() start_time = str(now1) end_time = "" # set later # First get the subject records from API (this can take a long time!) asf.setServer("Prod") # AS instance: Prod | Dev | Test # out_path = os.path.join(my_path, "output/subjects.pickle") out_path = "/cul/cul0/ldpd/archivesspace/subjects/subjects.pickle" # uncomment to do the full download. the_subjects = asf.getSubjects() util.pickle_it(the_subjects, out_path) # Report the saved data to Google Sheet # List of fields to extract, expressed as dpaths. the_fields = [ ["uri", "uri"], ["title", "title"], ["source", "source"],
def main():
    """Add encoded language notes to resources from a Google Sheet.

    For each sheet row (bibid, 041 field, ..., language string): look up
    the resource ids by bibid, fetch the resource, parse the free-text
    language string into language codes, diff against the codes already
    encoded in ``lang_materials``, append any missing codes as new
    language notes, and post the updated resource back. The added codes
    (or "[ERROR]") are appended to each row and written back to the sheet.
    Before/after JSON snapshots are saved locally.
    """
    asf.setServer("Prod")

    # the_lookup_csv = "id_lookup_TEST.csv"  # test
    the_lookup_csv = "id_lookup_prod.csv"  # test
    output_folder = "output/resource_language_encode"

    the_sheet = dataSheet("1eTPY7AbDvjDU-lzK2VQruvZAvlGkAJZglh2JrruPvdg", "Test6!A:Z")

    the_data = the_sheet.getData()

    the_new_data = []
    the_new_data.append(the_data.pop(0))  # keep the header row

    counter = 0
    for a_row in the_data:

        counter += 1
        print(" ")
        print(counter)

        the_new_row = a_row
        the_bibid = a_row[0]
        the_041 = a_row[1]
        the_string = a_row[3]

        res_info = asf.lookupByBibID(the_bibid, the_lookup_csv)

        if res_info:
            out_path_old = (
                output_folder + "/" + str(res_info[0]) + "_" + str(res_info[1]) + "_old.json"
            )
            out_path_new = (
                output_folder + "/" + str(res_info[0]) + "_" + str(res_info[1]) + "_new.json"
            )

            # pull down the resource
            the_resource = asf.getResource(res_info[0], res_info[1])

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(the_resource)

            res_dict = json.loads(the_resource)

            langmaterials = res_dict["lang_materials"]

            # Collect encoded languages already present. There should be just
            # one but not guaranteed, so make a list.
            primary_langs = []
            for n in langmaterials:
                try:
                    if n["language_and_script"]:
                        # print("YES")
                        primary_langs.append(n["language_and_script"]["language"])
                except KeyError:
                    # Fix: was a bare except; only missing-key lookups are
                    # expected here, so catch KeyError specifically.
                    print("Exception!")

            print("old:")
            print(primary_langs)

            print("new:")
            langs_parsed = language_lookup(the_string)
            print(langs_parsed)

            print("to add: ")
            langs_diff = diff(langs_parsed, primary_langs)
            print(langs_diff)

            if len(langs_diff) > 0:
                # Append a language note for each code not already encoded.
                for lang_code in langs_diff:
                    res_dict["lang_materials"].append(make_language_note(lang_code))

                new_resource = json.dumps(res_dict)

                # Save new object
                print("Saving data to " + out_path_new + "....")
                with open(out_path_new, "w+") as f:
                    f.write(new_resource)

                # Post new resource back to API
                print("Posting data for " + str(res_info[0]) + " : " + str(res_info[1]))
                try:
                    post = asf.postResource(res_info[0], res_info[1], new_resource)
                    print(post)
                except Exception:
                    # Fix: was a bare except (would also swallow
                    # KeyboardInterrupt/SystemExit); the API failure is
                    # recorded in the report column as [ERROR].
                    print(
                        "Error: There was a problem posting resource "
                        + str(res_info[0])
                        + ":"
                        + str(res_info[1])
                        + "!"
                    )
                    langs_diff.append("[ERROR]")
            else:
                print("No new languages to add. Skipping.")

            the_new_row.append(",".join(langs_diff))

        # NOTE(review): rows whose bibid lookup fails pass through
        # unchanged — confirm against the original indentation.
        the_new_data.append(the_new_row)

    the_sheet.clear()
    the_sheet.appendData(the_new_data)
def main(): # Main code goes here. my_name = __file__ # This makes sure the script can be run from any working directory and still find related files. my_path = os.path.dirname(__file__) asf.setServer("Prod") the_sheet = dataSheet("1UQm7ffd1Kq4zqlzHZajd9YkwW1_nmOJFS1W7nI-c_Vk", "new-batch!A:Z") output_folder = os.path.join(my_path, "output/resource_collecting_area") the_rows = the_sheet.getData() the_new_rows = [] the_heads = the_rows.pop(0) the_new_rows.append(the_heads) coll_area_index = 8 # the column of collecting area for a_row in the_rows: the_new_row = a_row # print(a_row) coll = "" repo, asid = a_row[0], a_row[1] if len(a_row) >= coll_area_index: # if there is a collecting location to add coll = a_row[coll_area_index] the_resource = asf.getResource(repo, asid) out_path_old = (output_folder + "/" + str(repo) + "_" + str(asid) + "_old.json") out_path_new = (output_folder + "/" + str(repo) + "_" + str(asid) + "_new.json") # Save copy of existing object print("Saving data to " + out_path_old + "....") with open(out_path_old, "w+") as f: f.write(the_resource) the_data = json.loads(the_resource) fix = False if "user_defined" in the_data: the_user_defined = the_data["user_defined"] if "enum_4" in the_user_defined: print("Already has enum_4! Skipping.") else: fix = True the_user_defined["enum_4"] = coll the_data["user_defined"] = the_user_defined the_new_resource = json.dumps(the_data) # Save copy of new object print("Saving data to " + out_path_new + "....") with open(out_path_new, "w+") as f: f.write(the_new_resource) if fix == True: try: post = "[NONE]" post = asf.postResource(repo, asid, the_new_resource) print(post) except: print("Error: There was a problem posting resource " + str(repo) + ":" + str(asid) + "!") the_new_row.append(coll) else: print("ERROR: No user_defined data in " + str(repo) + ":" + str(asid)) the_new_rows.append(the_new_row) the_sheet.clear() the_sheet.appendData(the_new_rows) # print(the_new_rows) quit()
def main():
    """Append a standard access-restriction note to a batch of resources.

    Reads bibids from a local CSV, resolves each to (repo, asid), fetches
    the resource, saves before/after JSON snapshots locally, appends an
    on-site or off-site "Restrictions on Access" note (selected by the
    ``on_site`` flag), and posts the record back.
    """
    # Main code goes here.

    asf.setServer("Prod")

    on_site = False  # set to True to get on-site note, False to get off-site note. See the_access_note var below.

    output_folder = "output/resource_on-site_access"

    lookup_csv = "id_lookup_prod.csv"

    # bibid_file = (
    #     "/Users/dwh2128/Documents/ACFA/TEST/ACFA-224-onsite-notes/acfa-224-list_3.csv"
    # )
    bibid_file = (
        "/Users/dwh2128/Documents/ACFA/TEST/ACFA-243-off-site/acfa-243_off-site.csv"
    )

    # Read a list of bibids (csv)
    the_bibids = []
    with open(bibid_file) as ids:
        for row in csv.reader(ids):
            the_bibids.append(row[0])

    # The on-site and off-site notes differ only in their subnote text, so
    # build the note once from the selected content string (was: two
    # near-identical dict literals).
    if on_site:
        the_note_content = "This collection is located on-site."
    else:
        the_note_content = "This collection is located off-site. You will need to request this material at least three business days in advance to use the collection in the Rare Book and Manuscript Library reading room."

    the_access_note = {
        "jsonmodel_type": "note_multipart",
        "label": "Restrictions on Access",
        "type": "accessrestrict",
        "rights_restriction": {"local_access_restriction_type": []},
        "subnotes": [
            {
                "jsonmodel_type": "note_text",
                "content": the_note_content,
                "publish": True,
            }
        ],
        "publish": True,
    }

    for bib in the_bibids:

        try:
            repo, asid = asf.lookupByBibID(bib, lookup_csv)
        except Exception:
            # Fix: was a bare except; missing bibids are skipped with a
            # message rather than aborting the batch.
            print("Error: No record found for " + str(bib) + ". Skipping...")
            continue

        out_path_old = output_folder + "/" + str(repo) + "_" + str(asid) + "_old.json"
        out_path_new = output_folder + "/" + str(repo) + "_" + str(asid) + "_new.json"

        the_resource = asf.getResource(repo, asid)

        # Save copy of existing object
        print("Saving data to " + out_path_old + "....")
        with open(out_path_old, "w+") as f:
            f.write(the_resource)

        the_data = json.loads(the_resource)

        # Test if there is already an access restriction note.
        has_note = False
        for a_note in the_data["notes"]:
            try:
                if a_note["type"] == "accessrestrict":
                    has_note = True
            except KeyError:
                print("Note has no type -- skipping.")

        if has_note:
            print(str(bib) + " - Warning: Already has access note.")
        # NOTE: the note is appended even when one already exists — the
        # guard was deliberately commented out in the original (# else:).
        the_data["notes"].append(the_access_note)

        the_new_resource = json.dumps(the_data)

        # Save copy of new object
        print("Saving data to " + out_path_new + "....")
        with open(out_path_new, "w+") as f:
            f.write(the_new_resource)

        try:
            post = asf.postResource(repo, asid, the_new_resource)
            print(post)
        except Exception:
            # Fix: was a bare except; report and continue with next bibid.
            print(
                "Error: There was a problem posting resource "
                + str(repo)
                + ":"
                + str(asid)
                + "!"
            )

    quit()
# Script to get barcode and holding info from spreadsheet # and add to top containers in ArchivesSpace via API. See ACFA-206. import ASFunctions as asf import json from pprint import pprint from sheetFeeder import dataSheet import dcps_utils as util import os.path import csv import datetime asf.setServer('Prod') my_name = __file__ # This makes sure the script can be run from any working directory and still find related files. my_path = os.path.dirname(__file__) # sheet_id = '1gUx1cPS8POLxqRblYIs1vlpr7yDGOyHmAJqpl6nMo4k' sheet_id = '1e43qKYvqGQFOMxA70U59yPKPs18y-k3ohRNdU-qrTH0' # test # list_sheet = dataSheet(sheet_id, 'report!A:Z') list_sheet = dataSheet(sheet_id, 'test!A:Z') # test the_data = list_sheet.getData() the_heads = the_data.pop(0) today = datetime.date.today().strftime("%Y-%m-%d")
def main():
    """Daily reporter: harvest OAI MARC data and ArchivesSpace changes to Google Sheets.

    Three phases: (1) transform yesterday's OAI XML dump to CSV via XSLT and
    import it into the 'oai' sheet (preserving the previous run in
    'oai_last' for diffing); (2) list unpublished resources per repo to the
    'unpublished' sheet; (3) list resources/accessions created yesterday to
    the change sheets. A run summary is appended to the 'log' sheet and
    posted to the digest.
    """
    asf.setServer('Prod')  # AS instance: Prod | Dev | Test

    mode = 'Prod'  # Prod or Test

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    ########################
    ### PROCESS OAI DATA ###
    ########################

    # Set path to Saxon processor
    # saxon_path = os.path.join(my_path, "../../resources/saxon-9.8.0.12-he.jar")

    # XSLT file to generate report
    marc_xslt_file = os.path.join(my_path, '../xslt/marcDataExtract.xsl')

    if mode == 'Prod':
        # OAI XML file to use as source
        # source_dir='/cul/cul0/lito/libsys/voyager/prod/data/loads/AS_harvest'
        source_dir = '/cul/cul0/ldpd/archivesspace/oai'
        sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
        oai_file = source_dir + '/' + yest_str + '.asAllRaw.xml'
    else:
        # TEST: fixed date and local data/sheet so prod is untouched.
        yest_str = "20190915"
        # OAI XML file to use as source
        source_dir = '/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/cached_eads_20190912'  # local test
        sheet_id = '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ'
        oai_file = yest_str + '.asAllRaw.xml'

    the_sheets = {
        'oai': dataSheet(sheet_id, 'oai!A:Z'),
        'oai_last': dataSheet(sheet_id, 'oai_last!A:Z'),
        'log': dataSheet(sheet_id, 'log!A:Z')
    }

    the_outpath = os.path.join(my_path,
                               'output/' + yest_str + '.marc_reporter_out.xml')

    print(' ')

    # Copy oai current data to oai_last sheet for diff
    the_old_data = the_sheets['oai'].getData()
    the_sheets['oai_last'].clear()
    the_sheets['oai_last'].appendData(the_old_data)

    # Process OAI MARC and output to CSV
    util.saxon_process(oai_file, marc_xslt_file, the_outpath)

    # clear data from "new" sheet
    the_sheets['oai'].clear()

    # Send result csv to Google Sheet.
    y = the_sheets['oai'].importCSV(the_outpath, delim='|')

    print(' ')

    ########################
    ### PROCESS UNPUBLISHED ###
    ########################

    print('Finding unpublished records...')

    the_repos = [2, 3, 4, 5]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_heads = [
        'REPO', 'REPO_ID', 'RESOURCE_ID', 'TITLE', 'BIBID', 'CREATE_TIME',
        'SYSTEM_MTIME', 'LAST_MODIFIED_BY'
    ]

    unpubs_sheet = dataSheet(sheet_id, 'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # print(row)
            # 7th element is the full record JSON; pop it and mine it for
            # the call number.
            my_json = json.loads(row.pop(6))

            try:
                call_no = my_json['user_defined']['string_1']
            except:
                # Record has no user_defined/string_1 — treat as no call no.
                call_no = ''
            # get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid), row.insert(0, repo_id)

            # University Archives records get a fixed repo code.
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    # print('Total unpublished: ' + str(len(the_unpublished)))
    msg = 'Total unpublished: ' + str(len(the_unpublished))
    print(msg)
    digester.post_digest(script_name, msg)  # Test

    unpubs_sheet.clear()
    unpubs_sheet.appendData([the_heads])
    unpubs_sheet.appendData(the_unpublished)

    ########################
    ### GET NEWLY CREATED ###
    ########################

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:

        print('processing ' + d['filter'])

        the_delta_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]
        the_heads = [
            'repo', 'asid', 'title', 'identifier', 'create_time',
            'system_mtime', 'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:

            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='ctime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                # print(row)
                # get the repo from the uri string.
                repo = str(row[0].split('/')[-3]).rstrip()
                # get the asid from the uri string.
                asid = str(row[0].split('/')[-1]).rstrip()
                row.pop(0)
                row.insert(0, asid), row.insert(0, repo)

                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))
        digester.post_digest(
            script_name,
            'New ' + d['filter'] + ': ' + str(len(the_modifieds)))  # Test
        # Note: deltas are appended, not replacing sheet contents.
        # the_sheet.clear()
        # the_sheet.appendData([the_fields])
        the_delta_sheet.appendData(the_modifieds)

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = 'Data imported by ' + my_name + '. Start: ' + start_time + \
        '. Finished: ' + end_time + ' (duration: ' + my_duration + ').'

    the_sheets['log'].appendData([[the_log]])

    print(' ')

    print(the_log)

    digester.post_digest(script_name, the_log)  # Test

    print(' ')

    print('Script done. Updated data is available at ' +
          the_sheets['oai'].url)
def main():
    """Harvest all ArchivesSpace agent records and report them to Google Sheets.

    Fetches families, corporate entities, and person agents (via
    ``get_agent_data``, which also pickles the raw data), extracts selected
    fields with dpath into one sheet per agent type, and appends a run
    summary to the log sheet and the script log.
    """
    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    # Idiom fix: truth-value test instead of `is True` (PEP 8); DEBUG is a
    # module-level flag.
    if DEBUG:
        sheet_id = "18uvn9wIABHVIdjlSRNXqnHUKB2aTvZgKO62e-UFNuO8"  # test
    else:
        sheet_id = "1dTeMAK_cGWAUvrqvAiY2hGy4gJewrmWjnuIZu8NhWwE"

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    # First get the agent records from API (this can take a long time!)

    asf.setServer("Prod")  # AS instance: Prod | Dev | Test

    if DEBUG:
        out_folder = "/cul/cul0/ldpd/archivesspace/test/agents"
    else:
        out_folder = "/cul/cul0/ldpd/archivesspace/agents"

    family_agents_file = os.path.join(out_folder, "agents_families.pickle")
    corp_agents_file = os.path.join(out_folder, "agents_corporate.pickle")
    persons_agents_file = os.path.join(out_folder, "agents_persons.pickle")

    # One entry per agent type: API endpoint, target sheet, pickle path.
    the_info = [
        {
            "name": "families",
            "endpoint": "/agents/families",
            "sheet": dataSheet(sheet_id, "families!A:Z"),
            "pickle": family_agents_file
        },
        {
            "name": "corporate",
            "endpoint": "/agents/corporate_entities",
            "sheet": dataSheet(sheet_id, "corporate!A:Z"),
            "pickle": corp_agents_file
        },
        {
            "name": "persons",
            "endpoint": "/agents/people",
            "sheet": dataSheet(sheet_id, "persons!A:Z"),
            "pickle": persons_agents_file
        },
    ]

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["uri", "uri"],
        ["title", "title"],
        ["source", "names/0/source"],
        ["authority_id", "names/0/authority_id"],
        ["is_linked_to_published_record", "is_linked_to_published_record"],
        ["publish", "publish"],
        ["last_modified_by", "last_modified_by"],
        ["last_modified", "system_mtime"],
    ]

    the_record_cnts = {}

    if DEBUG:
        print("*** (DEBUG MODE) ***")

    for i in the_info:
        print("Getting agents: " + i["name"])
        agent_data = get_agent_data(i["name"], i["endpoint"], i["pickle"])

        print(" ")

        # Report the saved data to Google Sheet

        the_sheet = i["sheet"]

        the_heads = [x[0] for x in the_fields]
        the_output = [the_heads]

        the_record_cnts[i["name"]] = str(len(agent_data))

        for agent in agent_data:
            the_row = []
            # Use dpath to extract values from dict and compose into rows.
            for af in the_fields:
                try:
                    d = str(dpath.util.get(agent, af[1]))
                except Exception:
                    # Fix: was a bare except; a missing path yields "".
                    d = ""
                the_row.append(d)
            # print(the_row)
            the_output.append(the_row)

        the_sheet.clear()
        save = the_sheet.appendData(the_output)
        print(save)

    # Generate log
    print(the_record_cnts)
    # NOTE(review): joining a dict joins its keys only — looks like a
    # debug leftover; kept to preserve output.
    print(" ".join(the_record_cnts))

    cnt_str = "".join(k + "=" + v + ". " for k, v in the_record_cnts.items())
    # print(cnt_str)

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + MY_NAME + ". " + cnt_str + " Start: " +
               start_time + ". Finished: " + end_time + " (duration: " +
               my_duration + ").")

    log_range = "log!A:A"
    log_sheet = dataSheet(sheet_id, log_range)

    log_sheet.appendData([[the_log]])

    print(" ")

    print(the_log)

    log_it(SCRIPT_NAME, the_log)
    # digester.post_digest(SCRIPT_NAME, the_log)

    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        "https://docs.google.com/spreadsheets/d/" + \
        str(sheet_id) + "/edit?usp=sharing"

    print(exit_msg)
    log_it(SCRIPT_NAME, exit_msg)

    quit()
def main(): asf.setServer('Test') # Google sheet used for reporting changes. the_report_sheet = dataSheet( '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'resources!A:Z') id_file = 'resource_replacements.csv' output_folder = 'output/resource_replacements' # Read a list of repo and object ids (csv) the_ids = [] ids = open(id_file) for row in csv.reader(ids): the_ids.append([row[0], row[1]]) ids.close() # Search/replace patterns the_search_pattern = 'NCC' the_replace_pattern = 'NNC' the_before_afters = [] the_heads = ['repo', 'asid', 'before', 'after'] the_before_afters.append(the_heads) for an_obj in the_ids: out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[ 1] + '_old.json' # read from API x = asf.getResource(an_obj[0], an_obj[1]) # Save copy of existing object print('Saving data to ' + out_path + '....') f = open(out_path, "w+") f.write(x) f.close() x = json.loads(x) the_old_field_data = x['user_defined']['string_2'] y = x y['user_defined']['string_2'] = re.sub(the_search_pattern, the_replace_pattern, x['user_defined']['string_2']) if y['user_defined']['string_2'] == the_old_field_data: the_new_field_data = "[no change]" else: the_new_field_data = y['user_defined']['string_2'] the_before_afters.append([ an_obj[0], an_obj[1], '{string_2} ' + the_old_field_data, '{string_2} ' + the_new_field_data ]) # convert dict back to json for posting. z = json.dumps(y) # Post the fixed object back to API. post = asf.postResource(an_obj[0], an_obj[1], z) print(post) # Report changes to Google Sheet print('Writing before/after info to sheet...') the_report_sheet.clear() the_report_sheet.appendData(the_before_afters)