import datetime
import json
import os
import time

import dateutil.parser as dparser

import metadata_funs


def filter_ids():
    """Return ids of datasets whose string-search output was generated
    within the last 30 days, so they can be skipped on this run."""
    search_path = '/Users/sophierand/RichContextMetadata/metadata/'
    pub_paths = [
        search_path + f for f in os.listdir(search_path)
        if f.endswith('stringsearch_pubs.json')
    ]
    new_list = []
    for p in pub_paths:
        # Treat the file's creation time as the date of that search run.
        file_date = dparser.parse(time.ctime(os.path.getctime(p)),
                                  fuzzy=True).date()
        with open(p) as json_file:
            ss_json = json.load(json_file)
        a = [{
            'ds_name': s['related_dataset_name'],
            'ds_id': s['related_dataset'],
            'linkage_source': s['linkage_source'],
            'file_run': file_date,
            'time_since_run': abs(
                (file_date - datetime.datetime.now().date()).days)
        } for s in ss_json]
        # Deduplicate: dicts aren't hashable, so round-trip through tuples.
        b = [dict(t) for t in {tuple(d.items()) for d in a}]
        new_list.append(b)
    ss_pub_list_flat = metadata_funs.flatten(new_list)
    b_exclude = list({
        d['ds_id']
        for d in ss_pub_list_flat if d['time_since_run'] <= 30
    })
    return b_exclude
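# For reference: filter_ids() assumes each *stringsearch_pubs.json file holds
# a list of records carrying at least the keys read above. Illustrative
# record (values are hypothetical):
#
# [
#   {
#     "related_dataset_name": "Example Dataset",
#     "related_dataset": "dataset-f442e418ac191ac60f7f",
#     "linkage_source": "dataset_stringsearch"
#   }
# ]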
def collate_pubs(pub_paths):
    pub_list = []
    for p in pub_paths:
        with open(p) as json_file:
            data = json.load(json_file)
        pub_list.append(data)
    pub_list_flat = metadata_funs.flatten(pub_list)
    return pub_list_flat
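# Example usage (hypothetical), collating every prior run from the same
# directory that filter_ids() scans:
#
# search_path = '/Users/sophierand/RichContextMetadata/metadata/'
# pubs = collate_pubs([
#     os.path.join(search_path, f) for f in os.listdir(search_path)
#     if f.endswith('stringsearch_pubs.json')
# ])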
def main(api_client):
    dataset_names = metadata_funs.read_datasets()
    dataset_names_list = [{
        'dataset_name': d['title'],
        'dataset_id': d['dataset_id']
    } for d in dataset_names]
    # dataset_names_list = [d for d in dataset_names_list if d['dataset_id'] in ['dataset-f442e418ac191ac60f7f','dataset-01bf466ee1063265fc2c']]
    stringsearch_pubs_path = os.path.join(
        os.getcwd(), 'metadata/{}stringsearch_pubs.json'.format(
            metadata_funs.get_hash(str(datetime.datetime.now()))))
    stringsearch_pubs = gen_stringsearch_pub_metadata(
        api_client=api_client, dataset_names_list=dataset_names_list)
    pub_dataset_list_final = metadata_funs.flatten(stringsearch_pubs)
    with open(stringsearch_pubs_path, 'w') as out_file:
        json.dump(pub_dataset_list_final, out_file, indent=2)
    return pub_dataset_list_final
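# Example (hypothetical) invocation of main(), mirroring the module-level
# setup at the bottom of this script:
#
# client = metadata_funs.create_api_client()
# pubs = main(api_client=client)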
def gen_ss_dyad(dataset_name_dict, api_client):
    """
    Take a dataset dictionary holding a dataset_id and a list of names
    (name plus aliases), run each name through return_string_search_dyads,
    and return the publication metadata tagged with the dataset id.
    """
    dataset_names_list = dataset_name_dict['dataset_name']
    dataset_id = dataset_name_dict['dataset_id']
    store_dyads = []
    for ds in dataset_names_list:
        pub_dataset_dyads = return_string_search_dyads(
            dataset_string=ds, api_client=api_client)
        store_dyads.append(pub_dataset_dyads)
    store_dyads_flat = metadata_funs.flatten(store_dyads)
    for s in store_dyads_flat:
        s.update({
            'related_dataset': dataset_id,
            'linkage_source': 'dataset_stringsearch'
        })
    return store_dyads_flat
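# Note: return_string_search_dyads is defined elsewhere in this codebase.
# gen_ss_dyad only assumes it returns a list of dicts (each record is
# mutated in place via .update() above); the 'related_dataset_name' key
# that filter_ids() later reads must therefore originate there.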
""" big_list = [] try: for d in ds_names: print('looking for ', d, ' now') # a = 'i would do a string search with {}'.format(d) a = gen_ss_dyad(dataset_name_dict=d, api_client=api_client) big_list.append(a) time.sleep(1) return big_list except KeyboardInterrupt: return big_list api_client = metadata_funs.create_api_client() dataset_names = metadata_funs.read_datasets() ds_names = gen_ds_names(dataset_names) exclude_ids = filter_ids() # ds_names_lim = [d for d in ds_names if d['dataset_id'] in ['dataset-b48654a3feb4deaaa272','dataset-53fcd9fbd727f01baad3']] ds_names_lim = [d for d in ds_names if d['dataset_id'] not in exclude_ids] # print('excluding these ids',exclude_ids) # print('searching for these ids', ds_names_lim) big_list = gen_dyad_list(ds_names=ds_names_lim) # print(big_list) final_list = metadata_funs.flatten(big_list) stringsearch_pubs_path = os.path.join( os.getcwd(), 'metadata/{}stringsearch_pubs.json'.format( metadata_funs.get_hash(str(datetime.datetime.now())))) json.dump(final_list, open(stringsearch_pubs_path, 'w'), indent=2)