Example #1
import datetime
import json
import os
import time

import dateutil.parser as dparser

import metadata_funs  # project-local helper module used throughout these examples


def filter_ids():
    """Return dataset ids whose string-search results were generated within the last 30 days."""
    search_path = '/Users/sophierand/RichContextMetadata/metadata/'
    pub_paths = [
        search_path + f for f in os.listdir(search_path)
        if f.endswith('stringsearch_pubs.json')
    ]
    new_list = []  # was referenced below without ever being initialized
    for p in pub_paths:
        # recover the file's creation date to measure how stale each run is
        file_date = dparser.parse(time.ctime(os.path.getctime(p)),
                                  fuzzy=True).date()
        with open(p) as json_file:
            ss_json = json.load(json_file)
        a = [{
            'ds_name': s['related_dataset_name'],
            'ds_id': s['related_dataset'],
            'linkage_source': s['linkage_source'],
            'file_run': file_date,
            'time_since_run': abs((file_date - datetime.datetime.now().date()).days)
        } for s in ss_json]
        # deduplicate dicts by freezing their items into hashable tuples
        b = [dict(t) for t in {tuple(d.items()) for d in a}]
        new_list.append(b)
    ss_pub_list_flat = metadata_funs.flatten(new_list)
    b_exclude = list({
        d['ds_id'] for d in ss_pub_list_flat if d['time_since_run'] <= 30
    })
    return b_exclude
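The deduplication step in filter_ids works because a dict's items can be frozen into a hashable tuple and dropped into a set; a minimal standalone sketch of the same idiom (the record values here are made up):

records = [
    {'ds_id': 'dataset-01', 'time_since_run': 3},
    {'ds_id': 'dataset-01', 'time_since_run': 3},  # exact duplicate
    {'ds_id': 'dataset-02', 'time_since_run': 45},
]
# freeze each dict into a tuple of items so it can live in a set,
# then thaw the survivors back into dicts
unique = [dict(t) for t in {tuple(d.items()) for d in records}]
assert len(unique) == 2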
def collate_pubs(pub_paths):
    """Load each publication JSON file and return one flattened list of records."""
    pub_list = []
    for p in pub_paths:
        with open(p) as json_file:
            data = json.load(json_file)
        pub_list.append(data)
    pub_list_flat = metadata_funs.flatten(pub_list)
    return pub_list_flat
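collate_pubs, like the other examples here, leans on metadata_funs.flatten, which is project-local and not shown; a plausible stand-in, assuming it simply collapses one level of nesting:

def flatten(nested):
    # hypothetical stand-in for metadata_funs.flatten: collapse a
    # list of lists into a single flat list
    return [item for sublist in nested for item in sublist]

assert flatten([[1, 2], [3]]) == [1, 2, 3]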
Example #3
def main(api_client):
    """Run the string search for every dataset and write the results to a timestamped file."""
    dataset_names = metadata_funs.read_datasets()
    dataset_names_list = [{
        'dataset_name': d['title'],
        'dataset_id': d['dataset_id']
    } for d in dataset_names]
    #     dataset_names_list = [d for d in dataset_names_list if d['dataset_id'] in ['dataset-f442e418ac191ac60f7f','dataset-01bf466ee1063265fc2c']]
    # hash the current timestamp so each run writes a uniquely named file
    stringsearch_pubs_path = os.path.join(
        os.getcwd(), 'metadata/{}stringsearch_pubs.json'.format(
            metadata_funs.get_hash(str(datetime.datetime.now()))))
    stringsearch_pubs = gen_stringsearch_pub_metadata(
        api_client=api_client, dataset_names_list=dataset_names_list)
    pub_dataset_list_final = metadata_funs.flatten(stringsearch_pubs)
    with open(stringsearch_pubs_path, 'w') as out_file:
        json.dump(pub_dataset_list_final, out_file, indent=2)
    return pub_dataset_list_final
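A minimal driver for this example, assuming an API client created the same way Example #5 does it:

if __name__ == '__main__':
    api_client = metadata_funs.create_api_client()
    pubs = main(api_client)
    print('wrote {} publication records'.format(len(pubs)))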
Example #4
def gen_ss_dyad(dataset_name_dict, api_client):
    """
    Take a dataset dictionary holding a dataset_id and a list of names
    (the name plus any aliases), run each name through
    return_string_search_dyads, and return the publication metadata.
    """

    dataset_names_list = dataset_name_dict['dataset_name']
    dataset_id = dataset_name_dict['dataset_id']
    store_dyads = []
    for ds in dataset_names_list:
        pub_dataset_dyads = return_string_search_dyads(dataset_string=ds,
                                                       api_client=api_client)
        store_dyads.append(pub_dataset_dyads)
    store_dyads_flat = metadata_funs.flatten(store_dyads)
    for s in store_dyads_flat:
        s.update({
            'related_dataset': dataset_id,
            'linkage_source': 'dataset_stringsearch'
        })
    return store_dyads_flat
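A hedged usage sketch; the input shape (one id, a list of name variants) follows the docstring, and both the names and the dataset id below are invented:

example_input = {
    'dataset_name': ['Current Population Survey', 'CPS'],  # name plus alias
    'dataset_id': 'dataset-0000deadbeef',                  # hypothetical id
}
dyads = gen_ss_dyad(dataset_name_dict=example_input, api_client=api_client)
# every returned dyad is tagged with the dataset id and its linkage source
assert all(d['linkage_source'] == 'dataset_stringsearch' for d in dyads)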
Example #5
    """
    big_list = []
    try:
        for d in ds_names:
            print('looking for ', d, ' now')
            # a = 'i would do a string search with {}'.format(d)
            a = gen_ss_dyad(dataset_name_dict=d, api_client=api_client)
            big_list.append(a)
            time.sleep(1)
        return big_list
    except KeyboardInterrupt:
        return big_list


api_client = metadata_funs.create_api_client()
dataset_names = metadata_funs.read_datasets()
ds_names = gen_ds_names(dataset_names)
exclude_ids = filter_ids()
# ds_names_lim = [d for d  in ds_names if d['dataset_id'] in ['dataset-b48654a3feb4deaaa272','dataset-53fcd9fbd727f01baad3']]
ds_names_lim = [d for d in ds_names if d['dataset_id'] not in exclude_ids]
# print('excluding these ids',exclude_ids)
# print('searching for these ids', ds_names_lim)
big_list = gen_dyad_list(ds_names=ds_names_lim)
# print(big_list)
final_list = metadata_funs.flatten(big_list)
stringsearch_pubs_path = os.path.join(
    os.getcwd(), 'metadata/{}stringsearch_pubs.json'.format(
        metadata_funs.get_hash(str(datetime.datetime.now()))))

with open(stringsearch_pubs_path, 'w') as out_file:
    json.dump(final_list, out_file, indent=2)
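To sanity-check a run, the freshly written file can be read straight back; a small follow-up sketch using the path computed above:

with open(stringsearch_pubs_path) as f:
    written = json.load(f)
print('wrote {} dyads to {}'.format(len(written), stringsearch_pubs_path))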