def load_hume(cached=True):
    """Load Hume influence statements, preferring a cached pickle.

    Parameters
    ----------
    cached : bool
        If True and the pickle exists, load statements from it instead of
        reprocessing the JSON-LD files.

    Returns
    -------
    list
        INDRA Statements extracted from the Hume reader output.
    """
    logger.info('Loading Hume statements')
    pkl_name = os.path.join(data_path, 'hume', 'stmts_influence.pkl')
    # Fast path: reuse the previously pickled statements if allowed.
    if cached and os.path.exists(pkl_name):
        with open(pkl_name, 'rb') as fh:
            stmts = pickle.load(fh)
        logger.info(f'Loaded {len(stmts)} statements')
        return stmts
    # Otherwise process every JSON-LD file across the three Hume dumps.
    subdirs = ['wm_dart.101119.121619',
               'wm_factiva.121019.121619',
               'wm_luma.121019.121619']
    fnames = []
    for subdir in subdirs:
        fnames += glob.glob(os.path.join(data_path, 'hume', subdir,
                                         '*.json-ld'))
    stmts = []
    for fname in tqdm.tqdm(fnames):
        hp = hume.process_jsonld_file(fname)
        stmts += hp.statements
    logger.info(f'Loaded {len(stmts)} statements from Hume')
    # Cache the result for subsequent calls.
    with open(pkl_name, 'wb') as fh:
        pickle.dump(stmts, fh)
    return stmts
def load_hume():
    """Load Hume statements from the wm_dart v3 JSON-LD dump.

    Returns
    -------
    list
        INDRA Statements extracted from the Hume reader output.
    """
    logger.info('Loading Hume statements')
    pattern = os.path.join(data_path, 'hume/wm_dart.082919.v3.json-ld')
    stmts = []
    for fname in glob.glob(pattern):
        processed = hume.process_jsonld_file(fname)
        stmts.extend(processed.statements)
    logger.info(f'Loaded {len(stmts)} statements from Hume')
    return stmts
def read_hume(fname, version='new'):
    """Process a Hume output file, dropping a known duplicate document.

    Parameters
    ----------
    fname : str
        Path to the Hume output file.
    version : str
        'new' to use the JSON-LD processor, anything else to fall back
        to the old JSON processor.

    Returns
    -------
    list
        Statements, excluding those from document ENG_NW_20171205.
    """
    if version == 'new':
        bp = hume.process_jsonld_file(fname)
        # Remap doc names - only needed if original doc names should be
        # reconstructed
        # for stmt in bp.statements:
        #     stmt.evidence[0].pmid = stmt.evidence[0].pmid[:-4]
    else:
        bp = hume.process_json_file_old(fname)
    # ENG_NW_20171205 is a duplicated document that would otherwise
    # produce artifactual duplicate statements; filter it out.
    return [
        stmt for stmt in bp.statements
        if stmt.evidence[0].annotations['provenance'][0]['document']['@id']
        != 'ENG_NW_20171205'
    ]
def load_hume(cached=True):
    """Load Hume intervention statements, preferring a cached pickle.

    Parameters
    ----------
    cached : bool
        If True and the pickle exists, load statements from it instead of
        reprocessing the JSON-LD files.

    Returns
    -------
    list
        INDRA Statements extracted from the Hume reader output.
    """
    logger.info('Loading Hume statements')
    pkl_name = os.path.join(data_path, 'hume', 'stmts_intervention.pkl')
    # Fast path: reuse previously pickled statements if allowed.
    if cached and os.path.exists(pkl_name):
        with open(pkl_name, 'rb') as fh:
            stmts = pickle.load(fh)
        logger.info(f'Loaded {len(stmts)} statements')
        return stmts
    # Otherwise process each JSON-LD file in the intervention dump.
    fnames = glob.glob(
        os.path.join(data_path, 'hume', 'wm_thanksgiving_intervention.030920',
                     '*.json-ld'))
    stmts = []
    for fname in tqdm.tqdm(fnames):
        processed = hume.process_jsonld_file(fname)
        stmts.extend(processed.statements)
    logger.info(f'Loaded {len(stmts)} statements from Hume')
    # Cache the result for subsequent calls.
    with open(pkl_name, 'wb') as fh:
        pickle.dump(stmts, fh)
    return stmts
def process_hume():
    """Process the hard-coded Hume JSON-LD output and return statements."""
    print('Processing Hume output')
    # Earlier runs used 'docs/hume/wm_m12.v8.full.v4.json-ld'
    jsonld_path = 'docs/hume/wm_m12.v11.500doc.after.json-ld'
    processed = hume.process_jsonld_file(jsonld_path)
    return processed.statements
# Script chunk: collect INDRA Statements from local DART dumps of the four
# World Modelers readers (Sofia, Eidos, Hume, CWMS), attaching document
# provenance where it is missing.
readers = ['sofia', 'eidos', 'hume', 'cwms']
grounding = 'compositional'
do_upload = False
stmts = []
for reader in readers:
    # reader_versions maps grounding mode -> reader -> version string;
    # defined elsewhere in the file -- TODO confirm
    version = reader_versions[grounding][reader]
    # Sofia dumps are split by grounding mode into *_new / *_old folders;
    # the other readers keep everything in one place.
    pattern = '*' if reader != 'sofia' \
        else ('*_new' if grounding == 'compositional' else '*_old')
    fnames = glob.glob('/Users/ben/data/dart/%s/%s/%s' % (reader, version,
                                                          pattern))
    print('Found %d files for %s' % (len(fnames), reader))
    for fname in tqdm.tqdm(fnames):
        # Dispatch to the reader-specific processor.
        if reader == 'eidos':
            pp = eidos.process_json_file(fname, grounding_mode=grounding)
        elif reader == 'hume':
            pp = hume.process_jsonld_file(fname, grounding_mode=grounding)
        elif reader == 'cwms':
            pp = cwms.process_ekb_file(fname, grounding_mode=grounding)
        elif reader == 'sofia':
            pp = sofia.process_json_file(fname, grounding_mode=grounding)
        # The first 32 characters of the file name serve as the DART
        # document id.
        doc_id = os.path.basename(fname)[:32]
        for stmt in pp.statements:
            for ev in stmt.evidence:
                if 'provenance' not in ev.annotations:
                    # No provenance at all: create one pointing at doc_id.
                    ev.annotations['provenance'] = [{
                        'document': {
                            '@id': doc_id
                        }
                    }]
                else:
                    # NOTE(review): 'prov' is assigned but never used, and
                    # 'stmts' is never appended to in this chunk — the
                    # source appears truncated here. Presumably the intent
                    # was prov['@id'] = doc_id; confirm against the full
                    # file before relying on this branch.
                    prov = ev.annotations['provenance'][0]['document']
def do_regrounding(stmts):
    """Reground statement concepts via the Eidos regrounding service.

    Collects the 'TEXT' entry of each concept's db_refs across all
    statements, sends them to the regrounding service in one batch, and
    writes the results back into each concept's 'UN' grounding in place.

    Parameters
    ----------
    stmts : list
        INDRA Statements whose concepts should be regrounded.

    Returns
    -------
    list
        The same statement list, with concepts regrounded in place.
    """
    concepts = []
    for stmt in stmts:
        for concept in stmt.agent_list():
            concept_txt = concept.db_refs.get('TEXT')
            concepts.append(concept_txt)
    groundings = er.reground_texts(concepts)
    # Update the corpus with new groundings; groundings are returned in
    # the same flat order the concept texts were collected in.
    idx = 0
    for stmt in stmts:
        for concept in stmt.agent_list():
            concept.db_refs['UN'] = groundings[idx]
            idx += 1
    return stmts


if __name__ == '__main__':
    config = load_config()
    fnames = config['files']
    for fname in fnames:
        print('Processing %s' % fname)
        hp = hume.process_jsonld_file(fname)
        # Output name: <parent_dir>_<file>, with .json-ld -> .json
        parts = fname.split('/')
        new_fname = '%s_%s' % (parts[-2], parts[-1])
        new_fname = new_fname.replace('json-ld', 'json')
        print('Running regrounding')
        stmts = do_regrounding(hp.statements)
        # Fix: save the regrounded result (was hp.statements) and correct
        # the 'Savig' typo in the progress message.
        print('Saving into JSON')
        stmts_to_json_file(stmts, new_fname)