# The test snippets below assume module-level imports and fixtures along
# these lines (the exact fixture file names are assumptions):
import os
import datetime
from indra.sources import eidos
from indra.statements import Influence, Association, Event

path_this = os.path.dirname(os.path.abspath(__file__))
test_json = os.path.join(path_this, 'eidos_test.json')
test_jsonld = os.path.join(path_this, 'eidos_test.jsonld')


def test_process_json_ld_file():
    ep = eidos.process_json_file(test_jsonld)
    assert len(ep.statements) == 1
    st = ep.statements[0]
    assert 'UN' in st.subj.concept.db_refs
    assert 'UN' in st.obj.concept.db_refs
    ep = eidos.process_json_file(test_jsonld, grounding_ns=['UN'])
    st = ep.statements[0]
    assert set(st.subj.concept.db_refs.keys()) == {'TEXT', 'UN'}

def test_process_correlations():
    correl_jsonld = os.path.join(path_this, 'eidos_correlation.json')
    ep = eidos.process_json_file(correl_jsonld)
    assert len(ep.statements) == 1
    st = ep.statements[0]
    assert isinstance(st, Association)
    assert isinstance(st.members[0], Event)
    names = {m.concept.name for m in st.members}
    assert names == {'harvest', 'requirement'}, names
    # This is to check the extraction filter
    ep = eidos.process_json_file(correl_jsonld,
                                 extract_filter={'influence'})
    assert len(ep.statements) == 0

def test_geoloc_obj():
    se_jsonld = os.path.join(path_this, 'eidos_geoloc_obj.json')
    ep = eidos.process_json_file(se_jsonld)
    st = ep.statements[1]
    ev = st.evidence[0]
    assert not ev.context, ev.context
    assert st.obj.context

# The corpus-assembly snippets below additionally assume:
import glob
import json
import pickle
import tqdm
from indra.statements import stmts_to_json
from indra.belief.wm_scorer import get_eidos_scorer
from indra.tools import assemble_corpus as ac


def assemble_one_corpus():
    """For assembling one of the four corpora."""
    path = '/home/bmg16/data/wm/2-Jsonld'
    corpus_size = '16k'
    prefix = '%s%s' % (path, corpus_size)
    fnames = glob.glob('%s/*.jsonld' % prefix)  # For large corpus
    all_statements = []
    for idx, fname in enumerate(fnames):
        ep = eidos.process_json_file(fname)
        for stmt in ep.statements:
            for ev in stmt.evidence:
                ev.annotations['provenance'][0]['document']['@id'] = \
                    os.path.basename(fname)
        all_statements += ep.statements
        print('%d: %d' % (idx, len(all_statements)))
    with open('%s/3-Indra%s.pkl' % (prefix, corpus_size), 'wb') as fh:
        pickle.dump(all_statements, fh)
    scorer = get_eidos_scorer()
    assembled_stmts = ac.run_preassembly(all_statements,
                                         belief_scorer=scorer,
                                         return_toplevel=False)
    jd = stmts_to_json(assembled_stmts, use_sbo=False)
    with open('%s/3-Indra%s.json' % (prefix, corpus_size), 'w') as fh:
        json.dump(jd, fh, indent=1)

def test_process_negation_hedging():
    nh_jsonld = os.path.join(path_this, 'eidos_neg_hedge.json')
    ep = eidos.process_json_file(nh_jsonld)
    assert len(ep.statements) == 1
    st = ep.statements[0]
    epi = st.evidence[0].epistemics
    assert epi.get('hedgings') == ['may'], epi
    assert epi.get('negated') is True, epi
    annot = st.evidence[0].annotations
    assert annot.get('negated_texts') == ['not']

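# Follow-on sketch (hypothetical helper, not part of the tests): drop
# statements whose evidence is negated or hedged, using the epistemics
# fields asserted above.
def filter_negated_hedged(stmts):
    kept = []
    for stmt in stmts:
        epi = stmt.evidence[0].epistemics or {}
        if not epi.get('negated') and not epi.get('hedgings'):
            kept.append(stmt)
    return kept
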
def test_process_corefs():
    coref_jsonld = os.path.join(path_this, 'eidos_coref.json')
    ep = eidos.process_json_file(coref_jsonld)
    assert ep.doc.coreferences.get('_:Extraction_6') == '_:Extraction_4'
    assert len(ep.statements) == 2
    # Get a summary of subj/objs from statements
    concepts = [(s.subj.concept.name, s.obj.concept.name)
                for s in ep.statements]
    assert ('rainfall', 'flood') in concepts, concepts
    # This ensures that the coreference was successfully resolved
    assert ('flood', 'displacement') in concepts, concepts

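# The loader functions below call fix_provenance, which is not defined in
# this file. A plausible sketch (hypothetical implementation, modeled on
# how the other scripts here set provenance inline):
def fix_provenance(stmts, doc_id):
    """Set the document ID in each evidence's provenance annotation."""
    for stmt in stmts:
        for ev in stmt.evidence:
            prov = ev.annotations.get('provenance')
            if prov:
                prov[0]['document']['@id'] = doc_id
            else:
                ev.annotations['provenance'] = \
                    [{'document': {'@id': doc_id}}]
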
# The loader functions here assume module-level `logger` and `data_path`
# definitions from their original script.
def load_eidos():
    logger.info('Loading Eidos statements')
    fnames = glob.glob(os.path.join(data_path, 'eidos/jsonldDir/*.jsonld'))
    stmts = []
    for fname in fnames:
        doc_id = os.path.basename(fname).split('.')[0]
        ep = eidos.process_json_file(fname)
        fix_provenance(ep.statements, doc_id)
        stmts += ep.statements
    logger.info(f'Loaded {len(stmts)} statements from Eidos')
    return stmts

def process_eidos_un():
    print('Processing Eidos output for UN corpus')
    fnames = sorted(glob.glob('/Users/ben/data/wm/2-Jsonld16k/*.jsonld'))
    stmts = []
    for fname in tqdm.tqdm(fnames):
        ep = eidos.process_json_file(fname)
        for stmt in ep.statements:
            for ev in stmt.evidence:
                doc_id = os.path.splitext(os.path.basename(fname))[0]
                ev.annotations['provenance'][0]['document']['@id'] = doc_id
            stmts.append(stmt)
    return stmts

def test_standalone_event():
    se_jsonld = os.path.join(path_this, 'eidos_standalone_event.json')
    ep = eidos.process_json_file(se_jsonld)
    assert len(ep.statements) == 1
    st = ep.statements[0]
    assert isinstance(st, Event)
    assert hasattr(st, 'evidence')
    ev = st.evidence[0]
    assert ev.text is not None
    js = st.to_json()
    assert js['evidence']
    from indra.statements import stmts_to_json
    js2 = stmts_to_json([st])[0]
    assert 'evidence' in js2

def test_process_json():
    ep = eidos.process_json_file(test_json)
    assert ep is not None
    assert len(ep.statements) == 1
    stmt = ep.statements[0]
    assert isinstance(stmt, Influence)
    assert stmt.subj_delta.get('polarity') == 1
    assert stmt.obj_delta.get('polarity') == -1
    assert stmt.subj_delta.get('adjectives') == ['large']
    assert stmt.obj_delta.get('adjectives') == ['seriously']
    assert stmt.evidence[0].annotations['found_by'] == \
        'ported_syntax_1_verb-Causal'
    print(stmt)

def test_process_timex():
    timex_jsonld = os.path.join(path_this, 'eidos_timex.json')
    ep = eidos.process_json_file(timex_jsonld)
    assert len(ep.statements) == 1
    ev = ep.statements[0].evidence[0]
    assert ev.context is not None
    assert ev.context.__repr__() == ev.context.__str__()
    assert ev.context.time.duration == 365 * 86400, ev.context.time.duration
    assert ev.context.time.start == \
        datetime.datetime(year=2018, month=1, day=1, hour=0, minute=0), \
        ev.context.time.start
    assert ev.context.time.end == \
        datetime.datetime(year=2019, month=1, day=1, hour=0, minute=0), \
        ev.context.time.end

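# Follow-on sketch (hypothetical helper): select statements whose
# evidence starts within a given year, using the time context attributes
# exercised above and assuming context/time may be missing.
def stmts_starting_in_year(stmts, year):
    out = []
    for stmt in stmts:
        for ev in stmt.evidence:
            time = getattr(ev.context, 'time', None) if ev.context else None
            if time and time.start and time.start.year == year:
                out.append(stmt)
                break
    return out
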
def test_process_polarity():
    test_jsonld = os.path.join(path_this, 'eidos_neg_event.json')
    ep = eidos.process_json_file(test_jsonld)
    assert ep is not None
    assert len(ep.statements) == 1
    stmt = ep.statements[0]
    assert isinstance(stmt, Influence)
    assert stmt.subj.concept.name == 'fuel', stmt.subj.concept.name
    assert stmt.obj.concept.name == 'water trucking', stmt.obj.concept.name
    assert stmt.obj.delta.polarity == -1
    assert stmt.evidence[0].annotations['found_by'] == \
        'ported_syntax_1_verb-Causal'
    assert 'TEXT' in stmt.subj.concept.db_refs
    assert 'TEXT' in stmt.obj.concept.db_refs

def test_process_geoids():
    geo_jsonld = os.path.join(path_this, 'eidos_geoid.json')
    ep = eidos.process_json_file(geo_jsonld)
    # Make sure we collect all geoids up front
    ss_loc = {'name': 'South Sudan', 'db_refs': {'GEOID': '7909807'}}
    assert len(ep.doc.geolocs) == 5, len(ep.doc.geolocs)
    assert ep.doc.geolocs['_:GeoLocation_1'].to_json() == ss_loc
    # Make sure this event has the right geoid
    assert isinstance(ep.statements[0], Influence)
    ev = ep.statements[1].evidence[0]
    assert ev.context.geo_location.to_json() == ss_loc
    # And that the subject context is captured in annotations
    assert 'subj_context' in ev.annotations, ev.annotations
    assert ev.annotations['subj_context']['geo_location'] == ss_loc

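# Follow-on sketch (hypothetical helper): collect the GEOIDs attached to
# evidence context, based on the geolocation layout asserted above.
def evidence_geoids(stmts):
    geoids = set()
    for stmt in stmts:
        for ev in stmt.evidence:
            loc = getattr(ev.context, 'geo_location', None) \
                if ev.context else None
            if loc and loc.db_refs.get('GEOID'):
                geoids.add(loc.db_refs['GEOID'])
    return geoids
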
def load_eidos(limit=None, cached=True):
    logger.info('Loading Eidos statements')
    pkl_name = os.path.join(data_path, 'eidos', 'stmts.pkl')
    if cached:
        if os.path.exists(pkl_name):
            with open(pkl_name, 'rb') as fh:
                stmts = pickle.load(fh)
            logger.info(f'Loaded {len(stmts)} statements')
            return stmts
    fnames = glob.glob(os.path.join(data_path, 'eidos/jsonldDir/*.jsonld'))
    stmts = []
    for fname in tqdm.tqdm(fnames[:limit]):
        doc_id = os.path.basename(fname).split('.')[0]
        ep = eidos.process_json_file(fname)
        fix_provenance(ep.statements, doc_id)
        stmts += ep.statements
    logger.info(f'Loaded {len(stmts)} statements from Eidos')
    with open(pkl_name, 'wb') as fh:
        pickle.dump(stmts, fh)
    return stmts

#def assemble_all():
if __name__ == '__main__':
    corpora = {
        #'50': '/home/bmg16/Dropbox/postdoc/darpa/src/indra_apps/' +
        #      'wm_fao/20181101/2-Jsonld50',
        '500': '/home/bmg16/Dropbox/postdoc/darpa/src/indra_apps/' +
               'wm_fao/20181101/2-Jsonld500',
        '16k': '/home/bmg16/data/wm/2-Jsonld16k',
    }
    all_statements = []
    for corpus_size, path in corpora.items():
        fnames = glob.glob('%s/*.jsonld' % path)
        for idx, fname in enumerate(fnames):
            ep = eidos.process_json_file(fname)
            for stmt in ep.statements:
                for ev in stmt.evidence:
                    ev.annotations['provenance'][0]['document']['@id'] = \
                        os.path.basename(fname)
                    ev.annotations['provenance'][0]['document']['corpus'] = \
                        corpus_size
            all_statements += ep.statements
            print('%d: %d' % (idx, len(all_statements)))
    scorer = get_eidos_scorer()
    assembled_stmts = ac.run_preassembly(all_statements,
                                         belief_scorer=scorer,
                                         return_toplevel=False)
    jd = stmts_to_json(assembled_stmts, use_sbo=False)

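# The multi-reader script below assumes the other reader modules are
# imported and that reader_versions maps grounding mode -> reader name ->
# version string; the version values here are placeholders, not real ones.
from indra.sources import cwms, hume, sofia

reader_versions = {
    'flat': {'sofia': '...', 'eidos': '...', 'hume': '...', 'cwms': '...'},
    'compositional': {'sofia': '...', 'eidos': '...', 'hume': '...',
                      'cwms': '...'},
}
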
if __name__ == '__main__':
    readers = ['sofia', 'eidos', 'hume', 'cwms']
    grounding = 'compositional'
    do_upload = False
    stmts = []
    for reader in readers:
        version = reader_versions[grounding][reader]
        pattern = '*' if reader != 'sofia' \
            else ('*_new' if grounding == 'compositional' else '*_old')
        fnames = glob.glob('/Users/ben/data/dart/%s/%s/%s' %
                           (reader, version, pattern))
        print('Found %d files for %s' % (len(fnames), reader))
        for fname in tqdm.tqdm(fnames):
            if reader == 'eidos':
                pp = eidos.process_json_file(fname, grounding_mode=grounding)
            elif reader == 'hume':
                pp = hume.process_jsonld_file(fname, grounding_mode=grounding)
            elif reader == 'cwms':
                pp = cwms.process_ekb_file(fname, grounding_mode=grounding)
            elif reader == 'sofia':
                pp = sofia.process_json_file(fname, grounding_mode=grounding)
            doc_id = os.path.basename(fname)[:32]
            for stmt in pp.statements:
                for ev in stmt.evidence:
                    if 'provenance' not in ev.annotations:
                        ev.annotations['provenance'] = [{
                            'document': {'@id': doc_id}}]
            # Collect the statements from this reader
            stmts += pp.statements

def test_compositional_grounding():
    jsonld = os.path.join(path_this, 'eidos_compositional.jsonld')
    ep = eidos.process_json_file(jsonld, grounding_mode='compositional')
    assert ep.statements

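# Follow-on sketch for inspecting the result: in compositional grounding
# mode the 'WM' entry of db_refs is assumed to hold a list of scored
# compositional tuples (theme, theme property, process, process property),
# each slot either None or a (grounding, score) pair; verify this layout
# against the INDRA documentation before relying on it.
def print_top_wm_grounding(concept):
    wm = concept.db_refs.get('WM')
    if not wm:
        return
    slots = ('theme', 'theme property', 'process', 'process property')
    for slot, entry in zip(slots, wm[0]):
        if entry is not None:
            print('%s: %s (score %.2f)' % (slot, entry[0], entry[1]))
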