def _drop_redundant_sentences(sentences):
    """Return the sentences with duplicates and contained sentences removed.

    A sentence is dropped when an identical sentence occurs later in the
    list (so exactly one copy of each repeated sentence survives, the last
    one) or when it is a substring of a different sentence in the list.
    The relative order of the surviving sentences is preserved.
    """
    kept = []
    for idx, sentence in enumerate(sentences):
        repeated_later = sentence in sentences[idx + 1:]
        part_of_other = any(sentence != other and sentence in other
                            for other in sentences)
        if not (repeated_later or part_of_other):
            kept.append(sentence)
    return kept


def extract_eidos_text(docnames):
    """Collect the non-redundant evidence sentences for each document.

    For every document name, 'docs/<docname>.txt' is opened and read, the
    Eidos output in 'eidos/<docname>.txt.jsonld' is processed, and the text
    of every evidence of every extracted statement is collected with
    newlines replaced by spaces. Sentences that repeat, or that are
    contained in another sentence of the same document, are then removed.

    Parameters
    ----------
    docnames : iterable of str
        Document names, without directory or extension.

    Returns
    -------
    dict
        Maps each document name to its list of evidence sentences.
    """
    texts = {}
    for docname in docnames:
        print(docname)
        fname = 'docs/%s.txt' % docname
        with open(fname, 'r') as fh:
            print('Reading %s' % fname)
            # NOTE(review): the raw document text is not used below; the
            # read is kept so that a missing or unreadable document still
            # fails here exactly as it did before.
            fh.read()
        json_fname = 'eidos/%s.txt.jsonld' % docname
        ep = eidos.process_json_ld_file(json_fname)
        sentences = [ev.text.replace('\n', ' ')
                     for stmt in ep.statements
                     for ev in stmt.evidence]
        # The earlier pairwise clean-up removed one copy per matching pair,
        # so a sentence occurring three or more times lost all its copies.
        texts[docname] = _drop_redundant_sentences(sentences)
    return texts
def text_to_stmts(text):
    """Read a text with Eidos and return the resulting INDRA Statements.

    The reader output is cached in the working directory under a file name
    derived from the text itself (spaces and commas replaced by
    underscores, with a '.jsonld' suffix). If that file already exists it
    is processed instead of reading the text again.
    """
    cache_stem = text.replace(' ', '_').replace(',', '_')
    cache_fname = cache_stem + '.jsonld'
    if not os.path.exists(cache_fname):
        processor = eidos.process_text(text)
        # Keep the file left behind by the reader as the cache entry.
        shutil.move('eidos_output.json', cache_fname)
        return processor.statements
    processor = eidos.process_json_ld_file(cache_fname)
    return processor.statements
def read_eidos(docnames):
    """Return the Eidos statements for all the given documents.

    For each document name, the JSON-LD file 'eidos/<docname>.txt.jsonld'
    is processed if it exists; otherwise 'docs/<docname>.txt' is read and
    passed to Eidos, with the JSON-LD output saved to that path. The first
    evidence of every statement gets the document name as its PMID.

    Parameters
    ----------
    docnames : iterable of str
        Document names, without directory or extension.

    Returns
    -------
    list
        The statements from all documents, in document order.
    """
    all_stmts = []
    for docname in docnames:
        txt_path = os.path.join('docs', '%s.txt' % docname)
        jsonld_path = os.path.join('eidos', '%s.txt.jsonld' % docname)
        if not os.path.exists(jsonld_path):
            with open(txt_path, 'r') as fh:
                print('Reading %s' % docname)
                content = fh.read()
            processor = eidos.process_text(content, save_json=jsonld_path,
                                           out_format='json_ld')
        else:
            processor = eidos.process_json_ld_file(jsonld_path)
        doc_stmts = processor.statements
        print('%d stmts from %s' % (len(doc_stmts), docname))
        # Tag each statement with its source document so the document ID
        # can be recovered during assembly.
        for stmt in doc_stmts:
            stmt.evidence[0].pmid = docname
        all_stmts.extend(doc_stmts)
    return all_stmts
def test_process_json_ld_file():
    """Check that the test JSON-LD file yields one UN-grounded statement."""
    processor = eidos.process_json_ld_file(test_jsonld)
    statements = processor.statements
    assert len(statements) == 1
    # Both sides of the single statement must carry a 'UN' db_refs entry.
    only_stmt = statements[0]
    assert 'UN' in only_stmt.subj.db_refs
    assert 'UN' in only_stmt.obj.db_refs