def _get_text_for_grounding(stmt, agent_text):
    """Get text context for Adeft disambiguation

    If the INDRA database is available, attempts to get the fulltext from
    which the statement was extracted. If the fulltext is not available, the
    abstract is returned. If the indra database is not available, uses the
    pubmed client to get the abstract. If no abstract can be found, falls back
    on returning the evidence text for the statement.

    Only the first evidence of the statement (``stmt.evidence[0]``) is
    consulted. Each source is tried in order and the first one that yields
    a non-empty text wins; an empty result from one source never prevents
    the next source from being tried.

    Parameters
    ----------
    stmt : :py:class:`indra.statements.Statement`
        Statement with agent we seek to disambiguate.
    agent_text : str
        Agent text that needs to be disambiguated

    Returns
    -------
    text : str
        Text for Adeft disambiguation. When neither the DB nor PubMed
        yields any text, this is the ``text`` attribute of the first
        evidence, returned as is.
    """
    evidence = stmt.evidence[0]

    # First we will try to get content from the DB
    try:
        from indra_db.util.content_scripts \
            import get_text_content_from_text_refs
        from indra.literature.adeft_tools import universal_extract_text
        refs = evidence.text_refs
        # Prioritize the pmid attribute if given
        if evidence.pmid:
            # Work on a copy so that looking up text does not modify the
            # text_refs stored on the statement's evidence.
            refs = dict(refs)
            refs['PMID'] = evidence.pmid
        logger.debug('Obtaining text for disambiguation with refs: %s', refs)
        content = get_text_content_from_text_refs(refs)
        if not content:
            raise ValueError('Text obtained from DB is empty')
        text = universal_extract_text(content, contains=agent_text)
        if text:
            return text
    except Exception as e:
        # Best effort: any failure here (including the DB package not being
        # installed) just means we move on to the next source.
        logger.info('Could not get text for disambiguation from DB: %s', e)

    # If that doesn't work, we try PubMed next
    pmid = evidence.pmid
    if pmid:
        from indra.literature import pubmed_client
        logger.debug('Obtaining abstract for disambiguation for PMID%s', pmid)
        text = pubmed_client.get_abstract(pmid)
        if text:
            return text

    # Finally, falling back on the evidence sentence. This is reached
    # whenever the sources above produced nothing usable, including the
    # case where they produced an empty (but not None) result.
    logger.info('Falling back on sentence-based disambiguation')
    return evidence.text
def _get_text_for_grounding(stmt, agent_text):
    """Get text context for Adeft disambiguation

    If the INDRA database is available, attempts to get the fulltext from
    which the statement was extracted. If the fulltext is not available, the
    abstract is returned. If the indra database is not available, uses the
    pubmed client to get the abstract. If no abstract can be found, falls back
    on returning the evidence text for the statement.

    Only the first evidence of the statement (``stmt.evidence[0]``) is
    consulted. Each source is tried in order and the first one that yields
    a non-empty text wins; an empty result from one source never prevents
    the next source from being tried.

    Parameters
    ----------
    stmt : :py:class:`indra.statements.Statement`
        Statement with agent we seek to disambiguate.
    agent_text : str
        Agent text that needs to be disambiguated

    Returns
    -------
    text : str
        Text for Adeft disambiguation. When neither the DB nor PubMed
        yields any text, this is the ``text`` attribute of the first
        evidence, returned as is.
    """
    evidence = stmt.evidence[0]

    # First we will try to get content from the DB
    try:
        from indra_db.util.content_scripts \
            import get_text_content_from_text_refs
        from indra.literature.adeft_tools import universal_extract_text
        refs = evidence.text_refs
        # Prioritize the pmid attribute if given
        if evidence.pmid:
            # Work on a copy so that looking up text does not modify the
            # text_refs stored on the statement's evidence.
            refs = dict(refs)
            refs['PMID'] = evidence.pmid
        logger.info('Obtaining text for disambiguation with refs: %s', refs)
        content = get_text_content_from_text_refs(refs)
        text = universal_extract_text(content, contains=agent_text)
        if text:
            return text
    except Exception:
        # Best effort: any failure here (including the DB package not being
        # installed) just means we move on to the next source.
        logger.info('Could not get text for disambiguation from DB.')

    # If that doesn't work, we try PubMed next
    pmid = evidence.pmid
    if pmid:
        from indra.literature import pubmed_client
        logger.info('Obtaining abstract for disambiguation for PMID%s', pmid)
        text = pubmed_client.get_abstract(pmid)
        if text:
            return text

    # Finally, falling back on the evidence sentence. This is reached
    # whenever the sources above produced nothing usable, including the
    # case where they produced an empty (but not None) result.
    logger.info('Falling back on sentence-based disambiguation')
    return evidence.text
def _get_text_for_grounding(self, stmt, agent_text):
    """Get text context for Adeft disambiguation

    Sources are tried in the following order, and the first one that yields
    a non-empty text wins:

    1. A local text content DB (``indra_db_lite``), if
       ``self.has_local_text_db`` is set. Content is looked up by the
       ``TRID`` text ref, or else by a text ref ID mapped from the ``PMID``
       text ref.
    2. The INDRA DB, through the text content client held by this object,
       if one is set. The content is filtered to parts containing
       `agent_text`.
    3. The abstract from the pubmed client, if the evidence has a PMID.
    4. The evidence text for the statement.

    Only the first evidence of the statement (``stmt.evidence[0]``) is
    consulted. A failure or an empty result from one source never prevents
    the next source from being tried.

    Parameters
    ----------
    stmt : :py:class:`indra.statements.Statement`
        Statement with agent we seek to disambiguate.
    agent_text : str
        Agent text that needs to be disambiguated

    Returns
    -------
    text : str
        Text for Adeft disambiguation. When none of the sources above
        yields any text, this is the ``text`` attribute of the first
        evidence, returned as is.
    """
    evidence = stmt.evidence[0]

    # First we will try to get content from a local text content DB if
    # available since this is the fastest option
    if self.has_local_text_db:
        try:
            from indra_db_lite import get_plaintexts_for_text_ref_ids, \
                get_text_ref_ids_for_pmids
            refs = evidence.text_refs
            trid = refs.get('TRID')
            pmid = refs.get('PMID')
            if trid:
                text_content = get_plaintexts_for_text_ref_ids([trid])
                _, content = next(text_content.trid_content_pairs())
                if content:
                    return content
            elif pmid:
                pmid_int = int(pmid)
                mappings = get_text_ref_ids_for_pmids([pmid_int])
                if pmid_int in mappings:
                    trid = mappings[pmid_int]
                    text_content = get_plaintexts_for_text_ref_ids([trid])
                    _, content = next(text_content.trid_content_pairs())
                    if content:
                        return content
        except Exception as e:
            # Best effort: missing package, missing refs or an empty result
            # iterator all just mean we move on to the next source.
            logger.info('Could not get text from local DB: %s', e)

    # If the above is not available or fails, we try the INDRA DB
    # if available.
    if self.__tc is not None:
        try:
            from indra.literature.adeft_tools import universal_extract_text
            refs = evidence.text_refs
            # Prioritize the pmid attribute if given
            if evidence.pmid:
                # Work on a copy so that looking up text does not modify
                # the text_refs stored on the statement's evidence.
                refs = dict(refs)
                refs['PMID'] = evidence.pmid
            logger.debug(
                'Obtaining text for disambiguation with refs: %s', refs)
            content = self.__tc.get_text_content_from_text_refs(refs)
            if not content:
                raise ValueError('Text obtained from DB is empty')
            text = universal_extract_text(content, contains=agent_text)
            if text:
                return text
        except Exception as e:
            logger.info(
                'Could not get text for disambiguation from DB: %s', e)

    # If that doesn't work, we try PubMed next trying to fetch an abstract
    pmid = evidence.pmid
    if pmid:
        from indra.literature import pubmed_client
        logger.debug(
            'Obtaining abstract for disambiguation for PMID%s', pmid)
        text = pubmed_client.get_abstract(pmid)
        if text:
            return text

    # Finally, falling back on the evidence sentence. This is reached
    # whenever the sources above produced nothing usable, including the
    # case where they produced an empty (but not None) result.
    logger.info('Falling back on sentence-based disambiguation')
    return evidence.text
if __name__ == '__main__':
    # Command line entry point: collect the statement IDs stored for each
    # given shortform, fetch the text content for them and write the
    # extracted texts plus the accompanying reference map to disk.
    parser = argparse.ArgumentParser(
        description='Get texts for statements with agent from a list of '
                    'shortforms'
    )
    parser.add_argument('vars', nargs='*')
    args = parser.parse_args()
    shortforms = args.vars

    # Escaped names in sorted order are only used to name the output
    # directory and files; the input files are read in the order given.
    cased_shortforms = list(map(escape_filename, sorted(shortforms)))

    all_stmts = set()
    statements_dir = os.path.join(DATA_PATH, 'statements')
    for shortform in shortforms:
        file_name = f'{escape_filename(shortform)}_statements.json'
        with open(os.path.join(statements_dir, file_name), 'r') as f:
            all_stmts.update(json.load(f))

    ref_dict, raw_texts = get_text_content_from_stmt_ids(all_stmts)

    # Reduce each article to the text extracted for the given shortforms
    text_dict = {}
    for text_ref, article in raw_texts.items():
        text_dict[text_ref] = universal_extract_text(article,
                                                     contains=shortforms)

    agg_name = ':'.join(cased_shortforms)
    dir_path = os.path.join(DATA_PATH, 'texts', agg_name)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Texts are written first, then the reference map
    for suffix, payload in (('texts', text_dict), ('text_map', ref_dict)):
        out_path = os.path.join(dir_path, f'{agg_name}_{suffix}.json')
        with open(out_path, 'w') as f:
            json.dump(payload, f)
def get_text_content(pmid):
    """Return the extracted text for the article with the given PMID.

    The content is looked up with ``get_text_content_from_text_refs`` using
    a ``{'PMID': pmid}`` text ref and the module-level ``db`` handle, and is
    then passed through ``universal_extract_text``.

    Parameters
    ----------
    pmid :
        PubMed identifier used as the ``PMID`` text ref for the lookup.

    Returns
    -------
    The result of ``universal_extract_text`` on the content found, or None
    if the lookup returned nothing (or an empty value).
    """
    content = get_text_content_from_text_refs(text_refs={'PMID': pmid},
                                              db=db)
    # Nothing to extract from when the lookup came back empty
    if not content:
        return None
    return universal_extract_text(content)