def test_id_lookup_no_pmid():
    """Check DOI <-> PMCID lookups for a paper that has no PMID."""
    doi = '10.1083/jcb.1974if'
    pmcid = 'PMC3352949'
    # Lookup by DOI has to yield the PMCID...
    by_doi = id_lookup(doi, 'doi')
    assert by_doi['pmcid'] == pmcid
    # ...and lookup by PMCID has to yield the DOI
    by_pmcid = id_lookup(pmcid, 'pmcid')
    assert by_pmcid['doi'] == doi
    # Only the result of the second lookup is checked with unicode_strs
    assert unicode_strs(by_pmcid)
def __init__(self, xml_etree):
    """Index the sem elements of an XML tree and determine the PMID.

    Parameters
    ----------
    xml_etree : element tree object
        The parsed XML. Its 'interpretation' children are scanned for
        'sem' elements, and its 'pmid', 'id' and 'pmcid' attributes are
        used to determine the PMID of the source paper.
    """
    self.tree = xml_etree
    self.statements = []

    # Collect (sem, sentence) pairs keyed by the category of the sem's ref
    self._sems = collections.defaultdict(list)
    for interpretation in self.tree.findall('interpretation'):
        sentence_text = interpretation.find('sentence-text').text
        for sem_elem in interpretation.findall('sem'):
            ref_elem = sem_elem.find('ref')
            if ref_elem is None:
                # sems without a ref element are not indexed
                continue
            sem_category = ref_elem.attrib['category']
            self._sems[sem_category].append((sem_elem, sentence_text))

    # Citation info: the 'pmid' attribute is preferred, then 'id', and
    # only if neither is set is the PMID looked up from the PMCID
    attribs = self.tree.attrib
    pmcid = attribs.get('pmcid')
    pmid = attribs.get('pmid') or attribs.get('id')
    self.pmid = None
    if pmid:
        # Drop a leading 'PMID' prefix if there is one
        self.pmid = pmid[4:] if pmid.startswith('PMID') else pmid
    elif pmcid:
        looked_up_pmid = id_lookup(pmcid, 'pmcid').get('pmid')
        if looked_up_pmid is not None:
            self.pmid = looked_up_pmid
def process_pmc(pmc_id, offline=False, output_fname=default_output_fname):
    """Return a ReachProcessor for the paper with the given PMC ID.

    The full text is obtained through the PMC client and saved as
    ``<pmc_id>.nxml`` before being processed as an NXML file. The PMID
    of the paper is looked up and passed along as the citation. If the
    full text is not available, None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC
        but passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False
    output_fname : Optional[str]
        Passed on unchanged to process_nxml_file.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None

    # Save the content to a file named after the PMC ID
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_content.encode('utf-8'))

    # The PMID (None if the lookup has no entry for it) is the citation
    pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    return process_nxml_file(nxml_path, citation=pmid, offline=offline,
                             output_fname=output_fname)
def process_pmc(pmc_id, offline=False):
    """Return a ReachProcessor for the paper with the given PMC ID.

    The full text is obtained through the PMC client and saved as
    ``<pmc_id>.nxml`` before being processed as an NXML file. The PMID
    of the paper is looked up and passed along as the citation. If the
    full text is not available, None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC
        but passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    full_text_xml = pmc_client.get_xml(pmc_id)
    if full_text_xml is None:
        return None

    # Save the content to a file named after the PMC ID
    nxml_fname = pmc_id + '.nxml'
    encoded_xml = full_text_xml.encode('utf-8')
    with open(nxml_fname, 'wb') as out_file:
        out_file.write(encoded_xml)

    # The PMID (None if the lookup has no entry for it) is the citation
    paper_ids = id_lookup(pmc_id, 'pmcid')
    return process_nxml_file(nxml_fname, citation=paper_ids.get('pmid'),
                             offline=offline)
def process_pmc(pmc_id, offline=False, url=None,
                output_fname=default_output_fname):
    """Return a ReachProcessor for the paper with the given PMC ID.

    The full text is obtained through the PMC client, written to an NXML
    file and then processed with REACH. If the full text is not
    available, None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC
        but passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading
        if provided. If not provided but offline is set to False (its
        default value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # Step 1: fetch the content from PMC; give up if there is none
    logger.info('Loading %s from PMC' % pmc_id)
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None

    # Step 2: save the content to a file in the working folder
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_content.encode('utf-8'))

    # Step 3: find the PMID so that the pmid attribute of the evidence
    # can be set correctly
    logger.info('Looking up PMID for %s' % pmc_id)
    pmid = id_lookup(pmc_id, 'pmcid').get('pmid')

    # Step 4: read the NXML file with the arguments that were provided
    logger.info('Processing %s with REACH' % pmc_id)
    return process_nxml_file(nxml_path, citation=pmid, offline=offline,
                             url=url, output_fname=output_fname)
def get_pmc_id(stmt):
    """Return the PMC ID for the evidence of a Statement as a string.

    The PMID of each piece of evidence is looked up in turn and the
    result for the last piece of evidence is the one that is returned.

    Parameters
    ----------
    stmt : object with an ``evidence`` attribute
        Each element of ``stmt.evidence`` has to provide a ``pmid``
        attribute, which is passed to id_lookup.

    Returns
    -------
    str
        The PMC ID, always starting with 'PMC', or an empty string if
        there is no evidence or the last lookup did not yield a PMC ID.
    """
    pmc_id = ''
    for ev in stmt.evidence:
        # .get() rather than indexing: a lookup result without a 'pmcid'
        # entry means "no PMC ID" and should not raise a KeyError
        pmc_id = id_lookup(ev.pmid, 'pmid').get('pmcid')
        if not pmc_id:
            # Covers both None and an empty string, so that an empty ID
            # is never turned into the bare prefix 'PMC'
            pmc_id = ''
        elif not pmc_id.startswith('PMC'):
            pmc_id = 'PMC' + pmc_id
    return str(pmc_id)
def _get_evidence(self, card):
    """Return a list of Evidence objects for the evidence texts of a card.

    Parameters
    ----------
    card : dict-like
        Its 'pmc_id' entry is used to look up the PMID, and its
        'evidence' entry, if set, is iterated to obtain evidence texts.

    Returns
    -------
    list
        One Evidence per evidence text, each constructed with
        self.source_api and the PMID that was looked up. The list is
        empty if the card has no 'evidence' entry.
    """
    # The lookup is done up front, whether or not there are any texts
    pmid = id_lookup(card.get('pmc_id'), 'pmcid').get('pmid')
    texts = card.get('evidence')
    if texts is None:
        return []
    return [Evidence(self.source_api, pmid=pmid, text=text)
            for text in texts]
def get_sample(pmids, k, fname):
    """Download the PMC XML of up to k randomly chosen papers.

    Parameters
    ----------
    pmids : list
        PMIDs to sample from. The list is shuffled in place.
    k : int
        The number of XMLs to save before stopping.
    fname : str
        Path of the file to which the PMCID of each paper that has one
        is written, one per line, whether or not its XML was obtained.
    """
    random.shuffle(pmids)
    num_saved = 0
    with open(fname, 'w') as pmcid_file:
        for pmid in pmids:
            pmcid = id_lookup(pmid, 'pmid').get('pmcid')
            if not pmcid:
                # No PMC version is known for this paper
                continue
            pmcid_file.write('%s\n' % pmcid)
            print('Downloading %s' % pmcid)
            xml = pmc_client.get_xml(pmcid)
            if xml:
                xml_path = 'docs/pmc_xmls/%s.nxml' % pmcid
                with open(xml_path, 'w') as xml_file:
                    xml_file.write(xml)
                # Only papers whose XML was saved count toward k
                num_saved += 1
            if num_saved == k:
                break
def test_id_lookup():
    """Check that looking up a PMID yields the expected DOI."""
    expected_doi = '10.1158/1535-7163.MCT-06-0807'
    lookup_result = id_lookup('17513615', 'pmid')
    assert lookup_result['doi'] == expected_doi
def make_model(self, template=None, grouping_level='agent-pair',
               add_full_text_search_link=False, no_redundancy=False,
               **template_kwargs):
    """Render the assembled statements as HTML and return the result.

    The rendered string is also stored in self.model.

    Parameters
    ----------
    template : a Template object
        Manually pass a Jinja template to be used in generating the
        HTML. The template is responsible for rendering essentially the
        output of `make_json_model`. If None, the default template is
        used.
    grouping_level : Optional[str]
        Statements can be grouped under sub-headings at three levels,
        'statement' (ungrouped), 'relation' (grouped by agents and
        type), and 'agent-pair' (grouped by ordered pairs of agents).
        Default: 'agent-pair'.
    add_full_text_search_link : bool
        If True, link with Text fragment search in PMC journal will be
        added for the statements. For this, a 'pmcid' entry is set on
        each evidence for which a PMCID is known or can be looked up.
    no_redundancy : Optional[bool]
        If True, any group of statements that was already presented
        under a previous heading will be skipped. This is typically the
        case for complexes where different permutations of complex
        members are presented. By setting this argument to True, these
        can be eliminated. Default: False

    All other keyword arguments are passed along to the template. If you
    are using a custom template with args that are not passed below,
    this is how you pass them.

    Returns
    -------
    str
        The assembled HTML as a string.
    """
    # Build the JSON model that the template renders.
    tl_stmts = self.make_json_model(grouping_level=grouping_level,
                                    no_redundancy=no_redundancy)

    if add_full_text_search_link:
        for group_key in tl_stmts:
            group = tl_stmts[group_key]
            for formatted in group["stmts_formatted"]:
                for info in formatted["stmt_info_list"]:
                    for ev in info["evidence"]:
                        text_refs = ev.get('text_refs', {})
                        if 'PMCID' in text_refs:
                            # The PMCID is known already, just copy it
                            ev['pmcid'] = text_refs['PMCID']
                        elif ev.get('pmid'):
                            # Otherwise try to find it via the PMID
                            looked_up = id_lookup(ev['pmid'], 'pmid')
                            ev_pmcid = looked_up.get('pmcid', None)
                            if ev_pmcid:
                                ev['pmcid'] = ev_pmcid

    # Only metadata values that are not lists or dicts are rendered,
    # with keys turned into titles
    metadata = {}
    for key, value in self.metadata.items():
        if isinstance(value, (list, dict)):
            continue
        metadata[key.replace('_', ' ').title()] = value

    db_rest_url = None
    if self.db_rest_url and not self.db_rest_url.endswith('statements'):
        db_rest_url = self.db_rest_url + '/statements'

    # Fill the template, supplying defaults for arguments not given.
    if template is None:
        template = default_template
    if self.source_counts and 'source_key_dict' not in template_kwargs:
        template_kwargs['source_key_dict'] = \
            {src: src for src in all_sources}
    template_kwargs.setdefault('source_colors', DEFAULT_SOURCE_COLORS)
    if 'source_info' not in template_kwargs:
        template_kwargs['source_info'] = SOURCE_INFO.copy()
    template_kwargs.setdefault('simple', True)

    self.model = template.render(
        stmt_data=tl_stmts,
        metadata=metadata,
        title=self.title,
        db_rest_url=db_rest_url,
        add_full_text_search_link=add_full_text_search_link,  # noqa
        **template_kwargs)
    return self.model
from indra import trips
from indra.literature import id_lookup
from assembly_eval import have_file, run_assembly

if __name__ == '__main__':
    # Script: run assembly on existing TRIPS EKB extractions for a fixed
    # set of papers, one paper at a time.
    pmc_ids = ['PMC1234335', 'PMC3178447', 'PMC3690480', 'PMC4345513',
               'PMC534114']
    # The ID type is given explicitly, as id_lookup is called elsewhere
    pmids = [id_lookup(pmcid, 'pmcid')['pmid'] for pmcid in pmc_ids]

    # Use the existing EKB extractions.
    folder = 'trips'
    for pmid, pmcid in zip(pmids, pmc_ids):
        prefix = folder + '/' + pmcid
        # Parenthesized form is valid under both Python 2 and Python 3
        print('Processing %s...' % pmcid)
        # The context manager makes sure the EKB file handle is closed
        with open(prefix + '-20160503T1152.ekb') as ekb_file:
            tp = trips.process_xml(ekb_file.read())
        # PMIDs from TRIPS need to be set here because it propagates
        # the PMCID by default
        for s in tp.statements:
            for e in s.evidence:
                e.pmid = pmid
        run_assembly(tp.statements, folder, pmcid)
from indra import trips, reach
from indra.literature import id_lookup
from assembly_eval import have_file, run_assembly

if __name__ == "__main__":
    # Script: run assembly on the combined TRIPS and REACH extractions
    # for a fixed set of papers, one paper at a time.
    pmc_ids = ["PMC1234335", "PMC3178447", "PMC3690480", "PMC4345513",
               "PMC534114"]
    # The ID type is given explicitly, as id_lookup is called elsewhere
    pmids = [id_lookup(pmcid, "pmcid")["pmid"] for pmcid in pmc_ids]

    for pmid, pmcid in zip(pmids, pmc_ids):
        # Parenthesized form is valid under both Python 2 and Python 3
        print("Processing %s..." % pmcid)

        # TRIPS: use the existing EKB extraction; the context manager
        # makes sure the file handle is closed
        trips_fname = "trips/" + pmcid + "-20160503T1152.ekb"
        with open(trips_fname) as ekb_file:
            tp = trips.process_xml(ekb_file.read())
        # Overwrite the pmid of all evidence from TRIPS with the PMID
        # that was looked up for the paper
        for s in tp.statements:
            for e in s.evidence:
                e.pmid = pmid

        # REACH: use the existing JSON output
        reach_fname = "reach/" + pmcid + ".json"
        rp = reach.process_json_file(reach_fname)

        all_statements = tp.statements + rp.statements
        run_assembly(all_statements, "combined", pmcid)
def make_model(self, template=None, with_grouping=True,
               add_full_text_search_link=False, **template_kwargs):
    """Render the assembled statements as HTML and return the result.

    The rendered string is also stored in self.model.

    Parameters
    ----------
    template : a Template object
        Manually pass a Jinja template to be used in generating the
        HTML. The template is responsible for rendering essentially the
        output of `make_json_model`. If None, the default template is
        used.
    with_grouping : bool
        If True, statements will be grouped under multiple sub-headings.
        If False, all headings will be collapsed into one on every
        level, with all statements placed under a single heading.
    add_full_text_search_link : bool
        If True, link with Text fragment search in PMC journal will be
        added for the statements. For this, a 'pmcid' entry is set on
        each evidence for which a PMCID is known or can be looked up.

    All other keyword arguments are passed along to the template. If you
    are using a custom template with args that are not passed below,
    this is how you pass them.

    Returns
    -------
    str
        The assembled HTML as a string.
    """
    # Build the JSON model that the template renders.
    tl_stmts = self.make_json_model(with_grouping)

    if add_full_text_search_link:
        for group_key in tl_stmts:
            group = tl_stmts[group_key]
            for formatted in group["stmts_formatted"]:
                for info in formatted["stmt_info_list"]:
                    for ev in info["evidence"]:
                        text_refs = ev.get('text_refs', {})
                        if 'PMCID' in text_refs:
                            # The PMCID is known already, just copy it
                            ev['pmcid'] = text_refs['PMCID']
                        elif ev.get('pmid'):
                            # Otherwise try to find it via the PMID
                            looked_up = id_lookup(ev['pmid'], 'pmid')
                            ev_pmcid = looked_up.get('pmcid', None)
                            if ev_pmcid:
                                ev['pmcid'] = ev_pmcid

    # Only metadata values that are not lists or dicts are rendered,
    # with keys turned into titles
    metadata = {}
    for key, value in self.metadata.items():
        if isinstance(value, (list, dict)):
            continue
        metadata[key.replace('_', ' ').title()] = value

    db_rest_url = None
    if self.db_rest_url and not self.db_rest_url.endswith('statements'):
        db_rest_url = self.db_rest_url + '/statements'

    # Fill the template, supplying defaults for arguments not given.
    if template is None:
        template = default_template
    if self.source_counts and 'source_key_dict' not in template_kwargs:
        template_kwargs['source_key_dict'] = SRC_KEY_DICT
    template_kwargs.setdefault('source_colors', SOURCE_COLORS)

    self.model = template.render(
        stmt_data=tl_stmts,
        metadata=metadata,
        title=self.title,
        db_rest_url=db_rest_url,
        add_full_text_search_link=add_full_text_search_link,  # noqa
        **template_kwargs)
    return self.model
rerun = False

# Download the papers if they are not available yet
pmids = []
for pmcid in pmc_ids:
    prefix = folder + '/' + pmcid
    if not have_file(prefix + '.nxml') and \
            not have_file(prefix + '.txt'):
        txt, txt_format = get_full_text(pmcid)
        if txt_format == 'nxml':
            fname = prefix + '.nxml'
        else:
            fname = prefix + '.txt'
        # Encoded bytes are written, so the file is opened in binary
        # mode; writing bytes to a text-mode file fails on Python 3
        with open(fname, 'wb') as fh:
            fh.write(txt.encode('utf-8'))
    # The ID type is given explicitly, as id_lookup is called elsewhere
    pmids.append(id_lookup(pmcid, 'pmcid')['pmid'])

# Read each paper if it hasn't been read yet.
# Otherwise use the existing json extractions.
for pmcid, pmid in zip(pmc_ids, pmids):
    prefix = folder + '/' + pmcid
    # Parenthesized form is valid under both Python 2 and Python 3
    print('Processing %s...' % pmcid)
    # If REACH already processed it then don't run it again
    if rerun or not have_file(prefix + '.json'):
        if have_file(prefix + '.txt'):
            # Read bytes and decode explicitly; the context manager
            # makes sure the file handle is closed
            with open(prefix + '.txt', 'rb') as fh:
                txt = fh.read().decode('utf-8')
            rp = reach.process_text(txt, citation=pmid)
        elif have_file(prefix + '.nxml'):
            rp = reach.process_nxml_file(prefix + '.nxml', citation=pmid)
        # Keep the REACH output next to the paper it belongs to
        shutil.move('reach_output.json', prefix + '.json')