def process_pubmed_abstract(pubmed_id, offline=False):
    """Return a ReachProcessor by processing an abstract with a given
    Pubmed id.

    Uses the Pubmed client to get the abstract. If that fails, None is
    returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements in
        rp.statements.
    """
    # Fetch the abstract first; if PubMed has nothing for this ID there is
    # nothing to read
    abstract = pubmed_client.get_abstract(pubmed_id)
    if abstract is None:
        return None
    reach_proc = process_text(abstract, citation=pubmed_id, offline=offline)
    # Tag every piece of evidence as coming from an abstract so that
    # downstream consumers can filter on section type
    if reach_proc and reach_proc.statements:
        for stmt in reach_proc.statements:
            for evid in stmt.evidence:
                evid.epistemics['section_type'] = 'abstract'
    return reach_proc
def test_pmid_27821631():
    """Abstract and metadata retrieval both work for PMID 27821631."""
    time.sleep(0.3)
    pmid = '27821631'
    # The abstract alone should be a non-trivial piece of text
    abstract = pubmed_client.get_abstract(pmid)
    assert len(abstract) > 50, abstract
    # Metadata lookup should also carry a title and the abstract
    metadata = pubmed_client.get_metadata_for_ids([pmid], get_abstracts=True)
    assert metadata[pmid]['title'] is not None
    assert len(metadata[pmid]['abstract']) > 50
def test_pmid_27821631():
    """Abstract and metadata retrieval both work for PMID 27821631."""
    time.sleep(0.5)
    pmid = '27821631'
    # The abstract alone should be a non-trivial piece of text
    abstract = pubmed_client.get_abstract(pmid)
    assert len(abstract) > 50, abstract
    # Metadata lookup should also carry a title and the abstract
    metadata = pubmed_client.get_metadata_for_ids([pmid], get_abstracts=True)
    assert metadata[pmid]['title'] is not None
    assert len(metadata[pmid]['abstract']) > 50
def _get_text_for_grounding(stmt, agent_text):
    """Get text context for Adeft disambiguation

    If the INDRA database is available, attempts to get the fulltext from
    which the statement was extracted. If the fulltext is not available,
    the abstract is returned. If the indra database is not available, uses
    the pubmed client to get the abstract. If no abstract can be found,
    falls back on returning the evidence text for the statement.

    Parameters
    ----------
    stmt : py:class:`indra.statements.Statement`
        Statement with agent we seek to disambiguate.
    agent_text : str
        Agent text that needs to be disambiguated

    Returns
    -------
    text : str
        Text for Adeft disambiguation
    """
    context_text = None
    # Stage 1: try to obtain the content from the INDRA DB, if installed
    try:
        from indra_db.util.content_scripts \
            import get_text_content_from_text_refs
        from indra.literature.adeft_tools import universal_extract_text
        text_refs = stmt.evidence[0].text_refs
        # The pmid attribute, when set, takes precedence over whatever
        # PMID may already be in text_refs
        if stmt.evidence[0].pmid:
            text_refs['PMID'] = stmt.evidence[0].pmid
        logger.debug('Obtaining text for disambiguation with refs: %s' %
                     text_refs)
        db_content = get_text_content_from_text_refs(text_refs)
        if not db_content:
            raise ValueError('Text obtained from DB is empty')
        context_text = universal_extract_text(db_content,
                                              contains=agent_text)
        if context_text:
            return context_text
    except Exception as err:
        logger.info('Could not get text for disambiguation from DB: %s'
                    % err)
    # Stage 2: fall back on the PubMed abstract if a PMID is available
    if context_text is None:
        from indra.literature import pubmed_client
        pmid = stmt.evidence[0].pmid
        if pmid:
            logger.debug('Obtaining abstract for disambiguation for PMID%s'
                         % pmid)
            context_text = pubmed_client.get_abstract(pmid)
            if context_text:
                return context_text
    # Stage 3: finally, fall back on the evidence sentence itself
    if context_text is None:
        logger.info('Falling back on sentence-based disambiguation')
        context_text = stmt.evidence[0].text
        return context_text
    return None
def process_pubmed_abstract(pubmed_id, offline=False, url=None,
                            output_fname=default_output_fname, **kwargs):
    """Return a ReachProcessor by processing an abstract with a given
    Pubmed id.

    Uses the Pubmed client to get the abstract. If that fails, None is
    returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    **kwargs : keyword arguments
        All other keyword arguments are passed directly to `process_text`.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements in
        rp.statements.
    """
    # No abstract available for this ID: nothing to process
    abstract = pubmed_client.get_abstract(pubmed_id)
    if abstract is None:
        return None
    # Delegate the actual reading to process_text, forwarding all options
    reach_proc = process_text(abstract, citation=pubmed_id, offline=offline,
                              url=url, output_fname=output_fname, **kwargs)
    # The section type of the text matters for some applications, so each
    # evidence is annotated here as coming from an abstract
    if reach_proc and reach_proc.statements:
        for stmt in reach_proc.statements:
            for evid in stmt.evidence:
                evid.epistemics['section_type'] = 'abstract'
    return reach_proc
def _get_text_for_grounding(stmt, agent_text):
    """Get text context for Adeft disambiguation

    If the INDRA database is available, attempts to get the fulltext from
    which the statement was extracted. If the fulltext is not available,
    the abstract is returned. If the indra database is not available, uses
    the pubmed client to get the abstract. If no abstract can be found,
    falls back on returning the evidence text for the statement.

    Parameters
    ----------
    stmt : py:class:`indra.statements.Statement`
        Statement with agent we seek to disambiguate.
    agent_text : str
        Agent text that needs to be disambiguated

    Returns
    -------
    text : str
        Text for Adeft disambiguation
    """
    text = None
    # First we will try to get content from the DB
    try:
        from indra_db.util.content_scripts \
            import get_text_content_from_text_refs
        from indra.literature.adeft_tools import universal_extract_text
        refs = stmt.evidence[0].text_refs
        # Prioritize the pmid attribute if given
        if stmt.evidence[0].pmid:
            refs['PMID'] = stmt.evidence[0].pmid
        logger.info('Obtaining text for disambiguation with refs: %s' %
                    refs)
        content = get_text_content_from_text_refs(refs)
        text = universal_extract_text(content, contains=agent_text)
        if text:
            return text
    except Exception as e:
        # Fix: the caught exception was previously bound but never
        # reported, making DB failures impossible to diagnose from logs
        logger.info('Could not get text for disambiguation from DB: %s' % e)
    # If that doesn't work, we try PubMed next
    if text is None:
        from indra.literature import pubmed_client
        pmid = stmt.evidence[0].pmid
        if pmid:
            logger.info('Obtaining abstract for disambiguation for PMID%s' %
                        pmid)
            text = pubmed_client.get_abstract(pmid)
            if text:
                return text
    # Finally, falling back on the evidence sentence
    if text is None:
        logger.info('Falling back on sentence-based disambiguation')
        text = stmt.evidence[0].text
        return text
    return None
def test_readme_using_indra3():
    """README example: read recent BRAF abstracts with REACH."""
    from indra.sources import reach
    from indra.literature import pubmed_client
    # Search for 10 most recent abstracts in PubMed on 'BRAF'
    pmids = pubmed_client.get_ids('BRAF', retmax=10)
    all_statements = []
    for pmid in pmids:
        # Fix: renamed 'abs' -> 'abstract' to avoid shadowing the
        # builtin abs()
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is not None:
            reach_processor = reach.process_text(abstract,
                                                 url=reach.local_text_url)
            if reach_processor is not None:
                all_statements += reach_processor.statements
    assert len(all_statements) > 0
def process_pubmed_abstract(pubmed_id, offline=False,
                            output_fname=default_output_fname, **kwargs):
    """Return a ReachProcessor by processing an abstract with a given
    Pubmed id.

    Uses the Pubmed client to get the abstract. If that fails, None is
    returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    **kwargs : keyword arguments
        All other keyword arguments are passed directly to `process_text`.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements in
        rp.statements.
    """
    # Without an abstract there is nothing to read
    abstract = pubmed_client.get_abstract(pubmed_id)
    if abstract is None:
        return None
    reach_proc = process_text(abstract, citation=pubmed_id, offline=offline,
                              output_fname=output_fname, **kwargs)
    # Annotate evidences with the section type for downstream filtering
    if reach_proc and reach_proc.statements:
        for stmt in reach_proc.statements:
            for evid in stmt.evidence:
                evid.epistemics['section_type'] = 'abstract'
    return reach_proc
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
    """
    # PMIDs for which full text is expected to be available in PMC
    pmc_pmids = set(pmc_client.filter_pmids(pmids, source_type='fulltext'))
    pmc_ids = []
    for pmid in pmc_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if pmc_id:
            pmc_ids.append(pmc_id)
        else:
            # No PMCID found: this pmid will fall through to the
            # abstract-based retrieval below
            pmc_pmids.discard(pmid)
    pmc_xmls = []
    # Fix: the original loop carried an unreachable `else` branch (pmc_ids
    # only ever contains truthy IDs) that referenced a stale loop variable;
    # the dead branch and the always-empty `failed` set are removed.
    for pmc_id in pmc_ids:
        pmc_xmls.append(pmc_client.get_xml(pmc_id))
        # Throttle requests to the PMC web service
        time.sleep(0.5)
    # Fall back on abstracts for everything without PMC full text
    remaining_pmids = set(pmids) - pmc_pmids
    abstracts = []
    for pmid in remaining_pmids:
        abstract = pubmed_client.get_abstract(pmid)
        abstracts.append(abstract)
        time.sleep(0.5)
    return [
        text_content
        for source in (pmc_xmls, abstracts)
        for text_content in source
        if text_content is not None
    ]
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
    """
    # PMIDs for which full text is expected to be available in PMC
    pmc_pmids = set(pmc_client.filter_pmids(pmids, source_type='fulltext'))
    pmc_ids = []
    for pmid in pmc_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if pmc_id:
            pmc_ids.append(pmc_id)
        else:
            # No PMCID found: this pmid will fall through to the
            # abstract-based retrieval below
            pmc_pmids.discard(pmid)
    pmc_xmls = []
    # Fix: the original loop had an unreachable `else` branch (pmc_ids only
    # ever contains truthy IDs) that called .append() on the `failed` set —
    # an AttributeError waiting to happen — and referenced a stale loop
    # variable. The dead branch and the always-empty set are removed.
    for pmc_id in pmc_ids:
        pmc_xmls.append(pmc_client.get_xml(pmc_id))
        # Throttle requests to the PMC web service
        time.sleep(0.5)
    # Fall back on abstracts for everything without PMC full text
    remaining_pmids = set(pmids) - pmc_pmids
    abstracts = []
    for pmid in remaining_pmids:
        abstract = pubmed_client.get_abstract(pmid)
        abstracts.append(abstract)
        time.sleep(0.5)
    return [text_content for source in (pmc_xmls, abstracts)
            for text_content in source if text_content is not None]
def test_get_no_abstract():
    """An invalid PMID should yield no abstract."""
    result = pubmed_client.get_abstract('xx')
    assert result is None
def test_get_no_abstract():
    """An invalid PMID should yield no abstract."""
    time.sleep(0.3)
    result = pubmed_client.get_abstract('xx')
    assert result is None
def test_get_no_abstract():
    """An invalid PMID should yield no abstract."""
    result = pubmed_client.get_abstract('xx')
    assert result is None
def test_get_abstract2():
    """Fetch another abstract and check that it is proper unicode."""
    time.sleep(0.3)
    result = pubmed_client.get_abstract('27123883')
    assert unicode_strs(result)
def test_abstract_with_html_embedded():
    """An abstract containing embedded HTML should still be retrievable."""
    time.sleep(0.5)
    abstract = pubmed_client.get_abstract('25484845')
    assert len(abstract) > 4, abstract
def test_get_abstract2():
    """Fetch another abstract and check that something came back."""
    time.sleep(0.5)
    # Try another one
    abstract = pubmed_client.get_abstract('27123883')
    # Fix: the test previously fetched the abstract but asserted nothing,
    # so it could never fail on a broken retrieval
    assert abstract is not None
def test_get_abstract_title():
    """With prepend_title=True the abstract should start with the title."""
    abstract = pubmed_client.get_abstract('27754804', prepend_title=True)
    # Fix: compare case-insensitively, consistent with the other variants
    # of this test — PubMed title casing is not stable over time
    assert abstract.lower().startswith('targeting autophagy')
    assert abstract.endswith('vemurafenib.')
    assert unicode_strs(abstract)
def test_get_abstract():
    """Retrieve a known abstract and verify its first and last words."""
    result = pubmed_client.get_abstract('27085458')
    assert result.startswith('Wilms')
    assert result.endswith('documented.')
def test_get_abstract_notitle():
    """With prepend_title=False the title must not be included."""
    time.sleep(0.5)
    result = pubmed_client.get_abstract('27754804', prepend_title=False)
    assert result.startswith('The RAF inhibitor')
    assert result.endswith('vemurafenib.')
def _get_text_for_grounding(self, stmt, agent_text):
    """Get text context for Adeft disambiguation

    If the INDRA database is available, attempts to get the fulltext from
    which the statement was extracted. If the fulltext is not available,
    the abstract is returned. If the indra database is not available, uses
    the pubmed client to get the abstract. If no abstract can be found,
    falls back on returning the evidence text for the statement.

    Parameters
    ----------
    stmt : py:class:`indra.statements.Statement`
        Statement with agent we seek to disambiguate.
    agent_text : str
        Agent text that needs to be disambiguated

    Returns
    -------
    text : str
        Text for Adeft disambiguation
    """
    text = None
    # First we will try to get content from a local text content DB if
    # available since this is the fastest option
    if self.has_local_text_db:
        try:
            from indra_db_lite import get_plaintexts_for_text_ref_ids, \
                get_text_ref_ids_for_pmids
            refs = stmt.evidence[0].text_refs
            trid = refs.get('TRID')
            pmid = refs.get('PMID')
            # A TRID, if present, identifies content directly; otherwise
            # a PMID is first mapped to a TRID
            if trid:
                text_content = get_plaintexts_for_text_ref_ids([trid])
                _, content = next(text_content.trid_content_pairs())
                if content:
                    return content
            elif pmid:
                mappings = get_text_ref_ids_for_pmids([int(pmid)])
                if int(pmid) in mappings:
                    trid = mappings[int(pmid)]
                    text_content = get_plaintexts_for_text_ref_ids([trid])
                    _, content = next(text_content.trid_content_pairs())
                    if content:
                        return content
        except Exception as e:
            logger.info('Could not get text from local DB: %s' % e)
    # If the above is not available or fails, we try the INDRA DB
    # if available.
    if self.__tc is not None:
        try:
            from indra.literature.adeft_tools import universal_extract_text
            refs = stmt.evidence[0].text_refs
            # Prioritize the pmid attribute if given
            if stmt.evidence[0].pmid:
                refs['PMID'] = stmt.evidence[0].pmid
            logger.debug(
                'Obtaining text for disambiguation with refs: %s' % refs)
            content = self.__tc.get_text_content_from_text_refs(refs)
            # Empty content is treated like a failure so that control
            # falls through to the PubMed stage below
            if not content:
                raise ValueError('Text obtained from DB is empty')
            text = universal_extract_text(content, contains=agent_text)
            if text:
                return text
        except Exception as e:
            logger.info(
                'Could not get text for disambiguation from DB: %s' % e)
    # If that doesn't work, we try PubMed next trying to fetch an abstract
    if text is None:
        from indra.literature import pubmed_client
        pmid = stmt.evidence[0].pmid
        if pmid:
            logger.debug(
                'Obtaining abstract for disambiguation for PMID%s' % pmid)
            text = pubmed_client.get_abstract(pmid)
            if text:
                return text
    # Finally, falling back on the evidence sentence
    if text is None:
        logger.info('Falling back on sentence-based disambiguation')
        text = stmt.evidence[0].text
        return text
    return None
def test_universal_extract_paragraphs_abstract():
    """A plain abstract should come back as its own first paragraph."""
    pmid = '16511588'
    abstract = pubmed_client.get_abstract(pmid)
    paragraphs = universal_extract_paragraphs(abstract)
    assert paragraphs[0] == abstract
def test_get_abstract_notitle():
    """With prepend_title=False the title must not be included."""
    time.sleep(0.3)
    result = pubmed_client.get_abstract('27754804', prepend_title=False)
    assert result.startswith('The RAF inhibitor')
    assert result.endswith('vemurafenib.')
    assert unicode_strs(result)
reads the abstracts corresponding to each PMID with Eidos. It is
complementary to the pipeline which starts with the CORD19 document
set."""
import os
import time
import pickle
from tqdm import tqdm
from indra.sources import eidos
from indra.literature import pubmed_client

# Repository root, two levels up from this script's directory
root = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                    os.pardir, os.pardir)

# Search terms used to collect COVID-19-related PMIDs
keywords = ['covid19', 'covid-19', 'sars-cov-2', 'sars-cov2']
ids = []
for kw in keywords:
    ids += pubmed_client.get_ids(kw)

stmts = {}
for pmid in tqdm(ids):
    # Throttle PubMed requests
    time.sleep(3)
    abst = pubmed_client.get_abstract(pmid)
    if not abst:
        continue
    # NOTE(review): ep is used without a None check below; presumably the
    # local Eidos web service always returns a processor — confirm.
    ep = eidos.process_text(abst, webservice='http://localhost:9000/')
    # Attach the source PMID to each extracted statement's evidence
    for stmt in ep.statements:
        stmt.evidence[0].pmid = pmid
    stmts[pmid] = ep.statements

# Persist the statements keyed by PMID
with open(os.path.join(root, 'stmts', 'eidos_abstract_stmts.pkl'),
          'wb') as fh:
    pickle.dump(stmts, fh)
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the content and the content type of an article.

    This function retrieves the content of an article by its PubMed ID,
    PubMed Central ID, or DOI. It prioritizes full text content when
    available and returns an abstract from PubMed as a fallback.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'

    Returns
    -------
    content : str
        The content of the article.
    content_type : str
        The content type of the article
    """
    if preferred_content_type not in \
            ('text/xml', 'text/plain', 'application/pdf'):
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    # Resolve the given ID into all known ID types for this article
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')
    # First try to find paper via PMC
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'
    # If we got here, it means we didn't find the full text in PMC, so we'll
    # need either the DOI (for lookup in CrossRef) and/or the PMID (so we
    # can fall back on the abstract. If by some strange turn we have neither,
    # give up now.
    if not doi and not pmid:
        return (None, None)
    # If it does not have PMC NXML then we attempt to obtain the full-text
    # through the CrossRef Click-through API
    if doi:
        # Get publisher
        publisher = crossref_client.get_publisher(doi)
        # First check for whether this is Elsevier--if so, use the Elsevier
        # client directly, because the Clickthrough API key seems unreliable.
        # Return full XML.
        if publisher == 'Elsevier BV':
            logger.info('Elsevier: %s' % pmid)
            #article = elsevier_client.get_article(doi, output='txt')
            try:
                article_xml = elsevier_client.download_article(doi)
            except Exception as e:
                logger.error("Error downloading Elsevier article: %s" % e)
                article_xml = None
            if article_xml is not None:
                return (article_xml, 'elsevier_xml')
        # FIXME FIXME FIXME
        # Because we don't yet have a way to process non-Elsevier content
        # obtained from CrossRef, which includes both XML of unknown format
        # and PDFs, we just comment this section out for now
        """
        # Check if there are any full text links
        links = crossref_client.get_fulltext_links(doi)
        if links:
            headers = {}
            # Set the Cross Ref Clickthrough API key in the header, if we've
            # got one
            cr_api_key = crossref_client.get_api_key()
            if cr_api_key is not None:
                headers['CR-Clickthrough-Client-Token'] = cr_api_key
            # Utility function to get particular links by content-type
            def lookup_content_type(link_list, content_type):
                content_list = [l.get('URL') for l in link_list
                                if l.get('content-type') == content_type]
                return None if not content_list else content_list[0]
            # First check for what the user asked for
            if lookup_content_type(links, preferred_content_type):
                req = requests.get(lookup_content_type(links,
                                                       preferred_content_type),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            # Check for XML first
            if lookup_content_type(links, 'text/xml'):
                req = requests.get(lookup_content_type(links, 'text/xml'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            # Next, plain text
            elif lookup_content_type(links, 'text/plain'):
                req = requests.get(lookup_content_type(links, 'text/plain'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            elif lookup_content_type(links, 'application/pdf'):
                pass
            # Wiley's links are often of content-type 'unspecified'.
            elif lookup_content_type(links, 'unspecified'):
                req = requests.get(lookup_content_type(links, 'unspecified'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return 'foo', req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
                elif req.status_code == 401:
                    logger.warning('Full text query returned 401 (Unauthorized)')
                    return (None, None)
                elif req.status_code == 403:
                    logger.warning('Full text query returned 403 (Forbidden)')
                    return (None, None)
            else:
                raise Exception("Unknown content type(s): %s" % links)
        elif publisher == 'American Society for Biochemistry & Molecular ' \
                          'Biology (ASBMB)':
            url = crossref_client.get_url(doi)
            return get_asbmb_full_text(url)
        """
        # end FIXME FIXME FIXME
        # No full text links and not a publisher we support. We'll have to
        # fall back to the abstract.
        #elif pmid:
        if pmid:
            abstract = pubmed_client.get_abstract(pmid)
            if abstract is None:
                return (None, None)
            else:
                return abstract, 'abstract'
        # We have a useless DOI and no PMID. Give up.
        else:
            return (None, None)
    # We don't have a DOI but we're guaranteed to have a PMID at this point,
    # so we fall back to the abstract:
    else:
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is None:
            return (None, None)
        else:
            return abstract, 'abstract'
    # We'll only get here if we've missed a combination of conditions
    assert False
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the content and the content type of an article.

    This function retrieves the content of an article by its PubMed ID,
    PubMed Central ID, or DOI. It prioritizes full text content when
    available and returns an abstract from PubMed as a fallback.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'

    Returns
    -------
    content : str
        The content of the article.
    content_type : str
        The content type of the article
    """
    if preferred_content_type not in \
            ('text/xml', 'text/plain', 'application/pdf'):
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    # Resolve the given ID into all known ID types for this article
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')
    # First try to find paper via PMC
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'
    # If we got here, it means we didn't find the full text in PMC, so we'll
    # need either the DOI (for lookup in CrossRef) and/or the PMID (so we
    # can fall back on the abstract. If by some strange turn we have neither,
    # give up now.
    if not doi and not pmid:
        return (None, None)
    # If it does not have PMC NXML then we attempt to obtain the full-text
    # through the CrossRef Click-through API
    if doi:
        # Get publisher
        publisher = crossref_client.get_publisher(doi)
        # First check for whether this is Elsevier--if so, use the Elsevier
        # client directly, because the Clickthrough API key seems unreliable.
        # Return full XML.
        if publisher == 'Elsevier BV':
            logger.info('Elsevier: %s' % pmid)
            #article = elsevier_client.get_article(doi, output='txt')
            try:
                article_xml = elsevier_client.download_article(doi)
            except Exception as e:
                logger.error("Error downloading Elsevier article: %s" % e)
                article_xml = None
            if article_xml is not None:
                return (article_xml, 'elsevier_xml')
        # FIXME FIXME FIXME
        # Because we don't yet have a way to process non-Elsevier content
        # obtained from CrossRef, which includes both XML of unknown format
        # and PDFs, we just comment this section out for now
        """
        # Check if there are any full text links
        links = crossref_client.get_fulltext_links(doi)
        if links:
            headers = {}
            # Set the Cross Ref Clickthrough API key in the header, if we've
            # got one
            if crossref_client.api_key is not None:
                headers['CR-Clickthrough-Client-Token'] = \
                    crossref_client.api_key
            # Utility function to get particular links by content-type
            def lookup_content_type(link_list, content_type):
                content_list = [l.get('URL') for l in link_list
                                if l.get('content-type') == content_type]
                return None if not content_list else content_list[0]
            # First check for what the user asked for
            if lookup_content_type(links, preferred_content_type):
                req = requests.get(lookup_content_type(links,
                                                       preferred_content_type),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            # Check for XML first
            if lookup_content_type(links, 'text/xml'):
                req = requests.get(lookup_content_type(links, 'text/xml'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            # Next, plain text
            elif lookup_content_type(links, 'text/plain'):
                req = requests.get(lookup_content_type(links, 'text/plain'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            elif lookup_content_type(links, 'application/pdf'):
                pass
            # Wiley's links are often of content-type 'unspecified'.
            elif lookup_content_type(links, 'unspecified'):
                req = requests.get(lookup_content_type(links, 'unspecified'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return 'foo', req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
                elif req.status_code == 401:
                    logger.warning('Full text query returned 401 (Unauthorized)')
                    return (None, None)
                elif req.status_code == 403:
                    logger.warning('Full text query returned 403 (Forbidden)')
                    return (None, None)
            else:
                raise Exception("Unknown content type(s): %s" % links)
        elif publisher == 'American Society for Biochemistry & Molecular ' \
                          'Biology (ASBMB)':
            url = crossref_client.get_url(doi)
            return get_asbmb_full_text(url)
        """
        # end FIXME FIXME FIXME
        # No full text links and not a publisher we support. We'll have to
        # fall back to the abstract.
        #elif pmid:
        if pmid:
            abstract = pubmed_client.get_abstract(pmid)
            if abstract is None:
                return (None, None)
            else:
                return abstract, 'abstract'
        # We have a useless DOI and no PMID. Give up.
        else:
            return (None, None)
    # We don't have a DOI but we're guaranteed to have a PMID at this point,
    # so we fall back to the abstract:
    else:
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is None:
            return (None, None)
        else:
            return abstract, 'abstract'
    # We'll only get here if we've missed a combination of conditions
    assert False
def get_upload_content(pmid, force_fulltext_lookup=False):
    """Get full text and/or abstract for paper and upload to S3.

    Parameters
    ----------
    pmid : str
        PubMed ID of the paper; a leading 'PMID' prefix is stripped.
    force_fulltext_lookup : Optional[bool]
        If True, attempt a fulltext lookup even when an abstract is
        already cached on S3. Default: False

    Returns
    -------
    tuple of (str or None, str or None)
        The content and its content type, or (None, None) on failure.
    """
    # Make sure that the PMID doesn't start with PMID so that it doesn't
    # screw up the literature clients
    if pmid.startswith('PMID'):
        pmid = pmid[4:]
    # First, check S3:
    (ft_content_s3, ft_content_type_s3) = get_full_text(pmid)
    # The abstract is on S3 but there is no full text; if we're not forcing
    # fulltext lookup, then we're done
    if ft_content_type_s3 == 'abstract' and not force_fulltext_lookup:
        return (ft_content_s3, ft_content_type_s3)
    # If there's nothing (even an abstract on S3), or if there's an abstract
    # and we're forcing fulltext lookup, do the lookup
    elif ft_content_type_s3 is None or \
            (ft_content_type_s3 == 'abstract' and force_fulltext_lookup) or \
            (ft_content_type_s3 == 'elsevier_xml' and
             not elsevier_client.extract_text(ft_content_s3)):
        # A cached elsevier_xml without an extractable full text element
        # is treated as a miss and re-fetched
        if ft_content_type_s3 == 'elsevier_xml':
            logger.info('PMID%s: elsevier_xml cached on S3 is missing full '
                        'text element, getting again.' % pmid)
        # Try to retrieve from literature client
        logger.info("PMID%s: getting content using literature client"
                    % pmid)
        (ft_content, ft_content_type) = lit.get_full_text(pmid, 'pmid')
        assert ft_content_type in ('pmc_oa_xml', 'elsevier_xml',
                                   'abstract', None)
        # If we tried to get the full text and didn't even get the abstract,
        # then there was probably a problem with the web service. Try to
        # get the abstract instead:
        if ft_content_type is None:
            return (None, None)
        # If we got the abstract, and we already had the abstract on S3, then
        # do nothing
        elif ft_content_type == 'abstract' and \
                ft_content_type_s3 == 'abstract':
            logger.info("PMID%s: found abstract but already had it on "
                        "S3; skipping" % pmid)
            return (ft_content, ft_content_type)
        # If we got the abstract, and we had nothing on S3, then upload
        elif ft_content_type == 'abstract' and ft_content_type_s3 is None:
            logger.info("PMID%s: found abstract, uploading to S3" % pmid)
            put_abstract(pmid, ft_content)
            return (ft_content, ft_content_type)
        # If we got elsevier_xml, but cannot get a full text element, then
        # get and put the abstract
        elif ft_content_type == 'elsevier_xml' and \
                not elsevier_client.extract_text(ft_content):
            logger.info("PMID%s: Couldn't get a full text element for "
                        "the elsevier_xml content; getting abstract "
                        % pmid)
            abstract = pubmed_client.get_abstract(pmid)
            # Abstract is None, so return None
            if abstract is None:
                logger.info("PMID%s: Unable to get abstract, returning None"
                            % pmid)
                return (None, None)
            # Otherwise, upload and return the abstract
            else:
                logger.info("PMID%s: Uploading and returning abstract "
                            % pmid)
                put_abstract(pmid, abstract)
                return (abstract, 'abstract')
        # We got a viable full text
        # (or something other than None or abstract...)
        else:
            logger.info("PMID%s: uploading and returning %s"
                        % (pmid, ft_content_type))
            put_full_text(pmid, ft_content, full_text_type=ft_content_type)
            return (ft_content, ft_content_type)
    # Some form of full text is already on S3
    else:
        # TODO
        # In future, could check for abstract even if full text is found, and
        # upload it just to have it
        return (ft_content_s3, ft_content_type_s3)
    # We should always return before we get here
    assert False
def test_abstract_with_html_embedded():
    """An abstract containing embedded HTML should still be retrievable."""
    time.sleep(0.3)
    abstract = pubmed_client.get_abstract('25484845')
    assert len(abstract) > 4, abstract
def test_get_abstract_notitle():
    """With prepend_title=False the title must not be included."""
    result = pubmed_client.get_abstract('27754804', prepend_title=False)
    assert result.startswith('The RAF inhibitor')
    assert result.endswith('vemurafenib.')
    assert unicode_strs(result)
def test_get_no_abstract():
    """An invalid PMID should yield no abstract."""
    time.sleep(0.5)
    result = pubmed_client.get_abstract('xx')
    assert result is None
def test_get_abstract2():
    """Fetch another abstract and check that it is proper unicode."""
    result = pubmed_client.get_abstract('27123883')
    assert unicode_strs(result)
def test_get_abstract_title():
    """With prepend_title=True the abstract should start with the title."""
    time.sleep(0.3)
    result = pubmed_client.get_abstract('27754804', prepend_title=True)
    # Compare case-insensitively since PubMed title casing can vary
    assert result.lower().startswith('targeting autophagy')
    assert result.endswith('vemurafenib.')
    assert unicode_strs(result)
def test_get_abstract_title():
    """With prepend_title=True the abstract should start with the title."""
    time.sleep(0.5)
    result = pubmed_client.get_abstract('27754804', prepend_title=True)
    # Compare case-insensitively since PubMed title casing can vary
    assert result.lower().startswith('targeting autophagy')
    assert result.endswith('vemurafenib.')
def test_get_abstract_title():
    """With prepend_title=True the abstract should start with the title."""
    result = pubmed_client.get_abstract('27754804', prepend_title=True)
    # Compare case-insensitively since PubMed title casing can vary
    assert result.lower().startswith('targeting autophagy')
    assert result.endswith('vemurafenib.')
    assert unicode_strs(result)