예제 #1
0
def process_pubmed_abstract(pubmed_id, offline=False):
    """Return a ReachProcessor by processing an abstract with a given Pubmed id.

    Uses the Pubmed client to get the abstract. If that fails, None is
    returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by default)
        the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    abs_txt = pubmed_client.get_abstract(pubmed_id)
    # No abstract could be retrieved for this ID: nothing to process
    if abs_txt is None:
        return None
    rp = process_text(abs_txt, citation=pubmed_id, offline=offline)
    # Annotate every piece of evidence as coming from an abstract; downstream
    # applications can use this section type to weight or filter evidence
    if rp and rp.statements:
        for st in rp.statements:
            for ev in st.evidence:
                ev.epistemics['section_type'] = 'abstract'
    return rp
예제 #2
0
def test_pmid_27821631():
    # Brief pause to avoid hammering the NCBI service
    time.sleep(0.3)
    pmid = '27821631'
    abstract = pubmed_client.get_abstract(pmid)
    assert len(abstract) > 50, abstract
    metadata = pubmed_client.get_metadata_for_ids([pmid], get_abstracts=True)
    entry = metadata[pmid]
    assert entry['title'] is not None
    assert len(entry['abstract']) > 50
예제 #3
0
def test_pmid_27821631():
    # Brief pause to avoid hammering the NCBI service
    time.sleep(0.5)
    pmid = '27821631'
    abstract = pubmed_client.get_abstract(pmid)
    assert len(abstract) > 50, abstract
    metadata = pubmed_client.get_metadata_for_ids([pmid], get_abstracts=True)
    entry = metadata[pmid]
    assert entry['title'] is not None
    assert len(entry['abstract']) > 50
예제 #4
0
File: adeft.py  Project: djinnome/indra
def _get_text_for_grounding(stmt, agent_text):
    """Get text context for Adeft disambiguation

    If the INDRA database is available, attempts to get the fulltext from
    which the statement was extracted. If the fulltext is not available, the
    abstract is returned. If the indra database is not available, uses the
    pubmed client to get the abstract. If no abstract can be found, falls back
    on returning the evidence text for the statement.

    Parameters
    ----------
    stmt : py:class:`indra.statements.Statement`
        Statement with agent we seek to disambiguate.

    agent_text : str
       Agent text that needs to be disambiguated

    Returns
    -------
    text : str
        Text for Adeft disambiguation
    """
    text = None
    # First we will try to get content from the DB
    try:
        from indra_db.util.content_scripts \
            import get_text_content_from_text_refs
        from indra.literature.adeft_tools import universal_extract_text
        refs = stmt.evidence[0].text_refs
        # Prioritize the pmid attribute if given
        if stmt.evidence[0].pmid:
            refs['PMID'] = stmt.evidence[0].pmid
        logger.debug('Obtaining text for disambiguation with refs: %s' %
                     refs)
        content = get_text_content_from_text_refs(refs)
        if not content:
            raise ValueError('Text obtained from DB is empty')
        text = universal_extract_text(content, contains=agent_text)
        if text:
            return text
    except Exception as e:
        logger.info('Could not get text for disambiguation from DB: %s' % e)
    # If that doesn't work, we try PubMed next. Check falsiness rather than
    # None so that an empty extraction result (e.g. '') also triggers the
    # fallback instead of silently returning None below.
    if not text:
        from indra.literature import pubmed_client
        pmid = stmt.evidence[0].pmid
        if pmid:
            logger.debug('Obtaining abstract for disambiguation for PMID%s' %
                         pmid)
            text = pubmed_client.get_abstract(pmid)
            if text:
                return text
    # Finally, falling back on the evidence sentence
    if not text:
        logger.info('Falling back on sentence-based disambiguation')
        return stmt.evidence[0].text
    # Unreachable in practice: any truthy text was already returned above
    return None
예제 #5
0
File: api.py  Project: rodriguezmDNA/indra
def process_pubmed_abstract(pubmed_id,
                            offline=False,
                            url=None,
                            output_fname=default_output_fname,
                            **kwargs):
    """Return a ReachProcessor by processing an abstract with a given Pubmed id.

    Uses the Pubmed client to get the abstract. If that fails, None is
    returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    **kwargs : keyword arguments
        All other keyword arguments are passed directly to `process_text`.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # Fetch the abstract from PubMed; without it there is nothing to read
    abstract = pubmed_client.get_abstract(pubmed_id)
    if abstract is None:
        return None
    # Run REACH on the abstract text, passing through all reader options
    reach_proc = process_text(abstract, citation=pubmed_id, offline=offline,
                              url=url, output_fname=output_fname, **kwargs)
    # Mark all evidence as originating from an abstract; some applications
    # rely on this section type annotation
    if reach_proc and reach_proc.statements:
        for stmt in reach_proc.statements:
            for evid in stmt.evidence:
                evid.epistemics['section_type'] = 'abstract'
    return reach_proc
예제 #6
0
def _get_text_for_grounding(stmt, agent_text):
    """Get text context for Adeft disambiguation

    If the INDRA database is available, attempts to get the fulltext from
    which the statement was extracted. If the fulltext is not available, the
    abstract is returned. If the indra database is not available, uses the
    pubmed client to get the abstract. If no abstract can be found, falls back
    on returning the evidence text for the statement.

    Parameters
    ----------
    stmt : py:class:`indra.statements.Statement`
        Statement with agent we seek to disambiguate.

    agent_text : str
       Agent text that needs to be disambiguated

    Returns
    -------
    text : str
        Text for Adeft disambiguation
    """
    text = None
    # First we will try to get content from the DB
    try:
        from indra_db.util.content_scripts \
            import get_text_content_from_text_refs
        from indra.literature.adeft_tools import universal_extract_text
        refs = stmt.evidence[0].text_refs
        # Prioritize the pmid attribute if given
        if stmt.evidence[0].pmid:
            refs['PMID'] = stmt.evidence[0].pmid
        logger.info('Obtaining text for disambiguation with refs: %s' %
                    refs)
        content = get_text_content_from_text_refs(refs)
        # Guard against empty content so we fall through to the PubMed and
        # sentence fallbacks rather than failing inside the extractor
        if not content:
            raise ValueError('Text obtained from DB is empty')
        text = universal_extract_text(content, contains=agent_text)
        if text:
            return text
    except Exception as e:
        # Include the exception in the log message; previously the bound
        # exception was unused and the failure reason was lost
        logger.info('Could not get text for disambiguation from DB: %s' % e)
    # If that doesn't work, we try PubMed next. Check falsiness rather than
    # None so that an empty extraction result also triggers the fallback.
    if not text:
        from indra.literature import pubmed_client
        pmid = stmt.evidence[0].pmid
        if pmid:
            logger.info('Obtaining abstract for disambiguation for PMID%s' %
                        pmid)
            text = pubmed_client.get_abstract(pmid)
            if text:
                return text
    # Finally, falling back on the evidence sentence
    if not text:
        logger.info('Falling back on sentence-based disambiguation')
        return stmt.evidence[0].text
    # Unreachable in practice: any truthy text was already returned above
    return None
예제 #7
0
def test_readme_using_indra3():
    from indra.sources import reach
    from indra.literature import pubmed_client
    # Search for 10 most recent abstracts in PubMed on 'BRAF'
    pmids = pubmed_client.get_ids('BRAF', retmax=10)
    all_statements = []
    for pmid in pmids:
        # Named 'abstract' rather than 'abs' to avoid shadowing the builtin
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is not None:
            reach_processor = reach.process_text(abstract,
                                                 url=reach.local_text_url)
            if reach_processor is not None:
                all_statements += reach_processor.statements
    assert len(all_statements) > 0
예제 #8
0
def process_pubmed_abstract(pubmed_id,
                            offline=False,
                            output_fname=default_output_fname,
                            **kwargs):
    """Return a ReachProcessor by processing an abstract with a given Pubmed id.

    Uses the Pubmed client to get the abstract. If that fails, None is
    returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by default)
        the web service is called. Default: False
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    **kwargs : keyword arguments
        All other keyword arguments are passed directly to `process_text`.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # An unavailable abstract means there is nothing to process
    text = pubmed_client.get_abstract(pubmed_id)
    if text is None:
        return None
    result = process_text(
        text, citation=pubmed_id, offline=offline,
        output_fname=output_fname, **kwargs)
    # Tag every evidence with the 'abstract' section type for downstream use
    if result and result.statements:
        for statement in result.statements:
            for evidence in statement.evidence:
                evidence.epistemics['section_type'] = 'abstract'
    return result
예제 #9
0
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    Fulltext XML is retrieved from PMC where available; all other articles
    fall back to their PubMed abstract.

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
    """
    pmc_pmids = set(pmc_client.filter_pmids(pmids, source_type='fulltext'))

    # Map each fulltext PMID to its PMCID. Iterate over a snapshot because
    # the set is mutated inside the loop (mutating a set while iterating it
    # raises RuntimeError). PMIDs without a PMCID are routed to the
    # abstract fallback below.
    pmc_id_map = {}
    for pmid in list(pmc_pmids):
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if pmc_id:
            pmc_id_map[pmid] = pmc_id
        else:
            pmc_pmids.discard(pmid)

    pmc_xmls = []
    failed = set()
    for pmid, pmc_id in pmc_id_map.items():
        xml = pmc_client.get_xml(pmc_id)
        if xml is not None:
            pmc_xmls.append(xml)
        else:
            # Fulltext download failed; retry this PMID via its abstract
            failed.add(pmid)
        time.sleep(0.5)

    remaining_pmids = set(pmids) - pmc_pmids | failed
    abstracts = []
    for pmid in remaining_pmids:
        abstract = pubmed_client.get_abstract(pmid)
        abstracts.append(abstract)
        time.sleep(0.5)

    return [
        text_content for source in (pmc_xmls, abstracts)
        for text_content in source if text_content is not None
    ]
예제 #10
0
File: api.py  Project: johnbachman/indra
def process_pubmed_abstract(pubmed_id, offline=False,
                            output_fname=default_output_fname, **kwargs):
    """Return a ReachProcessor by processing an abstract with a given Pubmed id.

    Uses the Pubmed client to get the abstract. If that fails, None is
    returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by default)
        the web service is called. Default: False
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    **kwargs : keyword arguments
        All other keyword arguments are passed directly to `process_text`.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # Retrieve the abstract; bail out if the article has none
    abstract_text = pubmed_client.get_abstract(pubmed_id)
    if abstract_text is None:
        return None
    processor = process_text(abstract_text,
                             citation=pubmed_id,
                             offline=offline,
                             output_fname=output_fname,
                             **kwargs)
    # Annotate the evidence with its section of origin
    if processor and processor.statements:
        for statement in processor.statements:
            for evidence in statement.evidence:
                evidence.epistemics['section_type'] = 'abstract'
    return processor
예제 #11
0
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    Fulltext XML is retrieved from PMC where available; all other articles
    fall back to their PubMed abstract.

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
    """
    pmc_pmids = set(pmc_client.filter_pmids(pmids, source_type='fulltext'))

    # Map each fulltext PMID to its PMCID. Iterate over a snapshot because
    # the set is mutated inside the loop (mutating a set while iterating it
    # raises RuntimeError). PMIDs without a PMCID are routed to the
    # abstract fallback below.
    pmc_id_map = {}
    for pmid in list(pmc_pmids):
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if pmc_id:
            pmc_id_map[pmid] = pmc_id
        else:
            pmc_pmids.discard(pmid)

    pmc_xmls = []
    failed = set()
    for pmid, pmc_id in pmc_id_map.items():
        xml = pmc_client.get_xml(pmc_id)
        if xml is not None:
            pmc_xmls.append(xml)
        else:
            # Fulltext download failed; retry this PMID via its abstract.
            # Note: sets have add(), not append() as the original called.
            failed.add(pmid)
        time.sleep(0.5)

    remaining_pmids = set(pmids) - pmc_pmids | failed
    abstracts = []
    for pmid in remaining_pmids:
        abstract = pubmed_client.get_abstract(pmid)
        abstracts.append(abstract)
        time.sleep(0.5)

    return [text_content for source in (pmc_xmls, abstracts)
            for text_content in source if text_content is not None]
예제 #12
0
def test_get_no_abstract():
    # An invalid ID should yield no abstract
    result = pubmed_client.get_abstract('xx')
    assert result is None
예제 #13
0
def test_get_no_abstract():
    # Brief pause to avoid hammering the NCBI service
    time.sleep(0.3)
    # An invalid ID should yield no abstract
    result = pubmed_client.get_abstract('xx')
    assert result is None
예제 #14
0
def test_get_no_abstract():
    # An invalid PMID should not return any abstract text
    result = pubmed_client.get_abstract('xx')
    assert result is None
예제 #15
0
def test_get_abstract2():
    # Brief pause to avoid hammering the NCBI service
    time.sleep(0.3)
    # Fetch a different article's abstract and check string types
    result = pubmed_client.get_abstract('27123883')
    assert unicode_strs(result)
예제 #16
0
def test_abstract_with_html_embedded():
    # Brief pause to avoid hammering the NCBI service
    time.sleep(0.5)
    # This article's abstract contains embedded HTML markup
    abstract = pubmed_client.get_abstract('25484845')
    assert len(abstract) > 4, abstract
예제 #17
0
def test_get_abstract2():
    time.sleep(0.5)
    # Try another one
    abstract = pubmed_client.get_abstract('27123883')
    # The original test made no assertion at all and so could never fail;
    # at minimum check that a non-empty abstract was returned
    assert abstract
예제 #18
0
def test_get_abstract_title():
    # With prepend_title=True the returned text starts with the article title
    abstract = pubmed_client.get_abstract('27754804', prepend_title=True)
    assert abstract.startswith('Targeting autophagy')
    assert abstract.endswith('vemurafenib.')
    assert unicode_strs(abstract)
예제 #19
0
def test_get_abstract():
    # Fetch a known abstract and sanity-check its beginning and end
    text = pubmed_client.get_abstract('27085458')
    assert text.startswith('Wilms')
    assert text.endswith('documented.')
예제 #20
0
def test_get_abstract_notitle():
    # Brief pause to avoid hammering the NCBI service
    time.sleep(0.5)
    # Without the title prepended, the text starts with the abstract body
    text = pubmed_client.get_abstract('27754804', prepend_title=False)
    assert text.startswith('The RAF inhibitor')
    assert text.endswith('vemurafenib.')
예제 #21
0
    def _get_text_for_grounding(self, stmt, agent_text):
        """Get text context for Adeft disambiguation

        If the INDRA database is available, attempts to get the fulltext from
        which the statement was extracted. If the fulltext is not available,
        the abstract is returned. If the indra database is not available, uses
        the pubmed client to get the abstract. If no abstract can be found,
        falls back on returning the evidence text for the statement.

        Parameters
        ----------
        stmt : py:class:`indra.statements.Statement`
            Statement with agent we seek to disambiguate.

        agent_text : str
           Agent text that needs to be disambiguated

        Returns
        -------
        text : str
            Text for Adeft disambiguation
        """
        text = None
        # First we will try to get content from a local text content DB if
        # available since this is the fastest option
        if self.has_local_text_db:
            try:
                from indra_db_lite import get_plaintexts_for_text_ref_ids, \
                    get_text_ref_ids_for_pmids
                refs = stmt.evidence[0].text_refs
                trid = refs.get('TRID')
                pmid = refs.get('PMID')
                # Prefer a direct text ref ID (TRID) lookup when available;
                # otherwise map the PMID to a TRID first
                if trid:
                    text_content = get_plaintexts_for_text_ref_ids([trid])
                    _, content = next(text_content.trid_content_pairs())
                    if content:
                        return content
                elif pmid:
                    mappings = get_text_ref_ids_for_pmids([int(pmid)])
                    if int(pmid) in mappings:
                        trid = mappings[int(pmid)]
                        text_content = get_plaintexts_for_text_ref_ids([trid])
                        _, content = next(text_content.trid_content_pairs())
                        if content:
                            return content
            except Exception as e:
                logger.info('Could not get text from local DB: %s' % e)
        # If the above is not available or fails, we try the INDRA DB
        # if available.
        if self.__tc is not None:
            try:
                from indra.literature.adeft_tools import universal_extract_text
                refs = stmt.evidence[0].text_refs
                # Prioritize the pmid attribute if given
                if stmt.evidence[0].pmid:
                    refs['PMID'] = stmt.evidence[0].pmid
                logger.debug(
                    'Obtaining text for disambiguation with refs: %s' % refs)
                content = self.__tc.get_text_content_from_text_refs(refs)
                # Empty content is raised and caught below so control falls
                # through to the PubMed/sentence fallbacks
                if not content:
                    raise ValueError('Text obtained from DB is empty')
                text = universal_extract_text(content, contains=agent_text)
                if text:
                    return text
            except Exception as e:
                logger.info(
                    'Could not get text for disambiguation from DB: %s' % e)
        # If that doesn't work, we try PubMed next trying to fetch an abstract
        if text is None:
            from indra.literature import pubmed_client
            pmid = stmt.evidence[0].pmid
            if pmid:
                logger.debug(
                    'Obtaining abstract for disambiguation for PMID%s' % pmid)
                text = pubmed_client.get_abstract(pmid)
                if text:
                    return text
        # Finally, falling back on the evidence sentence
        # NOTE(review): an empty (non-None) extraction result skips both
        # fallbacks here and returns None — confirm this is intended
        if text is None:
            logger.info('Falling back on sentence-based disambiguation')
            text = stmt.evidence[0].text
            return text
        return None
예제 #22
0
def test_universal_extract_paragraphs_abstract():
    # An abstract is a single paragraph, so extraction returns it whole
    abstract = pubmed_client.get_abstract('16511588')
    paragraphs = universal_extract_paragraphs(abstract)
    assert paragraphs[0] == abstract
예제 #23
0
def test_get_abstract_notitle():
    # Brief pause to avoid hammering the NCBI service
    time.sleep(0.3)
    # Without the title prepended, the text starts with the abstract body
    text = pubmed_client.get_abstract('27754804', prepend_title=False)
    assert text.startswith('The RAF inhibitor')
    assert text.endswith('vemurafenib.')
    assert unicode_strs(text)
예제 #24
0
reads the abstracts corresponding to each PMID with Eidos. It is
complementary to the pipeline which starts with the CORD19 document set."""
import os
import time
import pickle
from tqdm import tqdm
from indra.sources import eidos
from indra.literature import pubmed_client

# Repository root: two directories up from this script's location
root = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir,
                    os.pardir)

# Search terms used to collect COVID-19-related PMIDs from PubMed
keywords = ['covid19', 'covid-19', 'sars-cov-2', 'sars-cov2']
ids = []
for kw in keywords:
    ids += pubmed_client.get_ids(kw)

# Read each abstract with Eidos and collect statements keyed by PMID.
# NOTE(review): assumes an Eidos web service is running locally on port
# 9000 — confirm before running.
stmts = {}
for pmid in tqdm(ids):
    # Rate-limit requests to the PubMed API
    time.sleep(3)
    abst = pubmed_client.get_abstract(pmid)
    if not abst:
        continue
    ep = eidos.process_text(abst, webservice='http://localhost:9000/')
    # Attach the source PMID to each extracted statement's evidence
    for stmt in ep.statements:
        stmt.evidence[0].pmid = pmid
    stmts[pmid] = ep.statements

# Dump the extracted statements for downstream processing
with open(os.path.join(root, 'stmts', 'eidos_abstract_stmts.pkl'), 'wb') as fh:
    pickle.dump(stmts, fh)
예제 #25
0
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the content and the content type of an article.

    This function retrieves the content of an article by its PubMed ID,
    PubMed Central ID, or DOI. It prioritizes full text content when available
    and returns an abstract from PubMed as a fallback.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'

    Returns
    -------
    content : str
        The content of the article.
    content_type : str
        The content type of the article
    """
    if preferred_content_type not in \
            ('text/xml', 'text/plain', 'application/pdf'):
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    # Resolve all ID types available for the article from the given ID
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')
    # First try to find paper via PMC
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'
    # If we got here, it means we didn't find the full text in PMC, so we'll
    # need either the DOI (for lookup in CrossRef) and/or the PMID (so we
    # can fall back on the abstract. If by some strange turn we have neither,
    # give up now.
    if not doi and not pmid:
        return (None, None)

    # If it does not have PMC NXML then we attempt to obtain the full-text
    # through the CrossRef Click-through API
    if doi:
        # Get publisher
        publisher = crossref_client.get_publisher(doi)

        # First check for whether this is Elsevier--if so, use the Elsevier
        # client directly, because the Clickthrough API key seems unreliable.
        # Return full XML.
        if publisher == 'Elsevier BV':
            logger.info('Elsevier: %s' % pmid)
            #article = elsevier_client.get_article(doi, output='txt')
            try:
                article_xml = elsevier_client.download_article(doi)
            except Exception as e:
                logger.error("Error downloading Elsevier article: %s" % e)
                article_xml = None
            if article_xml is not None:
                return (article_xml, 'elsevier_xml')

        # FIXME FIXME FIXME
        # Because we don't yet have a way to process non-Elsevier content
        # obtained from CrossRef, which includes both XML of unknown format
        # and PDFs, we just comment this section out for now
        """
        # Check if there are any full text links
        links = crossref_client.get_fulltext_links(doi)
        if links:
            headers = {}
            # Set the Cross Ref Clickthrough API key in the header, if we've
            # got one
            cr_api_key = crossref_client.get_api_key()
            if cr_api_key is not None:
                headers['CR-Clickthrough-Client-Token'] = cr_api_key
            # Utility function to get particular links by content-type
            def lookup_content_type(link_list, content_type):
                content_list = [l.get('URL') for l in link_list
                                if l.get('content-type') == content_type]
                return None if not content_list else content_list[0]
            # First check for what the user asked for
            if lookup_content_type(links, preferred_content_type):
                req = requests.get(lookup_content_type(links,
                                                       preferred_content_type),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            # Check for XML first
            if lookup_content_type(links, 'text/xml'):
                req = requests.get(lookup_content_type(links, 'text/xml'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            # Next, plain text
            elif lookup_content_type(links, 'text/plain'):
                req = requests.get(lookup_content_type(links, 'text/plain'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            elif lookup_content_type(links, 'application/pdf'):
                pass
            # Wiley's links are often of content-type 'unspecified'.
            elif lookup_content_type(links, 'unspecified'):
                req = requests.get(lookup_content_type(links, 'unspecified'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return 'foo', req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
                elif req.status_code == 401:
                    logger.warning('Full text query returned 401 (Unauthorized)')
                    return (None, None)
                elif req.status_code == 403:
                    logger.warning('Full text query returned 403 (Forbidden)')
                    return (None, None)
            else:
                raise Exception("Unknown content type(s): %s" % links)
        elif publisher == 'American Society for Biochemistry & Molecular ' \
                          'Biology (ASBMB)':
            url = crossref_client.get_url(doi)
            return get_asbmb_full_text(url)
        """
        # end FIXME FIXME FIXME

        # No full text links and not a publisher we support. We'll have to
        # fall back to the abstract.
        #elif pmid:
        if pmid:
            abstract = pubmed_client.get_abstract(pmid)
            if abstract is None:
                return (None, None)
            else:
                return abstract, 'abstract'
        # We have a useless DOI and no PMID. Give up.
        else:
            return (None, None)
    # We don't have a DOI but we're guaranteed to have a PMID at this point,
    # so we fall back to the abstract:
    else:
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is None:
            return (None, None)
        else:
            return abstract, 'abstract'
    # We'll only get here if we've missed a combination of conditions
    assert False
예제 #26
0
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the content and the content type of an article.

    This function retrieves the content of an article by its PubMed ID,
    PubMed Central ID, or DOI. It prioritizes full text content when available
    and returns an abstract from PubMed as a fallback.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'

    Returns
    -------
    content : str or None
        The content of the article, or None if nothing could be retrieved.
    content_type : str or None
        The content type of the article, e.g. 'pmc_oa_xml', 'elsevier_xml'
        or 'abstract'; None if no content was retrieved.
    """
    # Validate the requested format up front so callers get a clear error
    # rather than a silent fall-through below.
    if preferred_content_type not in \
            ('text/xml', 'text/plain', 'application/pdf'):
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    # Resolve the given ID into the full set of known identifiers
    # (pmid/pmcid/doi); any of them may be missing.
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')
    # First try to find paper via PMC
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'
    # If we got here, it means we didn't find the full text in PMC, so we'll
    # need either the DOI (for lookup in CrossRef) and/or the PMID (so we
    # can fall back on the abstract. If by some strange turn we have neither,
    # give up now.
    if not doi and not pmid:
        return (None, None)

    # If it does not have PMC NXML then we attempt to obtain the full-text
    # through the CrossRef Click-through API
    if doi:
        # Get publisher
        publisher = crossref_client.get_publisher(doi)

        # First check for whether this is Elsevier--if so, use the Elsevier
        # client directly, because the Clickthrough API key seems unreliable.
        # Return full XML.
        if publisher == 'Elsevier BV':
            logger.info('Elsevier: %s' % pmid)
            #article = elsevier_client.get_article(doi, output='txt')
            try:
                article_xml = elsevier_client.download_article(doi)
            except Exception as e:
                logger.error("Error downloading Elsevier article: %s" % e)
                article_xml = None
            if article_xml is not None:
                return (article_xml, 'elsevier_xml')

        # FIXME FIXME FIXME
        # Because we don't yet have a way to process non-Elsevier content
        # obtained from CrossRef, which includes both XML of unknown format
        # and PDFs, we just comment this section out for now
        """
        # Check if there are any full text links
        links = crossref_client.get_fulltext_links(doi)
        if links:
            headers = {}
            # Set the Cross Ref Clickthrough API key in the header, if we've
            # got one
            if crossref_client.api_key is not None:
                headers['CR-Clickthrough-Client-Token'] = \
                        crossref_client.api_key
            # Utility function to get particular links by content-type
            def lookup_content_type(link_list, content_type):
                content_list = [l.get('URL') for l in link_list
                                if l.get('content-type') == content_type]
                return None if not content_list else content_list[0]
            # First check for what the user asked for
            if lookup_content_type(links, preferred_content_type):
                req = requests.get(lookup_content_type(links,
                                                       preferred_content_type),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            # Check for XML first
            if lookup_content_type(links, 'text/xml'):
                req = requests.get(lookup_content_type(links, 'text/xml'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            # Next, plain text
            elif lookup_content_type(links, 'text/plain'):
                req = requests.get(lookup_content_type(links, 'text/plain'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            elif lookup_content_type(links, 'application/pdf'):
                pass
            # Wiley's links are often of content-type 'unspecified'.
            elif lookup_content_type(links, 'unspecified'):
                req = requests.get(lookup_content_type(links, 'unspecified'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return 'foo', req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
                elif req.status_code == 401:
                    logger.warning('Full text query returned 401 (Unauthorized)')
                    return (None, None)
                elif req.status_code == 403:
                    logger.warning('Full text query returned 403 (Forbidden)')
                    return (None, None)
            else:
                raise Exception("Unknown content type(s): %s" % links)
        elif publisher == 'American Society for Biochemistry & Molecular ' \
                          'Biology (ASBMB)':
            url = crossref_client.get_url(doi)
            return get_asbmb_full_text(url)
        """
        # end FIXME FIXME FIXME

        # No full text links and not a publisher we support. We'll have to
        # fall back to the abstract.
        #elif pmid:
        if pmid:
            abstract = pubmed_client.get_abstract(pmid)
            if abstract is None:
                return (None, None)
            else:
                return abstract, 'abstract'
        # We have a useless DOI and no PMID. Give up.
        else:
            return (None, None)
    # We don't have a DOI but we're guaranteed to have a PMID at this point,
    # so we fall back to the abstract:
    else:
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is None:
            return (None, None)
        else:
            return abstract, 'abstract'
    # We'll only get here if we've missed a combination of conditions
    assert False
예제 #27
0
def get_upload_content(pmid, force_fulltext_lookup=False):
    """Get full text and/or abstract for paper and upload to S3.

    Checks the S3 cache first; if only an abstract is cached (or nothing,
    or an elsevier_xml entry with no extractable text), falls back to the
    literature client and uploads whatever content is retrieved.

    Parameters
    ----------
    pmid : str
        The PubMed ID of the paper. A leading 'PMID' prefix is stripped.
    force_fulltext_lookup : Optional[bool]
        If True, attempt a full-text lookup even when an abstract is
        already cached on S3. Default: False

    Returns
    -------
    content : str or None
        The retrieved content, or None if nothing could be obtained.
    content_type : str or None
        The type of the content; from the lookup branch this is one of
        'pmc_oa_xml', 'elsevier_xml', 'abstract', or None (whatever type
        was cached on S3 may be returned from the cache branch).
    """
    # Make sure that the PMID doesn't start with PMID so that it doesn't
    # screw up the literature clients
    if pmid.startswith('PMID'):
        pmid = pmid[4:]
    # First, check S3:
    (ft_content_s3, ft_content_type_s3) = get_full_text(pmid)
    # The abstract is on S3 but there is no full text; if we're not forcing
    # fulltext lookup, then we're done
    if ft_content_type_s3 == 'abstract' and not force_fulltext_lookup:
        return (ft_content_s3, ft_content_type_s3)
    # If there's nothing (even an abstract on S3), or if there's an abstract
    # and we're forcing fulltext lookup, do the lookup
    elif ft_content_type_s3 is None or \
            (ft_content_type_s3 == 'abstract' and force_fulltext_lookup) or \
            (ft_content_type_s3 == 'elsevier_xml' and
                    not elsevier_client.extract_text(ft_content_s3)):
        if ft_content_type_s3 == 'elsevier_xml':
            logger.info('PMID%s: elsevier_xml cached on S3 is missing full '
                        'text element, getting again.' % pmid)
        # Try to retrieve from literature client
        logger.info("PMID%s: getting content using literature client" % pmid)
        (ft_content, ft_content_type) = lit.get_full_text(pmid, 'pmid')
        assert ft_content_type in ('pmc_oa_xml', 'elsevier_xml', 'abstract',
                                   None)
        # If we tried to get the full text and didn't even get the abstract,
        # then there was probably a problem with the web service. Try to
        # get the abstract instead:
        if ft_content_type is None:
            return (None, None)
        # If we got the abstract, and we already had the abstract on S3, then
        # do nothing
        elif ft_content_type == 'abstract' and ft_content_type_s3 == 'abstract':
            logger.info("PMID%s: found abstract but already had it on " \
                        "S3; skipping" % pmid)
            return (ft_content, ft_content_type)
        # If we got the abstract, and we had nothing on S3, then upload
        elif ft_content_type == 'abstract' and ft_content_type_s3 is None:
            logger.info("PMID%s: found abstract, uploading to S3" % pmid)
            put_abstract(pmid, ft_content)
            return (ft_content, ft_content_type)
        # If we got elsevier_xml, but cannot get a full text element, then
        # get and put the abstract
        elif ft_content_type == 'elsevier_xml' and \
                not elsevier_client.extract_text(ft_content):
            logger.info("PMID%s: Couldn't get a full text element for "
                        "the elsevier_xml content; getting abstract " % pmid)
            abstract = pubmed_client.get_abstract(pmid)
            # Abstract is None, so return None
            if abstract is None:
                logger.info("PMID%s: Unable to get abstract, returning None" %
                            pmid)
                return (None, None)
            # Otherwise, upload and return the abstract
            else:
                logger.info("PMID%s: Uploading and returning abstract " % pmid)
                put_abstract(pmid, abstract)
                return (abstract, 'abstract')
        # We got a viable full text
        # (or something other than None or abstract...)
        else:
            logger.info("PMID%s: uploading and returning %s" %
                        (pmid, ft_content_type))
            put_full_text(pmid, ft_content, full_text_type=ft_content_type)
            return (ft_content, ft_content_type)
    # Some form of full text is already on S3
    else:
        # TODO
        # In future, could check for abstract even if full text is found, and
        # upload it just to have it
        return (ft_content_s3, ft_content_type_s3)
    # We should always return before we get here
    assert False
예제 #28
0
def test_abstract_with_html_embedded():
    """An abstract containing embedded HTML markup should still be returned."""
    time.sleep(0.3)
    abstract = pubmed_client.get_abstract('25484845')
    assert len(abstract) > 4, abstract
예제 #29
0
def test_get_abstract_notitle():
    """Without prepend_title, only the abstract body text is returned."""
    text = pubmed_client.get_abstract('27754804', prepend_title=False)
    assert text.startswith('The RAF inhibitor')
    assert text.endswith('vemurafenib.')
    assert unicode_strs(text)
예제 #30
0
def test_get_no_abstract():
    """An invalid PMID should yield no abstract."""
    time.sleep(0.5)
    result = pubmed_client.get_abstract('xx')
    assert result is None
예제 #31
0
def test_get_abstract2():
    """Fetch another known abstract and check that it is a proper string."""
    text = pubmed_client.get_abstract('27123883')
    assert unicode_strs(text)
예제 #32
0
def test_universal_extract_paragraphs_abstract():
    """A plain abstract should come back as its own single paragraph."""
    abstract = pubmed_client.get_abstract('16511588')
    paragraphs = universal_extract_paragraphs(abstract)
    assert paragraphs[0] == abstract
예제 #33
0
def test_get_abstract_title():
    """With prepend_title, the title text precedes the abstract body."""
    time.sleep(0.3)
    text = pubmed_client.get_abstract('27754804', prepend_title=True)
    assert text.lower().startswith('targeting autophagy')
    assert text.endswith('vemurafenib.')
    assert unicode_strs(text)
예제 #34
0
def test_get_abstract_title():
    """Title-prepended abstract starts with the title and ends as expected."""
    time.sleep(0.5)
    result = pubmed_client.get_abstract('27754804', prepend_title=True)
    assert result.lower().startswith('targeting autophagy')
    assert result.endswith('vemurafenib.')
예제 #35
0
def test_get_abstract_title():
    """Title-prepended abstract has the expected start, end, and encoding."""
    abstract_text = pubmed_client.get_abstract('27754804', prepend_title=True)
    assert abstract_text.lower().startswith('targeting autophagy')
    assert abstract_text.endswith('vemurafenib.')
    assert unicode_strs(abstract_text)