Пример #1
0
def id_lookup(paper_id, idtype):
    """Resolve one article identifier into the PMID/PMCID/DOI triple.

    The PMC ID lookup is consulted first. If a PMID or PMCID was given and
    that lookup yields no DOI, a reverse lookup through CrossRef is attempted
    using the PMID.

    Parameters
    ----------
    paper_id : str
        ID of the article.
    idtype : str
        Type of the ID: 'pmid', 'pmcid', or 'doi'.

    Returns
    -------
    ids : dict
        A dictionary with the following keys: pmid, pmcid and doi. Values
        that could not be determined may be None.

    Raises
    ------
    ValueError
        If `idtype` is not one of 'pmid', 'pmcid' or 'doi'.
    """
    if idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)

    # Seed every field from the PMC lookup; the caller-supplied ID always
    # wins over whatever the lookup returned for that same field.
    looked_up = pmc_client.id_lookup(paper_id, idtype)
    ids = {field: looked_up.get(field) for field in ('doi', 'pmid', 'pmcid')}
    ids[idtype] = paper_id

    # Done if a DOI was supplied, or if the lookup produced one.
    if idtype == 'doi' or ids['doi']:
        return ids

    # From here on a PMID or PMCID was given and there is no DOI yet.
    # A PMCID with neither PMID nor DOI cannot drive the CrossRef reverse
    # lookup below, so return what we have (PMCID only) with a warning.
    doi_missing = ids['doi'] is None
    pmid_missing = ids['pmid'] is None
    if ids['pmcid'] and doi_missing and pmid_missing:
        logger.warning('%s: PMCID without PMID or DOI' % ids['pmcid'])
        return ids

    # State at this point: a PMID is available and the DOI is unknown.
    assert not pmid_missing
    assert doi_missing

    # Last resort: ask CrossRef for the DOI (which internally tries to get
    # the DOI from Pubmed while collecting the metadata for the lookup).
    # The result may still be None; nothing more can be done in that case.
    ids['doi'] = crossref_client.doi_query(ids['pmid'])
    return ids
Пример #2
0
def id_lookup(paper_id, idtype):
    """Given a PMID, PMCID, or DOI, look up the other two identifiers.

    The PMC ID lookup is tried first. When it does not provide a DOI for a
    given PMID/PMCID, the DOI is sought through a reverse lookup in CrossRef
    based on the PMID.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.

    Returns
    -------
    ids : dict
        A dictionary with the following keys: pmid, pmcid and doi. Values
        that could not be determined may be None.

    Raises
    ------
    ValueError
        If `idtype` is not one of 'pmid', 'pmcid' or 'doi'.
    """
    if idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)

    # Fill in all three fields from the PMC lookup, then let the ID that the
    # caller provided override the corresponding field.
    ids = dict.fromkeys(('doi', 'pmid', 'pmcid'))
    pmc_hits = pmc_client.id_lookup(paper_id, idtype)
    for key in ('pmid', 'pmcid', 'doi'):
        ids[key] = pmc_hits.get(key)
    ids[idtype] = paper_id

    # A DOI was either supplied or found: no further work is needed.
    have_doi = idtype == 'doi' or bool(ids['doi'])
    if have_doi:
        return ids

    # Only a PMID or PMCID was given and no DOI is known yet. With just a
    # PMCID (no PMID, no DOI) the CrossRef reverse lookup cannot work, so
    # warn and return the partial result.
    only_pmcid = (ids['pmcid'] and ids['doi'] is None
                  and ids['pmid'] is None)
    if only_pmcid:
        logger.warning('%s: PMCID without PMID or DOI' % ids['pmcid'])
        return ids

    # State at this point: a PMID is available and the DOI is unknown.
    assert ids['pmid'] is not None
    assert ids['doi'] is None

    # Final attempt: query CrossRef for the DOI (which internally tries to
    # get the DOI from Pubmed while gathering metadata for the lookup). The
    # value may remain None.
    ids['doi'] = crossref_client.doi_query(ids['pmid'])
    return ids
Пример #3
0
    def get_missing_pmids(self, tr_data):
        """Try to fill in missing pmids using `id_lookup`.

        Every entry of `tr_data` whose 'pmid' is None is looked up by its
        'pmcid'. When the lookup yields a pmid, the entry is updated in
        place. The number of missing and recovered pmids is logged at debug
        level.

        Parameters
        ----------
        tr_data : iterable of dict
            Entries providing at least the keys 'pmid' and 'pmcid'.

        Returns
        -------
        None
        """
        logger.debug("Getting missing pmids.")

        num_missing = 0
        num_found = 0

        # TODO: This is very slow...should find a way to speed it up.
        # NOTE: a thread-pool variant of this loop was tried before, but the
        # web api does not support that much access.
        for tr_entry in tr_data:
            if tr_entry['pmid'] is not None:
                continue
            num_missing += 1
            found_pmid = id_lookup(tr_entry['pmcid']).get('pmid')
            # Count an entry as found only when a pmid value actually came
            # back; a 'pmid' key mapped to None is not a hit.
            if found_pmid is not None:
                tr_entry['pmid'] = found_pmid
                num_found += 1

        logger.debug("Found %d/%d new pmids.", num_found, num_missing)
Пример #4
0
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids.

    PMC XML is fetched for those pmids that the PMC client reports as having
    full text and for which a PMCID can be looked up. For all other pmids the
    abstract is fetched through the PubMed client.

    Parameters
    ----------
    pmids : list of str
        PMIDs of the articles to retrieve content for.

    Returns
    -------
    text_content : list of str
        The retrieved PMC XML results followed by the retrieved abstracts.
        Results that came back as None are left out.
    """
    fulltext_pmids = set(
        pmc_client.filter_pmids(pmids, source_type='fulltext'))

    # Record the pmids that resolve to a PMCID in a separate set. Removing
    # unresolved pmids from the set being iterated would raise
    # "RuntimeError: Set changed size during iteration".
    pmc_ids = []
    resolved_pmids = set()
    for pmid in fulltext_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if pmc_id:
            pmc_ids.append(pmc_id)
            resolved_pmids.add(pmid)

    # Only truthy PMCIDs were collected above, so each one can be fetched
    # directly without a further None check.
    pmc_xmls = []
    for pmc_id in pmc_ids:
        pmc_xmls.append(pmc_client.get_xml(pmc_id))
        # Pause between requests (presumably to respect service rate limits).
        time.sleep(0.5)

    # Every pmid without a resolved PMCID falls back to its abstract.
    abstracts = []
    for pmid in set(pmids) - resolved_pmids:
        abstracts.append(pubmed_client.get_abstract(pmid))
        time.sleep(0.5)

    return [
        text_content for source in (pmc_xmls, abstracts)
        for text_content in source if text_content is not None
    ]
Пример #5
0
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids.

    PMC XML is fetched for those pmids that the PMC client reports as having
    full text and for which a PMCID can be looked up. For all other pmids the
    abstract is fetched through the PubMed client.

    Parameters
    ----------
    pmids : list of str
        PMIDs of the articles to retrieve content for.

    Returns
    -------
    text_content : list of str
        The retrieved PMC XML results followed by the retrieved abstracts.
        Results that came back as None are left out.
    """
    candidate_pmids = set(
        pmc_client.filter_pmids(pmids, source_type='fulltext'))

    # Track pmids with a usable PMCID separately instead of discarding from
    # the set while looping over it, which raises
    # "RuntimeError: Set changed size during iteration".
    pmc_ids = []
    pmids_with_pmcid = set()
    for pmid in candidate_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if not pmc_id:
            continue
        pmc_ids.append(pmc_id)
        pmids_with_pmcid.add(pmid)

    # pmc_ids holds only truthy IDs, so no per-item None check is needed.
    pmc_xmls = []
    for pmc_id in pmc_ids:
        pmc_xmls.append(pmc_client.get_xml(pmc_id))
        # Pause between requests (presumably to respect service rate limits).
        time.sleep(0.5)

    # Abstracts serve as the fallback for every pmid without a PMCID.
    remaining_pmids = set(pmids) - pmids_with_pmcid
    abstracts = []
    for pmid in remaining_pmids:
        abstracts.append(pubmed_client.get_abstract(pmid))
        time.sleep(0.5)

    return [text_content for source in (pmc_xmls, abstracts)
            for text_content in source if text_content is not None]
Пример #6
0
def test_id_lookup_pmcid_no_prefix_idtype():
    """Look up a PMCID given without the 'PMC' prefix, with idtype='pmcid'.

    The doi, pmid and pmcid returned must match `example_ids`, and the result
    must pass `unicode_strs`.
    """
    ids = pmc_client.id_lookup('4322985', idtype='pmcid')
    for key in ('doi', 'pmid', 'pmcid'):
        assert ids[key] == example_ids[key]
    assert unicode_strs(ids)
Пример #7
0
def test_id_lookup_pmcid_idtype():
    """Look up a 'PMC'-prefixed PMCID with idtype='pmcid'.

    The doi, pmid and pmcid returned must match `example_ids`, and the result
    must pass `unicode_strs`.
    """
    ids = pmc_client.id_lookup('PMC4322985', idtype='pmcid')
    for key in ('doi', 'pmid', 'pmcid'):
        assert ids[key] == example_ids[key]
    assert unicode_strs(ids)
Пример #8
0
def test_invalid_idtype():
    """Call the PMC ID lookup with an unsupported idtype ('foo').

    The body makes no assertion of its own.
    NOTE(review): presumably the expected error is checked by a decorator or
    by the test runner outside this block -- confirm.
    """
    pmc_client.id_lookup('DOI10.18632/oncotarget.2555', idtype='foo')
Пример #9
0
def test_id_lookup_doi_prefix_no_idtype():
    """Look up a 'DOI'-prefixed DOI without passing an idtype.

    The doi, pmid and pmcid returned must match `example_ids`, and the result
    must pass `unicode_strs`.
    """
    ids = pmc_client.id_lookup('DOI10.18632/oncotarget.2555')
    for key in ('doi', 'pmid', 'pmcid'):
        assert ids[key] == example_ids[key]
    assert unicode_strs(ids)
Пример #10
0
def test_id_lookup_pmcid_no_prefix_idtype():
    """Check the lookup of a bare PMCID number with idtype='pmcid'.

    All three identifiers in the result are compared against `example_ids`,
    and the result must pass `unicode_strs`.
    """
    bare_pmcid = '4322985'
    ids = pmc_client.id_lookup(bare_pmcid, idtype='pmcid')
    for field in ('doi', 'pmid', 'pmcid'):
        assert ids[field] == example_ids[field]
    assert unicode_strs(ids)
Пример #11
0
def test_id_lookup_doi_prefix_no_idtype():
    """Check the lookup of a 'DOI'-prefixed DOI when no idtype is given.

    All three identifiers in the result are compared against `example_ids`,
    and the result must pass `unicode_strs`.
    """
    prefixed_doi = 'DOI10.18632/oncotarget.2555'
    ids = pmc_client.id_lookup(prefixed_doi)
    for field in ('doi', 'pmid', 'pmcid'):
        assert ids[field] == example_ids[field]
    assert unicode_strs(ids)
Пример #12
0
def test_invalid_idtype():
    """Invoke the PMC ID lookup with idtype='foo', an unsupported type.

    No assertion is made inside the body.
    NOTE(review): the expected failure is presumably verified outside this
    block (e.g. by a decorator) -- confirm.
    """
    bad_idtype = 'foo'
    pmc_client.id_lookup('DOI10.18632/oncotarget.2555', idtype=bad_idtype)
Пример #13
0
def test_id_lookup_pmid_with_prefix_no_idtype():
    """Look up a 'PMID'-prefixed PMID without passing an idtype.

    The doi, pmid and pmcid returned must match `example_ids`, and the result
    must pass `unicode_strs`.
    """
    ids = pmc_client.id_lookup('PMID25361007')
    for key in ('doi', 'pmid', 'pmcid'):
        assert ids[key] == example_ids[key]
    assert unicode_strs(ids)
Пример #14
0
def test_id_lookup_pmid_with_prefix_no_idtype():
    """Check the lookup of a 'PMID'-prefixed PMID when no idtype is given.

    All three identifiers in the result are compared against `example_ids`,
    and the result must pass `unicode_strs`.
    """
    prefixed_pmid = 'PMID25361007'
    ids = pmc_client.id_lookup(prefixed_pmid)
    for field in ('doi', 'pmid', 'pmcid'):
        assert ids[field] == example_ids[field]
    assert unicode_strs(ids)
Пример #15
0
def test_id_lookup_pmid_no_prefix_no_idtype():
    """Look up a bare PMID number without passing an idtype.

    The doi, pmid and pmcid returned must match `example_ids`, and the result
    must pass `unicode_strs`.
    """
    ids = pmc_client.id_lookup("25361007")
    for key in ("doi", "pmid", "pmcid"):
        assert ids[key] == example_ids[key]
    assert unicode_strs(ids)
Пример #16
0
def test_invalid_idtype():
    """Run the PMC ID lookup with the unsupported idtype "foo".

    The body contains no assertion.
    NOTE(review): presumably an error is expected and checked by a decorator
    or runner configuration not visible here -- confirm.
    """
    pmc_client.id_lookup("DOI10.18632/oncotarget.2555", idtype="foo")
Пример #17
0
def test_id_lookup_doi_prefix_no_idtype():
    """Verify the lookup of a "DOI"-prefixed DOI with idtype left unset.

    Each of doi, pmid and pmcid in the result must equal the corresponding
    entry of `example_ids`, and the result must pass `unicode_strs`.
    """
    ids = pmc_client.id_lookup("DOI10.18632/oncotarget.2555")
    for id_name in ("doi", "pmid", "pmcid"):
        assert ids[id_name] == example_ids[id_name]
    assert unicode_strs(ids)
Пример #18
0
def test_id_lookup_pmcid_no_prefix_idtype():
    """Verify the lookup of a PMCID lacking the "PMC" prefix, idtype="pmcid".

    Each of doi, pmid and pmcid in the result must equal the corresponding
    entry of `example_ids`, and the result must pass `unicode_strs`.
    """
    ids = pmc_client.id_lookup("4322985", idtype="pmcid")
    for id_name in ("doi", "pmid", "pmcid"):
        assert ids[id_name] == example_ids[id_name]
    assert unicode_strs(ids)