Example #1
def test_get_statements():
    num_stmts = 10000
    db = _get_prepped_db(num_stmts)

    # Test getting all statements
    stmts = dbc.get_statements([], preassembled=False, db=db)
    assert len(stmts) == num_stmts, len(stmts)

    stmts = dbc.get_statements([db.RawStatements.reading_id.isnot(None)],
                               preassembled=False,
                               db=db)
    pmids = {s.evidence[0].pmid for s in random.sample(stmts, 200)}
    assert pmids
    assert None not in pmids
    md_list = pubc.get_metadata_for_ids(list(pmids))
    assert len(md_list) == len(pmids), (len(md_list), len(pmids))

    # Test getting some statements
    stmt_uuid = stmts[0].uuid
    stmts = dbc.get_statements([db.RawStatements.uuid != stmt_uuid],
                               preassembled=False,
                               db=db)
    assert len(stmts) == num_stmts - 1, len(stmts)

    # Test getting statements without fix refs.
    stmts = dbc.get_statements(
        [db.RawStatements.reading_id.isnot(None),
         db.RawStatements.reading_id == db.Reading.id,
         db.Reading.reader == 'SPARSER'],
        preassembled=False,
        fix_refs=False,
        db=db)
    assert 0 < len(stmts) < num_stmts, len(stmts)
    pmids = {s.evidence[0].pmid for s in random.sample(stmts, 200)}
    assert None in pmids, pmids
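
A minimal sketch of the metadata lookup exercised above, assuming indra.literature.pubmed_client and PMIDs taken from the tests on this page; get_metadata_for_ids returns a dict keyed by PMID, so each valid requested ID should come back as a key.

from indra.literature import pubmed_client

# Example PMIDs taken from the tests on this page.
pmids = ['27821631', '27123883']
metadata = pubmed_client.get_metadata_for_ids(pmids)
# The result is a dict keyed by PMID; check that nothing was dropped.
missing = set(pmids) - set(metadata)
assert not missing, missing
print(metadata['27821631']['title'])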
Example #2
def test_pmid_27821631():
    time.sleep(0.5)
    pmid = '27821631'
    res = pubmed_client.get_abstract(pmid)
    assert len(res) > 50, res
    res = pubmed_client.get_metadata_for_ids([pmid], get_abstracts=True)
    assert res[pmid]['title'] is not None
    assert len(res[pmid]['abstract']) > 50
Example #3
def test_pmid_27821631():
    time.sleep(0.3)
    pmid = '27821631'
    res = pubmed_client.get_abstract(pmid)
    assert len(res) > 50, res
    res = pubmed_client.get_metadata_for_ids([pmid], get_abstracts=True)
    assert res[pmid]['title'] is not None
    assert len(res[pmid]['abstract']) > 50
Example #4
def test_get_pub_date():
    time.sleep(0.5)
    pmids = ['27123883', '27121204', '27115606']
    metadata = pubmed_client.get_metadata_for_ids(pmids)
    assert metadata[pmids[0]]['publication_date']['year'] == 2016
    assert metadata[pmids[0]]['publication_date']['month'] == 4
    assert metadata[pmids[0]]['publication_date']['day'] == 29
    assert metadata[pmids[1]]['publication_date']['year'] == 2016
    assert metadata[pmids[1]]['publication_date']['month'] == 4
    assert metadata[pmids[1]]['publication_date']['day'] == 29
    assert metadata[pmids[2]]['publication_date']['year'] == 2016
    assert metadata[pmids[2]]['publication_date']['month'] == 4
    assert metadata[pmids[2]]['publication_date']['day'] == 27
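
The assertions above pin down the shape of the publication_date entry: a dict with integer year, month, and day fields. A short sketch, assuming only those keys, that turns it into a datetime.date:

import datetime

from indra.literature import pubmed_client

metadata = pubmed_client.get_metadata_for_ids(['27123883'])
pub = metadata['27123883']['publication_date']
# According to the assertions above this yields 2016-04-29.
pub_date = datetime.date(pub['year'], pub['month'], pub['day'])
print(pub_date)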
Example #5
def _get_pmid_titles(pmids):
    # Query PubMed metadata in batches of 200 PMIDs and collect each title.
    pmids_to_titles = {}
    n = 200
    n_batches = len(pmids) // n
    if len(pmids) % n:
        n_batches += 1
    for i in range(n_batches):
        start = n * i
        end = start + n
        batch = pmids[start:end]
        m = pubmed_client.get_metadata_for_ids(batch)
        for pmid, metadata in m.items():
            pmids_to_titles[pmid] = metadata['title']
    return pmids_to_titles
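
A hypothetical call of the batching helper above; the 200-ID chunk size matches the batch sizes used in the other scripts on this page.

# Stand-in PMID list; in practice this can be arbitrarily long.
pmids = ['27123883', '27121204', '27115606']
pmids_to_titles = _get_pmid_titles(pmids)
for pmid, title in pmids_to_titles.items():
    print(pmid, title)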
Example #6
def get_tr_metadata(ev_tr_dict):
    cord_by_doi, cord_by_pmid = get_cord_info()
    # If has DOI, look up in CORD19
    title, authors, journal, date = (None, None, None, None)
    if ev_tr_dict.get('DOI'):
        doi = ev_tr_dict['DOI']
        cord_entry = cord_by_doi.get(doi)
        if cord_entry:
            return (cord_entry['title'], cord_entry['authors'],
                    cord_entry['journal'], cord_entry['publish_time'].year)
        # Article not in CORD-19 corpus, get metadata from Crossref
        print("Querying crossref")
        cr_entry = crossref_client.get_metadata(doi)
        if cr_entry:
            try:
                author_str = '; '.join([
                    f"{auth['family']}, {auth.get('given', '')}"
                    for auth in cr_entry['author']
                ])
            except KeyError:
                try:
                    author_str = '; '.join(
                        [f"{auth['name']}" for auth in cr_entry['author']])
                except KeyError:
                    author_str = ''
            title_list = cr_entry['title']
            if title_list:
                title = title_list[0]
            container_list = cr_entry['container-title']
            if container_list:
                journal = container_list[0]
            return (title, author_str, journal,
                    cr_entry['issued']['date-parts'][0][0])
    # If we got here, then we haven't found the metadata yet, try by PMID
    if ev_tr_dict.get('PMID'):
        pmid = ev_tr_dict['PMID']
        cord_entry = cord_by_pmid.get(pmid)
        if cord_entry:
            return (cord_entry['title'], cord_entry['authors'],
                    cord_entry['journal'], cord_entry['publish_time'].year)
        print("Querying Pubmed")
        pm_entry = pubmed_client.get_metadata_for_ids([pmid])
        if pm_entry:
            pm_md = pm_entry[pmid]
            author_str = '; '.join(pm_md['authors'])
            return (pm_md['title'], author_str, pm_md.get('journal_title', ''),
                    pm_md['publication_date']['year'])
    # No luck, return empty strings
    return ('', '', '', '')
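
A hypothetical call of get_tr_metadata: the function only looks at the DOI and PMID keys of its input dict and always returns a (title, authors, journal, year) tuple, falling back through CORD-19, CrossRef, and PubMed as shown above.

# With no DOI present, the PubMed branch is used.
ev_tr = {'PMID': '27821631'}
title, authors, journal, year = get_tr_metadata(ev_tr)
print(title, journal, year)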
Example #7
def get_stmts_pmids_mesh(subject, stmt_type, object_list):
    stmts = []
    for obj in object_list:
        idrp = idr.get_statements(subject=subject,
                                  object=obj,
                                  stmt_type=stmt_type,
                                  ev_limit=10000)
        stmts += idrp.statements

    # Collect the PMIDs for the stmts
    pmids = [e.pmid for s in stmts for e in s.evidence]

    mesh_terms = []
    for batch in batch_iter(pmids, 200):
        pmid_list = list(batch)
        print("Retrieving metadata for %d articles" % len(pmid_list))
        metadata = get_metadata_for_ids(pmid_list)
        for pmid, pmid_meta in metadata.items():
            mesh_terms += [d['mesh'] for d in pmid_meta['mesh_annotations']]
    return (stmts, pmids, mesh_terms)
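
A follow-on sketch (not part of the original function) showing one typical use of the output: tallying the most frequent MeSH terms with collections.Counter. The subject, statement type, and object below are hypothetical placeholders.

from collections import Counter

stmts, pmids, mesh_terms = get_stmts_pmids_mesh('TNF', 'Activation', ['IL6'])
for mesh_id, count in Counter(mesh_terms).most_common(10):
    print(mesh_id, count)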
Example #8
def test_get_metadata_for_ids():
    time.sleep(0.5)
    pmids = ['27123883', '27121204', '27115606']
    metadata = pubmed_client.get_metadata_for_ids(pmids)
Example #9
def doi_query(pmid, search_limit=10):
    """Get the DOI for a PMID by matching CrossRef and Pubmed metadata.

    Searches CrossRef using the article title and then accepts search hits only
    if they have a matching journal ISSN and page number with what is obtained
    from the Pubmed database.
    """
    # Get article metadata from PubMed
    pubmed_meta_dict = pubmed_client.get_metadata_for_ids([pmid],
                                                        get_issns_from_nlm=True)
    if pubmed_meta_dict is None or pubmed_meta_dict.get(pmid) is None:
        logger.warning('No metadata found in Pubmed for PMID%s' % pmid)
        return None
    # The test above ensures we've got this now
    pubmed_meta = pubmed_meta_dict[pmid]
    # Check if we already got a DOI from Pubmed itself!
    if pubmed_meta.get('doi'):
        return pubmed_meta.get('doi')
    # Check for the title, which we'll need for the CrossRef search
    pm_article_title = pubmed_meta.get('title')
    if pm_article_title is None:
        logger.warning('No article title found in Pubmed for PMID%s' % pmid)
        return None
    # Get the ISSN list
    pm_issn_list = pubmed_meta.get('issn_list')
    if not pm_issn_list:
        logger.warning('No ISSNs found in Pubmed for PMID%s' % pmid)
        return None
    # Get the page number
    pm_page = pubmed_meta.get('page')
    if not pm_page:
        logger.debug('No page number found in Pubmed for PMID%s' % pmid)
        return None
    # Now query CrossRef using the title we've got
    url = crossref_search_url
    params = {'q': pm_article_title, 'sort': 'score'}
    try:
        res = requests.get(url, params=params)
    except requests.exceptions.ConnectionError as e:
        logger.error('CrossRef service could not be reached.')
        logger.error(e)
        return None
    except Exception as e:
        logger.error('Error accessing CrossRef service: %s' % str(e))
        return None
    if res.status_code != 200:
        logger.info('PMID%s: no search results from CrossRef, code %d' %
                    (pmid, res.status_code))
        return None
    raw_message = res.json()
    mapped_doi = None
    # Iterate over the search results, looking up XREF metadata
    for result_ix, result in enumerate(raw_message):
        if result_ix > search_limit:
            logger.info('PMID%s: No match found within first %s results, '
                        'giving up!' % (pmid, search_limit))
            break
        xref_doi_url = result['doi']
        # Strip the URL prefix off of the DOI
        m = re.match('^http://dx.doi.org/(.*)$', xref_doi_url)
        if not m:
            logger.error('Could not match %s with DOI pattern.' % xref_doi_url)
            return None
        xref_doi = m.groups()[0]
        # Get the XREF metadata using the DOI
        xref_meta = get_metadata(xref_doi)
        if xref_meta is None:
            continue
        xref_issn_list = xref_meta.get('ISSN')
        xref_page = xref_meta.get('page')
        # If there's no ISSN info for this article, skip to the next result
        if not xref_issn_list:
            logger.debug('No ISSN found for DOI %s, skipping' % xref_doi_url)
            continue
        # If there's no page info for this article, skip to the next result
        if not xref_page:
            logger.debug('No page number found for DOI %s, skipping' %
                          xref_doi_url)
            continue
        # Now check for an ISSN match by looking for the set intersection
        # between the Pubmed ISSN list and the CrossRef ISSN list.
        matching_issns = set(pm_issn_list).intersection(set(xref_issn_list))
        # Before comparing page numbers, regularize the page numbers a bit.
        # Note that we only compare the first page number, since frequently
        # the final page number will simply be missing in one of the data
        # sources. We also canonicalize page numbers of the form '14E' to
        # 'E14' (which is the format used by Pubmed).
        pm_start_page = pm_page.split('-')[0].upper()
        xr_start_page = xref_page.split('-')[0].upper()
        if xr_start_page.endswith('E'):
            xr_start_page = 'E' + xr_start_page[:-1]
        # Now compare the ISSN list and page numbers
        if matching_issns and pm_start_page == xr_start_page:
            # We found a match!
            mapped_doi = xref_doi
            break
        # Otherwise, keep looking through the results...
    # Return a DOI, or None if we didn't find one that met our matching
    # criteria
    return mapped_doi
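
A hypothetical usage of doi_query above: it returns the matched DOI as a string, or None when no CrossRef hit passes the ISSN and page-number checks.

doi = doi_query('27123883')
if doi is None:
    print('No matching DOI found')
else:
    print('Mapped DOI:', doi)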
Example #10
def test_get_metadata_for_ids():
    pmids = ['27123883', '27121204', '27115606']
    metadata = pubmed_client.get_metadata_for_ids(pmids)
    assert unicode_strs(metadata)
Example #11
def doi_query(pmid, search_limit=10):
    """Get the DOI for a PMID by matching CrossRef and Pubmed metadata.

    Searches CrossRef using the article title and then accepts search hits only
    if they have a matching journal ISSN and page number with what is obtained
    from the Pubmed database.
    """
    # Get article metadata from PubMed
    pubmed_meta_dict = pubmed_client.get_metadata_for_ids(
        [pmid], get_issns_from_nlm=True)
    if pubmed_meta_dict is None or pubmed_meta_dict.get(pmid) is None:
        logger.warning('No metadata found in Pubmed for PMID%s' % pmid)
        return None
    # The test above ensures we've got this now
    pubmed_meta = pubmed_meta_dict[pmid]
    # Check if we already got a DOI from Pubmed itself!
    if pubmed_meta.get('doi'):
        return pubmed_meta.get('doi')
    # Check for the title, which we'll need for the CrossRef search
    pm_article_title = pubmed_meta.get('title')
    if pm_article_title is None:
        logger.warning('No article title found in Pubmed for PMID%s' % pmid)
        return None
    # Get the ISSN list
    pm_issn_list = pubmed_meta.get('issn_list')
    if not pm_issn_list:
        logger.warning('No ISSNs found in Pubmed for PMID%s' % pmid)
        return None
    # Get the page number
    pm_page = pubmed_meta.get('page')
    if not pm_page:
        logger.debug('No page number found in Pubmed for PMID%s' % pmid)
        return None
    # Now query CrossRef using the title we've got
    url = crossref_search_url
    params = {'q': pm_article_title, 'sort': 'score'}
    try:
        res = requests.get(url, params=params)
    except requests.exceptions.ConnectionError as e:
        logger.error('CrossRef service could not be reached.')
        logger.error(e)
        return None
    if res.status_code != 200:
        logger.info('PMID%s: no search results from CrossRef, code %d' %
                    (pmid, res.status_code))
        return None
    raw_message = res.json()
    mapped_doi = None
    # Iterate over the search results, looking up XREF metadata
    for result_ix, result in enumerate(raw_message):
        if result_ix > search_limit:
            logger.info('PMID%s: No match found within first %s results, '
                        'giving up!' % (pmid, search_limit))
            break
        xref_doi_url = result['doi']
        # Strip the URL prefix off of the DOI
        m = re.match('^http://dx.doi.org/(.*)$', xref_doi_url)
        if not m:
            logger.error('Could not match %s with DOI pattern.' % xref_doi_url)
            return None
        xref_doi = m.groups()[0]
        # Get the XREF metadata using the DOI
        xref_meta = get_metadata(xref_doi)
        if xref_meta is None:
            continue
        xref_issn_list = xref_meta.get('ISSN')
        xref_page = xref_meta.get('page')
        # If there's no ISSN info for this article, skip to the next result
        if not xref_issn_list:
            logger.debug('No ISSN found for DOI %s, skipping' % xref_doi_url)
            continue
        # If there's no page info for this article, skip to the next result
        if not xref_page:
            logger.debug('No page number found for DOI %s, skipping' %
                         xref_doi_url)
            continue
        # Now check for an ISSN match by looking for the set intersection
        # between the Pubmed ISSN list and the CrossRef ISSN list.
        matching_issns = set(pm_issn_list).intersection(set(xref_issn_list))
        # Before comparing page numbers, regularize the page numbers a bit.
        # Note that we only compare the first page number, since frequently
        # the final page number will simply be missing in one of the data
        # sources. We also canonicalize page numbers of the form '14E' to
        # 'E14' (which is the format used by Pubmed).
        pm_start_page = pm_page.split('-')[0].upper()
        xr_start_page = xref_page.split('-')[0].upper()
        if xr_start_page.endswith('E'):
            xr_start_page = 'E' + xr_start_page[:-1]
        # Now compare the ISSN list and page numbers
        if matching_issns and pm_start_page == xr_start_page:
            # We found a match!
            mapped_doi = xref_doi
            break
        # Otherwise, keep looking through the results...
    # Return a DOI, or None if we didn't find one that met our matching
    # criteria
    return mapped_doi
Example #12
from indra.literature import pubmed_client
import pickle

# Load a dict mapping gene names to lists of PMIDs.
with open('pmids_from_gene.pkl', 'rb') as f:
    pmids = pickle.load(f)

total_pmids = [
    pmid for gene, pmid_list in pmids.items() for pmid in pmid_list
]
unique_pmids = sorted(set(total_pmids), key=lambda x: int(x))

# Query PubMed metadata in chunks of 200 IDs per request.
num_ids = len(unique_pmids)
chunk_size = 200
start_indices = range(0, num_ids, chunk_size)

results = {}
for start_ix in start_indices:
    print(start_ix)
    if start_ix + chunk_size < num_ids:
        end_ix = start_ix + chunk_size
    else:
        end_ix = num_ids
    results.update(
        pubmed_client.get_metadata_for_ids(unique_pmids[start_ix:end_ix]))
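
As a side note, the explicit end_ix computation above can be dropped: Python list slices clamp at the end of the list, so a sketch of the same chunking is simply:

results = {}
for start_ix in range(0, len(unique_pmids), 200):
    batch = unique_pmids[start_ix:start_ix + 200]
    results.update(pubmed_client.get_metadata_for_ids(batch))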
Example #13
total_pmids = [
    pmid for gene, pmid_list in pmids.items() for pmid in pmid_list
]
unique_pmids = sorted(set(total_pmids), key=lambda x: int(x))

# Iterate over the PMIDs in the list
counter = 0
mismatch = []
for pmid in unique_pmids:
    counter += 1
    pmid_result = pmid_map.get(pmid)
    if pmid_result and pmid_result[1]:
        doi = pmid_result[1]
        # Look up the metadata for the pub in Pubmed
        pubmed_meta = pubmed_client.get_metadata_for_ids([pmid])[pmid]
        # Look up the metadata for the pub in CrossRef
        xref_meta = xref_metadata.get(doi)
        if xref_meta is None:
            print(counter, pmid, doi, "Not found in Xref, skipping")
            continue
        xr_issn_list = xref_meta.get('ISSN')
        if xr_issn_list is None:
            print(counter, pmid, doi, "No ISSNs in XREF, skipping")
            continue
        # Gather the Pubmed ISSNs for this journal
        nlm_id = pubmed_meta['journal_nlm_id']
        pm_issn = pubmed_meta['issn']
        pm_issn_linking = pubmed_meta['issn_linking']
        pm_issn_list = pubmed_client.get_issns_for_journal(nlm_id)
        if pm_issn_list is None:
Example #14
def test_get_metadata_for_ids():
    time.sleep(0.3)
    pmids = ['27123883', '27121204', '27115606']
    metadata = pubmed_client.get_metadata_for_ids(pmids)
    assert unicode_strs(metadata)