def test_get_statements():
    """Exercise ``dbc.get_statements`` with several raw-statement queries.

    Covers an unfiltered query, a query restricted to statements that have a
    reading ID, a query that excludes a single UUID, and a reader-specific
    query run with ``fix_refs=False``.
    """
    sample_size = 200
    num_stmts = 10000
    db = _get_prepped_db(num_stmts)

    # An empty clause list is expected to return every statement.
    all_stmts = dbc.get_statements([], preassembled=False, db=db)
    assert len(all_stmts) == num_stmts, len(all_stmts)

    # Statements with a reading ID: every sampled statement must carry a
    # PMID, and the metadata lookup must return one entry per PMID.
    has_reading = [db.RawStatements.reading_id.isnot(None)]
    read_stmts = dbc.get_statements(has_reading, preassembled=False, db=db)
    sampled_pmids = {
        stmt.evidence[0].pmid
        for stmt in random.sample(read_stmts, sample_size)
    }
    assert sampled_pmids
    assert None not in sampled_pmids
    md_list = pubc.get_metadata_for_ids(list(sampled_pmids))
    assert len(md_list) == len(sampled_pmids), \
        (len(md_list), len(sampled_pmids))

    # Excluding exactly one UUID must drop exactly one statement.
    excluded_uuid = read_stmts[0].uuid
    all_but_one = dbc.get_statements(
        [db.RawStatements.uuid != excluded_uuid],
        preassembled=False,
        db=db,
    )
    assert len(all_but_one) == num_stmts - 1, len(all_but_one)

    # Without fix_refs, at least one sampled SPARSER statement is expected
    # to have no PMID on its first evidence.
    sparser_clauses = [
        db.RawStatements.reading_id.isnot(None),
        db.RawStatements.reading_id == db.Reading.id,
        db.Reading.reader == 'SPARSER',
    ]
    sparser_stmts = dbc.get_statements(sparser_clauses, preassembled=False,
                                       fix_refs=False, db=db)
    assert 0 < len(sparser_stmts) < num_stmts, len(sparser_stmts)
    unfixed_pmids = {
        stmt.evidence[0].pmid
        for stmt in random.sample(sparser_stmts, sample_size)
    }
    assert None in unfixed_pmids, unfixed_pmids
def test_pmid_27821631():
    """Check that abstract and title can be retrieved for PMID 27821631."""
    # Short pause before querying (presumably to avoid hitting the PubMed
    # service too rapidly between tests).
    time.sleep(0.5)
    pmid = '27821631'

    # The abstract fetched directly should be a non-trivial piece of text.
    abstract = pubmed_client.get_abstract(pmid)
    assert len(abstract) > 50, abstract

    # The metadata lookup with abstracts enabled should provide both a title
    # and a non-trivial abstract for the same PMID.
    metadata = pubmed_client.get_metadata_for_ids([pmid], get_abstracts=True)
    entry = metadata[pmid]
    assert entry['title'] is not None
    assert len(entry['abstract']) > 50
def test_pmid_27821631():
    """Check that abstract and title can be retrieved for PMID 27821631."""
    # Short pause before querying (presumably to avoid hitting the PubMed
    # service too rapidly between tests).
    time.sleep(0.3)
    pmid = '27821631'

    # The abstract fetched directly should be a non-trivial piece of text.
    abstract = pubmed_client.get_abstract(pmid)
    assert len(abstract) > 50, abstract

    # The metadata lookup with abstracts enabled should provide both a title
    # and a non-trivial abstract for the same PMID.
    metadata = pubmed_client.get_metadata_for_ids([pmid], get_abstracts=True)
    entry = metadata[pmid]
    assert entry['title'] is not None
    assert len(entry['abstract']) > 50
def test_get_pub_date():
    """Check the publication dates returned for three known PMIDs."""
    # Short pause before querying (presumably to avoid hitting the PubMed
    # service too rapidly between tests).
    time.sleep(0.5)
    pmids = ['27123883', '27121204', '27115606']
    # Expected (year, month, day), listed in the same order as the PMIDs.
    expected_dates = [(2016, 4, 29), (2016, 4, 29), (2016, 4, 27)]

    metadata = pubmed_client.get_metadata_for_ids(pmids)
    for pmid, (year, month, day) in zip(pmids, expected_dates):
        assert metadata[pmid]['publication_date']['year'] == year
        assert metadata[pmid]['publication_date']['month'] == month
        assert metadata[pmid]['publication_date']['day'] == day
def _get_pmid_titles(pmids):
    """Return a dict mapping each PMID to its article title.

    The PMIDs are looked up through ``pubmed_client.get_metadata_for_ids``
    in consecutive slices of at most 200 entries.

    Parameters
    ----------
    pmids : list
        PMIDs to look up; must support ``len`` and slicing.

    Returns
    -------
    dict
        Maps every key of the returned metadata to that entry's 'title'.
    """
    batch_size = 200
    titles_by_pmid = {}
    # Step through the input one batch-sized slice at a time; the final
    # slice is simply shorter when the length is not a multiple of 200.
    for offset in range(0, len(pmids), batch_size):
        chunk = pmids[offset:offset + batch_size]
        chunk_metadata = pubmed_client.get_metadata_for_ids(chunk)
        for pmid, entry in chunk_metadata.items():
            titles_by_pmid[pmid] = entry['title']
    return titles_by_pmid
def get_tr_metadata(ev_tr_dict):
    """Look up title, authors, journal and year for a text-ref dictionary.

    Sources are tried in this order: the CORD-19 lookup by DOI, Crossref by
    DOI, the CORD-19 lookup by PMID, and finally PubMed by PMID. The first
    source that has an entry determines the result.

    Parameters
    ----------
    ev_tr_dict : dict
        Text reference dictionary; only its 'DOI' and 'PMID' entries are
        read.

    Returns
    -------
    tuple
        ``(title, authors, journal, year)``. For a Crossref hit, title and
        journal may be None when Crossref lists none. If no source has an
        entry, four empty strings are returned.
    """
    def _from_cord(entry):
        # Shape a CORD-19 entry into the tuple returned by this function.
        return (entry['title'], entry['authors'], entry['journal'],
                entry['publish_time'].year)

    cord_by_doi, cord_by_pmid = get_cord_info()

    # First choice: resolve through the DOI, if there is one.
    doi = ev_tr_dict.get('DOI')
    if doi:
        cord_hit = cord_by_doi.get(doi)
        if cord_hit:
            return _from_cord(cord_hit)
        # Article not in CORD-19 corpus, get metadata from Crossref
        print("Querying crossref")
        cr_entry = crossref_client.get_metadata(doi)
        if cr_entry:
            # Authors normally have 'family' (and optionally 'given');
            # fall back to a plain 'name', and finally to an empty string
            # when the expected keys are missing.
            try:
                author_str = '; '.join([
                    f"{auth['family']}, {auth.get('given', '')}"
                    for auth in cr_entry['author']
                ])
            except KeyError:
                try:
                    author_str = '; '.join(
                        [f"{auth['name']}" for auth in cr_entry['author']])
                except KeyError:
                    author_str = ''
            # Crossref stores title and journal as lists; use the first
            # element, or None when the list is empty.
            titles = cr_entry['title']
            title = titles[0] if titles else None
            containers = cr_entry['container-title']
            journal = containers[0] if containers else None
            year = cr_entry['issued']['date-parts'][0][0]
            return (title, author_str, journal, year)

    # If we got here, then we haven't found the metadata yet, try by PMID
    pmid = ev_tr_dict.get('PMID')
    if pmid:
        cord_hit = cord_by_pmid.get(pmid)
        if cord_hit:
            return _from_cord(cord_hit)
        print("Querying Pubmed")
        pm_entry = pubmed_client.get_metadata_for_ids([pmid])
        if pm_entry:
            pm_md = pm_entry[pmid]
            return (pm_md['title'],
                    '; '.join(pm_md['authors']),
                    pm_md.get('journal_title', ''),
                    pm_md['publication_date']['year'])

    # No luck, return empty strings
    return ('', '', '', '')
def get_stmts_pmids_mesh(subject, stmt_type, object_list):
    """Collect statements, their evidence PMIDs, and the PMIDs' MeSH terms.

    Parameters
    ----------
    subject
        Passed through as the ``subject`` argument of ``idr.get_statements``.
    stmt_type
        Passed through as the ``stmt_type`` argument of
        ``idr.get_statements``.
    object_list : iterable
        Each element is used as the ``object`` argument of one query.

    Returns
    -------
    tuple
        ``(stmts, pmids, mesh_terms)``: the statements from all queries, the
        ``pmid`` of every evidence of those statements (one entry per
        evidence, in order), and the 'mesh' value of every MeSH annotation
        found in the metadata retrieved for those PMIDs.
    """
    # One query per object; pool all returned statements together.
    stmts = []
    for obj in object_list:
        response = idr.get_statements(subject=subject, object=obj,
                                      stmt_type=stmt_type, ev_limit=10000)
        stmts.extend(response.statements)

    # Collect the PMIDs for the stmts
    pmids = []
    for stmt in stmts:
        for evidence in stmt.evidence:
            pmids.append(evidence.pmid)

    # Retrieve metadata in batches of 200 PMIDs and gather the MeSH terms.
    mesh_terms = []
    for batch in batch_iter(pmids, 200):
        pmid_list = list(batch)
        print("Retrieving metadata for %d articles" % len(pmid_list))
        metadata = get_metadata_for_ids(pmid_list)
        for article_meta in metadata.values():
            for annotation in article_meta['mesh_annotations']:
                mesh_terms.append(annotation['mesh'])
    return (stmts, pmids, mesh_terms)
def test_get_metadata_for_ids():
    """Smoke test: the metadata lookup for three PMIDs must not raise."""
    # Short pause before querying (presumably to avoid hitting the PubMed
    # service too rapidly between tests).
    time.sleep(0.5)
    # The result is not inspected; only successful completion is checked.
    pubmed_client.get_metadata_for_ids(
        ['27123883', '27121204', '27115606'])
def doi_query(pmid, search_limit=10):
    """Get the DOI for a PMID by matching CrossRef and Pubmed metadata.

    Searches CrossRef using the article title and then accepts search hits
    only if they have a matching journal ISSN and page number with what
    is obtained from the Pubmed database.

    Parameters
    ----------
    pmid : str
        PMID to look up; also used as the key into the metadata returned by
        ``pubmed_client.get_metadata_for_ids``.
    search_limit : int
        Maximum number of CrossRef search results to examine before giving
        up. Default: 10

    Returns
    -------
    str or None
        The DOI listed in the Pubmed metadata if there is one; otherwise
        the DOI of the first examined CrossRef hit that shares an ISSN and
        the start page with the Pubmed record. None if required Pubmed
        metadata (record, title, ISSNs, page) is missing, if CrossRef
        cannot be queried successfully, if a hit's DOI URL does not have
        the expected form, or if no hit matches.
    """
    # Get article metadata from PubMed
    pubmed_meta_dict = pubmed_client.get_metadata_for_ids(
        [pmid], get_issns_from_nlm=True)
    if pubmed_meta_dict is None or pubmed_meta_dict.get(pmid) is None:
        logger.warning('No metadata found in Pubmed for PMID%s' % pmid)
        return None
    # The test above ensures we've got this now
    pubmed_meta = pubmed_meta_dict[pmid]
    # Check if we already got a DOI from Pubmed itself!
    if pubmed_meta.get('doi'):
        return pubmed_meta.get('doi')
    # Check for the title, which we'll need for the CrossRef search
    pm_article_title = pubmed_meta.get('title')
    if pm_article_title is None:
        logger.warning('No article title found in Pubmed for PMID%s' % pmid)
        return None
    # Get the ISSN list
    pm_issn_list = pubmed_meta.get('issn_list')
    if not pm_issn_list:
        logger.warning('No ISSNs found in Pubmed for PMID%s' % pmid)
        return None
    # Get the page number
    pm_page = pubmed_meta.get('page')
    if not pm_page:
        logger.debug('No page number found in Pubmed for PMID%s' % pmid)
        return None
    # Now query CrossRef using the title we've got
    params = {'q': pm_article_title, 'sort': 'score'}
    try:
        res = requests.get(crossref_search_url, params)
    except requests.exceptions.ConnectionError as e:
        logger.error('CrossRef service could not be reached.')
        logger.error(e)
        return None
    except Exception as e:
        logger.error('Error accessing CrossRef service: %s' % str(e))
        return None
    if res.status_code != 200:
        logger.info('PMID%s: no search results from CrossRef, code %d' %
                    (pmid, res.status_code))
        return None
    raw_message = res.json()
    mapped_doi = None
    # Iterate over the search results, looking up XREF metadata
    for result_ix, result in enumerate(raw_message):
        # result_ix is zero-based, so '>=' stops after exactly search_limit
        # results have been examined, as the log message states.
        if result_ix >= search_limit:
            logger.info('PMID%s: No match found within first %s results, '
                        'giving up!' % (pmid, search_limit))
            break
        xref_doi_url = result['doi']
        # Strip the URL prefix off of the DOI
        m = re.match('^http://dx.doi.org/(.*)$', xref_doi_url)
        if not m:
            logger.error('Could not match %s with DOI pattern.'
                         % xref_doi_url)
            return None
        xref_doi = m.groups()[0]
        # Get the XREF metadata using the DOI
        xref_meta = get_metadata(xref_doi)
        if xref_meta is None:
            continue
        xref_issn_list = xref_meta.get('ISSN')
        xref_page = xref_meta.get('page')
        # If there's no ISSN info for this article, skip to the next result
        if not xref_issn_list:
            logger.debug('No ISSN found for DOI %s, skipping' % xref_doi_url)
            continue
        # If there's no page info for this article, skip to the next result
        if not xref_page:
            logger.debug('No page number found for DOI %s, skipping' %
                         xref_doi_url)
            continue
        # Now check for an ISSN match by looking for the set intersection
        # between the Pubmed ISSN list and the CrossRef ISSN list.
        matching_issns = set(pm_issn_list).intersection(set(xref_issn_list))
        # Before comparing page numbers, regularize the page numbers a bit.
        # Note that we only compare the first page number, since frequently
        # the final page number will simply be missing in one of the data
        # sources. We also canonicalize page numbers of the form '14E' to
        # 'E14' (which is the format used by Pubmed).
        pm_start_page = pm_page.split('-')[0].upper()
        xr_start_page = xref_page.split('-')[0].upper()
        if xr_start_page.endswith('E'):
            xr_start_page = 'E' + xr_start_page[:-1]
        # Now compare the ISSN list and page numbers
        if matching_issns and pm_start_page == xr_start_page:
            # We found a match!
            mapped_doi = xref_doi
            break
        # Otherwise, keep looking through the results...
    # Return a DOI, or None if we didn't find one that met our matching
    # criteria
    return mapped_doi
def test_get_metadata_for_ids():
    """Check that metadata for three PMIDs passes the ``unicode_strs`` check."""
    pmid_list = ['27123883', '27121204', '27115606']
    result = pubmed_client.get_metadata_for_ids(pmid_list)
    assert unicode_strs(result)
def doi_query(pmid, search_limit=10):
    """Get the DOI for a PMID by matching CrossRef and Pubmed metadata.

    Searches CrossRef using the article title and then accepts search hits
    only if they have a matching journal ISSN and page number with what
    is obtained from the Pubmed database.

    Parameters
    ----------
    pmid : str
        PMID to look up; also used as the key into the metadata returned by
        ``pubmed_client.get_metadata_for_ids``.
    search_limit : int
        Maximum number of CrossRef search results to examine before giving
        up. Default: 10

    Returns
    -------
    str or None
        The DOI listed in the Pubmed metadata if there is one; otherwise
        the DOI of the first examined CrossRef hit that shares an ISSN and
        the start page with the Pubmed record. None if required Pubmed
        metadata (record, title, ISSNs, page) is missing, if CrossRef
        cannot be reached or returns a non-200 status, if a hit's DOI URL
        does not have the expected form, or if no hit matches.
    """
    # Get article metadata from PubMed
    pubmed_meta_dict = pubmed_client.get_metadata_for_ids(
        [pmid], get_issns_from_nlm=True)
    if pubmed_meta_dict is None or pubmed_meta_dict.get(pmid) is None:
        logger.warning('No metadata found in Pubmed for PMID%s' % pmid)
        return None
    # The test above ensures we've got this now
    pubmed_meta = pubmed_meta_dict[pmid]
    # Check if we already got a DOI from Pubmed itself!
    if pubmed_meta.get('doi'):
        return pubmed_meta.get('doi')
    # Check for the title, which we'll need for the CrossRef search
    pm_article_title = pubmed_meta.get('title')
    if pm_article_title is None:
        logger.warning('No article title found in Pubmed for PMID%s' % pmid)
        return None
    # Get the ISSN list
    pm_issn_list = pubmed_meta.get('issn_list')
    if not pm_issn_list:
        logger.warning('No ISSNs found in Pubmed for PMID%s' % pmid)
        return None
    # Get the page number
    pm_page = pubmed_meta.get('page')
    if not pm_page:
        logger.debug('No page number found in Pubmed for PMID%s' % pmid)
        return None
    # Now query CrossRef using the title we've got
    params = {'q': pm_article_title, 'sort': 'score'}
    try:
        res = requests.get(crossref_search_url, params)
    except requests.exceptions.ConnectionError as e:
        logger.error('CrossRef service could not be reached.')
        logger.error(e)
        return None
    if res.status_code != 200:
        logger.info('PMID%s: no search results from CrossRef, code %d' %
                    (pmid, res.status_code))
        return None
    raw_message = res.json()
    mapped_doi = None
    # Iterate over the search results, looking up XREF metadata
    for result_ix, result in enumerate(raw_message):
        # result_ix is zero-based, so '>=' stops after exactly search_limit
        # results have been examined, as the log message states.
        if result_ix >= search_limit:
            logger.info('PMID%s: No match found within first %s results, '
                        'giving up!' % (pmid, search_limit))
            break
        xref_doi_url = result['doi']
        # Strip the URL prefix off of the DOI
        m = re.match('^http://dx.doi.org/(.*)$', xref_doi_url)
        # re.match returns None for a URL that lacks the expected prefix;
        # without this guard m.groups() would raise AttributeError.
        if not m:
            logger.error('Could not match %s with DOI pattern.'
                         % xref_doi_url)
            return None
        xref_doi = m.groups()[0]
        # Get the XREF metadata using the DOI
        xref_meta = get_metadata(xref_doi)
        if xref_meta is None:
            continue
        xref_issn_list = xref_meta.get('ISSN')
        xref_page = xref_meta.get('page')
        # If there's no ISSN info for this article, skip to the next result
        if not xref_issn_list:
            logger.debug('No ISSN found for DOI %s, skipping' % xref_doi_url)
            continue
        # If there's no page info for this article, skip to the next result
        if not xref_page:
            logger.debug('No page number found for DOI %s, skipping' %
                         xref_doi_url)
            continue
        # Now check for an ISSN match by looking for the set intersection
        # between the Pubmed ISSN list and the CrossRef ISSN list.
        matching_issns = set(pm_issn_list).intersection(set(xref_issn_list))
        # Before comparing page numbers, regularize the page numbers a bit.
        # Note that we only compare the first page number, since frequently
        # the final page number will simply be missing in one of the data
        # sources. We also canonicalize page numbers of the form '14E' to
        # 'E14' (which is the format used by Pubmed).
        pm_start_page = pm_page.split('-')[0].upper()
        xr_start_page = xref_page.split('-')[0].upper()
        if xr_start_page.endswith('E'):
            xr_start_page = 'E' + xr_start_page[:-1]
        # Now compare the ISSN list and page numbers
        if matching_issns and pm_start_page == xr_start_page:
            # We found a match!
            mapped_doi = xref_doi
            break
        # Otherwise, keep looking through the results...
    # Return a DOI, or None if we didn't find one that met our matching
    # criteria
    return mapped_doi
import pickle

from indra.literature import pubmed_client

# Load the mapping whose values are lists of PMIDs. Pickle data is binary,
# so the file has to be opened in 'rb' mode.
# NOTE(review): pickle.load can execute arbitrary code contained in the
# file; only run this against a trusted, locally produced pickle.
with open('pmids_from_gene.pkl', 'rb') as f:
    pmids = pickle.load(f)

# Flatten the per-key PMID lists, then de-duplicate and sort numerically.
total_pmids = [
    pmid for gene, pmid_list in pmids.items() for pmid in pmid_list
]
unique_pmids = sorted(set(total_pmids), key=int)
num_ids = len(unique_pmids)

# Fetch the metadata in chunks of at most chunk_size PMIDs per request.
chunk_size = 200
start_indices = range(0, num_ids, chunk_size)
results = {}
for start_ix in start_indices:
    # Progress indicator: index of the first PMID of the current chunk.
    print(start_ix)
    # The last chunk may be shorter than chunk_size.
    end_ix = min(start_ix + chunk_size, num_ids)
    results.update(
        pubmed_client.get_metadata_for_ids(unique_pmids[start_ix:end_ix]))
total_pmids = [ pmid for gene, pmid_list in pmids.iteritems() for pmid in pmid_list ] unique_pmids = sorted(list(set(total_pmids)), key=lambda x: int(x)) # Iterate over the PMIDs in the list counter = 0 mismatch = [] for pmid in unique_pmids: counter += 1 pmid_result = pmid_map.get(pmid) if pmid_result and pmid_result[1]: doi = pmid_result[1] # Lookup the metadata for the pub in pubmed #pubmed_meta = pubmed_metadata[pmid] pubmed_meta = pubmed_client.get_metadata_for_ids([pmid])[pmid] # Look up the metadata for the pub in CrossRef xref_meta = xref_metadata.get(doi) if xref_meta is None: print counter, pmid, doi, "Not found in Xref, skipping" continue xr_issn_list = xref_meta.get('ISSN') if xr_issn_list is None: print counter, pmid, doi, "No ISSNs in XREF, skipping" continue # PM ISSN nlm_id = pubmed_meta['journal_nlm_id'] pm_issn = pubmed_meta['issn'] pm_issn_linking = pubmed_meta['issn_linking'] pm_issn_list = pubmed_client.get_issns_for_journal(nlm_id) if pm_issn_list is None:
def test_get_metadata_for_ids():
    """Check that metadata for three PMIDs passes the ``unicode_strs`` check."""
    # Short pause before querying (presumably to avoid hitting the PubMed
    # service too rapidly between tests).
    time.sleep(0.3)
    pmid_list = ['27123883', '27121204', '27115606']
    result = pubmed_client.get_metadata_for_ids(pmid_list)
    assert unicode_strs(result)
import pickle

from indra.literature import pubmed_client

# Load the mapping whose values are lists of PMIDs. Pickle data is binary,
# so the file has to be opened in 'rb' mode.
# NOTE(review): pickle.load can execute arbitrary code contained in the
# file; only run this against a trusted, locally produced pickle.
with open('pmids_from_gene.pkl', 'rb') as f:
    pmids = pickle.load(f)

# Flatten the per-key PMID lists, then de-duplicate and sort numerically.
total_pmids = [pmid for gene, pmid_list in pmids.items()
               for pmid in pmid_list]
unique_pmids = sorted(set(total_pmids), key=int)
num_ids = len(unique_pmids)

# Fetch the metadata in chunks of at most chunk_size PMIDs per request.
chunk_size = 200
start_indices = range(0, num_ids, chunk_size)
results = {}
for start_ix in start_indices:
    # Progress indicator: index of the first PMID of the current chunk.
    print(start_ix)
    # The last chunk may be shorter than chunk_size.
    end_ix = min(start_ix + chunk_size, num_ids)
    results.update(pubmed_client.get_metadata_for_ids(
        unique_pmids[start_ix:end_ix]))
total_pmids = [pmid for gene, pmid_list in pmids.iteritems() for pmid in pmid_list] unique_pmids = sorted(list(set(total_pmids)), key=lambda x: int(x)) # Iterate over the PMIDs in the list counter = 0 mismatch = [] for pmid in unique_pmids: counter += 1 pmid_result = pmid_map.get(pmid) if pmid_result and pmid_result[1]: doi = pmid_result[1] # Lookup the metadata for the pub in pubmed #pubmed_meta = pubmed_metadata[pmid] pubmed_meta = pubmed_client.get_metadata_for_ids([pmid])[pmid] # Look up the metadata for the pub in CrossRef xref_meta = xref_metadata.get(doi) if xref_meta is None: print counter, pmid, doi, "Not found in Xref, skipping" continue xr_issn_list = xref_meta.get('ISSN') if xr_issn_list is None: print counter, pmid, doi, "No ISSNs in XREF, skipping" continue # PM ISSN nlm_id = pubmed_meta['journal_nlm_id'] pm_issn = pubmed_meta['issn'] pm_issn_linking = pubmed_meta['issn_linking'] pm_issn_list = pubmed_client.get_issns_for_journal(nlm_id) if pm_issn_list is None: