import eutils

# `db` is assumed to be an existing pymongo database handle defined elsewhere.


def main(taxon):
    # Select genes for the given taxon that already have an Entrez gene id,
    # returning only the fields we need.
    query = db.gene_detail.find(
        {"entrezgene": {"$ne": ""}, "Taxonomy_Id": taxon},
        {"ensembl_gene_id": 1, "entrezgene": 1})
    ec = eutils.Client()
    for result in query:
        gene_id = int(result["entrezgene"])
        try:
            gene = ec.efetch(db='gene', id=gene_id)
        except Exception:
            print(str(result["entrezgene"]) + "\t"
                  + result["ensembl_gene_id"] + "\tnot found")
            continue
        detail = gene.entrezgenes[0]
        summary = detail.summary
        synonyms = detail.synonyms
        # Skip genes that have neither a summary nor synonyms.
        if not summary and not synonyms:
            continue
        record = {
            'ensembl_gene_id': result["ensembl_gene_id"],
            'summary': summary,
            'synonyms': synonyms
        }
        db.summary.insert_one(record)
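# A minimal usage sketch (assumes `db` is a pymongo database handle; the
# connection details are illustrative, not part of the original code):
#
#     from pymongo import MongoClient
#     db = MongoClient()["genedb"]
#     main(9606)  # NCBI Taxonomy id for Homo sapiens
#                 # (the type must match how Taxonomy_Id is stored)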
import eutils
import pandas as pd

# `db` is assumed to be an existing pymongo database handle defined elsewhere.


def main(inputfile):
    genelist = pd.read_csv(inputfile)
    ec = eutils.Client()  # one client reused for all lookups
    for ensembl_id in genelist["gene_ID"]:
        # Map the Ensembl gene id to an Entrez gene id.
        esr = ec.esearch(db='gene', term=ensembl_id)
        if esr.count == 0:
            print(ensembl_id + " can't match gene id")
            continue
        if esr.count > 1:
            print(ensembl_id + " can match multiple gene ids")
        gene_id = esr.ids[0]
        gene = ec.efetch(db='gene', id=gene_id)
        detail = gene.entrezgenes[0]
        summary = detail.summary
        synonyms = detail.synonyms
        record = {
            'ensembl_gene_id': ensembl_id,
            'summary': summary,
            'synonyms': synonyms
        }
        db.summary.insert_one(record)
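# A minimal usage sketch (assumes `db` is a pymongo database handle and the
# input CSV has a "gene_ID" column of Ensembl gene ids; the file name is
# illustrative):
#
#     main("ensembl_gene_ids.csv")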
import argparse

import eutils

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ESummary', epilog='')
    parser.add_argument('db', help='Database to use')
    parser.add_argument('--id_list', help='list of ids')
    parser.add_argument('--id', help='Comma separated individual IDs')
    parser.add_argument('--history_file', help='Filter existing history')
    parser.add_argument('--user_email', help="User email")
    parser.add_argument('--admin_email', help="Admin email")
    args = parser.parse_args()

    c = eutils.Client(history_file=args.history_file,
                      user_email=args.user_email,
                      admin_email=args.admin_email)

    merged_ids = c.parse_ids(args.id_list, args.id, args.history_file)

    payload = {
        'db': args.db,
    }

    # Prefer a stored history over an explicit id list when one is given.
    if args.history_file is not None:
        payload.update(c.get_history())
    else:
        payload['id'] = ','.join(merged_ids)

    print(c.summary(**payload))
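# Example invocation (a sketch; the script name and flag values are
# illustrative, and the flags match the parser defined above):
#
#     python esummary.py pubmed --id 15718680,20210808 \
#         --user_email "user@example.org" --admin_email "admin@example.org"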
import argparse

import eutils

parser = argparse.ArgumentParser(description='ECitMatch', epilog='')
parser.add_argument('--file', type=argparse.FileType('r'),
                    help='Tabular file containing citations to search')
parser.add_argument('--key', nargs='*', help='Citation Key')
parser.add_argument('--journal_title', nargs='*', help='Journal Title')
parser.add_argument('--year', nargs='*', help='Year')
parser.add_argument('--volume', nargs='*', help='Volume')
parser.add_argument('--first_page', nargs='*', help='First Page')
parser.add_argument('--author_name', nargs='*', help='Author name')
# Emails
parser.add_argument('--user_email', help="User email")
parser.add_argument('--admin_email', help="Admin email")
args = parser.parse_args()

c = eutils.Client(user_email=args.user_email, admin_email=args.admin_email)

citations = []
if args.file is None:
    # Build one citation dict per parallel set of command-line values.
    for key, journal, year, volume, first_page, author_name in \
            zip(args.key, args.journal_title, args.year, args.volume,
                args.first_page, args.author_name):
        citations.append({
            'key': key,
            'journal': journal,
            'year': year,
            'volume': volume,
            'first_page': first_page,
            'author_name': author_name,
        })
else:
    # The original snippet breaks off here; a plausible completion, assuming
    # one tab-separated citation per line in the same field order as above:
    for line in args.file:
        key, journal, year, volume, first_page, author_name = \
            line.strip().split('\t')
        citations.append({
            'key': key,
            'journal': journal,
            'year': year,
            'volume': volume,
            'first_page': first_page,
            'author_name': author_name,
        })
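# Example invocation (a sketch; the script name and citation values are
# illustrative, and the flags match the parser defined above):
#
#     python ecitmatch.py --key citation_1 \
#         --journal_title "proc natl acad sci u s a" --year 1991 \
#         --volume 88 --first_page 3248 --author_name "mann bj"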
import re
import urllib.parse

import eutils
import requests

import gnomics.objects.reference

# NOTE: eutils_param_process() is a helper defined elsewhere in this module;
# it URL-encodes a value and appends the given PubMed field tag.


def eutils_search(db="PubMed", exact=False, raw=False, retmode=None,
                  retmax=None, sort=None, unlabeled_string=None,
                  affiliation=None, article_identifier=None, all_fields=None,
                  author=None, author_identifier=None, book=None,
                  corporate_author=None, create_date=None,
                  completion_date=None, conflict_of_interest=None,
                  ec_rn_number=None, editor=None, entrez_date=None,
                  filter_citations=None, first_author_name=None,
                  full_author_name=None, full_investigator_name=None,
                  grant_number=None, investigator=None, isbn=None, issue=None,
                  journal=None, language=None, last_author=None,
                  location_id=None, mesh_date=None, mesh_major_topic=None,
                  mesh_subheadings=None, mesh_terms=None,
                  modification_date=None, nlm_unique_id=None, other_term=None,
                  owner=None, pagination=None, personal_name_as_subject=None,
                  pharmacological_action=None, place_of_publication=None,
                  pmid=None, publisher=None, publication_date=None,
                  publication_type=None, secondary_source_id=None,
                  subset=None, supplementary_concept=None, text_words=None,
                  title=None, title_abstract=None, transliterated_title=None,
                  uid=None, volume=None):
    ref_set = []
    result_set = []

    if not exact:
        # Build the ESearch query URL.
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
        if db is not None:
            base_url = base_url + "db=" + str(db) + "&"
        if retmode is not None:
            base_url = base_url + "retmode=" + str(retmode) + "&"
        if retmax is not None:
            base_url = base_url + "retmax=" + str(retmax) + "&"
        else:
            base_url = base_url + "retmax=" + str(100000) + "&"
        if sort is not None:
            base_url = base_url + "sort=" + str(sort) + "&"

        # Field clauses are appended below; "term=" is prepended at request
        # time, so the accumulator starts empty.
        term_url = ""
        if unlabeled_string is not None:
            term_url = eutils_param_process(unlabeled_string)

        # (value, PubMed field tag) pairs, in the order the query term is
        # assembled.  "all" additionally honours the `raw` flag; "owner" is
        # a sentinel for a field that is not functional.
        labeled_fields = [
            (affiliation, "ad"),
            (article_identifier, "aid"),
            (all_fields, "all"),
            (author, "au"),
            (author_identifier, "auid"),
            (book, "book"),
            (corporate_author, "cn"),
            (create_date, "crdt"),
            (completion_date, "dcom"),
            (conflict_of_interest, "cois"),
            (ec_rn_number, "rn"),
            (editor, "ed"),
            (entrez_date, "edat"),
            (filter_citations, "filter"),
            (first_author_name, "iau"),
            (full_author_name, "fau"),
            (full_investigator_name, "fir"),
            (grant_number, "gr"),
            (investigator, "ir"),
            (isbn, "isbn"),
            (issue, "ip"),
            (journal, "ta"),
            (language, "la"),
            (last_author, "lastau"),
            (location_id, "lid"),
            (mesh_date, "mhda"),
            (mesh_major_topic, "majr"),
            (mesh_subheadings, "sh"),
            (mesh_terms, "mh"),
            (modification_date, "lr"),
            (nlm_unique_id, "jid"),
            (other_term, "ot"),
            (owner, "owner"),
            (pagination, "pg"),
            (personal_name_as_subject, "ps"),
            (pharmacological_action, "pa"),
            (place_of_publication, "pl"),
            (pmid, "pmid"),
            (publisher, "pubn"),
            (publication_date, "dp"),
            (publication_type, "pt"),
            (secondary_source_id, "si"),
            (subset, "sb"),
            (supplementary_concept, "nm"),
            (text_words, "tw"),
            (title, "ti"),
            (title_abstract, "tiab"),
            (transliterated_title, "tt"),
            (uid, "pmid"),
            (volume, "vi"),
        ]
        for value, tag in labeled_fields:
            if value is None:
                continue
            if tag == "owner":
                print("NOT FUNCTIONAL.")
            elif tag == "all":
                term_url = term_url + eutils_param_process(value, tag, raw=raw)
            else:
                term_url = term_url + eutils_param_process(value, tag)

        # Trim a trailing "+" or "&" left over from term assembly.
        if term_url and term_url[-1] == "+":
            term_url = term_url[:-1]
        if term_url and term_url[-1] == "&":
            term_url = term_url[:-1]

        print(base_url + "term=" + term_url)
        r = requests.get(base_url + "term=" + term_url,
                         headers={"Content-Type": "application/json"})
        if not r.ok:
            r.raise_for_status()
        decoded = r.json()
        result_set.append(decoded)

        # When no retmax was given, page through the results 100000 at a time.
        if retmax is None:
            if "count" in decoded["esearchresult"]:
                count_set = int(decoded['esearchresult']['count']) - 100000
                while count_set > 0:
                    # Advance by the page size.
                    retstart = (int(decoded['esearchresult']['retstart'])
                                + 100000)
                    r = requests.get(
                        base_url + "term=" + term_url
                        + "&retstart=" + str(retstart),
                        headers={"Content-Type": "application/json"})
                    if not r.ok:
                        r.raise_for_status()
                    decoded = r.json()
                    result_set.append(decoded)
                    count_set = count_set - 100000
            else:
                print("An error occurred")
                return []

        # Fetch the full record for every PMID returned by the search and
        # wrap each one in a gnomics Reference object.
        ec = eutils.Client()
        ref_set = []
        id_set = []
        for dec_set in result_set:
            for result_pmid in dec_set["esearchresult"]["idlist"]:
                id_set.append(result_pmid)
        pmset = ec.efetch(db="pubmed", id=id_set)
        for pm in pmset:
            title = pm.title
            pmid = pm.pmid
            doi = pm.doi
            pmc = pm.pmc
            temp_ref = gnomics.objects.reference.Reference(
                identifier=pmid, identifier_type="PubMed ID",
                source="Entrez Programming Utilities", language=None,
                name=title)
            if doi is not None:
                gnomics.objects.reference.Reference.add_identifier(
                    temp_ref, identifier=doi, identifier_type="DOI",
                    source="Entrez Programming Utilities", language=None,
                    name=title)
            if pmc is not None:
                gnomics.objects.reference.Reference.add_identifier(
                    temp_ref, identifier=pmc, identifier_type="PMC ID",
                    source="Entrez Programming Utilities", language=None,
                    name=title)
            ref_set.append(temp_ref)

    else:
        # "Exact" mode scrapes the PubMed website for a single exact match.
        base_url = "https://www.ncbi.nlm.nih.gov/pubmed/?"
        ext_url = ""
        if unlabeled_string:
            ext_url = ext_url + unlabeled_string
        elif all_fields:
            ext_url = ext_url + all_fields
        ext_url = "term=" + str(
            urllib.parse.quote_plus('("' + ext_url.replace('"', "'")))
        r = requests.get(base_url + ext_url,
                         headers={"Content-Type": "application/json"})
        if "PMID:" in r.text:
            pmid_html = re.findall(r'<dt>PMID:</dt>.{1,}?[\d].{1,}?</dd>',
                                   r.text)
            pmid = re.findall(r'\d+', pmid_html[0])
            temp_ref = gnomics.objects.reference.Reference(
                identifier=pmid[0], identifier_type="PubMed ID",
                source="PubMed", language=None)
            ref_set.append(temp_ref)

    return ref_set
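# A minimal usage sketch (assumes the gnomics package and the
# eutils_param_process() helper are importable; the query values are
# illustrative):
#
#     refs = eutils_search(db="pubmed", retmax=20, title="CRISPR",
#                          publication_date="2015")
#     print(str(len(refs)) + " references retrieved")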
import csv
import time

import eutils
import lxml.etree
import pandas as pd
import requests


def eutils_from_df(input_df, chunksize, output_csv):
    """Retrieve PubMed article content via the E-Utilities API and save it to
    a CSV file, for the set of PMIDs contained in a pandas DataFrame.

    Args:
        input_df: DataFrame containing the PMIDs of interest
        chunksize: number of PMIDs to pass to the API per request
        output_csv: filename of the CSV file to which article content is saved

    Returns:
        DataFrame (also written to ``output_csv``) with one row of article
        content per PMID successfully retrieved, with columns corresponding
        to the fields fetched via the efetch client: 'PMID', 'Year', 'Title',
        'Abstract', 'Authors', 'Journal', 'Volume', 'Issue', 'Pages', 'DOI',
        'PMC'.
    """
    # Column headers for the output CSV.
    fieldnames = [
        'PMID', 'Year', 'Title', 'Abstract', 'Authors', 'Journal',
        'Volume', 'Issue', 'Pages', 'DOI', 'PMC',
    ]

    # Generator yielding successive chunks of rows from the input DataFrame.
    pm_chunks_gen = (input_df[i:i + chunksize]
                     for i in range(0, len(input_df), chunksize))

    # Collects one dictionary of article attributes per PMID.
    pm_article_list = []

    # Initialise the eutils client used to access the NCBI E-Utilities API.
    ec = eutils.Client()

    # Each PMID's downloaded data is appended to the CSV as a new row.
    with open(output_csv, 'a') as datafile:
        writer = csv.DictWriter(datafile, fieldnames=fieldnames)
        writer.writeheader()

        # Convert each chunk of PubMed IDs from the DataFrame to a list.
        for chunk_count, chunk in zip(range(0, len(input_df)), pm_chunks_gen):
            try:
                index_list = list(chunk.index.values)
                chunk_list = list(chunk['PMID'])
                print('Chunk No. ' + str(chunk_count))

                # Pass the chunk of PMIDs to the API; the result is an
                # iterator over the key data for each PMID.
                pm_article_set = iter(ec.efetch(db='pubmed', id=chunk_list))

                # Iterate over pm_article_set to access each PMID's data.
                for id_index, id_value in enumerate(chunk_list):
                    print(index_list[id_index], id_value)
                    try:
                        # Aggregate the key article attributes for this
                        # PubMed ID into a dictionary.
                        pm_article = next(pm_article_set)
                        pm_article_content = dict(
                            PMID=str(pm_article.pmid),
                            Year=str(pm_article.year),
                            Title=str(pm_article.title),
                            Abstract=str(pm_article.abstract),
                            Authors=str(pm_article.authors),
                            Journal=str(pm_article.jrnl),
                            Volume=str(pm_article.volume),
                            Issue=str(pm_article.issue),
                            Pages=str(pm_article.pages),
                            DOI=str(pm_article.doi),
                            PMC=str(pm_article.pmc),
                        )
                        print(pm_article_content)
                        print(pm_article.pmid
                              + ' - Download from Entrez complete')

                        # Save the dict for later DataFrame construction.
                        pm_article_list.append(pm_article_content)
                        print(pm_article.pmid + ' - Save to list complete')

                        # Write the dict to a new row of the CSV file.
                        writer.writerow(pm_article_content)
                        print(pm_article.pmid
                              + ' - Write data to CSV complete')

                    # Content errors: skip this PMID and carry on.
                    except (StopIteration, TypeError, NameError, ValueError,
                            lxml.etree.XMLSyntaxError,
                            eutils.exceptions.EutilsNCBIError) as e1:
                        print('Error: ' + str(e1))
                        continue

            # Network/connection errors: back off briefly, then continue.
            except (TimeoutError, RuntimeError, ConnectionError,
                    ConnectionResetError,
                    eutils.exceptions.EutilsRequestError,
                    requests.exceptions.ConnectionError) as e2:
                print('Error: ' + str(e2))
                time.sleep(10)
                continue
            except StopIteration:
                print('All downloads complete')
                break

    # Save the list of dictionaries to a DataFrame; the CSV file is closed
    # automatically when the `with` block exits.
    pm_article_df = pd.DataFrame.from_records(pm_article_list,
                                              columns=fieldnames)
    print('Save to DataFrame complete')
    return pm_article_df
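# A minimal usage sketch (the PMIDs and file name are illustrative):
#
#     pmid_df = pd.DataFrame({'PMID': ['15718680', '20210808', '25359968']})
#     article_df = eutils_from_df(pmid_df, chunksize=2,
#                                 output_csv='pubmed_articles.csv')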
import eutils


def client():
    return eutils.Client()
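# This helper reads like a test fixture; a sketch of how it might be used
# with pytest (the decorator and test function are assumptions, not part of
# the original snippet):
#
#     import pytest
#
#     @pytest.fixture
#     def client():
#         return eutils.Client()
#
#     def test_client_has_esearch(client):
#         assert hasattr(client, 'esearch')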
def __init__(self, *args, **kwargs):
    # Set up a shared eutils client for this loader instance.
    super(PubmedLoader, self).__init__(*args, **kwargs)
    self.client = eutils.Client()