def index():
    ix = open_dir("indexdir")
    writer = ix.writer()
    for pfile in pubmed_files:
        print "parsing", pfile
        txt = open(project + "/" + pfile, "r")
        records = Medline.parse(txt)
        for r in records:
            if "AB" not in r:
                continue
            authors = ""
            if "FAU" in r:
                authors += ",".join(r["FAU"])
            elif "AU" in r:
                authors += ",".join(r["AU"])
            else:
                authors = "Unknown"  # was a dead assignment to an unused 'firstAuthor'
            date = datetime.datetime.strptime(r["DA"], "%Y%m%d")
            title = r["TI"]
            pmid = r["PMID"].decode("utf-8")
            writer.add_document(title=title.decode("utf-8"),
                                path=pfile.decode("utf-8"),
                                abstract=r["AB"].decode("utf-8"),
                                authors=authors.decode("utf-8"),
                                pmid=pmid,
                                dateAdded=date)
    writer.commit()
    print "Index contains", ix.doc_count()
def top_papers(papers, outpath=None, delim="\t", top=20):
    """This function fetches all the relevant PubMed info for each PMID in
    'papers' and 1) puts it into a list and 2) outputs it to a file named
    in outpath.

    Can be used with SP & GOA data.
    """
    papers_annots = [(len(papers[p]), p) for p in papers]
    papers_annots2 = []
    papers_annots.sort()
    idlist = [p[1] for p in papers_annots[-top:]]
    Entrez.email = "*****@*****.**"
    h = Entrez.efetch(db="pubmed", id=",".join(idlist),
                      rettype="medline", retmode="text")
    medrecs = list(Medline.parse(h))
    titles = [medrec.get("TI", "?") for medrec in medrecs]
    years = [medrec.get("DP", "?") for medrec in medrecs]
    journals = [medrec.get("JT", "?") for medrec in medrecs]
    for p, title, year, journal in zip(papers_annots[-top:], titles, years, journals):
        papers_annots2.append((p[0], p[1], title, year.split()[0].strip(), journal))
    if outpath:
        fout = open(outpath, "w")
        print >> fout, "num proteins\tpubmed ID\tTitle\tYear\tJournal"
        for p in papers_annots2:
            print >> fout, "%d\t%s\t%s\t%s\t%s" % p
        fout.close()
    # papers_annots2 = [(# all annotations, PMID, Title, Year, Journal)]
    return papers_annots2
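# --- Usage sketch for top_papers (not from the original source) ---
# The function only assumes PMID keys mapped to collections of annotations
# (e.g. protein accessions); the mapping and output path below are hypothetical.
example_papers = {
    "16381885": ["P12345", "Q67890"],
    "19304878": ["P12345"],
}
ranked = top_papers(example_papers, outpath="top_papers.tsv", top=2)
# Each entry of 'ranked': (num annotations, PMID, Title, Year, Journal)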
def store_abstracts_for_query(query, query_tag, maxN=None, preview_only=False):
    # if query_tag=="":
    #     simpleQuery=" ".join(map(lambda x:x.name,queryTerms))
    # else:
    #     simpleQuery=query_tag
    # query=pg.build_query(queryTerms)
    print "will search", query
    Entrez.email = "*****@*****.**"
    search_results = Entrez.read(Entrez.esearch(db="pubmed", term=query,
                                                reldate=10 * 365, datetype="pdat",
                                                usehistory="y"))
    count = int(search_results["Count"])
    print "Found %i results" % count
    if maxN is not None and maxN < count:
        count = maxN
        print "Only keeping first", count, "abstracts"
    if preview_only:
        return
    sys.stdout.flush()
    batch_size = 50
    for start in range(0, count, batch_size):
        end = min(count, start + batch_size)
        print "Going to download record %i to %i" % (start + 1, end)
        sys.stdout.flush()
        fetch_handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text",
                                     retstart=start, retmax=batch_size,
                                     webenv=search_results["WebEnv"],
                                     query_key=search_results["QueryKey"])
        records = Medline.parse(fetch_handle)
        for r in records:
            pubmed_to_pg.store_medline_entry(r, query_tag)
def pubmed():
    # Get the count of papers about orchids in the pubmed database
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are
    handle = Entrez.egquery(term="orchid")
    record = Entrez.read(handle)
    for row in record["eGQueryResult"]:
        if row["DbName"] == "pubmed":
            print "The count of papers about orchid in database pubmed:", row["Count"]
    # Get the list of ids for the above
    handle = Entrez.esearch(db="pubmed", term="orchid", retmax=100)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    print "The id list of papers about orchid in database pubmed:", idlist
    print
    # Search for papers authored by "Liu ZJ"
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    search_author = "Liu ZJ"
    for record in records:
        if "AU" not in record:
            continue
        if search_author in record["AU"]:
            print "Author %s found." % search_author
            print "title:", record.get("TI", "?")
            print "authors:", record.get("AU", "?")
            print "source:", record.get("SO", "?")
            print
def getMeSH(url):
    query = urllib.unquote_plus(url)
    if not query:
        return "<h3> No query </h3>"
    MAX_COUNT = 10000
    Entrez.email = '*****@*****.**'
    pubmedquery = query.replace('-', '\-')
    h = Entrez.esearch(db='pubmed', term=pubmedquery, retmax=MAX_COUNT)
    result = Entrez.read(h)
    ids = result['IdList']
    if not ids:
        return "<h3> No results found </h3>"
    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    records = Medline.parse(h)
    MeSHCount = 0
    MeSHContent = ""
    for record in records:
        try:
            if "tox" in str(record.get("MH")):
                MeSHContent += "<h4><a href='http://www.ncbi.nlm.nih.gov/pubmed/" + str(record.get("PMID")) + "'>"
                MeSHContent += "PMID: " + str(record.get("PMID")) + "</a> is analysed for toxicity. </h4> \n"
        except TypeError:
            continue
    return MeSHContent
def createTable(query):
    if not query:
        return "<h3> No query </h3>"
    MAX_COUNT = 100
    pubmedquery = query.replace("-", "\-")
    Entrez.email = "*****@*****.**"
    h = Entrez.esearch(db="pubmed", term=pubmedquery, retmax=MAX_COUNT)
    result = Entrez.read(h)
    ids = result["IdList"]
    h = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
    records = Medline.parse(h)
    tableContent = ""
    for record in records:
        try:
            tableContent += (
                "<tr><td width='22%'>" + str(record.get("TI")) + "</td>"
                "<td width='5%'>" + str(record.get("DP")) + "</td>"
                "<td width='5%'>" + str(writers(record.get("FAU"))) + "</td>"
                "<td width='5%'>" + str(record.get("JT")) + "</td>"
                "<td width='5%'>" + str(query) + "</td>"
                "<td>"
                "<a href='http://www.ncbi.nlm.nih.gov/pubmed/" + str(record.get("PMID")) +
                "'><img src='PubMed.png' height='75px' width='75px' alt='PubMed' align='right'/></a>" +
                str(record.get("AB")) + "</td></tr>"
            )
        except TypeError:
            continue
    return tableContent
def fetch(t, s):
    h = Entrez.esearch(db="pubmed", term=t, retmax=10000, sort=s)
    idList = Entrez.read(h)["IdList"]
    results = "Total publications for SA Beatson: **" + str(len(idList)) + "**\n\n"
    results += "Chronologically sorted:\n\n"
    if idList:
        handle = Entrez.efetch(db="pubmed", id=idList, rettype="medline", retmode="text")
        records = Medline.parse(handle)
        max = len(idList) + 1  # note: shadows the built-in max()
        for record in records:
            title = record["TI"]
            author = ", ".join(record["AU"])
            source = record["SO"]
            pub_date = datetime.strptime(record["DA"], "%Y%m%d").date()  # note: unused below
            pmid = record["PMID"]
            cur_pub = "| **%i.** %s\n| %s\n| %s\n| http://www.ncbi.nlm.nih.gov/pubmed/%s\n|\n" % (
                max - 1, title, author, source, pmid)
            results = results + cur_pub
            max = max - 1
    return results
def top_papers_dict(papers, outpath=None, delim="\t", top=None):
    """This function fetches all the relevant PubMed info for each PMID in
    'papers' (up to the limit supplied in 'top') and 1) puts it into a dict.

    Can be used with SP & GOA data.
    """
    # papers_annots = [(len(papers_prots[p]), p) for p in papers_prots]
    papers_annots = [(len(papers[p]), p) for p in papers]
    papers_annots2_dict = {}
    papers_annots.sort()
    if top is None:
        negTop = 0
    else:
        negTop = -top
    idlist = [p[1] for p in papers_annots[negTop:]]
    Entrez.email = MY_EMAIL
    h = Entrez.efetch(db="pubmed", id=",".join(idlist), rettype="medline", retmode="text")
    medrecs = list(Medline.parse(h))
    titles = [medrec.get("TI", "?") for medrec in medrecs]
    years = [medrec.get("DP", "?") for medrec in medrecs]
    journals = [medrec.get("JT", "?") for medrec in medrecs]
    for p, title, year, journal in zip(papers_annots[negTop:], titles, years, journals):
        # papers_annots2_dict[PMID] = [# of total annotations, Title, Year, Journal]
        papers_annots2_dict[p[1]] = [len(papers[p[1]]), title, year.split()[0].strip(), journal]
    # if outpath:
    #     fout = open(outpath, "w")
    #     print >> fout, "num proteins\tpubmed ID\tTitle\tYear\tJournal"
    #     for p in papers_annots2:
    #         print >> fout, "%d\t%s\t%s\t%s\t%s" % p
    #     fout.close()
    return papers_annots2_dict
def test_medline_from_url(self):
    """Test Entrez into Medline.read from URL."""
    efetch = Entrez.efetch(db="pubmed", id="19304878", rettype="medline", retmode="text")
    record = Medline.read(efetch)
    self.assertTrue(isinstance(record, dict))
    self.assertEqual("19304878", record["PMID"])
    self.assertEqual("10.1093/bioinformatics/btp163 [doi]", record["LID"])
def retrive_record(row):
    name = row[1] + "[AUTH]"
    handle = Entrez.esearch(db="pubmed", term=name)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    for record in records:
        temp = []
        temp.append(row[0])
        temp.append(row[1])
        # title
        temp.append(record.get("TI", "?"))
        # authors
        temp.append(record.get("AU", "?"))
        # affiliation
        temp.append(record.get("AD", "?"))
        # publication date
        temp.append(record.get("DP", "?"))
        # pubmed id for url
        temp.append(record.get("PMID", "?"))
        return temp  # note: returns on the first record only
def test_pubmed_16381885_and_19850725(self):
    """Bio.TogoWS.entry("pubmed", "16381885,19850725")."""
    handle = TogoWS.entry("pubmed", "16381885,19850725")
    records = list(Medline.parse(handle))
    handle.close()
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0]["TI"],
                     "From genomics to chemical genomics: new developments in KEGG.")
    self.assertEqual(records[0]["AU"],
                     ["Kanehisa M", "Goto S", "Hattori M", "Aoki-Kinoshita KF",
                      "Itoh M", "Kawashima S", "Katayama T", "Araki M", "Hirakawa M"])
    self.assertEqual(records[1]["TI"],
                     "DDBJ launches a new archive database with analytical tools "
                     "for next-generation sequence data.")
    self.assertEqual(records[1]["AU"],
                     ["Kaminuma E", "Mashima J", "Kodama Y", "Gojobori T",
                      "Ogasawara O", "Okubo K", "Takagi T", "Nakamura Y"])
def createTable(query):
    if not query:
        return "<h3> No query </h3>"
    MAX_COUNT = 100
    Entrez.email = '*****@*****.**'
    pubmedquery = query.replace('-', '\-')
    h = Entrez.esearch(db='pubmed', term=pubmedquery, retmax=MAX_COUNT)
    result = Entrez.read(h)
    ids = result['IdList']
    if not ids:
        return "<h3> No results found </h3>"
    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    records = Medline.parse(h)
    tableContent = ""
    for record in records:
        try:
            tableContent += ("<tr><td width='22%'>" + str(record.get("TI")) + "</td>"
                             "<td width='5%'>" + str(record.get("DP")) + "</td>"
                             "<td width='5%'>" + str(writers(record.get("FAU"))) + "</td>"
                             "<td width='5%'>" + str(record.get("JT")) + "</td>"
                             "<td width='5%'>" + str(query) + "</td>"
                             "<td>"
                             "<a href='http://www.ncbi.nlm.nih.gov/pubmed/" + str(record.get("PMID")) +
                             "'><img src='PubMed.png' height='75px' width='75px' alt='PubMed' align='right'/></a>" +
                             str(record.get("AB")) + "</td></tr>")
        except TypeError:
            continue
    return tableContent
def fetch_pubmed_data(pmid):
    from Bio import Medline, Entrez
    try:
        # the original retried the exact same attribute lookup in a nested
        # try/except; collapsed here since the behaviour is identical
        ncbiemail = settings.author_email
    except:
        raise Exception('Please set an email to use ncbi services')
    Entrez.email = ncbiemail
    Entrez.tool = 'mybiodb'
    try:
        entrez_response = Medline.parse(
            Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")).next()
        if 'PMID' not in entrez_response:
            response.flash = 'pubmed ID error'
        else:
            return entrez_response
    except IOError:
        session.flash = 'Remote service not available, please try again.'
        return
def fetch(self, batchSize=100):
    """Return a batch of results."""
    if self._done:
        return []
    end = min(self._searchCount, self._searchPosition + batchSize)
    log.info("Downloading from %i to %i..." % (self._searchPosition + 1, end))
    fetchHandle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text",
                                retstart=self._searchPosition, retmax=batchSize,
                                webenv=self._searchSession, query_key=self._queryKey)
    result = Medline.parse(fetchHandle)
    papers = [paper.Paper(r) for r in result if r.get("PMID") is not None]
    fetchHandle.close()
    log.info("... downloading done")
    self._searchPosition = self._searchPosition + batchSize
    if self._searchPosition >= self._searchCount:
        self._done = True
        log.info("Search ended.")
    return papers
def search_pubmed(term):
    """Searches a term on PubMed."""
    print("Searching for", term)
    try:
        # Do a first query to count the results
        handle = Entrez.egquery(term=term)
        record = Entrez.read(handle)
        nb_entries = 0
        for row in record["eGQueryResult"]:
            if row["DbName"] == "pubmed":
                nb_entries = row["Count"]
                print(row["Count"], 'results found.')
        if int(nb_entries) == 0:
            return BibDatabase()
        # Search the IDs
        handle = Entrez.esearch(db="pubmed", term=term,
                                retmax=min(int(nb_entries), MAX_RESULTS))
        record = Entrez.read(handle)
        idlist = record["IdList"]
        # Get the descriptions
        handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
        records = Medline.parse(handle)
        records = list(records)
        return transform_pubmed(records)
    except Exception as e:
        print('The search failed.')
        print(e)
        return []
def test_read(self):
    with open("Medline/pubmed_result1.txt") as handle:
        record = Medline.read(handle)
    self.assertEqual(record["PMID"], "12230038")
    self.assertEqual(record["OWN"], "NLM")
    self.assertEqual(record["STAT"], "MEDLINE")
    self.assertEqual(record["DA"], "20020916")
    self.assertEqual(record["DCOM"], "20030606")
    self.assertEqual(record["LR"], "20041117")
    self.assertEqual(record["PUBM"], "Print")
    self.assertEqual(record["IS"], "1467-5463 (Print)")
    self.assertEqual(record["VI"], "3")
    self.assertEqual(record["IP"], "3")
    self.assertEqual(record["DP"], "2002 Sep")
    self.assertEqual(record["TI"], "The Bio* toolkits--a brief overview.")
    self.assertEqual(record["PG"], "296-302")
    self.assertEqual(record["AB"], "Bioinformatics research is often difficult to do with commercial software. The Open Source BioPerl, BioPython and Biojava projects provide toolkits with multiple functionality that make it easier to create customised pipelines or analysis. This review briefly compares the quirks of the underlying languages and the functionality, documentation, utility and relative advantages of the Bio counterparts, particularly from the point of view of the beginning biologist programmer.")
    self.assertEqual(record["AD"], "tacg Informatics, Irvine, CA 92612, USA. [email protected]")
    self.assertEqual(record["FAU"], ["Mangalam, Harry"])
    self.assertEqual(record["AU"], ["Mangalam H"])
    self.assertEqual(record["LA"], ["eng"])
    self.assertEqual(record["PT"], ["Journal Article"])
    self.assertEqual(record["PL"], "England")
    self.assertEqual(record["TA"], "Brief Bioinform")
    self.assertEqual(record["JT"], "Briefings in bioinformatics")
    self.assertEqual(record["JID"], "100912837")
    self.assertEqual(record["SB"], "IM")
    self.assertEqual(record["MH"], ["*Computational Biology", "Computer Systems", "Humans",
                                    "Internet", "*Programming Languages", "*Software",
                                    "User-Computer Interface"])
    self.assertEqual(record["EDAT"], "2002/09/17 10:00")
    self.assertEqual(record["MHDA"], "2003/06/07 05:00")
    self.assertEqual(record["PST"], "ppublish")
    self.assertEqual(record["SO"], "Brief Bioinform. 2002 Sep;3(3):296-302.")
def get_wikiref(pmid):
    """Returns the Wiki cite journal entry for a given Pubmed ID."""
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    records = list(records)
    import datetime
    now = datetime.datetime.now()
    jetzt = now.strftime("%Y-%m-%d")
    # generate the {{cite journal}} format
    for rec in records:
        aut = rec["AU"]
        firstauthor = aut.pop(0)
        coauthors = ", ".join(aut)
        # get date of publication (CRDT)
        datee = rec["CRDT"][0].split('/')[0]
        # if datee == '':
        #     datee = rec["PD"]
        outstring = "{{cite journal|title=%s|journal=%s|year=%s|author=%s|coauthors=%s|volume=%s|pages=%s|id=PMID %s|accessdate=%s}}" % \
            (rec["TI"], rec["JT"], datee, firstauthor, coauthors, rec["VI"], rec["PG"], pmid, jetzt)
    # example:
    # {{cite journal|title=|journal=|date=2008/07/31/|first=Cyril|last=Herry|coauthors=i|volume=454|issue=7204|pages=600-606|id=PMID 18615015 {{doi|10.1038/nature07166}}|url=http://www.fmi.ch/downloads/news/2008.07.11.01.luthi.nature.press.release.pdf|format=|accessdate=2009-09-12 }}
    return outstring
def main(Substance, Organism, Gene):
    zoekterm1 = "Cocaine"
    zoekterm2 = "Elegans"
    MAX_COUNT = 50
    dic = {}
    titels = []
    TERM = ''
    TERMS = []
    count = 1
    if zoekterm2 == "":
        TERM = zoekterm1
        if zoekterm1 == "":
            print("Please enter a search term")
            sys.exit()
    elif zoekterm2 != "":
        TERM = zoekterm1 + " and " + zoekterm2
    TERMS.append(TERM)
    print(TERM)
    handle = Entrez.esearch(db="pubmed", term=TERM, retmax=MAX_COUNT)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    records = list(records)
    for record in records:
        titel = record.get("PMID", "?")
        titels.append(titel)
    pubSet = set(titels)
    dic[TERM] = pubSet
    print(dic)
    return "Jay"
def __init__(self, pmids):
    Entrez.email = '*****@*****.**'
    # pmids is a list (array of pmid)
    handle = Entrez.efetch(db='pubmed', id=pmids, rettype='medline', retmode='text')
    self.records = Medline.parse(handle)
def getMedlineList(pmids):
    """
    This function takes a list of article ids and returns a list of MedLine
    articles that contain an abstract.
    """
    records = []
    cleaned_records = []
    listLength = len(pmids)
    Entrez.email = '*****@*****.**'
    for i in range(0, listLength, 650):
        tempList = pmids[i:i + 650]
        handle = Entrez.efetch(db='pubmed', id=tempList, rettype='medline', retmode='text')
        try:
            records.extend(list(Medline.parse(handle)))
        except:
            IOmodule.writeOutTxt(_mainFolder + '/' + 'errordir_medline_records', pmids[i], '')
        print 'Downloaded', len(records), 'MedLine articles.', str(listLength - len(records)), 'remaining...'
    for article in records:
        if 'AB' in article:
            cleaned_records.append(article)
    print 'Returned', len(cleaned_records), 'MedLine articles containing an abstract.'
    return cleaned_records
def get_pubmed_document(pubmed_ids, source='pubmed', include_pubtator=True):
    Entrez.email = settings.ENTREZ_EMAIL
    if type(pubmed_ids) == list:
        ids = [str(doc_id) for doc_id in pubmed_ids]
    else:
        ids = [str(pubmed_ids)]
    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    records = Medline.parse(h)
    # Reference for the field abbreviations: http://www.nlm.nih.gov/bsd/mms/medlineelements.html
    for record in records:
        if record.get('TI') and record.get('AB') and record.get('PMID') and record.get('CRDT'):
            # if Document.objects.pubmed_count(record.get('PMID')) is 0:
            title = ' '.join(pad_split(record.get('TI')))
            abstract = ' '.join(pad_split(record.get('AB')))
            doc, doc_c = Document.objects.get_or_create(document_id=record.get('PMID'))
            doc.title = title
            doc.source = source
            doc.save()
            sec, sec_c = Section.objects.get_or_create(kind='t', document=doc)
            sec.text = title
            sec.save()
            sec, sec_c = Section.objects.get_or_create(kind='a', document=doc)
            sec.text = abstract
            sec.save()
            if include_pubtator:
                doc.init_pubtator()
def pubsearch(jids):
    Entrez.email = "*****@*****.**"  # always let Entrez know who is calling
    # Build an OR query over the journal IDs. The original concatenation left
    # a dangling " or " and an unbalanced parenthesis; fixed here.
    pubterm = " OR ".join(i + "[JID]" for i in jids)
    IDhandle = Entrez.esearch(db="pubmed", term="peptide AND (" + pubterm + ")",
                              mindate="2011", maxdate="2014", retmax=2500)
    # For documentation on esearch, see
    # http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    # The max value for retmax is 100k. Use retstart to get more than this.
    # Date ranges limit a search result by the date specified by datetype.
    # The two parameters (mindate, maxdate) must be used together to specify
    # an arbitrary date range. The general date format is YYYY/MM/DD, and the
    # variants YYYY and YYYY/MM are also allowed.
    record = Entrez.read(IDhandle)
    # record is returned as a dictionary: lists search terms, all ID numbers, etc.
    idlist = record["IdList"]  # the list of ID numbers from the record dictionary
    recordHandle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    # search pubmed for records with idlist as input
    records = Medline.parse(recordHandle)  # create records from recordHandle
    return records
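# --- Paging sketch (not from the original source) ---
# The comments above note that retmax is capped and that retstart pages
# beyond it. A minimal illustration of that pattern; the helper name and
# batch size are assumptions, not part of any snippet here.
from Bio import Entrez


def fetch_all_ids(term, batch=1000):
    """Collect every esearch ID for 'term' by paging with retstart."""
    Entrez.email = "*****@*****.**"  # placeholder, as elsewhere in these snippets
    ids, start = [], 0
    while True:
        handle = Entrez.esearch(db="pubmed", term=term, retmax=batch, retstart=start)
        page = Entrez.read(handle)["IdList"]
        if not page:
            break
        ids.extend(page)
        start += batch
    return ids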
def processInput(k):
    print "Querying PMID: " + str(k) + "."
    getall = Medline.read(Entrez.efetch(db="pubmed", id=k, rettype="medline", retmode="text"))
    singlemesh = getall.get("MH")
    singledate = getall.get("EDAT")
    if singlemesh is None:  # guard added: records without MeSH terms would crash len()
        return
    for j1 in range(len(singlemesh)):
        cur.execute("INSERT INTO MeSH002(PMID, MeSH, Dates) VALUES(" + str(k) + ",'"
                    + getall.get("MH")[j1][0:24].translate(None, "'*&") + "','"
                    + str(singledate[0:10]) + "')")
def get_pubmed_data(idlist):
    """Takes a list of pubmed ids and returns (title, authors, PMID) tuples."""
    handle = Entrez.efetch(db='pubmed', id=idlist, rettype='medline', retmode='text')
    records = Medline.parse(handle)
    mypms = []
    for record in records:
        mypms.append((record["TI"], record["AU"], record["PMID"]))
    return mypms
def Pubmedsearch(PMID):
    pmid = PMID
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    records = list(records)
    for record in records:
        return (str(pmid) + "\t" + str(record.get("TI", "?")) + "\t" + str(record.get("FAU", "?"))
                + "\t" + str(record.get("AU", "?")) + "\t" + str(record.get("AD", "?")))
def pubmedsearch(TERM, MAX_COUNT=10000):
    """Returns Medline records matching *TERM*."""
    Entrez.email = '*****@*****.**'
    h = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
    result = Entrez.read(h)
    ids = result['IdList']
    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    records = Medline.parse(h)
    return records
def get_articles(term):
    idlist = get_article_ids(term)
    counter = 0
    # pbar = make_pbar(len(idlist), text="Fetching")
    articles = []
    if len(idlist) > 100:
        chunks = [idlist[i:i + 100] for i in range(0, len(idlist), 100)]
        for chunk in chunks:
            handle = Entrez.efetch(db="pubmed", id=chunk, rettype="medline", retmode="text")
            articles.extend(list(Medline.parse(handle)))
            print '#'
            # pbar.update(p.currval + len(chunk))
    else:
        handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
        articles.extend(list(Medline.parse(handle)))
    # pbar.finish()
    return articles
def get_record_from_pmid(pmid):
    # now get the actual citation; should really only be a singleton,
    # but this library likes to operate over lists
    citations = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
    # again, Bio likes to operate on lists, even though we only have
    # a singleton here
    record = list(Medline.parse(citations))[0]
    return record
def pubmedsearch(TERM, MAX_COUNT=10000):
    """Returns Medline records matching *TERM*."""
    Entrez.email = '*****@*****.**'
    Entrez.tool = 'pm_impacts'
    h = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
    result = Entrez.read(h)
    ids = result['IdList']
    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    records = Medline.parse(h)
    return records
def store_abstract_with_pmid(pmid, queryTag=None):
    """Populate the PG databases with the MEDLINE entries having these pmid.

    pmid can be a scalar or a list of pmid.
    """
    if queryTag is None:
        queryTag = "PMID"
    Entrez.email = "*****@*****.**"
    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=pmid)
    for r in Medline.parse(handle):
        store_medline_entry(r, queryTag)
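# --- Usage sketch (not from the original source) ---
# Per the docstring, 'pmid' may be a scalar or a list; both call forms are
# shown below. The PMIDs are real examples from this collection, the
# queryTag value is made up.
store_abstract_with_pmid("16381885")
store_abstract_with_pmid(["16381885", "19304878"], queryTag="kegg")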
def collectPubmedInfo(email, term, record_dict, retmax, outputPath):
    Entrez.email = email
    # Use ESearch
    handle = Entrez.esearch(db="pubmed", term=term, retmax=retmax)
    record = Entrez.read(handle)
    count = record["Count"]
    idlist = record["IdList"]
    # Use pandas to save a formatted file
    df_1 = pd.DataFrame({"PubmedIDs(%s)" % (count): idlist})
    df_1.to_csv(outputPath + "PubmedIDs_1.txt", sep="\t")
    # Use EFetch to collect each id and use Medline to get the details
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    for record in records:
        for key in record_dict.keys():
            record_dict[key].append(record[key])
    # Use pandas to save a formatted file
    df_2 = pd.DataFrame(record_dict)
    df_2.to_csv(outputPath + "Results_1.txt", sep="\t")
def fetch_from_entrez(index, cache_dir=False):
    logger = logging.getLogger('build')
    # slugify the index for the cache filename (some indices have symbols
    # not allowed in file names, e.g. /)
    index_slug = slugify(index)
    cache_file_path = '{}/{}'.format('/'.join(cache_dir), index_slug)
    # try fetching from cache
    if cache_dir:
        d = fetch_from_cache(cache_dir, index_slug)
        if d:
            logger.info('Fetched {} from cache'.format(cache_file_path))
            return d
    # if nothing is found in the cache, use the web API
    logger.info('Fetching {} from Entrez'.format(index))
    tries = 0
    max_tries = 5
    while tries < max_tries:
        if tries > 0:
            logger.warning('Failed fetching pubmed {}, retrying'.format(str(index)))
        try:
            Entrez.email = '*****@*****.**'
            handle = Entrez.efetch(db="pubmed", id=str(index), rettype="medline", retmode="text")
        except:
            tries += 1
            time.sleep(2)
        else:
            d = Medline.read(handle)
            # save to cache
            save_to_cache(cache_dir, index_slug, d)
            logger.info('Saved entry for {} in cache'.format(cache_file_path))
            return d
def getMedlineAbstracts(idList):
    fields = {"TI": "title", "AU": "authors", "JT": "journal", "DP": "date",
              "MH": "keywords", "AB": "abstract", "PMID": "PMID"}
    pubmedUrl = "https://www.ncbi.nlm.nih.gov/pubmed/"
    abstracts = pd.DataFrame()
    try:
        # retmode must be "text" for Medline.parse (the original passed "json")
        handle = Entrez.efetch(db="pubmed", id=idList, rettype="medline", retmode="text")
        records = Medline.parse(handle)
        results = []
        for record in records:
            aux = {}
            for field in fields:
                if field in record:
                    aux[fields[field]] = record[field]
            if "PMID" in aux:
                aux["url"] = pubmedUrl + aux["PMID"]
            else:
                aux["url"] = ""
            results.append(aux)
        abstracts = pd.DataFrame.from_dict(results)
    except error.URLError as e:
        print("URLError: Request to Bio.Entrez failed. Error: {}".format(e))
    except error.HTTPError as e:
        print("HTTPError: Request to Bio.Entrez failed. Error: {}".format(e))
    except Exception as e:
        print("Request to Bio.Entrez failed. Error: {}".format(e))
    return abstracts
def download_abstracts(dataset, path='.', email=None, out_file=None):
    """Download the abstracts for a dataset/list of pmids."""
    try:
        from Bio import Entrez, Medline
    except ImportError:
        raise Exception('Module biopython is required for downloading abstracts from PubMed.')
    if email is None:
        raise Exception('No email address provided.')
    Entrez.email = email
    if isinstance(dataset, Dataset):
        pmids = dataset.image_table.ids.astype(str).tolist()
    elif isinstance(dataset, list):
        pmids = [str(pmid) for pmid in dataset]
    else:
        raise Exception('Dataset type not recognized: {0}'.format(type(dataset)))
    records = []
    # PubMed only allows you to search ~1000 at a time. I chose 900 to be safe.
    chunks = [pmids[x:x + 900] for x in range(0, len(pmids), 900)]
    for chunk in chunks:
        h = Entrez.efetch(db='pubmed', id=chunk, rettype='medline', retmode='text')
        records += list(Medline.parse(h))
    # Pull data for studies with abstracts
    data = [[study['PMID'], study['AB']] for study in records if study.get('AB', None)]
    df = pd.DataFrame(columns=['pmid', 'abstract'], data=data)
    if out_file is not None:
        df.to_csv(os.path.join(os.path.abspath(path), out_file), index=False)
    return df
def add_paper(pmid, created_by="OTTO", method_obtained="Curator triage"):
    """Adds paper to referencedbentity table.

    Parameters
    ----------
    pmid: int
    created_by: str, optional
    method_obtained: str, optional

    Returns
    -------
    object
        reference object
    """
    record = Medline.read(Entrez.efetch(db="pubmed", id=str(pmid), rettype='medline'))
    rec_keys = list(record.keys())
    if 'PMID' not in rec_keys:
        raise ValueError('Unable to fetch record from pubmed. Make sure it is a valid PMID.')
    print(record)
    ncbi = DBSession.query(Source).filter_by(format_name='NCBI').one_or_none()
    source_id = ncbi.source_id
    # insert into DBENTITY/REFERENCEDBENTITY/REFERENCEDOCUMENT
    [reference_id, authors, doi_url, pmc_url, sgdid, reference] = insert_referencedbentity(
        pmid, source_id, record, created_by, method_obtained)
    insert_authors(reference_id, authors, source_id, created_by)
    insert_pubtypes(pmid, reference_id, record.get('PT', []), source_id, created_by)
    insert_urls(pmid, reference_id, doi_url, pmc_url, source_id, created_by)
    # removed to support changes in http://redmine.stanford.edu/issues/4758
    # insert_relations(pmid, reference_id, record, created_by)
    return reference
def read_data(self):
    path = self.path
    final_list = []
    with open(path) as handle:
        records = Medline.parse(handle)
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        for record in records:
            new_dict = {}
            check = self.check_preprocess_condition(record)
            if not check:
                continue
            # abstract = record.get('AB', False)
            # abstractWords = nltk.tokenize.word_tokenize(abstract)
            # sw = stopwords.words('english')
            # char_to_remove = [',', '.', '!', '?', ':']
            # for word in sw:
            #     if word in abstractWords:
            #         abstractWords.remove(word)
            # final_ab = ' '.join(list(abstractWords))
            # # remove punctuations
            # puncString = ".,:?!()0123456789"
            # final_ab = "".join(l for l in final_ab if l not in puncString)
            # final_ab = final_ab.lower()
            # for rmc in puncString:
            #     final_ab = final_ab.replace(rmc, '')
            new_dict['PMID'] = record.get('PMID', '')
            new_dict['TI'] = record.get('TI', '')
            new_dict['OT'] = record.get('OT', [])
            new_dict['AB'] = record.get('AB', '')
            new_dict['tokens'] = record.get('tokens', '')
            final_list.append(new_dict)
    print "clean abstract count>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", len(final_list)
    return final_list
def abs_search(gene_dict, pattern_dict, abstract_file, out_queue):
    try:
        result_dict = {}
        ABS_OPEN = open(abstract_file, 'r')
        all_abstracts = Medline.parse(ABS_OPEN)
        for abstract in all_abstracts:
            if 'AB' in abstract:
                abstract_text = re.sub(r'\r\n|\n|\s+|;', ' ', abstract['AB'])
                abstract_pmid = 'Unknown'
                abstract_journ = 'Unknown'
                if 'PMID' in abstract:
                    abstract_pmid = abstract['PMID']
                if 'SO' in abstract:
                    abstract_journ = abstract['SO']
                for key in gene_dict.keys():
                    for gene in gene_dict[key]:
                        # note: the second argument of a compiled pattern's
                        # search()/finditer() is a start position, not a flags
                        # value; passing re.MULTILINE here skips the first 8
                        # characters (kept as in the original)
                        match = gene.search(abstract_text, re.MULTILINE)
                        if match:
                            if key not in result_dict:
                                result_dict[key] = []
                            result_dict[key].append([
                                abstract_pmid, match.group(0),
                                abstract_text[match.start(0) - (match.start(0) if match.start(0) < 30 else 30):
                                              match.end(0) + (match.end(0) if match.end(0) < 30 else 30)]])
                            result = dict.fromkeys(pattern_dict.keys())
                            for pattern in sorted(pattern_dict.keys()):
                                result[pattern] = []
                                for match in pattern_dict[pattern].finditer(abstract_text, re.MULTILINE):
                                    match = str(match.group(0))
                                    if match not in result[pattern]:
                                        result[pattern].append(match)
                                result_dict[key][-1].append(', '.join(result[pattern]))
                            result_dict[key][-1].append(abstract_journ)
        ABS_OPEN.close()
        out_queue.put(result_dict)
    except:
        print("One of the processes got an exception and was killed")
def getAbstracts(ID):
    dictionary = {}
    dictionary_textmining = {}
    abstracts = []
    keys = []
    auteur = []
    datum = []
    titel = []
    handle = Entrez.efetch(db="pubmed", id=ID, rettype='Medline', retmode='text')
    records = Medline.parse(handle)
    for record in records:
        PMID = record.get('PMID')
        auteurs = record.get('AU')
        if record.get('AB') is not None:
            abstract = record.get('AB')
        else:
            abstract = "-"
        date = record.get('DP')
        title = record.get('TI')
        if record.get('OT') is None:
            keywords = "-"
        else:
            keywords = record.get('OT')
        auteur.append(auteurs)
        abstracts.append(abstract)
        datum.append(date)
        titel.append(title)
        keys.append(keywords)
        dictionary[PMID] = [title, abstract, keywords, auteurs, date]
        dictionary_textmining[PMID] = [title, abstract, keywords]
    return keys, abstracts, auteur, datum, titel, dictionary, dictionary_textmining
def pmid2abstract_info(self, pmid_list):
    # make sure that pmids are strings
    pmid_list = [str(i) for i in pmid_list]
    try:
        handle = Entrez.efetch(db="pubmed", id=','.join(pmid_list), rettype="medline", retmode="text")
        records = Medline.parse(handle)
    except:
        print("FAIL:", pmid_list)
        return None
    pmid2data = {}
    for record in records:
        try:
            pmid = record["PMID"]
        except:
            print(record)
            # {'id:': ['696885 Error occurred: PMID 28696885 is a duplicate of PMID 17633143']}
            if 'duplicate' in record['id:']:
                duplicate = record['id:'].split(' ')[0]
                correct = record['id:'].split(' ')[-1]
                print("removing duplicated PMID... %s --> %s" % (duplicate, correct))
                # remove the duplicate from the list and retry
                pmid_list.remove(duplicate)
                return self.pmid2abstract_info(pmid_list)
        pmid2data[pmid] = {}
        pmid2data[pmid]["title"] = record.get("TI", "?")
        pmid2data[pmid]["authors"] = record.get("AU", "?")
        pmid2data[pmid]["source"] = record.get("SO", "?")
        pmid2data[pmid]["abstract"] = record.get("AB", "?")
        pmid2data[pmid]["journal"] = record.get("TA", "?")
        pmid2data[pmid]["year"] = record.get("DP", "?")
        pmid2data[pmid]["pmid"] = pmid
    return pmid2data
def get_paper(pmids):
    """Return the article title and author list for each paper.

    :param pmids: PubMed ids of papers
    :type pmids: list
    :rtype: str
    """
    papers = []
    handle = Entrez.efetch(db="pubmed", id=[str(pmid) for pmid in pmids],
                           rettype="medline", retmode="text")
    records = Medline.parse(handle)
    for pm_record in records:
        authors = pm_record.get("AU", "?")
        if len(authors) > 2:
            authors = '%s, %s et al.' % (authors[0], authors[1])
        papers.append('%s, %s, %s' % (pm_record.get("TI", "?"), authors,
                                      pm_record.get("SO", "?")))
    return '\n'.join(papers)
def test_pubmed_16381885(self):
    """Bio.TogoWS.entry("pubmed", "16381885")."""
    # Gives Medline plain text
    handle = TogoWS.entry("pubmed", "16381885")
    data = Medline.read(handle)
    handle.close()
    self.assertEqual(data["TI"],
                     "From genomics to chemical genomics: new developments in KEGG.")
    self.assertEqual(data["AU"],
                     ["Kanehisa M", "Goto S", "Hattori M", "Aoki-Kinoshita KF",
                      "Itoh M", "Kawashima S", "Katayama T", "Araki M", "Hirakawa M"])
def getCancerData(searchTerm, filename, email):
    f = open(filename, "w")
    Entrez.email = email  # Always tell NCBI who you are
    handle = Entrez.egquery(term=searchTerm)
    record = Entrez.read(handle)
    for row in record["eGQueryResult"]:
        if row["DbName"] == "pubmed":
            print(row["Count"])  # prints number of articles
            retmax = row["Count"]
    retmax = 300000  # note: overrides the count retrieved above
    handle = Entrez.esearch(db="pubmed", term=searchTerm, retmax=retmax)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    records = list(records)  # all pmids are in this list
    count = 0  # was referenced before assignment in the original
    for record in records:
        s = ", "
        authors = s.join(record.get("AU", "?"))
        count = count + 1
        # writes the title, author, source and abstract to a file
        # (newlines added; the original wrote everything on one line)
        f.write("PMID: " + record.get("PMID", "?") + "\n")
        f.write("Title: " + record.get("TI", "?") + "\n")
        f.write("Authors: " + authors + "\n")
        f.write("Source: " + record.get("SO", "?") + "\n")
        f.write("Abstract: " + record.get("AB", "?") + "\n")
    handle.close()
    f.close()
def parser(inputFile):
    print("Creating pubmedDB database ...")
    # Change the current directory to where the code is saved, since inputFile
    # is a relative path:
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    with open(inputFile) as handle:
        # Each record corresponds to one article
        records = Medline.parse(handle)
        count = 0
        # Fetch the desired info from each record
        for record in records:
            abstractVal = record.get('AB')
            titleVal = record.get('TI')
            keywordsVal = record.get('OT')
            meshVal = record.get('MH')
            count += 1
            # Insert the record into pubmedDB
            mycol.insert_one({
                "title": titleVal,
                "abstract": abstractVal,
                "keywords": [keywordsVal],
                "meshterms": [meshVal],
            })
    print("Inserted {} records into pubmedDB".format(count))
def fetch_publication_list(citations, rettype='medline'):
    """Fetch publications.

    :param rettype:
    :param citations:
    :return:
    """
    sys.stdout.write("=====================================")
    print(f"Fetching {len(citations)} publications. rettype: {rettype}.")
    citation_string = ','.join(citations)
    Entrez.email = '*****@*****.**'
    retries = 5
    failed = True
    for i in range(retries):
        try:
            h = Entrez.efetch(db='pubmed', id=citation_string, rettype=rettype, retmode='text')
            failed = False
        except HTTPError:
            pass
        else:
            break
        finally:
            # we are not allowed to hit NCBI more than 3 times per second
            time.sleep(0.4)
    if failed:
        print("Failed to retrieve data from PubMed")
        records = []
    else:
        if rettype == 'medline':
            records = Medline.parse(h)
        else:
            records = Entrez.parse(h)
    return records
def entrezQuery(idList, outFile):
    writeFile = open(outFile, 'w')
    handle = Entrez.efetch(db="pubmed", id=idList, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    for record in records:
        # either return the record entry or an empty string
        pmid = record.get('PMID', '')
        title = record.get('TI', '')
        authors = record.get('FAU', '')
        authors = modifyAuthors(authors)  # format authors
        journal = record.get('JT', '')
        date = record.get('DP', '')
        date = date[0:4]  # year only
        abstract = record.get('AB', '')
        # need ascii encapsulation for uniformity with the database
        writeFile.write(ascii(pmid) + '\t' + ascii(title) + '\t' + ascii(authors) + '\t' +
                        ascii(journal) + '\t' + ascii(date) + '\t' + ascii(abstract) + '\n')
def _retrieve_record_batch(self, batch_start, batch_size):
    """Retrieves a PubMed article record batch.

    Retrieval is based on the info recovered by '_search_for_records()'.
    The batch size is limited by the 'batch_start' and 'batch_size'
    parameters. Returns a string containing the article info, if execution
    was successful and returns None otherwise.

    Args:
        batch_start (int): Specifies the starting index of this record batch.
        batch_size (int): Specifies the size of this records batch.

    Returns:
        list: A list of dictionaries that hold the data for each record.
    """
    if None in [self.search_record_web_env, self.search_record_query_key]:
        raise ValueError(  # Perform a search first!
            'No WebEnv or QueryKey data in this PubMed class instance.')
    fetch_handle = Entrez.efetch(db='pubmed', rettype='medline', retmode='text',
                                 retstart=batch_start, retmax=batch_size,
                                 webenv=self.search_record_web_env,
                                 query_key=self.search_record_query_key)
    data = Medline.parse(fetch_handle)
    records = [record for record in data]
    fetch_handle.close()
    return records
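# --- Companion sketch (not from the original source) ---
# '_retrieve_record_batch' assumes a prior '_search_for_records()' call
# stored the history-server keys; that method is not shown in this
# collection. A guess at its minimal shape: only the attribute names
# 'search_record_web_env' and 'search_record_query_key' come from the code
# above; the 'term' parameter and 'search_record_count' are assumptions.
def _search_for_records(self, term):
    """Run an esearch with usehistory so efetch can later page in batches."""
    handle = Entrez.esearch(db='pubmed', term=term, usehistory='y')
    result = Entrez.read(handle)
    handle.close()
    self.search_record_web_env = result['WebEnv']
    self.search_record_query_key = result['QueryKey']
    self.search_record_count = int(result['Count'])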
def get_pubmed_list(geneName):
    Entrez.email = "*****@*****.**"
    # note: the original query had unbalanced brackets ("[All Fields)"), fixed here
    queryTerm = geneName + "[All Fields] AND (\"human\"[All Fields]) AND (\"gene\"[All Fields]) "
    handle = Entrez.esearch(db="pubmed", term=queryTerm, retmax=1000)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    logging.info(geneName + ":\t" + record["Count"])
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    records = list(records)
    return_value = {'pubmed_ids': []}
    for record in records:
        return_value['pubmed_ids'].append(record.get("PMID", ""))
        # print record.get("PMID", "")
    handle.close()
    return return_value
def buildlist():
    # pull PMCIDs from PMIDs
    from Bio import Medline
    pmcidh = []
    pmidh2 = []
    handle = open("PMChand_medline.txt")
    records = Medline.parse(handle)
    for rec2 in records:
        try:
            pmci = rec2['PMC']
            pmcidh.append(pmci)
            pmidh2.append(rec2["PMID"])
        except:
            continue
    handle.close()
    # Query PubTator
    import requests
    pmcid_fail = []
    urlxml = [""] * 52
    for i in range(52):
        url = ("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/"
               "export/biocxml?pmcids=" + pmcidh[i])
        try:
            response = requests.get(url, timeout=3)
            if response.status_code == 200:
                urlxml[i] = response.text
        except:
            print(pmcidh[i])
            pmcid_fail.append(pmcidh[i])
            continue
    global pmcid2pmid
    pmcid2pmid = dict(zip(pmcidh, pmidh2))
    return pmcid_fail, urlxml, pmcid2pmid, pmidh2
def pub_med_parser(drug, side_effect):
    drug_eng = Drugs.drugs(drug)
    side_effect = Sideeffect.sideEffect(side_effect)
    Entrez.email = "*****@*****.**"
    terms = "((" + drug_eng[0] + "[Title]) AND " + side_effect + "[Title/Abstract])"
    handle = Entrez.esearch(db="pubmed", term=terms, rettype="medline", retmode="text")
    record = Entrez.read(handle)
    handle.close()
    idlist = record["IdList"]
    handle2 = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = Medline.parse(handle2)
    records = list(records)
    var = 0
    titres = []
    for record in records:
        titre = record.get("TI", "?")
        titres.append(titre)
    for i in titres:
        if drug_eng[0] in i and side_effect in i:
            var += 1
    if var != 0:
        return True
    else:
        return False
def download_many(ids, callback_fn, broken_fn=None, batchsize=500, parser=None):
    """Download multiple PubMed records, no return value (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Download many records from PubMed.  ids is a list of either the Medline
    Unique ID or the PubMed ID's of the articles.  Each time a record is
    downloaded, callback_fn is called with the text of the record.  broken_fn
    is an optional function that is called with the id of records that were
    not able to be downloaded.  batchsize is the number of records to request
    each time.
    """
    # parser is an undocumented parameter that allows people to
    # specify an optional parser to handle each record.  This is
    # dangerous because the results may be malformed, and exceptions
    # in the parser may disrupt the whole download process.
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")
    current_batchsize = batchsize
    # Loop until all the ids are processed.  We want to process as
    # many as possible with each request.  Unfortunately, errors can
    # occur.  Some id may be incorrect, or the server may be
    # unresponsive.  In addition, one broken id out of a list of id's
    # can cause a non-specific error.  Thus, the strategy I'm going to
    # take, is to start by downloading as many as I can.  If the
    # request fails, I'm going to half the number of records I try to
    # get.  If there's only one more record, then I'll report it as
    # broken and move on.  If the request succeeds, I'll double the
    # number of records until I get back up to the batchsize.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)
        id_str = ','.join(ids[:current_batchsize])
        try:
            # Query PubMed.  If one or more of the id's are broken,
            # this will raise an IOError.
            handle = Entrez.efetch(db="pubmed", id=id_str, retmode='text', rettype='medlars')
            # I'm going to check to make sure PubMed returned the same
            # number of id's as I requested.  If it didn't, then I'm going
            # to raise an exception.  This could take a lot of memory if
            # the batchsize is large.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                raise IOError
            handle = File.StringHandle(results)
        except IOError:
            # Query did not work.
            if current_batchsize == 1:
                # There was only 1 id in the query.  Report it as
                # broken and move on.
                id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(id)
            else:
                # I don't know which one is broken.  Try again with
                # fewer id's.
                current_batchsize = current_batchsize / 2
                nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1
        # Iterate through the results and pass the records to the
        # callback.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1
        ids = ids[current_batchsize:]
        # If I'm not downloading the maximum number of articles,
        # double the number for next time.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
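# --- Modern equivalent sketch (not from the original source) ---
# The deprecated helper above points to Bio.Entrez as its replacement. A
# minimal batched fetch with the current API might look like this; the
# function name and batch size are illustrative.
from Bio import Entrez, Medline

Entrez.email = "*****@*****.**"  # placeholder, as elsewhere in these snippets


def download_records(ids, batchsize=100):
    """Fetch Medline records in batches and yield them one at a time."""
    for start in range(0, len(ids), batchsize):
        batch = ids[start:start + batchsize]
        handle = Entrez.efetch(db="pubmed", id=",".join(batch),
                               rettype="medline", retmode="text")
        for record in Medline.parse(handle):
            yield record
        handle.close()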
"""Example script showing how to interact with PubMed.""" # standard library import string # biopython from Bio import PubMed from Bio import Medline # do the search and get the ids search_term = 'orchid' orchid_ids = PubMed.search_for(search_term) print orchid_ids # access Medline through a dictionary interface that returns PubMed Records rec_parser = Medline.RecordParser() medline_dict = PubMed.Dictionary(parser=rec_parser) for id in orchid_ids[0:5]: cur_record = medline_dict[id] print 'title:', string.rstrip(cur_record.title) print 'authors:', cur_record.authors print 'source:', string.strip(cur_record.source) print
from Bio import Entrez, Medline

Entrez.email = "*****@*****.**"
count = 1
dst = open("/tmp/pmc.csv", "w")
# for i in range(0, 4):
search_handle = Entrez.esearch(db="pmc", usehistory="y",
                               term='Multimodal AND "Deep Learning" AND (cancer OR tumour OR neoplasm)',
                               retmax=400, retstart=0)
page_record = Entrez.read(search_handle)
for pmcid in page_record['IdList']:
    print("Fetching pmcid = " + pmcid)
    fetch_handle = Entrez.efetch(db='pmc', rettype="medline", retmode="text", id=pmcid)
    records = Medline.parse(fetch_handle)
    for record in records:
        if 'AU' in record:
            author = ','.join(record['AU'])
            print(author)
        else:
            author = ''
        if 'AID' in record:
            doi = ','.join(record['AID'])
            print(doi)
        else:
            doi = ''
        if 'PMC' in record:
            pmc = record['PMC']
            print(pmc)
        else:
            pmc = ''  # inferred from the pattern above; the source snippet breaks off here
y41, y42 = 2000, 2004

#### D-1
handle = Entrez.esearch(db="pubmed", term=findkey, retmax=nofref,
                        mindate=str(y11), maxdate=str(y12))
record = Entrez.read(handle)
handle.close()
idlist = record["IdList"]
handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
records = Medline.parse(handle)
records = list(records)
w, h = 5, len(records)
maa = [[None for x in range(w)] for y in range(h)]
for i in range(0, len(records)):
    maa[i][0] = records[i].get("PMID", "?")
    maa[i][1] = records[i].get("TI", "?")
    maa[i][2] = records[i].get("AB", "?")
    maa[i][3] = records[i].get("DP")
    maa[i][4] = records[i].get("PT", "?")
ma1 = pd.DataFrame(maa)
ma1.columns = ['PMID', 'TI', 'AB', 'DP', 'PT']
def collect_NCBI():
    global all_pmids
    global pmid_dict
    if os.path.exists(f'./{rel_name}/{rel_name}_pmid_dict.json'):
        with open(f'./{rel_name}/{rel_name}_pmid_dict.json', 'r') as f:
            jd = f.read()
            temp_dict = json.loads(jd)
            pmid_dict.update(temp_dict)
            return pmid_dict
    for idx in tqdm(range(len(all_pmids))):
        pmid = all_pmids[idx]
        # get records for each pmid
        fetch_records_handle1 = efetch(db="pubmed", id=str(pmid), rettype="medline", retmode="text")
        # parse fetched records
        records1 = Medline.parse(fetch_records_handle1)
        # Need to iterate over records to extract information
        for record1 in records1:
            # try/except check to be sure that NCBI is not returning an empty result
            try:
                # get the pmcid if it exists
                id2 = record1['PMC'][3:]
                # print('PMC', id2)
                # get records for the pmcid
                fetch_records_handle2 = efetch(db="pubmed", id=str(id2), rettype="medline", retmode="text")
                # parse records for the pmcid
                records2 = Medline.parse(fetch_records_handle2)
                # Need to iterate over records to extract information.
                # Collect the following information: authors, authors'
                # affiliations, publication date, citations, grants.
                # Store it all in a dictionary (pmid_dict).
                for record2 in records2:
                    authors = record2['FAU']
                    affiliations = record2['AD']
                    pub_date = record2['DCOM']
                    citations = get_links_id(pmid)
                    grants = record2['GR']
                    pmid_dict[pmid] = {'pmcid_number': id2,
                                       'pmcid': True,
                                       'authors': authors,
                                       'affiliations': affiliations,
                                       'grants': grants,
                                       'pub_date': pub_date,
                                       'citations': citations}
            except:
                authors = record1['FAU']
                try:
                    affiliations = record1['AD']
                except:
                    affiliations = ''
                try:
                    pub_date = record1['DCOM']
                except:
                    pub_date = ''
                try:
                    citations = get_links_id(pmid)
                except:
                    citations = ''
                try:
                    grants = record1['GR']
                except:
                    grants = ''
                pmid_dict[pmid] = {'pmcid_number': '',
                                   'pmcid': False,
                                   'authors': authors,
                                   'affiliations': affiliations,
                                   'grants': grants,
                                   'pub_date': pub_date,
                                   'citations': citations}
    with open(f'./{rel_name}/{rel_name}_pmid_dict.json', 'w') as output:
        output.write(json.dumps(pmid_dict))
    return pmid_dict
def query_search_pubmed(query: str, ret_max: str, email: str, min_date: str, max_date: str):
    """Search PubMed via the user's query supplied through the command line.

    Parameters
    ----------
    query: a query to be searched against the PubMed database
    email: a user's email for access to the PubMed database
    ret_max: total number of records to retrieve for the query
    min_date: the minimum or start date to search
    max_date: the maximum or end date to search

    Return
    -------
    retrieved document summaries as records
    """
    Entrez.email = email
    if min_date and max_date:
        # search the PubMed db for the entered query
        search = Entrez.esearch(db="pubmed", term=query, sort="relevance", retmode="text",
                                retmax=ret_max, mindate=min_date, maxdate=max_date)
    else:
        # search the PubMed db for the entered query
        search = Entrez.esearch(db="pubmed", term=query, sort="relevance", retmode="text",
                                retmax=ret_max, usehistory='y')
    search_records = Entrez.read(search)
    search.close()
    # get the list of ids for the searched records
    list_ids = search_records['IdList']
    print(f"\nTotal of {len(list_ids)} records retrieved!")
    ids = ",".join(list_ids)
    # return document summaries as a result handle
    # note: WebEnv/QueryKey are only present when usehistory='y' was passed,
    # i.e. the else branch above; the dated branch would raise a KeyError here
    fetch_records = Entrez.efetch(db="pubmed", id=ids, rettype="Medline", retmode="text",
                                  webenv=search_records['WebEnv'],
                                  query_key=search_records['QueryKey'])
    search_results = Medline.parse(fetch_records)
    # fetch_records.close()
    return search_results
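# --- Usage sketch (not from the original source) ---
# The query and retmax values are made up. Empty date strings route the call
# through the usehistory branch, which is the one that supplies the
# WebEnv/QueryKey values the later efetch relies on (see the note above).
results = query_search_pubmed(query="antimicrobial resistance", ret_max="100",
                              email="*****@*****.**", min_date="", max_date="")
for rec in results:
    print(rec.get("TI", "?"))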
def test_parse(self): handle = open("Medline/pubmed_result2.txt") records = Medline.parse(handle) record = next(records) self.assertEqual(record["PMID"], "16403221") self.assertEqual(record["OWN"], "NLM") self.assertEqual(record["STAT"], "MEDLINE") self.assertEqual(record["DA"], "20060220") self.assertEqual(record["DCOM"], "20060314") self.assertEqual(record["PUBM"], "Electronic") self.assertEqual(record["IS"], "1471-2105 (Electronic)") self.assertEqual(record["VI"], "7") self.assertEqual(record["DP"], "2006") self.assertEqual( record["TI"], "A high level interface to SCOP and ASTRAL implemented in python.") self.assertEqual(record["PG"], "10") self.assertEqual( record["AB"], "BACKGROUND: Benchmarking algorithms in structural bioinformatics often involves the construction of datasets of proteins with given sequence and structural properties. The SCOP database is a manually curated structural classification which groups together proteins on the basis of structural similarity. The ASTRAL compendium provides non redundant subsets of SCOP domains on the basis of sequence similarity such that no two domains in a given subset share more than a defined degree of sequence similarity. Taken together these two resources provide a 'ground truth' for assessing structural bioinformatics algorithms. We present a small and easy to use API written in python to enable construction of datasets from these resources. RESULTS: We have designed a set of python modules to provide an abstraction of the SCOP and ASTRAL databases. The modules are designed to work as part of the Biopython distribution. Python users can now manipulate and use the SCOP hierarchy from within python programs, and use ASTRAL to return sequences of domains in SCOP, as well as clustered representations of SCOP from ASTRAL. CONCLUSION: The modules make the analysis and generation of datasets for use in structural genomics easier and more principled." ) self.assertEqual( record["AD"], "Bioinformatics, Institute of Cell and Molecular Science, School of Medicine and Dentistry, Queen Mary, University of London, London EC1 6BQ, UK. [email protected]" ) self.assertEqual( record["FAU"], ["Casbon, James A", "Crooks, Gavin E", "Saqi, Mansoor A S"]) self.assertEqual(record["AU"], ["Casbon JA", "Crooks GE", "Saqi MA"]) self.assertEqual(record["LA"], ["eng"]) self.assertEqual(record["PT"], ["Evaluation Studies", "Journal Article"]) self.assertEqual(record["DEP"], "20060110") self.assertEqual(record["PL"], "England") self.assertEqual(record["TA"], "BMC Bioinformatics") self.assertEqual(record["JT"], "BMC bioinformatics") self.assertEqual(record["JID"], "100965194") self.assertEqual(record["SB"], "IM") self.assertEqual(record["MH"], [ "*Database Management Systems", "*Databases, Protein", "Information Storage and Retrieval/*methods", "Programming Languages", "Sequence Alignment/*methods", "Sequence Analysis, Protein/*methods", "Sequence Homology, Amino Acid", "*Software", "*User-Computer Interface" ]) self.assertEqual(record["PMC"], "PMC1373603") self.assertEqual(record["EDAT"], "2006/01/13 09:00") self.assertEqual(record["MHDA"], "2006/03/15 09:00") self.assertEqual(record["PHST"], [ "2005/06/17 [received]", "2006/01/10 [accepted]", "2006/01/10 [aheadofprint]" ]) self.assertEqual( record["AID"], ["1471-2105-7-10 [pii]", "10.1186/1471-2105-7-10 [doi]"]) self.assertEqual(record["PST"], "epublish") self.assertEqual(record["SO"], "BMC Bioinformatics. 
2006 Jan 10;7:10.") record = next(records) self.assertEqual(record["PMID"], "16377612") self.assertEqual(record["OWN"], "NLM") self.assertEqual(record["STAT"], "MEDLINE") self.assertEqual(record["DA"], "20060223") self.assertEqual(record["DCOM"], "20060418") self.assertEqual(record["LR"], "20061115") self.assertEqual(record["PUBM"], "Print-Electronic") self.assertEqual(record["IS"], "1367-4803 (Print)") self.assertEqual(record["VI"], "22") self.assertEqual(record["IP"], "5") self.assertEqual(record["DP"], "2006 Mar 1") self.assertEqual( record["TI"], "GenomeDiagram: a python package for the visualization of large-scale genomic data." ) self.assertEqual(record["PG"], "616-7") self.assertEqual( record["AB"], "SUMMARY: We present GenomeDiagram, a flexible, open-source Python module for the visualization of large-scale genomic, comparative genomic and other data with reference to a single chromosome or other biological sequence. GenomeDiagram may be used to generate publication-quality vector graphics, rastered images and in-line streamed graphics for webpages. The package integrates with datatypes from the BioPython project, and is available for Windows, Linux and Mac OS X systems. AVAILABILITY: GenomeDiagram is freely available as source code (under GNU Public License) at http://bioinf.scri.ac.uk/lp/programs.html, and requires Python 2.3 or higher, and recent versions of the ReportLab and BioPython packages. SUPPLEMENTARY INFORMATION: A user manual, example code and images are available at http://bioinf.scri.ac.uk/lp/programs.html." ) self.assertEqual( record["AD"], "Plant Pathogen Programme, Scottish Crop Research Institute, Invergowrie, Dundee DD2 5DA, Scotland, UK. [email protected]" ) self.assertEqual(record["FAU"], [ "Pritchard, Leighton", "White, Jennifer A", "Birch, Paul R J", "Toth, Ian K" ]) self.assertEqual(record["AU"], ["Pritchard L", "White JA", "Birch PR", "Toth IK"]) self.assertEqual(record["LA"], ["eng"]) self.assertEqual( record["PT"], ["Journal Article", "Research Support, Non-U.S. Gov't"]) self.assertEqual(record["DEP"], "20051223") self.assertEqual(record["PL"], "England") self.assertEqual(record["TA"], "Bioinformatics") self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)") self.assertEqual(record["JID"], "9808944") self.assertEqual(record["SB"], "IM") self.assertEqual(record["MH"], [ "Chromosome Mapping/*methods", "*Computer Graphics", "*Database Management Systems", "*Databases, Genetic", "Information Storage and Retrieval/methods", "*Programming Languages", "*Software", "*User-Computer Interface" ]) self.assertEqual(record["EDAT"], "2005/12/27 09:00") self.assertEqual(record["MHDA"], "2006/04/19 09:00") self.assertEqual(record["PHST"], ["2005/12/23 [aheadofprint]"]) self.assertEqual( record["AID"], ["btk021 [pii]", "10.1093/bioinformatics/btk021 [doi]"]) self.assertEqual(record["PST"], "ppublish") self.assertEqual( record["SO"], "Bioinformatics. 2006 Mar 1;22(5):616-7. 
Epub 2005 Dec 23.") record = next(records) self.assertEqual(record["PMID"], "14871861") self.assertEqual(record["OWN"], "NLM") self.assertEqual(record["STAT"], "MEDLINE") self.assertEqual(record["DA"], "20040611") self.assertEqual(record["DCOM"], "20050104") self.assertEqual(record["LR"], "20061115") self.assertEqual(record["PUBM"], "Print-Electronic") self.assertEqual(record["IS"], "1367-4803 (Print)") self.assertEqual(record["VI"], "20") self.assertEqual(record["IP"], "9") self.assertEqual(record["DP"], "2004 Jun 12") self.assertEqual(record["TI"], "Open source clustering software.") self.assertEqual(record["PG"], "1453-4") self.assertEqual( record["AB"], "SUMMARY: We have implemented k-means clustering, hierarchical clustering and self-organizing maps in a single multipurpose open-source library of C routines, callable from other C and C++ programs. Using this library, we have created an improved version of Michael Eisen's well-known Cluster program for Windows, Mac OS X and Linux/Unix. In addition, we generated a Python and a Perl interface to the C Clustering Library, thereby combining the flexibility of a scripting language with the speed of C. AVAILABILITY: The C Clustering Library and the corresponding Python C extension module Pycluster were released under the Python License, while the Perl module Algorithm::Cluster was released under the Artistic License. The GUI code Cluster 3.0 for Windows, Macintosh and Linux/Unix, as well as the corresponding command-line program, were released under the same license as the original Cluster code. The complete source code is available at http://bonsai.ims.u-tokyo.ac.jp/mdehoon/software/cluster. Alternatively, Algorithm::Cluster can be downloaded from CPAN, while Pycluster is also available as part of the Biopython distribution." ) self.assertEqual( record["AD"], "Human Genome Center, Institute of Medical Science, University of Tokyo, 4-6-1 Shirokanedai, Minato-ku, Tokyo, 108-8639 Japan. [email protected]" ) self.assertEqual( record["FAU"], ["de Hoon, M J L", "Imoto, S", "Nolan, J", "Miyano, S"]) self.assertEqual(record["AU"], ["de Hoon MJ", "Imoto S", "Nolan J", "Miyano S"]) self.assertEqual(record["LA"], ["eng"]) self.assertEqual(record["PT"], [ "Comparative Study", "Evaluation Studies", "Journal Article", "Validation Studies" ]) self.assertEqual(record["DEP"], "20040210") self.assertEqual(record["PL"], "England") self.assertEqual(record["TA"], "Bioinformatics") self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)") self.assertEqual(record["JID"], "9808944") self.assertEqual(record["SB"], "IM") self.assertEqual(record["MH"], [ "*Algorithms", "*Cluster Analysis", "Gene Expression Profiling/*methods", "Pattern Recognition, Automated/methods", "*Programming Languages", "Sequence Alignment/*methods", "Sequence Analysis, DNA/*methods", "*Software" ]) self.assertEqual(record["EDAT"], "2004/02/12 05:00") self.assertEqual(record["MHDA"], "2005/01/05 09:00") self.assertEqual(record["PHST"], ["2004/02/10 [aheadofprint]"]) self.assertEqual( record["AID"], ["10.1093/bioinformatics/bth078 [doi]", "bth078 [pii]"]) self.assertEqual(record["PST"], "ppublish") self.assertEqual( record["SO"], "Bioinformatics. 2004 Jun 12;20(9):1453-4. 
Epub 2004 Feb 10.") record = next(records) self.assertEqual(record["PMID"], "14630660") self.assertEqual(record["OWN"], "NLM") self.assertEqual(record["STAT"], "MEDLINE") self.assertEqual(record["DA"], "20031121") self.assertEqual(record["DCOM"], "20040722") self.assertEqual(record["LR"], "20061115") self.assertEqual(record["PUBM"], "Print") self.assertEqual(record["IS"], "1367-4803 (Print)") self.assertEqual(record["VI"], "19") self.assertEqual(record["IP"], "17") self.assertEqual(record["DP"], "2003 Nov 22") self.assertEqual( record["TI"], "PDB file parser and structure class implemented in Python.") self.assertEqual(record["PG"], "2308-10") self.assertEqual( record["AB"], "The biopython project provides a set of bioinformatics tools implemented in Python. Recently, biopython was extended with a set of modules that deal with macromolecular structure. Biopython now contains a parser for PDB files that makes the atomic information available in an easy-to-use but powerful data structure. The parser and data structure deal with features that are often left out or handled inadequately by other packages, e.g. atom and residue disorder (if point mutants are present in the crystal), anisotropic B factors, multiple models and insertion codes. In addition, the parser performs some sanity checking to detect obvious errors. AVAILABILITY: The Biopython distribution (including source code and documentation) is freely available (under the Biopython license) from http://www.biopython.org" ) self.assertEqual( record["AD"], "Department of Cellular and Molecular Interactions, Vlaams Interuniversitair Instituut voor Biotechnologie and Computational Modeling Lab, Department of Computer Science, Vrije Universiteit Brussel, Pleinlaan 2, 1050 Brussels, Belgium. [email protected]" ) self.assertEqual(record["FAU"], ["Hamelryck, Thomas", "Manderick, Bernard"]) self.assertEqual(record["AU"], ["Hamelryck T", "Manderick B"]) self.assertEqual(record["LA"], ["eng"]) self.assertEqual(record["PT"], [ "Comparative Study", "Evaluation Studies", "Journal Article", "Research Support, Non-U.S. Gov't", "Validation Studies" ]) self.assertEqual(record["PL"], "England") self.assertEqual(record["TA"], "Bioinformatics") self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)") self.assertEqual(record["JID"], "9808944") self.assertEqual(record["RN"], ["0 (Macromolecular Substances)"]) self.assertEqual(record["SB"], "IM") self.assertEqual(record["MH"], [ "Computer Simulation", "Database Management Systems/*standards", "*Databases, Protein", "Information Storage and Retrieval/*methods/*standards", "Macromolecular Substances", "*Models, Molecular", "*Programming Languages", "Protein Conformation", "*Software" ]) self.assertEqual(record["EDAT"], "2003/11/25 05:00") self.assertEqual(record["MHDA"], "2004/07/23 05:00") self.assertEqual(record["PST"], "ppublish") self.assertEqual(record["SO"], "Bioinformatics. 2003 Nov 22;19(17):2308-10.") self.assertRaises(StopIteration, next, records) handle.close()
from Bio import Medline

# Example script to work with a MEDLINE-format download of a PubMed query.
# The input file can be produced with EDirect, e.g.:
#   esearch -db pubmed -query 'antimicrobial resistance' | efilter -mindate 1950 -maxdate 1990 -datetype PDAT | efetch -format medline > 50_90_medline.txt

medline = []
with open('../data/medline/10_18_medline.txt') as medline_file:
    records = Medline.parse(medline_file)
    for record in records:
        medline.append(record)

for i in range(len(medline)):
    medline_entry = medline[i]  # fixed: the original indexed medline[0] on every pass
    print(medline_entry)
    # outpath = 'mesh_10_18/' + medline_entry.get('PMID')
    # with open(outpath, 'w') as file:
    #     file.write(str(medline_entry.get('MH')))

# Example shell one-liner to move files whose contents are the literal
# string "None" (records with no MH field):
#   grep -lrIZ None | xargs -r0 mv -t nonefile/ --
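# The commented-out block above writes each record's MeSH headings to a file
# named after its PMID. A hedged, working sketch of that step (the directory
# name is taken from the comment; the PMID guard is my own addition):
import os

outdir = "mesh_10_18"
os.makedirs(outdir, exist_ok=True)
for entry in medline:
    pmid = entry.get("PMID")
    if pmid is None:
        continue  # skip records without a PMID rather than failing on the path join
    with open(os.path.join(outdir, pmid), "w") as out:
        # MH may be missing, in which case this writes the string "None";
        # the grep/mv one-liner above exists to sweep those files aside.
        out.write(str(entry.get("MH")))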
def handle(self, *args, **options):
    for term in args:
        print "searching [%s]" % term
        handle = Entrez.esearch(db="pubmed", retmax=10, term=term)
        record = Entrez.read(handle)
        ids_list = record['IdList']
        for id in ids_list:
            # renamed from 'a' to avoid shadowing the Author variable below
            fetch_handle = Entrez.efetch(db="pubmed", id=id,
                                         rettype='medline', retmode='text')
            ff = fetch_handle.readlines()
            records = Medline.parse(ff)
            r = records.next()
            try:
                cit = Citation()
                cit.pmid = int(r['PMID'])
                cit.title = r['TI'] if 'TI' in r else None
                cit.abstract = r['AB'] if 'AB' in r else None
                cit.pagination = r['PG'] if 'PG' in r else None
                cit.copyright_information = " ; ".join(r['CI']) if 'CI' in r else None
                # dates
                if 'CRDT' in r:
                    conv = time.strptime(r['CRDT'][0], "%Y/%m/%d %H:%M")
                    cit.date_created = datetime.datetime(*conv[:6])
                if 'DCOM' in r:  # e.g. 'DCOM': '19990406'
                    conv = time.strptime(r['DCOM'], "%Y%m%d")
                    cit.date_completed = datetime.datetime(*conv[:6])
                if 'LR' in r:
                    conv = time.strptime(r['LR'], "%Y%m%d")
                    cit.date_revised = datetime.datetime(*conv[:6])
                if 'DEP' in r:
                    conv = time.strptime(r['DEP'], "%Y%m%d")
                    cit.date_electronic_publication = datetime.datetime(*conv[:6])
                cit.save()
                # relationships
                # publication types
                if 'PT' in r:
                    for pub_type in r['PT']:
                        (pt, created) = PubType.objects.get_or_create(pub_type=pub_type)
                        cit.pub_types.add(pt)
                # authors
                if 'AU' in r:
                    for i, author in enumerate(r['AU']):
                        if author != 'et al.':
                            (a, created) = Author.objects.get_or_create(
                                name=author, full_name=r['FAU'][i])
                            cit.authors.add(a)
                # language
                if 'LA' in r:
                    for lang in r['LA']:
                        (l, created) = Language.objects.get_or_create(language=lang)
                        cit.languages.add(l)
                # affiliation
                if 'AD' in r:
                    (organization, created) = Organization.objects.get_or_create(name=r['AD'])
                    cit.affiliation = organization
                # journal
                if 'JID' in r:
                    issn = r['IS'] if 'IS' in r else None
                    volume = r['VI'] if 'VI' in r else None
                    issue = r['IP'] if 'IP' in r else None
                    if 'PL' not in r:
                        r['PL'] = 'Mexico'
                    (journal, created) = Journal.objects.get_or_create(
                        jid=r['JID'], issn=issn, volume=volume, issue=issue,
                        title=r['JT'], iso_abbreviation=r['TA'], country=r['PL'])
                    cit.journal = journal
                cit.save()
                # MeSH terms: a leading '*' marks a major topic, either on the
                # descriptor itself or on an individual subheading
                if 'MH' in r:
                    for mesh_term in r['MH']:  # renamed from 'term' to avoid shadowing the outer loop
                        term_subs = mesh_term.split('/')
                        if term_subs[0].startswith('*'):
                            (mh, created) = Meshterm.objects.get_or_create(term=term_subs[0][1:])
                            major = True
                        else:
                            (mh, created) = Meshterm.objects.get_or_create(term=term_subs[0])
                            major = False
                        mc = Meshcitation.objects.create(meshterm=mh, citation=cit, major=major)
                        for subterm in term_subs[1:]:
                            if subterm.startswith('*'):
                                major = True
                                subterm = subterm[1:]
                            else:
                                major = False
                            (sh, created) = Subheading.objects.get_or_create(term=subterm)
                            sht = Subheadingterm.objects.create(
                                subheading=sh, meshcitation=mc, major=major)
                self.stdout.write('%s' % cit)
            except Exception:
                print "error trying to load %s" % r['PMID']
                import pprint
                import sys
                print sys.exc_info()[0]
                pprint.pprint(r)
                raise
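# The four date conversions above repeat the same strptime round-trip. A
# small helper (my own naming, not part of the original command) would
# condense them; the tag/format pairs follow the fields used above:
import datetime
import time

def medline_date(record, tag, fmt="%Y%m%d"):
    """Return a MEDLINE date field as a datetime, or None if the tag is absent.

    Hypothetical helper: DCOM/LR/DEP use %Y%m%d, while CRDT is a list
    whose first element uses "%Y/%m/%d %H:%M".
    """
    if tag not in record:
        return None
    value = record[tag]
    if isinstance(value, list):  # CRDT holds a list of timestamp strings
        value = value[0]
        fmt = "%Y/%m/%d %H:%M"
    return datetime.datetime(*time.strptime(value, fmt)[:6])

# e.g. cit.date_completed = medline_date(r, "DCOM")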
abs_count.append(result["Count"]) ids = result["IdList"] # only download abstracts if there are at least 5 available if len(ids) > 5: batches = [ids[x:x + 10] for x in range(0, len(ids), batch_size)] record_list = [] for batch in tqdm(batches): h = Entrez.efetch(db="pubmed", id=batch, rettype="medline", retmode="text") records = Medline.parse(h) record_list.extend(list(records)) if len(record_list) != 0: # if the recorsd list is not empty record_list_df = pd.DataFrame(record_list) # make a data frame record_list_df = record_list_df[record_list_df['AB'].notna( )] # keep rows without na in abstract column record_list_df['AB'] = record_list_df['AB'].str.lower( ) # make all text lower case record_list_df['AB'] = record_list_df['AB'].apply( remove_punctuation) # remove punctuation text_list = [row.split(' ') for row in record_list_df['AB'] ] # convert into list of single words
def download_abstracts(dataset, email):
    """Download the abstracts for a list of PubMed IDs. Uses the BioPython package.

    .. versionadded:: 0.0.2

    Parameters
    ----------
    dataset : :obj:`nimare.dataset.Dataset`
        A Dataset object where IDs are in the form PMID-EXPID
    email : :obj:`str`
        Email address to use to call the PubMed API

    Returns
    -------
    dataset : :obj:`nimare.dataset.Dataset`

    Warning
    -------
    This function assumes that the dataset uses identifiers in the format
    [PMID-EXPID]. Thus, the ``study_id`` column of the
    :obj:`nimare.dataset.Dataset.texts` DataFrame should correspond to PMID.
    """
    try:
        from Bio import Entrez, Medline
    except ImportError:
        raise Exception("Module biopython is required for downloading abstracts from PubMed.")

    Entrez.email = email

    if isinstance(dataset, Dataset):
        pmids = dataset.texts["study_id"].astype(str).tolist()
        pmids = sorted(list(set(pmids)))
    elif isinstance(dataset, list):
        pmids = [str(pmid) for pmid in dataset]
    else:
        raise Exception(f"Dataset type not recognized: {type(dataset)}")

    records = []
    # PubMed only allows you to search ~1000 at a time. I chose 900 to be safe.
    chunks = [pmids[x:x + 900] for x in range(0, len(pmids), 900)]
    for i, chunk in enumerate(chunks):
        LGR.info(f"Downloading chunk {i + 1} of {len(chunks)}")
        h = Entrez.efetch(db="pubmed", id=chunk, rettype="medline", retmode="text")
        records += list(Medline.parse(h))

    # Pull data for studies with abstracts
    data = [[study["PMID"], study["AB"]] for study in records if study.get("AB", None)]
    df = pd.DataFrame(columns=["study_id", "abstract"], data=data)

    if not isinstance(dataset, Dataset):
        return df

    dataset.texts = pd.merge(dataset.texts, df,
                             left_on="study_id", right_on="study_id", how="left")
    return dataset
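# A usage sketch of the list-input branch documented above, assuming the
# surrounding module context (pandas as pd, LGR, Dataset) is importable.
# The PMIDs and email address here are placeholders:
df = download_abstracts(["16377612", "14871861"], email="you@example.org")
# With a plain list of PMIDs the function returns a pandas DataFrame of
# study_id/abstract pairs instead of an updated Dataset.
print(df.head())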