Exemplo n.º 1
0
def index():
	ix = open_dir("indexdir")
	writer = ix.writer()
	for pfile in pubmed_files:
		print "parsing",pfile
		txt=open(project+"/"+pfile,"r")
		records=Medline.parse(txt)
		for r in records:
			if "AB" not in r:
				continue
			authors=""
			if "FAU" in r:
				authors+=",".join(r["FAU"])
			elif "AU" in r:
				authors+=",".join(r["AU"])
			else:
				firstAuthor="Unknown"
			date=datetime.datetime.strptime(r["DA"],"%Y%m%d")
			title=r["TI"]
			pmid=r["PMID"].decode("utf-8")

			writer.add_document(
				title=title.decode("utf-8"),
				path=pfile.decode("utf-8"),
				abstract=r['AB'].decode("utf-8"),
				authors=authors.decode("utf-8"),
				pmid=pmid,
				dateAdded=date
				)
	writer.commit()
	print "Index contain",ix.doc_count()
Exemplo n.º 2
0
def top_papers(papers,outpath=None,delim="\t", top=20):
    """This function fetches all the relevant PubMed info for each PMID in 'papers' and 
    1) puts it into a list and 2) outputs it to a file named in outpath."""
    #
    # Can be used with SP & GOA data

    # `papers` maps PMID -> collection of annotations; rank PMIDs by how many
    # annotations they carry, keeping only the `top` most-annotated ones.
    papers_annots = [(len(papers[p]), p) for p in papers]
    papers_annots2 = []

    papers_annots.sort()
    idlist = [p[1] for p in papers_annots[-top:]]
    Entrez.email = "*****@*****.**"
    # One batched EFetch for all selected PMIDs, in plain-text Medline format.
    h = Entrez.efetch(db="pubmed", id=",".join(idlist), 
                          rettype="medline", retmode="text")
    medrecs = list(Medline.parse(h))
    titles = [medrec.get("TI","?") for medrec in medrecs]
    years = [medrec.get("DP","?") for medrec in medrecs]
    journals = [medrec.get("JT", "?") for medrec in medrecs]
    # NOTE(review): this zip assumes EFetch returns records in the same order
    # as `idlist` — TODO confirm; a reordering would mis-attribute metadata.
    for p, title, year, journal in zip(papers_annots[-top:], titles,years, journals):
        # "DP" looks like "2002 Sep"; keep only the leading year token.
        papers_annots2.append((p[0],p[1], title, year.split()[0].strip(), journal))
    if outpath:
        fout = open(outpath,"w")
        print >> fout, "num proteins\tpubmed ID\tTitle\tYear\tJournal"
        for p in papers_annots2:
            print >> fout, "%d\t%s\t%s\t%s\t%s" % p
        fout.close()
    #papers_annots2 = [(# all annotations, PMID, Title, Year, Journal)] 
    return papers_annots2
Exemplo n.º 3
0
def store_abstracts_for_query(query,query_tag,maxN=None,preview_only=False):
	"""Search PubMed for `query` (last ~10 years) and store every matching
	Medline record via `pubmed_to_pg.store_medline_entry`, tagged `query_tag`.

	maxN: cap on the number of abstracts to download (None = all).
	preview_only: when True, only print the hit count and return.
	"""
	# if query_tag=="":
	# 	simpleQuery=" ".join(map(lambda x:x.name,queryTerms))
	# else:
	# 	simpleQuery=query_tag
	# query=pg.build_query(queryTerms)
	print "will search",query
	Entrez.email = "*****@*****.**"
	# usehistory="y" keeps results server-side so batches can be paged below.
	search_results = Entrez.read(Entrez.esearch(db="pubmed",
												term=query,
												reldate=10*365, datetype="pdat",
												usehistory="y"))
	count = int(search_results["Count"])
	print "Found %i results" % count
	if maxN!=None and maxN<count:
		count=maxN
		print "Only keeping first",count,"abstracts"
	if preview_only:
		return
	sys.stdout.flush()
	# Page through the server-side result set 50 records at a time.
	batch_size = 50
	for start in range(0,count,batch_size):
			end = min(count, start+batch_size)
			print "Going to download record %i to %i" % (start+1, end)
			sys.stdout.flush()
			fetch_handle = Entrez.efetch(db="pubmed",
										 rettype="medline", retmode="text",
										 retstart=start, retmax=batch_size,
										 webenv=search_results["WebEnv"],
										 query_key=search_results["QueryKey"])
			records=Medline.parse(fetch_handle)
			for r in records:
				pubmed_to_pg.store_medline_entry(r,query_tag)
Exemplo n.º 4
0
def pubmed():
    """Demo: count, list and inspect 'orchid' papers in PubMed via Entrez.

    Prints the hit count, the first 100 matching IDs, then the
    title/authors/source of every record listing "Liu ZJ" as an author.
    """
    # Get the count of papers about orchid only in database pubmed
    Entrez.email = "*****@*****.**"     # Always tell NCBI who you are
    handle = Entrez.egquery(term="orchid")
    record = Entrez.read(handle)
    for row in record["eGQueryResult"]:
        if row["DbName"] == "pubmed":
            print "The count of papers about orchid in database pubmed:", row["Count"]

    # Get the list of ids of above
    handle = Entrez.esearch(db="pubmed", term="orchid", retmax=100)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    print "The id list of papers about orchid in database pubmed:", idlist
    print

    # Search papers author by "Liu ZJ" from pubmed
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)
    search_author = "Liu ZJ"
    for record in records:
        # Skip records with no author list ("AU").
        if "AU" not in record:
            continue
        if search_author in record["AU"]:
            print "Author %s found." % search_author
            print "title:", record.get("TI", "?")
            print "authors:", record.get("AU", "?")
            print "source:", record.get("SO", "?")
            print
Exemplo n.º 5
0
def getMeSH(url):
        """Decode `url` into a query, search PubMed, and return HTML links for
        every hit whose MeSH terms ("MH") mention "tox"."""
        query = urllib.unquote_plus(url)
        if not query:
                return "<h3> No query </h3>"

        MAX_COUNT = 10000
        Entrez.email = '*****@*****.**'
        pubmedquery = query.replace('-','\-')

        search = Entrez.read(Entrez.esearch(db='pubmed', term=pubmedquery, retmax=MAX_COUNT))
        ids = search['IdList']
        if not ids:
                return "<h3> geen gevonden resultaten </h3>"

        handle = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')

        MeSHContent = ""
        for record in Medline.parse(handle):
                try:
                        if "tox" in str(record.get("MH")):
                                MeSHContent += "<h4><a href='http://www.ncbi.nlm.nih.gov/pubmed/" + str(record.get("PMID")) + "'>"
                                MeSHContent += "PMID: " + str(record.get("PMID")) + "</a> is analysed on toxicity. </h4> \n"
                except TypeError:
                        continue
        return MeSHContent
Exemplo n.º 6
0
def createTable(query):
    """Build HTML <tr> rows for up to 100 PubMed hits matching `query`."""
    if not query:
        return "<h3> No query </h3>"

    MAX_COUNT = 100
    Entrez.email = "*****@*****.**"
    sanitized = query.replace("-", "\-")

    search = Entrez.read(Entrez.esearch(db="pubmed", term=sanitized, retmax=MAX_COUNT))
    handle = Entrez.efetch(db="pubmed", id=search["IdList"],
                           rettype="medline", retmode="text")

    rows = []
    for record in Medline.parse(handle):
        try:
            row = (
                "<tr><td width='22%'>" + str(record.get("TI")) + "</td>"
                "<td width='5%'>" + str(record.get("DP")) + "</td>"
                "<td width='5%'>" + str(writers(record.get("FAU"))) + "</td>"
                "<td width='5%'>" + str(record.get("JT")) + "</td>"
                "<td width='5%'>" + str(query) + "</td>"
                "<td>"
                "<a href='http://www.ncbi.nlm.nih.gov/pubmed/"
                + str(record.get("PMID"))
                + "'><img src='PubMed.png' height='75px' width='75px' alt='PubMed' align='right'/></a>"
                + str(record.get("AB"))
                + "</td></tr>"
            )
        except TypeError:
            continue
        rows.append(row)

    return "".join(rows)
def fetch(t, s):
    """Search PubMed for term `t` (sort order `s`) and return a formatted
    text listing of every matching publication, numbered in reverse.

    Parameters
    ----------
    t : str   Entrez search term
    s : str   sort order passed to ESearch (e.g. "pub date")
    """
    h = Entrez.esearch(db="pubmed", term=t, retmax=10000, sort=s)
    idList = Entrez.read(h)["IdList"]
    results = "Total publications for SA Beatson: **" + str(len(idList)) + "**\n\n"
    results += "Chronologically sorted:\n\n"

    if idList:
        handle = Entrez.efetch(db="pubmed", id=idList, rettype="medline", retmode="text")
        records = Medline.parse(handle)
        # BUG FIX: the countdown variable was named `max`, shadowing the builtin.
        remaining = len(idList) + 1
        for record in records:
            title = record["TI"]
            author = ", ".join(record["AU"])
            source = record["SO"]
            # "DA" (date added) is formatted YYYYMMDD; parsed but unused,
            # kept for parity with the original (validates the field exists).
            pub_date = datetime.strptime(record["DA"], "%Y%m%d").date()
            pmid = record["PMID"]
            cur_pub = "| **%i.** %s\n| %s\n| %s\n| http://www.ncbi.nlm.nih.gov/pubmed/%s\n|\n" % (
                remaining - 1,
                title,
                author,
                source,
                pmid,
            )
            results = results + cur_pub
            remaining = remaining - 1
    return results
Exemplo n.º 8
0
def top_papers_dict(papers, outpath=None,delim="\t", top=None):
    """Fetch PubMed metadata for the `top` most-annotated PMIDs in `papers`.

    `papers` maps PMID -> collection of annotations.  Returns a dict
    PMID -> [annotation count, title, year, journal].  `outpath` and
    `delim` are accepted for interface compatibility but are unused.
    """
    ranked = sorted((len(annots), pmid) for pmid, annots in papers.items())
    # top=None means "keep everything" (slice from index 0).
    cutoff = 0 if top is None else -top
    selected = ranked[cutoff:]
    idlist = [pmid for _, pmid in selected]

    Entrez.email = MY_EMAIL
    handle = Entrez.efetch(db="pubmed", id=",".join(idlist),
                           rettype="medline", retmode="text")
    medrecs = list(Medline.parse(handle))

    papers_annots2_dict = {}
    for (_, pmid), medrec in zip(selected, medrecs):
        # [# of total annotations, Title, Year, Journal]
        papers_annots2_dict[pmid] = [len(papers[pmid]),
                                     medrec.get("TI", "?"),
                                     medrec.get("DP", "?").split()[0].strip(),
                                     medrec.get("JT", "?")]
    return papers_annots2_dict
Exemplo n.º 9
0
 def test_medline_from_url(self):
     """Test Entrez into Medline.read from URL"""
     # Network test: fetches one live record and spot-checks two fields.
     efetch = Entrez.efetch(db="pubmed", id="19304878", rettype="medline", retmode="text")
     record = Medline.read(efetch)
     self.assertTrue(isinstance(record, dict))
     self.assertEqual("19304878", record["PMID"])
     self.assertEqual("10.1093/bioinformatics/btp163 [doi]", record["LID"])
Exemplo n.º 10
0
def retrive_record(row):
    """Look up PubMed records authored by row[1] and return a summary list.

    `row` is a sequence where row[0] is an identifier kept verbatim and
    row[1] is an author name.  Returns a list:
    [row[0], row[1], title, authors, affiliation, date, PMID].

    NOTE(review): only the data of the LAST fetched record is returned,
    matching the original behavior — confirm this is intended.

    BUG FIX: `temp` is now initialized before the loop, so an empty result
    set returns [] instead of raising NameError.
    """
    name = row[1] + "[AUTH]"
    handle = Entrez.esearch(db="pubmed", term=name)
    record = Entrez.read(handle)
    idlist = record["IdList"]

    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)

    temp = []
    for record in records:
        temp = []
        temp.append(row[0])
        temp.append(row[1])
        temp.append(record.get("TI", "?"))    # title
        temp.append(record.get("AU", "?"))    # authors
        temp.append(record.get("AD", "?"))    # affiliation/address
        temp.append(record.get("DP", "?"))    # date of publication
        temp.append(record.get("PMID", "?"))  # pubmed id for url

    return temp
Exemplo n.º 11
0
 def test_pubmed_16381885_and_19850725(self):
     """Bio.TogoWS.entry("pubmed", "16381885,19850725")"""
     # Network test: fetch two known PubMed records through TogoWS and
     # verify their titles and author lists.
     handle = TogoWS.entry("pubmed", "16381885,19850725")
     records = list(Medline.parse(handle))
     handle.close()
     self.assertEqual(len(records), 2)
     self.assertEqual(records[0]["TI"], "From genomics to chemical genomics: new developments in KEGG.")
     self.assertEqual(
         records[0]["AU"],
         [
             "Kanehisa M",
             "Goto S",
             "Hattori M",
             "Aoki-Kinoshita KF",
             "Itoh M",
             "Kawashima S",
             "Katayama T",
             "Araki M",
             "Hirakawa M",
         ],
     )
     self.assertEqual(
         records[1]["TI"],
         "DDBJ launches a new archive database with analytical tools " + "for next-generation sequence data.",
     )
     self.assertEqual(
         records[1]["AU"],
         ["Kaminuma E", "Mashima J", "Kodama Y", "Gojobori T", "Ogasawara O", "Okubo K", "Takagi T", "Nakamura Y"],
     )
Exemplo n.º 12
0
def createTable(query):
        """Render one HTML table row per PubMed hit for `query` (max 100)."""
        if not query:
                return "<h3> No query </h3>"

        MAX_COUNT = 100
        Entrez.email = '*****@*****.**'
        cleaned = query.replace('-','\-')

        found = Entrez.read(Entrez.esearch(db='pubmed', term=cleaned, retmax=MAX_COUNT))
        ids = found['IdList']
        if not ids:
                return "<h3> geen gevonden resultaten </h3>"

        medline_handle = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')

        tableContent = ""
        for record in Medline.parse(medline_handle):
                try:
                        cell = ("<tr><td width='22%'>" + str(record.get("TI")) + "</td>"
                                "<td width='5%'>" + str(record.get("DP")) + "</td>"
                                "<td width='5%'>" + str(writers(record.get("FAU"))) + "</td>"
                                "<td width='5%'>" + str(record.get("JT")) + "</td>"
                                "<td width='5%'>" + str(query) + "</td>"
                                "<td>"
                                "<a href='http://www.ncbi.nlm.nih.gov/pubmed/" + str(record.get("PMID")) +
                                "'><img src='PubMed.png' height='75px' width='75px' alt='PubMed' align='right'/></a>" +
                                str(record.get("AB")) + "</td></tr>")
                except TypeError:
                        continue
                tableContent += cell

        return tableContent
Exemplo n.º 13
0
def fetch_pubmed_data(pmid):
    """Fetch a single Medline record for `pmid` via Entrez.

    Returns the parsed record dict, or None when the remote service is
    unavailable or the PMID is invalid (web2py flash messages are set
    as side effects in those cases).
    """
    from Bio import Medline, Entrez

    # BUG FIX: the original nested an identical try/except inside the except
    # branch (dead code — the same attribute access would fail again).
    try:
        ncbiemail = settings.author_email
    except Exception:
        raise Exception('Please set an email to use ncbi services')

    Entrez.email = ncbiemail
    Entrez.tool = 'mybiodb'

    try:
        # BUG FIX: Python-3-compatible next() and membership test
        # (were the Python-2-only `.next()` and `.has_key()`).
        entrez_response = next(Medline.parse(
            Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text",)))
        if 'PMID' not in entrez_response:
            response.flash = 'pubmed ID error'
        else:
            return entrez_response
    except IOError:
        session.flash = 'Remote service not available, please try again.'

    return
Exemplo n.º 14
0
	def fetch(self, batchSize=100):
		"""Download and return the next batch of papers from the open search.

		Returns an empty list once the search has been exhausted.
		"""
		if self._done:
			return []

		end = min(self._searchCount, self._searchPosition + batchSize)
		log.info("Downloading from %i to %i..." % (self._searchPosition+1, end))

		handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text",
							   retstart=self._searchPosition, retmax=batchSize,
							   webenv=self._searchSession, query_key=self._queryKey)
		parsed = Medline.parse(handle)

		# Keep only records that actually carry a PMID.
		papers = [paper.Paper(rec) for rec in parsed if rec.get("PMID") is not None]

		handle.close()
		log.info("... downloading done")

		self._searchPosition = self._searchPosition + batchSize
		if self._searchPosition >= self._searchCount:
			self._done = True
			log.info("Search ended.")

		return papers
Exemplo n.º 15
0
def search_pubmed(term):
    "Searches a term on pubmed"
    print("Searching for", term)
    try:
        # First query: how many PubMed entries exist for this term?
        gquery = Entrez.read(Entrez.egquery(term=term))
        nb_entries = 0
        for row in gquery["eGQueryResult"]:
            if row["DbName"] == "pubmed":
                nb_entries = row["Count"]
                print(row["Count"], 'results found.')
        if int(nb_entries) == 0:
            return BibDatabase()

        # Fetch the matching IDs (capped at MAX_RESULTS).
        search = Entrez.read(Entrez.esearch(
            db="pubmed", term=term, retmax=min(int(nb_entries), MAX_RESULTS)))

        # Download the Medline descriptions for those IDs.
        fetch_handle = Entrez.efetch(db="pubmed", id=search["IdList"],
                                     rettype="medline", retmode="text")
        return transform_pubmed(list(Medline.parse(fetch_handle)))

    except Exception as e:
        print('The search failed.')
        print(e)
        return []
Exemplo n.º 16
0
 def test_read(self):
     """Medline.read on one saved record: spot-check every parsed field."""
     with open("Medline/pubmed_result1.txt") as handle:
         record = Medline.read(handle)
     self.assertEqual(record["PMID"], "12230038")
     self.assertEqual(record["OWN"], "NLM")
     self.assertEqual(record["STAT"], "MEDLINE")
     self.assertEqual(record["DA"], "20020916")
     self.assertEqual(record["DCOM"], "20030606")
     self.assertEqual(record["LR"], "20041117")
     self.assertEqual(record["PUBM"], "Print")
     self.assertEqual(record["IS"], "1467-5463 (Print)")
     self.assertEqual(record["VI"], "3")
     self.assertEqual(record["IP"], "3")
     self.assertEqual(record["DP"], "2002 Sep")
     self.assertEqual(record["TI"], "The Bio* toolkits--a brief overview.")
     self.assertEqual(record["PG"], "296-302")
     self.assertEqual(record["AB"], "Bioinformatics research is often difficult to do with commercial software. The Open Source BioPerl, BioPython and Biojava projects provide toolkits with multiple functionality that make it easier to create customised pipelines or analysis. This review briefly compares the quirks of the underlying languages and the functionality, documentation, utility and relative advantages of the Bio counterparts, particularly from the point of view of the beginning biologist programmer.")
     self.assertEqual(record["AD"], "tacg Informatics, Irvine, CA 92612, USA. [email protected]")
     self.assertEqual(record["FAU"], ["Mangalam, Harry"])
     self.assertEqual(record["AU"], ["Mangalam H"])
     self.assertEqual(record["LA"], ["eng"])
     self.assertEqual(record["PT"], ["Journal Article"])
     self.assertEqual(record["PL"], "England")
     self.assertEqual(record["TA"], "Brief Bioinform")
     self.assertEqual(record["JT"], "Briefings in bioinformatics")
     self.assertEqual(record["JID"], "100912837")
     self.assertEqual(record["SB"], "IM")
     self.assertEqual(record["MH"], ["*Computational Biology", "Computer Systems", "Humans", "Internet", "*Programming Languages", "*Software", "User-Computer Interface"])
     self.assertEqual(record["EDAT"], "2002/09/17 10:00")
     self.assertEqual(record["MHDA"], "2003/06/07 05:00")
     self.assertEqual(record["PST"], "ppublish")
     self.assertEqual(record["SO"], "Brief Bioinform. 2002 Sep;3(3):296-302.")
Exemplo n.º 17
0
def get_wikiref(pmid):
    """ Returns the Wiki cite journal entry for a given Pubmed ID """

    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
    records = list(Medline.parse(handle))

    import datetime
    now = datetime.datetime.now()
    jetzt = now.strftime("%Y-%m-%d")  # access date (today)

    # generate the {{cite journal}} format
    # BUG FIX: `outstring` was assigned only inside the loop, so an empty
    # result set raised NameError at the return; default to "" instead.
    outstring = ""
    for rec in records:
        # BUG FIX: copy the author list before pop() so the record is not
        # mutated in place.
        aut = list(rec["AU"])
        firstauthor = aut.pop(0)
        coauthors = ", ".join(aut)

        # get date of publication: year token of the CRDT (create date) field,
        # formatted like "YYYY/MM/DD ...".
        datee = rec["CRDT"][0].split('/')[0]

        outstring = "{{cite journal|title=%s|journal=%s|year=%s|author=%s|coauthors=%s|volume=%s|pages=%s|id=PMID %s|accessdate=%s}}" % \
                    (rec["TI"], rec["JT"], datee, firstauthor, coauthors, rec["VI"], rec["PG"], pmid, jetzt)

        # example:
        #{{cite journal|title=|journal=|date=2008/07/31/|first=Cyril|last=Herry|coauthors=i|volume=454|issue=7204|pages=600-606|id=PMID 18615015 {{doi|10.1038/nature07166}}|url=http://www.fmi.ch/downloads/news/2008.07.11.01.luthi.nature.press.release.pdf|format=|accessdate=2009-09-12 }}

    return outstring
def main(Substance, Organism, Gene):
    """Search PubMed for a hard-coded substance/organism pair, print the
    PMID set, and return "Jay".

    NOTE(review): the Substance/Organism/Gene parameters are currently
    ignored; the search terms are hard-coded below.
    """
    zoekterm1 = "Cocaine"
    zoekterm2 = "Elegans"
    MAX_COUNT = 50
    dic = {}
    titels = []
    TERM = ''
    TERMS = []
    count = 1  # kept for parity with the original (unused)

    if zoekterm2 == "":
        TERM = zoekterm1
    if zoekterm1 == "":
        print("vul een zoekterm in")
        sys.exit()
    elif zoekterm2 != "":
        TERM = zoekterm1+" and "+zoekterm2
    TERMS.append(TERM)
    print(TERM)

    search = Entrez.read(Entrez.esearch(db="pubmed", term= TERM, retmax=MAX_COUNT))
    handle = Entrez.efetch(db="pubmed", id=search["IdList"], rettype="medline",
                           retmode="text")
    for record in list(Medline.parse(handle)):
        titels.append(record.get("PMID","?"))
        pubSet = set(titels)

    dic[TERM] = pubSet
    print(dic)
    return "Jay"
Exemplo n.º 19
0
    def __init__(self, pmids):
        
         # Fetch the Medline records for every id in `pmids` and keep the
         # (lazy) parser iterator on self.records.
         Entrez.email = '*****@*****.**'

         ## pmids is a list (array of pmid)
         handle = Entrez.efetch(db='pubmed', id=pmids, rettype='medline', retmode='text')
         self.records = Medline.parse(handle)
Exemplo n.º 20
0
def getMedlineList(pmids):

    """
    This function takes a list of article-ids and returns a list of
    MedLine articles that contains an abstract.
    """

    records = []
    cleaned_records = []
    listLength = len(pmids)

    Entrez.email = '*****@*****.**'

    # Fetch in chunks of 650 ids per request.
    for i in range(0, listLength, 650):
        tempList = pmids[i:i + 650]
        handle = Entrez.efetch(db='pubmed', id=tempList,rettype='medline', retmode='text')
        try:
            records.extend(list(Medline.parse(handle)))
        except:
            # Best-effort: log the first id of the failed chunk to an error dir.
            # NOTE(review): bare except also swallows KeyboardInterrupt.
            IOmodule.writeOutTxt(_mainFolder+'/'+'errordir_medline_records', pmids[i], '')

        print 'Downloaded',len(records),'MedLine articles.',str(listLength-len(records)),'remaining...'

    # Keep only the articles that actually contain an abstract ("AB").
    for article in records:
        if 'AB' in article:
            cleaned_records.append(article)

    print 'Returned',len(cleaned_records),'MedLine articles containing an abstract.'
    return cleaned_records
def get_pubmed_document(pubmed_ids, source='pubmed', include_pubtator=True):
    """Fetch Medline records for `pubmed_ids` (scalar or list) and persist a
    Document plus title/abstract Sections for each sufficiently complete record.

    include_pubtator: when True, also run doc.init_pubtator() per document.
    """
    Entrez.email = settings.ENTREZ_EMAIL

    # Accept a single id or a list of ids.
    # BUG FIX: use isinstance instead of `type(...) == list` so list
    # subclasses are handled correctly as well.
    if isinstance(pubmed_ids, list):
        ids = [str(doc_id) for doc_id in pubmed_ids]
    else:
        ids = [str(pubmed_ids)]

    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    records = Medline.parse(h)

    # Reference to abbreviations: http://www.nlm.nih.gov/bsd/mms/medlineelements.html
    for record in records:
        # Only store records that carry every field we persist.
        if record.get('TI') and record.get('AB') and record.get('PMID') and record.get('CRDT'):
            #if Document.objects.pubmed_count(record.get('PMID')) is 0:
            title = ' '.join( pad_split(record.get('TI')) )
            abstract = ' '.join( pad_split(record.get('AB')) )

            doc, doc_c = Document.objects.get_or_create(document_id=record.get('PMID'))
            doc.title = title
            doc.source = source
            doc.save()

            sec, sec_c = Section.objects.get_or_create(kind='t', document=doc)
            sec.text = title
            sec.save()

            sec, sec_c = Section.objects.get_or_create(kind='a', document=doc)
            sec.text = abstract
            sec.save()

            if include_pubtator:
                doc.init_pubtator()
Exemplo n.º 22
0
def pubsearch(jids):
    """Search PubMed (2011-2014) for 'peptide' papers published in any of the
    journals identified by `jids` (journal IDs) and return the Medline
    record iterator.
    """
    Entrez.email = "*****@*****.**"
    # always let Entrez know who is calling

    # Build "j1[JID] or j2[JID] or ..." once, then close the parenthesis.
    # BUG FIX: the original loop left a dangling trailing " or ", never closed
    # the "(" and appended a stray " and ", producing a malformed query.
    pubterm = " or ".join(jid + "[JID]" for jid in jids)

    IDhandle = Entrez.esearch(
        db="pubmed", term="peptide AND (" + pubterm + ")", mindate="2011", maxdate="2014", retmax=2500
    )
    # for documentation on esearch, see
    # http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    # max number for retmax is 100k. Use retstart to get more than this.
    # Date range used to limit a search result by the date specified by datetype. These two parameters (mindate, maxdate) must be used together to specify an arbitrary date range. The general date format is YYYY/MM/DD, and these variants are also allowed: YYYY, YYYY/MM.

    record = Entrez.read(IDhandle)
    # record is returned as a dictionary. Lists search terms, all ID numbners etc

    idlist = record["IdList"]
    # return a list of ID numbers from the record dictionary

    recordHandle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    # search pubmed for records with idlist as input

    records = Medline.parse(recordHandle)
    # create dictionary from recordHandle

    return records
Exemplo n.º 23
0
	def processInput(k):
		"""Fetch the Medline record for PMID `k` and insert one row per MeSH
		term (truncated to 24 chars) into the MeSH002 table."""
		print "Querying PMID: "+str(k)+"."
		getall = Medline.read(Entrez.efetch(db="pubmed", id=k, rettype="medline", retmode="text"))
		singlemesh = getall.get("MH")
		singledate = getall.get("EDAT")	
		for j1 in range(len(singlemesh)):
			# SECURITY(review): SQL is built by string concatenation; the
			# translate() call strips quotes/asterisks but a parameterized
			# query would be safer.
			cur.execute("INSERT INTO MeSH002(PMID, MeSH, Dates) VALUES("+str(k)+",'" + getall.get("MH")[j1][0:24].translate(None, "'*&")+"','" +  str(singledate[0:10]) +"')" )
Exemplo n.º 24
0
def get_pubmed_data(idlist):
	"""Takes a list of pubmed ids and returns title, auth, yr"""
	handle = Entrez.efetch(db='pubmed', id=idlist, rettype='medline', retmode='text')
	# One (title, authors, pmid) tuple per fetched record.
	return [(rec["TI"], rec["AU"], rec["PMID"]) for rec in Medline.parse(handle)]
Exemplo n.º 25
0
def Pubmedsearch(PMID):
	"""Return a tab-separated summary (pmid, title, full authors, authors,
	affiliation) for the first record matching `PMID`, or None if no record.

	BUG FIX: the original body mixed tabs and spaces ("    \\t..."), which is
	a TabError under Python 3; indentation is now consistent tabs.
	"""
	pmid = PMID
	handle = Entrez.efetch(db="pubmed", id= pmid, rettype="medline",retmode="text")
	records = Medline.parse(handle)
	records = list(records)
	for record in records:
		return (str(pmid)+"\t"+str(record.get("TI", "?"))+"\t"+str(record.get("FAU", "?"))+"\t"+str(record.get("AU", "?"))+"\t"+str(record.get("AD", "?")))
Exemplo n.º 26
0
def pubmedsearch(TERM, MAX_COUNT=10000):
    """Return a Medline record iterator for all PubMed hits for *TERM*."""
    Entrez.email = '*****@*****.**'
    search_result = Entrez.read(Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM))
    fetch_handle = Entrez.efetch(db='pubmed', id=search_result['IdList'],
                                 rettype='medline', retmode='text')
    return Medline.parse(fetch_handle)
Exemplo n.º 27
0
def get_articles(term):
    """Fetch full Medline records for every article matching `term`,
    downloading ids in chunks of 100 per EFetch call."""
    idlist = get_article_ids(term)
    counter=0
    #pbar = make_pbar(len(idlist),text="Fetching")
    
    articles=[]
    if len(idlist) > 100:
        # Batch the id list: 100 ids per request.
        chunks=[idlist[i:i+100] for i in range(0, len(idlist), 100)]
        for chunk in chunks:
            handle = Entrez.efetch(db="pubmed", id=chunk, rettype="medline", retmode="text")
            articles.extend(list(Medline.parse(handle)))
            print '#'
            #pbar.update(p.currval+len(chunk))
    else:
        # Small enough for a single request.
        handle=Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
        articles.extend(list(Medline.parse(handle)))
    #pbar.finish()
    
    return articles
Exemplo n.º 28
0
def get_record_from_pmid(pmid):
    """Fetch the Medline citation for a single PMID and return its record."""
    # The library operates over lists even for one id, so fetch, parse,
    # and keep the first (and only) record.
    citations = Entrez.efetch(db="pubmed", id=pmid,
                              rettype="medline", retmode="text")
    parsed = list(Medline.parse(citations))
    return parsed[0]
Exemplo n.º 29
0
def pubmedsearch(TERM, MAX_COUNT=10000):
    """Run an Entrez PubMed search for *TERM* and return the Medline records."""
    Entrez.email = '*****@*****.**'
    Entrez.tool = 'pm_impacts'
    hits = Entrez.read(Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM))
    medline_handle = Entrez.efetch(db='pubmed', id=hits['IdList'],
                                   rettype='medline', retmode='text')
    return Medline.parse(medline_handle)
Exemplo n.º 30
0
def store_abstract_with_pmid(pmid,queryTag=None):
	"""Populate the PG databases with the MEDLINE entries having these pmid.

	`pmid` is a scalar or a list of pmids; `queryTag` defaults to "PMID".
	"""
	# Idiom fix: identity test against None (was `queryTag==None`).
	if queryTag is None:
		queryTag="PMID"
	Entrez.email="*****@*****.**"
	handle=Entrez.efetch(db="pubmed",rettype="medline",retmode="text",id=pmid)
	for r in Medline.parse(handle):
		store_medline_entry(r,queryTag)
Exemplo n.º 31
0
def collectPubmedInfo(email, term, record_dict, retmax, outputPath):
    """Search PubMed for `term` and dump hit IDs plus selected record fields.

    record_dict: dict mapping a Medline field name -> list; each record's
    value for that field is appended to the list.
    Writes "PubmedIDs_1.txt" and "Results_1.txt" under outputPath.
    """
    Entrez.email = email
    # Use ESearch to collect matching PMIDs.
    handle = Entrez.esearch(db="pubmed", term=term, retmax=retmax)
    record = Entrez.read(handle)
    count = record["Count"]
    idlist = record["IdList"]
    # Use pandas to save a formated file
    df_1 = pd.DataFrame({"PubmedIDs(%s)" % (count): idlist})
    df_1.to_csv(outputPath + "PubmedIDs_1.txt", sep="\t")
    # Use EFetch to collect id and use medline to get details
    handle = Entrez.efetch(db="pubmed",
                           id=idlist,
                           rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)
    for record in records:
        for key in record_dict:
            # BUG FIX: use .get() so a record missing a field appends None
            # instead of raising KeyError and aborting the whole run.
            record_dict[key].append(record.get(key))
    # Use pandas to save a formated file
    df_2 = pd.DataFrame(record_dict)
    df_2.to_csv(outputPath + "Results_1.txt", sep="\t")
Exemplo n.º 32
0
def fetch_from_entrez(index, cache_dir=False):
    """Fetch the Medline record for `index`, with an optional on-disk cache.

    cache_dir: False to disable caching, or a sequence of path components
    (joined with '/') naming the cache directory.
    Returns the record dict, or None when every retry failed.
    """
    logger = logging.getLogger('build')

    # slugify the index for the cache filename (some indices have symbols not allowed in file names (e.g. /))
    index_slug = slugify(index)

    # try fetching from cache
    if cache_dir:
        # BUG FIX: this join was executed unconditionally, raising
        # TypeError ('/'.join(False)) whenever caching was disabled.
        cache_file_path = '{}/{}'.format('/'.join(cache_dir), index_slug)
        d = fetch_from_cache(cache_dir, index_slug)
        if d:
            logger.info('Fetched {} from cache'.format(cache_file_path))
            return d

    # if nothing is found in the cache, use the web API
    logger.info('Fetching {} from Entrez'.format(index))
    tries = 0
    max_tries = 5
    while tries < max_tries:
        if tries > 0:
            logger.warning('Failed fetching pubmed {}, retrying'.format(
                str(index)))

        try:
            Entrez.email = '*****@*****.**'
            handle = Entrez.efetch(db="pubmed",
                                   id=str(index),
                                   rettype="medline",
                                   retmode="text")
        except Exception:  # narrowed from bare except (kept broad for network errors)
            tries += 1
            time.sleep(2)
        else:
            d = Medline.read(handle)

            # save to cache
            save_to_cache(cache_dir, index_slug, d)
            logger.info('Saved entry for {} in cache'.format(index_slug))
            return d

    # BUG FIX: make the give-up path explicit instead of silently
    # falling off the end of the function.
    logger.error('Giving up on pubmed {} after {} tries'.format(index, max_tries))
    return None
Exemplo n.º 33
0
def getMedlineAbstracts(idList):
    """Fetch Medline records for `idList` and return a pandas DataFrame with
    title/authors/journal/date/keywords/abstract/PMID/url columns.

    Returns an empty DataFrame when the Entrez request fails.
    """
    fields = {
        "TI": "title",
        "AU": "authors",
        "JT": "journal",
        "DP": "date",
        "MH": "keywords",
        "AB": "abstract",
        "PMID": "PMID"
    }
    pubmedUrl = "https://www.ncbi.nlm.nih.gov/pubmed/"
    abstracts = pd.DataFrame()
    try:
        # BUG FIX: Medline.parse expects the plain-text Medline format, so
        # retmode must be "text" (was "json", which Bio.Medline cannot parse).
        handle = Entrez.efetch(db="pubmed",
                               id=idList,
                               rettype="medline",
                               retmode="text")
        records = Medline.parse(handle)
        results = []
        for record in records:
            aux = {}
            for field in fields:
                if field in record:
                    aux[fields[field]] = record[field]
            if "PMID" in aux:
                aux["url"] = pubmedUrl + aux["PMID"]
            else:
                aux["url"] = ""
            results.append(aux)

        abstracts = pd.DataFrame.from_dict(results)
    # BUG FIX: HTTPError is a subclass of URLError, so it must be caught
    # first — the original order made the HTTPError branch unreachable.
    except error.HTTPError as e:
        print("HTTPError: Request to Bio.Entrez failed. Error: {}".format(e))
    except error.URLError as e:
        print("URLError: Request to Bio.Entrez failed. Error: {}".format(e))
    except Exception as e:
        print("Request to Bio.Entrez failed. Error: {}".format(e))

    return abstracts
Exemplo n.º 34
0
def download_abstracts(dataset, path='.', email=None, out_file=None):
    """ Download the abstracts for a dataset/list of pmids
    """
    try:
        from Bio import Entrez, Medline
    except:
        raise Exception(
            'Module biopython is required for downloading abstracts from PubMed.'
        )

    if email is None:
        raise Exception('No email address provided.')
    Entrez.email = email

    # Accept a Dataset instance or a plain list of PMIDs.
    if isinstance(dataset, Dataset):
        pmids = dataset.image_table.ids.astype(str).tolist()
    elif isinstance(dataset, list):
        pmids = [str(pmid) for pmid in dataset]
    else:
        raise Exception('Dataset type not recognized: {0}'.format(
            type(dataset)))

    # PubMed only allows you to search ~1000 at a time. I chose 900 to be safe.
    records = []
    for start in range(0, len(pmids), 900):
        batch = pmids[start:start + 900]
        handle = Entrez.efetch(db='pubmed',
                               id=batch,
                               rettype='medline',
                               retmode='text')
        records += list(Medline.parse(handle))

    # Pull data for studies with abstracts
    rows = [[study['PMID'], study['AB']] for study in records
            if study.get('AB', None)]
    df = pd.DataFrame(columns=['pmid', 'abstract'], data=rows)
    if out_file is not None:
        df.to_csv(os.path.join(os.path.abspath(path), out_file), index=False)
    return df
Exemplo n.º 35
0
def add_paper(pmid, created_by="OTTO", method_obtained="Curator triage"):
    """ Adds paper to referencedbentity table

    Fetches the Medline record for ``pmid`` from NCBI and inserts the
    reference together with its authors, publication types and URLs.

    Parameters
    ----------
    pmid: int
    created_by: str, optional
    method_obtained: str, optional

    Returns
    -------
    object
        reference object

    Raises
    ------
    ValueError
        If NCBI returns a record that has no PMID field.
    """

    record = Medline.read(
        Entrez.efetch(db="pubmed", id=str(pmid), rettype='medline'))
    # Membership test directly on the record dict (no need to copy keys).
    if 'PMID' not in record:
        # Message typo fixed: original said "feom pubmed".
        raise ValueError(
            'Unable to fetch record from pubmed. Make sure it is a valid PMID.'
        )

    print(record)

    ncbi = DBSession.query(Source).filter_by(format_name='NCBI').one_or_none()
    source_id = ncbi.source_id
    ## insert into DBENTITY/REFERENCEDBENTITY/REFERENCEDOCUMENT
    [reference_id, authors, doi_url, pmc_url, sgdid,
     reference] = insert_referencedbentity(pmid, source_id, record, created_by,
                                           method_obtained)
    insert_authors(reference_id, authors, source_id, created_by)
    insert_pubtypes(pmid, reference_id, record.get('PT', []), source_id,
                    created_by)
    insert_urls(pmid, reference_id, doi_url, pmc_url, source_id, created_by)
    # removed to support changes in http://redmine.stanford.edu/issues/4758
    # insert_relations(pmid, reference_id, record, created_by)
    return reference
Exemplo n.º 36
0
    def read_data(self):
        """Parse the Medline file at ``self.path`` into a list of dicts.

        Records that fail ``self.check_preprocess_condition`` are skipped.
        Each returned dict carries PMID, title (TI), keyword list (OT),
        abstract (AB), and a 'tokens' field (empty unless the record
        already has one).

        Fixes: converted the Python-2 `print` statement to the print
        function, fixed the "abastract" typo, and dropped the unused
        nltk punkt tokenizer load (an expensive side effect) along with
        dead commented-out preprocessing code.
        """
        final_list = []

        with open(self.path) as handle:
            for record in Medline.parse(handle):
                if not self.check_preprocess_condition(record):
                    continue
                final_list.append({
                    'PMID': record.get('PMID', ''),
                    'TI': record.get('TI', ''),
                    'OT': record.get('OT', []),
                    'AB': record.get('AB', ''),
                    'tokens': record.get('tokens', ''),
                })
            print("clean abstract count>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>",
                  len(final_list))
            return final_list
Exemplo n.º 37
0
def abs_search(gene_dict, pattern_dict, abstract_file, out_queue):
    """Scan Medline abstracts for gene-regex matches and pattern hits.

    Parameters
    ----------
    gene_dict : dict
        Maps a key to a list of compiled gene regexes.
    pattern_dict : dict
        Maps a pattern name to a compiled regex.
    abstract_file : str
        Path to a Medline-format file of abstracts.
    out_queue : queue
        Receives the result dict:
        key -> list of [PMID, matched text, ~30 chars of context either
        side, one joined match string per pattern, journal].
    """
    try:
        result_dict = {}
        # `with` guarantees the file is closed even on error (the original
        # leaked the handle when an exception was raised mid-parse).
        with open(abstract_file, 'r') as abs_handle:
            for abstract in Medline.parse(abs_handle):
                if 'AB' not in abstract:
                    continue
                abstract_text = re.sub(r'\r\n|\n|\s+|;', ' ', abstract['AB'])
                abstract_pmid = abstract.get('PMID', 'Unknown')
                abstract_journ = abstract.get('SO', 'Unknown')
                for key in gene_dict.keys():
                    for gene in gene_dict[key]:
                        # BUG FIX: flags like re.MULTILINE must be given at
                        # compile time. The second positional argument of
                        # Pattern.search() is a start *position*, so the old
                        # call search(text, re.MULTILINE) silently skipped
                        # the first 8 characters of every abstract.
                        match = gene.search(abstract_text)
                        if not match:
                            continue
                        if key not in result_dict:
                            result_dict[key] = []
                        result_dict[key].append([abstract_pmid, match.group(0), \
                                                 abstract_text[match.start(0)-(match.start(0) if match.start(0) < 30 else 30):\
                                                               match.end(0)+(match.end(0) if match.end(0) < 30 else 30)]])
                        result = dict.fromkeys(pattern_dict.keys())
                        for pattern in sorted(pattern_dict.keys()):
                            result[pattern] = []
                            # Same flag-as-pos bug fixed for finditer().
                            for pat_match in pattern_dict[pattern].finditer(
                                    abstract_text):
                                pat_text = str(pat_match.group(0))
                                if pat_text not in result[pattern]:
                                    result[pattern].append(pat_text)
                            result_dict[key][-1].append(', '.join(
                                result[pattern]))
                        result_dict[key][-1].append(abstract_journ)
        out_queue.put(result_dict)
    except Exception as exc:
        # Include the actual error instead of silently discarding it.
        print("One of the processes got an exception and was killed: "
              "{}".format(exc))
Exemplo n.º 38
0
def getAbstracts(ID):
    """Fetch Medline records for the given PubMed id(s).

    Returns parallel lists of keywords, abstracts, authors, dates and
    titles plus two dicts keyed by PMID: one with all fields, one with
    only the text-mining fields (title, abstract, keywords).
    """
    per_pmid = {}
    per_pmid_textmining = {}
    abstracts = []
    keys = []
    auteur = []
    datum = []
    titel = []

    handle = Entrez.efetch(db="pubmed",
                           id=ID,
                           rettype='Medline',
                           retmode='text')
    for record in Medline.parse(handle):
        PMID = record.get('PMID')
        auteurs = record.get('AU')
        abstract = record.get('AB')
        if abstract is None:
            abstract = "-"
        date = record.get('DP')
        title = record.get('TI')
        keywords = record.get('OT')
        if keywords is None:
            keywords = "-"

        auteur.append(auteurs)
        abstracts.append(abstract)
        datum.append(date)
        titel.append(title)
        keys.append(keywords)

        per_pmid[PMID] = [title, abstract, keywords, auteurs, date]
        per_pmid_textmining[PMID] = [title, abstract, keywords]

    return keys, abstracts, auteur, datum, titel, per_pmid, per_pmid_textmining
Exemplo n.º 39
0
    def pmid2abstract_info(self, pmid_list):
        """Fetch Medline records for the given PMIDs.

        Returns a dict mapping each PMID to its title, authors, source,
        abstract, journal and year ('?' when a field is absent), or None
        when the efetch request itself fails.  PMIDs that NCBI reports as
        duplicates are removed and the whole batch is retried.
        """
        # make sure that pmid are strings
        pmid_list = [str(i) for i in pmid_list]

        try:
            handle = Entrez.efetch(db="pubmed", id=','.join(pmid_list), rettype="medline", retmode="text")
            records = Medline.parse(handle)
        except Exception:
            # Network / NCBI failure: report and give up on this batch.
            print("FAIL:", pmid_list)
            return None

        pmid2data = {}
        for record in records:
            try:
                pmid = record["PMID"]
            except KeyError:
                print(record)
                # NCBI reports duplicates like:
                # {'id:': ['696885 Error occurred: PMID 28696885 is a duplicate of PMID 17633143']}
                if 'id:' in record and 'duplicate' in record['id:']:
                    duplicate = record['id:'].split(' ')[0]
                    correct = record['id:'].split(' ')[-1]
                    print("removing duplicated PMID... %s --> %s" % (duplicate, correct))
                    # remove duplicate from list and retry the whole batch
                    pmid_list.remove(duplicate)
                    return self.pmid2abstract_info(pmid_list)
                # BUG FIX: with no PMID and no duplicate notice, the old
                # code fell through with `pmid` unbound (or stale from the
                # previous iteration). Skip such records instead.
                continue

            pmid2data[pmid] = {}
            pmid2data[pmid]["title"] = record.get("TI", "?")
            pmid2data[pmid]["authors"] = record.get("AU", "?")
            pmid2data[pmid]["source"] = record.get("SO", "?")
            pmid2data[pmid]["abstract"] = record.get("AB", "?")
            pmid2data[pmid]["journal"] = record.get("TA", "?")
            pmid2data[pmid]["year"] = record.get("DP", "?")
            pmid2data[pmid]["pmid"] = pmid

        return pmid2data
Exemplo n.º 40
0
def get_paper(pmids):
    """Return one summary line per paper, joined by newlines.

    Each line is "title, authors, source"; author lists longer than two
    are abbreviated to "first, second et al.".

    :param pmids: PubMed ids of papers
    :type pmids: list
    :rtype: str
    """
    handle = Entrez.efetch(db="pubmed",
                           id=[str(pmid) for pmid in pmids],
                           rettype="medline",
                           retmode="text")

    summaries = []
    for rec in Medline.parse(handle):
        author_info = rec.get("AU", "?")
        if len(author_info) > 2:
            author_info = '%s, %s et al.' % (author_info[0], author_info[1])
        summaries.append(
            '%s, %s, %s' %
            (rec.get("TI", "?"), author_info, rec.get("SO", "?")))
    return '\n'.join(summaries)
Exemplo n.º 41
0
 def test_pubmed_16381885(self):
     """Bio.TogoWS.entry("pubmed", "16381885")."""
     # Gives Medline plain text
     handle = TogoWS.entry("pubmed", "16381885")
     data = Medline.read(handle)
     handle.close()
     # The title (TI) and author list (AU) must match the published
     # metadata for PMID 16381885 (the KEGG 2006 NAR paper).
     self.assertEqual(
         data["TI"],
         "From genomics to chemical genomics: new developments in KEGG.")
     self.assertEqual(
         data["AU"],
         [
             "Kanehisa M",
             "Goto S",
             "Hattori M",
             "Aoki-Kinoshita KF",
             "Itoh M",
             "Kawashima S",
             "Katayama T",
             "Araki M",
             "Hirakawa M",
         ],
     )
Exemplo n.º 42
0
def getCancerData(searchTerm, filename, email):
    """Search PubMed for ``searchTerm`` and dump every hit to ``filename``.

    Writes PMID, title, authors, source and abstract of each matching
    record (no separators/newlines between fields, as in the original).

    Parameters
    ----------
    searchTerm : str
        PubMed query string.
    filename : str
        Path of the output file (overwritten).
    email : str
        E-mail address reported to NCBI (required by Entrez).
    """
    Entrez.email = email  # Always tell NCBI who you are
    handle = Entrez.egquery(term=searchTerm)
    record = Entrez.read(handle)
    for row in record["eGQueryResult"]:
        if row["DbName"] == "pubmed":
            print(row["Count"])  #prints number of articles

    # NOTE(review): the real count is immediately overridden by a hard
    # 300000 cap — kept from the original; confirm this is intentional.
    retmax = row["Count"]
    retmax = 300000

    handle = Entrez.esearch(db="pubmed", term=searchTerm, retmax=retmax)
    record = Entrez.read(handle)
    idlist = record["IdList"]

    handle = Entrez.efetch(db="pubmed",
                           id=idlist,
                           rettype="medline",
                           retmode="text")
    records = list(Medline.parse(handle))  #all pmids are in this list

    # BUG FIX: `count` was used without ever being initialized, raising
    # NameError on the first record; `with` now guarantees file closure.
    count = 0
    with open(filename, "w") as f:
        for record in records:
            s = ", "
            authors = s.join(record.get("AU", "?"))
            count = count + 1
            f.write("PMID: " + record.get("PMID", "?"))
            f.write("Title: " + record.get("TI", "?"))
            f.write("Authors: " + authors)  #writes the title, author,
            f.write("Source: " +
                    record.get("SO", "?"))  #source and abstract to a file
            f.write("Abstract: " + record.get("AB", "?"))

    handle.close()
Exemplo n.º 43
0
def parser(inputFile):
    """Parse a Medline file and insert each article into the pubmedDB
    MongoDB collection (module-level ``mycol``)."""
    print("Creating pubmedDB database ...")

    # inputFile is a relative path, so anchor the cwd at this script's dir.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    with open(inputFile) as handle:
        inserted = 0
        # Each Medline record corresponds to one article.
        for record in Medline.parse(handle):
            mycol.insert_one({
                "title": record.get('TI'),
                "abstract": record.get('AB'),
                "keywords": [record.get('OT')],
                "meshterms": [record.get('MH')]
            })
            inserted += 1
        print("Inserted {} records into pubmedDB".format(inserted))
Exemplo n.º 44
0
def fetch_publication_list(citations, rettype='medline'):
    """
    Fetch Publications.

    Retries the NCBI efetch call up to five times, sleeping 0.4 s after
    every attempt (NCBI allows at most ~3 requests per second).

    :param rettype: 'medline' (parsed via Medline) or anything else
        (parsed via Entrez).
    :param citations: iterable of PubMed ids.
    :return: parsed records, or [] when every attempt failed.
    """
    sys.stdout.write("=====================================")
    print(f"Fetching {len(citations)} publications. rettype: {rettype}.")
    citation_string = ','.join(citations)
    Entrez.email = '*****@*****.**'

    handle = None
    for _attempt in range(5):
        try:
            handle = Entrez.efetch(db='pubmed',
                                   id=citation_string,
                                   rettype=rettype,
                                   retmode='text')
        except HTTPError:
            handle = None
        else:
            break
        finally:
            # we are not allowed to hit NCBI more than 3 times per second
            time.sleep(0.4)

    if handle is None:
        print("Failed to retrieve data from PubMed")
        return []
    if rettype == 'medline':
        return Medline.parse(handle)
    return Entrez.parse(handle)
Exemplo n.º 45
0
def entrezQuery(idList, outFile):
    """Fetch Medline records for ``idList`` and write one tab-separated
    line per record to ``outFile``.

    Each line is: PMID, title, formatted authors, journal, year,
    abstract — every field passed through ascii() for uniformity with
    the database.

    Fix: the output file is now closed via a context manager; the
    original leaked the handle and could lose buffered data.
    """
    handle = Entrez.efetch(db="pubmed",
                           id=idList,
                           rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)

    with open(outFile, 'w') as writeFile:
        for record in records:
            #either return record entry or empty string
            pmid = record.get('PMID', '')
            title = record.get('TI', '')
            authors = record.get('FAU', '')
            authors = modifyAuthors(authors)  #format authors
            journal = record.get('JT', '')
            date = record.get('DP', '')
            date = date[0:4]  #year only
            abstract = record.get('AB', '')
            #need ascii encapsulation for uniformity with database
            writeFile.write(
                ascii(pmid) + '\t' + ascii(title) + '\t' + ascii(authors) + '\t' +
                ascii(journal) + '\t' + ascii(date) + '\t' + ascii(abstract) +
                '\n')
Exemplo n.º 46
0
    def _retrieve_record_batch(self, batch_start, batch_size):
        """Fetch one batch of PubMed article records.

        Relies on the WebEnv/QueryKey session data stored by
        '_search_for_records()' and raises ValueError when no search has
        been performed yet.

        Args:
            batch_start (int): Index of the first record of this batch.
            batch_size (int): Number of records to request.

        Returns:
            list: One dictionary per retrieved Medline record.
        """
        if self.search_record_web_env is None or \
                self.search_record_query_key is None:
            raise ValueError(  # Perform a search first!
                    'No WebEnv or QueryKey data in this PubMed class instance.'
            )

        fetch_handle = Entrez.efetch(
                db='pubmed',
                rettype='medline',
                retmode='text',
                retstart=batch_start,
                retmax=batch_size,
                webenv=self.search_record_web_env,
                query_key=self.search_record_query_key
        )

        # Materialize the lazy parser before closing the handle.
        records = list(Medline.parse(fetch_handle))
        fetch_handle.close()

        return records
Exemplo n.º 47
0
def get_pubmed_list(geneName):
    """Return {'pubmed_ids': [...]} for up to 1000 PubMed hits on geneName."""
    Entrez.email = "*****@*****.**"
    # NOTE(review): the bracketing here looks unbalanced ("[All Fields)")
    # — reproduced byte-for-byte; verify against PubMed query syntax.
    queryTerm = geneName + "[All Fields] AND (\"human\"[All Fields) AND (\"gene\"[All Fields) "
    search_handle = Entrez.esearch(db="pubmed", term=queryTerm, retmax=1000)
    search_record = Entrez.read(search_handle)
    idlist = search_record["IdList"]
    logging.info(geneName + ":\t" + search_record["Count"])

    fetch_handle = Entrez.efetch(db="pubmed",
                                 id=idlist,
                                 rettype="medline",
                                 retmode="text")
    return_value = {
        'pubmed_ids': [rec.get("PMID", "")
                       for rec in Medline.parse(fetch_handle)]
    }

    fetch_handle.close()

    return return_value
Exemplo n.º 48
0
def buildlist():
    """Map PMIDs to PMC ids from 'PMChand_medline.txt' and download
    PubTator BioC XML for up to the first 52 PMC ids.

    Returns
    -------
    tuple
        (pmcid_fail, urlxml, pmcid2pmid, pmidh2) — failed PMC ids, the
        fetched XML strings (52 slots, '' where fetch failed), the
        PMC-id -> PMID map (also stored in the global), and the PMIDs.
    """
    #pull PMCID's from PMID's
    from Bio import Medline
    pmcidh = []
    pmidh2 = []
    with open("PMChand_medline.txt") as handle:
        for rec2 in Medline.parse(handle):
            # Keep a record only when BOTH ids are present; the original
            # appended the PMC id before looking up PMID, so a record
            # with PMC but no PMID desynchronized the two lists.
            if 'PMC' in rec2 and 'PMID' in rec2:
                pmcidh.append(rec2['PMC'])
                pmidh2.append(rec2["PMID"])

    #Query PubTator
    import requests
    pmcid_fail = []

    # Cap at 52 queries (original behavior) but never index past the end
    # of the list, which used to raise IndexError for shorter files.
    n_queries = min(52, len(pmcidh))
    urlxml = [""] * 52
    for i in range(n_queries):
        url = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmcids=" + pmcidh[
            i]
        try:
            response = requests.get(url, timeout=3)
            if response.status_code == 200:
                urlxml[i] = response.text
        except requests.RequestException:
            # Network-level failure only; record the id and keep going.
            print(pmcidh[i])
            pmcid_fail.append(pmcidh[i])
            continue

    global pmcid2pmid
    pmcid2pmid = dict(zip(pmcidh, pmidh2))
    return pmcid_fail, urlxml, pmcid2pmid, pmidh2
Exemplo n.º 49
0
def pub_med_parser(drug, side_effect):
    """Return True when at least one fetched PubMed title contains both
    the drug's English name and the side-effect term, else False."""
    drug_eng = Drugs.drugs(drug)
    side_effect = Sideeffect.sideEffect(side_effect)
    Entrez.email = "*****@*****.**"
    terms = "((" + drug_eng[
        0] + "[Title]) AND " + side_effect + "[Title/Abstract])"
    search_handle = Entrez.esearch(db="pubmed",
                                   term=terms,
                                   rettype="medline",
                                   retmode="text")
    search_record = Entrez.read(search_handle)
    search_handle.close()

    fetch_handle = Entrez.efetch(db="pubmed",
                                 id=search_record["IdList"],
                                 rettype="medline",
                                 retmode="text")
    titres = [rec.get("TI", "?") for rec in Medline.parse(fetch_handle)]

    # Count titles that mention both terms; any hit means True.
    hits = sum(1 for titre in titres
               if drug_eng[0] in titre and side_effect in titre)
    return hits != 0
Exemplo n.º 50
0
def download_many(ids,
                  callback_fn,
                  broken_fn=None,
                  batchsize=500,
                  parser=None):
    """Download multiple PubMed records, no return value (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Download many records from PubMed.  ids is a list of either the
    Medline Unique ID or the PubMed ID's of the articles.  Each time a
    record is downloaded, callback_fn is called with the text of the
    record.  broken_fn is an optional function that is called with the
    id of records that were not able to be downloaded.  batchsize is the
    number of records to request each time.

    """
    # parser is an undocumented parameter that allows people to
    # specify an optional parser to handle each record.  This is
    # dangerous because the results may be malformed, and exceptions
    # in the parser may disrupt the whole download process.
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")
    current_batchsize = batchsize

    # Strategy: download as many as possible per request.  On failure,
    # halve the batch size; with a single broken id, report it via
    # broken_fn and move on.  After two consecutive successes, double
    # the batch size back up toward the requested maximum.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)

        id_str = ','.join(ids[:current_batchsize])

        try:
            # Query PubMed.  If one or more of the id's are broken,
            # this will raise an IOError.
            handle = Entrez.efetch(db="pubmed",
                                   id=id_str,
                                   retmode='text',
                                   rettype='medlars')

            # Verify PubMed returned as many records as requested;
            # otherwise raise.  This could take a lot of memory if the
            # batchsize is large.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                raise IOError
            handle = File.StringHandle(results)
        except IOError:  # Query did not work.
            if current_batchsize == 1:
                # There was only 1 id in the query.  Report it as
                # broken and move on.
                id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(id)
            else:
                # I don't know which one is broken.  Try again with
                # fewer id's.  BUG FIX: use floor division — plain '/'
                # produces a float under Python 3, which then breaks
                # the ids[:current_batchsize] slice above.
                current_batchsize = current_batchsize // 2
            nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Iterate through the results and pass the records to the
        # callback.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1

        ids = ids[current_batchsize:]

        # If I'm not downloading the maximum number of articles,
        # double the number for next time.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
Exemplo n.º 51
0
"""Example script showing how to interact with PubMed."""
# standard library
import string

# biopython
from Bio import PubMed
from Bio import Medline

# do the search and get the ids
search_term = 'orchid'
orchid_ids = PubMed.search_for(search_term)

print orchid_ids

# access Medline through a dictionary interface that returns PubMed Records
rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser=rec_parser)

for id in orchid_ids[0:5]:
    cur_record = medline_dict[id]
    print 'title:', string.rstrip(cur_record.title)
    print 'authors:', cur_record.authors
    print 'source:', string.strip(cur_record.source)
    print
Exemplo n.º 52
0
from Bio import Entrez, Medline

Entrez.email = "*****@*****.**"
count = 1
dst = open("/tmp/pmc.csv", "w")

#for i in range(0,4):
search_handle = Entrez.esearch(db="pmc", usehistory="y", term='Multimodal AND "Deep Learning" AND (cancer OR tumour OR neoplasm)', retmax=400, retstart=0)
page_record = Entrez.read(search_handle)

for pmcid in page_record['IdList']:
    print("Fetching pmcid = " + pmcid)
    fetch_handle = Entrez.efetch(db='pmc', rettype="medline", retmode="text", id=pmcid)
    records = Medline.parse(fetch_handle)
    for record in records:
        if 'AU' in record:
            author = ','.join(record['AU'])
            print(author)
        else:
            author = ''

        if 'AID' in record:    
            doi = ','.join(record['AID'])
            print(doi)
        else:
            doi = ''

        if 'PMC' in record:    
            pmc = record['PMC']
            print(pmc)
        else:
Exemplo n.º 53
0
# NOTE(review): this fragment references names defined earlier in its
# original script (findkey, nofref, y11, y12) — presumably the search
# keyword, result cap and the first date window; confirm at the source.
y41, y42 = 2000, 2004
####D-1
# Search PubMed for `findkey` restricted to the y11..y12 date window.
handle = Entrez.esearch(db="pubmed",
                        term=findkey,
                        retmax=nofref,
                        mindate=str(y11),
                        maxdate=str(y12))
record = Entrez.read(handle)
handle.close()
idlist = record["IdList"]

# Fetch the full Medline record for every hit.
handle = Entrez.efetch(db="pubmed",
                       id=idlist,
                       rettype="medline",
                       retmode="text")
records = Medline.parse(handle)

records = list(records)

# Build a len(records) x 5 matrix of selected Medline fields.
w, h = 5, len(records)
maa = [[None for x in range(w)] for y in range(h)]

for i in range(0, len(records)):
    maa[i][0] = records[i].get("PMID", "?")
    maa[i][1] = records[i].get("TI", "?")
    maa[i][2] = records[i].get("AB", "?")
    maa[i][3] = records[i].get("DP")
    maa[i][4] = records[i].get("PT", "?")

# Wrap in a DataFrame with Medline-style column names.
ma1 = pd.DataFrame(maa)
ma1.columns = ['PMID', 'TI', 'AB', 'DP', 'PT']
Exemplo n.º 54
0
def collect_NCBI():
    """Populate the global ``pmid_dict`` with metadata for every PMID in
    the global ``all_pmids``: authors, affiliations, grants, publication
    date and citations, plus the PMC id when one exists.

    The result is cached as JSON under ./{rel_name}/ and reused on the
    next run.  Returns the populated ``pmid_dict``.

    NOTE(review): control flow deliberately relies on the broad
    try/except below — any missing PMC-level field drops the record to
    the PMID-only fallback path.
    """
    global all_pmids
    global pmid_dict

    # Reuse the cached dictionary from a previous run, if present.
    if os.path.exists(f'./{rel_name}/{rel_name}_pmid_dict.json'):
        with open(f'./{rel_name}/{rel_name}_pmid_dict.json', 'r') as f:
            jd = f.read()
            temp_dict = json.loads(jd)
        pmid_dict.update(temp_dict)
        return pmid_dict

    for idx in tqdm(range(len(all_pmids))):
        pmid = all_pmids[idx]
        # get records for each pmid
        fetch_records_handle1 = efetch(db="pubmed",
                                       id=str(pmid),
                                       rettype="medline",
                                       retmode="text")
        # parse fetched records
        records1 = Medline.parse(fetch_records_handle1)

        # Need to iterate over records to extract information
        for record1 in records1:
            # try except check to be sure that NCBI is not returning empty result
            try:
                # let's get pmcid if exists (strip the 'PMC' prefix)
                id2 = record1['PMC'][3:]
                #print('PMC',id2)

                # get records for pmcid
                # NOTE(review): this second query uses db="pubmed" with a
                # PMC-derived id — confirm against the NCBI E-utilities docs.
                fetch_records_handle2 = efetch(db="pubmed",
                                               id=str(id2),
                                               rettype="medline",
                                               retmode="text")
                # parse records for pmcid
                records2 = Medline.parse(fetch_records_handle2)

                # Need to iterate over records to extract information
                '''
                Collect following information: authors, authors' affiliations, publication date, citations, grants
                Store all these information in an dictionary (pmid_dict)
                '''
                for record2 in records2:
                    authors = record2['FAU']
                    affiliations = record2['AD']
                    pub_date = record2['DCOM']
                    citations = get_links_id(pmid)
                    grants = record2['GR']
                    pmid_dict[pmid] = {
                        'pmcid_number': id2,
                        'pmcid': True,
                        'authors': authors,
                        'affiliations': affiliations,
                        'grants': grants,
                        'pub_date': pub_date,
                        'citations': citations
                    }
            except:
                # Fallback: no PMC id (or a missing PMC-level field) —
                # use the PMID-level record, defaulting fields to ''.
                authors = record1['FAU']
                try:
                    affiliations = record1['AD']
                except:
                    affiliations = ''
                try:
                    pub_date = record1['DCOM']
                except:
                    pub_date = ''
                try:
                    citations = get_links_id(pmid)
                except:
                    citations = ''
                try:
                    grants = record1['GR']
                except:
                    grants = ''
                pmid_dict[pmid] = {
                    'pmcid_number': '',
                    'pmcid': False,
                    'authors': authors,
                    'affiliations': affiliations,
                    'grants': grants,
                    'pub_date': pub_date,
                    'citations': citations
                }

    # Persist the collected dictionary for future runs.
    with open(f'./{rel_name}/{rel_name}_pmid_dict.json', 'w') as output:
        output.write(json.dumps(pmid_dict))

    return pmid_dict
Exemplo n.º 55
0
def query_search_pubmed(query: str, ret_max: str, email: str, min_date: str,
                        max_date: str):
    """Search PubMed via the user's query supplied through the command line

    Parameters
    ----------
    query: a query to be searched against PubMed database

    email: a user's email to access to the PubMed database

    ret_max: total number of records from query to be retrieved

    min_date: the minimum or start date to search

    max_date: the maximum or end date to search


    Return
    -------
    retrieve document summaries as records (a lazy Medline record iterator)

    """

    Entrez.email = email

    # Build the esearch arguments once instead of duplicating the whole call.
    # usehistory='y' must be set in BOTH cases: the efetch below reads
    # WebEnv/QueryKey from the search results, and those keys are only
    # present when the history server is used.  (The original date-limited
    # branch omitted it, so search_records['WebEnv'] raised KeyError.)
    search_kwargs = {
        "db": "pubmed",
        "term": query,
        "sort": "relevance",
        "retmode": "text",
        "retmax": ret_max,
        "usehistory": "y",
    }
    if min_date and max_date:
        # Restrict the search to the requested publication-date window.
        search_kwargs["mindate"] = min_date
        search_kwargs["maxdate"] = max_date

    # search the PubMed db for the entered query
    search = Entrez.esearch(**search_kwargs)

    search_records = Entrez.read(search)
    search.close()

    # get the list of ids for the searched records
    list_ids = search_records['IdList']

    print(f"\nTotal of {len(list_ids)} records retrieved!")

    ids = ",".join(list_ids)

    # return document summaries as a result handle
    fetch_records = Entrez.efetch(db="pubmed",
                                  id=ids,
                                  rettype="Medline",
                                  retmode="text",
                                  webenv=search_records['WebEnv'],
                                  query_key=search_records['QueryKey'])

    # Medline.parse is lazy: the handle must stay open while the caller
    # iterates the returned records, so it is deliberately not closed here.
    search_results = Medline.parse(fetch_records)

    return search_results
Exemplo n.º 56
0
 def test_parse(self):
     """Regression test for Medline.parse on Medline/pubmed_result2.txt.

     The fixture holds exactly four Biopython-related PubMed records; every
     MEDLINE field of each record is checked against its expected literal
     value, and the iterator must then be exhausted.
     """
     handle = open("Medline/pubmed_result2.txt")
     records = Medline.parse(handle)
     # Record 1 (PMID 16403221): SCOP/ASTRAL interface paper.
     record = next(records)
     self.assertEqual(record["PMID"], "16403221")
     self.assertEqual(record["OWN"], "NLM")
     self.assertEqual(record["STAT"], "MEDLINE")
     self.assertEqual(record["DA"], "20060220")
     self.assertEqual(record["DCOM"], "20060314")
     self.assertEqual(record["PUBM"], "Electronic")
     self.assertEqual(record["IS"], "1471-2105 (Electronic)")
     self.assertEqual(record["VI"], "7")
     self.assertEqual(record["DP"], "2006")
     self.assertEqual(
         record["TI"],
         "A high level interface to SCOP and ASTRAL implemented in python.")
     self.assertEqual(record["PG"], "10")
     self.assertEqual(
         record["AB"],
         "BACKGROUND: Benchmarking algorithms in structural bioinformatics often involves the construction of datasets of proteins with given sequence and structural properties. The SCOP database is a manually curated structural classification which groups together proteins on the basis of structural similarity. The ASTRAL compendium provides non redundant subsets of SCOP domains on the basis of sequence similarity such that no two domains in a given subset share more than a defined degree of sequence similarity. Taken together these two resources provide a 'ground truth' for assessing structural bioinformatics algorithms. We present a small and easy to use API written in python to enable construction of datasets from these resources. RESULTS: We have designed a set of python modules to provide an abstraction of the SCOP and ASTRAL databases. The modules are designed to work as part of the Biopython distribution. Python users can now manipulate and use the SCOP hierarchy from within python programs, and use ASTRAL to return sequences of domains in SCOP, as well as clustered representations of SCOP from ASTRAL. CONCLUSION: The modules make the analysis and generation of datasets for use in structural genomics easier and more principled."
     )
     self.assertEqual(
         record["AD"],
         "Bioinformatics, Institute of Cell and Molecular Science, School of Medicine and Dentistry, Queen Mary, University of London, London EC1 6BQ, UK. [email protected]"
     )
     self.assertEqual(
         record["FAU"],
         ["Casbon, James A", "Crooks, Gavin E", "Saqi, Mansoor A S"])
     self.assertEqual(record["AU"], ["Casbon JA", "Crooks GE", "Saqi MA"])
     self.assertEqual(record["LA"], ["eng"])
     self.assertEqual(record["PT"],
                      ["Evaluation Studies", "Journal Article"])
     self.assertEqual(record["DEP"], "20060110")
     self.assertEqual(record["PL"], "England")
     self.assertEqual(record["TA"], "BMC Bioinformatics")
     self.assertEqual(record["JT"], "BMC bioinformatics")
     self.assertEqual(record["JID"], "100965194")
     self.assertEqual(record["SB"], "IM")
     self.assertEqual(record["MH"], [
         "*Database Management Systems", "*Databases, Protein",
         "Information Storage and Retrieval/*methods",
         "Programming Languages", "Sequence Alignment/*methods",
         "Sequence Analysis, Protein/*methods",
         "Sequence Homology, Amino Acid", "*Software",
         "*User-Computer Interface"
     ])
     self.assertEqual(record["PMC"], "PMC1373603")
     self.assertEqual(record["EDAT"], "2006/01/13 09:00")
     self.assertEqual(record["MHDA"], "2006/03/15 09:00")
     self.assertEqual(record["PHST"], [
         "2005/06/17 [received]", "2006/01/10 [accepted]",
         "2006/01/10 [aheadofprint]"
     ])
     self.assertEqual(
         record["AID"],
         ["1471-2105-7-10 [pii]", "10.1186/1471-2105-7-10 [doi]"])
     self.assertEqual(record["PST"], "epublish")
     self.assertEqual(record["SO"], "BMC Bioinformatics. 2006 Jan 10;7:10.")
     # Record 2 (PMID 16377612): GenomeDiagram paper.
     record = next(records)
     self.assertEqual(record["PMID"], "16377612")
     self.assertEqual(record["OWN"], "NLM")
     self.assertEqual(record["STAT"], "MEDLINE")
     self.assertEqual(record["DA"], "20060223")
     self.assertEqual(record["DCOM"], "20060418")
     self.assertEqual(record["LR"], "20061115")
     self.assertEqual(record["PUBM"], "Print-Electronic")
     self.assertEqual(record["IS"], "1367-4803 (Print)")
     self.assertEqual(record["VI"], "22")
     self.assertEqual(record["IP"], "5")
     self.assertEqual(record["DP"], "2006 Mar 1")
     self.assertEqual(
         record["TI"],
         "GenomeDiagram: a python package for the visualization of large-scale genomic data."
     )
     self.assertEqual(record["PG"], "616-7")
     self.assertEqual(
         record["AB"],
         "SUMMARY: We present GenomeDiagram, a flexible, open-source Python module for the visualization of large-scale genomic, comparative genomic and other data with reference to a single chromosome or other biological sequence. GenomeDiagram may be used to generate publication-quality vector graphics, rastered images and in-line streamed graphics for webpages. The package integrates with datatypes from the BioPython project, and is available for Windows, Linux and Mac OS X systems. AVAILABILITY: GenomeDiagram is freely available as source code (under GNU Public License) at http://bioinf.scri.ac.uk/lp/programs.html, and requires Python 2.3 or higher, and recent versions of the ReportLab and BioPython packages. SUPPLEMENTARY INFORMATION: A user manual, example code and images are available at http://bioinf.scri.ac.uk/lp/programs.html."
     )
     self.assertEqual(
         record["AD"],
         "Plant Pathogen Programme, Scottish Crop Research Institute, Invergowrie, Dundee DD2 5DA, Scotland, UK. [email protected]"
     )
     self.assertEqual(record["FAU"], [
         "Pritchard, Leighton", "White, Jennifer A", "Birch, Paul R J",
         "Toth, Ian K"
     ])
     self.assertEqual(record["AU"],
                      ["Pritchard L", "White JA", "Birch PR", "Toth IK"])
     self.assertEqual(record["LA"], ["eng"])
     self.assertEqual(
         record["PT"],
         ["Journal Article", "Research Support, Non-U.S. Gov't"])
     self.assertEqual(record["DEP"], "20051223")
     self.assertEqual(record["PL"], "England")
     self.assertEqual(record["TA"], "Bioinformatics")
     self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)")
     self.assertEqual(record["JID"], "9808944")
     self.assertEqual(record["SB"], "IM")
     self.assertEqual(record["MH"], [
         "Chromosome Mapping/*methods", "*Computer Graphics",
         "*Database Management Systems", "*Databases, Genetic",
         "Information Storage and Retrieval/methods",
         "*Programming Languages", "*Software", "*User-Computer Interface"
     ])
     self.assertEqual(record["EDAT"], "2005/12/27 09:00")
     self.assertEqual(record["MHDA"], "2006/04/19 09:00")
     self.assertEqual(record["PHST"], ["2005/12/23 [aheadofprint]"])
     self.assertEqual(
         record["AID"],
         ["btk021 [pii]", "10.1093/bioinformatics/btk021 [doi]"])
     self.assertEqual(record["PST"], "ppublish")
     self.assertEqual(
         record["SO"],
         "Bioinformatics. 2006 Mar 1;22(5):616-7. Epub 2005 Dec 23.")
     # Record 3 (PMID 14871861): open source clustering software paper.
     record = next(records)
     self.assertEqual(record["PMID"], "14871861")
     self.assertEqual(record["OWN"], "NLM")
     self.assertEqual(record["STAT"], "MEDLINE")
     self.assertEqual(record["DA"], "20040611")
     self.assertEqual(record["DCOM"], "20050104")
     self.assertEqual(record["LR"], "20061115")
     self.assertEqual(record["PUBM"], "Print-Electronic")
     self.assertEqual(record["IS"], "1367-4803 (Print)")
     self.assertEqual(record["VI"], "20")
     self.assertEqual(record["IP"], "9")
     self.assertEqual(record["DP"], "2004 Jun 12")
     self.assertEqual(record["TI"], "Open source clustering software.")
     self.assertEqual(record["PG"], "1453-4")
     self.assertEqual(
         record["AB"],
         "SUMMARY: We have implemented k-means clustering, hierarchical clustering and self-organizing maps in a single multipurpose open-source library of C routines, callable from other C and C++ programs. Using this library, we have created an improved version of Michael Eisen's well-known Cluster program for Windows, Mac OS X and Linux/Unix. In addition, we generated a Python and a Perl interface to the C Clustering Library, thereby combining the flexibility of a scripting language with the speed of C. AVAILABILITY: The C Clustering Library and the corresponding Python C extension module Pycluster were released under the Python License, while the Perl module Algorithm::Cluster was released under the Artistic License. The GUI code Cluster 3.0 for Windows, Macintosh and Linux/Unix, as well as the corresponding command-line program, were released under the same license as the original Cluster code. The complete source code is available at http://bonsai.ims.u-tokyo.ac.jp/mdehoon/software/cluster. Alternatively, Algorithm::Cluster can be downloaded from CPAN, while Pycluster is also available as part of the Biopython distribution."
     )
     self.assertEqual(
         record["AD"],
         "Human Genome Center, Institute of Medical Science, University of Tokyo, 4-6-1 Shirokanedai, Minato-ku, Tokyo, 108-8639 Japan. [email protected]"
     )
     self.assertEqual(
         record["FAU"],
         ["de Hoon, M J L", "Imoto, S", "Nolan, J", "Miyano, S"])
     self.assertEqual(record["AU"],
                      ["de Hoon MJ", "Imoto S", "Nolan J", "Miyano S"])
     self.assertEqual(record["LA"], ["eng"])
     self.assertEqual(record["PT"], [
         "Comparative Study", "Evaluation Studies", "Journal Article",
         "Validation Studies"
     ])
     self.assertEqual(record["DEP"], "20040210")
     self.assertEqual(record["PL"], "England")
     self.assertEqual(record["TA"], "Bioinformatics")
     self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)")
     self.assertEqual(record["JID"], "9808944")
     self.assertEqual(record["SB"], "IM")
     self.assertEqual(record["MH"], [
         "*Algorithms", "*Cluster Analysis",
         "Gene Expression Profiling/*methods",
         "Pattern Recognition, Automated/methods", "*Programming Languages",
         "Sequence Alignment/*methods", "Sequence Analysis, DNA/*methods",
         "*Software"
     ])
     self.assertEqual(record["EDAT"], "2004/02/12 05:00")
     self.assertEqual(record["MHDA"], "2005/01/05 09:00")
     self.assertEqual(record["PHST"], ["2004/02/10 [aheadofprint]"])
     self.assertEqual(
         record["AID"],
         ["10.1093/bioinformatics/bth078 [doi]", "bth078 [pii]"])
     self.assertEqual(record["PST"], "ppublish")
     self.assertEqual(
         record["SO"],
         "Bioinformatics. 2004 Jun 12;20(9):1453-4. Epub 2004 Feb 10.")
     # Record 4 (PMID 14630660): PDB parser paper; note the extra RN field.
     record = next(records)
     self.assertEqual(record["PMID"], "14630660")
     self.assertEqual(record["OWN"], "NLM")
     self.assertEqual(record["STAT"], "MEDLINE")
     self.assertEqual(record["DA"], "20031121")
     self.assertEqual(record["DCOM"], "20040722")
     self.assertEqual(record["LR"], "20061115")
     self.assertEqual(record["PUBM"], "Print")
     self.assertEqual(record["IS"], "1367-4803 (Print)")
     self.assertEqual(record["VI"], "19")
     self.assertEqual(record["IP"], "17")
     self.assertEqual(record["DP"], "2003 Nov 22")
     self.assertEqual(
         record["TI"],
         "PDB file parser and structure class implemented in Python.")
     self.assertEqual(record["PG"], "2308-10")
     self.assertEqual(
         record["AB"],
         "The biopython project provides a set of bioinformatics tools implemented in Python. Recently, biopython was extended with a set of modules that deal with macromolecular structure. Biopython now contains a parser for PDB files that makes the atomic information available in an easy-to-use but powerful data structure. The parser and data structure deal with features that are often left out or handled inadequately by other packages, e.g. atom and residue disorder (if point mutants are present in the crystal), anisotropic B factors, multiple models and insertion codes. In addition, the parser performs some sanity checking to detect obvious errors. AVAILABILITY: The Biopython distribution (including source code and documentation) is freely available (under the Biopython license) from http://www.biopython.org"
     )
     self.assertEqual(
         record["AD"],
         "Department of Cellular and Molecular Interactions, Vlaams Interuniversitair Instituut voor Biotechnologie and Computational Modeling Lab, Department of Computer Science, Vrije Universiteit Brussel, Pleinlaan 2, 1050 Brussels, Belgium. [email protected]"
     )
     self.assertEqual(record["FAU"],
                      ["Hamelryck, Thomas", "Manderick, Bernard"])
     self.assertEqual(record["AU"], ["Hamelryck T", "Manderick B"])
     self.assertEqual(record["LA"], ["eng"])
     self.assertEqual(record["PT"], [
         "Comparative Study", "Evaluation Studies", "Journal Article",
         "Research Support, Non-U.S. Gov't", "Validation Studies"
     ])
     self.assertEqual(record["PL"], "England")
     self.assertEqual(record["TA"], "Bioinformatics")
     self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)")
     self.assertEqual(record["JID"], "9808944")
     self.assertEqual(record["RN"], ["0 (Macromolecular Substances)"])
     self.assertEqual(record["SB"], "IM")
     self.assertEqual(record["MH"], [
         "Computer Simulation", "Database Management Systems/*standards",
         "*Databases, Protein",
         "Information Storage and Retrieval/*methods/*standards",
         "Macromolecular Substances", "*Models, Molecular",
         "*Programming Languages", "Protein Conformation", "*Software"
     ])
     self.assertEqual(record["EDAT"], "2003/11/25 05:00")
     self.assertEqual(record["MHDA"], "2004/07/23 05:00")
     self.assertEqual(record["PST"], "ppublish")
     self.assertEqual(record["SO"],
                      "Bioinformatics. 2003 Nov 22;19(17):2308-10.")
     # Exactly four records in the fixture: the iterator must be exhausted.
     self.assertRaises(StopIteration, next, records)
     handle.close()
Exemplo n.º 57
0
from Bio import Medline

# example script to download medline version of the pubmed query
# esearch -db pubmed -query 'antimicrobial resistance' | efilter -mindate 1950 -maxdate 1990 -datetype PDAT | efetch -format medline > 50_90_medline.txt
medline = []
with open('../data/medline/10_18_medline.txt') as medline_file:
	# Medline.parse lazily yields one dict-like record per MEDLINE entry;
	# materialize them while the file handle is still open.
	for record in Medline.parse(medline_file):
		medline.append(record)

# Print every parsed record.  BUG FIX: the original loop did
# `for i in range(len(medline)): medline_entry = medline[0]`,
# which printed only the FIRST record len(medline) times.
for medline_entry in medline:
	print(medline_entry)
# 	outpath = 'mesh_10_18/'+ medline_entry.get('PMID')
# 	with open(outpath,'w') as file:
# 		file.write(str(medline_entry.get('MH')))
# 		file.close()

# example script to move empty file
# grep -lrIZ None | xargs -r0 mv -t nonefile/ --
Exemplo n.º 58
0
    def handle(self, *args, **options):
        """For each search term given on the command line, fetch up to 10
        matching PubMed citations (MEDLINE format) and persist each one as a
        Citation plus its related entities: publication types, authors,
        languages, affiliation, journal, and MeSH terms with subheadings.

        NOTE(review): Python 2 code (print statements, records.next()).
        The two-letter keys (TI, AB, MH, ...) are MEDLINE field codes.
        """
        for term in args:
            print "buscando [%s]" % term

            # Search PubMed for the term; keep only the first 10 hits.
            handle = Entrez.esearch(db="pubmed", retmax=10, term=term)
            record = Entrez.read(handle)

            ids_list = record['IdList']

            for id in ids_list:
                # Fetch each hit individually as MEDLINE-formatted text;
                # Medline.parse accepts any iterable of lines.
                a = Entrez.efetch(db="pubmed",
                                  id=id,
                                  rettype='medline',
                                  retmode='text')
                ff = a.readlines()
                records = Medline.parse(ff)
                r = records.next()
                try:
                    cit = Citation()
                    cit.pmid = int(r['PMID'])

                    # Optional scalar fields: absent MEDLINE keys map to None.
                    cit.title = r['TI'] if 'TI' in r.keys() else None
                    cit.abstract = r['AB'] if 'AB' in r.keys() else None
                    cit.pagination = r['PG'] if 'PG' in r.keys() else None
                    cit.copyright_information = " ; ".join(
                        r['CI']) if 'CI' in r.keys() else None

                    # dates
                    if 'CRDT' in r.keys():
                        # CRDT is a list; its first element holds the timestamp.
                        conv = time.strptime(r['CRDT'][0], "%Y/%m/%d %H:%M")
                        cit.date_created = datetime.datetime(*conv[:6])
                    if 'DCOM' in r.keys():
                        # 'DCOM': '19990406'
                        conv = time.strptime(r['DCOM'], "%Y%m%d")
                        cit.date_completed = datetime.datetime(*conv[:6])
                    if 'LR' in r.keys():
                        conv = time.strptime(r['LR'], "%Y%m%d")
                        cit.date_revised = datetime.datetime(*conv[:6])
                    if 'DEP' in r.keys():
                        conv = time.strptime(r['DEP'], "%Y%m%d")
                        cit.date_electronic_publication = datetime.datetime(
                            *conv[:6])

                    # Save before attaching many-to-many relationships below.
                    cit.save()

                    # relationships

                    # type
                    if 'PT' in r.keys():
                        for pub_type in r['PT']:
                            (pt, created) = PubType.objects.get_or_create(
                                pub_type=pub_type)
                            cit.pub_types.add(pt)

                    # authors
                    # AU (abbreviated) and FAU (full name) are parallel lists,
                    # so the same index i is used for both.
                    if 'AU' in r.keys():
                        for i, author in enumerate(r['AU']):
                            if author != 'et al.':
                                (a, created) = Author.objects.get_or_create(
                                    name=author, full_name=r['FAU'][i])
                                cit.authors.add(a)

                    # language
                    if 'LA' in r.keys():
                        for lang in r['LA']:
                            (l, created) = Language.objects.get_or_create(
                                language=lang)
                            cit.languages.add(l)

                    # affiliation
                    if 'AD' in r.keys():
                        (organization,
                         created) = Organization.objects.get_or_create(
                             name=r['AD'])
                        cit.affiliation = organization

                    # journal
                    if 'JID' in r.keys():
                        issn = r['IS'] if 'IS' in r.keys() else None
                        volume = r['VI'] if 'VI' in r.keys() else None
                        issue = r['IP'] if 'IP' in r.keys() else None

                        # Fallback when the record has no place of publication
                        # (PL); presumably a project-specific default -- TODO
                        # confirm with the data owners.
                        if not 'PL' in r:
                            r['PL'] = 'Mexico'

                        (journal, created) = Journal.objects.get_or_create(
                            jid=r['JID'],
                            issn=issn,
                            volume=volume,
                            issue=issue,
                            title=r['JT'],
                            iso_abbreviation=r['TA'],
                            country=r['PL'])
                        cit.journal = journal

                    cit.save()

                    # meshterms
                    # Each MH entry looks like 'Descriptor/Sub1/Sub2...';
                    # a leading '*' marks a major topic (stripped on store).
                    if 'MH' in r:
                        for term in r['MH']:
                            term_subs = term.split('/')
                            if term_subs[0].startswith('*'):
                                (mh, created) = Meshterm.objects.get_or_create(
                                    term=term_subs[0][1:])
                                major = True
                            else:
                                (mh, created) = Meshterm.objects.get_or_create(
                                    term=term_subs[0])
                                major = False

                            mc = Meshcitation.objects.create(meshterm=mh,
                                                             citation=cit,
                                                             major=major)

                            # Remaining path components are subheadings of
                            # the descriptor, each with its own major flag.
                            if len(term_subs) > 1:
                                for subterm in term_subs[1:]:
                                    if subterm.startswith('*'):
                                        major = True
                                        subterm = subterm[1:]
                                    else:
                                        major = False

                                    (sh, created
                                     ) = Subheading.objects.get_or_create(
                                         term=subterm)
                                    sht = Subheadingterm.objects.create(
                                        subheading=sh,
                                        meshcitation=mc,
                                        major=major)

                    self.stdout.write('%s' % cit)

                except:
                    # Dump the offending record for debugging, then re-raise
                    # so the failure is not silently swallowed.
                    print "error trying to load %s" % r['PMID']
                    import pprint
                    import sys
                    print sys.exc_info()[0]
                    pprint.pprint(r)
                    raise
Exemplo n.º 59
0
    abs_count.append(result["Count"])

    ids = result["IdList"]

    # only download abstracts if there are at least 5 available
    if len(ids) > 5:

        batches = [ids[x:x + 10] for x in range(0, len(ids), batch_size)]

        record_list = []
        for batch in tqdm(batches):
            h = Entrez.efetch(db="pubmed",
                              id=batch,
                              rettype="medline",
                              retmode="text")
            records = Medline.parse(h)
            record_list.extend(list(records))

        if len(record_list) != 0:  # if the recorsd list is not empty

            record_list_df = pd.DataFrame(record_list)  # make a data frame

            record_list_df = record_list_df[record_list_df['AB'].notna(
            )]  # keep rows without na in abstract column
            record_list_df['AB'] = record_list_df['AB'].str.lower(
            )  # make all text lower case
            record_list_df['AB'] = record_list_df['AB'].apply(
                remove_punctuation)  # remove punctuation

            text_list = [row.split(' ') for row in record_list_df['AB']
                         ]  # convert into list of single words
Exemplo n.º 60
0
def download_abstracts(dataset, email):
    """Download the abstracts for a list of PubMed IDs.

    Uses the BioPython package.

    .. versionadded:: 0.0.2

    Parameters
    ----------
    dataset : :obj:`nimare.dataset.Dataset`
        A Dataset object where IDs are in the form PMID-EXPID
    email : :obj:`str`
        Email address to use to call the PubMed API

    Returns
    -------
    dataset : :obj:`nimare.dataset.Dataset`

    Warning
    -------
    This function assumes that the dataset uses identifiers in the format
    [PMID-EXPID]. Thus, the ``study_id`` column of the
    :obj:`nimare.dataset.Dataset.texts` DataFrame should correspond to PMID.
    """
    try:
        from Bio import Entrez, Medline
    except ImportError as exc:
        # Chain the cause so the original import failure stays visible.
        raise Exception(
            "Module biopython is required for downloading abstracts from PubMed."
        ) from exc

    Entrez.email = email

    if isinstance(dataset, Dataset):
        pmids = dataset.texts["study_id"].astype(str).tolist()
        # De-duplicate; sorted() accepts a set directly, so the extra
        # list() wrapper was redundant.
        pmids = sorted(set(pmids))
    elif isinstance(dataset, list):
        pmids = [str(pmid) for pmid in dataset]
    else:
        raise Exception(f"Dataset type not recognized: {type(dataset)}")

    records = []
    # PubMed only allows you to search ~1000 at a time. I chose 900 to be safe.
    chunks = [pmids[x:x + 900] for x in range(0, len(pmids), 900)]
    for i, chunk in enumerate(chunks):
        LGR.info(f"Downloading chunk {i + 1} of {len(chunks)}")
        h = Entrez.efetch(db="pubmed",
                          id=chunk,
                          rettype="medline",
                          retmode="text")
        records += list(Medline.parse(h))

    # Pull data only for studies that actually have an abstract ("AB" field).
    data = [[study["PMID"], study["AB"]] for study in records
            if study.get("AB")]
    df = pd.DataFrame(columns=["study_id", "abstract"], data=data)
    if not isinstance(dataset, Dataset):
        # A plain list of PMIDs was passed in: return the abstracts directly.
        return df

    # Left-join on the shared column so studies without a retrieved
    # abstract are kept (their abstract becomes NaN).
    dataset.texts = pd.merge(dataset.texts, df, on="study_id", how="left")
    return dataset