Python Medline示例，Bio.Medline Python示例

示例#1

0

显示文件

文件： whoosh_test_001.py 项目： massyah/GRN-analysis

def index():
	ix = open_dir("indexdir")
	writer = ix.writer()
	for pfile in pubmed_files:
		print "parsing",pfile
		txt=open(project+"/"+pfile,"r")
		records=Medline.parse(txt)
		for r in records:
			if "AB" not in r:
				continue
			authors=""
			if "FAU" in r:
				authors+=",".join(r["FAU"])
			elif "AU" in r:
				authors+=",".join(r["AU"])
			else:
				firstAuthor="Unknown"
			date=datetime.datetime.strptime(r["DA"],"%Y%m%d")
			title=r["TI"]
			pmid=r["PMID"].decode("utf-8")

			writer.add_document(
				title=title.decode("utf-8"),
				path=pfile.decode("utf-8"),
				abstract=r['AB'].decode("utf-8"),
				authors=authors.decode("utf-8"),
				pmid=pmid,
				dateAdded=date
				)
	writer.commit()
	print "Index contain",ix.doc_count()

示例#2

0

显示文件

文件： sp_tools.py 项目： FriedbergLab/Uniprot-Bias

def top_papers(papers,outpath=None,delim="\t", top=20):
    """Fetch PubMed details for the most-annotated papers.

    ``papers`` maps a PMID string to a sized collection of annotations. The
    ``top`` PMIDs with the most annotations are looked up in PubMed.

    Returns a list of ``(annotation count, PMID, title, year, journal)``
    tuples in ascending order of annotation count. Fields PubMed does not
    supply are reported as ``"?"``. When ``outpath`` is given, a header line
    and the same rows are written to that file with ``delim`` between columns.
    """
    # Can be used with SP & GOA data
    ranked = sorted((len(papers[p]), p) for p in papers)
    selected = ranked[-top:]

    # Index the fetched records by PMID instead of relying on their position:
    # if PubMed leaves one id out, positional pairing would attach every
    # following title/year/journal to the wrong paper.
    medrecs = {}
    if selected:
        Entrez.email = "*****@*****.**"
        h = Entrez.efetch(db="pubmed", id=",".join(p[1] for p in selected),
                          rettype="medline", retmode="text")
        try:
            for medrec in Medline.parse(h):
                if "PMID" in medrec:
                    medrecs[medrec["PMID"]] = medrec
        finally:
            h.close()

    papers_annots2 = []
    for count, pmid in selected:
        medrec = medrecs.get(pmid, {})
        # DP looks like "2002 Sep"; keep the first word, tolerate an empty field.
        date_words = medrec.get("DP", "?").split()
        year = date_words[0] if date_words else "?"
        papers_annots2.append((count, pmid, medrec.get("TI", "?"), year,
                               medrec.get("JT", "?")))

    if outpath:
        header = ["num proteins", "pubmed ID", "Title", "Year", "Journal"]
        with open(outpath, "w") as fout:
            fout.write(delim.join(header) + "\n")
            for row in papers_annots2:
                fields = ["%d" % row[0]] + ["%s" % field for field in row[1:]]
                fout.write(delim.join(fields) + "\n")
    #papers_annots2 = [(# all annotations, PMID, Title, Year, Journal)]
    return papers_annots2

示例#3

0

显示文件

文件： entrez_pubmed_interface.py 项目： massyah/LINK

def store_abstracts_for_query(query,query_tag,maxN=None,preview_only=False):
	# if query_tag=="":
	# 	simpleQuery=" ".join(map(lambda x:x.name,queryTerms))
	# else:
	# 	simpleQuery=query_tag
	# query=pg.build_query(queryTerms)
	print "will search",query
	Entrez.email = "*****@*****.**"
	search_results = Entrez.read(Entrez.esearch(db="pubmed",
												term=query,
												reldate=10*365, datetype="pdat",
												usehistory="y"))
	count = int(search_results["Count"])
	print "Found %i results" % count
	if maxN!=None and maxN<count:
		count=maxN
		print "Only keeping first",count,"abstracts"
	if preview_only:
		return
	sys.stdout.flush()
	batch_size = 50
	for start in range(0,count,batch_size):
			end = min(count, start+batch_size)
			print "Going to download record %i to %i" % (start+1, end)
			sys.stdout.flush()
			fetch_handle = Entrez.efetch(db="pubmed",
										 rettype="medline", retmode="text",
										 retstart=start, retmax=batch_size,
										 webenv=search_results["WebEnv"],
										 query_key=search_results["QueryKey"])
			records=Medline.parse(fetch_handle)
			for r in records:
				pubmed_to_pg.store_medline_entry(r,query_tag)

示例#4

0

显示文件

def pubmed():
    # Get the count of papers about orchid only in database pubmed
    Entrez.email = "*****@*****.**"     # Always tell NCBI who you are
    handle = Entrez.egquery(term="orchid")
    record = Entrez.read(handle)
    for row in record["eGQueryResult"]:
        if row["DbName"] == "pubmed":
            print "The count of papers about orchid in database pubmed:", row["Count"]

    # Get the list of ids of above
    handle = Entrez.esearch(db="pubmed", term="orchid", retmax=100)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    print "The id list of papers about orchid in database pubmed:", idlist
    print

    # Search papers author by "Liu ZJ" from pubmed
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)
    search_author = "Liu ZJ"
    for record in records:
        if "AU" not in record:
            continue
        if search_author in record["AU"]:
            print "Author %s found." % search_author
            print "title:", record.get("TI", "?")
            print "authors:", record.get("AU", "?")
            print "source:", record.get("SO", "?")
            print

示例#5

0

显示文件

文件： getMeSH.py 项目： BjornWouters/PubCheck

def getMeSH(url):
        """Return HTML headings for PubMed hits whose MeSH terms mention "tox".

        ``url`` is a URL-encoded query string. It is decoded, searched in
        PubMed (at most 10000 hits), and one ``<h4>`` link is produced for
        every fetched record whose ``MH`` field, rendered as text, contains
        the substring "tox".

        Returns a short ``<h3>`` message when the query is empty or nothing
        was found; otherwise the concatenated headings (possibly empty).
        """
        query = urllib.unquote_plus(url)

        if not query:
                return "<h3> No query </h3>"

        MAX_COUNT = 10000

        Entrez.email = '*****@*****.**'
        # '\\-' is a backslash plus a hyphen, the same value the old '\-'
        # produced, but without depending on an invalid escape sequence.
        pubmedquery = query.replace('-','\\-')
        h = Entrez.esearch(db='pubmed', term=pubmedquery, retmax=MAX_COUNT)
        result = Entrez.read(h)
        ids = result['IdList']
        if not ids:
                return "<h3> geen gevonden resultaten </h3>"
        h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')

        hits = []
        for record in Medline.parse(h):
                # record.get("MH") is None when the record has no MeSH terms;
                # str(None) does not contain "tox", so such records are skipped.
                if "tox" in str(record.get("MH")):
                        pmid = str(record.get("PMID"))
                        hits.append("<h4><a href='http://www.ncbi.nlm.nih.gov/pubmed/"+pmid+"'>")
                        hits.append("PMID: "+pmid+"</a> is analysed on toxicity. </h4> \n")

        return "".join(hits)

示例#6

0

显示文件

文件： table.py 项目： NathanvanDalen/PubCheck-1

def createTable(query):
    """Build the HTML table rows for the PubMed hits of ``query``.

    Searches PubMed for at most 100 hits and renders one ``<tr>`` per fetched
    record with title, date, authors (formatted by ``writers``), journal, the
    query itself, a PubMed link and the abstract.

    Returns a short ``<h3>`` message when the query is empty or nothing was
    found; otherwise the concatenated rows.
    """
    if not query:
        return "<h3> No query </h3>"

    MAX_COUNT = 100
    # "\\-" is a backslash plus a hyphen, the same value the old "\-"
    # produced, but without depending on an invalid escape sequence.
    pubmedquery = query.replace("-", "\\-")
    Entrez.email = "*****@*****.**"
    h = Entrez.esearch(db="pubmed", term=pubmedquery, retmax=MAX_COUNT)
    result = Entrez.read(h)
    ids = result["IdList"]
    if not ids:
        # Nothing to fetch; do not call efetch with an empty id list.
        return "<h3> geen gevonden resultaten </h3>"
    h = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
    records = Medline.parse(h)

    # NOTE(review): query and record fields are inserted into the HTML
    # unescaped; confirm whether callers sanitise them.
    rows = []
    for record in records:
        try:
            row = "".join([
                "<tr><td width='22%'>", str(record.get("TI")), "</td>",
                "<td width='5%'>", str(record.get("DP")), "</td>",
                "<td width='5%'>", str(writers(record.get("FAU"))), "</td>",
                "<td width='5%'>", str(record.get("JT")), "</td>",
                "<td width='5%'>", str(query), "</td>",
                "<td>",
                "<a href='http://www.ncbi.nlm.nih.gov/pubmed/",
                str(record.get("PMID")),
                "'><img src='PubMed.png' height='75px' width='75px' alt='PubMed' align='right'/></a>",
                str(record.get("AB")),
                "</td></tr>",
            ])
        except TypeError:
            # A record that cannot be rendered is left out of the table.
            continue
        rows.append(row)

    return "".join(rows)

示例#7

0

显示文件

文件： get_publications.py 项目： nbenzakour/BeatsonLab-MicrobialGenomics.github.io

def fetch(t, s):
    """Return a text listing of the PubMed publications matching ``t``.

    ``t`` is the search term and ``s`` the sort order passed to esearch (at
    most 10000 hits). The listing starts with the total number of hits,
    followed by one numbered entry per fetched record. Numbering counts down
    from the total. Missing fields are shown as ``?``; a record without
    authors gets an empty author line.
    """
    h = Entrez.esearch(db="pubmed", term=t, retmax=10000, sort=s)
    idList = Entrez.read(h)["IdList"]
    parts = [
        "Total publications for SA Beatson: **" + str(len(idList)) + "**\n\n",
        "Chronologically sorted:\n\n",
    ]

    if idList:
        handle = Entrez.efetch(db="pubmed", id=idList, rettype="medline", retmode="text")
        try:
            # The old code also parsed record["DA"] into a date it never used,
            # which raised KeyError for any record lacking that field.
            for offset, record in enumerate(Medline.parse(handle)):
                parts.append(
                    "| **%i.** %s\n| %s\n| %s\n| http://www.ncbi.nlm.nih.gov/pubmed/%s\n|\n" % (
                        len(idList) - offset,
                        record.get("TI", "?"),
                        ", ".join(record.get("AU", [])),
                        record.get("SO", "?"),
                        record.get("PMID", "?"),
                    )
                )
        finally:
            handle.close()
    return "".join(parts)

示例#8

0

显示文件

文件： sp_tools.py 项目： FriedbergLab/Uniprot-Bias

def top_papers_dict(papers, outpath=None,delim="\t", top=None):
    """Fetch PubMed details for the most-annotated papers as a dict.

    ``papers`` maps a PMID string to a sized collection of annotations. The
    ``top`` PMIDs with the most annotations are looked up in PubMed;
    ``top=None`` looks up all of them.

    Returns ``{PMID: [annotation count, title, year, journal]}``. Fields
    PubMed does not supply are reported as ``"?"``. ``outpath`` and ``delim``
    are accepted for signature compatibility but are not used.
    """
    # Can be used with SP & GOA data
    ranked = sorted((len(papers[p]), p) for p in papers)
    # Note: -0 == 0, so top=0 selects everything, exactly like top=None.
    selected = ranked if top is None else ranked[-top:]
    if not selected:
        # Nothing to look up; do not call efetch with an empty id list.
        return {}

    Entrez.email = MY_EMAIL
    h = Entrez.efetch(db="pubmed", id=",".join(p[1] for p in selected),
                      rettype="medline", retmode="text")
    # Index the fetched records by PMID instead of relying on their position:
    # if PubMed leaves one id out, positional pairing would attach every
    # following title/year/journal to the wrong paper.
    medrecs = {}
    try:
        for medrec in Medline.parse(h):
            if "PMID" in medrec:
                medrecs[medrec["PMID"]] = medrec
    finally:
        h.close()

    papers_annots2_dict = {}
    for count, pmid in selected:
        medrec = medrecs.get(pmid, {})
        # DP looks like "2002 Sep"; keep the first word, tolerate an empty field.
        date_words = medrec.get("DP", "?").split()
        year = date_words[0] if date_words else "?"
        #papers_annots2_dict[PMID] = [# of total annotations, Title, Year, Journal]
        papers_annots2_dict[pmid] = [count, medrec.get("TI", "?"), year,
                                     medrec.get("JT", "?")]
    return papers_annots2_dict

示例#9

0

显示文件

文件： test_Entrez_online.py 项目： janusz005/biopython

 def test_medline_from_url(self):
     """Test Entrez into Medline.read from URL

     Fetches one PubMed entry in MEDLINE text format and checks that the
     parsed record is a dict carrying the expected PMID and LID fields.
     """
     pmid = "19304878"
     handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
     parsed = Medline.read(handle)
     self.assertTrue(isinstance(parsed, dict))
     self.assertEqual(pmid, parsed["PMID"])
     self.assertEqual("10.1093/bioinformatics/btp163 [doi]", parsed["LID"])

示例#10

0

显示文件

文件： pubmedAPI.py 项目： LeaHaha/leahaha

def retrive_record(row):
    """Look up the PubMed papers of the author named in ``row[1]``.

    Searches PubMed for ``row[1]`` as an author and fetches the hits.

    Returns a list ``[row[0], row[1], title, authors, affiliation, date,
    PMID]`` built from the LAST fetched record, with ``"?"`` for missing
    fields. Returns an empty list when the search finds nothing (this used
    to fail with an unbound local variable).
    """
    name=row[1]+"[AUTH]"
    handle = Entrez.esearch(db="pubmed",term=name)
    idlist=Entrez.read(handle)["IdList"]

    temp=[]
    if not idlist:
        # Nothing to fetch; do not call efetch with an empty id list.
        return temp

    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                       retmode="text")

    # Each record overwrites the previous result, so only the last one is kept.
    for record in Medline.parse(handle):
        temp=[
            row[0],
            row[1],
            record.get("TI","?"),    # title
            record.get("AU","?"),    # authors
            record.get("AD","?"),    # AD field
            record.get("DP","?"),    # DP field
            record.get("PMID","?"),  # pubmed id for url
        ]

    return temp

示例#11

0

显示文件

文件： test_TogoWS.py 项目： biopython/biopython

 def test_pubmed_16381885_and_19850725(self):
     """Bio.TogoWS.entry("pubmed", "16381885,19850725")

     Fetches two PubMed entries in a single call and checks the title and
     author list of each parsed record.
     """
     kegg_authors = [
         "Kanehisa M",
         "Goto S",
         "Hattori M",
         "Aoki-Kinoshita KF",
         "Itoh M",
         "Kawashima S",
         "Katayama T",
         "Araki M",
         "Hirakawa M",
     ]
     ddbj_authors = [
         "Kaminuma E",
         "Mashima J",
         "Kodama Y",
         "Gojobori T",
         "Ogasawara O",
         "Okubo K",
         "Takagi T",
         "Nakamura Y",
     ]

     handle = TogoWS.entry("pubmed", "16381885,19850725")
     records = list(Medline.parse(handle))
     handle.close()

     self.assertEqual(len(records), 2)
     kegg, ddbj = records
     self.assertEqual(kegg["TI"], "From genomics to chemical genomics: new developments in KEGG.")
     self.assertEqual(kegg["AU"], kegg_authors)
     self.assertEqual(
         ddbj["TI"],
         "DDBJ launches a new archive database with analytical tools " + "for next-generation sequence data.",
     )
     self.assertEqual(ddbj["AU"], ddbj_authors)

示例#12

0

显示文件

文件： table.py 项目： BjornWouters/PubCheck

def createTable(query):
        """Build the HTML table rows for the PubMed hits of ``query``.

        Searches PubMed for at most 100 hits and renders one ``<tr>`` per
        fetched record with title, date, authors (formatted by ``writers``),
        journal, the query itself, a PubMed link and the abstract.

        Returns a short ``<h3>`` message when the query is empty or nothing
        was found; otherwise the concatenated rows.
        """
        if not query:
                return "<h3> No query </h3>"

        MAX_COUNT = 100

        Entrez.email = '*****@*****.**'
        # '\\-' is a backslash plus a hyphen, the same value the old '\-'
        # produced, but without depending on an invalid escape sequence.
        pubmedquery = query.replace('-','\\-')
        h = Entrez.esearch(db='pubmed', term=pubmedquery, retmax=MAX_COUNT)
        result = Entrez.read(h)
        ids = result['IdList']
        if not ids:
                return "<h3> geen gevonden resultaten </h3>"
        h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
        records = Medline.parse(h)

        # NOTE(review): query and record fields are inserted into the HTML
        # unescaped; confirm whether callers sanitise them.
        rows = []
        for record in records:
                try:
                        row = "".join([
                                "<tr><td width='22%'>", str(record.get("TI")), "</td>",
                                "<td width='5%'>", str(record.get("DP")), "</td>",
                                "<td width='5%'>", str(writers(record.get("FAU"))), "</td>",
                                "<td width='5%'>", str(record.get("JT")), "</td>",
                                "<td width='5%'>", str(query), "</td>",
                                "<td>",
                                "<a href='http://www.ncbi.nlm.nih.gov/pubmed/", str(record.get("PMID")),
                                "'><img src='PubMed.png' height='75px' width='75px' alt='PubMed' align='right'/></a>",
                                str(record.get("AB")), "</td></tr>",
                        ])
                except TypeError:
                        # A record that cannot be rendered is left out of the table.
                        continue
                rows.append(row)

        return "".join(rows)

示例#13

0

显示文件

文件： utilities.py 项目： apierleoni/MyBioDb

def fetch_pubmed_data(pmid):
    """Fetch the MEDLINE record for ``pmid`` from PubMed.

    Returns the first parsed record when it carries a ``PMID`` field.
    Otherwise returns None after setting ``response.flash`` (no usable
    record) or ``session.flash`` (IOError while contacting the service).

    Raises Exception when ``settings.author_email`` cannot be read.
    """
    from Bio import Medline,Entrez

    # The old code retried the identical attribute lookup inside a second,
    # nested try; one attempt is equivalent.
    try:
        ncbiemail= settings.author_email
    except Exception:
        raise Exception('Please set an email to use ncbi services')

    Entrez.email = ncbiemail
    Entrez.tool = 'mybiodb'

    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline",retmode="text",)
        # next(..., None) turns an empty result into the "ID error" branch
        # instead of letting StopIteration escape.
        entrez_response = next(Medline.parse(handle), None)
        if entrez_response is None or 'PMID' not in entrez_response:
            response.flash='pubmed ID error'
        else:
            return entrez_response
    except IOError:
        session.flash='Remote service not available, please try again.'

    return None

示例#14

0

显示文件

文件： paperbot.py 项目： BlogomaticProject/Blogomatic

	def fetch(self, batchSize=100):
		"""Download the next batch of search results and return them as Paper objects.

		Returns an empty list once the search is marked as done. Records
		without a PMID are left out. The search position advances by
		``batchSize`` and the search is marked as done when the position
		reaches the total hit count.
		"""
		if self._done:
			return []

		lastRecord = min(self._searchCount, self._searchPosition + batchSize)
		log.info("Downloading from %i to %i..." % (self._searchPosition+1, lastRecord))

		fetchHandle = Entrez.efetch(
			db="pubmed", rettype="medline", retmode="text",
			retstart=self._searchPosition, retmax=batchSize,
			webenv=self._searchSession, query_key=self._queryKey)

		papers = []
		for r in Medline.parse(fetchHandle):
			if r.get("PMID") is not None:
				papers.append(paper.Paper(r))

		fetchHandle.close()
		log.info("... downloading done")

		self._searchPosition += batchSize
		if self._searchPosition >= self._searchCount:
			self._done = True
			log.info("Search ended.")

		return papers

示例#15

0

显示文件

文件： Search.py 项目： vitay/Biberon

def search_pubmed(term):
    """Search PubMed for ``term`` and return the converted records.

    Returns an empty ``BibDatabase`` when PubMed reports zero hits, the
    result of ``transform_pubmed`` on the fetched records otherwise, and an
    empty list when any step raises (the error is printed).
    """
    print("Searching for", term)
    try:
        # Ask the global query how many PubMed hits there are
        summary = Entrez.read(Entrez.egquery(term=term))
        nb_entries = 0
        for row in summary["eGQueryResult"]:
            if row["DbName"] != "pubmed":
                continue
            nb_entries = row["Count"]
            print(row["Count"], 'results found.')
        if int(nb_entries) == 0:
            return BibDatabase()

        # Collect the ids, capped at MAX_RESULTS
        wanted = min(int(nb_entries), MAX_RESULTS)
        found = Entrez.read(Entrez.esearch(db="pubmed", term=term, retmax=wanted))

        # Download and parse the descriptions
        fetched = Entrez.efetch(db="pubmed", id=found["IdList"], rettype="medline", retmode="text")
        return transform_pubmed(list(Medline.parse(fetched)))

    except Exception as e:
        print('The search failed.')
        print(e)
        return []

示例#16

0

显示文件

文件： test_Medline.py 项目： HuttonICS/biopython

 def test_read(self):
     # Parse the single-record fixture and compare every field, in the
     # listed order, with its expected value.
     expected = (
         ("PMID", "12230038"),
         ("OWN", "NLM"),
         ("STAT", "MEDLINE"),
         ("DA", "20020916"),
         ("DCOM", "20030606"),
         ("LR", "20041117"),
         ("PUBM", "Print"),
         ("IS", "1467-5463 (Print)"),
         ("VI", "3"),
         ("IP", "3"),
         ("DP", "2002 Sep"),
         ("TI", "The Bio* toolkits--a brief overview."),
         ("PG", "296-302"),
         ("AB", "Bioinformatics research is often difficult to do with commercial software. The Open Source BioPerl, BioPython and Biojava projects provide toolkits with multiple functionality that make it easier to create customised pipelines or analysis. This review briefly compares the quirks of the underlying languages and the functionality, documentation, utility and relative advantages of the Bio counterparts, particularly from the point of view of the beginning biologist programmer."),
         ("AD", "tacg Informatics, Irvine, CA 92612, USA. [email protected]"),
         ("FAU", ["Mangalam, Harry"]),
         ("AU", ["Mangalam H"]),
         ("LA", ["eng"]),
         ("PT", ["Journal Article"]),
         ("PL", "England"),
         ("TA", "Brief Bioinform"),
         ("JT", "Briefings in bioinformatics"),
         ("JID", "100912837"),
         ("SB", "IM"),
         ("MH", ["*Computational Biology", "Computer Systems", "Humans", "Internet", "*Programming Languages", "*Software", "User-Computer Interface"]),
         ("EDAT", "2002/09/17 10:00"),
         ("MHDA", "2003/06/07 05:00"),
         ("PST", "ppublish"),
         ("SO", "Brief Bioinform. 2002 Sep;3(3):296-302."),
     )
     with open("Medline/pubmed_result1.txt") as handle:
         record = Medline.read(handle)
     for field, value in expected:
         self.assertEqual(record[field], value)

示例#17

0

显示文件

文件： parse_songbird.py 项目： unidesigner/connectomewiki

def get_wikiref(pmid):
    """ Returns the Wiki cite journal entry for a given Pubmed ID

    The entry is a ``{{cite journal|...}}`` string built from the title,
    journal, year (first part of the first CRDT value), authors, volume and
    pages of the fetched record, with today's date as access date. When
    several records come back, the entry for the last one is returned.
    Returns an empty string when nothing was fetched (this used to fail with
    an unbound local variable).
    """
    import datetime

    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
    records = list(Medline.parse(handle))

    jetzt = datetime.datetime.now().strftime("%Y-%m-%d")

    # generate the {{cite journal}} format
    outstring = ""
    for rec in records:
        # Slice instead of pop(0) so the parsed record is left unmodified.
        aut = rec["AU"]
        firstauthor = aut[0]
        coauthors = ", ".join(aut[1:])

        # get date of publication: the year is the part of CRDT before the first "/"
        datee = rec["CRDT"][0].split('/')[0]

        outstring = "{{cite journal|title=%s|journal=%s|year=%s|author=%s|coauthors=%s|volume=%s|pages=%s|id=PMID %s|accessdate=%s}}" % \
                    (rec["TI"], rec["JT"], datee, firstauthor, coauthors, rec["VI"], rec["PG"], pmid, jetzt)

        # example:
        #{{cite journal|title=|journal=|date=2008/07/31/|first=Cyril|last=Herry|coauthors=i|volume=454|issue=7204|pages=600-606|id=PMID 18615015 {{doi|10.1038/nature07166}}|url=http://www.fmi.ch/downloads/news/2008.07.11.01.luthi.nature.press.release.pdf|format=|accessdate=2009-09-12 }}

    return outstring

示例#18

0

显示文件

文件： artikelSearch.py 项目： Jorisvansteenbrugge/Inficio_Raptum

def main(Substance, Organism, Gene):
    """Search PubMed for the built-in terms and print the PMIDs found.

    Builds a search term from the two hard-coded search words, fetches up to
    50 matching records and prints a dict mapping the search term to the set
    of their PMIDs. Always returns the string "Jay".

    NOTE(review): the Substance, Organism and Gene arguments are not used;
    the search words are hard-coded. Confirm whether they should be wired in.
    """
    zoekterm1 = "Cocaine"
    zoekterm2 = "Elegans"
    MAX_COUNT = 50

    if zoekterm1 == "":
        print("vul een zoekterm in")
        sys.exit()
    if zoekterm2 == "":
        TERM = zoekterm1
    else:
        TERM = zoekterm1+" and "+zoekterm2
    print(TERM)

    handle = Entrez.esearch(db="pubmed", term= TERM, retmax=MAX_COUNT)
    idlist = Entrez.read(handle)["IdList"]
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                           retmode="text")
    # Build the set once, after the loop. It used to be rebuilt on every
    # iteration and was left undefined when no record came back.
    pubSet = set(record.get("PMID","?") for record in Medline.parse(handle))

    dic = {TERM: pubSet}
    print(dic)
    return "Jay"

示例#19

0

显示文件

文件： pubmed.py 项目： sweng66/lit-review

    def __init__(self, pmids):
        """Request the MEDLINE records for ``pmids`` and keep the parser on ``self.records``.

        ``pmids`` is a list of PubMed ids.
        """
        Entrez.email = '*****@*****.**'

        self.records = Medline.parse(
            Entrez.efetch(db='pubmed', id=pmids, rettype='medline', retmode='text')
        )

示例#20

0

显示文件

文件： PubmedSIR.py 项目： hmbachelor/bachelor

def getMedlineList(pmids):

    """
    This function takes a list of article-ids and returns a list of
    MedLine articles that contains an abstract.
    """

    records = []
    cleaned_records = []
    listLength = len(pmids)

    Entrez.email = '*****@*****.**'

    for i in range(0, listLength, 650):
        tempList = pmids[i:i + 650]
        handle = Entrez.efetch(db='pubmed', id=tempList,rettype='medline', retmode='text')
        try:
            records.extend(list(Medline.parse(handle)))
        except:
            IOmodule.writeOutTxt(_mainFolder+'/'+'errordir_medline_records', pmids[i], '')

        print 'Downloaded',len(records),'MedLine articles.',str(listLength-len(records)),'remaining...'

    for article in records:
        if 'AB' in article:
            cleaned_records.append(article)
    
    print 'Returned',len(cleaned_records),'MedLine articles containing an abstract.'
    return cleaned_records

示例#21

0

显示文件

文件： tasks.py 项目： x0xMaximus/YdF2fG2lkPHbCpAiHJpmexo0mnXdbb2NYLYOXIy6Rq

def get_pubmed_document(pubmed_ids, source='pubmed', include_pubtator=True):
    """Fetch PubMed records and store them as Document and Section rows.

    ``pubmed_ids`` is a single id or a list/tuple of ids. Only records that
    have a title, an abstract, a PMID and a CRDT field are stored. Each one
    gets a Document (keyed by PMID) plus a title section (kind 't') and an
    abstract section (kind 'a'). With ``include_pubtator`` set,
    ``doc.init_pubtator()`` is called for each stored document.
    """
    Entrez.email = settings.ENTREZ_EMAIL

    # isinstance also accepts list subclasses and tuples; the old
    # ``type(...) == list`` check turned a tuple into one bogus id string.
    if isinstance(pubmed_ids, (list, tuple)):
        ids = [str(doc_id) for doc_id in pubmed_ids]
    else:
        ids = [str(pubmed_ids)]

    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    records = Medline.parse(h)

    # Reference to abbreviations: http://www.nlm.nih.gov/bsd/mms/medlineelements.html
    for record in records:
        if not (record.get('TI') and record.get('AB') and record.get('PMID') and record.get('CRDT')):
            continue

        title = ' '.join( pad_split(record.get('TI')) )
        abstract = ' '.join( pad_split(record.get('AB')) )

        doc, doc_c = Document.objects.get_or_create(document_id=record.get('PMID'))
        doc.title = title
        doc.source = source
        doc.save()

        sec, sec_c = Section.objects.get_or_create(kind='t', document=doc)
        sec.text = title
        sec.save()

        sec, sec_c = Section.objects.get_or_create(kind='a', document=doc)
        sec.text = abstract
        sec.save()

        if include_pubtator:
            doc.init_pubtator()

示例#22

0

显示文件

文件： pubmed_scraper_003.py 项目： ratiom/pubmed

def pubsearch(jids):
    """Search PubMed for "peptide" papers from 2011-2014 in the given journals.

    ``jids`` is an iterable of journal id strings. Each is tagged with
    ``[JID]`` and the tags are OR-ed together. With no journal ids the search
    is for "peptide" alone. At most 2500 hits are fetched.

    Returns the iterator produced by ``Medline.parse`` over the fetched
    records.
    """
    Entrez.email = "*****@*****.**"
    # always let Entrez know who is calling

    # The old term ended in a dangling " or " followed by " and " and never
    # closed its parenthesis; join the journal tags and close the group.
    journal_filter = " OR ".join(jid + "[JID]" for jid in jids)
    if journal_filter:
        pubterm = "peptide AND (" + journal_filter + ")"
    else:
        pubterm = "peptide"

    # for documentation on esearch, see
    # http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    # mindate and maxdate must be given together; YYYY, YYYY/MM and
    # YYYY/MM/DD are accepted.
    IDhandle = Entrez.esearch(
        db="pubmed", term=pubterm, mindate="2011", maxdate="2014", retmax=2500
    )

    record = Entrez.read(IDhandle)
    # record is returned as a dictionary; "IdList" holds the matching ids
    idlist = record["IdList"]

    recordHandle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    # fetch the records for the ids found above

    records = Medline.parse(recordHandle)
    return records

示例#23

0

显示文件

文件： MeSH_to_MySQL.py 项目： AldoCP/PubMed_databases

	def processInput(k):
		print "Querying PMID: "+str(k)+"."
		getall = Medline.read(Entrez.efetch(db="pubmed", id=k, rettype="medline", retmode="text"))
		singlemesh = getall.get("MH")
		singledate = getall.get("EDAT")	
		for j1 in range(len(singlemesh)):
			cur.execute("INSERT INTO MeSH002(PMID, MeSH, Dates) VALUES("+str(k)+",'" + getall.get("MH")[j1][0:24].translate(None, "'*&")+"','" +  str(singledate[0:10]) +"')" )

示例#24

0

显示文件

文件： fetchgeo.py 项目： msGenDev/fetchgeo

def get_pubmed_data(idlist):
	"""Fetch the records for a list of pubmed ids.

	Returns one ``(title, authors, PMID)`` tuple per fetched record, taken
	from its TI, AU and PMID fields.
	"""
	fetched = Entrez.efetch(db='pubmed', id=idlist, rettype='medline', retmode='text')
	return [
		(entry["TI"], entry["AU"], entry["PMID"])
		for entry in Medline.parse(fetched)
	]

示例#25

0

显示文件

文件： Gene2Pubmed_author.py 项目： ashutoshkpandey/PubMed-IR

def Pubmedsearch(PMID):
    """Return a tab-separated summary line for the PubMed record ``PMID``.

    The line holds the PMID, title, full author names, abbreviated author
    names and affiliation of the first fetched record, with "?" for missing
    fields. Returns None when no record is fetched.
    """
    # Indentation is now spaces only; the old body mixed tabs and spaces,
    # which Python 3 rejects with a TabError.
    pmid = PMID

    handle = Entrez.efetch(db="pubmed", id= pmid, rettype="medline",retmode="text")
    records = list(Medline.parse(handle))
    if not records:
        return None

    record = records[0]
    return (str(pmid)+"\t"+str(record.get("TI", "?"))+"\t"+str(record.get("FAU", "?"))+"\t"+str(record.get("AU", "?"))+"\t"+str(record.get("AD", "?")))

示例#26

0

显示文件

文件： pubmedsearch.py 项目： jagstein/USnewsy

def pubmedsearch (TERM, MAX_COUNT = 10000):
    """Search PubMed for TERM and return the parser over the matching records.

    At most MAX_COUNT ids are searched for and then fetched in MEDLINE text
    format. The return value is whatever ``Medline.parse`` yields for the
    fetch handle.
    """
    Entrez.email = '*****@*****.**'
    search = Entrez.read(Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM))
    fetched = Entrez.efetch(db='pubmed', id=search['IdList'], rettype='medline', retmode='text')
    return Medline.parse(fetched)

示例#27

0

显示文件

文件： pubmed.py 项目： 2dpodcast/PyCon2013_SNA

def get_articles(term):
    idlist = get_article_ids(term)
    counter=0
    #pbar = make_pbar(len(idlist),text="Fetching")
    
    articles=[]
    if len(idlist) > 100:
        chunks=[idlist[i:i+100] for i in range(0, len(idlist), 100)]
        for chunk in chunks:
            handle = Entrez.efetch(db="pubmed", id=chunk, rettype="medline", retmode="text")
            articles.extend(list(Medline.parse(handle)))
            print '#'
            #pbar.update(p.currval+len(chunk))
    else:
        handle=Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
        articles.extend(list(Medline.parse(handle)))
    #pbar.finish()
    
    return articles

示例#28

0

显示文件

文件： get_PMCIDs.py 项目： bwallace/PMCID_getter

def get_record_from_pmid(pmid):
    """Return the first MEDLINE record fetched from PubMed for ``pmid``.

    The parser yields a sequence of records even for a single id, so the
    result is materialised as a list and its first element is returned.
    """
    handle = Entrez.efetch(db="pubmed",id=pmid,
                           rettype="medline",retmode="text")
    parsed = list(Medline.parse(handle))
    return parsed[0]

示例#29

0

显示文件

文件： pubmedsearch.py 项目： jagstein/Rankings-dz

def pubmedsearch (TERM, MAX_COUNT = 10000):
    """Search PubMed for TERM and return the parser over the matching records.

    Sets the Entrez contact email and tool name, searches for at most
    MAX_COUNT ids and fetches them in MEDLINE text format. The return value
    is whatever ``Medline.parse`` yields for the fetch handle.
    """
    Entrez.email = '*****@*****.**'
    Entrez.tool = 'pm_impacts'
    search = Entrez.read(Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM))
    fetched = Entrez.efetch(db='pubmed', id=search['IdList'], rettype='medline', retmode='text')
    return Medline.parse(fetched)

示例#30

0

显示文件

文件： pubmed_to_pg.py 项目： massyah/LINK

def store_abstract_with_pmid(pmid,queryTag=None):
	"""Populate the PG databases with the MEDLINE entries having these pmid.

	``pmid`` is a scalar or a list of pmid. Each fetched entry is passed to
	``store_medline_entry`` together with ``queryTag``, which defaults to
	"PMID".
	"""
	if queryTag is None:
		queryTag="PMID"
	Entrez.email="*****@*****.**"
	handle=Entrez.efetch(db="pubmed",rettype="medline",retmode="text",id=pmid)
	# Close the handle even if storing an entry fails.
	try:
		for r in Medline.parse(handle):
			store_medline_entry(r,queryTag)
	finally:
		handle.close()

示例#31

0

显示文件

文件： BioPy_0513.py 项目： schnappi-wkl/ProBioinformatics

def collectPubmedInfo(email, term, record_dict, retmax, outputPath):
    """Search PubMed for ``term`` and save the ids and selected fields as files.

    Writes two tab-separated files whose names are ``outputPath`` followed by
    "PubmedIDs_1.txt" (the ids found) and "Results_1.txt" (one row per
    fetched record).

    ``record_dict`` maps MEDLINE field names to lists. It is extended in
    place with one value per fetched record and field; None stands for a
    field the record does not have.
    """
    Entrez.email = email
    # Use ESearch
    handle = Entrez.esearch(db="pubmed", term=term, retmax=retmax)
    record = Entrez.read(handle)
    count = record["Count"]
    idlist = record["IdList"]
    # Use pandas to save a formated file
    df_1 = pd.DataFrame({"PubmedIDs(%s)" % (count): idlist})
    df_1.to_csv(outputPath + "PubmedIDs_1.txt", sep="\t")
    # Use EFetch to collect id and use medline to get details
    handle = Entrez.efetch(db="pubmed",
                           id=idlist,
                           rettype="medline",
                           retmode="text")
    for record in Medline.parse(handle):
        for key in record_dict:
            # .get keeps every column the same length; record[key] raised
            # KeyError part-way through a record that lacked a field.
            record_dict[key].append(record.get(key))
    # Use pandas to save a formated file
    df_2 = pd.DataFrame(record_dict)
    df_2.to_csv(outputPath + "Results_1.txt", sep="\t")

示例#32

0

显示文件

def fetch_from_entrez(index, cache_dir=False):
    """Return the MEDLINE record for PubMed id ``index``, using a cache if given.

    ``cache_dir`` is joined with "/" to form the cache path, so it is
    presumably a sequence of path components (TODO confirm against callers).
    A falsy ``cache_dir`` disables caching.

    A cached record is returned when present. Otherwise the record is fetched
    from Entrez (up to 5 attempts, 2 seconds apart), stored in the cache when
    caching is enabled, and returned. Returns None when every attempt fails.
    """
    logger = logging.getLogger('build')

    # slugify the index for the cache filename (some indices have symbols not allowed in file names (e.g. /))
    index_slug = slugify(index)

    # try fetching from cache
    if cache_dir:
        # Only build the path when caching is on: '/'.join(False) raised
        # TypeError with the default argument.
        cache_file_path = '{}/{}'.format('/'.join(cache_dir), index_slug)
        d = fetch_from_cache(cache_dir, index_slug)
        if d:
            logger.info('Fetched {} from cache'.format(cache_file_path))
            return d

    # if nothing is found in the cache, use the web API
    logger.info('Fetching {} from Entrez'.format(index))
    max_tries = 5
    for attempt in range(max_tries):
        if attempt > 0:
            logger.warning('Failed fetching pubmed {}, retrying'.format(
                str(index)))

        try:
            Entrez.email = '*****@*****.**'
            handle = Entrez.efetch(db="pubmed",
                                   id=str(index),
                                   rettype="medline",
                                   retmode="text")
        except Exception:
            time.sleep(2)
            continue

        d = Medline.read(handle)

        # save to cache
        if cache_dir:
            save_to_cache(cache_dir, index_slug, d)
            logger.info('Saved entry for {} in cache'.format(cache_file_path))
        return d

    return None

示例#33

0

显示文件

def getMedlineAbstracts(idList):
    """Fetch PubMed records for ``idList`` and return them as a DataFrame.

    The columns are title, authors, journal, date, keywords, abstract and
    PMID (each present only when at least one record has that field) plus
    ``url``, the PubMed link built from the PMID or "" without one.

    Returns an empty DataFrame when the request or the parsing fails; the
    error is printed.
    """
    fields = {
        "TI": "title",
        "AU": "authors",
        "JT": "journal",
        "DP": "date",
        "MH": "keywords",
        "AB": "abstract",
        "PMID": "PMID"
    }
    pubmedUrl = "https://www.ncbi.nlm.nih.gov/pubmed/"
    abstracts = pd.DataFrame()
    try:
        # Medline.parse reads the MEDLINE text format, so request text
        # (the old code asked for retmode="json").
        handle = Entrez.efetch(db="pubmed",
                               id=idList,
                               rettype="medline",
                               retmode="text")
        try:
            results = []
            for record in Medline.parse(handle):
                aux = {}
                for field in fields:
                    if field in record:
                        aux[fields[field]] = record[field]
                if "PMID" in aux:
                    aux["url"] = pubmedUrl + aux["PMID"]
                else:
                    aux["url"] = ""
                results.append(aux)
        finally:
            handle.close()

        abstracts = pd.DataFrame.from_dict(results)
    # HTTPError is a subclass of URLError, so it has to be caught first;
    # listed after URLError its handler could never run.
    except error.HTTPError as e:
        print("HTTPError: Request to Bio.Entrez failed. Error: {}".format(e))
    except error.URLError as e:
        print("URLError: Request to Bio.Entrez failed. Error: {}".format(e))
    except Exception as e:
        print("Request to Bio.Entrez failed. Error: {}".format(e))

    return abstracts

示例#34

0

显示文件

文件： dataset.py 项目： JulioAPeraza/neurosynth

def download_abstracts(dataset, path='.', email=None, out_file=None):
    """Download the abstracts for a dataset or a list of PMIDs.

    Parameters
    ----------
    dataset
        Either a ``Dataset`` (its ``image_table.ids`` are used as PMIDs) or
        a list of PMIDs.
    path : str
        Directory in which ``out_file`` is written.
    email : str
        Address assigned to ``Entrez.email``; required.
    out_file : str or None
        If given, the result is also written there as CSV (no index).

    Returns
    -------
    pandas.DataFrame
        Columns ``pmid`` and ``abstract``; only records that have a
        non-empty abstract are included.

    Raises
    ------
    Exception
        If biopython cannot be imported, no email is given, or ``dataset``
        is neither a ``Dataset`` nor a list.
    """
    try:
        from Bio import Entrez, Medline
    except ImportError:
        # Only a missing module should be reported as "biopython required";
        # a bare except would also mask unrelated failures.
        raise Exception(
            'Module biopython is required for downloading abstracts from PubMed.'
        )

    if email is None:
        raise Exception('No email address provided.')
    Entrez.email = email

    if isinstance(dataset, Dataset):
        pmids = dataset.image_table.ids.astype(str).tolist()
    elif isinstance(dataset, list):
        pmids = [str(pmid) for pmid in dataset]
    else:
        raise Exception('Dataset type not recognized: {0}'.format(
            type(dataset)))

    records = []
    # PubMed only allows you to search ~1000 at a time. I chose 900 to be safe.
    chunks = [pmids[x:x + 900] for x in range(0, len(pmids), 900)]
    for chunk in chunks:
        h = Entrez.efetch(db='pubmed',
                          id=chunk,
                          rettype='medline',
                          retmode='text')
        try:
            records += list(Medline.parse(h))
        finally:
            # Release the connection even if parsing fails.
            h.close()

    # Pull data for studies with abstracts
    data = [[study['PMID'], study['AB']] for study in records
            if study.get('AB', None)]
    df = pd.DataFrame(columns=['pmid', 'abstract'], data=data)
    if out_file is not None:
        df.to_csv(os.path.join(os.path.abspath(path), out_file), index=False)
    return df

示例#35

0

显示文件

def add_paper(pmid, created_by="OTTO", method_obtained="Curator triage"):
    """ Adds paper to referencedbentity table

    Fetches the MEDLINE record for ``pmid`` from PubMed, then inserts the
    reference, its authors, publication types and URLs through the
    ``insert_*`` helpers.

    Parameters
    ----------
    pmid: int
    created_by: str, optional
    method_obtained: str, optional

    Returns
    -------
    object
        reference object

    Raises
    ------
    ValueError
        If the fetched record contains no ``PMID`` field.
    """

    handle = Entrez.efetch(db="pubmed", id=str(pmid), rettype='medline')
    try:
        record = Medline.read(handle)
    finally:
        # Always release the connection, even when parsing fails.
        handle.close()
    if 'PMID' not in record:
        raise ValueError(
            'Unable to fetch record from pubmed. Make sure it is a valid PMID.'
        )

    print(record)

    # NOTE(review): one_or_none() may return None, in which case the next
    # line raises AttributeError - confirm the 'NCBI' source always exists.
    ncbi = DBSession.query(Source).filter_by(format_name='NCBI').one_or_none()
    source_id = ncbi.source_id
    ## insert into DBENTITY/REFERENCEDBENTITY/REFERENCEDOCUMENT
    [reference_id, authors, doi_url, pmc_url, sgdid,
     reference] = insert_referencedbentity(pmid, source_id, record, created_by,
                                           method_obtained)
    insert_authors(reference_id, authors, source_id, created_by)
    insert_pubtypes(pmid, reference_id, record.get('PT', []), source_id,
                    created_by)
    insert_urls(pmid, reference_id, doi_url, pmc_url, source_id, created_by)
    # removed to support changes in http://redmine.stanford.edu/issues/4758
    # insert_relations(pmid, reference_id, record, created_by)
    return reference

示例#36

0

显示文件

    def read_data(self):
        path = self.path
        final_list = []

        with open(path) as handle:
            records = Medline.parse(handle)
            tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            for record in records:
                new_dict = {}
                check = self.check_preprocess_condition(record)
                if not check:
                    continue
                # abstract = record.get('AB',False)
                # abstractWords =  nltk.tokenize.word_tokenize(abstract)
                # sw = stopwords.words('english')
                # char_to_remove = [',','.','!','?',':']
                # for word in sw:
                #     if word in abstractWords:
                #         abstractWords.remove(word)
                # final_ab = ' '.join(list(abstractWords))
                # #remove punctuations
                # puncString = ".,:?!()0123456789"

                # final_ab = "".join(l for l in final_ab if l not in puncString)

                # final_ab = final_ab.lower()

                # for rmc in puncString:
                #     final_ab=final_ab.replace(rmc,'')
                new_dict['PMID'] = record.get('PMID', '')
                new_dict['TI'] = record.get('TI', '')
                new_dict['OT'] = record.get('OT', [])
                new_dict['AB'] = record.get('AB', '')
                new_dict['tokens'] = record.get('tokens', '')
                final_list.append(new_dict)
            print "clean abastract count>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", len(
                final_list)
            return final_list

示例#37

0

显示文件

文件： gene-finder.py 项目： Malaheenee/gene-finder

def abs_search(gene_dict, pattern_dict, abstract_file, out_queue):
    """Scan MEDLINE abstracts for gene and pattern matches.

    Parameters
    ----------
    gene_dict
        Maps a key to an iterable of compiled regular expressions.
    pattern_dict
        Maps a pattern name to a compiled regular expression.
    abstract_file
        Path of a MEDLINE-format text file.
    out_queue
        Object with a ``put`` method; receives the result dict on success.

    The result maps each ``gene_dict`` key to a list of hits. A hit is
    ``[pmid, matched_text, context, <one comma-joined string of unique
    matches per pattern, in sorted pattern order>, source]``.

    Any exception is reported on stdout and nothing is put on the queue.
    NOTE(review): a consumer blocking on the queue would then wait forever -
    confirm the caller handles that.
    """
    try:
        result_dict = {}
        # The context manager closes the file even if parsing fails.
        with open(abstract_file, 'r') as abs_open:
            for abstract in Medline.parse(abs_open):
                if 'AB' not in abstract:
                    continue
                abstract_text = re.sub(r'\r\n|\n|\s+|;', ' ', abstract['AB'])
                abstract_pmid = abstract.get('PMID', 'Unknown')
                abstract_journ = abstract.get('SO', 'Unknown')
                for key in gene_dict:
                    for gene in gene_dict[key]:
                        # The second positional argument of a compiled
                        # pattern's search()/finditer() is `pos`, not flags.
                        # Passing re.MULTILINE (== 8) there silently skipped
                        # the first 8 characters of every abstract.
                        match = gene.search(abstract_text)
                        if not match:
                            continue
                        start, end = match.start(0), match.end(0)
                        # Up to 30 characters of context on the left.
                        # NOTE(review): the right side adds min(end, 30),
                        # mirroring the left-hand clamp; a flat 30 was
                        # probably intended - confirm before changing.
                        context = abstract_text[start - min(start, 30):
                                                end + min(end, 30)]
                        hits = result_dict.setdefault(key, [])
                        hits.append([abstract_pmid, match.group(0), context])
                        for pattern in sorted(pattern_dict):
                            unique_matches = []
                            for found in pattern_dict[pattern].finditer(
                                    abstract_text):
                                text = str(found.group(0))
                                if text not in unique_matches:
                                    unique_matches.append(text)
                            hits[-1].append(', '.join(unique_matches))
                        hits[-1].append(abstract_journ)
        out_queue.put(result_dict)
    except Exception:
        # Best effort by design: report and return. Unlike a bare except,
        # this lets KeyboardInterrupt / SystemExit propagate.
        print("One of the processes got an exception and was killed")

示例#38

0

显示文件

文件： Entrez.py 项目： ALuesink/Project-Blok8

def getAbstracts(ID):
    """Fetch MEDLINE records for ``ID`` and collect their main fields.

    ``ID`` is handed unchanged to ``Entrez.efetch``. A missing abstract
    (``AB``) or keyword list (``OT``) is replaced by ``"-"``; the other
    fields are ``None`` when absent.

    Returns a 7-tuple:
    ``(keys, abstracts, auteur, datum, titel, dictionary,
    dictionary_textmining)`` where the first five are parallel lists
    (keywords, abstracts, authors, dates, titles) and the two dicts are
    keyed by PMID with ``[title, abstract, keywords, authors, date]`` and
    ``[title, abstract, keywords]`` respectively.
    """
    keys, abstracts, auteur, datum, titel = [], [], [], [], []
    dictionary, dictionary_textmining = {}, {}

    handle = Entrez.efetch(db="pubmed",
                           id=ID,
                           rettype='Medline',
                           retmode='text')
    for record in Medline.parse(handle):
        PMID = record.get('PMID')
        auteurs = record.get('AU')
        date = record.get('DP')
        title = record.get('TI')

        abstract = record.get('AB')
        if abstract is None:
            abstract = "-"

        keywords = record.get('OT')
        if keywords is None:
            keywords = "-"

        auteur.append(auteurs)
        abstracts.append(abstract)
        datum.append(date)
        titel.append(title)
        keys.append(keywords)

        text_fields = [title, abstract, keywords]
        dictionary[PMID] = text_fields + [auteurs, date]
        dictionary_textmining[PMID] = text_fields

    return keys, abstracts, auteur, datum, titel, dictionary, dictionary_textmining

示例#39

0

显示文件

    def pmid2abstract_info(self, pmid_list):
        """Fetch MEDLINE data for a list of PMIDs.

        Args:
            pmid_list: iterable of PMIDs (converted to strings).

        Returns:
            dict mapping each PMID to a dict with the keys ``title``,
            ``authors``, ``source``, ``abstract``, ``journal``, ``year``
            (missing fields are ``"?"``) and ``pmid``; or None if the
            fetch request itself fails.

        Records without a ``PMID`` field are skipped. When such a record
        reports that a requested PMID is a duplicate of another one, the
        duplicate is dropped from the list and the whole fetch is retried.
        """
        import re

        # make sure that pmid are strings
        pmid_list = [str(i) for i in pmid_list]

        try:
            handle = Entrez.efetch(db="pubmed", id=','.join(pmid_list), rettype="medline", retmode="text")
            records = Medline.parse(handle)
        except Exception:
            print("FAIL:", pmid_list)
            return None

        pmid2data = {}
        for record in records:
            pmid = record.get("PMID")
            if pmid is None:
                print(record)
                # Example error record:
                # {'id:': ['696885 Error occurred: PMID 28696885 is a duplicate of PMID 17633143']}
                # The value is a list and its leading id is truncated, so
                # the full PMIDs are taken from the message text instead.
                message = record.get('id:', '')
                if isinstance(message, (list, tuple)):
                    message = ' '.join(message)
                found = re.search(
                    r'PMID (\d+) is a duplicate of PMID (\d+)', message)
                if found:
                    duplicate, correct = found.groups()
                    print("removing duplicated PMID... %s --> %s" % (duplicate, correct))
                    # Retry only when something is actually removed, so
                    # the recursion always makes progress.
                    if duplicate in pmid_list:
                        pmid_list.remove(duplicate)
                        return self.pmid2abstract_info(pmid_list)
                # No usable PMID: skip rather than reuse a stale one.
                continue

            pmid2data[pmid] = {
                "title": record.get("TI", "?"),
                "authors": record.get("AU", "?"),
                "source": record.get("SO", "?"),
                "abstract": record.get("AB", "?"),
                "journal": record.get("TA", "?"),
                "year": record.get("DP", "?"),
                "pmid": pmid,
            }

        return pmid2data

示例#40

0

显示文件

def get_paper(pmids):
    """
    Return the title, author list and source of each paper.

    Each paper is formatted as ``"<title>, <authors>, <source>"``; papers
    are joined with newlines. More than two authors are abbreviated to the
    first two followed by ``et al.``; missing fields are shown as ``?``.

    :param pmids: PubMed ids of papers
    :type pmids: list
    :rtype: str
    """
    papers = []
    handle = Entrez.efetch(db="pubmed",
                           id=[str(pmid) for pmid in pmids],
                           rettype="medline",
                           retmode="text")
    try:
        for pm_record in Medline.parse(handle):
            authors = pm_record.get("AU")
            if not authors:
                authors = "?"
            elif len(authors) > 2:
                authors = '%s, %s et al.' % (authors[0], authors[1])
            else:
                # One or two authors: join them instead of interpolating
                # the Python list repr (e.g. "['Doe J', 'Roe R']").
                authors = ', '.join(authors)
            papers.append(
                '%s, %s, %s' %
                (pm_record.get("TI", "?"), authors, pm_record.get("SO", "?")))
    finally:
        handle.close()
    return '\n'.join(papers)

示例#41

0

显示文件

文件： test_TogoWS.py 项目： sgalpha01/biopython

 def test_pubmed_16381885(self):
     """Bio.TogoWS.entry("pubmed", "16381885")."""
     expected_title = (
         "From genomics to chemical genomics: new developments in KEGG.")
     expected_authors = [
         "Kanehisa M",
         "Goto S",
         "Hattori M",
         "Aoki-Kinoshita KF",
         "Itoh M",
         "Kawashima S",
         "Katayama T",
         "Araki M",
         "Hirakawa M",
     ]

     # The entry comes back as Medline plain text, so Medline.read parses it.
     handle = TogoWS.entry("pubmed", "16381885")
     data = Medline.read(handle)
     handle.close()

     self.assertEqual(data["TI"], expected_title)
     self.assertEqual(data["AU"], expected_authors)

示例#42

0

显示文件

def getCancerData(searchTerm, filename, email):
    """Search PubMed for ``searchTerm`` and dump the matches to a file.

    Prints the PubMed hit count reported by ``Entrez.egquery``, searches
    PubMed (up to 300000 ids), fetches the MEDLINE records and writes the
    PMID, title, authors, source and abstract of each record to
    ``filename``, one field per line. Missing fields are written as ``?``.

    Parameters
    ----------
    searchTerm : str
        PubMed query.
    filename : str
        Output path; the file is overwritten.
    email : str
        Address assigned to ``Entrez.email``.
    """
    # Opened first (as before) so the file is created/truncated up front;
    # the context manager closes it even if a request fails.
    with open(filename, "w") as f:
        Entrez.email = email  # Always tell NCBI who you are
        handle = Entrez.egquery(term=searchTerm)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()
        for row in record["eGQueryResult"]:
            if row["DbName"] == "pubmed":
                print(row["Count"])  # prints number of articles

        # The previous `retmax = row["Count"]` was immediately overwritten
        # (and read whichever row happened to be last), so only the fixed
        # cap is kept.
        retmax = 300000

        handle = Entrez.esearch(db="pubmed", term=searchTerm, retmax=retmax)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()
        idlist = record["IdList"]

        handle = Entrez.efetch(db="pubmed",
                               id=idlist,
                               rettype="medline",
                               retmode="text")
        try:
            records = list(Medline.parse(handle))  # all records are in this list
        finally:
            handle.close()

        # The old loop incremented an uninitialised local `count` and so
        # raised UnboundLocalError on the first record; the counter was
        # never used and has been dropped. Fields are now newline-terminated
        # so they do not run together in the output.
        for record in records:
            authors = ", ".join(record.get("AU", "?"))
            f.write("PMID: " + record.get("PMID", "?") + "\n")
            f.write("Title: " + record.get("TI", "?") + "\n")
            f.write("Authors: " + authors + "\n")
            f.write("Source: " + record.get("SO", "?") + "\n")
            f.write("Abstract: " + record.get("AB", "?") + "\n")

示例#43

0

显示文件

def parser(inputFile):
    """Load a MEDLINE file into the ``mycol`` collection.

    Changes the working directory to the directory of this script, parses
    ``inputFile`` and inserts one document per record with the keys
    ``title``, ``abstract``, ``keywords`` and ``meshterms``. The ``OT`` and
    ``MH`` values are each wrapped in a one-element list; absent fields are
    stored as ``None``. Prints the number of inserted records.
    """
    print("Creating pubmedDB database ...")

    # inputFile is a relative path, so resolve it against this script's
    # own directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)

    with open(inputFile) as handle:
        count = 0
        # One MEDLINE record corresponds to one article.
        for count, record in enumerate(Medline.parse(handle), start=1):
            document = {
                "title": record.get('TI'),
                "abstract": record.get('AB'),
                "keywords": [record.get('OT')],
                "meshterms": [record.get('MH')]
            }
            mycol.insert_one(document)
        print("Inserted {} records into pubmedDB".format(count))

示例#44

0

显示文件

def fetch_publication_list(citations, rettype='medline'):
    """
    Fetch publications from PubMed, retrying on HTTP errors.

    :param citations: sequence of PubMed id strings
    :param rettype: efetch ``rettype``; ``'medline'`` results are parsed
        with ``Medline.parse``, anything else with ``Entrez.parse``
    :return: the parser's record iterator, or an empty list when all five
        attempts fail
    """
    sys.stdout.write("=====================================")
    print(f"Fetching {len(citations)} publications. rettype: {rettype}.")
    citation_string = ','.join(citations)
    Entrez.email = '*****@*****.**'

    attempts_left = 5
    succeeded = False
    h = None
    while attempts_left > 0 and not succeeded:
        attempts_left -= 1
        try:
            h = Entrez.efetch(db='pubmed',
                              id=citation_string,
                              rettype=rettype,
                              retmode='text')
            succeeded = True
        except HTTPError:
            # Swallow and try again on the next pass.
            pass
        finally:
            # we are not allowed to hit NCBI more than 3 times per second
            time.sleep(0.4)

    if not succeeded:
        print("Failed to retrieve data from PubMed")
        return []
    if rettype == 'medline':
        return Medline.parse(h)
    return Entrez.parse(h)

示例#45

0

显示文件

文件： entrez_pubmed_retrieve.py 项目： evelinajim/CPP_setup

def entrezQuery(idList, outFile):
    """Fetch MEDLINE records for ``idList`` and write them as TSV.

    One line is written to ``outFile`` per record, with the tab-separated,
    ``ascii()``-encoded columns: PMID, title, authors (formatted by
    ``modifyAuthors``), journal, year (first four characters of ``DP``)
    and abstract. Missing fields are written as empty strings.
    """
    # Context manager / finally guarantee that the output is flushed and
    # both resources are closed, even when a request or write fails.
    with open(outFile, 'w') as writeFile:
        handle = Entrez.efetch(db="pubmed",
                               id=idList,
                               rettype="medline",
                               retmode="text")
        try:
            for record in Medline.parse(handle):
                #either return record entry or empty string
                pmid = record.get('PMID', '')
                title = record.get('TI', '')
                authors = record.get('FAU', '')
                authors = modifyAuthors(authors)  #format authors
                journal = record.get('JT', '')
                date = record.get('DP', '')
                date = date[0:4]  #year only
                abstract = record.get('AB', '')
                #need ascii encapsulation for uniformity with database
                writeFile.write(
                    ascii(pmid) + '\t' + ascii(title) + '\t' +
                    ascii(authors) + '\t' + ascii(journal) + '\t' +
                    ascii(date) + '\t' + ascii(abstract) + '\n')
        finally:
            handle.close()

示例#46

0

显示文件

    def _retrieve_record_batch(self, batch_start, batch_size):
        """Retrieves a PubMed article record batch.

        Retrieval is based on the WebEnv / QueryKey stored on this instance
        (``search_record_web_env`` and ``search_record_query_key``). The
        batch is delimited by the 'batch_start' and 'batch_size'
        parameters.

        Args:
            batch_start (int): Specifies the starting index of this record
                batch.
            batch_size (int): Specifies the size of this records batch.

        Returns:
            list: A list of dictionaries that hold the data for each record.

        Raises:
            ValueError: If no WebEnv or QueryKey is stored on the instance.
        """
        if None in [self.search_record_web_env, self.search_record_query_key]:
            raise ValueError(  # Perform a search first!
                    'No WebEnv or QueryKey data in this PubMed class instance.'
            )

        fetch_handle = Entrez.efetch(
                db='pubmed',
                rettype='medline',
                retmode='text',
                retstart=batch_start,
                retmax=batch_size,
                webenv=self.search_record_web_env,
                query_key=self.search_record_query_key
        )

        try:
            # Materialise the records before the handle is closed.
            records = list(Medline.parse(fetch_handle))
        finally:
            # Close the connection even when parsing raises.
            fetch_handle.close()

        return records

示例#47

0

显示文件

def get_pubmed_list(geneName):
    """Return the PubMed ids of articles about a human gene.

    Searches PubMed (at most 1000 hits) for ``geneName`` combined with
    "human" and "gene", logs the total hit count, fetches the MEDLINE
    records of the hits and collects their ``PMID`` fields.

    Returns a dict ``{'pubmed_ids': [...]}``; a record without a ``PMID``
    contributes an empty string.
    """
    Entrez.email = "*****@*****.**"
    # The field tags previously lacked their closing bracket
    # ('"human"[All Fields)'), which produced a malformed query.
    queryTerm = geneName + "[All Fields] AND (\"human\"[All Fields]) AND (\"gene\"[All Fields])"
    handle = Entrez.esearch(db="pubmed", term=queryTerm, retmax=1000)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()
    idlist = record["IdList"]
    logging.info(geneName + ":\t" + record["Count"])

    return_value = {'pubmed_ids': []}
    if not idlist:
        # Nothing to fetch; avoid an efetch request with an empty id list.
        return return_value

    handle = Entrez.efetch(db="pubmed",
                           id=idlist,
                           rettype="medline",
                           retmode="text")
    try:
        records = list(Medline.parse(handle))
    finally:
        handle.close()

    for record in records:
        return_value['pubmed_ids'].append(record.get("PMID", ""))

    return return_value

示例#48

0

显示文件

文件： pubvhand2.py 项目： hwz628496/RotationFall2019

def buildlist():
    """Collect PMC ids from a MEDLINE file and download their PubTator XML.

    Reads ``PMChand_medline.txt``, keeps the records that have both a
    ``PMC`` and a ``PMID`` field, and requests the PubTator BioC XML export
    for up to the first 52 PMC ids.

    Returns a 4-tuple ``(pmcid_fail, urlxml, pmcid2pmid, pmidh2)``:
      * ``pmcid_fail`` - PMC ids whose request raised an exception
      * ``urlxml``     - list of 52 strings; slot ``i`` holds the response
                         text for the i-th PMC id, or ``""`` if it was not
                         fetched successfully (status other than 200,
                         request error, or no such id)
      * ``pmcid2pmid`` - dict PMC id -> PMID (also stored as a module global)
      * ``pmidh2``     - PMIDs in file order
    """
    global pmcid2pmid

    #pull PMCID's from PMID's
    from Bio import Medline
    import requests

    pmcidh = []
    pmidh2 = []
    with open("PMChand_medline.txt") as handle:
        for rec2 in Medline.parse(handle):
            # Read both fields before appending: a record with PMC but no
            # PMID used to leave the two lists out of step.
            try:
                pmci = rec2['PMC']
                pmid = rec2['PMID']
            except KeyError:
                continue
            pmcidh.append(pmci)
            pmidh2.append(pmid)

    #Query PubTator
    pmcid_fail = []

    urlxml = [""] * 52
    # Slicing (instead of indexing 0..51) avoids an IndexError when the
    # file yields fewer than 52 PMC ids.
    for i, pmcid in enumerate(pmcidh[:52]):
        url = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmcids=" + pmcid
        try:
            response = requests.get(url, timeout=3)
            if response.status_code == 200:
                urlxml[i] = response.text
        except requests.RequestException:
            print(pmcid)
            pmcid_fail.append(pmcid)
            continue

    pmcid2pmid = dict(zip(pmcidh, pmidh2))
    return pmcid_fail, urlxml, pmcid2pmid, pmidh2

示例#49

0

显示文件

文件： Parser.py 项目： Salem-Abderaouf/CNPM

def pub_med_parser(drug, side_effect):
    """Tell whether PubMed has a title naming both the drug and the effect.

    ``drug`` and ``side_effect`` are first mapped through ``Drugs.drugs``
    and ``Sideeffect.sideEffect``. PubMed is searched for the first mapped
    drug name in the title and the side effect in title/abstract; the
    titles of the hits are then checked for containing both strings.

    Returns True if at least one fetched title contains both, else False.
    """
    drug_eng = Drugs.drugs(drug)
    side_effect = Sideeffect.sideEffect(side_effect)
    Entrez.email = "*****@*****.**"

    terms = "".join(
        ["((", drug_eng[0], "[Title]) AND ", side_effect, "[Title/Abstract])"])
    search_handle = Entrez.esearch(db="pubmed",
                                   term=terms,
                                   rettype="medline",
                                   retmode="text")
    search_result = Entrez.read(search_handle)
    search_handle.close()

    fetch_handle = Entrez.efetch(db="pubmed",
                                 id=search_result["IdList"],
                                 rettype="medline",
                                 retmode="text")
    fetched = list(Medline.parse(fetch_handle))

    titres = [entry.get("TI", "?") for entry in fetched]
    hits = sum(1 for titre in titres
               if drug_eng[0] in titre and side_effect in titre)
    return hits != 0

示例#50

0

显示文件

文件： PubMed.py 项目： manucorreia/biopython

def download_many(ids,
                  callback_fn,
                  broken_fn=None,
                  batchsize=500,
                  parser=None):
    """Download multiple PubMed records, no return value (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Download many records from PubMed.  ids is a list of either the
    Medline Unique ID or the PubMed ID's of the articles.  Each time a
    record is downloaded, callback_fn is called with the id and the
    record.  broken_fn is an optional function that is called with the
    id of records that were not able to be downloaded.  batchsize is the
    number of records to request each time.

    Raises ValueError if batchsize is not between 1 and 500.  The ids
    sequence passed in is not modified.

    """
    # parser is an undocumented parameter that allows people to
    # specify an optional parser to handle each record.  This is
    # dangerous because the results may be malformed, and exceptions
    # in the parser may disrupt the whole download process.
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")
    # Work on a private copy so that popping broken ids below does not
    # mutate the caller's list.
    ids = list(ids)
    current_batchsize = batchsize

    # Loop until all the ids are processed.  We want to process as
    # many as possible with each request.  Unfortunately, errors can
    # occur.  Some id may be incorrect, or the server may be
    # unresponsive.  In addition, one broken id out of a list of id's
    # can cause a non-specific error.  Thus, the strategy is to start
    # by downloading as many as possible.  If the request fails, halve
    # the number of records requested.  If there's only one more
    # record, report it as broken and move on.  If the request
    # succeeds, double the number of records until the batchsize is
    # reached again.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)

        id_str = ','.join(ids[:current_batchsize])

        try:
            # Query PubMed.  If one or more of the id's are broken,
            # this will raise an IOError.
            handle = Entrez.efetch(db="pubmed",
                                   id=id_str,
                                   retmode='text',
                                   rettype='medlars')

            # Check that PubMed returned the same number of id's as
            # requested; if not, treat the batch as failed.  This could
            # take a lot of memory if the batchsize is large.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                raise IOError
            handle = File.StringHandle(results)
        except IOError:  # Query did not work.
            if current_batchsize == 1:
                # There was only 1 id in the query.  Report it as
                # broken and move on.
                broken_id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(broken_id)
            else:
                # It is unknown which one is broken.  Try again with
                # fewer id's.  Floor division keeps the batch size an
                # integer (true division would yield a float on
                # Python 3 and break the slicing above).
                current_batchsize = current_batchsize // 2
            nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Iterate through the results and pass the records to the
        # callback.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1

        ids = ids[current_batchsize:]

        # If not downloading the maximum number of articles,
        # double the number for next time.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize

示例#51

0

显示文件

"""Example script showing how to interact with PubMed."""
# standard library
import string

# biopython
from Bio import PubMed
from Bio import Medline

# do the search and get the ids
search_term = 'orchid'
orchid_ids = PubMed.search_for(search_term)

print orchid_ids

# access Medline through a dictionary interface that returns PubMed Records
rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser=rec_parser)

for id in orchid_ids[0:5]:
    cur_record = medline_dict[id]
    print 'title:', string.rstrip(cur_record.title)
    print 'authors:', cur_record.authors
    print 'source:', string.strip(cur_record.source)
    print

示例#52

0

显示文件

from Bio import Entrez, Medline

# Script: search PMC for a fixed query and print the authors, article ids
# and PMC id of every hit.
# NOTE: this snippet is truncated on the source page - it stops at the final
# dangling `else:` below, so the rest of the loop body is not available.
Entrez.email = "*****@*****.**"
# `count` and `dst` are set up here but not used in the visible part of the
# script. NOTE(review): presumably the CSV is written further down - confirm.
count = 1
dst = open("/tmp/pmc.csv", "w")

#for i in range(0,4):
# One search request: at most 400 ids, starting at offset 0.
search_handle = Entrez.esearch(db="pmc", usehistory="y", term='Multimodal AND "Deep Learning" AND (cancer OR tumour OR neoplasm)', retmax=400, retstart=0)
page_record = Entrez.read(search_handle)

# Fetch each hit individually in MEDLINE text format.
for pmcid in page_record['IdList']:
    print("Fetching pmcid = " + pmcid)
    fetch_handle = Entrez.efetch(db='pmc', rettype="medline", retmode="text", id=pmcid)
    records = Medline.parse(fetch_handle)
    for record in records:
        # 'AU' values are joined into one comma-separated string; empty if absent.
        if 'AU' in record:
            author = ','.join(record['AU'])
            print(author)
        else:
            author = ''

        # All 'AID' values are joined and kept under the name `doi`.
        if 'AID' in record:    
            doi = ','.join(record['AID'])
            print(doi)
        else:
            doi = ''

        if 'PMC' in record:    
            pmc = record['PMC']
            print(pmc)
        else:

示例#53

0

显示文件

y41, y42 = 2000, 2004
# D-1: search PubMed for `findkey`, limited to `nofref` hits dated between
# y11 and y12.
handle = Entrez.esearch(db="pubmed",
                        term=findkey,
                        retmax=nofref,
                        mindate=str(y11),
                        maxdate=str(y12))
record = Entrez.read(handle)
handle.close()
idlist = record["IdList"]

# Fetch the hits in MEDLINE text format and materialise them.
handle = Entrez.efetch(db="pubmed",
                       id=idlist,
                       rettype="medline",
                       retmode="text")
records = list(Medline.parse(handle))

# One row of five cells per record: PMID, title, abstract, date, type.
w, h = 5, len(records)
maa = [[None] * w for _ in range(h)]

for i, rec in enumerate(records):
    maa[i] = [
        rec.get("PMID", "?"),
        rec.get("TI", "?"),
        rec.get("AB", "?"),
        rec.get("DP"),  # no fallback: stays None when the date is missing
        rec.get("PT", "?"),
    ]

ma1 = pd.DataFrame(maa)
ma1.columns = ['PMID', 'TI', 'AB', 'DP', 'PT']

示例#54

0

显示文件

文件： edge_weight.py 项目： stresearch/covid_edge_weights

def collect_NCBI():
    """Build (or load) the per-PMID metadata dictionary ``pmid_dict``.

    If ``./{rel_name}/{rel_name}_pmid_dict.json`` exists, its content is
    merged into the global ``pmid_dict`` and returned without any network
    access. Otherwise every PMID in the global ``all_pmids`` is fetched
    from PubMed in MEDLINE format and an entry with the keys
    ``pmcid_number``, ``pmcid``, ``authors``, ``affiliations``, ``grants``,
    ``pub_date`` and ``citations`` is stored; the result is written to the
    JSON file and returned.

    For a record with a ``PMC`` field a second fetch is made and, if that
    record has all of ``FAU``, ``AD``, ``DCOM`` and ``GR``, the entry is
    built from it with ``pmcid`` True. In every other case (no ``PMC``,
    missing field, failed request) the entry is built from the first
    record with ``pmcid`` False and ``''`` for anything unavailable.
    """
    global all_pmids
    global pmid_dict

    cache_path = f'./{rel_name}/{rel_name}_pmid_dict.json'
    if os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            temp_dict = json.loads(f.read())
        pmid_dict.update(temp_dict)
        return pmid_dict

    for idx in tqdm(range(len(all_pmids))):
        pmid = all_pmids[idx]
        # get records for each pmid
        fetch_records_handle1 = efetch(db="pubmed",
                                       id=str(pmid),
                                       rettype="medline",
                                       retmode="text")
        # parse fetched records
        records1 = Medline.parse(fetch_records_handle1)

        # Need to iterate over records to extract information
        for record1 in records1:
            # Any failure in the PMC branch (no 'PMC' field, missing field
            # in the second record, request error) falls back to record1.
            try:
                # strip the leading "PMC" from the id
                id2 = record1['PMC'][3:]

                # NOTE(review): the stripped PMC number is looked up in
                # db="pubmed", i.e. it is used as if it were a PMID -
                # confirm this is intended.
                fetch_records_handle2 = efetch(db="pubmed",
                                               id=str(id2),
                                               rettype="medline",
                                               retmode="text")
                # parse records for pmcid
                records2 = Medline.parse(fetch_records_handle2)

                # Collect authors, affiliations, publication date,
                # citations and grants into pmid_dict.
                for record2 in records2:
                    authors = record2['FAU']
                    affiliations = record2['AD']
                    pub_date = record2['DCOM']
                    citations = get_links_id(pmid)
                    grants = record2['GR']
                    pmid_dict[pmid] = {
                        'pmcid_number': id2,
                        'pmcid': True,
                        'authors': authors,
                        'affiliations': affiliations,
                        'grants': grants,
                        'pub_date': pub_date,
                        'citations': citations
                    }
            except Exception:
                # `except Exception` (not a bare except) so Ctrl-C still
                # stops a long run. 'FAU' gets the same '' fallback as the
                # other fields; it used to raise KeyError for records
                # without authors.
                authors = record1.get('FAU', '')
                affiliations = record1.get('AD', '')
                pub_date = record1.get('DCOM', '')
                try:
                    citations = get_links_id(pmid)
                except Exception:
                    citations = ''
                grants = record1.get('GR', '')
                pmid_dict[pmid] = {
                    'pmcid_number': '',
                    'pmcid': False,
                    'authors': authors,
                    'affiliations': affiliations,
                    'grants': grants,
                    'pub_date': pub_date,
                    'citations': citations
                }

    with open(cache_path, 'w') as output:
        output.write(json.dumps(pmid_dict))

    return pmid_dict

示例#55

0

显示文件

文件： pubmed2doc.py 项目： AnasAlzahrani/pubmed2doc

def query_search_pubmed(query: str, ret_max: str, email: str, min_date: str,
                        max_date: str):
    """Search PubMed for a query and return the matching MEDLINE records.

    Parameters
    ----------
    query: a query to be searched against PubMed database

    ret_max: total number of records from query to be retrieved

    email: a user's email to access to the PubMed database

    min_date: the minimum or start date to search

    max_date: the maximum or end date to search
        The date range is only applied when both ``min_date`` and
        ``max_date`` are given (non-empty).

    Return
    -------
    An iterator over the records parsed by ``Medline.parse``. The records
    are parsed lazily from the open fetch handle, so the handle is
    intentionally left open for the caller to consume. When the search
    matches nothing, an empty iterator is returned and nothing is fetched.

    """

    Entrez.email = email

    # The history server is requested in every case: the fetch step below
    # relies on the WebEnv/QueryKey values that are only present in the
    # search result when usehistory='y' was sent.
    search_params = {
        "db": "pubmed",
        "term": query,
        "sort": "relevance",
        "retmode": "text",
        "retmax": ret_max,
        "usehistory": "y",
    }
    if min_date and max_date:
        search_params["mindate"] = min_date
        search_params["maxdate"] = max_date

    # search the PubMed db for the entered query
    search = Entrez.esearch(**search_params)
    try:
        search_records = Entrez.read(search)
    finally:
        search.close()

    # get the list of ids for the searched records
    list_ids = search_records['IdList']

    print(f"\nTotal of {len(list_ids)} records retrieved!")

    if not list_ids:
        # Nothing matched: skip the fetch request entirely.
        return iter(())

    # fetch the matched records in MEDLINE text format
    fetch_records = Entrez.efetch(db="pubmed",
                                  id=",".join(list_ids),
                                  rettype="Medline",
                                  retmode="text",
                                  webenv=search_records['WebEnv'],
                                  query_key=search_records['QueryKey'])

    # Medline.parse reads from the handle on demand, so it must stay open.
    return Medline.parse(fetch_records)

示例#56

0

显示文件

 def test_parse(self):
     """Check every field of the four records in Medline/pubmed_result2.txt.

     The records are read one at a time from the iterator returned by
     ``Medline.parse``; after the fourth record the iterator must be
     exhausted. The file is opened with a ``with`` block so the handle is
     closed even when one of the assertions fails.
     """
     with open("Medline/pubmed_result2.txt") as handle:
         records = Medline.parse(handle)

         # First record: PMID 16403221
         record = next(records)
         self.assertEqual(record["PMID"], "16403221")
         self.assertEqual(record["OWN"], "NLM")
         self.assertEqual(record["STAT"], "MEDLINE")
         self.assertEqual(record["DA"], "20060220")
         self.assertEqual(record["DCOM"], "20060314")
         self.assertEqual(record["PUBM"], "Electronic")
         self.assertEqual(record["IS"], "1471-2105 (Electronic)")
         self.assertEqual(record["VI"], "7")
         self.assertEqual(record["DP"], "2006")
         self.assertEqual(
             record["TI"],
             "A high level interface to SCOP and ASTRAL implemented in python.")
         self.assertEqual(record["PG"], "10")
         self.assertEqual(
             record["AB"],
             "BACKGROUND: Benchmarking algorithms in structural bioinformatics often involves the construction of datasets of proteins with given sequence and structural properties. The SCOP database is a manually curated structural classification which groups together proteins on the basis of structural similarity. The ASTRAL compendium provides non redundant subsets of SCOP domains on the basis of sequence similarity such that no two domains in a given subset share more than a defined degree of sequence similarity. Taken together these two resources provide a 'ground truth' for assessing structural bioinformatics algorithms. We present a small and easy to use API written in python to enable construction of datasets from these resources. RESULTS: We have designed a set of python modules to provide an abstraction of the SCOP and ASTRAL databases. The modules are designed to work as part of the Biopython distribution. Python users can now manipulate and use the SCOP hierarchy from within python programs, and use ASTRAL to return sequences of domains in SCOP, as well as clustered representations of SCOP from ASTRAL. CONCLUSION: The modules make the analysis and generation of datasets for use in structural genomics easier and more principled."
         )
         self.assertEqual(
             record["AD"],
             "Bioinformatics, Institute of Cell and Molecular Science, School of Medicine and Dentistry, Queen Mary, University of London, London EC1 6BQ, UK. [email protected]"
         )
         self.assertEqual(
             record["FAU"],
             ["Casbon, James A", "Crooks, Gavin E", "Saqi, Mansoor A S"])
         self.assertEqual(record["AU"], ["Casbon JA", "Crooks GE", "Saqi MA"])
         self.assertEqual(record["LA"], ["eng"])
         self.assertEqual(record["PT"],
                          ["Evaluation Studies", "Journal Article"])
         self.assertEqual(record["DEP"], "20060110")
         self.assertEqual(record["PL"], "England")
         self.assertEqual(record["TA"], "BMC Bioinformatics")
         self.assertEqual(record["JT"], "BMC bioinformatics")
         self.assertEqual(record["JID"], "100965194")
         self.assertEqual(record["SB"], "IM")
         self.assertEqual(record["MH"], [
             "*Database Management Systems", "*Databases, Protein",
             "Information Storage and Retrieval/*methods",
             "Programming Languages", "Sequence Alignment/*methods",
             "Sequence Analysis, Protein/*methods",
             "Sequence Homology, Amino Acid", "*Software",
             "*User-Computer Interface"
         ])
         self.assertEqual(record["PMC"], "PMC1373603")
         self.assertEqual(record["EDAT"], "2006/01/13 09:00")
         self.assertEqual(record["MHDA"], "2006/03/15 09:00")
         self.assertEqual(record["PHST"], [
             "2005/06/17 [received]", "2006/01/10 [accepted]",
             "2006/01/10 [aheadofprint]"
         ])
         self.assertEqual(
             record["AID"],
             ["1471-2105-7-10 [pii]", "10.1186/1471-2105-7-10 [doi]"])
         self.assertEqual(record["PST"], "epublish")
         self.assertEqual(record["SO"], "BMC Bioinformatics. 2006 Jan 10;7:10.")

         # Second record: PMID 16377612
         record = next(records)
         self.assertEqual(record["PMID"], "16377612")
         self.assertEqual(record["OWN"], "NLM")
         self.assertEqual(record["STAT"], "MEDLINE")
         self.assertEqual(record["DA"], "20060223")
         self.assertEqual(record["DCOM"], "20060418")
         self.assertEqual(record["LR"], "20061115")
         self.assertEqual(record["PUBM"], "Print-Electronic")
         self.assertEqual(record["IS"], "1367-4803 (Print)")
         self.assertEqual(record["VI"], "22")
         self.assertEqual(record["IP"], "5")
         self.assertEqual(record["DP"], "2006 Mar 1")
         self.assertEqual(
             record["TI"],
             "GenomeDiagram: a python package for the visualization of large-scale genomic data."
         )
         self.assertEqual(record["PG"], "616-7")
         self.assertEqual(
             record["AB"],
             "SUMMARY: We present GenomeDiagram, a flexible, open-source Python module for the visualization of large-scale genomic, comparative genomic and other data with reference to a single chromosome or other biological sequence. GenomeDiagram may be used to generate publication-quality vector graphics, rastered images and in-line streamed graphics for webpages. The package integrates with datatypes from the BioPython project, and is available for Windows, Linux and Mac OS X systems. AVAILABILITY: GenomeDiagram is freely available as source code (under GNU Public License) at http://bioinf.scri.ac.uk/lp/programs.html, and requires Python 2.3 or higher, and recent versions of the ReportLab and BioPython packages. SUPPLEMENTARY INFORMATION: A user manual, example code and images are available at http://bioinf.scri.ac.uk/lp/programs.html."
         )
         self.assertEqual(
             record["AD"],
             "Plant Pathogen Programme, Scottish Crop Research Institute, Invergowrie, Dundee DD2 5DA, Scotland, UK. [email protected]"
         )
         self.assertEqual(record["FAU"], [
             "Pritchard, Leighton", "White, Jennifer A", "Birch, Paul R J",
             "Toth, Ian K"
         ])
         self.assertEqual(record["AU"],
                          ["Pritchard L", "White JA", "Birch PR", "Toth IK"])
         self.assertEqual(record["LA"], ["eng"])
         self.assertEqual(
             record["PT"],
             ["Journal Article", "Research Support, Non-U.S. Gov't"])
         self.assertEqual(record["DEP"], "20051223")
         self.assertEqual(record["PL"], "England")
         self.assertEqual(record["TA"], "Bioinformatics")
         self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)")
         self.assertEqual(record["JID"], "9808944")
         self.assertEqual(record["SB"], "IM")
         self.assertEqual(record["MH"], [
             "Chromosome Mapping/*methods", "*Computer Graphics",
             "*Database Management Systems", "*Databases, Genetic",
             "Information Storage and Retrieval/methods",
             "*Programming Languages", "*Software", "*User-Computer Interface"
         ])
         self.assertEqual(record["EDAT"], "2005/12/27 09:00")
         self.assertEqual(record["MHDA"], "2006/04/19 09:00")
         self.assertEqual(record["PHST"], ["2005/12/23 [aheadofprint]"])
         self.assertEqual(
             record["AID"],
             ["btk021 [pii]", "10.1093/bioinformatics/btk021 [doi]"])
         self.assertEqual(record["PST"], "ppublish")
         self.assertEqual(
             record["SO"],
             "Bioinformatics. 2006 Mar 1;22(5):616-7. Epub 2005 Dec 23.")

         # Third record: PMID 14871861
         record = next(records)
         self.assertEqual(record["PMID"], "14871861")
         self.assertEqual(record["OWN"], "NLM")
         self.assertEqual(record["STAT"], "MEDLINE")
         self.assertEqual(record["DA"], "20040611")
         self.assertEqual(record["DCOM"], "20050104")
         self.assertEqual(record["LR"], "20061115")
         self.assertEqual(record["PUBM"], "Print-Electronic")
         self.assertEqual(record["IS"], "1367-4803 (Print)")
         self.assertEqual(record["VI"], "20")
         self.assertEqual(record["IP"], "9")
         self.assertEqual(record["DP"], "2004 Jun 12")
         self.assertEqual(record["TI"], "Open source clustering software.")
         self.assertEqual(record["PG"], "1453-4")
         self.assertEqual(
             record["AB"],
             "SUMMARY: We have implemented k-means clustering, hierarchical clustering and self-organizing maps in a single multipurpose open-source library of C routines, callable from other C and C++ programs. Using this library, we have created an improved version of Michael Eisen's well-known Cluster program for Windows, Mac OS X and Linux/Unix. In addition, we generated a Python and a Perl interface to the C Clustering Library, thereby combining the flexibility of a scripting language with the speed of C. AVAILABILITY: The C Clustering Library and the corresponding Python C extension module Pycluster were released under the Python License, while the Perl module Algorithm::Cluster was released under the Artistic License. The GUI code Cluster 3.0 for Windows, Macintosh and Linux/Unix, as well as the corresponding command-line program, were released under the same license as the original Cluster code. The complete source code is available at http://bonsai.ims.u-tokyo.ac.jp/mdehoon/software/cluster. Alternatively, Algorithm::Cluster can be downloaded from CPAN, while Pycluster is also available as part of the Biopython distribution."
         )
         self.assertEqual(
             record["AD"],
             "Human Genome Center, Institute of Medical Science, University of Tokyo, 4-6-1 Shirokanedai, Minato-ku, Tokyo, 108-8639 Japan. [email protected]"
         )
         self.assertEqual(
             record["FAU"],
             ["de Hoon, M J L", "Imoto, S", "Nolan, J", "Miyano, S"])
         self.assertEqual(record["AU"],
                          ["de Hoon MJ", "Imoto S", "Nolan J", "Miyano S"])
         self.assertEqual(record["LA"], ["eng"])
         self.assertEqual(record["PT"], [
             "Comparative Study", "Evaluation Studies", "Journal Article",
             "Validation Studies"
         ])
         self.assertEqual(record["DEP"], "20040210")
         self.assertEqual(record["PL"], "England")
         self.assertEqual(record["TA"], "Bioinformatics")
         self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)")
         self.assertEqual(record["JID"], "9808944")
         self.assertEqual(record["SB"], "IM")
         self.assertEqual(record["MH"], [
             "*Algorithms", "*Cluster Analysis",
             "Gene Expression Profiling/*methods",
             "Pattern Recognition, Automated/methods", "*Programming Languages",
             "Sequence Alignment/*methods", "Sequence Analysis, DNA/*methods",
             "*Software"
         ])
         self.assertEqual(record["EDAT"], "2004/02/12 05:00")
         self.assertEqual(record["MHDA"], "2005/01/05 09:00")
         self.assertEqual(record["PHST"], ["2004/02/10 [aheadofprint]"])
         self.assertEqual(
             record["AID"],
             ["10.1093/bioinformatics/bth078 [doi]", "bth078 [pii]"])
         self.assertEqual(record["PST"], "ppublish")
         self.assertEqual(
             record["SO"],
             "Bioinformatics. 2004 Jun 12;20(9):1453-4. Epub 2004 Feb 10.")

         # Fourth record: PMID 14630660
         record = next(records)
         self.assertEqual(record["PMID"], "14630660")
         self.assertEqual(record["OWN"], "NLM")
         self.assertEqual(record["STAT"], "MEDLINE")
         self.assertEqual(record["DA"], "20031121")
         self.assertEqual(record["DCOM"], "20040722")
         self.assertEqual(record["LR"], "20061115")
         self.assertEqual(record["PUBM"], "Print")
         self.assertEqual(record["IS"], "1367-4803 (Print)")
         self.assertEqual(record["VI"], "19")
         self.assertEqual(record["IP"], "17")
         self.assertEqual(record["DP"], "2003 Nov 22")
         self.assertEqual(
             record["TI"],
             "PDB file parser and structure class implemented in Python.")
         self.assertEqual(record["PG"], "2308-10")
         self.assertEqual(
             record["AB"],
             "The biopython project provides a set of bioinformatics tools implemented in Python. Recently, biopython was extended with a set of modules that deal with macromolecular structure. Biopython now contains a parser for PDB files that makes the atomic information available in an easy-to-use but powerful data structure. The parser and data structure deal with features that are often left out or handled inadequately by other packages, e.g. atom and residue disorder (if point mutants are present in the crystal), anisotropic B factors, multiple models and insertion codes. In addition, the parser performs some sanity checking to detect obvious errors. AVAILABILITY: The Biopython distribution (including source code and documentation) is freely available (under the Biopython license) from http://www.biopython.org"
         )
         self.assertEqual(
             record["AD"],
             "Department of Cellular and Molecular Interactions, Vlaams Interuniversitair Instituut voor Biotechnologie and Computational Modeling Lab, Department of Computer Science, Vrije Universiteit Brussel, Pleinlaan 2, 1050 Brussels, Belgium. [email protected]"
         )
         self.assertEqual(record["FAU"],
                          ["Hamelryck, Thomas", "Manderick, Bernard"])
         self.assertEqual(record["AU"], ["Hamelryck T", "Manderick B"])
         self.assertEqual(record["LA"], ["eng"])
         self.assertEqual(record["PT"], [
             "Comparative Study", "Evaluation Studies", "Journal Article",
             "Research Support, Non-U.S. Gov't", "Validation Studies"
         ])
         self.assertEqual(record["PL"], "England")
         self.assertEqual(record["TA"], "Bioinformatics")
         self.assertEqual(record["JT"], "Bioinformatics (Oxford, England)")
         self.assertEqual(record["JID"], "9808944")
         self.assertEqual(record["RN"], ["0 (Macromolecular Substances)"])
         self.assertEqual(record["SB"], "IM")
         self.assertEqual(record["MH"], [
             "Computer Simulation", "Database Management Systems/*standards",
             "*Databases, Protein",
             "Information Storage and Retrieval/*methods/*standards",
             "Macromolecular Substances", "*Models, Molecular",
             "*Programming Languages", "Protein Conformation", "*Software"
         ])
         self.assertEqual(record["EDAT"], "2003/11/25 05:00")
         self.assertEqual(record["MHDA"], "2004/07/23 05:00")
         self.assertEqual(record["PST"], "ppublish")
         self.assertEqual(record["SO"],
                          "Bioinformatics. 2003 Nov 22;19(17):2308-10.")

         # The file holds exactly four records. This check stays inside the
         # ``with`` block because the iterator still reads from the handle.
         self.assertRaises(StopIteration, next, records)

示例#57

0

显示文件

文件： medline_abstract.py 项目： kernyu/symb_project

from Bio import Medline

# example script to download medline version of the pubmed query
# esearch -db pubmed -query 'antimicrobial resistance' | efilter -mindate 1950 -maxdate 1990 -datetype PDAT | efetch -format medline > 50_90_medline.txt
# Read every record of the MEDLINE export into a list. The parser reads from
# the file on demand, so the list is built while the file is still open.
with open('../data/medline/10_18_medline.txt') as medline_file:
	medline = list(Medline.parse(medline_file))

# Print each record in turn (previously the first record, medline[0], was
# printed once per record instead of the current one).
for medline_entry in medline:
	print(medline_entry)
# 	outpath = 'mesh_10_18/'+ medline_entry.get('PMID')
# 	with open(outpath,'w') as file:
# 		file.write(str(medline_entry.get('MH')))
# 		file.close()

# example script to move empty files
# grep -lrIZ None | xargs -r0 mv -t nonefile/ --

示例#58

0

显示文件

文件： load_medline_online.py 项目： heyaqiong123/literature

    def handle(self, *args, **options):
        """Load the top PubMed hits for each search term into the database.

        Every positional argument is used as a PubMed search term. For each
        term up to 10 matching ids are requested, each id is fetched in
        MEDLINE text format, and a ``Citation`` is saved together with its
        publication types, authors, languages, affiliation, journal and MeSH
        terms/subheadings.

        Any error raised while storing a record is reported (PMID, exception
        type and the parsed record) and then re-raised, which stops the
        command. Both Entrez handles are closed as soon as they have been
        read.
        """
        for term in args:
            print("buscando [%s]" % term)

            search_handle = Entrez.esearch(db="pubmed", retmax=10, term=term)
            try:
                record = Entrez.read(search_handle)
            finally:
                search_handle.close()

            for pmid in record['IdList']:
                fetch_handle = Entrez.efetch(db="pubmed",
                                             id=pmid,
                                             rettype='medline',
                                             retmode='text')
                try:
                    lines = fetch_handle.readlines()
                finally:
                    fetch_handle.close()

                # One id was requested, so only the first record is used.
                r = next(Medline.parse(lines))
                try:
                    cit = Citation()
                    cit.pmid = int(r['PMID'])

                    cit.title = r.get('TI')
                    cit.abstract = r.get('AB')
                    cit.pagination = r.get('PG')
                    cit.copyright_information = " ; ".join(
                        r['CI']) if 'CI' in r else None

                    # dates
                    if 'CRDT' in r:
                        conv = time.strptime(r['CRDT'][0], "%Y/%m/%d %H:%M")
                        cit.date_created = datetime.datetime(*conv[:6])
                    if 'DCOM' in r:
                        # 'DCOM': '19990406'
                        conv = time.strptime(r['DCOM'], "%Y%m%d")
                        cit.date_completed = datetime.datetime(*conv[:6])
                    if 'LR' in r:
                        conv = time.strptime(r['LR'], "%Y%m%d")
                        cit.date_revised = datetime.datetime(*conv[:6])
                    if 'DEP' in r:
                        conv = time.strptime(r['DEP'], "%Y%m%d")
                        cit.date_electronic_publication = datetime.datetime(
                            *conv[:6])

                    # Saved before the relationships below are attached.
                    cit.save()

                    # relationships

                    # type
                    for pub_type in r.get('PT', ()):
                        (pt, created) = PubType.objects.get_or_create(
                            pub_type=pub_type)
                        cit.pub_types.add(pt)

                    # authors
                    # NOTE(review): assumes 'FAU' is present and index-aligned
                    # with 'AU'; a record without it raises here - confirm.
                    for i, author in enumerate(r.get('AU', ())):
                        if author != 'et al.':
                            (author_row,
                             created) = Author.objects.get_or_create(
                                 name=author, full_name=r['FAU'][i])
                            cit.authors.add(author_row)

                    # language
                    for lang in r.get('LA', ()):
                        (language, created) = Language.objects.get_or_create(
                            language=lang)
                        cit.languages.add(language)

                    # affiliation
                    if 'AD' in r:
                        (organization,
                         created) = Organization.objects.get_or_create(
                             name=r['AD'])
                        cit.affiliation = organization

                    # journal
                    if 'JID' in r:
                        # The default country is written into the record
                        # itself, so it also shows up in the error dump.
                        if 'PL' not in r:
                            r['PL'] = 'Mexico'

                        (journal, created) = Journal.objects.get_or_create(
                            jid=r['JID'],
                            issn=r.get('IS'),
                            volume=r.get('VI'),
                            issue=r.get('IP'),
                            title=r['JT'],
                            iso_abbreviation=r['TA'],
                            country=r['PL'])
                        cit.journal = journal

                    cit.save()

                    # meshterms
                    for mesh_term in r.get('MH', ()):
                        # "heading/sub1/sub2"; a leading '*' marks a major
                        # topic and is stripped before the lookup.
                        term_subs = mesh_term.split('/')
                        heading = term_subs[0]
                        major = heading.startswith('*')
                        if major:
                            heading = heading[1:]
                        (mh, created) = Meshterm.objects.get_or_create(
                            term=heading)

                        mc = Meshcitation.objects.create(meshterm=mh,
                                                         citation=cit,
                                                         major=major)

                        for subterm in term_subs[1:]:
                            major = subterm.startswith('*')
                            if major:
                                subterm = subterm[1:]

                            (sh, created) = Subheading.objects.get_or_create(
                                term=subterm)
                            Subheadingterm.objects.create(subheading=sh,
                                                          meshcitation=mc,
                                                          major=major)

                    self.stdout.write('%s' % cit)

                except:
                    # Deliberately bare: report whatever went wrong, then
                    # re-raise it unchanged. ``r.get`` keeps a record without
                    # a PMID from replacing the real error with a KeyError.
                    print("error trying to load %s" % r.get('PMID'))
                    import pprint
                    import sys
                    print(sys.exc_info()[0])
                    pprint.pprint(r)
                    raise

示例#59

0

显示文件

    abs_count.append(result["Count"])

    ids = result["IdList"]

    # only download abstracts if there are at least 5 available
    if len(ids) > 5:

        batches = [ids[x:x + 10] for x in range(0, len(ids), batch_size)]

        record_list = []
        for batch in tqdm(batches):
            h = Entrez.efetch(db="pubmed",
                              id=batch,
                              rettype="medline",
                              retmode="text")
            records = Medline.parse(h)
            record_list.extend(list(records))

        if len(record_list) != 0:  # if the recorsd list is not empty

            record_list_df = pd.DataFrame(record_list)  # make a data frame

            record_list_df = record_list_df[record_list_df['AB'].notna(
            )]  # keep rows without na in abstract column
            record_list_df['AB'] = record_list_df['AB'].str.lower(
            )  # make all text lower case
            record_list_df['AB'] = record_list_df['AB'].apply(
                remove_punctuation)  # remove punctuation

            text_list = [row.split(' ') for row in record_list_df['AB']
                         ]  # convert into list of single words

示例#60

0

显示文件

def download_abstracts(dataset, email):
    """Download the abstracts for a list of PubMed IDs.

    Uses the BioPython package.

    .. versionadded:: 0.0.2

    Parameters
    ----------
    dataset : :obj:`nimare.dataset.Dataset` or :obj:`list`
        A Dataset object where IDs are in the form PMID-EXPID, or a plain
        list of PubMed IDs.
    email : :obj:`str`
        Email address to use to call the PubMed API

    Returns
    -------
    dataset : :obj:`nimare.dataset.Dataset` or :obj:`pandas.DataFrame`
        When a Dataset was given, the same Dataset with the abstracts
        left-merged into its ``texts`` DataFrame on ``study_id``. When a
        list was given, a DataFrame with columns ``study_id`` and
        ``abstract``.

    Raises
    ------
    Exception
        If biopython is not installed, or if ``dataset`` is neither a
        Dataset nor a list.

    Warning
    -------
    This function assumes that the dataset uses identifiers in the format
    [PMID-EXPID]. Thus, the ``study_id`` column of the
    :obj:`nimare.dataset.Dataset.texts` DataFrame should correspond to PMID.
    """
    try:
        from Bio import Entrez, Medline
    except ImportError as err:
        # Chain the original error so the failing import stays visible.
        raise Exception(
            "Module biopython is required for downloading abstracts from PubMed."
        ) from err

    Entrez.email = email

    if isinstance(dataset, Dataset):
        # De-duplicate and sort the PMIDs taken from the texts DataFrame.
        pmids = sorted(set(dataset.texts["study_id"].astype(str).tolist()))
    elif isinstance(dataset, list):
        pmids = [str(pmid) for pmid in dataset]
    else:
        raise Exception(f"Dataset type not recognized: {type(dataset)}")

    records = []
    # PubMed only allows you to search ~1000 at a time. I chose 900 to be safe.
    chunks = [pmids[x:x + 900] for x in range(0, len(pmids), 900)]
    for i, chunk in enumerate(chunks):
        LGR.info(f"Downloading chunk {i + 1} of {len(chunks)}")
        h = Entrez.efetch(db="pubmed",
                          id=chunk,
                          rettype="medline",
                          retmode="text")
        try:
            # Medline.parse is lazy; consume it fully before closing.
            records.extend(Medline.parse(h))
        finally:
            h.close()

    # Pull data for studies with abstracts
    data = [[study["PMID"], study["AB"]] for study in records
            if study.get("AB", None)]
    df = pd.DataFrame(columns=["study_id", "abstract"], data=data)
    if not isinstance(dataset, Dataset):
        return df

    dataset.texts = pd.merge(dataset.texts, df, on="study_id", how="left")
    return dataset