def get_protein_accession_map():
    """
    Query Entrez to get a map from its protein accession to ids.

    Searches the Entrez ``protein`` database for ``mouse[orgn]`` and fetches
    the accession of every hit in pages of 10000 records, printing progress
    and every (accession, id) pair as it goes.

    Returns:
        A ``cookbook.DictOfSets`` keyed by accession with everything from the
        first '.' onwards removed; each value collects the results of
        ``biopsy.transfac.DbRef.parse_as`` for the ids seen with that
        accession.
    """
    from Bio.EUtils import HistoryClient

    search_term = 'mouse[orgn]'
    #search_term = 'MYOD[Gene name] AND mouse[orgn]'

    # get a handle to the results
    client = HistoryClient.HistoryClient()
    results = client.search(db='protein', term=search_term)
    results_size = len(results)
    dbids = results.dbids.ids
    print('# results: %d' % results_size)

    # download them bit by bit
    acc_2_id = cookbook.DictOfSets()
    step = 10000
    for start in xrange(0, results_size, step):
        size = min(step, results_size - start)
        end = start + size
        results.retstart = start
        # retmax is the number of records to return, not an end offset;
        # using `end` here asked for ever larger pages on each iteration.
        results.retmax = size
        print('Getting %d->%d' % (start, end))
        accessions = results.efetch(retmode='text', rettype='acc')
        # One accession line is expected per id of the current page.
        for dbid, acc in zip(dbids[start:end], accessions):
            acc = acc.strip().split('.')[0]
            print('%s %s' % (acc, dbid))
            acc_2_id[acc].add(
                biopsy.transfac.DbRef.parse_as(
                    dbid, biopsy.transfac.db.entrez_protein))
    return acc_2_id
def mouse_proteins_result():
    """
    Search the Entrez ``protein`` database for ``mouse[orgn]``.

    Returns:
        Whatever ``HistoryClient.search`` returns for that query.
    """
    from Bio.EUtils import HistoryClient

    #query = 'MYOD1[Gene name] AND mouse[orgn]' # for testing
    query = 'mouse[orgn]'

    # get a handle to the results
    entrez = HistoryClient.HistoryClient()
    hits = entrez.search(db='protein', term=query)
    return hits
def write_mouse_protein_ids(filename):
    """
    Write the ids of all ``mouse[orgn]`` protein hits to a file.

    Args:
        filename: Path of the output file; it is overwritten. One id is
            written per line.
    """
    from Bio.EUtils import HistoryClient

    # Run the remote query before opening the output file, so a failed
    # search does not leave behind a truncated, empty file.
    results = HistoryClient.HistoryClient().search(db='protein',
                                                   term='mouse[orgn]')
    dbids = results.dbids.ids

    f = open(filename, 'w')
    try:
        for dbid in dbids:
            f.write(dbid)
            f.write('\n')
    finally:
        # Always release the handle, even if a write fails.
        f.close()
def testRelatedSequences(self):
    """Fetch the protein neighbours of record 4579714 as FASTA and print
    the first 1000 characters of the result."""
    # Get all protein sequences similar to 4579714 (bacteriorhodopsin)
    # in FASTA format
    entrez = HistoryClient.HistoryClient(eutils=picklestore.client())

    seed = entrez.post(EUtils.DBIds("protein", "4579714"))
    neighbours = seed.neighbor_links("protein")
    neighbour_dbids = neighbours.linksetdbs["protein_protein"].dbids

    uploaded = entrez.post(neighbour_dbids)
    handle = uploaded.efetch(retmode="text", rettype="fasta")
    text = handle.read()
    print(text[:1000])
def testPostHistory(self):
    """Post two nucleotide ids, narrow them with a history query and
    compare the returned summaries against known field values."""
    entrez = HistoryClient.HistoryClient(eutils=picklestore.client())

    # Upload a couple nucleotide record identifiers
    dbids = Datatypes.DBIds("nucleotide", ["26285514", "16445404"])
    results = entrez.post(dbids)
    self.assertEquals(dbids, results.dbids)

    # Restrict to those which also mention "dopamine"
    results2 = entrez.search("#%s AND dopamine" % results.query_key,
                             db=results.db)

    # Should be one hit
    self.assertEquals(Datatypes.DBIds("nucleotide", ["16445404"]),
                      results2.dbids)

    summary = results2[0].summary()
    self.assertEquals(summary.id, "16445404")

    # Expected summary fields per record id, as (field name, value) pairs.
    gold_data = {
        "16445404": (
            ("Caption", "NM_000794"),
            # The title had been masked to "H**o sapiens ..."; the masked
            # text can never equal the title of the actual record.
            ("Title", "Homo sapiens dopamine receptor D1 (DRD1), mRNA"),
            ("Extra", "gi|16445404|ref|NM_000794.2|"),
            ("Gi", 16445404),
            ("CreateDate", "1999/03/19"),
            ("UpdateDate", "2002/11/05"),
            ("Flags", 0),
            ("TaxId", 9606),
        ),
        "26285514": (
            ("Caption", "AY165035"),
            ("Title",
             "Saccharomyces cerevisiae scavenger mRNA decapping enzyme (DCS1) mRNA, complete cds"),
            ("Extra", "gi|26285514|gb|AY165035.1|"),
            ("Gi", 26285514),
            ("CreateDate", "2002/12/10"),
            ("UpdateDate", "2002/12/10"),
            ("Flags", 0),
            ("TaxId", 4932),
        ),
    }

    # Check the summary fetched through the single-record slice ...
    for fieldname, value in gold_data[summary.id]:
        self.assertEquals(summary.dataitems[fieldname], value)

    # ... and every summary fetched through the whole result set.
    for x in results2.summary():
        for fieldname, value in gold_data[x.id]:
            self.assertEquals(x.dataitems[fieldname], value)
def testSeqFetch(self):
    """Fetch nucleotide record 8886082 in GenBank format, whole and as
    sub-ranges on either strand, and check the last sequence line.

    NOTE(review): the expected lines had lost their line breaks and leading
    padding (each literal began with a stray backslash-space and could never
    match). They are rebuilt here in the GenBank ORIGIN layout: the position
    right-justified in 9 columns, then space-separated 10-base groups, then
    a newline. This assumes ``_trim_lines`` leaves ``lines[-2]`` as an
    unmodified sequence line -- confirm against that helper.
    """
    entrez = HistoryClient.HistoryClient(eutils=picklestore.client())
    dbids = EUtils.DBIds("nucleotide", "8886082")
    result = entrez.post(dbids)

    # Whole record: the last sequence line starts at base 2761.
    lines = result.efetch(retmode="text", rettype="gb").readlines()
    _trim_lines(lines)
    self.assertEquals(lines[-2], "     2761 ttcctagata aacac\n")

    # First 60 bases only.
    lines = result.efetch(retmode="text", rettype="gb",
                          seq_start=1, seq_stop=60).readlines()
    _trim_lines(lines)
    self.assertEquals(
        lines[-2],
        "        1 ttcagtttat ttggacggaa gaatggtggc tcaattattg"
        " acgacttgtg gcctaaattt\n")

    # Bases 3..40 on the plus strand.
    lines = result.efetch(retmode="text", rettype="gb",
                          seq_start=3, seq_stop=40,
                          strand=EUtils.PLUS_STRAND).readlines()
    _trim_lines(lines)
    self.assertEquals(
        lines[-2],
        "        1 cagtttattt ggacggaaga atggtggctc aattattg\n")

    # Same range on the minus strand.
    lines = result.efetch(retmode="text", rettype="gb",
                          seq_start=3, seq_stop=40,
                          strand=EUtils.MINUS_STRAND).readlines()
    _trim_lines(lines)
    self.assertEquals(
        lines[-2],
        "        1 caataattga gccaccattc ttccgtccaa ataaactg\n")
def get_protein_accession_map_2():
    """
    Build a map from protein accession to ids using Entrez summaries.

    Searches the Entrez ``protein`` database for ``mouse[orgn]`` and reads
    the summaries in pages of 5000, printing how long each page took.

    Returns:
        A ``cookbook.DictOfSets`` keyed by each summary's 'Caption' with
        everything from the first '.' onwards removed; each value collects
        the results of ``biopsy.transfac.DbRef.parse_as`` for the entry ids.
        (Previously the map was built but never returned.)
    """
    search_term = 'mouse[orgn]'
    #search_term = 'MYOD[Gene name] AND mouse[orgn]'

    client = HistoryClient.HistoryClient()
    results = client.search(db='protein', term=search_term)
    results_size = len(results)
    print('# results: %d' % results_size)

    acc_2_ids = cookbook.DictOfSets()
    step = 5000
    for start in xrange(0, results_size, step):
        results.retstart = start
        results.retmax = min(step, results_size - start)
        #results.retmax = 1000

        # Separate name for the timer so the page offset is not clobbered.
        began = time.time()
        summary = results.summary()
        print('Retrieving summary: %f secs' % (time.time() - began))

        for entry in summary:
            acc = entry.dataitems['Caption'].encode().strip().split('.')[0]
            acc_2_ids[acc].add(
                biopsy.transfac.DbRef.parse_as(
                    entry.id.encode(), biopsy.transfac.db.entrez_protein))
    return acc_2_ids
def testForNewItems(self):
    """Upload a list of known Halobacterium salinarum protein ids, search
    for records outside that list, print them, and assert that the five
    ids left out of the list are among the hits."""
    # See if there are Halobacterium salinarum hits
    known_dbids = EUtils.DBIds(
        'protein',
        [
            # '461608', '133739', '121859', '121858', '114808',
            '26399547', '21362599', '20807968', '18310162', '10954592', '7674074', '3913880', '3913879',
            '3913877', '1350915', '462366', '462365', '461784', '119171', '21617825', '18202987',
            '14423949', '13878670', '13431821', '13124532', '12230579', '7531150', '6094165', '1352354',
            '1350833', '1169376', '140149', '133039', '133038', '132873', '132841', '114547',
            '114516', '18138454', '11385314', '11385312', '11385311', '11383975', '11383843', '11383705',
            '11383685', '11383631', '11383600', '11383544', '11383521', '11383449', '11383413', '11362758',
            '11362757', '11362756', '11362633', '11361208', '11361204', '11361203', '11361202', '11360968',
            '11360864', '11360863', '11360531', '11291672', '11290031', '11283208', '11281612', '11280741',
            '11280740', '11280737', '11280563', '11280281', '11280223', '11279456', '11277771', '11277752',
            '11277727', '11277613', '11277612', '11277611', '11277609', '11277608', '11277607', '11277606',
            '11277605', '11277604', '11277603', '11277602', '11277601', '11277597', '11277160', '11277109',
            '11277100', '11271301', '11270360', '11261072', '11260624', '11255573', '9972746', '7443396',
            '7440307', '7427602', '2129406', '2129405', '2129404', '2117914', '1363464', '1363463',
            '1084274', '1076149', '1076148', '1076147', '629399', '629398', '629396', '629394',
            '487075', '487074', '486684', '477959', '477726', '477027', '421715', '421714',
            '282664', '282663', '282662', '282661', '282660', '282659', '282658', '282657',
            '282656', '282655', '282654', '282653', '282652', '281164', '281162', '281161',
            '281160', '281159', '281158', '281157', '280348', '99214', '99212', '99211',
            '99210', '99209', '99207', '99206', '99205', '99203', '99202', '99201',
            '99200', '99198', '99195', '99192', '99191', '99190', '99188', '99187',
            '99185', '99183', '81076', '81071', '81070', '81069', '81067', '81065',
            '81063', '81062', '81061', '81060', '81058', '81057', '81055', '81054',
            '81052', '81051', '81047', '81045', '81043', '81042', '81041', '81040',
            '81039', '81038', '81037', '81036', '81035', '81034', '81032', '81031',
            '81029', '81026', '81023', '81022', '81020', '81019', '81018', '81017',
            '81016', '81015', '81013', '81012', '81011', '76372', '76329', '76324',
            '72632', '71284', '71211', '71179', '71146', '71063', '66382', '21264507',
            '18000395', '7531067', '132751', '117568', '14916722', '133992', '3122880', '114811',
            '24158915', '24158914', '24158913', '10640268', '3913878', '3023997', '544287', '19909691',
            '19909603', '3183098', '141254', '2492920', '809698', '43513', '18144841', '4467437',
            '4467436', '6226495', '3122803', '3122663', '2499389', '2494605', '1350834', '548760',
            '141356', '141353', '140804', '139955', '134663', '134662', '133462', '133078',
            '132785', '132646', '120245', '120232', '114812', '20873485', '20873480', '17942995',
            '17942994', '17942993', '20516540', '20151159', '20150922', '20150921', '12644298', '12644014',
            '120249', '10120917', '13124533', '133398', '11132078', '6225334', '1168858', '549772',
            '140827', '140825', '120239', '114838', '16974947', '16974946', '3219755', '347255',
            '8569313', '4929918', '899266', '10121032', '1621047', '14278674', '14278519', '11071687',
            '11071686', '11071685', '11071684', '6729723', '6729722', '6435624', '6435623', '3745775',
            '12655906', '12655904', '12655903', '6435626', '6435625', '11992133', '8918496', '10121037',
            '10121036', '10121033', '10120892', '10120851', '10120850', '2851428', '1710568', '1710498',
            '548772', '548769', '133031', '132995', '132985', '132879', '132784', '132750',
            '132728', '132708', '132639', '120251', '7327959', '7107278', '6435593', '455306',
            '455305', '455304', '455303', '455302', '455301', '455300', '455299', '455298',
            '455297', '150418', '150417', '6435594', '6172231', '6172230', '6172229', '6172228',
            '6172227', '6137453', '130153', '118549', '2499384', '2499383', '461612', '461611',
            '461610', '5822280', '285806', '4930169', '2425186', '2425185', '2425184', '2425183',
            '2425182', '2425181', '2425180', '2425179', '2425178', '2425177', '2425176', '2425175',
            '2425174', '3659953', '3659944', '4469246', '4001706', '4001704', '4388967', '4378986',
            '4377598', '809702', '809701', '43668', '43667', '43666', '43665', '43663',
            '43662', '43661', '43660', '43658', '43491', '4322492', '4104487', '4104485',
            '4104483', '3928158', '43508', '1633466', '2072795', '1527138', '1527137', '1353676',
            '3015619', '598123', '515085', '493889', '493888', '229726', '2351849', '2351848',
            '1154790', '1154789', '1154788', '1154787', '1154786', '1154785', '1154784', '1154783',
            '1154782', '1154781', '1154780', '1154779', '1154778', '1154777', '2190417', '2190416',
            '984742', '984741', '984740', '984739', '2760612', '2648028', '1070346', '1070345',
            '1070344', '2209068', '1487875', '1199752', '1199750', '509675', '509674', '285817',
            '285810', '285808', '216709', '43455', '43454', '43453', '43452', '43451',
            '43450', '807110', '1654427', '1654425', '1654423', '1654421', '1654419', '225948',
            '1583108', '1583107', '1094422', '448273', '226716', '226715', '350080', '226309',
            '225904', '225428', '1483625', '1235894', '1435134', '1435132', '1435131', '1435129',
            '994803', '994802', '226310', '223370', '223077', '223076', '223063', '1333716',
            '671101', '671100', '550341', '311841', '297410', '49046', '43656', '43655',
            '43654', '43653', '43641', '43640', '43559', '43557', '43552', '43551',
            '43550', '43548', '43546', '43545', '43544', '43543', '43542', '43541',
            '43540', '43539', '43537', '43535', '43534', '43533', '43531', '43530',
            '43526', '43525', '43524', '43523', '43522', '43521', '43520', '43519',
            '43518', '43517', '43511', '43510', '43505', '43504', '43503', '43501',
            '43499', '43498', '43496', '43495', '43493', '517390', '305353', '305352',
            '148816', '148814', '148812', '148794', '148793', '148792', '148768', '148767',
            '148766', '148764', '148763', '148759', '148757', '148753', '148749', '148747',
            '148745', '305350'
        ])

    # Upload to the server
    entrez = HistoryClient.HistoryClient(eutils=picklestore.client())
    uploaded = entrez.post(known_dbids)

    # Now see if there's anything new
    query = "Halobacterium salinarum BUTNOT #%s" % (uploaded.query_key, )
    fresh = entrez.search(query, db="protein")

    print('')
    print("%s %s %s" % ("There are", len(fresh),
                        "new Halobacterium salinarum records"))
    new_dbids = fresh.dbids
    print("%s %s" % ("The record identifiers are:",
                     ", ".join(map(str, new_dbids))))
    print(fresh.efetch(retmode="text", rettype="summary").read())

    # These should exist, since I commented them out above :)
    held_back = ('461608', '133739', '121859', '121858', '114808')
    for dbid in held_back:
        assert dbid in new_dbids, "Cannot find expected record %r" % dbid
def omim_snp_search(dnsnp_id):
    """
    Search the "omim" database for *dnsnp_id*.

    Returns:
        The unparsed result of ``efetch("summary")`` on the search hits.
    """
    entrez = HistoryClient.HistoryClient()
    hits = entrez.search(dnsnp_id, "omim")
    # how to parse the result??
    return hits.efetch("summary")
# Module-level configuration followed by the start of the drug-mapping loader.
# NOTE(review): RXNORM_TO_MESH is defined outside the visible part of the file.

# Terminology mapping files (paths are relative to the working directory).
MESH_TO_LABEL = "../terminology-mappings/MeSHToMedDRA/mesh_cui_to_label.txt"
MEDDRA_TO_MESH = "../terminology-mappings/MeSHToMedDRA/meshToMeddra-partial-05202014.txt"

# OUTPUT DATA FILE
PICKLE_FILE = "drug-hoi-test.pickle"
TEMP_STAGING_PICKLE = "temp.pickle"

# PUBLICATION TYPE FILTERS
# presumably appended to search terms to restrict hits by publication type;
# they are not used within the visible code -- TODO confirm
RCT_FILTER = "Clinical Trial [PT]"
CASE_REPORT_FILTER = "Case Reports [PT]"
OTHER_FILTER = "NOT Case Reports [PT] NOT Clinical Trial [PT]"

################################################################################

# THE GLOBAL QUERY CLIENT
client1 = HistoryClient.HistoryClient()

# TEST DRUGS
DRUGS_D = {}
# Read the whole mapping file into memory and split it into lines.
f = open(RXNORM_TO_MESH,"r")
buf = f.read()
f.close()
l = buf.split("\n")
for elt in l[1:]: # skip header
    # Stop at the first blank line; any rows after it are not read.
    if elt.strip() == "":
        break

    # use rxcui,mesh,pt for rxnorm-to-MeSH-mapping-03032014.txt
    # Each row must have exactly six '|'-separated fields, or the unpacking
    # below raises ValueError.
    # NOTE(review): nothing is stored in DRUGS_D within the visible code; the
    # rest of the loop body is presumably outside this view.
    (mesh,pt,rxcui,concept_name,ohdsi,conceptClass) = [x.strip() for x in elt.split("|")]
def client():
    """Returns a singleton HistoryClient we can work with.

    The client is created on the first call and stored in the module-level
    ``_client``; later calls return that same object.
    """
    global _client
    # Identity test: `None == _client` would defer to the stored object's
    # own __eq__, which may report equality with None and cause the
    # singleton to be rebuilt.
    if _client is None:
        _client = HistoryClient.HistoryClient()
    return _client
def setup(self):
    """Create the shared client and post one protein id and three
    nucleotide ids, keeping the results on ``self`` for the tests."""
    self.entrez = HistoryClient.HistoryClient(
        eutils=picklestore.client(1))

    protein_ids = EUtils.DBIds("protein", "4579714")
    self.result1 = self.entrez.post(protein_ids)

    nucleotide_ids = EUtils.DBIds(
        "nucleotide", ["18250303", "18250301", "18250299"])
    self.result2 = self.entrez.post(nucleotide_ids)
def testHistory(self):
    """Exercise history searches: parsed query expression, result slicing,
    invalid queries, and combining history keys across searches."""
    entrez = HistoryClient.HistoryClient(eutils=picklestore.client())

    author_hits = entrez.search(
        "Dalke", field="au", daterange=EUtils.DateRange("1995", "1998"))
    self.assertEquals(len(author_hits), 10)

    # Collect the hit count of every Term node in the parsed expression.
    expression = author_hits.metadata.expression
    sizes = []
    for node in expression:
        if not isinstance(node, Datatypes.Term):
            continue
        count = node.count
        assert count, count  # cannot be 0 or None
        sizes.append(count)

    self.assertEquals(len(sizes), 3)
    if sizes[0] < 30:
        raise AssertionError(sizes)
    self.assertEquals(sizes[1], -1)
    self.assertEquals(sizes[2], -1)

    author_node = expression.left
    self.assertEquals(author_node.term, "Dalke[Author]")
    self.assertEquals(author_node.field, "Author")
    self.assertEquals(expression.right.left.term, "1995[EDAT]")
    self.assertEquals(expression.right.left.field, "EDAT")
    self.assertEquals(expression.right.right.term, "1998[EDAT]")
    self.assertEquals(expression.right.right.field, "EDAT")

    expected_dbids = Datatypes.DBIds("pubmed", [
        "9454215", "9454196", "9454186", "9390282", "9303476",
        "9300720", "8763495", "8744570", "8566008", "7648552",
    ])
    self.assertEquals(author_hits.dbids, expected_dbids)

    # this is a no-no, since EDAT isn't a searchable field
    self.failUnlessRaises(EUtils.EUtilsSearchError,
                          entrez.search,
                          "poliovirus AND 1995:1998[EDAT]",
                          db="nucleotide")

    pubmed_polio = entrez.search("poliovirus AND 1995:1998[PDAT]",
                                 db="pubmed")
    if len(pubmed_polio) < 1160:
        raise AssertionError(len(pubmed_polio))

    # Slicing the result set must agree with slicing its id list.
    all_ids = pubmed_polio.dbids
    self.assertEquals(len(all_ids), len(pubmed_polio))
    self.assertEquals(all_ids[:20], pubmed_polio[:20].dbids)
    self.assertEquals(all_ids[5:20], pubmed_polio[5:20].dbids)
    self.assertEquals(all_ids[-5:], pubmed_polio[-5:].dbids)
    self.assertEquals(all_ids[-5:-1], pubmed_polio[-5:-1].dbids)
    self.assertEquals(all_ids[10:-14], pubmed_polio[10:-14].dbids)

    # This is illegal because pubmed isn't a sequence database
    self.failUnlessRaises(TypeError, pubmed_polio.efetch, seq_start=0)

    # Try a different database
    nuc_polio = entrez.search("poliovirus AND 1995:1998[PDAT]",
                              db="nucleotide")

    # Make sure I can still access fields from the first database
    self.assertEquals(author_hits.dbids, expected_dbids)

    # This is illegal because it mixes databases
    mixed_query = "#%s OR #%s" % (author_hits.query_key,
                                  nuc_polio.query_key)
    self.failUnlessRaises(EUtils.EUtilsSearchError,
                          entrez.search,
                          mixed_query)

    # However, this should yield the same as nuc_polio
    nuc_all = entrez.search("poliovirus", db="nucleotide")
    nuc_dated = entrez.search(
        "#%s AND 1995:1998[PDAT]" % nuc_all.query_key, db=nuc_all.db)
    self.assertEquals(len(nuc_polio), len(nuc_dated))
    nuc_polio_dbids = nuc_polio.dbids
    self.assertEquals(nuc_polio_dbids, nuc_dated.dbids)

    # Get the sequence as FASTA one way
    via_history = nuc_polio[0].efetch(retmode='text',
                                      rettype='fasta').read()
    # And another way
    via_dbids = entrez.eutils.efetch_using_dbids(
        nuc_polio_dbids[:1], retmode='text', rettype='fasta').read()
    self.assertEquals(via_history, via_dbids)
# Scratch script: search the Entrez protein database for 'Q8R5B6' and print
# the ids of the hits.
from Bio import EUtils
from Bio.EUtils import HistoryClient

client = HistoryClient.HistoryClient()
records = client.search('Q8R5B6', db='protein')
print records.dbids

# Disabled experiment: post a protein id, follow its protein neighbour links
# and fetch the related records as FASTA.
#result = client.post(EUtils.DBIds("protein", "4579714"))
#related = result.neighbor_links("protein")
#related_dbids = related.linksetdbs["protein_protein"].dbids
#proteins = client.post(related_dbids)
#len(proteins)
#infile = proteins.efetch(retmode = "text", rettype = "fasta")
#fasta = infile.read()
#print fasta[:788]