Пример #1
0
    def get_protein_accession_map():
        """
        Query Entrez to get a map from its protein accession to ids
        """
        from Bio.EUtils import HistoryClient
        search_term = 'mouse[orgn]'
        #search_term = 'MYOD[Gene name] AND mouse[orgn]'

        # get a handle to the results
        client = HistoryClient.HistoryClient()
        results = client.search(db='protein', term=search_term)
        results_size = len(results)
        dbids = results.dbids.ids
        print '# results: %d' % results_size

        # download them bit by bit
        acc_2_id = cookbook.DictOfSets()
        step = 10000
        for start in xrange(0, results_size, step):
            size = min(step, results_size - start)
            end = start + size
            results.retstart = start
            results.retmax = end
            print 'Getting %d->%d' % (start, end)
            for id, acc in zip(dbids[start:end],
                               results.efetch(retmode='text', rettype='acc')):
                acc = acc.strip().split('.')[0]
                print acc, id
                acc_2_id[acc].add(
                    biopsy.transfac.DbRef.parse_as(
                        id, biopsy.transfac.db.entrez_protein))

        return acc_2_id
Пример #2
0
def mouse_proteins_result():
    """Search the Entrez 'protein' database for 'mouse[orgn]'.

    Returns whatever HistoryClient.search() returns for that query.
    """
    from Bio.EUtils import HistoryClient
    #query = 'MYOD1[Gene name] AND mouse[orgn]' # for testing
    query = 'mouse[orgn]'

    # build a fresh client and hand back the handle to the results
    return HistoryClient.HistoryClient().search(db='protein', term=query)
Пример #3
0
def write_mouse_protein_ids(filename):
    """Write the ids of all 'mouse[orgn]' protein hits to a file.

    Searches the Entrez 'protein' database and writes each id from the
    result on its own line.

    filename -- path of the output file; it is opened in 'w' mode, so an
                existing file is overwritten.
    """
    from Bio.EUtils import HistoryClient
    # Run the query and collect the ids before opening the output, so a
    # failed search does not leave an empty, truncated file behind.
    results = HistoryClient.HistoryClient().search(db='protein',
                                                   term='mouse[orgn]')
    dbids = results.dbids.ids

    f = open(filename, 'w')
    try:
        for dbid in dbids:
            f.write(dbid)
            f.write('\n')
    finally:
        # always release the handle, even if a write fails
        f.close()
Пример #4
0
    def testRelatedSequences(self):
        # Get all protein sequences similar to 4579714 (bacteriorhodopsin)
        # in FASTA format
        client = HistoryClient.HistoryClient(eutils=picklestore.client())

        result = client.post(EUtils.DBIds("protein", "4579714"))
        related = result.neighbor_links("protein")
        related_dbids = related.linksetdbs["protein_protein"].dbids

        proteins = client.post(related_dbids)
        fasta_infile = proteins.efetch(retmode="text", rettype="fasta")
        fasta = fasta_infile.read()
        print fasta[:1000]
Пример #5
0
    def testPostHistory(self):
        """Post two nucleotide ids, narrow them with a history search, and check the summaries.

        The posted ids must round-trip unchanged, the "#<query_key> AND
        dopamine" search must leave only 16445404, and the summary fields
        of each remaining record must match the expected values below.
        """
        entrez = HistoryClient.HistoryClient(eutils=picklestore.client())
        # Upload a couple nucleotide record identifiers
        dbids = Datatypes.DBIds("nucleotide", ["26285514", "16445404"])
        results = entrez.post(dbids)

        self.assertEquals(dbids, results.dbids)

        # Restrict to those which also mention "dopamine"
        results2 = entrez.search("#%s AND dopamine" % results.query_key,
                                 db=results.db)
        # Should be one hit
        self.assertEquals(Datatypes.DBIds("nucleotide", ["16445404"]),
                          results2.dbids)

        summary = results2[0].summary()
        self.assertEquals(summary.id, "16445404")
        # Expected summary fields, keyed by record id.  The title of
        # 16445404 had been mangled to "H**o sapiens", which could never
        # compare equal to the real record title.
        gold_data = {
            "16445404":
            (("Caption", "NM_000794"),
             ("Title", "Homo sapiens dopamine receptor D1 (DRD1), mRNA"),
             ("Extra", "gi|16445404|ref|NM_000794.2|"), ("Gi", 16445404),
             ("CreateDate", "1999/03/19"), ("UpdateDate", "2002/11/05"),
             ("Flags", 0), ("TaxId", 9606)),
            "26285514":
            (("Caption", "AY165035"),
             ("Title",
              "Saccharomyces cerevisiae scavenger mRNA decapping enzyme (DCS1) mRNA, complete cds"
              ), ("Extra", "gi|26285514|gb|AY165035.1|"), ("Gi", 26285514),
             ("CreateDate", "2002/12/10"), ("UpdateDate", "2002/12/10"),
             ("Flags", 0), ("TaxId", 4932)),
        }
        # Check the summary of the single record ...
        for fieldname, value in gold_data[summary.id]:
            self.assertEquals(summary.dataitems[fieldname], value)

        # ... and every summary of the whole result set
        for x in results2.summary():
            for fieldname, value in gold_data[x.id]:
                self.assertEquals(x.dataitems[fieldname], value)
Пример #6
0
    def testSeqFetch(self):
        """Fetch nucleotide 8886082 as GenBank text: whole, a sub-range, and per strand.

        Each check compares the second-to-last line of the fetched text
        (after _trim_lines) with the expected sequence line.
        """
        entrez = HistoryClient.HistoryClient(eutils=picklestore.client())
        posted = entrez.post(EUtils.DBIds("nucleotide", "8886082"))

        def genbank_lines(**seq_options):
            # Fetch as GenBank text and run the lines through _trim_lines
            # (defined elsewhere in this file) before they are inspected.
            fetched = posted.efetch(retmode="text",
                                    rettype="gb",
                                    **seq_options).readlines()
            _trim_lines(fetched)
            return fetched

        # Whole record
        whole = genbank_lines()
        self.assertEquals(whole[-2], "     2761 ttcctagata aacac\n")

        # First 60 bases
        head = genbank_lines(seq_start=1, seq_stop=60)
        self.assertEquals(
            head[-2],
            "        1 ttcagtttat ttggacggaa gaatggtggc tcaattattg"
            " acgacttgtg gcctaaattt\n")

        # Bases 3..40 on the plus strand
        plus = genbank_lines(seq_start=3,
                             seq_stop=40,
                             strand=EUtils.PLUS_STRAND)
        self.assertEquals(
            plus[-2], "        1 cagtttattt ggacggaaga atggtggctc aattattg\n")

        # Bases 3..40 on the minus strand
        minus = genbank_lines(seq_start=3,
                              seq_stop=40,
                              strand=EUtils.MINUS_STRAND)
        self.assertEquals(
            minus[-2], "        1 caataattga gccaccattc ttccgtccaa ataaactg\n")
Пример #7
0
    def get_protein_accession_map_2():
        import elementtree.ElementTree as ET
        search_term = 'mouse[orgn]'
        #search_term = 'MYOD[Gene name] AND mouse[orgn]'

        client = HistoryClient.HistoryClient()
        results = client.search(db='protein', term=search_term)
        results_size = len(results)
        print '# results: %d' % results_size

        acc_2_ids = cookbook.DictOfSets()
        step = 5000
        for start in xrange(0, results_size, step):
            results.retstart = start
            results.retmax = min(step, results_size - start)
            #results.retmax = 1000
            start = time.time()
            summary = results.summary()
            print 'Retrieving summary: %f secs' % (time.time() - start)
            for entry in summary:
                acc = entry.dataitems['Caption'].encode().strip().split('.')[0]
                acc_2_ids[acc].add(
                    biopsy.transfac.DbRef.parse_as(
                        entry.id.encode(), biopsy.transfac.db.entrez_protein))
Пример #8
0
    def testForNewItems(self):
        # See if there are Halobacterium salinarum hits

        known_dbids = EUtils.DBIds(
            'protein',
            [
                #            '461608', '133739', '121859', '121858', '114808',
                '26399547',
                '21362599',
                '20807968',
                '18310162',
                '10954592',
                '7674074',
                '3913880',
                '3913879',
                '3913877',
                '1350915',
                '462366',
                '462365',
                '461784',
                '119171',
                '21617825',
                '18202987',
                '14423949',
                '13878670',
                '13431821',
                '13124532',
                '12230579',
                '7531150',
                '6094165',
                '1352354',
                '1350833',
                '1169376',
                '140149',
                '133039',
                '133038',
                '132873',
                '132841',
                '114547',
                '114516',
                '18138454',
                '11385314',
                '11385312',
                '11385311',
                '11383975',
                '11383843',
                '11383705',
                '11383685',
                '11383631',
                '11383600',
                '11383544',
                '11383521',
                '11383449',
                '11383413',
                '11362758',
                '11362757',
                '11362756',
                '11362633',
                '11361208',
                '11361204',
                '11361203',
                '11361202',
                '11360968',
                '11360864',
                '11360863',
                '11360531',
                '11291672',
                '11290031',
                '11283208',
                '11281612',
                '11280741',
                '11280740',
                '11280737',
                '11280563',
                '11280281',
                '11280223',
                '11279456',
                '11277771',
                '11277752',
                '11277727',
                '11277613',
                '11277612',
                '11277611',
                '11277609',
                '11277608',
                '11277607',
                '11277606',
                '11277605',
                '11277604',
                '11277603',
                '11277602',
                '11277601',
                '11277597',
                '11277160',
                '11277109',
                '11277100',
                '11271301',
                '11270360',
                '11261072',
                '11260624',
                '11255573',
                '9972746',
                '7443396',
                '7440307',
                '7427602',
                '2129406',
                '2129405',
                '2129404',
                '2117914',
                '1363464',
                '1363463',
                '1084274',
                '1076149',
                '1076148',
                '1076147',
                '629399',
                '629398',
                '629396',
                '629394',
                '487075',
                '487074',
                '486684',
                '477959',
                '477726',
                '477027',
                '421715',
                '421714',
                '282664',
                '282663',
                '282662',
                '282661',
                '282660',
                '282659',
                '282658',
                '282657',
                '282656',
                '282655',
                '282654',
                '282653',
                '282652',
                '281164',
                '281162',
                '281161',
                '281160',
                '281159',
                '281158',
                '281157',
                '280348',
                '99214',
                '99212',
                '99211',
                '99210',
                '99209',
                '99207',
                '99206',
                '99205',
                '99203',
                '99202',
                '99201',
                '99200',
                '99198',
                '99195',
                '99192',
                '99191',
                '99190',
                '99188',
                '99187',
                '99185',
                '99183',
                '81076',
                '81071',
                '81070',
                '81069',
                '81067',
                '81065',
                '81063',
                '81062',
                '81061',
                '81060',
                '81058',
                '81057',
                '81055',
                '81054',
                '81052',
                '81051',
                '81047',
                '81045',
                '81043',
                '81042',
                '81041',
                '81040',
                '81039',
                '81038',
                '81037',
                '81036',
                '81035',
                '81034',
                '81032',
                '81031',
                '81029',
                '81026',
                '81023',
                '81022',
                '81020',
                '81019',
                '81018',
                '81017',
                '81016',
                '81015',
                '81013',
                '81012',
                '81011',
                '76372',
                '76329',
                '76324',
                '72632',
                '71284',
                '71211',
                '71179',
                '71146',
                '71063',
                '66382',
                '21264507',
                '18000395',
                '7531067',
                '132751',
                '117568',
                '14916722',
                '133992',
                '3122880',
                '114811',
                '24158915',
                '24158914',
                '24158913',
                '10640268',
                '3913878',
                '3023997',
                '544287',
                '19909691',
                '19909603',
                '3183098',
                '141254',
                '2492920',
                '809698',
                '43513',
                '18144841',
                '4467437',
                '4467436',
                '6226495',
                '3122803',
                '3122663',
                '2499389',
                '2494605',
                '1350834',
                '548760',
                '141356',
                '141353',
                '140804',
                '139955',
                '134663',
                '134662',
                '133462',
                '133078',
                '132785',
                '132646',
                '120245',
                '120232',
                '114812',
                '20873485',
                '20873480',
                '17942995',
                '17942994',
                '17942993',
                '20516540',
                '20151159',
                '20150922',
                '20150921',
                '12644298',
                '12644014',
                '120249',
                '10120917',
                '13124533',
                '133398',
                '11132078',
                '6225334',
                '1168858',
                '549772',
                '140827',
                '140825',
                '120239',
                '114838',
                '16974947',
                '16974946',
                '3219755',
                '347255',
                '8569313',
                '4929918',
                '899266',
                '10121032',
                '1621047',
                '14278674',
                '14278519',
                '11071687',
                '11071686',
                '11071685',
                '11071684',
                '6729723',
                '6729722',
                '6435624',
                '6435623',
                '3745775',
                '12655906',
                '12655904',
                '12655903',
                '6435626',
                '6435625',
                '11992133',
                '8918496',
                '10121037',
                '10121036',
                '10121033',
                '10120892',
                '10120851',
                '10120850',
                '2851428',
                '1710568',
                '1710498',
                '548772',
                '548769',
                '133031',
                '132995',
                '132985',
                '132879',
                '132784',
                '132750',
                '132728',
                '132708',
                '132639',
                '120251',
                '7327959',
                '7107278',
                '6435593',
                '455306',
                '455305',
                '455304',
                '455303',
                '455302',
                '455301',
                '455300',
                '455299',
                '455298',
                '455297',
                '150418',
                '150417',
                '6435594',
                '6172231',
                '6172230',
                '6172229',
                '6172228',
                '6172227',
                '6137453',
                '130153',
                '118549',
                '2499384',
                '2499383',
                '461612',
                '461611',
                '461610',
                '5822280',
                '285806',
                '4930169',
                '2425186',
                '2425185',
                '2425184',
                '2425183',
                '2425182',
                '2425181',
                '2425180',
                '2425179',
                '2425178',
                '2425177',
                '2425176',
                '2425175',
                '2425174',
                '3659953',
                '3659944',
                '4469246',
                '4001706',
                '4001704',
                '4388967',
                '4378986',
                '4377598',
                '809702',
                '809701',
                '43668',
                '43667',
                '43666',
                '43665',
                '43663',
                '43662',
                '43661',
                '43660',
                '43658',
                '43491',
                '4322492',
                '4104487',
                '4104485',
                '4104483',
                '3928158',
                '43508',
                '1633466',
                '2072795',
                '1527138',
                '1527137',
                '1353676',
                '3015619',
                '598123',
                '515085',
                '493889',
                '493888',
                '229726',
                '2351849',
                '2351848',
                '1154790',
                '1154789',
                '1154788',
                '1154787',
                '1154786',
                '1154785',
                '1154784',
                '1154783',
                '1154782',
                '1154781',
                '1154780',
                '1154779',
                '1154778',
                '1154777',
                '2190417',
                '2190416',
                '984742',
                '984741',
                '984740',
                '984739',
                '2760612',
                '2648028',
                '1070346',
                '1070345',
                '1070344',
                '2209068',
                '1487875',
                '1199752',
                '1199750',
                '509675',
                '509674',
                '285817',
                '285810',
                '285808',
                '216709',
                '43455',
                '43454',
                '43453',
                '43452',
                '43451',
                '43450',
                '807110',
                '1654427',
                '1654425',
                '1654423',
                '1654421',
                '1654419',
                '225948',
                '1583108',
                '1583107',
                '1094422',
                '448273',
                '226716',
                '226715',
                '350080',
                '226309',
                '225904',
                '225428',
                '1483625',
                '1235894',
                '1435134',
                '1435132',
                '1435131',
                '1435129',
                '994803',
                '994802',
                '226310',
                '223370',
                '223077',
                '223076',
                '223063',
                '1333716',
                '671101',
                '671100',
                '550341',
                '311841',
                '297410',
                '49046',
                '43656',
                '43655',
                '43654',
                '43653',
                '43641',
                '43640',
                '43559',
                '43557',
                '43552',
                '43551',
                '43550',
                '43548',
                '43546',
                '43545',
                '43544',
                '43543',
                '43542',
                '43541',
                '43540',
                '43539',
                '43537',
                '43535',
                '43534',
                '43533',
                '43531',
                '43530',
                '43526',
                '43525',
                '43524',
                '43523',
                '43522',
                '43521',
                '43520',
                '43519',
                '43518',
                '43517',
                '43511',
                '43510',
                '43505',
                '43504',
                '43503',
                '43501',
                '43499',
                '43498',
                '43496',
                '43495',
                '43493',
                '517390',
                '305353',
                '305352',
                '148816',
                '148814',
                '148812',
                '148794',
                '148793',
                '148792',
                '148768',
                '148767',
                '148766',
                '148764',
                '148763',
                '148759',
                '148757',
                '148753',
                '148749',
                '148747',
                '148745',
                '305350'
            ])

        # Upload to the server
        client = HistoryClient.HistoryClient(eutils=picklestore.client())

        old_records = client.post(known_dbids)

        # Now see if there's anything new
        new_records = client.search("Halobacterium salinarum BUTNOT #%s" %
                                    (old_records.query_key, ),
                                    db="protein")
        print
        print "There are", len(
            new_records), "new Halobacterium salinarum records"
        new_dbids = new_records.dbids
        print "The record identifiers are:", ", ".join(map(str, new_dbids))
        print new_records.efetch(retmode="text", rettype="summary").read()

        # These should exist, since I commented them out above :)
        for x in ('461608', '133739', '121859', '121858', '114808'):
            assert x in new_dbids, "Cannot find expected record %r" % x
Пример #9
0
def omim_snp_search(dnsnp_id):
    """Search for `dnsnp_id` with "omim" and return the "summary" fetch of the hits.

    The returned object is whatever efetch() hands back; it is not parsed.
    """
    hits = HistoryClient.HistoryClient().search(dnsnp_id, "omim")
    # how to parse the result??
    return hits.efetch("summary")
Пример #10
0
# TERMINOLOGY MAPPING FILES
MESH_TO_LABEL = "../terminology-mappings/MeSHToMedDRA/mesh_cui_to_label.txt"
MEDDRA_TO_MESH = "../terminology-mappings/MeSHToMedDRA/meshToMeddra-partial-05202014.txt"

# OUTPUT DATA FILE
PICKLE_FILE = "drug-hoi-test.pickle"
TEMP_STAGING_PICKLE = "temp.pickle"

# PUBLICATION TYPE FILTERS
RCT_FILTER = "Clinical Trial [PT]"
CASE_REPORT_FILTER = "Case Reports [PT]"
OTHER_FILTER = "NOT Case Reports [PT] NOT Clinical Trial [PT]"

################################################################################

# THE GLOBAL QUERY CLIENT
client1 = HistoryClient.HistoryClient()

# TEST DRUGS
DRUGS_D = {}
# Read the whole mapping file; the context manager closes the handle even
# if the read fails.
with open(RXNORM_TO_MESH, "r") as f:
    buf = f.read()
l = buf.split("\n")
for elt in l[1:]: # skip header
    # the first blank line ends the data
    if elt.strip() == "":
        break

    # use rxcui,mesh,pt for rxnorm-to-MeSH-mapping-03032014.txt

    # each row carries six '|'-separated fields
    (mesh,pt,rxcui,concept_name,ohdsi,conceptClass) = [x.strip() for x in elt.split("|")]
Пример #11
0
def client():
    """Returns a singleton HistoryClient we can work with.

    The client is created on the first call, stored in the module-level
    `_client`, and the same object is returned on every later call.
    """
    global _client
    # Identity test: an equality comparison against None can be answered
    # by the stored object's own __eq__.
    if _client is None:
        _client = HistoryClient.HistoryClient()
    return _client
Пример #12
0
 def setup(self):
     """Create the client and post one protein id and three nucleotide ids.

     Stores the client as self.entrez and the two post results as
     self.result1 (protein) and self.result2 (nucleotide).
     """
     store = picklestore.client(1)
     self.entrez = HistoryClient.HistoryClient(eutils=store)

     protein_ids = EUtils.DBIds("protein", "4579714")
     self.result1 = self.entrez.post(protein_ids)

     nucleotide_ids = EUtils.DBIds("nucleotide",
                                   ["18250303", "18250301", "18250299"])
     self.result2 = self.entrez.post(nucleotide_ids)
Пример #13
0
    def testHistory(self):
        """Run a series of history searches and check their metadata, ids and errors.

        Covers, in order: an author search with a date range and the
        parsed search expression it reports; the exact ids it returns; a
        search that must raise EUtilsSearchError; slicing of a larger
        result set; a sequence-only fetch option rejected with TypeError;
        combining history keys across and within databases; and fetching
        the same FASTA text through two different calls.
        """
        entrez = HistoryClient.HistoryClient(eutils=picklestore.client())

        # Author search restricted to a date range
        results1 = entrez.search("Dalke",
                                 field="au",
                                 daterange=EUtils.DateRange("1995", "1998"))
        self.assertEquals(len(results1), 10)
        # Collect the count of every Term found in the search expression
        sizes = []
        expression = results1.metadata.expression
        for x in expression:
            if isinstance(x, Datatypes.Term):
                n = x.count
                assert n, n  # cannot be 0 or None
                sizes.append(n)
        # Three terms are expected: the first needs a count of at least
        # 30, the other two must report -1
        self.assertEquals(len(sizes), 3)
        if sizes[0] < 30:
            raise AssertionError(sizes)
        self.assertEquals(sizes[1], -1)
        self.assertEquals(sizes[2], -1)

        # Left side of the expression is the author term
        self.assertEquals(expression.left.term, "Dalke[Author]")
        self.assertEquals(expression.left.field, "Author")

        # Right side holds the two ends of the date range
        self.assertEquals(expression.right.left.term, "1995[EDAT]")
        self.assertEquals(expression.right.left.field, "EDAT")
        self.assertEquals(expression.right.right.term, "1998[EDAT]")
        self.assertEquals(expression.right.right.field, "EDAT")

        expected_dbids = Datatypes.DBIds("pubmed", [
            "9454215", "9454196", "9454186", "9390282", "9303476", "9300720",
            "8763495", "8744570", "8566008", "7648552"
        ])
        self.assertEquals(results1.dbids, expected_dbids)

        # this is a no-no, since EDAT isn't a searchable field
        self.failUnlessRaises(EUtils.EUtilsSearchError,
                              entrez.search,
                              "poliovirus AND 1995:1998[EDAT]",
                              db="nucleotide")

        results2 = entrez.search("poliovirus AND 1995:1998[PDAT]", db="pubmed")

        # At least 1160 hits are required
        if len(results2) < 1160:
            raise AssertionError(len(results2))

        # Slicing the id list must agree with slicing the result set
        all_ids = results2.dbids
        self.assertEquals(len(all_ids), len(results2))
        self.assertEquals(all_ids[:20], results2[:20].dbids)
        self.assertEquals(all_ids[5:20], results2[5:20].dbids)
        self.assertEquals(all_ids[-5:], results2[-5:].dbids)
        self.assertEquals(all_ids[-5:-1], results2[-5:-1].dbids)
        self.assertEquals(all_ids[10:-14], results2[10:-14].dbids)

        # This is illegal because pubmed isn't a sequence database
        self.failUnlessRaises(TypeError, results2.efetch, seq_start=0)

        # Try a different database
        results3 = entrez.search("poliovirus AND 1995:1998[PDAT]",
                                 db="nucleotide")

        # Make sure I can still access fields from the first database
        self.assertEquals(results1.dbids, expected_dbids)

        # This is illegal because it mixes databases
        self.failUnlessRaises(
            EUtils.EUtilsSearchError, entrez.search,
            "#%s OR #%s" % (results1.query_key, results3.query_key))

        # However, this should yield the same as results3
        results4 = entrez.search("poliovirus", db="nucleotide")
        results5 = entrez.search("#%s AND 1995:1998[PDAT]" %
                                 results4.query_key,
                                 db=results4.db)
        self.assertEquals(len(results3), len(results5))
        results3_dbids = results3.dbids
        self.assertEquals(results3_dbids, results5.dbids)

        # Get the sequence as FASTA one way
        s = results3[0].efetch(retmode='text', rettype='fasta').read()
        # And another way
        t = entrez.eutils.efetch_using_dbids(results3_dbids[:1],
                                             retmode='text',
                                             rettype='fasta').read()
        # Both routes must produce identical text
        self.assertEquals(s, t)
Пример #14
0
from Bio import EUtils
from Bio.EUtils import HistoryClient

# Search the 'protein' database for 'Q8R5B6' and print the ids of the hits.
client = HistoryClient.HistoryClient()
records = client.search('Q8R5B6', db='protein')
print records.dbids

#result = client.post(EUtils.DBIds("protein", "4579714"))
#related = result.neighbor_links("protein")
#related_dbids = related.linksetdbs["protein_protein"].dbids
#proteins = client.post(related_dbids)
#len(proteins)
#infile = proteins.efetch(retmode = "text", rettype = "fasta")
#fasta = infile.read()
#print fasta[:788]