def testParsing(self):
    src = os.path.join(base, "search-0.xml")

    r = Reader()
    r.parse(Compat.ElementTree.ElementTree(file=open(src)), self.db)

    tmp = pybut.dbname()
    fd = open(tmp, "w")
    self.db.xmlwrite(fd)
    fd.close()

    pybut.fileeq(tmp, pybut.fp("ut_pubmed", "result.bip"))
class PubMed(object):
    """ A connection to the PubMed database """

    schema = 'org.pybliographer/pubmed/0.1'

    baseURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils'

    BATCH_SIZE = 500

    toolName = 'pybliographer'
    adminEmail = '*****@*****.**'

    log = logging.getLogger('pyblio.external.pubmed')

    SRV_SEARCH = '/esearch.fcgi'
    SRV_FETCH = '/efetch.fcgi'

    def __init__(self, db):
        self.db = db
        self.reader = Reader()
        self._pending = None

    def count(self, query, db='PubMed'):
        assert self._pending is None, \
            'no more than one search at a time per connection'

        data = {'db': db, 'term': query}

        req = self._send_query(self.SRV_SEARCH, data, rettype='count')

        def success(data):
            return int(data.find('./Count').text)

        return req.addCallback(success)

    def search(self, query, maxhits=500, db='PubMed'):
        query = query.strip()

        # The result set that will contain the data
        self._rs = self.db.rs.new()
        self._rs.name = _('Imported from PubMed')

        # Special case for an empty query: it would cause an error
        # from the server if we did not catch it first.
        if not query:
            results = defer.Deferred()

            def autofire():
                results.callback(0)

            reactor.callLater(0, autofire)
            return results, self._rs

        self._query = query
        self._pubmed_db = db

        self._total = None
        self._webenv = None
        self._query_key = None

        self._batch = batch.Batch(maxhits, self.BATCH_SIZE)

        return self._batch.fetch(self._runner), self._rs

    def cancel(self):
        """ Cancel a running query. The database is not reverted to
        its original state. """
        if self._pending:
            self._pending.cancel()

    def _send_query(self, service, args, **kargs):
        all = {'email': self.adminEmail,
               'tool': self.toolName,
               'retmode': 'xml'}
        all.update(args)
        all.update(kargs)

        # ensure all arguments are utf-8 encoded
        for k, v in all.items():
            if isinstance(v, unicode):
                all[k] = v.encode('utf-8')

        url = self.baseURL + service + '?' + urllib.urlencode(all)

        self.log.debug('sending query %r' % url)

        # We are in charge of setting and cleaning up self._pending
        self._pending = HTTPRetrieve(url)

        def done(data):
            self._pending = None
            return data

        return self._pending.deferred.addBoth(done).addCallback(_xml)

    def _runner(self, start, count):
        if self._total is None:
            # On the first iteration of the query, we send a
            # "Search", which prepares the query, returns the total
            # number of results, and gives us a key to access the
            # content. The following calls will use "Fetch" to
            # actually gather the results.
            data = {'db': self._pubmed_db, 'term': self._query}

            d = self._send_query(self.SRV_SEARCH, data, usehistory='y')

            def _got_summary(data):
                # Total number of results
                self._total = int(data.find('./Count').text)

                # Parameters necessary to fetch the content of the
                # result set
                self._webenv = data.find('./WebEnv').text
                self._query_key = data.find('./QueryKey').text

                # On the next run, we will actually start fetching
                # the results, starting at 0.
                return 0, self._total

            return d.addCallback(_got_summary)

        else:
            def _received(data):
                previously = len(self._rs)

                self.reader.parse(data, self.db, self._rs)

                freshly_parsed = len(self._rs) - previously

                if freshly_parsed <= 0:
                    self.log.warn('what happened? the result set only '
                                  'grew by %d' % freshly_parsed)
                    # Pretend at least one entry has been parsed, so
                    # that the task is guaranteed to progress.
                    # Otherwise we might loop forever on an entry we
                    # cannot parse.
                    freshly_parsed = 1

                return freshly_parsed, self._total

            fetchdata = {'db': self._pubmed_db,
                         'WebEnv': self._webenv,
                         'query_key': self._query_key}

            d = self._send_query(self.SRV_FETCH, fetchdata,
                                 retstart=start, retmax=count)

            return d.addCallback(_received)
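
# --- Usage sketch (not part of the original module) ---
# A minimal example of how PubMed.search() might be driven from the
# Twisted reactor. The 'db' argument is assumed to be an already-open
# pyblio database; creating one is outside the scope of this excerpt.
# Judging from the empty-query path above, the deferred returned by
# search() is assumed to fire with the result count once all batches
# have been fetched.

def run_search_demo(db):
    from twisted.internet import reactor

    pubmed = PubMed(db)

    # search() returns the deferred and the result set immediately;
    # the result set fills up as batches arrive.
    d, rs = pubmed.search('aspirin[Title]', maxhits=50)

    def done(total):
        print '%d results reported, %d records imported' % (total, len(rs))
        reactor.stop()

    def failed(err):
        print 'query failed: %s' % err
        reactor.stop()

    d.addCallbacks(done, failed)

# run_search_demo(db)  # with an open pyblio database
# reactor.run()        # the query only executes inside the reactor loop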