Example #1
    def testParsing(self):

        # parse a sample PubMed result file into the test database
        src = os.path.join(base, "search-0.xml")

        r = Reader()
        r.parse(Compat.ElementTree.ElementTree(file=open(src)), self.db)

        # write the database back as XML and compare it with the
        # expected reference output
        tmp = pybut.dbname()
        fd = open(tmp, "w")
        self.db.xmlwrite(fd)
        fd.close()

        pybut.fileeq(tmp, pybut.fp("ut_pubmed", "result.bip"))
Example #2
    def __init__(self, db):

        self.db = db
        self.reader = Reader()

        self._pending = None
Example #3
class PubMed(object):
    """ A connection to the PubMed database """

    schema = 'org.pybliographer/pubmed/0.1'

    baseURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils'

    BATCH_SIZE = 500
    
    toolName = 'pybliographer'
    adminEmail = '*****@*****.**'

    log = logging.getLogger('pyblio.external.pubmed')

    SRV_SEARCH = '/esearch.fcgi'
    SRV_FETCH = '/efetch.fcgi'

    def __init__(self, db):

        self.db = db
        self.reader = Reader()

        self._pending = None

    def count(self, query, db='PubMed'):
        assert self._pending is None, 'no more than one search at a time per connection'

        data = {'db': db, 'term': query}
        req = self._send_query(self.SRV_SEARCH, data, rettype='count')

        def success(data):
            return int(data.find('./Count').text)
        return req.addCallback(success)
    
    def search(self, query, maxhits=500, db='PubMed'):

        query = query.strip()
        
        # The result set that will contain the data
        self._rs = self.db.rs.new()
        self._rs.name = _('Imported from PubMed')

        # Special case for no query: this would cause an error from
        # the server if we do not catch it first.
        if not query:
            results = defer.Deferred()
            def autofire():
                results.callback(0)
            reactor.callLater(0, autofire)
            return results, self._rs

        self._query = query
        self._pubmed_db = db
        self._total = None
        self._webenv = None
        self._query_key = None

        self._batch = batch.Batch(maxhits, self.BATCH_SIZE)
        return self._batch.fetch(self._runner), self._rs

    def cancel(self):
        """ Cancel a running query.

        The database is not reverted to its original state."""
        if self._pending:
            self._pending.cancel()

    def _send_query(self, service, args, **kargs):

        params = {'email': self.adminEmail,
                  'tool': self.toolName,
                  'retmode': 'xml'}

        params.update(args)
        params.update(kargs)

        # ensure all arguments are utf-8 encoded before building the URL
        for k, v in params.items():
            if isinstance(v, unicode):
                params[k] = v.encode('utf-8')

        url = self.baseURL + service + '?' + urllib.urlencode(params)

        self.log.debug('sending query %r' % url)

        # We are responsible for setting and clearing self._pending
        self._pending = HTTPRetrieve(url)

        def done(data):
            self._pending = None
            return data
        
        return self._pending.deferred.addBoth(done).addCallback(_xml)

    def _runner(self, start, count):

        if self._total is None:
            # for the first iteration of the query, we send a
            # "Search", which prepares the query, returns the total
            # number of results, and gives us a key to access the
            # content. The following calls will use "Fetch" to
            # actually gather the results.
            data = {'db': self._pubmed_db, 'term': self._query}
            d = self._send_query(self.SRV_SEARCH, data, usehistory='y')

            def _got_summary(data):
                # Total number of results
                self._total = int(data.find('./Count').text)
            
                # Parameters necessary to fetch the content of the result set
                self._webenv = data.find('./WebEnv').text
                self._query_key = data.find('./QueryKey').text

                # next run, we will actually start fetching the
                # results, starting at 0
                return 0, self._total
            return d.addCallback(_got_summary)

        else:
            def _received(data):
                previously = len(self._rs)
                self.reader.parse(data, self.db, self._rs)
                freshly_parsed = len(self._rs) - previously
                if freshly_parsed <= 0:
                    self.log.warn("what happened? the result set only grew by %d" % freshly_parsed)
                    # pretend at least one entry was parsed, so that
                    # the task keeps making progress; otherwise we
                    # might loop forever on an entry we cannot parse.
                    freshly_parsed = 1
                return freshly_parsed, self._total

            fetchdata = {
                'db': self._pubmed_db,
                'WebEnv': self._webenv,
                'query_key': self._query_key,
            }
            d = self._send_query(self.SRV_FETCH, fetchdata,
                                 retstart=start, retmax=count)
            return d.addCallback(_received)
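
A minimal usage sketch for the class above, assuming a running Twisted reactor; `open_database()` is a hypothetical placeholder for however the application obtains an open pyblio database object:

from twisted.internet import reactor

def demo():
    db = open_database()   # hypothetical helper: an already-open pyblio database
    pubmed = PubMed(db)

    # search() returns a Deferred plus the result set it will fill in
    done, rs = pubmed.search('cerebral ischemia', maxhits=100)

    def report(_):
        print '%d entries imported into %r' % (len(rs), rs.name)
        reactor.stop()

    def failed(error):
        print 'query failed: %s' % error
        reactor.stop()

    done.addCallbacks(report, failed)

reactor.callWhenRunning(demo)
reactor.run()

The count() method can be used the same way: it returns a Deferred that fires with the integer number of hits for the query.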