def run(self):
    it = (bson.BSON.encode(obj) for obj in self._query)
    chunks = size_chunked_iter(it, self._chunk_size)
    pagenum = None
    for pagenum, page_chunks in enumerate(
            util.item_chunked_iter(chunks, self._chunks_per_page)):
        self._job.status['query'] = 'Processed %d records' % (
            pagenum * self._chunks_per_page)
        fn = self._fn_tpl % pagenum
        log.debug('fn = %s', fn)
        # Write each chunk to the page file, recording its (begin, end)
        # byte offsets so each chunk can be addressed individually later
        index = []
        with open(fn, 'wb') as fp:
            pos = 0
            for chunk in page_chunks:
                old = pos
                fp.write(chunk)
                pos = fp.tell()
                index.append((old, pos))
        # Memory-map the completed page and enqueue one (mmap, begin, end)
        # triple per chunk for the downstream workers
        mm = self._job.map_read(fn)
        for b, e in index:
            log.debug('query put(%s,%s)', b, e)
            self.output.put((mm, b, e))
    if pagenum is None:
        log.info('Nothing to process from this query. Job %s', self._job.id)
    self.output.put(StopIteration)
    # pop() rather than del: the 'query' key was never set if the query
    # produced no pages
    self._job.status.pop('query', None)
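# size_chunked_iter is not defined in this file. A minimal sketch of what
# run() appears to assume: it coalesces the encoded BSON strings into
# buffers of at least `size` bytes each. The name and semantics are
# inferred from the call above; this is not the actual helper.
def _size_chunked_iter_sketch(it, size):
    buf, buflen = [], 0
    for s in it:
        buf.append(s)
        buflen += len(s)
        if buflen >= size:
            yield ''.join(buf)
            buf, buflen = [], 0
    if buf:
        # flush any remainder smaller than `size`
        yield ''.join(buf)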
def _iter(self):
    # Seems to be required due to some weirdness in gevent queues
    def anno_job():
        for o in self.input:
            yield dict(o, job_id=self._job.id)
    result = anno_job()
    result = util.item_chunked_iter(result, 100)
    return result
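# util.item_chunked_iter is used throughout but defined elsewhere. A
# plausible sketch, assuming it yields lists of up to `n` consecutive
# items from an iterable (inferred from usage; the real util module may
# differ):
def _item_chunked_iter_sketch(it, n):
    chunk = []
    for item in it:
        chunk.append(item)
        if len(chunk) >= n:
            yield chunk
            chunk = []
    if chunk:
        yield chunk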
def _handle_finalize(self, header, parts):
    # Compile the user-supplied finalize function out of the job header
    ns = {}
    exec header['finalize_text'] in ns
    func = ns[header['finalize_name']]
    def obj_iter():
        for part in parts:
            for obj in util.bson_iter(part):
                yield obj
    # Send the header, then stream the finalized results to the sink in
    # 100-object batches, terminated by an empty message part
    util.send_bson(self._sink, header, zmq.SNDMORE)
    for result in util.item_chunked_iter(func(obj_iter()), 100):
        sresult = ''.join(map(bson.BSON.encode, result))
        self._sink.send(sresult, zmq.SNDMORE)
    self._sink.send('')
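# util.bson_iter is also defined elsewhere. A hedged sketch, assuming each
# message part is a string of concatenated BSON documents: standard BSON
# framing puts a 4-byte little-endian int32 length (which includes the
# prefix itself) at the start of every document, so the buffer can be
# walked document by document. Inferred from usage; not the actual helper.
import struct

def _bson_iter_sketch(data):
    pos = 0
    while pos < len(data):
        doclen = struct.unpack('<i', data[pos:pos + 4])[0]
        yield bson.BSON(data[pos:pos + doclen]).decode()
        pos += doclen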