Example #1
0
    def run(self):
        """Feed input objects through the remote 'finalize' step.

        With no finalize text, input objects are passed straight through
        to the output queue.  Otherwise all input is spooled to a BSON
        file, memory-mapped, and shipped as a single chunk to the
        router's job manager; decoded results are re-emitted on the
        output queue.  StopIteration is always enqueued as a sentinel.
        """
        # Short circuit: nothing to finalize, pass objects through.
        if not self._finalize_text:
            for obj in self.input:
                self.output.put(obj)
            self.output.put(StopIteration)
            return

        # Spool all input to disk as a stream of BSON documents.
        with open(self._fn, 'wb') as fp:
            for obj in self.input:
                fp.write(bson.BSON.encode(obj))
        mm = self._job.map_read(self._fn)
        if not mm:
            # Empty spool file: nothing to send to the workers.
            self.output.put(StopIteration)
            return
        hdr = dict(
            jobtype='finalize',
            finalize_text=self._finalize_text,
            finalize_name=self._finalize_name,
            compress=self._job.options.zmr['compress'])

        def chunk_gen():
            # Exactly one chunk: the whole memory-mapped spool file.
            yield dict(hdr), lambda: mm
        result_iter = self._job.router.job_manager(
            self._job, 'finalize', chunk_gen())

        # Count from 1 so the status reflects chunks actually completed
        # (enumerate from 0 reported 'Processed 0 chunks' after the
        # first chunk was done).
        for i, (header, content) in enumerate(result_iter, 1):
            self._job.status['finalize'] = 'Processed %d chunks' % i
            r = resource.getrusage(resource.RUSAGE_SELF)
            log.debug('Retire finalize, rss %s', r.ru_maxrss)
            for part in content:
                for obj in util.bson_iter(part):
                    self.output.put(obj)
        self.output.put(StopIteration)
        del self._job.status['finalize']
Example #2
0
 def run(self):
     """Feed map output through the remote reduce step.

     With no reduce text, the raw map-output files are replayed
     directly onto the output queue.  Otherwise each existing
     map-output file becomes one chunk handed to the router's job
     manager; every decoded result document's 'result' list is
     re-emitted on the output queue.  StopIteration is enqueued as a
     sentinel in both paths.
     """
     # Short circuit: no reduce function, replay map output verbatim.
     if not self._reduce_text:
         pattern = os.path.join(
             self._job.jobdir, self._job.MAP_OUTPUT_TPL % '*')
         for fn in glob(pattern):
             # File may vanish between glob() and here; skip it.
             if not os.path.exists(fn): continue
             mm = self._job.map_read(fn)
             for obj in util.bson_iter(mm):
                 self.output.put(obj)
         self.output.put(StopIteration)
         return

     # Divvy out jobs to workers, one chunk per map-output file.
     def chunk_gen(hdr):
         for j in xrange(self._job.options.zmr['reduce_count']):
             fn = os.path.join(
                 self._job.jobdir, self._job.MAP_OUTPUT_TPL % j)
             if not os.path.exists(fn): continue
             # Bind fn as a default argument: a bare closure would be
             # late-binding, so every callable yielded here would read
             # the loop's *final* fn if invoked after the generator is
             # exhausted.
             yield dict(hdr), lambda fn=fn: [ self._job.map_read(fn) ]

     if self._job.command == 'mapreduce':
         jobtype = 'reduce'
     elif self._job.command == 'xmapreduce':
         jobtype = 'xreduce'
     else:
         # Fail loudly: previously an unknown command surfaced later
         # as a confusing NameError on 'jobtype'.
         raise ValueError(
             'unknown job command %r' % (self._job.command,))
     hdr = dict(
         jobtype=jobtype,
         reduce_text=self._reduce_text,
         reduce_name=self._reduce_name,
         compress=self._job.options.zmr['compress'])
     result_iter = self._job.router.job_manager(
         self._job, jobtype, chunk_gen(hdr))
     # Count from 1 so the status reflects chunks actually completed.
     for i, (header, content) in enumerate(result_iter, 1):
         self._job.status['reduce'] = 'Processed %d chunks' % i
         r = resource.getrusage(resource.RUSAGE_SELF)
         log.debug('Retire reduce, rss %s', r.ru_maxrss)
         for part in content:
             for d in util.bson_iter(part):
                 for obj in d['result']:
                     self.output.put(obj)
     self.output.put(StopIteration)
Example #3
0
 def _handle_maplike(self, key, header, parts):
     ns = {}
     exec header['map_text'] in ns
     func = ns[header['map_name']]
     reduce_count = header['reduce_count']
     result = [ [] for x in range(reduce_count) ]
     # Iterate, grouping chunks by the reduce chunk ID
     sz_input = 0
     for part in parts:
         sz_input += len(part)
         for obj in func(util.bson_iter(part)):
             chunk_key = hash(key(obj)) % reduce_count
             result[chunk_key].append(obj)
     assert sz_input, 'There was no input!'
     # Emit reduce chunks one at a time
     util.send_bson(self._sink, header, zmq.SNDMORE)
     for result_chunk in result:
         self._sink.send(
             ''.join(
                 bson.BSON.encode(dict(_id=key, value=value))
                 for key, value in result_chunk),
             zmq.SNDMORE)
     self._sink.send('')
Example #4
0
 def obj_iter():
     # Yield every decoded BSON document from each raw part of the
     # enclosing scope's `parts`, preserving order.
     for blob in parts:
         for doc in util.bson_iter(blob):
             yield doc