예제 #1
0
    def _pre_commit(self, aggregator):
        """Merge each reduce bucket's spill files into one temporary output
        file before the final commit.

        For every bucket ``i``:
          * no output path configured -> write an empty-bucket marker;
          * exactly one spill file    -> rename it into place (cheap path);
          * several spill files       -> stream-merge them with a GROUPBY
            merger and serialize the merged result to a temp file.
        """
        for i in range(self.num_reduce):
            out_path = self.paths[i]
            if out_path:
                tmp = self._mk_tmp(out_path)

                # One candidate spill per dump round plus the in-memory one.
                # Fix: the comprehension variable was named `i`, shadowing the
                # bucket index of the enclosing loop; renamed to `dump_idx`.
                in_paths = [
                    self._mk_tmp(out_path, dump_idx)
                    for dump_idx in range(self.num_dump + 1)
                ]
                in_paths = [p for p in in_paths if os.path.exists(p)]
                if len(in_paths) == 1:
                    # Single spill: no merge needed, just move it into place.
                    os.rename(in_paths[0], tmp)
                else:
                    # NOTE(review): these input handles are never explicitly
                    # closed, and the output below is opened in text mode
                    # ('w') while sibling implementations use 'wb' -- confirm
                    # against the serializer's stream expectations.
                    inputs = [
                        get_serializer(self.rddconf).load_stream(open(p))
                        for p in in_paths
                    ]
                    rddconf = self.rddconf.dup(op=dpark.conf.OP_GROUPBY)
                    merger = Merger.get(rddconf,
                                        aggregator=aggregator,
                                        call_site=self.__class__.__name__)
                    merger.merge(inputs)
                    with open(tmp, 'w') as f:
                        get_serializer(self.rddconf).dump_stream(merger, f)
            else:
                self._dump_empty_bucket(i)
예제 #2
0
파일: task.py 프로젝트: douban/dpark
 def _pre_commit(self, aggregator):
     """Merge each reduce bucket's spill files into its final temp file.

     For every bucket ``i``:
       * no spills              -> write an empty-bucket marker;
       * exactly one spill      -> export it directly;
       * several spills         -> stream-merge them with a GROUPBY merger
         and serialize the merged result.
     """
     for i in range(self.num_reduce):
         tmp_paths = self.tmp_paths[i]
         if tmp_paths:
             if len(tmp_paths) == 1:
                 self.paths[i].export(tmp_paths[0])
             else:
                 # Fix: the original opened these files inline and never
                 # closed them; keep explicit handles so they can be closed
                 # once the merged stream has been fully written out.
                 files = [open(p) for p in tmp_paths]
                 try:
                     inputs = [get_serializer(self.rddconf).load_stream(f)
                               for f in files]
                     rddconf = self.rddconf.dup(op=dpark.conf.OP_GROUPBY)
                     merger = Merger.get(rddconf, aggregator=aggregator,
                                         api_callsite=self.__class__.__name__)
                     merger.merge(inputs)
                     final_tmp = self._get_tmp(i, True, 0)
                     with open(final_tmp, 'wb') as f:
                         get_serializer(self.rddconf).dump_stream(merger, f)
                 finally:
                     for fp in files:
                         fp.close()
         else:
             self._dump_empty_bucket(i)
예제 #3
0
 def _pre_commit(self, aggregator):
     """Combine each bucket's spill files into its final temporary file.

     A bucket backed by a single spill is exported directly; multiple
     spills are merged through a GROUPBY merger and re-serialized.
     Buckets with no spills get an empty-bucket marker instead.
     """
     for bucket in range(self.num_reduce):
         spills = self.tmp_paths[bucket]
         if not spills:
             self._dump_empty_bucket(bucket)
             continue
         if len(spills) == 1:
             # Fast path: a lone spill can be exported as-is.
             self.paths[bucket].export(spills[0])
             continue
         streams = [get_serializer(self.rddconf).load_stream(open(path))
                    for path in spills]
         groupby_conf = self.rddconf.dup(op=dpark.conf.OP_GROUPBY)
         merger = Merger.get(groupby_conf,
                             aggregator=aggregator,
                             api_callsite=self.__class__.__name__)
         merger.merge(streams)
         with open(self._get_tmp(bucket, True, 0), 'wb') as out:
             get_serializer(self.rddconf).dump_stream(merger, out)
예제 #4
0
 def _dump_bucket(self, items, path):
     """Sort *items* and serialize them to *path*; return the bytes written."""
     with open(path, 'wb') as out:
         get_serializer(self.rddconf).dump_stream(sorted(items), out)
         # tell() right after the dump gives the on-disk size of the bucket.
         written = out.tell()
     return written
예제 #5
0
파일: task.py 프로젝트: douban/dpark
 def _dump_bucket(self, items, path):
     """Write the sorted bucket contents to ``path``.

     Returns the file position (i.e. the serialized size in bytes) once
     the stream has been dumped.
     """
     ordered = sorted(items)
     with open(path, 'wb') as sink:
         get_serializer(self.rddconf).dump_stream(ordered, sink)
         return sink.tell()