def _pre_commit(self, aggregator):
    for i in range(self.num_reduce):
        out_path = self.paths[i]
        if out_path:
            tmp = self._mk_tmp(out_path)
            # Collect the spill files that were actually written for this
            # reduce bucket (a separate index so it does not shadow `i`).
            in_paths = [self._mk_tmp(out_path, j)
                        for j in range(self.num_dump + 1)]
            in_paths = [p for p in in_paths if os.path.exists(p)]
            if len(in_paths) == 1:
                # Single spill: rename it into place, no merge needed.
                os.rename(in_paths[0], tmp)
            else:
                inputs = [get_serializer(self.rddconf).load_stream(open(p, 'rb'))
                          for p in in_paths]
                rddconf = self.rddconf.dup(op=dpark.conf.OP_GROUPBY)
                merger = Merger.get(rddconf, aggregator=aggregator,
                                    call_site=self.__class__.__name__)
                merger.merge(inputs)
                with open(tmp, 'wb') as f:
                    get_serializer(self.rddconf).dump_stream(merger, f)
        else:
            self._dump_empty_bucket(i)
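When only one spill file survives the existence check, the bucket is finalized with a bare `os.rename`, skipping a full deserialize/reserialize pass. A minimal sketch of that fast path, with a hypothetical helper name rather than DPark's API:

```python
import os

def promote_single_spill(spill_paths, final_path):
    # Keep only the spill files that were actually written.
    spills = [p for p in spill_paths if os.path.exists(p)]
    if len(spills) == 1:
        # The lone spill is already a sorted, serialized run, so a
        # rename (atomic within one filesystem) moves it into place.
        os.rename(spills[0], final_path)
        return True
    return False  # caller falls back to a real merge
```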
def _pre_commit(self, aggregator):
    for i in range(self.num_reduce):
        tmp_paths = self.tmp_paths[i]
        if tmp_paths:
            if len(tmp_paths) == 1:
                # Single spill: export it directly, no merge needed.
                self.paths[i].export(tmp_paths[0])
            else:
                # Merge every spill for this bucket into one sorted run.
                inputs = [get_serializer(self.rddconf).load_stream(open(p, 'rb'))
                          for p in tmp_paths]
                rddconf = self.rddconf.dup(op=dpark.conf.OP_GROUPBY)
                merger = Merger.get(rddconf, aggregator=aggregator,
                                    api_callsite=self.__class__.__name__)
                merger.merge(inputs)
                final_tmp = self._get_tmp(i, True, 0)
                with open(final_tmp, 'wb') as f:
                    get_serializer(self.rddconf).dump_stream(merger, f)
        else:
            self._dump_empty_bucket(i)
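In the multi-spill branch, the `Merger` built for `OP_GROUPBY` performs a k-way merge over spill streams that are already sorted (see `_dump_bucket` below). A minimal sketch of that idea using only the standard library, with a caller-supplied `combine` function standing in for the aggregator; this is illustrative, not DPark's `Merger`:

```python
import heapq
from functools import reduce
from itertools import groupby

def merge_sorted_spills(streams, combine):
    # Lazy k-way merge of already-sorted (key, value) streams:
    # O(n log k) comparisons for n records across k spills.
    merged = heapq.merge(*streams, key=lambda kv: kv[0])
    for key, group in groupby(merged, key=lambda kv: kv[0]):
        # Fold together all values that share a key.
        yield key, reduce(combine, (v for _, v in group))

# Two sorted spill runs, merged with addition as the combiner:
runs = [[('a', 1), ('c', 3)], [('a', 2), ('b', 5)]]
print(list(merge_sorted_spills(runs, lambda x, y: x + y)))
# -> [('a', 3), ('b', 5), ('c', 3)]
```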
def _dump_bucket(self, items, path):
    serializer = get_serializer(self.rddconf)
    with open(path, 'wb') as f:
        serializer.dump_stream(sorted(items), f)
        # Read the write position before the file closes so the
        # reported spill size is the number of bytes written.
        size = f.tell()
    return size
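`_dump_bucket` sorts a bucket in memory and streams it to disk, using `f.tell()` after the write to report the bytes spilled. A self-contained toy of the same pattern, with a pickle-based stand-in for the object returned by `get_serializer(self.rddconf)` (DPark's serializer is assumed, not shown):

```python
import pickle

class PickleStreamSerializer:
    """Toy stand-in for get_serializer(); not DPark's implementation."""
    def dump_stream(self, items, f):
        for item in items:
            pickle.dump(item, f, pickle.HIGHEST_PROTOCOL)

    def load_stream(self, f):
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return

def dump_bucket(items, path):
    # Mirror of _dump_bucket: sort, stream to disk, report size.
    serializer = PickleStreamSerializer()
    with open(path, 'wb') as f:
        serializer.dump_stream(sorted(items), f)
        return f.tell()  # bytes written == spill size
```

Because each spill is sorted before it hits disk, the pre-commit merge above can stream the runs together instead of re-sorting everything.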