def run(self, attempId):
    logger.debug("shuffling %d of %s", self.partition, self.rdd)
    for i, bucket in self._prepare_shuffle(self.rdd):
        # serialize each bucket with marshal when possible, falling back to pickle
        try:
            if marshalable(bucket):
                flag, d = b'm', marshal.dumps(bucket)
            else:
                flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
        except ValueError:
            flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
        cd = compress(d)
        # retry the write up to three times, passing a larger size hint each time
        for tried in range(1, 4):
            try:
                path = LocalFileShuffle.getOutputFile(
                    self.shuffleId, self.partition, i, len(cd) * tried)
                with atomic_file(path, bufsize=1024 * 4096) as f:
                    # record header: 1-byte flag + 4-byte total length (5 + payload)
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                break
            except IOError as e:
                logger.warning("write %s failed: %s, try again (%d)",
                               path, e, tried)
        else:
            raise
    return LocalFileShuffle.getServerUri()
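# A minimal, hypothetical sketch (not dpark's actual fetch path) of how a
# bucket file written by run() above could be read back.  It assumes the
# record layout used by the writer: a 1-byte flag (b'm' = marshal,
# b'p' = pickle), a 4-byte length written as struct.pack("I", ...) that
# counts the 5-byte header, then the compressed payload.  `decompress`
# stands in for the inverse of the writer's `compress`.
import marshal
import pickle
import struct


def read_bucket(path, decompress):
    with open(path, 'rb') as f:
        flag = f.read(1)
        (total_len,) = struct.unpack("I", f.read(4))
        payload = f.read(total_len - 5)  # header is 1 + 4 = 5 bytes
    data = decompress(payload)
    return marshal.loads(data) if flag == b'm' else pickle.loads(data)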
def run_without_sorted(self, it):
    for i, bucket in it:
        try:
            if marshalable(bucket):
                flag, d = b'm', marshal.dumps(bucket)
            else:
                flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
        except ValueError:
            flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
        cd = compress(d)
        env.task_stats.bytes_shuffle_write += len(cd)
        for tried in range(1, 4):
            try:
                path = LocalFileShuffle.getOutputFile(
                    self.shuffleId, self.partition, i, len(cd) * tried)
                with atomic_file(path, bufsize=1024 * 4096) as f:
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                break
            except IOError as e:
                logger.warning("write %s failed: %s, try again (%d)",
                               path, e, tried)
        else:
            raise e
    return LocalFileShuffle.getServerUri()
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def run_task(task, ntry):
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        gc.disable()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed, e:
        return mesos_pb2.TASK_FAILED, cPickle.dumps((e, None, None), -1)
def _run(self, task_id):
    mem_limit = env.meminfo.mem_limit_soft
    t0 = time.time()
    logger.debug("run task with shuffle_flag %r" % (self.rddconf, ))
    rdd = self.rdd
    meminfo = env.meminfo
    n = self.partitioner.numPartitions
    get_partition = self.partitioner.getPartition
    merge_value = self.aggregator.mergeValue
    create_combiner = self.aggregator.createCombiner
    dumper_cls = SortMergeBucketDumper if self.rddconf.sort_merge else BucketDumper
    dumper = dumper_cls(self.shuffleId, self.partition, n, self.rddconf)
    buckets = [{} for _ in range(n)]
    env.meminfo.ratio = min(float(n) / (n + 1), env.meminfo.ratio)
    last_i = 0
    for i, item in enumerate(rdd.iterator(self.split)):
        try:
            try:
                k, v = item
            except:
                raise DparkUserFatalError(
                    "item of {} should be (k, v) pair, got: {}".format(
                        rdd.scope.key, item))
            # combine values for the same key inside the target bucket
            bucket = buckets[get_partition(k)]
            r = bucket.get(k, None)
            if r is not None:
                bucket[k] = merge_value(r, v)
            else:
                bucket[k] = create_combiner(v)
            # spill the in-memory buckets to disk when the soft memory limit is hit
            if dpark.conf.MULTI_SEGMENT_DUMP and meminfo.rss > mem_limit:
                _log = logger.info if dpark.conf.LOG_ROTATE else logger.debug
                _log("dump rotate %d with %d kv: mem %d MB, sort limit %d MB, limit %d MB",
                     env.task_stats.num_dump_rotate + 1,
                     i - last_i,
                     int(meminfo.rss) >> 20,
                     mem_limit >> 20,
                     int(meminfo.mem) >> 20)
                dumper.dump(buckets, False)
                for j in range(n):
                    buckets[j].clear()
                env.meminfo.after_rotate()
                mem_limit = env.meminfo.mem_limit_soft
                last_i = i
        except ValueError as e:
            logger.exception('The ValueError exception: %s at %s',
                             str(e), str(rdd.scope.api_callsite))
            raise

    t1 = time.time()
    dumper.dump(buckets, True)
    dumper.commit(self.aggregator)
    del buckets
    env.task_stats.bytes_dump += dumper.get_size()
    env.task_stats.num_dump_rotate += 1
    t = time.time()
    env.task_stats.secs_dump += t - t1
    env.task_stats.secs_all = t - t0
    return LocalFileShuffle.getServerUri()
class ShuffleMapTask(DAGTask):

    def __init__(self, stageId, rdd, dep, partition, locs):
        DAGTask.__init__(self, stageId)
        self.rdd = rdd
        self.shuffleId = dep.shuffleId
        self.aggregator = dep.aggregator
        self.partitioner = dep.partitioner
        self.partition = partition
        self.split = rdd.splits[partition]
        self.locs = locs

    def __repr__(self):
        return '<ShuffleTask(%d, %d) of %s>' % (self.shuffleId, self.partition, self.rdd)

    def __getstate__(self):
        d = dict(self.__dict__)
        del d['rdd']
        return d, dumps(self.rdd)

    def __setstate__(self, state):
        d, rdd = state
        self.__dict__.update(d)
        self.rdd = loads(rdd)

    def preferredLocations(self):
        return self.locs

    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        for i, bucket in self.rdd._prepare_shuffle(self.split, self.partitioner, self.aggregator):
            try:
                if marshalable(bucket):
                    flag, d = 'm', marshal.dumps(bucket)
                else:
                    flag, d = 'p', cPickle.dumps(bucket, -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(bucket, -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i, len(cd) * tried)
                    tpath = path + ".%s.%s" % (socket.gethostname(), os.getpid())
                    f = open(tpath, 'wb', 1024 * 4096)
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                    f.close()
                    os.rename(tpath, path)
                    break
                except IOError, e:
                    logger.warning("write %s failed: %s, try again (%d)",
                                   path, e, tried)
                    try:
                        os.remove(tpath)
                    except OSError:
                        pass
            else:
                raise
        return LocalFileShuffle.getServerUri()
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # result is too large to send inline: write it to a shuffle output
            # file and return a URI pointing at it instead (flag += 2)
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try, len(data))
            f = open(path, 'wb')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2
        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        fatal_exceptions = (DparkUserFatalError, ArithmeticError, ValueError,
                            LookupError, SyntaxError, TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try, len(data))
            f = open(path, 'wb')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2
        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
def run_with_sorted(self, it):
    serializer = GroupByAutoBatchedSerializer() if self.iter_values else AutoBatchedSerializer()
    for i, bucket in it:
        for tried in range(1, 4):
            try:
                path = LocalFileShuffle.getOutputFile(self.shuffleId, self.partition, i)
                with atomic_file(path, bufsize=1024 * 4096) as f:
                    items = sorted(bucket.items(), key=lambda x: x[0])
                    serializer.dump_stream(items, f)
                    env.task_stats.bytes_shuffle_write += f.tell()
                break
            except IOError as e:
                logger.warning("write %s failed: %s, try again (%d)",
                               path, e, tried)
        else:
            raise e
    return LocalFileShuffle.getServerUri()
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception as e:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2
        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None), -1)
    finally:
        close_mfs()
        gc.collect()
        gc.enable()
class ShuffleMapTask(DAGTask):

    def __init__(self, stageId, rdd, dep, partition, locs):
        DAGTask.__init__(self, stageId)
        self.rdd = rdd
        self.shuffleId = dep.shuffleId
        self.aggregator = dep.aggregator
        self.partitioner = dep.partitioner
        self.partition = partition
        self.split = rdd.splits[partition]
        self.locs = locs

    def __repr__(self):
        return '<ShuffleTask(%d, %d) of %s>' % (self.shuffleId, self.partition, self.rdd)

    def preferredLocations(self):
        return self.locs

    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        numOutputSplits = self.partitioner.numPartitions
        getPartition = self.partitioner.getPartition
        mergeValue = self.aggregator.mergeValue
        createCombiner = self.aggregator.createCombiner

        buckets = [{} for i in range(numOutputSplits)]
        for k, v in self.rdd.iterator(self.split):
            bucketId = getPartition(k)
            bucket = buckets[bucketId]
            r = bucket.get(k, None)
            if r is not None:
                bucket[k] = mergeValue(r, v)
            else:
                bucket[k] = createCombiner(v)

        for i in range(numOutputSplits):
            try:
                if marshalable(buckets[i]):
                    flag, d = 'm', marshal.dumps(buckets[i])
                else:
                    flag, d = 'p', cPickle.dumps(buckets[i], -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(buckets[i], -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i, len(cd) * tried)
                    tpath = path + ".%s.%s" % (socket.gethostname(), os.getpid())
                    f = open(tpath, 'wb', 1024 * 4096)
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                    f.close()
                    os.rename(tpath, path)
                    break
                except IOError, e:
                    logger.warning("write %s failed: %s, try again (%d)",
                                   path, e, tried)
                    try:
                        os.remove(tpath)
                    except OSError:
                        pass
            else:
                raise
        return LocalFileShuffle.getServerUri()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed, e:
        return mesos_pb2.TASK_FAILED, cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (OtherFailure(msg), None, None), -1)
    finally:
        gc.collect()
        gc.enable()
class ShuffleMapTask(DAGTask):

    def __init__(self, stageId, rdd, dep, partition, locs):
        DAGTask.__init__(self, stageId)
        self.rdd = rdd
        self.shuffleId = dep.shuffleId
        self.aggregator = dep.aggregator
        self.partitioner = dep.partitioner
        self.partition = partition
        self.split = rdd.splits[partition]
        self.locs = locs

    def __repr__(self):
        shuffleId = getattr(self, 'shuffleId', None)
        partition = getattr(self, 'partition', None)
        rdd = getattr(self, 'rdd', None)
        return '<ShuffleTask(%s, %s) of %s>' % (shuffleId, partition, rdd)

    def __getstate__(self):
        d = dict(self.__dict__)
        del d['rdd']
        del d['split']
        return d, dumps(self.rdd), dumps(self.split)

    def __setstate__(self, state):
        d, rdd, split = state
        self.__dict__.update(d)
        self.rdd = loads(rdd)
        self.split = loads(split)

    def preferredLocations(self):
        return self.locs

    def _prepare_shuffle(self, rdd):
        split = self.split
        numOutputSplits = self.partitioner.numPartitions
        getPartition = self.partitioner.getPartition
        mergeValue = self.aggregator.mergeValue
        createCombiner = self.aggregator.createCombiner

        buckets = [{} for i in range(numOutputSplits)]
        for k, v in rdd.iterator(split):
            bucketId = getPartition(k)
            bucket = buckets[bucketId]
            r = bucket.get(k, None)
            if r is not None:
                bucket[k] = mergeValue(r, v)
            else:
                bucket[k] = createCombiner(v)
        return enumerate(buckets)

    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        for i, bucket in self._prepare_shuffle(self.rdd):
            try:
                if marshalable(bucket):
                    flag, d = 'm', marshal.dumps(bucket)
                else:
                    flag, d = 'p', cPickle.dumps(bucket, -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(bucket, -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i, len(cd) * tried)
                    with atomic_file(path, bufsize=1024 * 4096) as f:
                        f.write(flag + struct.pack("I", 5 + len(cd)))
                        f.write(cd)
                    break
                except IOError, e:
                    logger.warning("write %s failed: %s, try again (%d)",
                                   path, e, tried)
            else:
                raise
        return LocalFileShuffle.getServerUri()
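# A small, self-contained sketch of the bucket-building loop in
# _prepare_shuffle above, using a plain hash partitioner and a summing
# aggregator as hypothetical stand-ins for dpark's Partitioner and
# Aggregator objects.
def prepare_shuffle_sketch(items, num_partitions):
    buckets = [{} for _ in range(num_partitions)]
    for k, v in items:
        bucket = buckets[hash(k) % num_partitions]  # getPartition(k)
        if k in bucket:
            bucket[k] = bucket[k] + v               # mergeValue(r, v)
        else:
            bucket[k] = v                           # createCombiner(v)
    return list(enumerate(buckets))


# e.g. prepare_shuffle_sketch([('a', 1), ('b', 2), ('a', 3)], 2) combines
# ('a', 1) and ('a', 3) into 'a': 4; each key lands in the bucket chosen
# by hash(k) % 2.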
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps((Success(), (flag, data), accUpdate), -1)
    except FetchFailed, e:
        return mesos_pb2.TASK_FAILED, cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps((OtherFailure(msg), None, None), -1)
    finally:
        setproctitle('dpark worker: idle')
        gc.collect()
        gc.enable()


def init_env(args):
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, "w")
            f.write(data)
            f.close()
            data = "/".join([LocalFileShuffle.getServerUri()] + path.split("/")[-3:])
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps((Success(), (flag, data), accUpdate), -1)
    except FetchFailed, e:
        return mesos_pb2.TASK_FAILED, cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps((OtherFailure(msg), None, None), -1)
    finally:
        gc.collect()
        gc.enable()