def onStageFinished(stage):
    def _(r, dep):
        return r._do_checkpoint()

    MutableDict.merge()
    walk_dependencies(stage.rdd, _)
    logger.info("stage %d finish %s", stage.id, stage.fmt_stats())
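# walk_dependencies is called above but never defined in this section. A
# plausible sketch of its shape, inferred only from the call site; the
# attribute names (rdd.id, rdd.dependencies, dep.rdd) are assumptions, not
# the source's confirmed API.
def walk_dependencies(rdd, func):
    visited = set()
    to_visit = [(rdd, None)]          # (rdd, dependency that led to it)
    while to_visit:
        r, dep = to_visit.pop()
        if r.id in visited:
            continue
        visited.add(r.id)
        if func(r, dep):              # visitor returns truthy to keep descending
            for d in r.dependencies:  # assumed attribute
                to_visit.append((d.rdd, d))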
def run_task(task, aid):
    logger.debug('Running task %r', task)
    try:
        Accumulator.clear()
        result = task.run(aid)
        accumUpdates = Accumulator.values()
        MutableDict.flush()
        return task.id, Success(), result, accumUpdates
    except Exception as e:
        logger.error('error in task %s', task)
        import traceback
        traceback.print_exc()
        return task.id, OtherFailure('exception:' + str(e)), None, None
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # shuffle_id starts from 1, so 0 is free for task results
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            with open(tmppath, 'wb') as f:
                f.write(data)
            path = swd.export(tmppath)
            data = '/'.join([env.server_uri] + path.split('/')[-3:])
            flag += 2
        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(e), \
            cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        fatal_exceptions = (DparkUserFatalError, ArithmeticError,
                            ValueError, LookupError, SyntaxError,
                            TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(prefix, ename), \
            msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
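# How the (flag, data) pair produced above is meant to be unpacked on the
# scheduler side, reconstructed from the encoding logic in run_task: the bit
# meanings follow directly from that code, but decode_result and fetch_uri
# are hypothetical names, not the source's API.
def decode_result(flag, data):
    if flag & 2:                # result was spilled to a file; data is a URI
        data = fetch_uri(data)  # hypothetical: fetch the bytes over HTTP
        flag -= 2
    data = decompress(data)
    if flag == 0:               # marshal succeeded on the worker
        return marshal.loads(data)
    return cPickle.loads(data)  # flag == 1: pickled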
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id,
                                                  ttid.task_try, len(data))
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()]
                            + path.split('/')[-3:])
            flag += 2
        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(e), \
            cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        return TaskState.failed, 'FAILED_EXCEPTION_{}'.format(ename), \
            msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id,
                                                  ttid.task_try, len(data))
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()]
                            + path.split('/')[-3:])
            flag += 2
        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            with open(path, 'wb') as f:  # binary mode: data is compressed bytes
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()]
                            + path.split('/')[-3:])
            flag += 2
        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None), -1)
    finally:
        close_mfs()
        gc.collect()
        gc.enable()
k = 50
d = 20
M = len(ori)
V = len(ori[0])
assert M % d == 0
assert V % d == 0
m = M / d
v = V / d
GAMMA = 0.02
LAMBDA = 0.1
STEP = 0.9
W = MutableDict(d)
H = MutableDict(d)
ori_b = dpark.broadcast(ori)


def sgd((i, j)):
    Wi = W.get(i)
    if Wi is None:
        Wi = numpy.random.rand(m, k)
        W.put(i, Wi)
    Hj = H.get(j)
    if Hj is None:
        Hj = numpy.random.rand(v, k)
        H.put(j, Hj)
    ori = ori_b.value
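# The body of sgd is cut off above right after fetching the broadcast matrix.
# What follows is only a sketch of the usual L2-regularized matrix-
# factorization step such a function performs on block (i, j); the exact
# update rule, and the assumption that ori is a numpy array sliced into
# m x v blocks, are mine and not taken from the source.
def sgd_update(Wi, Hj, Rij, gamma=GAMMA, lam=LAMBDA):
    err = Rij - Wi.dot(Hj.T)                # m x v reconstruction error
    Wi += gamma * (err.dot(Hj) - lam * Wi)  # gradient step on the row block
    Hj += gamma * (err.T.dot(Wi) - lam * Hj)  # gradient step on the column block
    return Wi, Hj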
def runJob(self, finalRdd, func, partitions, allowLocal):
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)
    results = [None] * numOutputParts
    finished = [None] * numOutputParts
    lastFinished = 0
    numFinished = 0

    waiting = set()
    running = set()
    failed = set()
    pendingTasks = {}
    lastFetchFailureTime = 0

    self.updateCacheLocs()

    logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
    logger.debug("Parents of final stage: %s", finalStage.parents)
    logger.debug("Missing parents: %s",
                 self.getMissingParentStages(finalStage))

    if (allowLocal
            and (not finalStage.parents
                 or not self.getMissingParentStages(finalStage))
            and numOutputParts == 1):
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        return

    def submitStage(stage):
        logger.debug("submit stage %s", stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ResultTask(finalStage.id, finalRdd,
                                            func, part, locs, i))
        else:
            for p in range(stage.numPartitions):
                if not stage.outputLocs[p]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, p)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ShuffleMapTask(stage.id, stage.rdd,
                                                stage.shuffleDep, p, locs))
        logger.debug("add to pending %s tasks", len(tasks))
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)

    submitStage(finalStage)

    while numFinished != numOutputParts:
        try:
            evt = self.completionEvents.get(False)
        except Queue.Empty:
            self.check()
            if self._shutdown:
                sys.exit(1)

            if failed and time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT:
                self.updateCacheLocs()
                for stage in failed:
                    logger.info("Resubmitting failed stages: %s", stage)
                    submitStage(stage)
                failed.clear()
            else:
                time.sleep(0.1)
            continue

        task, reason = evt.task, evt.reason
        stage = self.idToStage[task.stageId]
        if stage not in pendingTasks:  # stage from other job
            continue
        logger.debug("remove from pending %s from %s", task, stage)
        pendingTasks[stage].remove(task.id)
        if isinstance(reason, Success):
            Accumulator.merge(evt.accumUpdates)
            if isinstance(task, ResultTask):
                finished[task.outputId] = True
                numFinished += 1
                results[task.outputId] = evt.result
                while (lastFinished < numOutputParts
                        and finished[lastFinished]):
                    yield results[lastFinished]
                    results[lastFinished] = None
                    lastFinished += 1

            elif isinstance(task, ShuffleMapTask):
                stage = self.idToStage[task.stageId]
                stage.addOutputLoc(task.partition, evt.result)
                if not pendingTasks[stage] and all(stage.outputLocs):
                    logger.debug("%s finished; looking for newly runnable stages",
                                 stage)
                    MutableDict.merge()
                    running.remove(stage)
                    if stage.shuffleDep is not None:
                        self.mapOutputTracker.registerMapOutputs(
                            stage.shuffleDep.shuffleId,
                            [l[-1] for l in stage.outputLocs])
                    self.updateCacheLocs()
                    newlyRunnable = set(
                        stage for stage in waiting
                        if not self.getMissingParentStages(stage))
                    waiting -= newlyRunnable
                    running |= newlyRunnable
                    logger.debug("newly runnable: %s, %s",
                                 waiting, newlyRunnable)
                    for stage in newlyRunnable:
                        submitMissingTasks(stage)

        elif isinstance(reason, FetchFailed):
            if stage in running:
                waiting.add(stage)
                mapStage = self.shuffleToMapStage[reason.shuffleId]
                mapStage.removeHost(reason.serverUri)
                failed.add(mapStage)
                lastFetchFailureTime = time.time()

        else:
            logger.error("task %s failed: %s %s %s",
                         task, reason, type(reason), reason.message)
            raise Exception(reason.message)

    MutableDict.merge()
    assert not any(results)
    return
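# runJob is a generator: it yields each partition's result as soon as that
# partition and every earlier one have finished, so results stream back to
# the caller in partition order. A minimal sketch of a driver-side consumer;
# scheduler and rdd are assumed to be an existing scheduler and RDD, and
# collect is a hypothetical helper, not the source's API.
def collect(scheduler, rdd):
    parts = range(len(rdd.splits))
    results = []
    for part_result in scheduler.runJob(rdd, list, parts, allowLocal=False):
        results.append(part_result)  # one entry per partition, in order
    return results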