def run_task(task, aid):
    """Run ``task`` and package its outcome for the scheduler.

    The result is serialized with ``marshal`` when possible (flag 0) and
    with ``cPickle`` otherwise (flag 1).  When the serialized payload is
    larger than ``TASK_RESULT_LIMIT`` it is written to a ``.result`` file
    under the ``WORKDIR`` directory; the path is sent instead of the
    payload and 2 is added to the flag.

    :param task: task object exposing ``run(aid)`` and ``id``.
    :param aid: value passed straight through to ``task.run``.
    :return: ``(mesos state, pickled tuple)``.  On success the tuple is
        ``(task.id, Success(), (flag, data), accUpdate)``; on any
        ``Exception`` it is ``(task.id, OtherFailure(traceback), None, None)``.
    """
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(aid)
        accUpdate = Accumulator.values()
        try:
            flag, data = 0, marshal.dumps(result)
        except ValueError:
            # marshal rejects unsupported types with ValueError.
            flag, data = 1, cPickle.dumps(result)
        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            path = os.path.join(workdir, str(task.id) + '.result')
            # The payload is marshal/pickle output, so write it in binary
            # mode to avoid newline translation or str/bytes errors.
            with open(path, 'wb') as f:
                f.write(data)
            data = path
            flag += 2
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def run_task(task_data):
    """Decode ``task_data``, run the task and package its outcome.

    ``task_data`` is decompressed and unpickled into ``(task, ntry)``.
    The result is serialized with ``marshal`` (flag 0) when
    ``marshalable(result)`` is true, otherwise with ``cPickle`` (flag 1),
    then compressed.  A payload larger than ``TASK_RESULT_LIMIT`` is
    written to a file obtained from ``LocalFileShuffle.getOutputFile`` and
    replaced by a URL built from the server URI and the last three path
    components; 2 is added to the flag in that case.

    :param task_data: compressed pickle of ``(task, ntry)``.
    :return: ``(mesos state, pickled tuple)``: ``(Success(), (flag, data),
        accUpdate)`` on success, ``(FetchFailed instance, None, None)``
        when the task raises ``FetchFailed``.
    :raises: any exception other than ``FetchFailed`` propagates.
    """
    try:
        # NOTE(review): gc is disabled here and never re-enabled in this
        # block; confirm that is intended for the worker's lifetime.
        gc.disable()
        # NOTE: unpickling executes arbitrary code; task_data must come
        # from a trusted scheduler.
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            # Compressed bytes: binary mode, and `with` closes the handle
            # even if the write fails.
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join(
                [LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed as e:
        return mesos_pb2.TASK_FAILED, cPickle.dumps((e, None, None), -1)
def run_task(task, ntry):
    """Run ``task`` for try number ``ntry`` and package the outcome.

    The result is serialized with ``marshal`` (flag 0) when
    ``marshalable(result)`` is true, otherwise with ``cPickle`` (flag 1),
    then compressed.  A payload larger than ``TASK_RESULT_LIMIT`` is saved
    as ``task_<id>_<ntry>.result`` under ``WORKDIR`` and replaced by
    ``<server uri>/<file name>``; 2 is added to the flag in that case.

    :param task: task object exposing ``run(ntry)`` and ``id``.
    :param ntry: value passed to ``task.run`` and used in the file name.
    :return: ``(mesos state, pickled tuple)``: ``(task.id, Success(),
        (flag, data), accUpdate)`` on success, ``(task.id,
        OtherFailure(traceback), None, None)`` on any ``Exception``.
    """
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        # NOTE(review): gc is disabled here and never re-enabled in this
        # block; confirm that is intended.
        gc.disable()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            # Compressed bytes: binary mode, closed even on write failure.
            with open(path, 'wb') as f:
                f.write(data)
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def run_task(task, aid):
    """Run ``task`` and package its outcome for the scheduler.

    The result is serialized with ``marshal`` when possible (flag 0) and
    with ``cPickle`` otherwise (flag 1).  When the payload exceeds
    ``TASK_RESULT_LIMIT`` *and* ``env.dfs`` is truthy, the payload is
    written to ``<WORKDIR>/<task.id>.result``; the path is sent in its
    place and 2 is added to the flag.

    :param task: task object exposing ``run(aid)`` and ``id``.
    :param aid: value passed straight through to ``task.run``.
    :return: ``(mesos state, pickled tuple)``: ``(task.id, Success(),
        (flag, data), accUpdate)`` on success, ``(task.id,
        OtherFailure(traceback), None, None)`` on any ``Exception``.
    """
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(aid)
        accUpdate = Accumulator.values()
        try:
            flag, data = 0, marshal.dumps(result)
        except ValueError:
            # marshal rejects unsupported types with ValueError.
            flag, data = 1, cPickle.dumps(result)
        if len(data) > TASK_RESULT_LIMIT and env.dfs:
            workdir = env.get('WORKDIR')
            path = os.path.join(workdir, str(task.id) + '.result')
            # Serialized bytes must be written in binary mode.
            with open(path, 'wb') as f:
                f.write(data)
            data = path
            flag += 2
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def run_task(task_data):
    """Decode ``task_data``, run the task and package its outcome.

    ``task_data`` is decompressed and unpickled into ``(task, ntry)``.
    The result is serialized with ``marshal`` (flag 0) when
    ``marshalable(result)`` is true, otherwise with ``cPickle`` (flag 1),
    then compressed.  A payload larger than ``TASK_RESULT_LIMIT`` is saved
    as ``task_<id>_<ntry>.result`` under ``WORKDIR`` and replaced by
    ``<server uri>/<file name>``; 2 is added to the flag in that case.

    :param task_data: compressed pickle of ``(task, ntry)``.
    :return: ``(mesos state, pickled tuple)``: ``(task.id, Success(),
        (flag, data), accUpdate)`` on success, ``(task.id,
        OtherFailure(traceback), None, None)`` on any ``Exception``.
    """
    try:
        # NOTE(review): gc is disabled here and never re-enabled in this
        # block; confirm that is intended.
        gc.disable()
        # NOTE: unpickling executes arbitrary code; task_data must come
        # from a trusted scheduler.
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            # Compressed bytes: binary mode, closed even on write failure.
            with open(path, 'wb') as f:
                f.write(data)
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        # If unpickling itself failed, `task` is unbound here and the
        # NameError propagates to the caller (unchanged behaviour).
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def run_task(task, aid):
    """Run ``task`` in-process and report the outcome as a 4-tuple.

    :param task: task object exposing ``run(aid)`` and ``id``.
    :param aid: value passed straight through to ``task.run``.
    :return: ``(task.id, Success(), result, accumUpdates)`` on success, or
        ``(task.id, OtherFailure("exception:<str(e)>"), None, None)`` when
        the task raises an ``Exception``.
    """
    logger.debug("Running task %r", task)
    try:
        Accumulator.clear()
        result = task.run(aid)
        accumUpdates = Accumulator.values()
        return (task.id, Success(), result, accumUpdates)
    # `as` form is valid on Python 2.6+ and 3; the comma form is not.
    except Exception as e:
        logger.error("error in task %s", task)
        import traceback
        traceback.print_exc()
        return (task.id, OtherFailure("exception:" + str(e)), None, None)
def run_task(task, aid):
    """Execute ``task`` locally and describe what happened.

    Accumulators are cleared before the run and snapshotted afterwards;
    ``MutableDict.flush()`` is called once the task has returned.

    :return: ``(task.id, Success(), result, accumulator values)`` when the
        task completes, ``(task.id, OtherFailure(...), None, None)`` when
        it raises an ``Exception``.
    """
    logger.debug('Running task %r', task)
    try:
        Accumulator.clear()
        output = task.run(aid)
        acc_values = Accumulator.values()
        MutableDict.flush()
        outcome = (task.id, Success(), output, acc_values)
        return outcome
    except Exception as err:
        logger.error('error in task %s', task)
        from traceback import print_exc
        print_exc()
        outcome = (task.id, OtherFailure('exception:' + str(err)), None, None)
        return outcome
def run_task(task, aid):
    """Run one task in this process and return its completion record.

    The record is a 4-tuple ``(task id, reason, result, accumulator
    updates)``.  ``reason`` is ``Success()`` for a normal return; if the
    task raises an ``Exception`` the traceback is printed, ``reason`` is
    an ``OtherFailure`` carrying ``'exception:' + str(e)`` and the last
    two slots are ``None``.
    """
    logger.debug('Running task %r', task)

    try:
        Accumulator.clear()
        value = task.run(aid)
        snapshot = Accumulator.values()
        MutableDict.flush()
        return (task.id, Success(), value, snapshot)
    except Exception as exc:
        logger.error('error in task %s', task)
        import traceback
        traceback.print_exc()
        task_id = task.id
        reason = OtherFailure('exception:' + str(exc))
        return (task_id, reason, None, None)
def run_task(task_data):
    """Decode ``task_data``, run the task and package its outcome.

    ``task_data`` is decompressed and loaded into ``(task, task_try_id)``.
    The result is serialized with ``marshal`` (flag 0) when possible and
    with ``cPickle`` (flag 1) otherwise, then compressed.  A payload larger
    than ``TASK_RESULT_LIMIT`` is written through a ``ShuffleWorkDir`` and
    replaced by a URL made of ``env.server_uri`` plus the last three path
    components; 2 is added to the flag in that case.

    :param task_data: compressed serialized ``(task, task_try_id)``.
    :return: on success ``(TaskState.finished, pickled ((flag, data),
        accUpdate, env.task_stats))``; on ``FetchFailed``
        ``(TaskState.failed, TaskEndReason.fetch_failed, str(e),
        pickled e)``; on any other ``Exception`` ``(TaskState.failed,
        '<FATAL|FAILED>_EXCEPTION_<class name>', traceback, pickled e)``.

    Garbage collection is disabled for the duration of the call and
    re-enabled (after an explicit collect) in ``finally``.
    """
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # NOTE(review): ru_maxrss is scaled by 1024, presumably KiB ->
        # bytes as reported on Linux; confirm for other platforms.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        # Try marshal first; fall back to pickle when the result is not
        # marshalable or marshal.dumps fails for any reason.
        data = None
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                data = None
        if data is None:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # shuffle_id starts from 1 (0 is passed here)
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            # `with` already closes the file; no explicit close needed.
            with open(tmppath, 'wb') as f:
                f.write(data)
            path = swd.export(tmppath)
            data = '/'.join([env.server_uri] + path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return (TaskState.failed, TaskEndReason.fetch_failed, str(e),
                cPickle.dumps(e))
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        # Exception types reported with the "FATAL" prefix instead of
        # "FAILED".
        fatal_exceptions = (DparkUserFatalError, ArithmeticError,
                            ValueError, LookupError, SyntaxError,
                            TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return (TaskState.failed,
                '{}_EXCEPTION_{}'.format(prefix, ename),
                msg, cPickle.dumps(e))
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    """Decode ``task_data``, run the task and package its outcome.

    ``task_data`` is decompressed and loaded into ``(task, task_try_id)``.
    The result is serialized with ``marshal`` (flag 0) when possible and
    with ``cPickle`` (flag 1) otherwise, then compressed.  A payload larger
    than ``TASK_RESULT_LIMIT`` is written through a ``ShuffleWorkDir`` and
    replaced by a URL made of ``env.server_uri`` plus the last three path
    components; 2 is added to the flag in that case.

    :param task_data: compressed serialized ``(task, task_try_id)``.
    :return: on success ``(TaskState.finished, pickled ((flag, data),
        accUpdate, env.task_stats))``; on ``FetchFailed``
        ``(TaskState.failed, TaskEndReason.fetch_failed, str(e),
        pickled e)``; on any other ``Exception`` ``(TaskState.failed,
        '<FATAL|FAILED>_EXCEPTION_<class name>', traceback, pickled e)``.

    Garbage collection is disabled while the task runs and re-enabled
    (after an explicit collect) in ``finally``.
    """
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # NOTE(review): ru_maxrss is scaled by 1024, presumably KiB ->
        # bytes as reported on Linux; confirm for other platforms.
        usage = resource.getrusage(resource.RUSAGE_SELF)
        env.task_stats.bytes_max_rss = usage.ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        # Single pickle fallback covers both "not marshalable" and
        # "marshal.dumps raised".
        data = None
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                data = None
        if data is None:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # shuffle_id starts from 1 (0 is passed here)
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            # The context manager closes the file on exit.
            with open(tmppath, 'wb') as f:
                f.write(data)
            path = swd.export(tmppath)
            data = '/'.join([env.server_uri] + path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return (TaskState.failed, TaskEndReason.fetch_failed, str(e),
                cPickle.dumps(e))
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        # Exception types reported with the "FATAL" prefix instead of
        # "FAILED".
        fatal_exceptions = (DparkUserFatalError, ArithmeticError,
                            ValueError, LookupError, SyntaxError,
                            TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return (TaskState.failed,
                '{}_EXCEPTION_{}'.format(prefix, ename),
                msg, cPickle.dumps(e))
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data): try: gc.disable() task, ntry = cPickle.loads(decompress(task_data)) Accumulator.clear() result = task.run(ntry) accUpdate = Accumulator.values() MutableDict.flush() if marshalable(result): try: flag, data = 0, marshal.dumps(result) except Exception, e: flag, data = 1, cPickle.dumps(result, -1) else:
def run_task(task_data):
    """Decode ``task_data``, run the task and package its outcome.

    ``task_data`` is decompressed and loaded into ``(task, task_try_id)``.
    The result is serialized with ``marshal`` (flag 0) when possible and
    with ``cPickle`` (flag 1) otherwise, then compressed.  A payload larger
    than ``TASK_RESULT_LIMIT`` is written to a file obtained from
    ``LocalFileShuffle.getOutputFile`` and replaced by a URL built from
    the shuffle server URI and the last three path components; 2 is added
    to the flag in that case.

    :param task_data: compressed serialized ``(task, task_try_id)``.
    :return: on success ``(TaskState.finished, pickled ((flag, data),
        accUpdate, env.task_stats))``; on ``FetchFailed``
        ``(TaskState.failed, TaskEndReason.fetch_failed, str(e),
        pickled e)``; on any other ``Exception`` ``(TaskState.failed,
        'FAILED_EXCEPTION_<class name>', traceback, pickled e)``.

    Garbage collection is disabled while the task runs and re-enabled
    (after an explicit collect) in ``finally``.
    """
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # NOTE(review): ru_maxrss is scaled by 1024, presumably KiB ->
        # bytes as reported on Linux; confirm for other platforms.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(
                0, task.id, ttid.task_try, len(data))
            # `with` guarantees the handle is closed even if write fails.
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join(
                [LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return (TaskState.failed, TaskEndReason.fetch_failed, str(e),
                cPickle.dumps(e))
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        return (TaskState.failed, 'FAILED_EXCEPTION_{}'.format(ename),
                msg, cPickle.dumps(e))
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data): try: gc.disable() task, ntry = cPickle.loads(decompress(task_data)) setproctitle('dpark worker %s: run task %s' % (Script, task)) Accumulator.clear() result = task.run(ntry) accUpdate = Accumulator.values() if marshalable(result): try: flag, data = 0, marshal.dumps(result) except Exception, e: flag, data = 1, cPickle.dumps(result, -1) else:
def run_task(task_data):
    """Decode ``task_data``, run the task and package its outcome.

    ``task_data`` is decompressed and loaded into ``(task, task_try_id)``.
    The result is serialized with ``marshal`` (flag 0) when possible and
    with ``cPickle`` (flag 1) otherwise, then compressed.  A payload larger
    than ``TASK_RESULT_LIMIT`` is written to a file obtained from
    ``LocalFileShuffle.getOutputFile`` and replaced by a URL built from
    the shuffle server URI and the last three path components; 2 is added
    to the flag in that case.

    :param task_data: compressed serialized ``(task, task_try_id)``.
    :return: ``(state string, pickled 4-tuple)``.  ``'TASK_FINISHED'``
        with ``(Success(), (flag, data), accUpdate, env.task_stats)`` on
        success; ``'TASK_FAILED'`` with ``(e, None, None, None)`` for
        ``FetchFailed`` or ``(OtherFailure(traceback), None, None, None)``
        for anything else.

    Garbage collection is disabled while the task runs and re-enabled
    (after an explicit collect) in ``finally``.
    """
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # NOTE(review): ru_maxrss is scaled by 1024, presumably KiB ->
        # bytes as reported on Linux; confirm for other platforms.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(
                0, task.id, ttid.task_try, len(data))
            # `with` guarantees the handle is closed even if write fails.
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join(
                [LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2

        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None, None), -1)
    except:
        # Deliberately catches everything (including SystemExit and
        # KeyboardInterrupt raised by the task) so the failure is
        # reported instead of propagating; kept as in the original.
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    """Decode ``task_data``, run the task and package its outcome.

    ``task_data`` is decompressed and loaded into ``(task, ntry)``.  The
    result is serialized with ``marshal`` (flag 0) when possible and with
    ``cPickle`` (flag 1) otherwise, then compressed.  A payload larger
    than ``TASK_RESULT_LIMIT`` is written to a file obtained from
    ``LocalFileShuffle.getOutputFile`` and replaced by a URL built from
    the shuffle server URI and the last three path components; 2 is added
    to the flag in that case.

    :param task_data: compressed serialized ``(task, ntry)``.
    :return: ``(state string, pickled 3-tuple)``.  ``'TASK_FINISHED'``
        with ``(Success(), (flag, data), accUpdate)`` on success;
        ``'TASK_FAILED'`` with ``(e, None, None)`` for ``FetchFailed`` or
        ``(OtherFailure(traceback), None, None)`` for anything else.

    ``finally`` calls ``close_mfs()``, runs a collection and re-enables
    the garbage collector that was disabled on entry.
    """
    try:
        gc.disable()
        task, ntry = loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            # Compressed bytes: binary mode, and `with` closes the handle
            # even if the write fails.
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join(
                [LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2

        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None), -1)
    except:
        # Deliberately catches everything (including SystemExit and
        # KeyboardInterrupt raised by the task) so the failure is
        # reported instead of propagating; kept as in the original.
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None), -1)
    finally:
        close_mfs()
        gc.collect()
        gc.enable()
def runJob(self, finalRdd, func, partitions, allowLocal):
    """Generator that schedules a job and yields one result per partition.

    Builds the final stage for ``finalRdd``, submits it (and, recursively,
    any stages returned by ``getMissingParentStages``), then polls
    ``self.completionEvents`` until every output partition is finished.
    Results are yielded in output-index order: a result is held in
    ``results`` until all lower-indexed partitions have been yielded.

    :param finalRdd: RDD whose partitions are computed by the final stage.
    :param func: applied to each partition (``ResultTask`` receives it;
        the local path calls ``func(finalRdd.iterator(split))``).
    :param partitions: partition indexes to compute.
    :param allowLocal: when true, a single-partition job whose final stage
        has no (missing) parents is computed inline without submitting tasks.
    :raises RuntimeError: when a ``None`` event (abort) is received.
    :raises Exception: with ``reason.message`` when a task ends with a
        reason that is neither ``Success`` nor ``FetchFailed``.

    NOTE(review): this block was re-indented from a whitespace-collapsed
    source; confirm block nesting (in particular the placement of
    ``stage.finish()`` and ``del pendingTasks[stage]``) against version
    control.
    """
    run_id = self.runJobTimes
    self.runJobTimes += 1
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)
    # Optional web-UI bookkeeping: silently skipped when the UI module
    # cannot be imported.
    try:
        from dpark.web.ui.views.rddopgraph import StageInfo
        stage_info = StageInfo()
        stage_info.create_stage_info(finalStage)

        def create_stage_info_recur(cur_stage, is_final=False):
            # Register parents first, then this stage, in self.idToRunJob.
            if not cur_stage or cur_stage.id in self.idToRunJob:
                return
            for par_stage in cur_stage.parents:
                create_stage_info_recur(par_stage)
            if cur_stage.id not in self.idToRunJob:
                self.idToRunJob[cur_stage.id] = StageInfo.idToStageInfo[
                    cur_stage.id]
                self.idToRunJob[cur_stage.id].is_final = is_final

        create_stage_info_recur(finalStage, is_final=True)
    except ImportError:
        pass
    results = [None] * numOutputParts   # buffered, not-yet-yielded results
    finished = [None] * numOutputParts  # True once output i has completed
    last_finished = 0                   # next output index to yield
    num_finished = 0
    waiting = set()                     # stages blocked on missing parents
    running = set()                     # stages whose tasks were submitted
    failed = set()                      # map stages to resubmit after fetch failure
    pendingTasks = {}  # stage -> set([task_id..])
    lastFetchFailureTime = 0
    self.updateCacheLocs()

    logger.debug('Final stage: %s, %d', finalStage, numOutputParts)
    logger.debug('Parents of final stage: %s', finalStage.parents)
    logger.debug('Missing parents: %s',
                 self.getMissingParentStages(finalStage))

    def onStageFinished(stage):
        # Merge MutableDict state and run _do_checkpoint() on every RDD
        # visited by walk_dependencies from the stage's RDD.
        def _(r, dep):
            return r._do_checkpoint()

        MutableDict.merge()
        walk_dependencies(stage.rdd, _)
        logger.info("stage %d finish %s", stage.id, stage.fmt_stats())

    # Local fast path: compute the single partition in this process.
    if (allowLocal and (not finalStage.parents or
                        not self.getMissingParentStages(finalStage)) and
            numOutputParts == 1):
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        onStageFinished(finalStage)
        return

    def submitStage(stage):
        # Submit the stage if its parents are available; otherwise submit
        # the missing parents and park this stage in `waiting`.
        stage.submit_time = time.time()
        logger.debug('submit stage %s', stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        # Create one task per unfinished output (final stage) or per
        # partition without an output location (shuffle-map stage).
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        # Once a partition reports no preferred locations, lookups are
        # skipped for the remaining partitions.
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(
                        ResultTask(finalStage.id, finalStage.try_id, part,
                                   finalRdd, func, locs, i))
        else:
            for part in range(stage.numPartitions):
                if not stage.outputLocs[part]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(
                        ShuffleMapTask(stage.id, stage.try_id, part,
                                       stage.rdd, stage.shuffleDep, locs))
        logger.debug('add to pending %s tasks', len(tasks))
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)

    submitStage(finalStage)

    while num_finished != numOutputParts:
        try:
            # Non-blocking poll of the completion queue.
            evt = self.completionEvents.get(False)
        except queue.Empty:
            # Idle: resubmit failed map stages once RESUBMIT_TIMEOUT has
            # elapsed since the last fetch failure, else back off briefly.
            if (failed and
                    time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT):
                self.updateCacheLocs()
                for stage in failed:
                    logger.info('Resubmitting failed stages: %s', stage)
                    submitStage(stage)
                failed.clear()
            else:
                time.sleep(0.1)
            continue

        if evt is None:  # aborted
            for taskset in self.active_tasksets.values():
                self.tasksetFinished(taskset)
            raise RuntimeError('TaskSet aborted!')

        task, reason = evt.task, evt.reason
        stage = self.idToStage[task.stage_id]
        if stage not in pendingTasks:  # stage from other taskset
            continue
        logger.debug('remove from pending %s from %s', task, stage)
        pendingTasks[stage].remove(task.id)
        if isinstance(reason, Success):
            Accumulator.merge(evt.accumUpdates)
            stage.task_stats[task.partition].append(evt.stats)
            if isinstance(task, ResultTask):
                finished[task.outputId] = True
                num_finished += 1
                results[task.outputId] = evt.result
                # Flush the contiguous prefix of finished results in order,
                # dropping each reference once it has been yielded.
                while last_finished < numOutputParts and finished[
                        last_finished]:
                    yield results[last_finished]
                    results[last_finished] = None
                    last_finished += 1

                stage.finish()

            elif isinstance(task, ShuffleMapTask):
                stage = self.idToStage[task.stage_id]
                stage.addOutputLoc(task.partition, evt.result)
                if all(stage.outputLocs):
                    # Every partition has an output location.
                    stage.finish()
                    logger.debug(
                        '%s finished; looking for newly runnable stages',
                        stage)
                    if pendingTasks[stage]:
                        logger.warn(
                            'dirty stage %d with %d tasks'
                            '(select at most 10 tasks:%s) not clean',
                            stage.id, len(pendingTasks[stage]),
                            str(list(pendingTasks[stage])[:10]))
                        # Forget the stage so later events for it are
                        # skipped by the `not in pendingTasks` check above.
                        del pendingTasks[stage]
                    onStageFinished(stage)
                    running.remove(stage)
                    if stage.shuffleDep is not None:
                        self.mapOutputTracker.registerMapOutputs(
                            stage.shuffleDep.shuffleId,
                            [l[-1] for l in stage.outputLocs])
                    self.updateCacheLocs()
                    # Promote waiting stages whose parents are now complete.
                    newlyRunnable = set(
                        stage for stage in waiting
                        if not self.getMissingParentStages(stage))
                    waiting -= newlyRunnable
                    running |= newlyRunnable
                    logger.debug('newly runnable: %s, %s', waiting,
                                 newlyRunnable)
                    for stage in newlyRunnable:
                        submitMissingTasks(stage)
        elif isinstance(reason, FetchFailed):
            # Park the consuming stage and schedule the producing map
            # stage for resubmission.
            if stage in running:
                waiting.add(stage)
                running.remove(stage)
            mapStage = self.shuffleToMapStage[reason.shuffleId]
            mapStage.removeHost(reason.serverUri)
            failed.add(mapStage)
            lastFetchFailureTime = time.time()
        else:
            logger.error('task %s failed: %s %s %s', task, reason,
                         type(reason), reason.message)
            raise Exception(reason.message)

    onStageFinished(finalStage)

    if not self.is_dstream:
        # Best-effort job statistics dump; failures are only logged.
        try:
            self.last_jobstats = self.get_stats(run_id)
            if self.loghub_dir:
                names = ['sched', self.id, "job", run_id]
                name = "_".join(map(str, names)) + ".json"
                path = os.path.join(self.loghub_dir, name)
                logger.info("writing profile to %s", path)
                with open(path, 'w') as f:
                    json.dump(self.last_jobstats, f, indent=4)
        except Exception as e:
            logger.error("Fail to dump job stats: %s.", e)

    assert all(finished)
    return
def runJob(self, finalRdd, func, partitions, allowLocal):
    """Generator that schedules a job and yields one result per partition.

    Builds the final stage for ``finalRdd``, submits it (and, recursively,
    any stages returned by ``getMissingParentStages``), then polls
    ``self.completionEvents`` until every output partition is finished.
    Results are yielded in output-index order.

    :param finalRdd: RDD whose partitions are computed by the final stage.
    :param func: applied to each partition.
    :param partitions: partition indexes to compute.
    :param allowLocal: when true, a single-partition job whose final stage
        has no (missing) parents is computed inline without submitting tasks.
    :raises Exception: with ``reason.message`` when a task ends with a
        reason that is neither ``Success`` nor ``FetchFailed``.

    Calls ``sys.exit(1)`` while idle if ``self._shutdown`` is set.

    NOTE(review): this block was re-indented from a whitespace-collapsed
    source; confirm block nesting against version control.
    """
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)
    results = [None]*numOutputParts   # buffered, not-yet-yielded results
    finished = [None]*numOutputParts  # True once output i has completed
    lastFinished = 0                  # next output index to yield
    numFinished = 0

    waiting = set()                   # stages blocked on missing parents
    running = set()                   # stages whose tasks were submitted
    failed = set()                    # map stages to resubmit after fetch failure
    pendingTasks = {}                 # stage -> set of pending task ids
    lastFetchFailureTime = 0

    self.updateCacheLocs()

    logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
    logger.debug("Parents of final stage: %s", finalStage.parents)
    logger.debug("Missing parents: %s", self.getMissingParentStages(finalStage))

    # Local fast path: compute the single partition in this process.
    if allowLocal and (not finalStage.parents or not self.getMissingParentStages(finalStage)) and numOutputParts == 1:
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        return

    def submitStage(stage):
        # Submit the stage if its parents are available; otherwise submit
        # the missing parents and park this stage in `waiting`.
        logger.debug("submit stage %s", stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        # Create one task per unfinished output (final stage) or per
        # partition without an output location (shuffle-map stage).
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        # Once a partition reports no preferred locations, lookups are
        # skipped for the remaining partitions.
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ResultTask(finalStage.id, finalRdd, func, part, locs, i))
        else:
            for p in range(stage.numPartitions):
                if not stage.outputLocs[p]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, p)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ShuffleMapTask(stage.id, stage.rdd, stage.shuffleDep, p, locs))
        logger.debug("add to pending %s tasks", len(tasks))
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)

    submitStage(finalStage)

    while numFinished != numOutputParts:
        try:
            # Non-blocking poll of the completion queue.
            evt = self.completionEvents.get(False)
        except Queue.Empty:
            self.check()
            if self._shutdown:
                sys.exit(1)

            # Idle: resubmit failed map stages once RESUBMIT_TIMEOUT has
            # elapsed since the last fetch failure, else back off briefly.
            if failed and time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT:
                self.updateCacheLocs()
                for stage in failed:
                    logger.info("Resubmitting failed stages: %s", stage)
                    submitStage(stage)
                failed.clear()
            else:
                time.sleep(0.1)
            continue

        task, reason = evt.task, evt.reason
        stage = self.idToStage[task.stageId]
        if stage not in pendingTasks:  # stage from other job
            continue
        logger.debug("remove from pedding %s from %s", task, stage)
        pendingTasks[stage].remove(task.id)
        if isinstance(reason, Success):
            Accumulator.merge(evt.accumUpdates)
            if isinstance(task, ResultTask):
                finished[task.outputId] = True
                numFinished += 1
                results[task.outputId] = evt.result
                # Flush the contiguous prefix of finished results in order,
                # dropping each reference once it has been yielded.
                while lastFinished < numOutputParts and finished[lastFinished]:
                    yield results[lastFinished]
                    results[lastFinished] = None
                    lastFinished += 1

            elif isinstance(task, ShuffleMapTask):
                stage = self.idToStage[task.stageId]
                stage.addOutputLoc(task.partition, evt.result)
                if not pendingTasks[stage]:
                    # No tasks of this stage remain pending.
                    logger.debug("%s finished; looking for newly runnable stages", stage)
                    running.remove(stage)
                    if stage.shuffleDep != None:
                        self.mapOutputTracker.registerMapOutputs(
                            stage.shuffleDep.shuffleId,
                            [l[-1] for l in stage.outputLocs])
                    self.updateCacheLocs()
                    # Promote waiting stages whose parents are now complete.
                    newlyRunnable = set(stage for stage in waiting if not self.getMissingParentStages(stage))
                    waiting -= newlyRunnable
                    running |= newlyRunnable
                    logger.debug("newly runnable: %s, %s", waiting, newlyRunnable)
                    for stage in newlyRunnable:
                        submitMissingTasks(stage)
        elif isinstance(reason, FetchFailed):
            # Park the consuming stage and schedule the producing map
            # stage for resubmission.
            if stage in running:
                running.remove(stage)
                waiting.add(stage)
            mapStage = self.shuffleToMapStage[reason.shuffleId]
            mapStage.removeHost(reason.serverUri)
            failed.add(mapStage)
            lastFetchFailureTime = time.time()
        else:
            logger.error("task %s failed: %s %s %s", task, reason, type(reason), reason.message)
            raise Exception(reason.message)

    # Every slot is reset to None after being yielded, so no truthy
    # result should remain here.
    assert not any(results)
    return
def runJob(self, finalRdd, func, partitions, allowLocal):
    """Generator that schedules a job and yields one result per partition.

    Builds the final stage for ``finalRdd``, submits it (and, recursively,
    any stages returned by ``getMissingParentStages``), then polls
    ``self.completionEvents`` until every output partition is finished.
    Results are yielded in output-index order.  ``MutableDict.merge()`` is
    called whenever a shuffle-map stage completes and once more after the
    last result.

    :param finalRdd: RDD whose partitions are computed by the final stage.
    :param func: applied to each partition.
    :param partitions: partition indexes to compute.
    :param allowLocal: when true, a single-partition job whose final stage
        has no (missing) parents is computed inline without submitting tasks.
    :raises Exception: with ``reason.message`` when a task ends with a
        reason that is neither ``Success`` nor ``FetchFailed``.

    Calls ``sys.exit(1)`` while idle if ``self._shutdown`` is set.

    NOTE(review): this block was re-indented from a whitespace-collapsed
    source; confirm block nesting against version control.
    """
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)
    results = [None] * numOutputParts   # buffered, not-yet-yielded results
    finished = [None] * numOutputParts  # True once output i has completed
    lastFinished = 0                    # next output index to yield
    numFinished = 0

    waiting = set()                     # stages blocked on missing parents
    running = set()                     # stages whose tasks were submitted
    failed = set()                      # map stages to resubmit after fetch failure
    pendingTasks = {}                   # stage -> set of pending task ids
    lastFetchFailureTime = 0

    self.updateCacheLocs()

    logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
    logger.debug("Parents of final stage: %s", finalStage.parents)
    logger.debug("Missing parents: %s",
                 self.getMissingParentStages(finalStage))

    # Local fast path: compute the single partition in this process.
    if allowLocal and (not finalStage.parents or
                       not self.getMissingParentStages(finalStage)
                       ) and numOutputParts == 1:
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        return

    def submitStage(stage):
        # Submit the stage if its parents are available; otherwise submit
        # the missing parents and park this stage in `waiting`.
        logger.debug("submit stage %s", stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        # Create one task per unfinished output (final stage) or per
        # partition without an output location (shuffle-map stage).
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        # Once a partition reports no preferred locations, lookups are
        # skipped for the remaining partitions.
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(
                        ResultTask(finalStage.id, finalRdd, func, part,
                                   locs, i))
        else:
            for p in range(stage.numPartitions):
                if not stage.outputLocs[p]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, p)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(
                        ShuffleMapTask(stage.id, stage.rdd,
                                       stage.shuffleDep, p, locs))
        logger.debug("add to pending %s tasks", len(tasks))
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)

    submitStage(finalStage)

    while numFinished != numOutputParts:
        try:
            # Non-blocking poll of the completion queue.
            evt = self.completionEvents.get(False)
        except Queue.Empty:
            self.check()
            if self._shutdown:
                sys.exit(1)

            # Idle: resubmit failed map stages once RESUBMIT_TIMEOUT has
            # elapsed since the last fetch failure, else back off briefly.
            if failed and time.time(
            ) > lastFetchFailureTime + RESUBMIT_TIMEOUT:
                self.updateCacheLocs()
                for stage in failed:
                    logger.info("Resubmitting failed stages: %s", stage)
                    submitStage(stage)
                failed.clear()
            else:
                time.sleep(0.1)
            continue

        task, reason = evt.task, evt.reason
        stage = self.idToStage[task.stageId]
        if stage not in pendingTasks:  # stage from other job
            continue
        logger.debug("remove from pending %s from %s", task, stage)
        pendingTasks[stage].remove(task.id)
        if isinstance(reason, Success):
            Accumulator.merge(evt.accumUpdates)
            if isinstance(task, ResultTask):
                finished[task.outputId] = True
                numFinished += 1
                results[task.outputId] = evt.result
                # Flush the contiguous prefix of finished results in order,
                # dropping each reference once it has been yielded.
                while lastFinished < numOutputParts and finished[
                        lastFinished]:
                    yield results[lastFinished]
                    results[lastFinished] = None
                    lastFinished += 1

            elif isinstance(task, ShuffleMapTask):
                stage = self.idToStage[task.stageId]
                stage.addOutputLoc(task.partition, evt.result)
                # Stage is complete only when nothing is pending AND every
                # partition has an output location.
                if not pendingTasks[stage] and all(stage.outputLocs):
                    logger.debug(
                        "%s finished; looking for newly runnable stages",
                        stage)
                    MutableDict.merge()
                    running.remove(stage)
                    if stage.shuffleDep != None:
                        self.mapOutputTracker.registerMapOutputs(
                            stage.shuffleDep.shuffleId,
                            [l[-1] for l in stage.outputLocs])
                    self.updateCacheLocs()
                    # Promote waiting stages whose parents are now complete.
                    newlyRunnable = set(
                        stage for stage in waiting
                        if not self.getMissingParentStages(stage))
                    waiting -= newlyRunnable
                    running |= newlyRunnable
                    logger.debug("newly runnable: %s, %s", waiting,
                                 newlyRunnable)
                    for stage in newlyRunnable:
                        submitMissingTasks(stage)
        elif isinstance(reason, FetchFailed):
            # Park the consuming stage and schedule the producing map
            # stage for resubmission.
            # NOTE(review): unlike other versions of this function, the
            # stage is not removed from `running` here; confirm intent.
            if stage in running:
                waiting.add(stage)
            mapStage = self.shuffleToMapStage[reason.shuffleId]
            mapStage.removeHost(reason.serverUri)
            failed.add(mapStage)
            lastFetchFailureTime = time.time()
        else:
            logger.error("task %s failed: %s %s %s", task, reason,
                         type(reason), reason.message)
            raise Exception(reason.message)

    MutableDict.merge()
    # Every slot is reset to None after being yielded, so no truthy
    # result should remain here.
    assert not any(results)
    return
def accumulator(self, init=0, param=None):
    """Build and return a new ``Accumulator`` from ``init`` and ``param``."""
    new_accumulator = Accumulator(init, param)
    return new_accumulator
def runJob(self, finalRdd, func, partitions, allowLocal):
    """Generator that schedules a job and yields one result per partition.

    Builds the final stage for ``finalRdd``, submits it (and, recursively,
    any stages returned by ``getMissingParentStages``), then polls
    ``self.completionEvents`` until every output partition is finished.
    Results are yielded in output-index order.  ``onStageFinished`` is
    invoked for each completed shuffle-map stage and for the final stage.

    :param finalRdd: RDD whose partitions are computed by the final stage.
    :param func: applied to each partition.
    :param partitions: partition indexes to compute.
    :param allowLocal: when true, a single-partition job whose final stage
        has no (missing) parents is computed inline without submitting tasks.
    :raises Exception: with ``reason.message`` when a task ends with a
        reason that is neither ``Success`` nor ``FetchFailed``.

    Calls ``sys.exit(1)`` while idle if ``self._shutdown`` is set.

    NOTE(review): this block was re-indented from a whitespace-collapsed
    source; confirm block nesting against version control.
    """
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)
    # Optional web-UI bookkeeping: silently skipped when the UI module
    # cannot be imported.
    try:
        from dpark.web.ui.views.rddopgraph import StageInfo
        stage_info = StageInfo()
        stage_info.create_stage_info(finalStage)

        def create_stage_info_recur(cur_stage, is_final=False):
            # Register parents first, then this stage, in self.idToRunJob.
            if not cur_stage or cur_stage.id in self.idToRunJob:
                return
            for par_stage in cur_stage.parents:
                create_stage_info_recur(par_stage)
            if cur_stage.id not in self.idToRunJob:
                self.idToRunJob[cur_stage.id] = StageInfo.idToStageInfo[cur_stage.id]
                self.idToRunJob[cur_stage.id].is_final = is_final

        create_stage_info_recur(finalStage, is_final=True)
    except ImportError:
        pass
    results = [None] * numOutputParts   # buffered, not-yet-yielded results
    finished = [None] * numOutputParts  # True once output i has completed
    lastFinished = 0                    # next output index to yield
    numFinished = 0

    waiting = set()                     # stages blocked on missing parents
    running = set()                     # stages whose tasks were submitted
    failed = set()                      # map stages to resubmit after fetch failure
    pendingTasks = {}                   # stage -> set of pending task ids
    lastFetchFailureTime = 0

    self.updateCacheLocs()

    logger.debug('Final stage: %s, %d', finalStage, numOutputParts)
    logger.debug('Parents of final stage: %s', finalStage.parents)
    logger.debug(
        'Missing parents: %s',
        self.getMissingParentStages(finalStage))

    def onStageFinished(stage):
        # Merge MutableDict state and run _do_checkpoint() on every RDD
        # visited by walk_dependencies from the stage's RDD.
        def _(r, dep):
            return r._do_checkpoint()

        MutableDict.merge()
        walk_dependencies(stage.rdd, _)

    # Local fast path: compute the single partition in this process.
    if (allowLocal and (
            not finalStage.parents or
            not self.getMissingParentStages(finalStage)
    ) and numOutputParts == 1):
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        onStageFinished(finalStage)
        return

    def submitStage(stage):
        # Submit the stage if its parents are available; otherwise submit
        # the missing parents and park this stage in `waiting`.
        logger.debug('submit stage %s', stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        # Create one task per unfinished output (final stage) or per
        # partition without an output location (shuffle-map stage).
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        # Once a partition reports no preferred locations, lookups are
        # skipped for the remaining partitions.
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ResultTask(finalStage.id, finalRdd,
                                            func, part, locs, i))
        else:
            for p in range(stage.numPartitions):
                if not stage.outputLocs[p]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, p)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ShuffleMapTask(stage.id, stage.rdd,
                                                stage.shuffleDep, p, locs))
        logger.debug('add to pending %s tasks', len(tasks))
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)

    submitStage(finalStage)

    while numFinished != numOutputParts:
        try:
            # Non-blocking poll of the completion queue.
            evt = self.completionEvents.get(False)
        except Queue.Empty:
            self.check()
            if self._shutdown:
                sys.exit(1)

            # Idle: resubmit failed map stages once RESUBMIT_TIMEOUT has
            # elapsed since the last fetch failure, else back off briefly.
            if (failed and
                    time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT):
                self.updateCacheLocs()
                for stage in failed:
                    logger.info('Resubmitting failed stages: %s', stage)
                    submitStage(stage)
                failed.clear()
            else:
                time.sleep(0.1)
            continue

        task, reason = evt.task, evt.reason
        stage = self.idToStage[task.stageId]
        if stage not in pendingTasks:  # stage from other job
            continue
        logger.debug('remove from pending %s from %s', task, stage)
        pendingTasks[stage].remove(task.id)
        if isinstance(reason, Success):
            Accumulator.merge(evt.accumUpdates)
            if isinstance(task, ResultTask):
                finished[task.outputId] = True
                numFinished += 1
                results[task.outputId] = evt.result
                # Flush the contiguous prefix of finished results in order,
                # dropping each reference once it has been yielded.
                while lastFinished < numOutputParts and finished[
                        lastFinished]:
                    yield results[lastFinished]
                    results[lastFinished] = None
                    lastFinished += 1

            elif isinstance(task, ShuffleMapTask):
                stage = self.idToStage[task.stageId]
                stage.addOutputLoc(task.partition, evt.result)
                # Stage is complete only when nothing is pending AND every
                # partition has an output location.
                if not pendingTasks[stage] and all(stage.outputLocs):
                    logger.debug(
                        '%s finished; looking for newly runnable stages',
                        stage
                    )
                    onStageFinished(stage)
                    running.remove(stage)
                    if stage.shuffleDep is not None:
                        self.mapOutputTracker.registerMapOutputs(
                            stage.shuffleDep.shuffleId,
                            [l[-1] for l in stage.outputLocs])
                    self.updateCacheLocs()
                    # Promote waiting stages whose parents are now complete.
                    newlyRunnable = set(
                        stage for stage in waiting
                        if not self.getMissingParentStages(stage)
                    )
                    waiting -= newlyRunnable
                    running |= newlyRunnable
                    logger.debug(
                        'newly runnable: %s, %s', waiting, newlyRunnable)
                    for stage in newlyRunnable:
                        submitMissingTasks(stage)
        elif isinstance(reason, FetchFailed):
            # Park the consuming stage and schedule the producing map
            # stage for resubmission.
            # NOTE(review): unlike other versions of this function, the
            # stage is not removed from `running` here; confirm intent.
            if stage in running:
                waiting.add(stage)
            mapStage = self.shuffleToMapStage[reason.shuffleId]
            mapStage.removeHost(reason.serverUri)
            failed.add(mapStage)
            lastFetchFailureTime = time.time()
        else:
            logger.error(
                'task %s failed: %s %s %s',
                task, reason, type(reason), reason.message)
            raise Exception(reason.message)

    onStageFinished(finalStage)
    # Every slot is reset to None after being yielded, so no truthy
    # result should remain here.
    assert not any(results)
    return