def onStageFinished(stage):
    """Run the end-of-stage housekeeping for ``stage``.

    In order: calls ``MutableDict.merge()``, passes a visitor to
    ``walk_dependencies`` (started at ``stage.rdd``) that invokes
    ``_do_checkpoint()`` on every RDD it is handed, and finally logs the
    stage id together with ``stage.fmt_stats()``.
    """
    def checkpoint_visitor(r, dep):
        # ``dep`` belongs to the visitor signature; only the RDD is used.
        return r._do_checkpoint()

    MutableDict.merge()
    walk_dependencies(stage.rdd, checkpoint_visitor)
    logger.info("stage %d finish %s", stage.id, stage.fmt_stats())
# NOTE(review): this statement appears to be the tail of a definition that
# starts outside the visible source; it is kept unchanged.
H.flush()

# All (i, j) pairs of block indices.  Cached: the same RDD is mapped again
# on every pass of the training loop below.
rdd = dpark.makeRDD(range(d))
rdd = rdd.cartesian(rdd).cache()


def calc_err(ij):
    """Return the summed squared error of block ``(i, j)``.

    ``ij`` is an ``(i, j)`` pair of block indices.  The product
    ``W[i] . H[j]^T`` is compared with the matching ``m`` x ``v`` slice of
    the broadcast matrix ``ori_b``.
    """
    # Unpack in the body: tuple parameters in a ``def`` are Python-2-only
    # syntax (removed by PEP 3113).
    i, j = ij
    Wi = W.get(i)
    Hj = H.get(j)
    ori = ori_b.value
    Rij = Wi.dot(Hj.T)
    Oij = ori[i * m:(i + 1) * m, j * v:(j + 1) * v]
    return ((Rij - Oij) ** 2).sum()


# J[k] is the column block paired with row block k.  It must be a real list
# because it is sliced, concatenated and shuffled in place.
J = list(range(d))
while True:
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # confirm that the two merge() calls belong inside this inner loop.
    for i in range(d):
        dpark.makeRDD(list(zip(range(d), J)), d).foreach(sgd)
        # Rotate left by one so each row block meets the next column block.
        J = J[1:] + [J[0]]
        W.merge()
        H.merge()

    GAMMA *= STEP
    shuffle(J)
    err = rdd.map(calc_err).reduce(lambda x, y: x + y)
    rmse = numpy.sqrt(err / (M * V))
    # Single-argument call form prints the same text on Python 2 and 3.
    print(rmse)
    if rmse < 0.01:
        break
def runJob(self, finalRdd, func, partitions, allowLocal):
    """Schedule the stages needed for ``finalRdd`` and yield its results.

    This is a generator.  It yields one value per entry of ``partitions``,
    in the order the partitions were given: a result is held back until
    every earlier partition has been yielded.

    Args:
        finalRdd: RDD whose partitions are to be computed.
        func: callable applied to a partition; its return value is what
            gets yielded for that partition.
        partitions: sized iterable of partition indices of ``finalRdd``.
        allowLocal: when true, and exactly one partition is requested, and
            the final stage has no parents or no missing parent stages,
            ``func`` is run directly in this process instead of being
            submitted as a task.

    Raises:
        Exception: carrying ``reason.message`` when a task ends with a
            reason that is neither ``Success`` nor ``FetchFailed``.
        SystemExit: via ``sys.exit(1)`` when ``self._shutdown`` is set
            while the event queue is empty.
    """
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)
    results = [None] * numOutputParts
    finished = [None] * numOutputParts
    lastFinished = 0    # index of the next result to yield
    numFinished = 0     # number of result tasks completed so far

    waiting = set()     # stages blocked on missing parent stages
    running = set()     # stages whose tasks have been submitted
    failed = set()      # map stages to resubmit after a fetch failure
    pendingTasks = {}   # stage -> set of ids of its outstanding tasks
    lastFetchFailureTime = 0

    self.updateCacheLocs()

    logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
    logger.debug("Parents of final stage: %s", finalStage.parents)
    # Looked up once; the same value feeds the log line and the check below.
    missingParents = self.getMissingParentStages(finalStage)
    logger.debug("Missing parents: %s", missingParents)

    if (allowLocal
            and (not finalStage.parents or not missingParents)
            and numOutputParts == 1):
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        return

    def submitStage(stage):
        """Submit ``stage``, first recursing into its missing parents."""
        logger.debug("submit stage %s", stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        """Create and submit one task per still-uncomputed partition."""
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        # Once a lookup returns no preferred locations, the remaining
        # partitions are not asked any more and get an empty list.
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(
                        ResultTask(finalStage.id, finalRdd, func, part,
                                   locs, i))
        else:
            for p in range(stage.numPartitions):
                if not stage.outputLocs[p]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, p)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(
                        ShuffleMapTask(stage.id, stage.rdd,
                                       stage.shuffleDep, p, locs))
        logger.debug("add to pending %s tasks", len(tasks))
        # In-place union: updates the set stored in pendingTasks.
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)

    submitStage(finalStage)

    while numFinished != numOutputParts:
        try:
            # Non-blocking read so the idle branch below can run.
            evt = self.completionEvents.get(False)
        except Queue.Empty:
            self.check()
            if self._shutdown:
                sys.exit(1)

            # Resubmit failed stages only once RESUBMIT_TIMEOUT has passed
            # since the most recent fetch failure.
            if (failed and
                    time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT):
                self.updateCacheLocs()
                for stage in failed:
                    logger.info("Resubmitting failed stages: %s", stage)
                    submitStage(stage)
                failed.clear()
            else:
                time.sleep(0.1)
            continue

        task, reason = evt.task, evt.reason
        stage = self.idToStage[task.stageId]
        if stage not in pendingTasks:  # stage from other job
            continue
        logger.debug("remove from pending %s from %s", task, stage)
        pendingTasks[stage].remove(task.id)

        if isinstance(reason, Success):
            Accumulator.merge(evt.accumUpdates)
            if isinstance(task, ResultTask):
                finished[task.outputId] = True
                numFinished += 1
                results[task.outputId] = evt.result
                # Emit every result that is now contiguous with what has
                # already been yielded, dropping our reference afterwards.
                while (lastFinished < numOutputParts
                       and finished[lastFinished]):
                    yield results[lastFinished]
                    results[lastFinished] = None
                    lastFinished += 1

            elif isinstance(task, ShuffleMapTask):
                # ``stage`` is already the task's stage (looked up above).
                stage.addOutputLoc(task.partition, evt.result)
                if not pendingTasks[stage] and all(stage.outputLocs):
                    logger.debug(
                        "%s finished; looking for newly runnable stages",
                        stage)
                    MutableDict.merge()
                    running.remove(stage)
                    if stage.shuffleDep is not None:
                        # Register the last recorded location of each
                        # partition's output.
                        self.mapOutputTracker.registerMapOutputs(
                            stage.shuffleDep.shuffleId,
                            [outs[-1] for outs in stage.outputLocs])
                    self.updateCacheLocs()
                    newlyRunnable = set(
                        stage for stage in waiting
                        if not self.getMissingParentStages(stage))
                    waiting -= newlyRunnable
                    running |= newlyRunnable
                    logger.debug("newly runnable: %s, %s", waiting,
                                 newlyRunnable)
                    for stage in newlyRunnable:
                        submitMissingTasks(stage)

        elif isinstance(reason, FetchFailed):
            # Put the consuming stage back into ``waiting`` so that it is
            # picked up again when a shuffle stage completes.
            if stage in running:
                waiting.add(stage)
            mapStage = self.shuffleToMapStage[reason.shuffleId]
            mapStage.removeHost(reason.serverUri)
            failed.add(mapStage)
            lastFetchFailureTime = time.time()

        else:
            logger.error("task %s failed: %s %s %s", task, reason,
                         type(reason), reason.message)
            raise Exception(reason.message)

    MutableDict.merge()
    # Every slot is reset to None when its result is yielded.
    assert not any(results)
    return
def onStageFinished(stage):
    """Run the end-of-stage housekeeping for ``stage``.

    Calls ``MutableDict.merge()`` and then passes a visitor to
    ``walk_dependencies`` (started at ``stage.rdd``) that invokes
    ``_do_checkpoint()`` on every RDD it is handed.
    """
    def checkpoint_visitor(r, dep):
        # ``dep`` belongs to the visitor signature; only the RDD is used.
        return r._do_checkpoint()

    MutableDict.merge()
    walk_dependencies(stage.rdd, checkpoint_visitor)
def runJob(self, finalRdd, func, partitions, allowLocal):
    """Schedule the stages needed for ``finalRdd`` and yield its results.

    This is a generator.  It yields one value per entry of ``partitions``,
    in the order the partitions were given: a result is held back until
    every earlier partition has been yielded.

    Args:
        finalRdd: RDD whose partitions are to be computed.
        func: callable applied to a partition; its return value is what
            gets yielded for that partition.
        partitions: sized iterable of partition indices of ``finalRdd``.
        allowLocal: when true, and exactly one partition is requested, and
            the final stage has no parents or no missing parent stages,
            ``func`` is run directly in this process instead of being
            submitted as a task.

    Raises:
        Exception: carrying ``reason.message`` when a task ends with a
            reason that is neither ``Success`` nor ``FetchFailed``.
        SystemExit: via ``sys.exit(1)`` when ``self._shutdown`` is set
            while the event queue is empty.
    """
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)
    results = [None] * numOutputParts
    finished = [None] * numOutputParts
    lastFinished = 0    # index of the next result to yield
    numFinished = 0     # number of result tasks completed so far

    waiting = set()     # stages blocked on missing parent stages
    running = set()     # stages whose tasks have been submitted
    failed = set()      # map stages to resubmit after a fetch failure
    pendingTasks = {}   # stage -> set of ids of its outstanding tasks
    lastFetchFailureTime = 0

    self.updateCacheLocs()

    logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
    logger.debug("Parents of final stage: %s", finalStage.parents)
    # Looked up once; the same value feeds the log line and the check below.
    missingParents = self.getMissingParentStages(finalStage)
    logger.debug("Missing parents: %s", missingParents)

    if (allowLocal
            and (not finalStage.parents or not missingParents)
            and numOutputParts == 1):
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        return

    def submitStage(stage):
        """Submit ``stage``, first recursing into its missing parents."""
        logger.debug("submit stage %s", stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        """Create and submit one task per still-uncomputed partition."""
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        # Once a lookup returns no preferred locations, the remaining
        # partitions are not asked any more and get an empty list.
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(
                        ResultTask(finalStage.id, finalRdd, func, part,
                                   locs, i))
        else:
            for p in range(stage.numPartitions):
                if not stage.outputLocs[p]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, p)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(
                        ShuffleMapTask(stage.id, stage.rdd,
                                       stage.shuffleDep, p, locs))
        logger.debug("add to pending %s tasks", len(tasks))
        # In-place union: updates the set stored in pendingTasks.
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)

    submitStage(finalStage)

    while numFinished != numOutputParts:
        try:
            # Non-blocking read so the idle branch below can run.
            evt = self.completionEvents.get(False)
        except Queue.Empty:
            self.check()
            if self._shutdown:
                sys.exit(1)

            # Resubmit failed stages only once RESUBMIT_TIMEOUT has passed
            # since the most recent fetch failure.
            if (failed and
                    time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT):
                self.updateCacheLocs()
                for stage in failed:
                    logger.info("Resubmitting failed stages: %s", stage)
                    submitStage(stage)
                failed.clear()
            else:
                time.sleep(0.1)
            continue

        task, reason = evt.task, evt.reason
        stage = self.idToStage[task.stageId]
        if stage not in pendingTasks:  # stage from other job
            continue
        logger.debug("remove from pending %s from %s", task, stage)
        pendingTasks[stage].remove(task.id)

        if isinstance(reason, Success):
            Accumulator.merge(evt.accumUpdates)
            if isinstance(task, ResultTask):
                finished[task.outputId] = True
                numFinished += 1
                results[task.outputId] = evt.result
                # Emit every result that is now contiguous with what has
                # already been yielded, dropping our reference afterwards.
                while (lastFinished < numOutputParts
                       and finished[lastFinished]):
                    yield results[lastFinished]
                    results[lastFinished] = None
                    lastFinished += 1

            elif isinstance(task, ShuffleMapTask):
                # ``stage`` is already the task's stage (looked up above).
                stage.addOutputLoc(task.partition, evt.result)
                if not pendingTasks[stage] and all(stage.outputLocs):
                    logger.debug(
                        "%s finished; looking for newly runnable stages",
                        stage)
                    MutableDict.merge()
                    running.remove(stage)
                    if stage.shuffleDep is not None:
                        # Register the last recorded location of each
                        # partition's output.
                        self.mapOutputTracker.registerMapOutputs(
                            stage.shuffleDep.shuffleId,
                            [outs[-1] for outs in stage.outputLocs])
                    self.updateCacheLocs()
                    newlyRunnable = set(
                        stage for stage in waiting
                        if not self.getMissingParentStages(stage))
                    waiting -= newlyRunnable
                    running |= newlyRunnable
                    logger.debug("newly runnable: %s, %s", waiting,
                                 newlyRunnable)
                    for stage in newlyRunnable:
                        submitMissingTasks(stage)

        elif isinstance(reason, FetchFailed):
            # Put the consuming stage back into ``waiting`` so that it is
            # picked up again when a shuffle stage completes.
            if stage in running:
                waiting.add(stage)
            mapStage = self.shuffleToMapStage[reason.shuffleId]
            mapStage.removeHost(reason.serverUri)
            failed.add(mapStage)
            lastFetchFailureTime = time.time()

        else:
            logger.error("task %s failed: %s %s %s", task, reason,
                         type(reason), reason.message)
            raise Exception(reason.message)

    MutableDict.merge()
    # Every slot is reset to None when its result is yielded.
    assert not any(results)
    return
def onStageFinished(stage):
    """Run the end-of-stage housekeeping for ``stage``.

    Calls ``MutableDict.merge()`` and then passes a visitor to
    ``walk_dependencies`` (started at ``stage.rdd``) that invokes
    ``_do_checkpoint()`` on every RDD it is handed.
    """
    def checkpoint_visitor(r, dep):
        # ``dep`` belongs to the visitor signature; only the RDD is used.
        return r._do_checkpoint()

    MutableDict.merge()
    walk_dependencies(stage.rdd, checkpoint_visitor)
# All (i, j) pairs of block indices.  Cached: the same RDD is mapped again
# on every pass of the training loop below.
rdd = dpark.makeRDD(range(d))
rdd = rdd.cartesian(rdd).cache()


def calc_err(ij):
    """Return the summed squared error of block ``(i, j)``.

    ``ij`` is an ``(i, j)`` pair of block indices.  The product
    ``W[i] . H[j]^T`` is compared with the matching ``m`` x ``v`` slice of
    the broadcast matrix ``ori_b``.
    """
    # Unpack in the body: tuple parameters in a ``def`` are Python-2-only
    # syntax (removed by PEP 3113).
    i, j = ij
    Wi = W.get(i)
    Hj = H.get(j)
    ori = ori_b.value
    Rij = Wi.dot(Hj.T)
    Oij = ori[i * m:(i + 1) * m, j * v:(j + 1) * v]
    return ((Rij - Oij) ** 2).sum()


# J[k] is the column block paired with row block k.  It must be a real list
# because it is sliced, concatenated and shuffled in place.
J = list(range(d))
while True:
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # confirm that the two merge() calls belong inside this inner loop.
    for i in range(d):
        dpark.makeRDD(list(zip(range(d), J)), d).foreach(sgd)
        # Rotate left by one so each row block meets the next column block.
        J = J[1:] + [J[0]]
        W.merge()
        H.merge()

    GAMMA *= STEP
    shuffle(J)
    err = rdd.map(calc_err).reduce(lambda x, y: x + y)
    rmse = numpy.sqrt(err / (M * V))
    # Single-argument call form prints the same text on Python 2 and 3.
    print(rmse)
    if rmse < 0.01:
        break