示例#1
0
文件: schedule.py 项目: posens/dpark
        def onStageFinished(stage):
            """Post-process a stage that has just completed.

            Merges pending ``MutableDict`` state, invokes ``_do_checkpoint()``
            on the RDDs visited from ``stage.rdd``, and logs the stage's
            statistics.
            """
            MutableDict.merge()

            def checkpoint_visitor(rdd, dep):
                # Only the RDD is needed; the dependency argument is ignored.
                # The checkpoint result is handed back to the walker
                # (presumably it controls further traversal -- confirm against
                # walk_dependencies).
                return rdd._do_checkpoint()

            walk_dependencies(stage.rdd, checkpoint_visitor)
            logger.info("stage %d finish %s", stage.id, stage.fmt_stats())
示例#2
0
    H.flush()

# Build an RDD over the d block indices, pair every index with every other
# (cartesian product with itself), and cache the result, since it is reused
# on every error evaluation.
rdd = dpark.makeRDD(range(d))
rdd = rdd.cartesian(rdd)
rdd = rdd.cache()

def calc_err(ij):
    """Return the summed squared error of one (i, j) block.

    The block's approximation is ``W[i] . H[j]^T``; it is compared against
    the matching ``m`` x ``v`` slice of the original matrix held in the
    broadcast variable ``ori_b``.

    Args:
        ij: a two-element sequence ``(i, j)`` -- the row-block index looked
            up in ``W`` and the column-block index looked up in ``H``.

    Returns:
        The sum of squared element-wise differences for that block.
    """
    # Unpack in the body rather than in the signature: tuple parameters
    # (``def f((i, j))``) were removed in Python 3 (PEP 3113).  Callers still
    # pass a single (i, j) pair, so the call shape is unchanged.
    i, j = ij
    Wi = W.get(i)
    Hj = H.get(j)

    ori = ori_b.value
    Rij = Wi.dot(Hj.T)
    # Slice the (i, j) block out of the original: m rows by v columns.
    Oij = ori[i * m:(i + 1) * m, j * v:(j + 1) * v]
    return ((Rij - Oij) ** 2).sum()

# J[k] is the column block paired with row block k in the current sub-step.
# It must be a real list: it is sliced, concatenated and shuffled in place
# below (on Python 3 a bare range() supports none of the last two).
J = list(range(d))
while True:
    # One sweep: d sub-steps, rotating J by one each time, so every row block
    # is paired with every column block exactly once per sweep.
    for i in range(d):
        # Materialize the pairs so makeRDD receives a concrete sequence on
        # Python 3 as well, where zip() returns a one-shot iterator.
        dpark.makeRDD(list(zip(range(d), J)), d).foreach(sgd)
        J = J[1:] + [J[0]]
        W.merge()
        H.merge()

    # Scale the step size after each sweep and randomize the next pairing.
    GAMMA *= STEP
    shuffle(J)
    err = rdd.map(calc_err).reduce(lambda x, y: x + y)
    rmse = numpy.sqrt(err / (M * V))
    # Parenthesized form prints identically on Python 2 and is valid on 3.
    print(rmse)
    if rmse < 0.01:
        break

示例#3
0
    def runJob(self, finalRdd, func, partitions, allowLocal):
        """Run a job over ``finalRdd`` and yield one result per output partition.

        This is a generator.  It creates the final stage, submits it (after
        recursively submitting any missing parent stages), then polls
        ``self.completionEvents`` until every output partition has a result.
        Results are yielded in the order given by ``partitions``; a result is
        held back until all results before it have been yielded.

        Args:
            finalRdd: the RDD whose partitions are computed.
            func: callable applied to a partition's iterator; run in-process
                on the local path, otherwise handed to each ``ResultTask``.
            partitions: partition indices to compute; must support ``len()``
                and iteration.
            allowLocal: when true, a single-partition job whose final stage
                has no parents (or no missing parents) is computed directly
                in this process without submitting tasks.

        Yields:
            The result of each output partition, in output order.

        Raises:
            Exception: when a task ends with a reason that is neither
                ``Success`` nor ``FetchFailed``.
            SystemExit: via ``sys.exit(1)`` when ``self._shutdown`` is set
                while waiting for events.
        """
        outputParts = list(partitions)
        numOutputParts = len(partitions)
        finalStage = self.newStage(finalRdd, None)
        # results[i] holds output i until it is yielded; finished[i] becomes
        # True once a ResultTask for output i has succeeded.
        results = [None] * numOutputParts
        finished = [None] * numOutputParts
        # Index of the next output to yield, and number of outputs completed.
        lastFinished = 0
        numFinished = 0

        # Per-job scheduling state:
        #   waiting      - stages that had missing parents when submitted
        #   running      - stages whose tasks have been submitted
        #   failed       - map stages hit by a fetch failure, to be resubmitted
        #   pendingTasks - stage -> set of ids of tasks not yet reported back
        waiting = set()
        running = set()
        failed = set()
        pendingTasks = {}
        lastFetchFailureTime = 0

        self.updateCacheLocs()

        logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
        logger.debug("Parents of final stage: %s", finalStage.parents)
        logger.debug("Missing parents: %s",
                     self.getMissingParentStages(finalStage))

        # Local fast path: exactly one output partition and nothing upstream
        # left to compute, so evaluate it here and finish.
        if allowLocal and (not finalStage.parents
                           or not self.getMissingParentStages(finalStage)
                           ) and numOutputParts == 1:
            split = finalRdd.splits[outputParts[0]]
            yield func(finalRdd.iterator(split))
            return

        def submitStage(stage):
            """Submit ``stage``, first recursively submitting missing parents.

            A stage already in ``waiting`` or ``running`` is left alone.
            """
            logger.debug("submit stage %s", stage)
            if stage not in waiting and stage not in running:
                missing = self.getMissingParentStages(stage)
                if not missing:
                    submitMissingTasks(stage)
                    running.add(stage)
                else:
                    for parent in missing:
                        submitStage(parent)
                    waiting.add(stage)

        def submitMissingTasks(stage):
            """Create and submit tasks for the unfinished partitions of ``stage``.

            The final stage gets ``ResultTask``s for outputs not yet finished;
            any other stage gets ``ShuffleMapTask``s for partitions that have
            no output location yet.
            """
            myPending = pendingTasks.setdefault(stage, set())
            tasks = []
            # Once one partition reports no preferred locations, stop asking
            # for the remaining partitions and use an empty list instead.
            have_prefer = True
            if stage == finalStage:
                for i in range(numOutputParts):
                    if not finished[i]:
                        part = outputParts[i]
                        if have_prefer:
                            locs = self.getPreferredLocs(finalRdd, part)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(
                            ResultTask(finalStage.id, finalRdd, func, part,
                                       locs, i))
            else:
                for p in range(stage.numPartitions):
                    if not stage.outputLocs[p]:
                        if have_prefer:
                            locs = self.getPreferredLocs(stage.rdd, p)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(
                            ShuffleMapTask(stage.id, stage.rdd,
                                           stage.shuffleDep, p, locs))
            logger.debug("add to pending %s tasks", len(tasks))
            # In-place union: updates the set stored in pendingTasks.
            myPending |= set(t.id for t in tasks)
            self.submitTasks(tasks)

        submitStage(finalStage)

        # Event loop: runs until every output partition has a result.
        while numFinished != numOutputParts:
            try:
                # Non-blocking get; Queue.Empty means no event is available.
                evt = self.completionEvents.get(False)
            except Queue.Empty:
                self.check()
                if self._shutdown:
                    sys.exit(1)

                # Resubmit fetch-failed stages only after RESUBMIT_TIMEOUT has
                # passed since the most recent fetch failure; otherwise idle
                # briefly before polling again.
                if failed and time.time(
                ) > lastFetchFailureTime + RESUBMIT_TIMEOUT:
                    self.updateCacheLocs()
                    for stage in failed:
                        logger.info("Resubmitting failed stages: %s", stage)
                        submitStage(stage)
                    failed.clear()
                else:
                    time.sleep(0.1)
                continue

            task, reason = evt.task, evt.reason
            stage = self.idToStage[task.stageId]
            if stage not in pendingTasks:  # event for a stage of another job
                continue
            logger.debug("remove from pending %s from %s", task, stage)
            # NOTE(review): set.remove raises KeyError if this task id is not
            # pending (e.g. a duplicate event) -- confirm that cannot happen.
            pendingTasks[stage].remove(task.id)
            if isinstance(reason, Success):
                Accumulator.merge(evt.accumUpdates)
                if isinstance(task, ResultTask):
                    finished[task.outputId] = True
                    numFinished += 1
                    results[task.outputId] = evt.result
                    # Yield every result that is now contiguous with what has
                    # already been yielded, releasing each slot afterwards.
                    while lastFinished < numOutputParts and finished[
                            lastFinished]:
                        yield results[lastFinished]
                        results[lastFinished] = None
                        lastFinished += 1

                elif isinstance(task, ShuffleMapTask):
                    # Same lookup as above; `stage` already refers to it.
                    stage = self.idToStage[task.stageId]
                    stage.addOutputLoc(task.partition, evt.result)
                    # The map stage is complete when nothing is pending and
                    # every partition has at least one output location.
                    if not pendingTasks[stage] and all(stage.outputLocs):
                        logger.debug(
                            "%s finished; looking for newly runnable stages",
                            stage)
                        MutableDict.merge()
                        running.remove(stage)
                        if stage.shuffleDep != None:
                            # Register the last recorded location of each
                            # partition (presumably the most recent -- confirm
                            # against addOutputLoc).
                            self.mapOutputTracker.registerMapOutputs(
                                stage.shuffleDep.shuffleId,
                                [l[-1] for l in stage.outputLocs])
                        self.updateCacheLocs()
                        # Promote waiting stages whose parents are all done.
                        newlyRunnable = set(
                            stage for stage in waiting
                            if not self.getMissingParentStages(stage))
                        waiting -= newlyRunnable
                        running |= newlyRunnable
                        logger.debug("newly runnable: %s, %s", waiting,
                                     newlyRunnable)
                        for stage in newlyRunnable:
                            submitMissingTasks(stage)
            elif isinstance(reason, FetchFailed):
                # NOTE(review): the fetching stage is added to `waiting` but
                # is not removed from `running` -- confirm this is intended.
                if stage in running:
                    waiting.add(stage)
                # Drop the failing server's outputs from the map stage that
                # produced the shuffle data and queue it for resubmission.
                mapStage = self.shuffleToMapStage[reason.shuffleId]
                mapStage.removeHost(reason.serverUri)
                failed.add(mapStage)
                lastFetchFailureTime = time.time()
            else:
                # Any other failure reason aborts the whole job.
                logger.error("task %s failed: %s %s %s", task, reason,
                             type(reason), reason.message)
                raise Exception(reason.message)

        MutableDict.merge()
        # Every slot was reset to None as its result was yielded.
        assert not any(results)
        return
示例#4
0
        def onStageFinished(stage):
            """Post-process a stage that has just completed.

            Merges pending ``MutableDict`` state, then invokes
            ``_do_checkpoint()`` on the RDDs visited from ``stage.rdd``.
            """
            MutableDict.merge()

            def checkpoint_visitor(rdd, dep):
                # Only the RDD is needed; the dependency argument is ignored.
                # The checkpoint result is handed back to the walker
                # (presumably it controls further traversal -- confirm against
                # walk_dependencies).
                return rdd._do_checkpoint()

            walk_dependencies(stage.rdd, checkpoint_visitor)
示例#5
0
文件: schedule.py 项目: SKSC/dpark
    def runJob(self, finalRdd, func, partitions, allowLocal):
        """Run a job over ``finalRdd`` and yield one result per output partition.

        This is a generator.  It creates the final stage, submits it (after
        recursively submitting any missing parent stages), then polls
        ``self.completionEvents`` until every output partition has a result.
        Results are yielded in the order given by ``partitions``; a result is
        held back until all results before it have been yielded.

        Args:
            finalRdd: the RDD whose partitions are computed.
            func: callable applied to a partition's iterator; run in-process
                on the local path, otherwise handed to each ``ResultTask``.
            partitions: partition indices to compute; must support ``len()``
                and iteration.
            allowLocal: when true, a single-partition job whose final stage
                has no parents (or no missing parents) is computed directly
                in this process without submitting tasks.

        Yields:
            The result of each output partition, in output order.

        Raises:
            Exception: when a task ends with a reason that is neither
                ``Success`` nor ``FetchFailed``.
            SystemExit: via ``sys.exit(1)`` when ``self._shutdown`` is set
                while waiting for events.
        """
        outputParts = list(partitions)
        numOutputParts = len(partitions)
        finalStage = self.newStage(finalRdd, None)
        # results[i] holds output i until it is yielded; finished[i] becomes
        # True once a ResultTask for output i has succeeded.
        results = [None]*numOutputParts
        finished = [None]*numOutputParts
        # Index of the next output to yield, and number of outputs completed.
        lastFinished = 0
        numFinished = 0

        # Per-job scheduling state:
        #   waiting      - stages that had missing parents when submitted
        #   running      - stages whose tasks have been submitted
        #   failed       - map stages hit by a fetch failure, to be resubmitted
        #   pendingTasks - stage -> set of ids of tasks not yet reported back
        waiting = set()
        running = set()
        failed = set()
        pendingTasks = {}
        lastFetchFailureTime = 0

        self.updateCacheLocs()

        logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
        logger.debug("Parents of final stage: %s", finalStage.parents)
        logger.debug("Missing parents: %s", self.getMissingParentStages(finalStage))

        # Local fast path: exactly one output partition and nothing upstream
        # left to compute, so evaluate it here and finish.
        if allowLocal and (not finalStage.parents or not self.getMissingParentStages(finalStage)) and numOutputParts == 1:
            split = finalRdd.splits[outputParts[0]]
            yield func(finalRdd.iterator(split))
            return

        def submitStage(stage):
            """Submit ``stage``, first recursively submitting missing parents.

            A stage already in ``waiting`` or ``running`` is left alone.
            """
            logger.debug("submit stage %s", stage)
            if stage not in waiting and stage not in running:
                missing = self.getMissingParentStages(stage)
                if not missing:
                    submitMissingTasks(stage)
                    running.add(stage)
                else:
                    for parent in missing:
                        submitStage(parent)
                    waiting.add(stage)

        def submitMissingTasks(stage):
            """Create and submit tasks for the unfinished partitions of ``stage``.

            The final stage gets ``ResultTask``s for outputs not yet finished;
            any other stage gets ``ShuffleMapTask``s for partitions that have
            no output location yet.
            """
            myPending = pendingTasks.setdefault(stage, set())
            tasks = []
            # Once one partition reports no preferred locations, stop asking
            # for the remaining partitions and use an empty list instead.
            have_prefer = True
            if stage == finalStage:
                for i in range(numOutputParts):
                    if not finished[i]:
                        part = outputParts[i]
                        if have_prefer:
                            locs = self.getPreferredLocs(finalRdd, part)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(ResultTask(finalStage.id, finalRdd,
                            func, part, locs, i))
            else:
                for p in range(stage.numPartitions):
                    if not stage.outputLocs[p]:
                        if have_prefer:
                            locs = self.getPreferredLocs(stage.rdd, p)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(ShuffleMapTask(stage.id, stage.rdd,
                            stage.shuffleDep, p, locs))
            logger.debug("add to pending %s tasks", len(tasks))
            # In-place union: updates the set stored in pendingTasks.
            myPending |= set(t.id for t in tasks)
            self.submitTasks(tasks)

        submitStage(finalStage)

        # Event loop: runs until every output partition has a result.
        while numFinished != numOutputParts:
            try:
                # Non-blocking get; Queue.Empty means no event is available.
                evt = self.completionEvents.get(False)
            except Queue.Empty:
                self.check()
                if self._shutdown:
                    sys.exit(1)

                # Resubmit fetch-failed stages only after RESUBMIT_TIMEOUT has
                # passed since the most recent fetch failure; otherwise idle
                # briefly before polling again.
                if failed and time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT:
                    self.updateCacheLocs()
                    for stage in failed:
                        logger.info("Resubmitting failed stages: %s", stage)
                        submitStage(stage)
                    failed.clear()
                else:
                    time.sleep(0.1)
                continue

            task, reason = evt.task, evt.reason
            stage = self.idToStage[task.stageId]
            if stage not in pendingTasks: # event for a stage of another job
                continue
            logger.debug("remove from pending %s from %s", task, stage)
            # NOTE(review): set.remove raises KeyError if this task id is not
            # pending (e.g. a duplicate event) -- confirm that cannot happen.
            pendingTasks[stage].remove(task.id)
            if isinstance(reason, Success):
                Accumulator.merge(evt.accumUpdates)
                if isinstance(task, ResultTask):
                    finished[task.outputId] = True
                    numFinished += 1
                    results[task.outputId] = evt.result
                    # Yield every result that is now contiguous with what has
                    # already been yielded, releasing each slot afterwards.
                    while lastFinished < numOutputParts and finished[lastFinished]:
                        yield results[lastFinished]
                        results[lastFinished] = None
                        lastFinished += 1

                elif isinstance(task, ShuffleMapTask):
                    # Same lookup as above; `stage` already refers to it.
                    stage = self.idToStage[task.stageId]
                    stage.addOutputLoc(task.partition, evt.result)
                    # The map stage is complete when nothing is pending and
                    # every partition has at least one output location.
                    if not pendingTasks[stage] and all(stage.outputLocs):
                        logger.debug("%s finished; looking for newly runnable stages", stage)
                        MutableDict.merge()
                        running.remove(stage)
                        if stage.shuffleDep != None:
                            # Register the last recorded location of each
                            # partition (presumably the most recent -- confirm
                            # against addOutputLoc).
                            self.mapOutputTracker.registerMapOutputs(
                                    stage.shuffleDep.shuffleId,
                                    [l[-1] for l in stage.outputLocs])
                        self.updateCacheLocs()
                        # Promote waiting stages whose parents are all done.
                        newlyRunnable = set(stage for stage in waiting if not self.getMissingParentStages(stage))
                        waiting -= newlyRunnable
                        running |= newlyRunnable
                        logger.debug("newly runnable: %s, %s", waiting, newlyRunnable)
                        for stage in newlyRunnable:
                            submitMissingTasks(stage)
            elif isinstance(reason, FetchFailed):
                # NOTE(review): the fetching stage is added to `waiting` but
                # is not removed from `running` -- confirm this is intended.
                if stage in running:
                    waiting.add(stage)
                # Drop the failing server's outputs from the map stage that
                # produced the shuffle data and queue it for resubmission.
                mapStage = self.shuffleToMapStage[reason.shuffleId]
                mapStage.removeHost(reason.serverUri)
                failed.add(mapStage)
                lastFetchFailureTime = time.time()
            else:
                # Any other failure reason aborts the whole job.
                logger.error("task %s failed: %s %s %s", task, reason, type(reason), reason.message)
                raise Exception(reason.message)

        MutableDict.merge()
        # Every slot was reset to None as its result was yielded.
        assert not any(results)
        return
示例#6
0
        def onStageFinished(stage):
            """Post-process a stage that has just completed.

            Merges pending ``MutableDict`` state, then invokes
            ``_do_checkpoint()`` on the RDDs visited from ``stage.rdd``.
            """
            MutableDict.merge()

            def checkpoint_visitor(rdd, dep):
                # Only the RDD is needed; the dependency argument is ignored.
                # The checkpoint result is handed back to the walker
                # (presumably it controls further traversal -- confirm against
                # walk_dependencies).
                return rdd._do_checkpoint()

            walk_dependencies(stage.rdd, checkpoint_visitor)
示例#7
0
# Build an RDD over the d block indices, pair every index with every other
# (cartesian product with itself), and cache the result, since it is reused
# on every error evaluation.
rdd = dpark.makeRDD(range(d))
rdd = rdd.cartesian(rdd)
rdd = rdd.cache()


def calc_err(ij):
    """Return the summed squared error of one (i, j) block.

    The block's approximation is ``W[i] . H[j]^T``; it is compared against
    the matching ``m`` x ``v`` slice of the original matrix held in the
    broadcast variable ``ori_b``.

    Args:
        ij: a two-element sequence ``(i, j)`` -- the row-block index looked
            up in ``W`` and the column-block index looked up in ``H``.

    Returns:
        The sum of squared element-wise differences for that block.
    """
    # Unpack in the body rather than in the signature: tuple parameters
    # (``def f((i, j))``) were removed in Python 3 (PEP 3113).  Callers still
    # pass a single (i, j) pair, so the call shape is unchanged.
    i, j = ij
    Wi = W.get(i)
    Hj = H.get(j)

    ori = ori_b.value
    Rij = Wi.dot(Hj.T)
    # Slice the (i, j) block out of the original: m rows by v columns.
    Oij = ori[i * m:(i + 1) * m, j * v:(j + 1) * v]
    return ((Rij - Oij)**2).sum()


# J[k] is the column block paired with row block k in the current sub-step.
# It must be a real list: it is sliced, concatenated and shuffled in place
# below (on Python 3 a bare range() supports none of the last two).
J = list(range(d))
while True:
    # One sweep: d sub-steps, rotating J by one each time, so every row block
    # is paired with every column block exactly once per sweep.
    for i in range(d):
        # Materialize the pairs so makeRDD receives a concrete sequence on
        # Python 3 as well, where zip() returns a one-shot iterator.
        dpark.makeRDD(list(zip(range(d), J)), d).foreach(sgd)
        J = J[1:] + [J[0]]
        W.merge()
        H.merge()

    # Scale the step size after each sweep and randomize the next pairing.
    GAMMA *= STEP
    shuffle(J)
    err = rdd.map(calc_err).reduce(lambda x, y: x + y)
    rmse = numpy.sqrt(err / (M * V))
    # Parenthesized form prints identically on Python 2 and is valid on 3.
    print(rmse)
    if rmse < 0.01:
        break