Example #1
def run_task(task, aid):
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(aid)
        accUpdate = Accumulator.values()
        try:
            flag, data = 0, marshal.dumps(result)
        except ValueError:
            flag, data = 1, cPickle.dumps(result)

        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            path = os.path.join(workdir, str(task.id) + '.result')
            with open(path, 'w') as f:
                f.write(data)
            data = path
            flag += 2

        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
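The (flag, data) pair returned above encodes how the result was serialized: bit 0 distinguishes marshal (0) from cPickle (1), and adding 2 means the payload exceeded TASK_RESULT_LIMIT and was written out, so data holds a path (or, in later variants, a URI) rather than the bytes. A minimal decoding sketch for the local-path case, assuming Python 2 and no compression; the helper name load_task_result is illustrative, not part of dpark:

import marshal
import cPickle

def load_task_result(flag, data):
    # flag & 2: 'data' is a filesystem path written by the worker, not the bytes
    if flag & 2:
        with open(data, 'rb') as f:
            data = f.read()
    # flag & 1: the worker fell back to cPickle because marshal could not
    # handle the result; otherwise it is a marshal payload
    if flag & 1:
        return cPickle.loads(data)
    return marshal.loads(data)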
Example #2
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))

        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps((Success(), (flag, data), accUpdate), -1)
    except FetchFailed, e:
        return mesos_pb2.TASK_FAILED, cPickle.dumps((e, None, None), -1)
Example #3
def run_task(task, ntry):
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        gc.disable()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
Example #4
def run_task(task, aid):
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(aid)
        accUpdate = Accumulator.values()
        try:
            flag, data = 0, marshal.dumps(result)
        except ValueError:
            flag, data = 1, cPickle.dumps(result)

        if len(data) > TASK_RESULT_LIMIT and env.dfs:
            workdir = env.get('WORKDIR')
            path = os.path.join(workdir, str(task.id)+'.result')
            with open(path, 'w') as f:
                f.write(data)
            data = path
            flag += 2

        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FINISHED, cPickle.dumps((task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FAILED, cPickle.dumps((task.id, OtherFailure(msg), None, None), -1)
Example #5
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name) 
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps((task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps((task.id, OtherFailure(msg), None, None), -1)
Example #6
def run_task(task, aid):
    logger.debug("Running task %r", task)
    try:
        Accumulator.clear()
        result = task.run(aid)
        accumUpdates = Accumulator.values()
        return (task.id, Success(), result, accumUpdates)
    except Exception, e:
        logger.error("error in task %s", task)
        import traceback
        traceback.print_exc()
        return (task.id, OtherFailure("exception:" + str(e)), None, None)
Example #7
def run_task(task, aid):
    logger.debug("Running task %r", task)
    try:
        Accumulator.clear()
        result = task.run(aid)
        accumUpdates = Accumulator.values()
        return (task.id, Success(), result, accumUpdates)
    except Exception, e:
        logger.error("error in task %s", task)
        import traceback
        traceback.print_exc()
        return (task.id, OtherFailure("exception:" + str(e)), None, None)
Example #8
def run_task(task, aid):
    logger.debug('Running task %r', task)
    try:
        Accumulator.clear()
        result = task.run(aid)
        accumUpdates = Accumulator.values()
        MutableDict.flush()
        return task.id, Success(), result, accumUpdates
    except Exception as e:
        logger.error('error in task %s', task)
        import traceback
        traceback.print_exc()
        return task.id, OtherFailure('exception:' + str(e)), None, None
Example #9
def run_task(task, aid):
    logger.debug('Running task %r', task)
    try:
        Accumulator.clear()
        result = task.run(aid)
        accumUpdates = Accumulator.values()
        MutableDict.flush()
        return (task.id, Success(), result, accumUpdates)
    except Exception as e:
        logger.error('error in task %s', task)
        import traceback
        traceback.print_exc()
        return (task.id, OtherFailure('exception:' + str(e)), None, None)
Example #10
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # shuffle_id start from 1
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            with open(tmppath, 'wb') as f:
                f.write(data)
                f.close()
            path = swd.export(tmppath)
            data = '/'.join([env.server_uri] + path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        fatal_exceptions = (DparkUserFatalError, ArithmeticError, ValueError,
                            LookupError, SyntaxError, TypeError,
                            AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(
            prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
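This newer variant classifies user exceptions into FATAL (not worth retrying) and FAILED (potentially transient) by exception type, and embeds the class name in the reason string. A hedged sketch of how a receiving scheduler might act on that convention; the helper name and the retry policy are assumptions, not dpark's actual code:

def should_resubmit(task_end_reason):
    # reason strings look like 'FATAL_EXCEPTION_ValueError' or
    # 'FAILED_EXCEPTION_IOError'; only the FAILED_ prefix is treated here
    # as a transient failure worth resubmitting
    return task_end_reason.startswith('FAILED_EXCEPTION_')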
Example #11
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # shuffle_id start from 1
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            with open(tmppath, 'wb') as f:
                f.write(data)
                f.close()
            path = swd.export(tmppath)
            data = '/'.join(
                [env.server_uri] + path.split('/')[-3:]
            )
            flag += 2

        return TaskState.finished, cPickle.dumps(((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        fatal_exceptions = (DparkUserFatalError, ArithmeticError,
                            ValueError, LookupError, SyntaxError,
                            TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
Example #12
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        # the source snippet is truncated here; the tail below is an assumed
        # reconstruction following the complete variant in Example #18
        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed, e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(traceback.format_exc()), None, None), -1)
    finally:
        gc.collect()
        gc.enable()
Example #13
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        # the source snippet is truncated here; the tail below is an assumed
        # reconstruction following the complete variant in Example #18
        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed, e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(traceback.format_exc()), None, None), -1)
    finally:
        gc.collect()
        gc.enable()
Example #14
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            f = open(path, 'wb')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        return TaskState.failed, 'FAILED_EXCEPTION_{}'.format(
            ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
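For oversized results, the URI sent back to the scheduler is assembled from the worker's HTTP server root plus the last three components of the output file path. Illustrative values only; the host, port, and directory layout below are assumptions:

server_uri = 'http://worker-host:5055'                   # assumed example value
path = '/data1/dpark/work/0/42/task_7.result'            # assumed example path
uri = '/'.join([server_uri] + path.split('/')[-3:])
# uri == 'http://worker-host:5055/0/42/task_7.result'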
Example #15
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))

        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        # the source snippet is truncated here; the tail below is an assumed
        # reconstruction following Example #5 above
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
Example #16
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))

        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        # the source snippet is truncated here; the tail below is an assumed
        # reconstruction following Example #5 above
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
Example #17
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            f = open(path, 'wb')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2

        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
Example #18
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception as e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = '/'.join(
                [LocalFileShuffle.getServerUri()] + path.split('/')[-3:]
            )
            flag += 2

        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None), -1)
    finally:
        close_mfs()
        gc.collect()
        gc.enable()
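marshalable() is dpark's own pre-check for whether a result can take the fast marshal path; even so, the code keeps the try/except fallback to cPickle because such a check is necessarily approximate. A rough illustration of the kind of test it performs, written for Python 2; this sketch is an assumption, not the real dpark helper:

def marshalable_sketch(obj, depth=3):
    # containers of simple builtin types marshal cleanly; anything else
    # (instances, functions, generators, ...) must fall back to cPickle
    simple = (type(None), bool, int, long, float, str, unicode)
    if isinstance(obj, simple):
        return True
    if depth <= 0:
        return False
    if isinstance(obj, (tuple, list, set, frozenset)):
        return all(marshalable_sketch(o, depth - 1) for o in obj)
    if isinstance(obj, dict):
        return all(marshalable_sketch(k, depth - 1) and
                   marshalable_sketch(v, depth - 1)
                   for k, v in obj.iteritems())
    return False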
Example #19
    def runJob(self, finalRdd, func, partitions, allowLocal):
        run_id = self.runJobTimes
        self.runJobTimes += 1
        outputParts = list(partitions)
        numOutputParts = len(partitions)
        finalStage = self.newStage(finalRdd, None)
        try:
            from dpark.web.ui.views.rddopgraph import StageInfo
            stage_info = StageInfo()
            stage_info.create_stage_info(finalStage)

            def create_stage_info_recur(cur_stage, is_final=False):
                if not cur_stage or cur_stage.id in self.idToRunJob:
                    return
                for par_stage in cur_stage.parents:
                    create_stage_info_recur(par_stage)
                if cur_stage.id not in self.idToRunJob:
                    self.idToRunJob[cur_stage.id] = StageInfo.idToStageInfo[
                        cur_stage.id]
                    self.idToRunJob[cur_stage.id].is_final = is_final

            create_stage_info_recur(finalStage, is_final=True)
        except ImportError:
            pass
        results = [None] * numOutputParts
        finished = [None] * numOutputParts
        last_finished = 0
        num_finished = 0

        waiting = set()
        running = set()
        failed = set()
        pendingTasks = {}  # stage -> set([task_id..])
        lastFetchFailureTime = 0

        self.updateCacheLocs()

        logger.debug('Final stage: %s, %d', finalStage, numOutputParts)
        logger.debug('Parents of final stage: %s', finalStage.parents)
        logger.debug('Missing parents: %s',
                     self.getMissingParentStages(finalStage))

        def onStageFinished(stage):
            def _(r, dep):
                return r._do_checkpoint()

            MutableDict.merge()
            walk_dependencies(stage.rdd, _)
            logger.info("stage %d finish %s", stage.id, stage.fmt_stats())

        if (allowLocal and (not finalStage.parents
                            or not self.getMissingParentStages(finalStage))
                and numOutputParts == 1):
            split = finalRdd.splits[outputParts[0]]
            yield func(finalRdd.iterator(split))
            onStageFinished(finalStage)
            return

        def submitStage(stage):
            stage.submit_time = time.time()
            logger.debug('submit stage %s', stage)
            if stage not in waiting and stage not in running:
                missing = self.getMissingParentStages(stage)
                if not missing:
                    submitMissingTasks(stage)
                    running.add(stage)
                else:
                    for parent in missing:
                        submitStage(parent)
                    waiting.add(stage)

        def submitMissingTasks(stage):
            myPending = pendingTasks.setdefault(stage, set())
            tasks = []
            have_prefer = True
            if stage == finalStage:
                for i in range(numOutputParts):
                    if not finished[i]:
                        part = outputParts[i]
                        if have_prefer:
                            locs = self.getPreferredLocs(finalRdd, part)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(
                            ResultTask(finalStage.id, finalStage.try_id, part,
                                       finalRdd, func, locs, i))
            else:
                for part in range(stage.numPartitions):
                    if not stage.outputLocs[part]:
                        if have_prefer:
                            locs = self.getPreferredLocs(stage.rdd, part)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(
                            ShuffleMapTask(stage.id, stage.try_id, part,
                                           stage.rdd, stage.shuffleDep, locs))
            logger.debug('add to pending %s tasks', len(tasks))
            myPending |= set(t.id for t in tasks)
            self.submitTasks(tasks)

        submitStage(finalStage)

        while num_finished != numOutputParts:
            try:
                evt = self.completionEvents.get(False)
            except queue.Empty:
                if (failed and
                        time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT):
                    self.updateCacheLocs()
                    for stage in failed:
                        logger.info('Resubmitting failed stages: %s', stage)
                        submitStage(stage)
                    failed.clear()
                else:
                    time.sleep(0.1)
                continue

            if evt is None:  # aborted
                for taskset in self.active_tasksets.values():
                    self.tasksetFinished(taskset)

                raise RuntimeError('TaskSet aborted!')

            task, reason = evt.task, evt.reason
            stage = self.idToStage[task.stage_id]
            if stage not in pendingTasks:  # stage from other taskset
                continue
            logger.debug('remove from pending %s from %s', task, stage)
            pendingTasks[stage].remove(task.id)
            if isinstance(reason, Success):
                Accumulator.merge(evt.accumUpdates)
                stage.task_stats[task.partition].append(evt.stats)
                if isinstance(task, ResultTask):
                    finished[task.outputId] = True
                    num_finished += 1
                    results[task.outputId] = evt.result

                    while last_finished < numOutputParts and finished[
                            last_finished]:
                        yield results[last_finished]
                        results[last_finished] = None
                        last_finished += 1

                    stage.finish()

                elif isinstance(task, ShuffleMapTask):
                    stage = self.idToStage[task.stage_id]
                    stage.addOutputLoc(task.partition, evt.result)
                    if all(stage.outputLocs):
                        stage.finish()
                        logger.debug(
                            '%s finished; looking for newly runnable stages',
                            stage)
                        if pendingTasks[stage]:
                            logger.warn(
                                'dirty stage %d with %d tasks'
                                '(select at most 10 tasks:%s) not clean',
                                stage.id, len(pendingTasks[stage]),
                                str(list(pendingTasks[stage])[:10]))
                            del pendingTasks[stage]
                        onStageFinished(stage)
                        running.remove(stage)
                        if stage.shuffleDep is not None:
                            self.mapOutputTracker.registerMapOutputs(
                                stage.shuffleDep.shuffleId,
                                [l[-1] for l in stage.outputLocs])
                        self.updateCacheLocs()
                        newlyRunnable = set(
                            stage for stage in waiting
                            if not self.getMissingParentStages(stage))
                        waiting -= newlyRunnable
                        running |= newlyRunnable
                        logger.debug('newly runnable: %s, %s', waiting,
                                     newlyRunnable)
                        for stage in newlyRunnable:
                            submitMissingTasks(stage)
            elif isinstance(reason, FetchFailed):
                if stage in running:
                    waiting.add(stage)
                    running.remove(stage)
                mapStage = self.shuffleToMapStage[reason.shuffleId]
                mapStage.removeHost(reason.serverUri)
                failed.add(mapStage)
                lastFetchFailureTime = time.time()
            else:
                logger.error('task %s failed: %s %s %s', task, reason,
                             type(reason), reason.message)
                raise Exception(reason.message)

        onStageFinished(finalStage)

        if not self.is_dstream:
            try:
                self.last_jobstats = self.get_stats(run_id)
                if self.loghub_dir:
                    names = ['sched', self.id, "job", run_id]
                    name = "_".join(map(str, names)) + ".json"
                    path = os.path.join(self.loghub_dir, name)
                    logger.info("writing profile to %s", path)
                    with open(path, 'w') as f:
                        json.dump(self.last_jobstats, f, indent=4)
            except Exception as e:
                logger.error("Fail to dump job stats: %s.", e)

        assert all(finished)
        return
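runJob is a generator: it yields one result per requested partition, in partition order, as soon as a contiguous prefix of partitions has finished. A hedged sketch of how a caller might drive it; scheduler, rdd, and the collecting function below are placeholders, not dpark's public API:

def collect(iterator):
    # runs inside each ResultTask; materializes one partition
    return list(iterator)

partitions = range(len(rdd.splits))      # placeholder rdd
results = []
for partition_result in scheduler.runJob(rdd, collect, partitions, False):
    results.append(partition_result)     # arrives in partition order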
Example #20
    def runJob(self, finalRdd, func, partitions, allowLocal):
        outputParts = list(partitions)
        numOutputParts = len(partitions)
        finalStage = self.newStage(finalRdd, None)
        results = [None]*numOutputParts
        finished = [None]*numOutputParts
        lastFinished = 0
        numFinished = 0

        waiting = set()
        running = set()
        failed = set()
        pendingTasks = {}
        lastFetchFailureTime = 0

        self.updateCacheLocs()

        logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
        logger.debug("Parents of final stage: %s", finalStage.parents)
        logger.debug("Missing parents: %s", self.getMissingParentStages(finalStage))

        if allowLocal and (not finalStage.parents or not self.getMissingParentStages(finalStage)) and numOutputParts == 1:
            split = finalRdd.splits[outputParts[0]]
            yield func(finalRdd.iterator(split))
            return

        def submitStage(stage):
            logger.debug("submit stage %s", stage)
            if stage not in waiting and stage not in running:
                missing = self.getMissingParentStages(stage)
                if not missing:
                    submitMissingTasks(stage)
                    running.add(stage)
                else:
                    for parent in missing:
                        submitStage(parent)
                    waiting.add(stage)

        def submitMissingTasks(stage):
            myPending = pendingTasks.setdefault(stage, set())
            tasks = []
            have_prefer = True
            if stage == finalStage:
                for i in range(numOutputParts):
                    if not finished[i]:
                        part = outputParts[i]
                        if have_prefer:
                            locs = self.getPreferredLocs(finalRdd, part)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(ResultTask(finalStage.id, finalRdd,
                            func, part, locs, i))
            else:
                for p in range(stage.numPartitions):
                    if not stage.outputLocs[p]:
                        if have_prefer:
                            locs = self.getPreferredLocs(stage.rdd, p)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(ShuffleMapTask(stage.id, stage.rdd,
                            stage.shuffleDep, p, locs))
            logger.debug("add to pending %s tasks", len(tasks))
            myPending |= set(t.id for t in tasks)
            self.submitTasks(tasks)

        submitStage(finalStage)

        while numFinished != numOutputParts:
            try:
                evt = self.completionEvents.get(False)
            except Queue.Empty:
                self.check()
                if self._shutdown:
                    sys.exit(1)

                if failed and time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT:
                    self.updateCacheLocs()
                    for stage in failed:
                        logger.info("Resubmitting failed stages: %s", stage)
                        submitStage(stage)
                    failed.clear()
                else:
                    time.sleep(0.1)
                continue
               
            task, reason = evt.task, evt.reason
            stage = self.idToStage[task.stageId]
            if stage not in pendingTasks: # stage from other job
                continue
            logger.debug("remove from pedding %s from %s", task, stage)
            pendingTasks[stage].remove(task.id)
            if isinstance(reason, Success):
                Accumulator.merge(evt.accumUpdates)
                if isinstance(task, ResultTask):
                    finished[task.outputId] = True
                    numFinished += 1
                    results[task.outputId] = evt.result
                    while lastFinished < numOutputParts and finished[lastFinished]:
                        yield results[lastFinished]
                        results[lastFinished] = None
                        lastFinished += 1

                elif isinstance(task, ShuffleMapTask):
                    stage = self.idToStage[task.stageId]
                    stage.addOutputLoc(task.partition, evt.result)
                    if not pendingTasks[stage]:
                        logger.debug("%s finished; looking for newly runnable stages", stage)
                        running.remove(stage)
                        if stage.shuffleDep != None:
                            self.mapOutputTracker.registerMapOutputs(
                                    stage.shuffleDep.shuffleId,
                                    [l[-1] for l in stage.outputLocs])
                        self.updateCacheLocs()
                        newlyRunnable = set(stage for stage in waiting if not self.getMissingParentStages(stage))
                        waiting -= newlyRunnable
                        running |= newlyRunnable
                        logger.debug("newly runnable: %s, %s", waiting, newlyRunnable)
                        for stage in newlyRunnable:
                            submitMissingTasks(stage)
            elif isinstance(reason, FetchFailed):
                if stage in running:
                    running.remove(stage)
                    waiting.add(stage)
                mapStage = self.shuffleToMapStage[reason.shuffleId]
                mapStage.removeHost(reason.serverUri)
                failed.add(mapStage)
                lastFetchFailureTime = time.time()
            else:
                logger.error("task %s failed: %s %s %s", task, reason, type(reason), reason.message)
                raise Exception(reason.message)

        assert not any(results)
        return
Example #21
    def runJob(self, finalRdd, func, partitions, allowLocal):
        outputParts = list(partitions)
        numOutputParts = len(partitions)
        finalStage = self.newStage(finalRdd, None)
        results = [None] * numOutputParts
        finished = [None] * numOutputParts
        lastFinished = 0
        numFinished = 0

        waiting = set()
        running = set()
        failed = set()
        pendingTasks = {}
        lastFetchFailureTime = 0

        self.updateCacheLocs()

        logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
        logger.debug("Parents of final stage: %s", finalStage.parents)
        logger.debug("Missing parents: %s",
                     self.getMissingParentStages(finalStage))

        if allowLocal and (not finalStage.parents
                           or not self.getMissingParentStages(finalStage)
                           ) and numOutputParts == 1:
            split = finalRdd.splits[outputParts[0]]
            yield func(finalRdd.iterator(split))
            return

        def submitStage(stage):
            logger.debug("submit stage %s", stage)
            if stage not in waiting and stage not in running:
                missing = self.getMissingParentStages(stage)
                if not missing:
                    submitMissingTasks(stage)
                    running.add(stage)
                else:
                    for parent in missing:
                        submitStage(parent)
                    waiting.add(stage)

        def submitMissingTasks(stage):
            myPending = pendingTasks.setdefault(stage, set())
            tasks = []
            have_prefer = True
            if stage == finalStage:
                for i in range(numOutputParts):
                    if not finished[i]:
                        part = outputParts[i]
                        if have_prefer:
                            locs = self.getPreferredLocs(finalRdd, part)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(
                            ResultTask(finalStage.id, finalRdd, func, part,
                                       locs, i))
            else:
                for p in range(stage.numPartitions):
                    if not stage.outputLocs[p]:
                        if have_prefer:
                            locs = self.getPreferredLocs(stage.rdd, p)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(
                            ShuffleMapTask(stage.id, stage.rdd,
                                           stage.shuffleDep, p, locs))
            logger.debug("add to pending %s tasks", len(tasks))
            myPending |= set(t.id for t in tasks)
            self.submitTasks(tasks)

        submitStage(finalStage)

        while numFinished != numOutputParts:
            try:
                evt = self.completionEvents.get(False)
            except Queue.Empty:
                self.check()
                if self._shutdown:
                    sys.exit(1)

                if failed and time.time(
                ) > lastFetchFailureTime + RESUBMIT_TIMEOUT:
                    self.updateCacheLocs()
                    for stage in failed:
                        logger.info("Resubmitting failed stages: %s", stage)
                        submitStage(stage)
                    failed.clear()
                else:
                    time.sleep(0.1)
                continue

            task, reason = evt.task, evt.reason
            stage = self.idToStage[task.stageId]
            if stage not in pendingTasks:  # stage from other job
                continue
            logger.debug("remove from pending %s from %s", task, stage)
            pendingTasks[stage].remove(task.id)
            if isinstance(reason, Success):
                Accumulator.merge(evt.accumUpdates)
                if isinstance(task, ResultTask):
                    finished[task.outputId] = True
                    numFinished += 1
                    results[task.outputId] = evt.result
                    while lastFinished < numOutputParts and finished[
                            lastFinished]:
                        yield results[lastFinished]
                        results[lastFinished] = None
                        lastFinished += 1

                elif isinstance(task, ShuffleMapTask):
                    stage = self.idToStage[task.stageId]
                    stage.addOutputLoc(task.partition, evt.result)
                    if not pendingTasks[stage] and all(stage.outputLocs):
                        logger.debug(
                            "%s finished; looking for newly runnable stages",
                            stage)
                        MutableDict.merge()
                        running.remove(stage)
                        if stage.shuffleDep != None:
                            self.mapOutputTracker.registerMapOutputs(
                                stage.shuffleDep.shuffleId,
                                [l[-1] for l in stage.outputLocs])
                        self.updateCacheLocs()
                        newlyRunnable = set(
                            stage for stage in waiting
                            if not self.getMissingParentStages(stage))
                        waiting -= newlyRunnable
                        running |= newlyRunnable
                        logger.debug("newly runnable: %s, %s", waiting,
                                     newlyRunnable)
                        for stage in newlyRunnable:
                            submitMissingTasks(stage)
            elif isinstance(reason, FetchFailed):
                if stage in running:
                    waiting.add(stage)
                mapStage = self.shuffleToMapStage[reason.shuffleId]
                mapStage.removeHost(reason.serverUri)
                failed.add(mapStage)
                lastFetchFailureTime = time.time()
            else:
                logger.error("task %s failed: %s %s %s", task, reason,
                             type(reason), reason.message)
                raise Exception(reason.message)

        MutableDict.merge()
        assert not any(results)
        return
Example #22
    def accumulator(self, init=0, param=None):
        return Accumulator(init, param)
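Example #22 is the context-level factory; the per-task protocol that the run_task and runJob examples rely on is: the worker clears local deltas before running, collects them with values() afterwards, ships them alongside the task result, and the scheduler folds them in with merge(). A minimal sketch of that cycle in a single process; the import path and the add attribute are assumed from the examples above, not verified:

from dpark.accumulator import Accumulator   # assumed module path

acc = Accumulator(0)              # what self.accumulator(0) above returns
Accumulator.clear()               # worker side: drop any stale local deltas
acc.add(5)                        # user code inside the task body
updates = Accumulator.values()    # worker side: {accumulator id: local delta}
Accumulator.merge(updates)        # scheduler side: fold deltas back into the driver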
Example #23
    def runJob(self, finalRdd, func, partitions, allowLocal):
        outputParts = list(partitions)
        numOutputParts = len(partitions)
        finalStage = self.newStage(finalRdd, None)
        try:
            from dpark.web.ui.views.rddopgraph import StageInfo
            stage_info = StageInfo()
            stage_info.create_stage_info(finalStage)

            def create_stage_info_recur(cur_stage, is_final=False):
                if not cur_stage or cur_stage.id in self.idToRunJob:
                    return
                for par_stage in cur_stage.parents:
                    create_stage_info_recur(par_stage)
                if cur_stage.id not in self.idToRunJob:
                    self.idToRunJob[cur_stage.id] = StageInfo.idToStageInfo[cur_stage.id]
                    self.idToRunJob[cur_stage.id].is_final = is_final

            create_stage_info_recur(finalStage, is_final=True)
        except ImportError:
            pass
        results = [None] * numOutputParts
        finished = [None] * numOutputParts
        lastFinished = 0
        numFinished = 0

        waiting = set()
        running = set()
        failed = set()
        pendingTasks = {}
        lastFetchFailureTime = 0

        self.updateCacheLocs()

        logger.debug('Final stage: %s, %d', finalStage, numOutputParts)
        logger.debug('Parents of final stage: %s', finalStage.parents)
        logger.debug(
            'Missing parents: %s',
            self.getMissingParentStages(finalStage))

        def onStageFinished(stage):
            def _(r, dep):
                return r._do_checkpoint()

            MutableDict.merge()
            walk_dependencies(stage.rdd, _)

        if (allowLocal and
                (
                    not finalStage.parents or
                    not self.getMissingParentStages(finalStage)
                ) and numOutputParts == 1):
            split = finalRdd.splits[outputParts[0]]
            yield func(finalRdd.iterator(split))
            onStageFinished(finalStage)
            return

        def submitStage(stage):
            logger.debug('submit stage %s', stage)
            if stage not in waiting and stage not in running:
                missing = self.getMissingParentStages(stage)
                if not missing:
                    submitMissingTasks(stage)
                    running.add(stage)
                else:
                    for parent in missing:
                        submitStage(parent)
                    waiting.add(stage)

        def submitMissingTasks(stage):
            myPending = pendingTasks.setdefault(stage, set())
            tasks = []
            have_prefer = True
            if stage == finalStage:
                for i in range(numOutputParts):
                    if not finished[i]:
                        part = outputParts[i]
                        if have_prefer:
                            locs = self.getPreferredLocs(finalRdd, part)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(ResultTask(finalStage.id, finalRdd,
                                                func, part, locs, i))
            else:
                for p in range(stage.numPartitions):
                    if not stage.outputLocs[p]:
                        if have_prefer:
                            locs = self.getPreferredLocs(stage.rdd, p)
                            if not locs:
                                have_prefer = False
                        else:
                            locs = []
                        tasks.append(ShuffleMapTask(stage.id, stage.rdd,
                                                    stage.shuffleDep, p, locs))
            logger.debug('add to pending %s tasks', len(tasks))
            myPending |= set(t.id for t in tasks)
            self.submitTasks(tasks)

        submitStage(finalStage)

        while numFinished != numOutputParts:
            try:
                evt = self.completionEvents.get(False)
            except Queue.Empty:
                self.check()
                if self._shutdown:
                    sys.exit(1)

                if (failed and
                        time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT):
                    self.updateCacheLocs()
                    for stage in failed:
                        logger.info('Resubmitting failed stages: %s', stage)
                        submitStage(stage)
                    failed.clear()
                else:
                    time.sleep(0.1)
                continue

            task, reason = evt.task, evt.reason
            stage = self.idToStage[task.stageId]
            if stage not in pendingTasks:  # stage from other job
                continue
            logger.debug('remove from pending %s from %s', task, stage)
            pendingTasks[stage].remove(task.id)
            if isinstance(reason, Success):
                Accumulator.merge(evt.accumUpdates)
                if isinstance(task, ResultTask):
                    finished[task.outputId] = True
                    numFinished += 1
                    results[task.outputId] = evt.result
                    while lastFinished < numOutputParts and finished[
                            lastFinished]:
                        yield results[lastFinished]
                        results[lastFinished] = None
                        lastFinished += 1

                elif isinstance(task, ShuffleMapTask):
                    stage = self.idToStage[task.stageId]
                    stage.addOutputLoc(task.partition, evt.result)
                    if not pendingTasks[stage] and all(stage.outputLocs):
                        logger.debug(
                            '%s finished; looking for newly runnable stages',
                            stage
                        )
                        onStageFinished(stage)
                        running.remove(stage)
                        if stage.shuffleDep is not None:
                            self.mapOutputTracker.registerMapOutputs(
                                stage.shuffleDep.shuffleId,
                                [l[-1] for l in stage.outputLocs])
                        self.updateCacheLocs()
                        newlyRunnable = set(
                            stage for stage in waiting
                            if not self.getMissingParentStages(stage)
                        )
                        waiting -= newlyRunnable
                        running |= newlyRunnable
                        logger.debug(
                            'newly runnable: %s, %s', waiting, newlyRunnable)
                        for stage in newlyRunnable:
                            submitMissingTasks(stage)
            elif isinstance(reason, FetchFailed):
                if stage in running:
                    waiting.add(stage)
                mapStage = self.shuffleToMapStage[reason.shuffleId]
                mapStage.removeHost(reason.serverUri)
                failed.add(mapStage)
                lastFetchFailureTime = time.time()
            else:
                logger.error(
                    'task %s failed: %s %s %s',
                    task,
                    reason,
                    type(reason),
                    reason.message)
                raise Exception(reason.message)

        onStageFinished(finalStage)
        assert not any(results)
        return