Exemplo n.º 1
0
Arquivo: dag.py Projeto: cxz/ploomber
    def _build(self, force, show_progress):
        """Render the DAG, run it through the executor and fire the hooks.

        Parameters
        ----------
        force : bool
            Forwarded to ``self.render``. It is currently *not* forwarded to
            the executor (see the FIXME below).
        show_progress : bool
            Forwarded to the executor. Rendering never shows progress here.

        Returns
        -------
        BuildReport or None
            The build report when the executor and the ``on_finish`` hook
            both succeed. ``None`` when execution is stopped early through
            ``DAGBuildEarlyStop`` (raised by the executor or by a hook).

        Raises
        ------
        DAGBuildError
            If the DAG failed to render, if the executor reported a build
            error (re-raised after running ``on_failure``), or if a hook
            raised anything other than ``DAGBuildEarlyStop``.
        """
        # always render before building (the function might immediately
        # return if the user turned render status caching on)
        # Do not show progress - should only be displayed when .render is
        # called directly
        self.render(force=force, show_progress=False)

        if self._exec_status == DAGStatus.ErroredRender:
            raise DAGBuildError('Cannot build dag that failed to render, '
                                'fix rendering errors then build again. '
                                'To see the full traceback again, run '
                                'dag.render(force=True)')

        self._logger.info('Building DAG %s', self)

        # formatted tracebacks keyed by stage ('build', 'on_finish'), handed
        # to the on_failure hook
        tb = {}

        try:
            # within_dag flags when we execute a task in isolation
            # vs as part of a dag execution
            # FIXME: not passing force flag
            task_reports = self._executor(dag=self,
                                          show_progress=show_progress)

        # executors raise this error to signal that there was an error
        # building the dag, this allows us to run the on_failure hook,
        # but any other errors should not be caught (e.g.
        # a user might turn that setting off in the executor to start
        # a debugging session at the line of failure)
        except DAGBuildError as e:
            tb['build'] = traceback.format_exc()
            self._exec_status = DAGStatus.Errored
            build_exception = e
        except DAGBuildEarlyStop as e:
            # early stop and empty on_failure, nothing left to do
            if self.on_failure is None:
                return

            # early stop with an on_failure hook: remember the exception so
            # the hook below runs. Without this assignment the name would be
            # unbound and the check below would raise UnboundLocalError
            tb['build'] = traceback.format_exc()
            build_exception = e
        else:
            # no error when building dag
            build_exception = None

        if build_exception is None:
            # skipped tasks produce no report from the executor, add empty
            # placeholders so they still show up in the build report
            empty = [
                TaskReport.empty_with_name(t.name) for t in self.values()
                if t.exec_status == TaskStatus.Skipped
            ]

            build_report = BuildReport(task_reports + empty)
            self._logger.info(' DAG report:\n{}'.format(build_report))

            # try on_finish hook
            try:
                self._run_on_finish(build_report)
            except Exception as e:
                tb['on_finish'] = traceback.format_exc()
                # on_finish error, log exception and set status
                msg = ('Exception when running on_finish '
                       'for DAG "{}": {}'.format(self.name, e))
                self._logger.exception(msg)
                self._exec_status = DAGStatus.Errored

                if isinstance(e, DAGBuildEarlyStop):
                    # early stop, nothing left to do
                    return

                # otherwise raise exception
                raise DAGBuildError(msg) from e

            # DAG success and on_finish did not raise exception
            self._exec_status = DAGStatus.Executed
            return build_report

        # DAG raised error (or stopped early), run on_failure hook
        try:
            self._run_on_failure(tb)
        except Exception as e:
            # error in hook, log exception
            msg = ('Exception when running on_failure '
                   'for DAG "{}": {}'.format(self.name, e))
            self._logger.exception(msg)

            # do not raise exception if early stop
            if isinstance(e, DAGBuildEarlyStop):
                return

            raise DAGBuildError(msg) from e

        # on_failure hook executed. An early stop is a request to end the
        # build, not a failure to report, so it is not re-raised (same
        # treatment as an early stop coming from the hooks above)
        if isinstance(build_exception, DAGBuildEarlyStop):
            return

        # raise original exception
        raise build_exception
Exemplo n.º 2
0
    def __call__(self, dag, show_progress):
        """Build the scheduled tasks of ``dag`` one at a time.

        Tasks in ``TaskStatus.Skipped`` status are not scheduled. A task
        found in ``TaskStatus.Aborted`` status when its turn comes is passed
        over. Each build call is wrapped in ``LazyFunction`` layers chosen
        from ``self._build_in_subprocess``, ``self._catch_warnings`` and
        ``self._catch_exceptions``.

        Parameters
        ----------
        dag
            DAG to build; iterated by task name and indexed to fetch tasks.
        show_progress : bool
            When true, iterate through a ``tqdm`` progress bar and update
            its description with the name of the task being built.

        Returns
        -------
        list
            The ``task_reports`` list that is handed to every build function
            as ``reports_all`` (presumably filled in by those functions).

        Raises
        ------
        DAGBuildEarlyStop
            If exceptions were collected and at least one of them wraps a
            ``DAGBuildEarlyStop``.
        DAGBuildError
            If exceptions were collected and none of them is an early stop.
        """
        super().__call__(dag)

        collected_exceptions = BuildExceptionsCollector()
        collected_warnings = BuildWarningsCollector()
        task_reports = []

        build_kwargs = {'catch_exceptions': self._catch_exceptions}

        pending = [
            dag[name] for name in dag
            if dag[name].exec_status != TaskStatus.Skipped
        ]

        # the progress bar wraps the very same list of pending tasks
        if show_progress:
            iterator = tqdm(pending, total=len(pending))
        else:
            iterator = pending

        # both build functions take the same keyword arguments, only the
        # place where the task runs differs
        if self._build_in_subprocess:
            build_fn = build_in_subprocess
        else:
            build_fn = build_in_current_process

        for task in iterator:
            if task.exec_status == TaskStatus.Aborted:
                continue

            if show_progress:
                iterator.set_description('Building task "{}"'.format(
                    task.name))

            # innermost layer: the actual build call
            step = LazyFunction(
                build_fn, {
                    'task': task,
                    'build_kwargs': build_kwargs,
                    'reports_all': task_reports
                }, task)

            # middle layer: warnings handling
            if self._catch_warnings:
                warn_kwargs = {
                    'fn': step,
                    'warnings_all': collected_warnings
                }
                step = LazyFunction(fn=catch_warnings,
                                    kwargs=warn_kwargs,
                                    task=task)
            else:
                # NOTE: this isn't doing anything
                step = LazyFunction(fn=pass_exceptions,
                                    kwargs={'fn': step},
                                    task=task)

            # outermost layer: exception collection (optional)
            if self._catch_exceptions:
                exc_kwargs = {
                    'fn': step,
                    'exceptions_all': collected_exceptions
                }
                step = LazyFunction(fn=catch_exceptions,
                                    kwargs=exc_kwargs,
                                    task=task)

            step()

        # all tasks processed, report whatever was collected along the way

        if collected_warnings and self._catch_warnings:
            # NOTE: maybe raise one by one to keep the warning type
            warnings.warn(str(collected_warnings))

        if collected_exceptions and self._catch_exceptions:
            stopped_early = any(
                isinstance(m.obj, DAGBuildEarlyStop)
                for m in collected_exceptions)

            if stopped_early:
                raise DAGBuildEarlyStop('Ealy stopping DAG execution, '
                                        'at least one of the tasks that '
                                        'failed raised a DAGBuildEarlyStop '
                                        'exception:\n{}'.format(
                                            str(collected_exceptions)))

            raise DAGBuildError(str(collected_exceptions))

        return task_reports
Exemplo n.º 3
0
    def __call__(self, dag, show_progress):
        """Build the DAG by submitting tasks to a ``ProcessPoolExecutor``.

        The main loop polls ``next_task`` and submits every task that is in
        ``TaskStatus.WaitingExecution`` status and has not been started yet.
        A done-callback on each future updates the task status (and product
        metadata on success) and records the task as finished. The loop ends
        when every task is finished or a task is flagged with
        ``TaskStatus.BrokenProcessPool``.

        Parameters
        ----------
        dag
            DAG to build; iterated by task name and indexed to fetch tasks.
        show_progress
            Accepted for interface compatibility; this method does not use
            it.

        Returns
        -------
        list
            First element (the report) of every future's result.

        Raises
        ------
        DAGBuildError
            If at least one future's result is a ``Message``.
        """
        super().__call__(dag)

        # TODO: Have to test this with other Tasks, especially the ones that
        # use clients - have to make sure they are serialized correctly

        # names of finished tasks. This is a set on purpose: next_task
        # re-registers aborted tasks on every poll, a list would grow
        # without bound while the loop below spins waiting for futures
        done = set()
        started = []
        set_all = set(dag)
        future_mapping = {}

        # there might be up-to-date tasks, add them to done
        # FIXME: this only happens when the dag is already build and then
        # then try to build again (in the same session), if the session
        # is restarted even up-to-date tasks will be WaitingExecution again
        # this is a bit confusing, so maybe change WaitingExecution
        # to WaitingBuild?
        for name in dag:
            if dag[name].exec_status in {
                    TaskStatus.Executed, TaskStatus.Skipped
            }:
                done.add(dag[name].name)

        def callback(future):
            """Keep track of finished tasks
            """
            task = future_mapping[future]
            self._logger.debug('Added %s to the list of finished tasks...',
                               task.name)
            try:
                result = future.result()
            except BrokenProcessPool:
                # ignore the error here but flag the task,
                # so next_task is able to stop the iteration,
                # when we call result after breaking the loop,
                # this will show up
                task.exec_status = TaskStatus.BrokenProcessPool
            else:
                if isinstance(result, Message):
                    task.exec_status = TaskStatus.Errored
                # successfully run task._build
                else:
                    # ignore report here, we just need the metadata to
                    # update it
                    _, meta = result
                    task.product.metadata.update_locally(meta)
                    task.exec_status = TaskStatus.Executed

            done.add(task.name)

        def next_task():
            """
            Return the next Task to execute, returns None if no Tasks are
            available for execution (cause their dependencies are not done yet)
            and raises a StopIteration exception if there are no more tasks to
            run, which means the DAG is done
            """
            for task in dag.values():
                if task.exec_status == TaskStatus.Aborted:
                    # aborted tasks will never run, count them as finished
                    done.add(task.name)
                elif task.exec_status == TaskStatus.BrokenProcessPool:
                    raise StopIteration

            # iterate over tasks to find which is ready for execution
            for task in dag.values():
                # ignore tasks that are already started, I should probably add
                # an executing status but that cannot exist in the task itself,
                # maybe in the manager?
                if (task.exec_status == TaskStatus.WaitingExecution
                        and task not in started):
                    return task
                # there might be some up-to-date tasks, add them

            # NOTE(review): done callbacks may run on a different thread
            # than this loop, so work on a snapshot - confirm
            set_done = set(done)

            # only log every 50000 polls to avoid flooding the log
            if not self._i % 50000:
                self._logger.debug('Finished tasks so far: %s', set_done)
                self._logger.debug('Remaining tasks: %s', set_all - set_done)
                self._logger.info('Finished %i out of %i tasks', len(set_done),
                                  len(set_all))

            if set_done == set_all:
                self._logger.debug('All tasks done')

                raise StopIteration

            self._i += 1

        task_kwargs = {'catch_exceptions': True}

        with ProcessPoolExecutor(max_workers=self.processes) as pool:
            while True:
                try:
                    task = next_task()
                except StopIteration:
                    break
                else:
                    if task is not None:
                        future = pool.submit(TaskBuildWrapper(task),
                                             **task_kwargs)
                        # the callback function uses the future mapping
                        # so add it before registering the callback, otherwise
                        # it might break and hang the whole process
                        future_mapping[future] = task
                        future.add_done_callback(callback)
                        started.append(task)
                        # use the executor's own logger like the rest of
                        # this method; the module-level logging.info call
                        # implicitly configures the root logger
                        self._logger.info('Added %s to the pool...',
                                          task.name)

        results = [
            # results are the output of Task._build: (report, metadata)
            # OR a Message
            get_future_result(f, future_mapping)
            for f in future_mapping
        ]

        exps = [r for r in results if isinstance(r, Message)]

        if exps:
            raise DAGBuildError(str(BuildExceptionsCollector(exps)))

        # if we reach this, it means no tasks failed. only return reports
        return [r[0] for r in results]