def sync_task_run_attempts_retries(self, ti_status):
    """
    Push Airflow retry attempts into the current Databand run.

    Every task instance of every active dag run in ``ti_status`` is compared
    with its Databand task run (looked up by the Airflow task id).  When the
    Airflow try number differs from the attempt number stored on the task
    run, the task run is updated in memory, handed to the tracking store and
    reported through ``report_airflow_task_instance``.

    :param ti_status: status object exposing ``active_runs``
    """
    dbnd_run = get_databand_run()

    # flatten "dag runs -> task instances" lazily, keeping the original call order
    all_task_instances = (
        task_instance
        for dag_run in ti_status.active_runs
        for task_instance in dag_run.get_task_instances()
    )

    for ti in all_task_instances:
        task_run = dbnd_run.get_task_run_by_af_id(ti.task_id)  # type: TaskRun

        # looking for retry tasks
        airflow_try_number = get_af_task_try_number(ti)
        if not task_run or task_run.attempt_number == airflow_try_number:
            # unknown to Databand, or already on the same attempt
            continue

        self.log.info(
            "Found a new attempt for task %60s (%s -> %s) in Airflow. Submitting to Databand.",
            ti.task_id,
            task_run.attempt_number,
            airflow_try_number,
        )

        # the in-memory object goes first, the tracker is synced from it
        task_run.update_task_run_attempt(airflow_try_number)
        dbnd_run.tracker.tracking_store.add_task_runs(
            run=dbnd_run, task_runs=[task_run]
        )
        report_airflow_task_instance(ti.dag_id, ti.execution_date, [task_run])
def create_dynamic_task(func_call):
    # type: (FuncCall) -> Task
    """
    Build the task object for a decorated function called from inside a running task.

    Positional arguments are converted to keyword arguments when the task
    class carries a decorator spec.  Every keyword value that the current run
    can trace back to an origin target is wrapped into an ``InlineTarget``.
    Pipeline classes are re-wrapped with ``pipeline(...)`` and instantiated
    as-is; regular tasks are additionally registered as upstream of the
    currently running task.

    :param func_call: call description (``task_cls``, ``call_args``, ``call_kwargs``)
    :return: the instantiated task
    """
    task_cls = func_call.task_cls
    call_args = func_call.call_args
    call_kwargs = func_call.call_kwargs.copy()  # never touch the caller's kwargs

    from dbnd import pipeline, PipelineTask
    from dbnd._core.decorator.dbnd_decorator import _default_output

    parent_task = current_task_run().task
    dbnd_run = get_databand_run()

    decorator_spec = task_cls._conf__decorator_spec
    if decorator_spec is not None:
        call_args, call_kwargs = args_to_kwargs(
            decorator_spec.args, call_args, call_kwargs
        )

    # Map all kwargs to the "original" target of those objects,
    # for example: for a DataFrame we try to find the target it was read from
    origins_by_param = dbnd_run.target_origin.get_for_map(call_kwargs)
    for param_name, value_origin in origins_by_param.items():
        origin_target = value_origin.origin_target
        call_kwargs[param_name] = InlineTarget(
            root_target=origin_target,
            obj=call_kwargs[param_name],
            value_type=value_origin.value_type,
            source=origin_target.source,
            path=getattr(origin_target, "path", None),
        )

    call_kwargs.setdefault("task_is_dynamic", True)
    call_kwargs.setdefault(
        "task_in_memory_outputs",
        parent_task.settings.dynamic_task.in_memory_outputs,
    )

    if issubclass(task_cls, PipelineTask):
        # pipeline: override _task_default_result to be an object instead of a target
        # and hand the inline pipeline back without wiring it to the parent
        pipeline_cls = pipeline(
            decorator_spec.item, _task_default_result=_default_output
        ).task_cls
        return pipeline_cls(*call_args, **call_kwargs)

    # regular inline task
    dynamic_task = task_cls(*call_args, **call_kwargs)

    # update upstream/downstream relations - needed for correct tracking;
    # the task may already be an upstream, as it could have been executed already
    if not parent_task.task_dag.has_upstream(dynamic_task):
        parent_task.set_upstream(dynamic_task)
    return dynamic_task
def dbnd_execute_airflow_operator(airflow_operator, context):
    """
    Run an Airflow operator, routing it through the Databand runner when applicable.

    The operator's own ``execute`` is used directly when it has no
    ``dbnd_task_id``, when it is a ``DbndOperator``, or when the matching
    Databand task is not an ``AirflowOperatorAsDbndTask``.  Otherwise the
    Databand task run's runner executes it.

    :param airflow_operator: operator instance to execute
    :param context: Airflow execution context passed through to ``execute``
    :return: whatever the selected ``execute`` call returns
    """
    dbnd_task_id = getattr(airflow_operator, "dbnd_task_id", None)

    # not a databand-managed operator, or an operator wrapped/created by databand
    if not dbnd_task_id or isinstance(airflow_operator, DbndOperator):
        return airflow_operator.execute(context)

    from dbnd._core.current import get_databand_run

    # native Airflow operator that is part of a databand dag
    task_run = get_databand_run().get_task_run_by_id(dbnd_task_id)
    if not isinstance(task_run.task, AirflowOperatorAsDbndTask):
        logging.info(
            "Found airflow operator with dbnd_task_id that can not be run by dbnd: %s",
            airflow_operator,
        )
        return airflow_operator.execute(context)

    # the operator object was "templated" and copied - refresh the task with the latest one
    task_run.task.airflow_op = airflow_operator
    return task_run.runner.execute(context)
def _task_submit(self):
    """
    Submit this task through the spark controller, or run it in place.

    When ``_use_spark_context_inplace()`` is true the task is run directly via
    ``_task_run()``.  Otherwise the driver dump is synced through the spark
    controller and ``_application_args`` is set to the ``execute ... task
    --task-id`` command line before the databand CLI entry script is run as a
    pyspark script.

    :raises DatabandConfigError: if the run's driver dump does not exist
    """
    spark_ctrl = self._get_spark_ctrl()

    if self._use_spark_context_inplace():
        logger.info(
            "Reusing existing spark session in inline task "
            "due to spark_local.enable_spark_context_inplace"
        )
        return self._task_run()

    if not get_databand_run().driver_dump.exists():
        raise DatabandConfigError(
            "Please configure your cloud to always_save_pipeline=True, we need to pickle pipeline first"
        )

    dump_location = self.current_task_run.run.driver_task.driver_dump
    synced_dump = spark_ctrl.sync(dump_location)
    self._application_args = [
        "execute",
        "--dbnd-run",
        synced_dump,
        "task",
        "--task-id",
        self.task_id,
    ]
    if spark_ctrl.config.disable_tracking_api:
        # the flag goes right after the "execute" command
        self._application_args.insert(1, "--disable-tracking-api")

    entry_script = databand_lib_path("_core", "cli", "main.py")
    return spark_ctrl.run_pyspark(pyspark_script=entry_script)
def read_metrics(metrics_task_id):
    # type: ( str) -> Dict
    """
    Read the metrics stored in the attempt folder of a task run.

    :param metrics_task_id: id used to look the task run up in the current run
    :return: result of ``read_task_metrics`` for that task run's attempt folder
    """
    source_task_run = get_databand_run().get_task_run(metrics_task_id)
    return read_task_metrics(source_task_run.attempt_folder)
def _update_databand_task_run_states(self, run):
    """
    Sync task states from Airflow into DBND for one dag run.

    SUCCESS and FAILED are copied onto the in-memory task runs only.
    UPSTREAM_FAILED task instances are collected first and processed after the
    whole pass, so that the state chosen by
    ``get_upstream_failed_task_run_state`` is computed once the child states
    are known; only those task runs are sent to the tracker, in one batch.

    :param run: Airflow dag run exposing ``get_task_instances()``
    """
    dbnd_run = get_databand_run()

    # (airflow state, dbnd state) pairs that are mirrored in memory only
    in_memory_states = (
        (State.SUCCESS, TaskRunState.SUCCESS),
        (State.FAILED, TaskRunState.FAILED),
    )

    # Airflow doesn't manage sub-pipelines, so failures of child tasks have to be
    # processed before we can decide about the parent sub-pipeline
    pending_upstream_failed = []  # type: typing.List[TaskInstance]

    for ti in run.get_task_instances():
        task_run = dbnd_run.get_task_run_by_af_id(ti.task_id)  # type: TaskRun
        if not task_run:
            continue

        # UPSTREAM FAILED tasks are not going to "run", so no code will update their state
        if (
            ti.state == State.UPSTREAM_FAILED
            and task_run.task_run_state != TaskRunState.UPSTREAM_FAILED
        ):
            pending_upstream_failed.append(ti)

        for airflow_state, dbnd_state in in_memory_states:
            if ti.state == airflow_state and task_run.task_run_state != dbnd_state:
                task_run.set_task_run_state(dbnd_state, track=False)

    # last step: by now the states of the child tasks are known
    changed_task_runs = []
    for ti in pending_upstream_failed:
        task_run = dbnd_run.get_task_run_by_af_id(ti.task_id)  # type: TaskRun
        new_state = dbnd_run.get_upstream_failed_task_run_state(task_run)
        logger.info("Setting %s to %s", task_run.task.task_id, new_state)
        task_run.set_task_run_state(new_state, track=False)
        changed_task_runs.append(task_run)

    # optimization: write all updates in one batch
    if changed_task_runs:
        dbnd_run.tracker.set_task_run_states(changed_task_runs)
def _collect_errors(self, ti_status, session=None): err = "" if ti_status.failed: dr = get_databand_run() upstream_failed = [] failed = [] for fail_info in ti_status.failed: airflow_task_id = fail_info[1] task_run = dr.get_task_run(airflow_task_id) task_name = task_run.task.task_name if task_run.task_run_state == State.UPSTREAM_FAILED: # we don't want to show upstream failed in the list upstream_failed.append(task_name) else: failed.append(task_name) if upstream_failed: err += ( "Task that didn't run because " "of failed dependency:\n\t{}\n".format("\n\t".join(upstream_failed)) ) if failed: err += "Failed tasks are:\n\t{}".format("\n\t".join(failed)) if ti_status.deadlocked: err += ( "---------------------------------------------------\n" "DagRunJob is deadlocked." ) deadlocked_depends_on_past = any( t.are_dependencies_met( dep_context=DepContext(ignore_depends_on_past=False), session=session, verbose=self.verbose, ) != t.are_dependencies_met( dep_context=DepContext(ignore_depends_on_past=True), session=session, verbose=self.verbose, ) for t in ti_status.deadlocked ) if deadlocked_depends_on_past: err += ( "Some of the deadlocked tasks were unable to run because " 'of "depends_on_past" relationships. Try running the ' "backfill with the option " '"ignore_first_depends_on_past=True" or passing "-I" at ' "the command line." ) err += " These tasks have succeeded:\n{}\n".format(ti_status.succeeded) err += " These tasks are running:\n{}\n".format(ti_status.running) err += " These tasks have failed:\n{}\n".format(ti_status.failed) err += " These tasks are skipped:\n{}\n".format(ti_status.skipped) err += " These tasks are deadlocked:\n{}\n".format(ti_status.deadlocked) return err
def create_and_run_dynamic_task_safe(func_call, parent_task_run):
    # type: (FuncCall,TaskRun ) -> Any
    """
    Create and run a dynamic task for ``func_call``, falling back to a plain call.

    Errors raised while creating or preparing the task are reported through
    ``_handle_dynamic_error`` and the user function is invoked directly.
    Errors raised by the user code itself are re-raised.  Errors raised after
    the user code finished are reported and the already computed result is
    returned.

    :param func_call: description of the decorated function call
    :param parent_task_run: not used by this function
    :return: the call result, or the task object when it has no single result
    :raises MissingParameterError: propagated as-is from task creation
    """
    try:
        task = create_dynamic_task(func_call)  # type: Task
    except MissingParameterError:
        # We can't handle MissingParameterError, function invocation will always fail
        raise
    except Exception:
        _handle_dynamic_error("task-create", func_call)
        return func_call.invoke()

    try:
        from dbnd._core.decorator.func_task_call import TaskCallState, CALL_FAILURE_OBJ

        task._dbnd_call_state = TaskCallState(should_store_result=True)

        # this is the real run of the decorated function
        from dbnd import PipelineTask

        active_run = get_databand_run()
        if isinstance(task, PipelineTask):
            # if it's pipeline - create new databand run
            pipeline_run = active_run.context.dbnd_run_task(task)
            task_run = pipeline_run.get_task_run(task.task_id)
        else:
            task_run = active_run.run_executor.run_dynamic_task(
                task, task_engine=current_task_run().task_engine
            )
            if task._dbnd_call_state.result_saved:
                return task._dbnd_call_state.result

        executed_task = task_run.task
        # if we are inside run, we want to have real values, not deferred!
        if executed_task.task_definition.single_result_output:
            return executed_task.__class__.result.load_from_target(
                executed_task.result
            )
        # func without a single result output: hand back the task object itself
        return executed_task
    except Exception:
        call_state = task and task._dbnd_call_state
        if call_state:
            if call_state.finished:
                # function was invoked and finished, so we failed in dbnd post-exec:
                # just return the invoke result to the user
                _handle_dynamic_error("task-post-execute", func_call)
                return call_state.result
            if call_state.started:
                # started but not finished -> user code exception -> re-raise
                raise
        # not started - our own exception on pre-exec, run the user code
        _handle_dynamic_error("task-pre-execute", func_call)
        return func_call.invoke()
    finally:
        # drop the call state (and the result it holds) to avoid memory leaks
        task._dbnd_call_state = None
def run_dynamic_task_safe(task, func_call):
    # type: (Task, FuncCall) -> Union[Any]
    """
    Run an already created dynamic task, falling back to a plain function call.

    Errors raised before the user code started are logged and the user
    function is invoked directly.  Errors raised by the user code are
    re-raised.  Errors raised after the user code finished are logged and the
    already computed result is returned.

    :param task: the dynamic task to run
    :param func_call: description of the decorated function call
    :return: the call result, or the task object when it has no single result
    """
    try:
        from dbnd._core.decorator.func_task_call import TaskCallState, CALL_FAILURE_OBJ

        task._dbnd_call_state = TaskCallState(should_store_result=True)

        # this is the real run of the decorated function
        from dbnd import PipelineTask

        active_run = get_databand_run()
        if isinstance(task, PipelineTask):
            # if it's pipeline - create new databand run
            pipeline_run = active_run.context.dbnd_run_task(task)
            task_run = pipeline_run.get_task_run(task.task_id)
        else:
            task_run = active_run.run_dynamic_task(
                task, task_engine=current_task_run().task_engine
            )
            if task._dbnd_call_state.result_saved:
                return task._dbnd_call_state.result

        executed_task = task_run.task
        # if we are inside run, we want to have real values, not deferred!
        if executed_task.task_definition.single_result_output:
            return executed_task.__class__.result.load_from_target(
                executed_task.result
            )
        # func without a single result output: hand back the task object itself
        return executed_task
    except Exception:
        call_state = task and task._dbnd_call_state
        if call_state:
            if call_state.finished:
                # function was invoked and finished, so we failed in dbnd post-exec:
                # just return the invoke result to the user
                logger.warning("Error during dbnd post-exec, ignoring", exc_info=True)
                return call_state.result
            if call_state.started:
                # started but not finished -> user code exception -> re-raise
                raise
        # not started - our own exception on pre-exec, run the user code
        logger.warning("Error during dbnd task-pre-execute, ignoring", exc_info=True)
        return func_call.invoke()
    finally:
        # drop the call state (and the result it holds) to avoid memory leaks
        task._dbnd_call_state = None
def _update_databand_task_run_states(self, run):
    """
    Propagate Airflow's UPSTREAM_FAILED state into the Databand task runs.

    UPSTREAM_FAILED is the only state copied here; all other state changes are
    managed by Databand's own state machine.  Task runs whose state actually
    changes are updated in memory and then sent to the tracker in one batch.

    :param run: Airflow dag run exposing ``get_task_instances()``
    """
    dr = get_databand_run()
    task_runs = []
    for ti in run.get_task_instances():
        if ti.state != State.UPSTREAM_FAILED:
            continue

        task_run = dr.get_task_run_by_af_id(ti.task_id)  # type: TaskRun
        if not task_run:
            continue

        # Compare with the Databand state (the same type that is assigned below),
        # not with Airflow's State constant: mixing the two can make this check
        # always true and re-send unchanged task runs to the tracker on every call.
        if task_run.task_run_state != TaskRunState.UPSTREAM_FAILED:
            task_run.set_task_run_state(TaskRunState.UPSTREAM_FAILED, track=False)
            task_runs.append(task_run)

    # optimization: write all updates in one batch
    if task_runs:
        dr.tracker.set_task_run_states(task_runs)
def _update_databand_task_run_states(self, run):
    """
    Sync task states from Airflow into DBND for one dag run.

    UPSTREAM_FAILED is set on the task run and queued for a batched tracker
    update; SUCCESS and FAILED are copied onto the in-memory task run only.
    A state is written only when it differs from the task run's current one.

    :param run: Airflow dag run exposing ``get_task_instances()``
    """
    dbnd_run = get_databand_run()
    tracked_updates = []

    for ti in run.get_task_instances():
        task_run = dbnd_run.get_task_run_by_af_id(ti.task_id)  # type: TaskRun
        if not task_run:
            continue

        current_state = task_run.task_run_state
        if ti.state == State.UPSTREAM_FAILED:
            # UPSTREAM FAILED tasks are not going to "run", so no code will update their state
            if current_state != TaskRunState.UPSTREAM_FAILED:
                task_run.set_task_run_state(TaskRunState.UPSTREAM_FAILED, track=False)
                tracked_updates.append(task_run)
        elif ti.state == State.SUCCESS:
            # in-memory state only
            if current_state != TaskRunState.SUCCESS:
                task_run.set_task_run_state(TaskRunState.SUCCESS, track=False)
        elif ti.state == State.FAILED:
            # in-memory state only
            if current_state != TaskRunState.FAILED:
                task_run.set_task_run_state(TaskRunState.FAILED, track=False)

    # optimization: write all updates in one batch
    if tracked_updates:
        dbnd_run.tracker.set_task_run_states(tracked_updates)
def task_run(self):
    # type: ()-> TaskRun
    """Return the task run of ``self.task`` in the current databand run."""
    task_id = self.task.task_id
    return get_databand_run().get_task_run(task_id)
def current_task_run(self):
    # type: ()->TaskRun
    """Return the task run registered for ``self.task_id`` in the current databand run."""
    databand_run = get_databand_run()
    return databand_run.get_task_run(self.task_id)
def tracking_context(self, call_args, call_kwargs):
    """
    Generator that wraps one call of the tracked user function.

    Per the inline comments it is driven as a context manager: the user code
    runs while the generator is suspended on its single ``yield``.  The
    yielded value is the callback the caller passes the function result to:
    ``func_call.set_result`` when tracking was set up, or
    ``_do_nothing_decorator`` when tracking is skipped (call limit exceeded,
    no tracking task run available, or tracking initialization failed).

    Exceptions raised by the user code are always re-raised; exceptions raised
    by the tracking code itself are reported and swallowed.

    :param call_args: positional arguments of the user call (copied, not modified)
    :param call_kwargs: keyword arguments of the user call (copied, not modified)
    """
    user_code_called = False  # whether we got to executing of user code
    user_code_finished = False  # whether we passed executing of user code
    # Set right before the no-op callback is yielded from inside the try block.
    # From that point on the only thing that can raise is the user code, so the
    # error must propagate; swallowing it would fall through to the fallback
    # yield below, i.e. the generator would yield a second time after an
    # exception was thrown into it.
    passthrough_yielded = False
    func_call = None
    try:
        # 1. check that we don't have too many calls
        if self._call_count_limit_exceeded():
            passthrough_yielded = True
            yield _do_nothing_decorator
            return

        # 2. Start or reuse existing "main tracking task" that is root for tracked tasks
        if not try_get_current_task():
            # try to get existing task, and if not exists - try to get/create inplace_task_run
            from dbnd._core.tracking.script_tracking_manager import (
                try_get_inplace_tracking_task_run,
            )

            inplace_tacking_task = try_get_inplace_tracking_task_run()
            if not inplace_tacking_task:
                # we didn't manage to start inplace tracking task run, we will not be able to track
                passthrough_yielded = True
                yield _do_nothing_decorator
                return

        tracking_task_definition = self.get_tracking_task_definition()
        callable_spec = tracking_task_definition.task_decorator.get_callable_spec()

        func_call = TrackedFuncCallWithResult(
            callable=self.callable,
            call_args=tuple(call_args),  # prevent original call_args modification
            call_kwargs=dict(call_kwargs),  # prevent original kwargs modification
        )
        # replace any position argument with kwarg if it possible
        args, kwargs = args_to_kwargs(
            callable_spec.args, func_call.call_args, func_call.call_kwargs
        )

        # instantiate inline task
        task = TrackingTask.for_func(tracking_task_definition, args, kwargs)

        # update upstream/downstream relations - needed for correct tracking
        # we can have the task as upstream , as it was executed already
        parent_task = current_task_run().task
        if not parent_task.task_dag.has_upstream(task):
            parent_task.set_upstream(task)

        # checking if any of the inputs are the outputs of previous task.
        # we can add that task as upstream.
        dbnd_run = get_databand_run()
        call_kwargs_as_targets = dbnd_run.target_origin.get_for_map(kwargs)
        for value_origin in call_kwargs_as_targets.values():
            up_task = value_origin.origin_target.task
            task.set_upstream(up_task)

        # creating task_run as a task we found mid-run
        task_run = dbnd_run.create_task_run_at_execution_time(
            task, task_engine=current_task_run().task_engine
        )

        should_capture_log = TrackingConfig.from_databand_context().capture_tracking_log
        with task_run.runner.task_run_execution_context(
            handle_sigterm=True, capture_log=should_capture_log
        ):
            task_run.set_task_run_state(state=TaskRunState.RUNNING)

            _log_inputs(task_run)

            # if we reached this line, then all tracking initialization is
            # finished successfully, and we're going to execute user code
            user_code_called = True

            try:
                # tracking_context is context manager - user code will run on yield
                yield func_call.set_result

                # if we reached this line, this means that user code finished
                # successfully without any exceptions
                user_code_finished = True
            # We catch BaseException since we want to catch KeyboardInterrupts as well
            except BaseException as ex:
                task_run.finished_time = utcnow()

                error = TaskRunError.build_from_ex(ex, task_run)
                task_run.set_task_run_state(TaskRunState.FAILED, error=error)
                raise
            else:
                task_run.finished_time = utcnow()

                # func_call.result should contain result, log it
                _log_result(task_run, func_call.result)

                task_run.set_task_run_state(TaskRunState.SUCCESS)
    except BaseException:
        if passthrough_yielded or (user_code_called and not user_code_finished):
            # the exception comes from the user code (either run untracked behind
            # the no-op callback, or started and never finished) - just re-raise it
            raise
        # else it's either we didn't reached calling user code, or already passed it
        # then it's some dbnd tracking error - just log it
        if func_call:
            _handle_tracking_error("tracking-init", func_call)
        else:
            log_exception_to_server()

    # if we didn't reached user_code_called=True line - there was an error during
    # dbnd tracking initialization, so nothing is done - user function wasn't called yet
    if not user_code_called:
        # tracking_context is context manager - user code will run on yield
        yield _do_nothing_decorator
        return
def _run_task_from_another_task_execution(
    self, parent_task, call_args, call_kwargs
):
    # type: (TaskDecorator, Task, *Any, **Any) -> TaskRun
    """
    Run the decorated task while another task (``parent_task``) is executing.

    Call arguments are converted to keyword arguments and every value the
    current run can trace back to an origin target is wrapped into an
    ``InlineTarget``.  Pipelines are executed through
    ``dbnd_run.context.dbnd_run_task``; regular tasks are registered as
    upstream of ``parent_task`` and run through the run executor.

    :param parent_task: the task that is currently running
    :param call_args: positional arguments of the call
    :param call_kwargs: keyword arguments of the call
    :return: the stored call result when one was saved, the loaded single
        result output when the task defines one, otherwise the task object
    """
    # task is running from another task
    task_cls = self.get_task_cls()

    from dbnd import PipelineTask, pipeline
    from dbnd._core.task_build.dbnd_decorator import _default_output

    dbnd_run = get_databand_run()

    call_args, call_kwargs = args_to_kwargs(
        self.get_callable_spec().args, call_args, call_kwargs
    )

    # Map all kwargs to the "original" target of those objects,
    # for example: for a DataFrame we try to find the target it was read from
    origins_by_param = dbnd_run.target_origin.get_for_map(call_kwargs)
    for param_name, value_origin in origins_by_param.items():
        origin_target = value_origin.origin_target
        call_kwargs[param_name] = InlineTarget(
            root_target=origin_target,
            obj=call_kwargs[param_name],
            value_type=value_origin.value_type,
            source=origin_target.source,
            path=getattr(origin_target, "path", None),
        )

    call_kwargs.setdefault("task_is_dynamic", True)
    call_kwargs.setdefault(
        "task_in_memory_outputs",
        parent_task.settings.run.task_run_at_execution_time_in_memory_outputs,
    )

    if issubclass(task_cls, PipelineTask):
        # pipeline: override _task_default_result to be an object instead of a target
        task_cls = pipeline(
            self.class_or_func, _task_default_result=_default_output
        ).task_cls

        # instantiate the inline pipeline and execute it as a new databand run
        task = task_cls(*call_args, **call_kwargs)
        pipeline_run = dbnd_run.context.dbnd_run_task(task)
        task_run = pipeline_run.get_task_run(task.task_id)
    else:
        # instantiate inline task (dbnd object)
        task = task_cls(*call_args, **call_kwargs)

        # update upstream/downstream relations - needed for correct tracking;
        # the task may already be an upstream, as it could have been executed already
        if not parent_task.task_dag.has_upstream(task):
            parent_task.set_upstream(task)

        from dbnd._core.task_build.task_cls__call_state import TaskCallState

        task._dbnd_call_state = TaskCallState(should_store_result=True)
        try:
            task_run = dbnd_run.run_executor.run_task_at_execution_time(
                task, task_engine=current_task_run().task_engine
            )

            # this will work only for _DecoratedTask
            call_state = task._dbnd_call_state
            if call_state.result_saved:
                return call_state.result
        finally:
            # drop the call state (and the result it holds) to avoid memory leaks
            task._dbnd_call_state = None

    # if we are inside run, we want to have real values, not deferred!
    if task.task_definition.single_result_output:
        return task.__class__.result.load_from_target(task.result)

    # func without a single result output: hand back the task object itself
    return task
def tracking_context(self, call_args, call_kwargs):
    """
    Generator that wraps one call of the tracked user function.

    Per the inline comments it is driven as a context manager: the user code
    runs while the generator is suspended on its single ``yield``.  It yields
    ``func_call.set_result`` when tracking was set up, and
    ``_passthrough_decorator`` when tracking is skipped or its initialization
    failed.  User code exceptions are re-raised; tracking errors are reported
    through ``_handle_dynamic_error`` and swallowed.

    :param call_args: positional arguments of the user call (copied, not modified)
    :param call_kwargs: keyword arguments of the user call (copied, not modified)
    """
    user_code_called = False  # did we get to executing the user code
    user_code_finished = False  # did the user code return without raising
    func_call = None
    try:
        func_call = FuncCallWithResult(
            task_cls=self.get_tracking_task_cls(),
            call_user_code=self.func,
            call_args=tuple(call_args),  # prevent original call_args modification
            call_kwargs=dict(call_kwargs),  # prevent original kwargs modification
        )

        # 1. check that we don't have too many calls
        # 2. Start or reuse existing "inplace_task" that is root for tracked tasks
        if not self._call_count_limit_exceeded() and _get_or_create_inplace_task():
            tracking_cls = func_call.task_cls

            # replace any positional argument with a kwarg where possible
            task_args, task_kwargs = args_to_kwargs(
                tracking_cls._conf__decorator_spec.args,
                func_call.call_args,
                func_call.call_kwargs,
            )

            # instantiate inline task
            task = tracking_cls._create_task(task_args, task_kwargs)

            # update upstream/downstream relations - needed for correct tracking;
            # the task may already be an upstream, as it could have been executed already
            parent_task = current_task_run().task
            if not parent_task.task_dag.has_upstream(task):
                parent_task.set_upstream(task)

            # inputs that are outputs of a previous task make that task an upstream
            dbnd_run = get_databand_run()
            input_origins = dbnd_run.target_origin.get_for_map(task_kwargs)
            for value_origin in input_origins.values():
                task.set_upstream(value_origin.origin_target.task)

            # creating task_run as a task we found mid-run
            task_run = dbnd_run.create_dynamic_task_run(
                task, task_engine=current_task_run().task_engine
            )

            with task_run.runner.task_run_execution_context(handle_sigterm=True):
                task_run.set_task_run_state(state=TaskRunState.RUNNING)

                _log_inputs(task_run)

                # tracking initialization is done, the user code is next
                user_code_called = True

                try:
                    # the user code runs while we are suspended on this yield
                    yield func_call.set_result

                    # resumed normally: the user code finished without exceptions
                    user_code_finished = True
                except Exception as ex:
                    task_run.finished_time = utcnow()

                    run_error = TaskRunError.build_from_ex(ex, task_run)
                    task_run.set_task_run_state(TaskRunState.FAILED, error=run_error)
                    raise
                else:
                    task_run.finished_time = utcnow()

                    # func_call.result should contain result, log it
                    _log_result(task_run, func_call.result)

                    task_run.set_task_run_state(TaskRunState.SUCCESS)
    except Exception:
        if user_code_called and not user_code_finished:
            # user code was started and never finished: it is a user code
            # exception - just re-raise it
            raise
        # otherwise we either never reached the user code or already passed it,
        # so it is a dbnd tracking error - just report it
        if func_call:
            _handle_dynamic_error("tracking-init", func_call)

    # tracking was skipped or its initialization failed: the user function
    # wasn't called yet, let it run untracked on this yield
    if not user_code_called:
        yield _passthrough_decorator
def _create_dynamic_task_run(func_call):
    """
    Create the dynamic task for ``func_call`` and a task run for it.

    The task run is created in the current databand run, with the task engine
    of the currently running task.

    :param func_call: description of the decorated function call
    :return: the newly created task run
    """
    dynamic_task = create_dynamic_task(func_call)
    return get_databand_run().create_dynamic_task_run(
        dynamic_task, task_engine=current_task_run().task_engine
    )
def _dbnd_operator_to_taskrun(operator):
    # type: (DbndOperator)-> TaskRun
    """Return the task run in the current databand run that matches ``operator.dbnd_task_id``."""
    from dbnd._core.current import get_databand_run

    databand_run = get_databand_run()
    return databand_run.get_task_run_by_id(operator.dbnd_task_id)