def from_env(cls, current_run):
    # type: (DatabandRun) -> RootRunInfo
    """
    Build the root-run info that belongs to ``current_run``.

    Resolution order:

    1. If a databand run is already active in this process, its
       ``root_run_info`` is reused; when a task run is currently active,
       ``root_task_run_uid`` is replaced by the uid of that task run.
    2. Otherwise the values are read from the ``DBND_ROOT_RUN_UID``,
       ``DBND_ROOT_RUN_TRACKER_URL`` and ``DBND_PARENT_TASK_RUN_UID``
       environment keys.
    3. If the environment holds no root run uid, ``current_run`` itself is
       the root: its ``run_uid`` and ``tracker.run_url`` are used.

    :param current_run: the run for which the info is being built
    :return: an instance of ``cls``
    """
    enclosing_run = try_get_databand_run()
    if enclosing_run:
        # take from parent
        inherited_info = enclosing_run.root_run_info
        active_task_run = try_get_current_task_run()
        if not active_task_run:
            return inherited_info
        # update parent run info: point it at the currently active task run
        return attr.evolve(
            inherited_info, root_task_run_uid=active_task_run.task_run_uid
        )

    # take from env
    env = os.environ
    run_uid = env.get(DBND_ROOT_RUN_UID)
    run_url = env.get(DBND_ROOT_RUN_TRACKER_URL)
    parent_task_run_uid = env.get(DBND_PARENT_TASK_RUN_UID)

    if not run_uid:
        # current run is the main run
        run_uid, run_url = current_run.run_uid, current_run.tracker.run_url

    return cls(
        root_run_uid=run_uid,
        root_run_url=run_url,
        root_task_run_uid=parent_task_run_uid,
    )
def _update_node_name(self, pod_name, pod_data):
    """
    Report the node a pod is running on to the metrics logger, once per pod.

    Nothing is reported (and the pod is not marked as processed) while
    ``pod_data.spec.node_name`` is still empty. Once a node name is present,
    the pod is marked in ``self.processed_pods`` regardless of whether the
    report succeeded, so it is never attempted again.

    :param pod_name: name of the pod; key in ``self.processed_pods``
    :param pod_data: pod object; ``spec.node_name`` and
        ``metadata.labels`` are read from it
    """
    if self.processed_pods.get(pod_name):
        # the placeholder needs its argument, otherwise a literal "%s" is logged
        self.log.debug(
            "Pod %s has already been logged to metrics - skipping", pod_name
        )
        return

    node_name = pod_data.spec.node_name
    if not node_name:
        # Some events are missing the node name, but it will get there for sure
        return

    try:
        task_id = pod_data.metadata.labels.get("task_id")
        if not task_id:
            return

        dr = try_get_databand_run()
        if not dr:
            return

        task_run = dr.get_task_run(task_id)
        if not task_run:
            return

        self.metrics_logger.log_pod_information(task_run.task, pod_name, node_name)
    except Exception as ex:
        # best effort: metrics must not break pod handling, but keep the reason
        logger.info("Failed to gather node name for %s: %s", pod_name, ex)
    finally:
        # executed on every exit from the try block, including the early returns
        self.processed_pods[pod_name] = True
def stop():
    """
    Report that we are stopping, print the current task and kill the active
    databand run when there is one.
    """
    msg("stopping!")
    msg("Current tasks looks like: %s" % try_get_current_task())

    active_run = try_get_databand_run()
    if not active_run:
        return
    active_run.kill()
def dbnd_operator__kill(dbnd_operator):
    """
    Forward a kill request to the dbnd task behind ``dbnd_operator``.

    :param dbnd_operator: operator whose ``dbnd_task_id`` identifies the task run
    :return: result of the task's ``on_kill()``, or None when there is no
        active databand run
    """
    from dbnd._core.current import try_get_databand_run

    active_run = try_get_databand_run()
    if active_run:
        operator_task_run = active_run.get_task_run_by_id(dbnd_operator.dbnd_task_id)
        return operator_task_run.task.on_kill()
    return None
def _log_parameter_value(self, runtime_value, value, task):
    """
    Send a successful "read" of this parameter to the tracker of the task's
    current task run.

    Nothing is logged when there is no active databand run or the task has
    no current task run.

    :param runtime_value: passed to the tracker as ``value``
    :param value: passed to the tracker as ``target``
    :param task: task whose ``current_task_run`` tracker receives the record
    """
    if not try_get_databand_run():
        return
    if not task.current_task_run:
        return

    task.current_task_run.tracker.log_parameter_data(
        parameter=self,
        target=value,
        value=runtime_value,
        operation_type=DbndTargetOperationType.read,
        operation_status=DbndTargetOperationStatus.OK,
    )
def context_to_airflow_vars(context, in_env_var_format=False):
    """
    Wrap airflow's original ``context_to_airflow_vars``.

    When the env-var format is requested and a databand run is active, the
    run's ``get_context_spawn_env()`` entries are merged into the result.
    """
    # original_context_to_airflow_vars is created during function override in patch_models()
    airflow_vars = airflow.utils.operator_helpers.original_context_to_airflow_vars(
        context=context, in_env_var_format=in_env_var_format
    )
    if not in_env_var_format:
        return airflow_vars

    active_run = try_get_databand_run()
    if active_run:
        airflow_vars.update(active_run.get_context_spawn_env())
    return airflow_vars
def get_task_run_from_pod_data(pod_data):
    """
    Find the task run that a pod belongs to, using the pod's ``task_id`` label.

    :param pod_data: pod object; only ``metadata.labels`` is read
    :return: result of ``get_task_run_by_af_id(task_id)`` on the active
        databand run, or None when the pod has no ``task_id`` label (or no
        labels at all) or there is no active databand run
    """
    # labels can be None for a pod without any label; treat it as "no labels"
    # instead of failing on the membership test
    labels = pod_data.metadata.labels or {}
    if "task_id" not in labels:
        return None
    task_id = labels["task_id"]

    dr = try_get_databand_run()
    if not dr:
        return None

    return dr.get_task_run_by_af_id(task_id)
def _resolve_cache_file_name(file_path):
    """
    Return the path of the cache file for ``file_path``.

    The file lives in the "cache" folder under the current dbnd local root of
    the active run; its name is the base name of ``file_path`` followed by
    ``DbndLocalFileMetadataRegistry.ext``.

    :raises Exception: when there is no active databand run
    """
    active_run = try_get_databand_run()
    if not active_run:
        raise Exception("No databand run found to when creating cache file")

    local_root = active_run.get_current_dbnd_local_root()
    cache_folder = get_or_create_folder_in_dir("cache", local_root.path)
    cache_file_name = os.path.basename(file_path) + DbndLocalFileMetadataRegistry.ext
    return os.path.join(cache_folder, cache_file_name)
def fake_task_inside_dag():
    """
    Test payload: log a metric and verify the databand run it executes in.

    Asserts that a run exists, that its job name is
    ``<PARENT_DAG>.<CHILD_DAG>.fake_task_inside_dag`` and that the root task
    is named ``fake_task_inside_dag__execute``.
    """
    log_metric("Testing", "Metric")

    tracked_run = try_get_databand_run()
    assert (
        tracked_run is not None
    ), "Task should run in databand run, check airflow tracking!"
    run_root_task = tracked_run.root_task

    # Validate regular subdag properties
    expected_job_name = "%s.%s.fake_task_inside_dag" % (PARENT_DAG, CHILD_DAG)
    assert tracked_run.job_name == expected_job_name
    assert run_root_task.task_name == "fake_task_inside_dag__execute"

    return "Regular test"
def context_to_airflow_vars(context, in_env_var_format=False):
    """
    Wrap airflow's original ``context_to_airflow_vars``.

    When the env-var format is requested and a databand run is active, the
    run's ``get_context_spawn_env()`` entries are merged in. The try number
    of ``context['task_instance']`` is always added under
    ``AIRFLOW_CTX_TRY_NUMBER``.
    """
    # _original_context_to_airflow_vars is created during function override in patch_models()
    airflow_vars = airflow.utils.operator_helpers._original_context_to_airflow_vars(
        context=context, in_env_var_format=in_env_var_format
    )

    if in_env_var_format:
        active_run = try_get_databand_run()
        if active_run:
            airflow_vars.update(active_run.get_context_spawn_env())

    airflow_vars.update(
        {"AIRFLOW_CTX_TRY_NUMBER": str(context['task_instance'].try_number)}
    )
    return airflow_vars
def fake_task_inside_dag():
    """
    Test payload: log a metric and verify the databand run it executes in.

    Asserts that a run exists, that its job name is
    ``<PARENT_DAG>.<CHILD_DAG>`` and that the root task has a non-empty name.
    """
    log_metric("Testing", "Metric")

    tracked_run = try_get_databand_run()
    assert (
        tracked_run is not None
    ), "Task should run in databand run, check airflow tracking!"
    run_root_task = tracked_run.root_task

    # Validate regular subdag properties
    expected_job_name = "%s.%s" % (PARENT_DAG, CHILD_DAG)
    assert tracked_run.job_name == expected_job_name

    # the exact name is not checked: the airflow_inplace task is named after
    # the script that ran it, which made the strict comparison problematic
    assert run_root_task.task_name

    return "Regular test"
def dbnd_operator__execute(dbnd_operator, context):
    """
    Execute the dbnd task behind ``dbnd_operator``.

    Inside an active databand run the task run is looked up and executed
    directly. Otherwise the run is loaded from the driver dump named in the
    operator's ``executor_config["DatabandExecutor"]`` and the task is
    executed inside that run's ``run_context()``.

    :param dbnd_operator: operator carrying ``dbnd_task_id`` and ``executor_config``
    :param context: airflow context, passed to the runner as ``airflow_context``
    :return: whatever the task run's runner returns
    :raises Exception: any failure while loading the run is reported on the
        original stderr and re-raised
    """
    from dbnd._core.current import try_get_databand_run
    from dbnd._core.run.databand_run import DatabandRun
    from targets import target

    run = try_get_databand_run()
    if run:
        task_run = run.get_task_run_by_id(dbnd_operator.dbnd_task_id)
        return task_run.runner.execute(airflow_context=context)

    # we are not inside dbnd run, probably we are running from native airflow
    # let's try to load it:
    try:
        executor_config = dbnd_operator.executor_config
        logger.info("context: %s", context)
        logger.info("task.executor_config: %s", dbnd_operator.executor_config)
        logger.info("ti.executor_config: %s", context["ti"].executor_config)

        driver_dump = executor_config["DatabandExecutor"].get("dbnd_driver_dump")
        print(
            "Running dbnd task %s from %s" % (dbnd_operator.dbnd_task_id, driver_dump),
            file=sys.__stderr__,
        )

        undo_std_redirect = executor_config["DatabandExecutor"].get(
            "remove_airflow_std_redirect", False
        )
        if undo_std_redirect:
            # go back to the interpreter's original streams
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__

        dbnd_bootstrap()
        dbnd_airflow_bootstrap()
        run = DatabandRun.load_run(
            dump_file=target(driver_dump), disable_tracking_api=False
        )
    except Exception as e:
        print(
            "Failed to load dbnd task in native airflow execution! Exception: %s"
            % (e,),
            file=sys.__stderr__,
        )
        dump_trace()
        raise

    with run.run_context():
        task_run = run.get_task_run_by_id(dbnd_operator.dbnd_task_id)
        ret_value = task_run.runner.execute(airflow_context=context)
    return ret_value
def start(self, root_task_name, job_name=None):
    """
    Start an in-place run and return its root task run.

    Does nothing (returns None) when this manager already has a run, is
    active, or another databand run exists.

    :param root_task_name: name for the inline root task; used only when
        there is no airflow context
    :param job_name: job name of the run; replaced by ``<dag_id>.<task_id>``
        when an airflow context is found
    :return: the root task run of the new run, or None
    """
    if self._run or self._active:
        return
    if try_get_databand_run():
        return

    airflow_context = try_get_airflow_context()
    set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

    # 1. create proper DatabandContext so we can create other objects
    dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

    if not airflow_context:
        root_task_or_task_name = _build_inline_root_task(root_task_name)
        source = UpdateSource.dbnd
    else:
        root_task_or_task_name = AirflowOperatorRuntimeTask.build_from_airflow_context(
            airflow_context
        )
        source = UpdateSource.airflow_tracking
        job_name = "{}.{}".format(airflow_context.dag_id, airflow_context.task_id)

    # 2. create databand run (with driver and root tasks) and keep it entered
    new_run_cm = new_databand_run(
        context=dc,
        task_or_task_name=root_task_or_task_name,
        job_name=job_name,
        existing_run=False,
        source=source,
        af_context=airflow_context,
    )
    self._run = self._enter_cm(new_run_cm)  # type: DatabandRun

    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True

    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    self._run._init_without_run()
    self._start_taskrun(self._run.driver_task_run)
    self._start_taskrun(self._run.root_task_run)
    self._task_run = self._run.root_task_run
    return self._task_run
def start(self, root_task_name=None, airflow_context=None):
    """
    Start an in-place tracking run and return its root task run.

    Does nothing (returns None) when this manager already has a run, is
    active, or another databand run exists.

    :param root_task_name: name for the inline root task; used only when
        there is no airflow context
    :param airflow_context: airflow context; when empty it is taken from
        ``get_dbnd_project_config().airflow_context()``
    :return: the root task run of the new run, or None
    """
    if self._run or self._active:
        return
    if try_get_databand_run():
        return

    # we probably should use only airflow context via parameter.
    # also, there are mocks that cover only get_dbnd_project_config().airflow_context
    if not airflow_context:
        airflow_context = get_dbnd_project_config().airflow_context()
    set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

    dc = self._enter_cm(
        new_dbnd_context(name="inplace_tracking")
    )  # type: DatabandContext

    if not airflow_context:
        root_task = _build_inline_root_task(root_task_name)
        job_name = root_task.task_name
        source = UpdateSource.dbnd
    else:
        root_task, job_name, source = build_run_time_airflow_task(airflow_context)

    new_run_cm = new_databand_run(
        context=dc,
        job_name=job_name,
        existing_run=False,
        source=source,
        af_context=airflow_context,
    )
    self._run = run = self._enter_cm(new_run_cm)  # type: DatabandRun
    self._run.root_task = root_task

    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True

    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    built_task_run = run._build_and_add_task_run(root_task)
    built_task_run.is_root = True

    # No need to track the state because we track in init_run
    run.root_task_run.set_task_run_state(TaskRunState.RUNNING, track=False)
    run.tracker.init_run()

    # keep the execution context of the root task run open until stop
    self._enter_cm(run.root_task_run.runner.task_run_execution_context())
    self._task_run = run.root_task_run
    return self._task_run
def get_local_tempfile(*path):
    """
    Build a path for a temporary file inside a fresh, randomly named folder.

    The folder ``databand-tmp-<random number>`` is placed under the "tmp"
    partition of the current dbnd local root when a databand run is active,
    and under the system temp directory otherwise. The parent directory of
    the returned path is created on a best-effort basis: a failure is
    logged and the path is returned anyway.

    :param path: path components appended after the random folder
    :return: the full path (the file itself is not created)
    """
    run = try_get_databand_run()
    if run:
        tempdir = run.get_current_dbnd_local_root().partition("tmp").path
    else:
        tempdir = tempfile.gettempdir()

    # randrange() needs integer bounds: a float bound such as 1e10 is
    # deprecated since Python 3.10 and rejected with TypeError on 3.12+
    random_folder = "databand-tmp-%09d" % random.randrange(0, 10 ** 10)
    path = os.path.join(tempdir, random_folder, *path)

    base_dir = os.path.dirname(path)
    try:
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
    except Exception as ex:
        logger.info("Failed to create temp dir %s: %s", base_dir, ex)
    return path
def run_next(self, next_job):
    """
    Submit one job as a Kubernetes pod.

    The task run is resolved from the current databand run by the airflow
    task id of the job key, a pod is built by the task run's engine and
    started detached. The submission is recorded in ``self.submitted_pods``
    and reported to the metrics logger.

    :param next_job: tuple ``(key, command, kube_executor_config)`` where
        ``key`` is ``(dag_id, task_id, execution_date, try_number)``
    """
    key, command, kube_executor_config = next_job
    dag_id, task_id, execution_date, try_number = key
    self.log.debug(
        "Kube POD to submit: image=%s with %s",
        self.kube_config.kube_image,
        str(next_job),
    )

    databand_run = try_get_databand_run()
    task_run = databand_run.get_task_run_by_af_id(task_id)
    pod_command = list(map(str, command))
    task_engine = task_run.task_engine  # type: KubernetesEngineConfig

    pod_labels = {
        "airflow-worker": self.worker_uuid,
        "dag_id": make_safe_label_value(dag_id),
        "task_id": make_safe_label_value(task_run.task_af_id),
        "execution_date": self._datetime_to_label_safe_datestring(execution_date),
        "try_number": str(try_number),
    }
    pod = task_engine.build_pod(
        task_run=task_run,
        cmds=pod_command,
        labels=pod_labels,
        try_number=try_number,
        include_system_secrets=True,
    )

    pod_ctrl = self.kube_dbnd.get_pod_ctrl_for_pod(pod)
    # remember the pod before it is started
    self.submitted_pods[pod.name] = SubmittedPodState(
        pod_name=pod.name,
        task_run=task_run,
        scheduler_key=key,
        submitted_at=utcnow(),
    )

    pod_ctrl.run_pod(pod=pod, task_run=task_run, detach_run=True)
    self.metrics_logger.log_pod_submitted(task_run.task, pod_name=pod.name)
def run_next_kube_job(self, key, command):
    """
    Submit one job as a Kubernetes pod.

    The task run is resolved from the current databand run by the airflow
    task id of ``key``, a pod is built by the task run's engine and started
    detached. The submission is recorded in ``self.submitted_pods`` and
    reported to the metrics logger.

    :param key: ``(dag_id, task_id, execution_date, try_number)``
    :param command: command parts for the pod; each part is converted with ``str``
    """
    dag_id, task_id, execution_date, try_number = key
    self.log.debug(
        "Kube POD to submit: image=%s with %s [%s]",
        self.kube_config.kube_image,
        str(key),
        str(command),
    )

    dr = try_get_databand_run()
    task_run = dr.get_task_run_by_af_id(task_id)
    pod_command = list(map(str, command))
    task_engine = task_run.task_engine  # type: KubernetesEngineConfig

    pod_labels = {
        "airflow-worker": self._version_independent_worker_id(),
        "dag_id": make_safe_label_value(dag_id),
        "task_id": make_safe_label_value(task_run.task_af_id),
        "execution_date": self._datetime_to_label_safe_datestring(execution_date),
        "try_number": str(try_number),
    }
    pod: "k8s.V1Pod" = task_engine.build_pod(
        task_run=task_run,
        cmds=pod_command,
        labels=pod_labels,
        try_number=try_number,
        include_system_secrets=True,
    )

    pod_ctrl = self.kube_dbnd.get_pod_ctrl(
        pod.metadata.name, pod.metadata.namespace, config=task_engine
    )
    # remember the pod before it is started
    self.submitted_pods[pod.metadata.name] = SubmittedPodState(
        pod_name=pod.metadata.name,
        task_run=task_run,
        scheduler_key=key,
        submitted_at=utcnow(),
    )

    pod_ctrl.run_pod(pod=pod, task_run=task_run, detach_run=True)
    self.metrics_logger.log_pod_submitted(task_run.task, pod_name=pod.metadata.name)
def build_file_logger(name, fmt=None):
    """
    Create a logger which writes only to a file.

    The logger does not propagate to its parents. When a databand run is
    active, a file handler for the ``<name>.logs`` partition of the run's
    local root is attached; without a run no handler is added.

    :param name: suffix of the logger name and base name of the log file
    :param fmt: format passed to ``create_file_handler``
    :return: the logger
    """
    file_logger = logging.getLogger("{}_{}".format(__name__, name))
    file_logger.propagate = False

    active_run = try_get_databand_run()
    if not active_run:
        return file_logger

    log_file = active_run.run_local_root.partition("{}.logs".format(name))
    logger.info(
        "Api-clients {name} logs writing into {path}".format(name=name, path=log_file)
    )
    file_logger.addHandler(create_file_handler(str(log_file), fmt=fmt))
    file_logger.setLevel(logging.INFO)
    return file_logger
def __getattribute__(self, name):
    """
    Attribute access that resolves parameter values to their runtime value.

    The plain attribute value is returned unchanged when the task has no
    ``_task_auto_read`` set (missing or None), when ``name`` is already in
    that set, or when ``name`` is not a parameter. Otherwise the parameter's
    ``calc_runtime_value`` result is returned: for output parameters without
    caching, for all others after storing it on the instance and adding
    ``name`` to ``_task_auto_read``.
    """

    def _get(n):
        # regular lookup that bypasses this override (avoids infinite recursion)
        return super(_BaseTask, self).__getattribute__(n)

    value = _get(name)
    try:
        _task_auto_read = _get("_task_auto_read")
    except Exception:
        # the attribute can not be read: behave like a plain attribute access
        return value

    # already cached
    if _task_auto_read is None or name in _task_auto_read:
        return value

    parameter = _get("_params").get_param(name)

    # we are not parameter
    # or there is nothing to "dereference"
    # TODO: rebase  : value is None
    if not parameter:
        return value

    runtime_value = parameter.calc_runtime_value(value, task=self)

    if parameter.is_output():
        # if it's an output, we should not "cache" it
        # otherwise we will try to save it on autosave (as it was changed)
        return runtime_value
    elif isinstance(value, Target):
        # report the read of the target to the tracker of the current task run,
        # only when a databand run and a current task run exist
        if try_get_databand_run():
            task_run = self.current_task_run
            if task_run:
                task_run.tracker.log_target(
                    parameter=parameter,
                    target=value,
                    value=runtime_value,
                    operation_type=DbndTargetOperationType.read,
                    operation_status=DbndTargetOperationStatus.OK,
                )
    # for the cache, so next time we don't need to calculate it
    setattr(self, name, runtime_value)
    _task_auto_read.add(name)
    return runtime_value
def start(self, root_task_name, job_name=None):
    """
    Start an inline run and return its root task run.

    Returns None without doing anything when this manager already has a run,
    was already started or is disabled, or when another databand run exists.
    Any failure is passed to ``_handle_inline_error`` and disables the
    manager instead of raising.

    :param root_task_name: name for the inline root task
    :param job_name: job name passed to the new run
    :return: the root task run of the new run, or None
    """
    if self._run:
        return
    if self._started or self._disabled:  # started or failed
        return

    try:
        if try_get_databand_run():
            return

        self._started = True

        # 1. create proper DatabandContext so we can create other objects
        set_tracking_config_overide(use_dbnd_log=True)
        # create databand context
        dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

        root_task = _build_inline_root_task(root_task_name)
        # create databand run
        self._run = self._enter_cm(
            new_databand_run(
                context=dc,
                task_or_task_name=root_task,
                existing_run=False,
                job_name=job_name,
            )
        )  # type: DatabandRun

        self._run._init_without_run()

        if not self._atexit_registered:
            atexit.register(self.stop)
            # remember the registration: atexit calls a function once per
            # registration, so without the flag `stop` could be queued again
            self._atexit_registered = True
        sys.excepthook = self.stop_on_exception

        self._start_taskrun(self._run.driver_task_run)
        self._start_taskrun(self._run.root_task_run)
        self._task_run = self._run.root_task_run
        return self._task_run
    except Exception:
        _handle_inline_error("inline-start")
        self._disabled = True
        return
    finally:
        # whatever happened inside the try block, never attempt to start again
        self._started = True
def print_driver_events():
    """
    Log the Kubernetes events of the driver pod.

    The pod name is taken from the host name of this machine and the events
    are listed in the namespace of the run's remote engine. Any failure is
    logged and swallowed.
    """
    try:
        remote_engine = try_get_databand_run().run_executor.remote_engine
        kube_client = remote_engine.get_kube_client()

        from socket import gethostname

        pod_name = gethostname()
        logger.info("Driver pod name is %s" % (pod_name,))

        selector = "involvedObject.name=%s" % pod_name
        logger.info("Field selector is %s" % (selector,))

        events = kube_client.list_namespaced_event(
            namespace=remote_engine.namespace, field_selector=selector
        )
        logger.info("Found %s driver events" % (len(events.items),))

        for driver_event in events.items:
            logger.info(create_log_message_from_event(driver_event))
    except Exception as e:
        logger.info("Could not retrieve driver events! Exception: %s", e)
def start(self, root_task_name=None, airflow_context=None):
    """
    Start an in-place run and return its root task run.

    Does nothing (returns None) when this manager already has a run, is
    active, or another databand run exists.

    :param root_task_name: name for the inline root task; used only when
        there is no airflow context
    :param airflow_context: airflow context; when empty,
        ``try_get_airflow_context()`` is consulted
    :return: the root task run of the new run, or None
    """
    if self._run or self._active:
        return
    if try_get_databand_run():
        return

    if not airflow_context:
        airflow_context = try_get_airflow_context()
    set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

    dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

    if not airflow_context:
        root_task = _build_inline_root_task(root_task_name)
        job_name = None
        source = UpdateSource.dbnd
    else:
        root_task, job_name, source = build_run_time_airflow_task(airflow_context)

    new_run_cm = new_databand_run(
        context=dc,
        task_or_task_name=root_task,
        job_name=job_name,
        existing_run=False,
        source=source,
        af_context=airflow_context,
        send_heartbeat=False,
    )
    self._run = self._enter_cm(new_run_cm)  # type: DatabandRun

    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True

    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    self._run._init_without_run()
    self._start_taskrun(self._run.driver_task_run)
    self._start_taskrun(self._run.root_task_run)
    self._task_run = self._run.root_task_run
    return self._task_run
def dbnd_operator__get_task_retry_delay(dbnd_operator):
    """
    Return the retry delay to use for the dbnd task behind ``dbnd_operator``.

    This replaces the retry delay airflow would use, so the delay can be
    chosen per task: when the task run's engine is of the "kubernetes" task
    family the engine's ``pod_retry_delay`` is used, otherwise the task's
    own ``task_retry_delay``.

    :return: the retry delay, or None when there is no active databand run
    """
    from dbnd._core.current import try_get_databand_run

    active_run = try_get_databand_run()
    if not active_run:
        return None

    task_run = active_run.get_task_run_by_id(dbnd_operator.dbnd_task_id)
    runs_on_kubernetes = (
        task_run.task_engine.task_definition.task_family == "kubernetes"
    )
    if not runs_on_kubernetes:
        return task_run.task.task_retry_delay
    # If we are running in K8s - use pod retry delay instead of task retry delay
    return task_run.task_engine.pod_retry_delay
def fake_task_inside_dag():
    """
    Test payload: log a metric and verify the databand run it executes in.

    Asserts that a run exists with job name ``PARENT_DAG`` and a root task
    named ``DAG__runtime``, that the first upstream of the root task is this
    sub dag task (dag id ``FULL_DAG_NAME``), and that the first upstream of
    that task is the task of this function.
    """
    log_metric("Testing", "Metric")

    tracked_run = try_get_databand_run()
    assert (
        tracked_run is not None
    ), "Task should run in databand run, check airflow tracking!"
    run_root_task = tracked_run.root_task

    # Validate regular subdag properties
    assert tracked_run.job_name == PARENT_DAG
    assert run_root_task.task_name == "DAG__runtime"

    # Validate relationships
    # sub dag
    sub_dag_task = list(run_root_task.task_dag.upstream)[0]
    assert "fake_task_inside_dag" in sub_dag_task.task_name
    assert sub_dag_task.dag_id == FULL_DAG_NAME

    # function task
    function_task = list(sub_dag_task.task_dag.upstream)[0]
    assert fake_task_inside_dag.__name__ in function_task.task_name

    return "Regular test"
def start(self):
    """
    Start the Kubernetes executor.

    Chooses the worker uuid (run uid on Airflow 2, the stored kube worker
    uuid on Airflow 1, a random uuid when there is no databand run), creates
    the task/result queues and the ``DbndKubernetesScheduler``, and clears
    queued-but-not-launched tasks and the result queue.
    """
    self.log.info("Starting Kubernetes executor... PID: %s", os.getpid())

    dbnd_run = try_get_databand_run()
    if not dbnd_run:
        self.worker_uuid = str(uuid.uuid4())
    elif AIRFLOW_VERSION_2:
        self.worker_uuid = str(dbnd_run.run_uid)
    else:
        self.worker_uuid = (
            KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid()
        )
    self.log.debug("Start with worker_uuid: %s", self.worker_uuid)

    # always need to reset resource version since we don't know
    # when we last started, note for behavior below
    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs
    # /CoreV1Api.md#list_namespaced_pod
    # KubeResourceVersion.reset_resource_version()

    self.task_queue = self._manager.Queue()
    self.result_queue = self._manager.Queue()
    self.kube_client = self.kube_dbnd.kube_client
    self.kube_scheduler = DbndKubernetesScheduler(
        self.kube_config,
        self.task_queue,
        self.result_queue,
        self.kube_client,
        self.worker_uuid,
        kube_dbnd=self.kube_dbnd,
    )

    if self.kube_dbnd.engine_config.debug:
        for noisy_logger in (self.log, self.kube_scheduler.log):
            noisy_logger.setLevel(logging.DEBUG)

    if AIRFLOW_VERSION_1:
        self._inject_secrets()
    self.clear_not_launched_queued_tasks()
    self._flush_result_queue()
def get_local_tempfile(*path):
    """
    Build a path for a temporary file inside a fresh, randomly named folder.

    The folder ``databand-tmp-<random number>`` is placed under the "tmp"
    partition of the current dbnd local root when a databand run is active
    and that root exists, and under the system temp directory otherwise.
    The parent directory of the returned path is created on a best-effort
    basis: a failure is logged and the path is returned anyway.

    :param path: path components appended after the random folder
    :return: the full path (the file itself is not created)
    """
    run = try_get_databand_run()
    if run:
        dbnd_local_root = run.get_current_dbnd_local_root()
        if dbnd_local_root.exists():
            tempdir = dbnd_local_root.partition("tmp").path
        else:
            # on remote engine the temp defined at driver can be unavailable;
            # simple workaround: fall back to the tmp folder of this machine
            tempdir = tempfile.gettempdir()
    else:
        tempdir = tempfile.gettempdir()

    # randrange() needs integer bounds: a float bound such as 1e10 is
    # deprecated since Python 3.10 and rejected with TypeError on 3.12+
    random_folder = "databand-tmp-%09d" % random.randrange(0, 10 ** 10)
    path = os.path.join(tempdir, random_folder, *path)

    base_dir = os.path.dirname(path)
    try:
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
    except Exception as ex:
        logger.info("Failed to create temp dir %s: %s", base_dir, ex)
    return path
def stop(self, at_exit=True, update_run_state=True):
    """
    Finish the current run and close all context managers of this manager.

    :param at_exit: when true and airflow is enabled, airflow's
        ``dispose_orm()`` is called at the end
    :param update_run_state: when true and a databand run exists, the root
        task run gets its finished time and the final states are set: if
        any task run is FAILED the root task run becomes UPSTREAM_FAILED and
        the run FAILED, otherwise both become SUCCESS; then the run banner
        is logged
    """
    databand_run = try_get_databand_run() if update_run_state else None
    if databand_run:
        root_tr = databand_run.task.current_task_run
        root_tr.finished_time = utcnow()

        has_failed_task_run = any(
            tr.task_run_state == TaskRunState.FAILED
            for tr in databand_run.task_runs
        )
        if has_failed_task_run:
            root_tr.set_task_run_state(TaskRunState.UPSTREAM_FAILED)
            databand_run.set_run_state(RunState.FAILED)
        else:
            root_tr.set_task_run_state(TaskRunState.SUCCESS)
            databand_run.set_run_state(RunState.SUCCESS)

        logger.info(databand_run.describe.run_banner_for_finished())

    self._close_all_context_managers()

    if at_exit and is_airflow_enabled():
        from airflow.settings import dispose_orm

        dispose_orm()
def _store_value_origin_target(self, value, target):
    """
    Record ``target`` as the origin of ``value`` in the active databand run.

    The pair is added to the run's ``target_origin`` together with
    ``self.value_type``; nothing happens when there is no active run.
    """
    active_run = try_get_databand_run()
    if active_run:
        active_run.target_origin.add(target, value, self.value_type)
def start(self, root_task_name=None, project_name=None, airflow_context=None):
    """
    Start an in-place tracking run and return its root task run.

    Does nothing (returns None) when this manager already has a run, is
    active, or another databand run exists.

    :param root_task_name: name of the root task; defaults to the file name
        of the running script (last component of ``sys.argv[0]``)
    :param project_name: passed through to ``new_databand_run``
    :param airflow_context: airflow context; when empty it is taken from
        ``get_dbnd_project_config().airflow_context()``
    :return: the root task run of the new run, or None
    """
    if self._run or self._active or try_get_databand_run():
        return

    # we probably should use only airflow context via parameter.
    # also, there are mocks that cover only get_dbnd_project_config().airflow_context
    airflow_context = airflow_context or get_dbnd_project_config().airflow_context()
    if airflow_context:
        _set_dbnd_config_from_airflow_connections()

    _set_tracking_config_overide(airflow_context=airflow_context)
    # the context is entered through _enter_cm, like every context manager below
    dc = self._enter_cm(
        new_dbnd_context(name="inplace_tracking")
    )  # type: DatabandContext

    if not root_task_name:
        # extract the name of the script we are running (in Airflow scenario it will be just "airflow")
        root_task_name = sys.argv[0].split(os.path.sep)[-1]

    if airflow_context:
        # under airflow: task, job name, source and run uid all come from the context
        root_task, job_name, source, run_uid = build_run_time_airflow_task(
            airflow_context, root_task_name
        )
        try_number = airflow_context.try_number
    else:
        # plain script: inline root task, job named after it, first attempt
        root_task = _build_inline_root_task(root_task_name)
        job_name = root_task_name
        source = UpdateSource.generic_tracking
        run_uid = None
        try_number = 1

    tracking_source = (
        None  # TODO_CORE build tracking_source -> typeof TrackingSourceSchema
    )
    # existing_run is true exactly when a run uid was supplied (airflow branch only)
    self._run = run = self._enter_cm(
        new_databand_run(
            context=dc,
            job_name=job_name,
            run_uid=run_uid,
            existing_run=run_uid is not None,
            source=source,
            af_context=airflow_context,
            tracking_source=tracking_source,
            project_name=project_name,
        )
    )  # type: DatabandRun
    self._run.root_task = root_task

    self.update_run_from_airflow_context(airflow_context)

    # register the exit handler only once per manager
    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True

    # replace the interpreter's excepthook with this manager's handler
    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    root_task_run = run._build_and_add_task_run(
        root_task, task_af_id=root_task.task_name, try_number=try_number
    )
    root_task_run.is_root = True

    run.tracker.init_run()
    run.root_task_run.set_task_run_state(TaskRunState.RUNNING)

    should_capture_log = TrackingConfig.from_databand_context().capture_tracking_log
    # enter the execution context of the root task run (log capture as configured)
    self._enter_cm(
        run.root_task_run.runner.task_run_execution_context(
            capture_log=should_capture_log, handle_sigterm=False
        )
    )
    self._task_run = run.root_task_run

    return self._task_run
def _create_pod_id(dag_id, task_id):
    """
    Return the ``job_id__dns1123`` of the task run registered under
    ``task_id`` in the current databand run.

    :param dag_id: not used
    :param task_id: id passed to the run's ``get_task_run``
    """
    databand_run = try_get_databand_run()
    return databand_run.get_task_run(task_id).job_id__dns1123