def _calculate_task_defaults(self, defaults):
    # type: (...)-> Dict[str, Any]
    """Merge the ``defaults`` of every base task definition, then overlay *defaults*.

    Later bases and the explicit *defaults* argument win on key conflicts,
    matching the left-to-right semantics of ``combine_mappings``.
    """
    merged = {}
    for schema in self.base_task_definitions:
        merged = combine_mappings(merged, schema.defaults)
    return combine_mappings(merged, defaults)
def _calculate_task_class_values(self, classdict):
    """Collect parameter definitions and defaults for the task class.

    Inherited attributes are discovered first, then overridden by the
    class body and (for decorated functions) by parameters derived from
    the function's decorator spec.  Every parameter is finally rebound
    to this task class as its owner.

    Returns an ``(ordered params dict, merged defaults dict)`` pair.
    """
    # start from whatever the base classes declare
    params, base_defaults = self._discover_base_attributes()
    # class-body attributes override inherited ones (params mutated in place)
    self._update_params_from_attributes(classdict, params)

    # decorated callables contribute parameters taken from their signature
    decorator_spec = self.task_class._conf__decorator_spec
    if decorator_spec is not None:
        builder = FuncParamsBuilder(base_params=params, decorator_spec=decorator_spec)
        self._update_params_from_attributes(builder.get_task_parameters(), params)

    defaults = combine_mappings(base_defaults, classdict.get("defaults", None))

    # bind every parameter definition to its owning task class
    owned = {}
    for name, param in six.iteritems(params):
        owned[name] = param.evolve_with_owner(task_cls=self.task_class, name=name)

    return _ordered_params(owned), defaults
def _calculate_task_class_values(self, classdict):
    """Build the task's parameter definitions and merged defaults.

    Base-class attributes are reflected first; the class body then
    contributes extra ``defaults`` and overrides/evolves the parameter
    definitions in place.
    """
    inherited_params, inherited_defaults = self._discover_base_attributes()
    merged_defaults = combine_mappings(
        inherited_defaults, classdict.get("defaults", None)
    )
    # mutates inherited_params: applies class-body overrides and owner binding
    self._update_and_evolve_owner(classdict, inherited_params)
    return inherited_params, merged_defaults
def _discover_base_attributes(self):
    # type: ()-> (Dict[str,ParameterDefinition], Dict[str, Any])
    """Collect param definitions and defaults inherited from direct bases.

    Only "direct" inheritance is processed; bases are walked in reverse
    declaration order so that earlier bases win when mappings are combined.
    The result keeps a param's definition even when a parent task
    overrode it.
    """
    inherited_params = {}
    inherited_defaults = {}
    for base in reversed(self.task_class.__bases__):  # type: TaskMetaclass
        if hasattr(base, "task_definition"):
            schema = base.task_definition  # type: TaskDefinition
            inherited_defaults = combine_mappings(inherited_defaults, schema.defaults)
            inherited_params = combine_mappings(
                inherited_params, base.task_definition.task_params
            )
        else:
            # non-Task bases carry no parameter metadata -- skip them
            logger.debug(
                "you should inherit from Task objects only: %s -> %s ",
                self.task_class,
                base,
            )
    return inherited_params, inherited_defaults
def _pd_to(self, value, *args, **kwargs):
    """Persist *value* via ``to_hdf``, filling in HDF5 storage defaults.

    Caller-supplied ``kwargs`` take precedence over the defaults below
    (key name, storage format, write mode, zlib compression).
    """
    hdf_defaults = {
        "key": "data",
        "format": _get_supported_hd5_storage_format(value),
        "mode": "w",
        "complib": "zlib",
    }
    value.to_hdf(*args, **combine_mappings(hdf_defaults, kwargs))
def _calculate_task_class_values(self, classdict, external_parameters):
    # type: (Optional[Dict], Optional[Parameters]) -> Dict[str, ParameterDefinition]
    """Build the final, ordered mapping of parameter definitions for this task.

    Sources are merged in increasing priority: base task definitions,
    class-body attributes, decorator-derived parameters, then externally
    supplied parameters.  Each resulting parameter is rebound to this
    task definition as its owner; matching external parameter *values*
    are updated in place to point at the rebound definition.
    """
    # reflect inherited attributes
    params = dict()
    # params will contain definition of param, even it's was overrided by the parent task
    for base_schema in self.base_task_definitions:
        params = combine_mappings(params, base_schema.task_param_defs)

    # let update params with new class attributes
    self._update_params_from_attributes(classdict, params)

    # this is the place we add parameters from function definition
    if self.task_decorator is not None:
        func_params_builder = DecoratedCallableParamBuilder(
            base_params=params, task_decorator=self.task_decorator
        )
        func_params_builder.build_func_params()
        # decorator kwargs < function signature < result params (later wins)
        params_dict = dict(func_params_builder.decorator_kwargs_params)
        params_dict.update(func_params_builder.func_spec_params)
        params_dict.update(func_params_builder.result_params)
        self._update_params_from_attributes(params_dict, params)

    # externally supplied parameters override everything collected so far
    if external_parameters:
        params.update(
            {param.name: param for param in external_parameters.get_params()}
        )

    updated_params = {}
    for name, param in six.iteritems(params):
        # add parameters config
        param_with_owner = param.evolve_with_owner(task_definition=self, name=name)
        # updated the owner in the external parameters
        # NOTE: mutates the external ParameterValue so its .parameter points
        # at the owner-bound definition
        param_value = external_parameters and external_parameters.get_param_value(name)
        if param_value:
            param_value.parameter = param_with_owner
        updated_params[name] = param_with_owner

    params = _ordered_params(updated_params)
    return params
def run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    verbose,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project_name,
    name,
    description,
    run_driver,
    alternative_task_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
):
    """
    Run a task or a DAG

    To see tasks use `dbnd show-tasks` (tab completion is available).
    """
    # local imports: keep CLI startup light until a run is actually requested
    from dbnd._core.context.databand_context import new_dbnd_context, DatabandContext
    from dbnd._core.utils.structures import combine_mappings
    from dbnd import config

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project-name
    # we filter out false flags since otherwise they will always override the config with their falseness
    main_switches = dict(
        databand=filter_dict_remove_false_values(
            dict(
                verbose=verbose > 0,
                describe=describe,
                env=env,
                conf_file=conf_file,
                project_name=project_name,
            )
        ),
        run=filter_dict_remove_false_values(
            dict(
                name=name,
                parallel=parallel,
                description=description,
                is_archived=describe,
            )
        ),
    )

    # tri-state flags: None means "not given on the CLI", so don't touch config
    if submit_driver is not None:
        main_switches["run"]["submit_driver"] = bool(submit_driver)
    if submit_tasks is not None:
        main_switches["run"]["submit_tasks"] = bool(submit_tasks)
    if disable_web_tracker:
        main_switches.setdefault("core", {})["tracker_api"] = "disabled"

    if task_version is not None:
        main_switches["task"] = {"task_version": task_version}

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )

    # click hands these in as tuples; we need mutable lists
    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assume that they are for the main task
    # add them to _sets_root
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    # --set, --set-config
    # each later update layers on top of the previous ones
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(_overrides, source="--set-override", override=True)
        )
    if interactive:
        cmd_line_config.update(
            _parse_cli([{"run.interactive": True}], source="--interactive")
        )
    if verbose > 1:
        cmd_line_config.update(
            _parse_cli([{"task_build.verbose": True}], source="-v -v")
        )

    if cmd_line_config:
        config.set_values(cmd_line_config, source="cmdline")
    if verbose:
        logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))

    # double checking on bootstrap, as we can run from all kind of locations
    # usually we should be bootstraped already as we run from cli.
    dbnd_bootstrap()
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )

    with new_dbnd_context(
        name="run", module=module
    ) as context:  # type: DatabandContext
        task_registry = get_task_registry()

        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

        # modules are loaded, we can load the task
        task_cls = None
        if task_name:
            task_cls = task_registry.get_task_cls(task_name)
            if alternative_task_name:
                # run the task under a different name via a dynamically built subclass
                task_cls = build_dynamic_task(
                    original_cls=task_cls, new_cls_name=alternative_task_name
                )
                task_name = alternative_task_name

        # --set-root
        # now we can get it config, as it's not main task, we can load config after the configuration is loaded
        if task_cls is not None:
            if root_task_config:
                # adding root task to configuration
                config.set_values(
                    {task_cls.task_definition.task_config_section: root_task_config},
                    source="--set-root",
                )

        if is_help or not task_name:
            print_help(ctx, task_cls)
            return

        return context.dbnd_run_task(
            task_or_task_name=task_name,
            run_uid=run_driver,
            scheduled_run_info=scheduled_run_info,
        )
def cmd_run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    _extend,
    verbose,
    print_task_band,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project,
    name,
    description,
    run_driver,
    override_run_uid,
    alternative_task_name,
    job_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
    open_web_tab,
    docker_build_tag,
):
    """
    Run a task or a DAG

    To see all available tasks use `dbnd show-tasks` (tab completion is available).
    `dbnd show-configs` will print all available configs.
    """
    # local imports: keep CLI startup light until a run is actually requested
    from dbnd import config
    from dbnd._core.context.databand_context import DatabandContext, new_dbnd_context
    from dbnd._core.utils.structures import combine_mappings

    task_registry = get_task_registry()

    # we need to do it before we are looking for the task cls
    load_user_modules(dbnd_config=config, modules=module)

    task_name = task

    # --verbose, --describe, --env, --parallel, --conf-file and --project
    # we filter out false flags since otherwise they will always override the config with their falseness
    main_switches = dict(
        databand=dict(
            verbose=verbose > 0,
            print_task_band=print_task_band,
            describe=describe,
            env=env,
            conf_file=conf_file,
            project=project,
        ),
        run=dict(
            name=name,
            parallel=parallel,
            interactive=interactive,
            description=description,
            is_archived=describe,
            open_web_tracker_in_browser=open_web_tab,
            submit_driver=_nullable_flag(submit_driver),
            submit_tasks=_nullable_flag(submit_tasks),
        ),
        kubernetes=dict(docker_build_tag=docker_build_tag),
        task=dict(task_version=task_version),
        task_build=dict(verbose=True if verbose > 1 else None),
        core=dict(tracker_api="disabled" if disable_web_tracker else None),
    )

    # drop keys/sections whose value is empty so they don't clobber config
    main_switches = cleanup_empty_switches(main_switches)

    # click hands these in as tuples; we need mutable lists
    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assume that they are for the main task
    # add them to _sets_root
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )
    # --set, --set-config
    # each later update layers on top of the previous ones
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _extend:
        cmd_line_config.update(_parse_cli(_extend, source="--extend-config", extend=True))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(
                _overrides,
                source="--set-override",
                priority=ConfigValuePriority.OVERRIDE,
            )
        )
    # --set-root
    if root_task_config:
        task_cls = task_registry.get_task_cls(task_name)
        task_section = task_cls.task_definition.task_config_section
        # adding root task to configuration
        cmd_line_config.update(
            parse_and_build_config_store(
                config_values={task_section: root_task_config}, source="--set-root"
            )
        )

    # UPDATE CURRENT CONFIG with CLI values
    if cmd_line_config:
        if verbose:
            logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))
        config.set_values(cmd_line_config, source="cmdline")

    # double checking on bootstrap, as we can run from all kind of locations
    # usually we should be bootstraped already as we run from cli.
    dbnd_bootstrap()

    # initialize basic logging (until we get to the context logging
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )

    # update completer
    if config.getboolean("databand", "completer"):
        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

    # bootstrap and modules are loaded, we can load the task
    task_cls = None
    if task_name:
        task_cls = task_registry.get_task_cls(task_name)

    if not task_name:
        print_help(ctx, None)
        return

    if is_help:
        print_help(ctx, task_cls)
        return

    with tracking_mode_context(tracking=False), new_dbnd_context(
        name="run"
    ) as context:  # type: DatabandContext
        if context.settings.system.describe:
            # we want to print describe without triggering real run
            logger.info("Building main task '%s'", task_name)
            root_task = get_task_registry().build_dbnd_task(task_name)
            root_task.ctrl.describe_dag.describe_dag()
            # currently there is bug with the click version we have when using python 2
            # so we don't use the click.echo function
            # https://github.com/pallets/click/issues/564
            print("Task %s has been described!" % task_name)
            return root_task
        return context.dbnd_run_task(
            task_or_task_name=task_name,
            force_task_name=alternative_task_name,
            job_name=job_name or alternative_task_name or task_name,
            run_uid=run_driver or override_run_uid,
            existing_run=run_driver is not None,
            scheduled_run_info=scheduled_run_info,
            project=project,
        )
def build_pod(
    self,
    task_run,
    cmds,
    args=None,
    labels=None,
    try_number=None,
    include_system_secrets=False,
):
    # type: (TaskRun, List[str], Optional[List[str]], Optional[Dict[str,str]], Optional[int], bool) -> Pod
    """Assemble the airflow ``Pod`` object for executing *task_run*.

    Builds pod name, labels, annotations, resources, env vars and secrets
    from this engine config plus the run's context, then constructs the
    Pod.  Raises DatabandConfigError when no container tag is configured.
    """
    pod_name = self.get_pod_name(task_run=task_run, try_number=try_number)

    image = self.full_image
    # caller-provided labels are overlaid with the engine-config labels
    labels = combine_mappings(labels, self.labels)
    # label values must be DNS-1123 safe
    labels["dbnd_run_uid"] = clean_job_name_dns1123(str(task_run.run.run_uid))
    labels["dbnd_task_run_uid"] = clean_job_name_dns1123(str(task_run.task_run_uid))
    labels["dbnd"] = "task_run"  # for easier pod deletion (kubectl delete pod -l dbnd=task_run -n <my_namespace>)

    annotations = self.annotations.copy()
    if self.gcp_service_account_keys:
        annotations["iam.cloud.google.com/service-account"] = self.gcp_service_account_keys
    annotations["dbnd_tracker"] = task_run.task_tracker_url

    from dbnd_docker.kubernetes.dbnd_extended_resources import DbndExtendedResources

    resources = DbndExtendedResources(
        requests=self.requests,
        limits=self.limits,
        request_memory=self.request_memory,
        request_cpu=self.request_cpu,
        limit_memory=self.limit_memory,
        limit_cpu=self.limit_cpu,
    )
    env_vars = {
        ENV_DBND_POD_NAME: pod_name,
        ENV_DBND_POD_NAMESPACE: self.namespace,
        ENV_DBND_USER: task_run.task_run_env.user,
        ENV_DBND__ENV_IMAGE: image,
        ENV_DBND_ENV: task_run.run.env.task_name,
        ENV_DBND__ENV_MACHINE: "%s at %s" % (pod_name, self.namespace),
    }
    if self.auto_remove:
        env_vars[ENV_DBND_AUTO_REMOVE_POD] = "True"
    env_vars[self._params.get_param_env_key("in_cluster")] = "True"
    env_vars["AIRFLOW__KUBERNETES__IN_CLUSTER"] = "True"
    env_vars["DBND__RUN_INFO__SOURCE_VERSION"] = task_run.run.context.task_run_env.user_code_version
    # we want that all next runs will be able to use the image that we have in our configuration
    env_vars.update(self._params.to_env_map("container_repository", "container_tag"))

    # user-configured env vars and the run's spawn env win over the defaults above
    env_vars.update(self.env_vars)
    env_vars.update(task_run.run.get_context_spawn_env())

    secrets = self.get_secrets(include_system_secrets=include_system_secrets)

    from airflow.contrib.kubernetes.pod import Pod

    if self.trap_exit_file_flag:
        # wrap the command so a marker file is touched when the container exits
        args = [
            textwrap.dedent(
                """
                trap "touch {trap_file}" EXIT
                {command}
                """.format(
                    trap_file=self.trap_exit_file_flag,
                    command=subprocess.list2cmdline(cmds),
                )
            )
        ]
        # we update cmd now
        cmds = ["/bin/bash", "-c"]

    if not self.container_tag:
        raise DatabandConfigError(
            "Your container tag is None, please check your configuration",
            help_msg="Container tag should be assigned",
        )

    pod = Pod(
        namespace=self.namespace,
        name=pod_name,
        envs=env_vars,
        image=image,
        cmds=cmds,
        args=args,
        labels=labels,
        image_pull_policy=self.image_pull_policy,
        image_pull_secrets=self.image_pull_secrets,
        secrets=secrets,
        service_account_name=self.service_account_name,
        volumes=self.volumes,
        volume_mounts=self.volume_mounts,
        annotations=annotations,
        node_selectors=self.node_selectors,
        affinity=self.affinity,
        tolerations=self.tolerations,
        security_context=self.security_context,
        configmaps=self.configmaps,
        hostnetwork=self.hostnetwork,
        resources=resources,
    )
    if self.pod_yaml:
        pod.pod_yaml = target(self.pod_yaml).read()

    return pod
def _pd_to(self, data, file_or_path, *args, **kwargs):
    """Write ``data.features`` and ``data.targets`` into a single HDF5 store.

    Uses the "fixed" HDF format unless the caller overrides it; any
    caller-supplied ``mode`` is dropped because the store is always
    opened for writing.
    """
    options = combine_mappings({"format": "fixed"}, kwargs)
    options.pop("mode", None)
    with pd.HDFStore(file_or_path, "w") as store:
        for key, frame in (("features", data.features), ("targets", data.targets)):
            store.put(key, frame, data_columns=True, **options)
def build_pod(
    self,
    task_run: TaskRun,
    cmds: List[str],
    args: Optional[List[str]] = None,
    labels: Optional[Dict[str, str]] = None,
    try_number: Optional[int] = None,
    include_system_secrets: bool = False,
) -> k8s.V1Pod:
    """Assemble the ``k8s.V1Pod`` for executing *task_run*.

    Builds labels, annotations, resources, env vars and secrets from this
    engine config plus the run's context, then reconciles the result with
    the base pod template.  Raises DatabandConfigError when no container
    tag is configured.
    """
    if not self.container_tag:
        raise DatabandConfigError(
            "Your container tag is None, please check your configuration",
            help_msg="Container tag should be assigned",
        )

    pod_name = self.get_pod_name(task_run=task_run, try_number=try_number)

    image = self.full_image
    # caller-provided labels are overlaid with the engine-config labels
    labels = combine_mappings(labels, self.labels)
    labels["pod_name"] = pod_name
    labels["dbnd_run_uid"] = task_run.run.run_uid
    labels["dbnd_task_run_uid"] = task_run.task_run_uid
    labels["dbnd_task_run_attempt_uid"] = task_run.task_run_attempt_uid
    labels["dbnd_task_family"] = task_run.task.task_definition.full_task_family_short
    labels["dbnd_task_name"] = task_run.task.task_name
    labels["dbnd_task_af_id"] = task_run.task_af_id

    # for easier pod deletion (kubectl delete pod -l dbnd=task_run -n <my_namespace>)
    if task_run.task.task_is_system:
        labels["dbnd"] = "dbnd_system_task_run"
    else:
        labels["dbnd"] = "task_run"

    # we need to be sure that the values meet the dns label names RFC
    # https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#dns-label-names
    labels = {
        label_name: clean_label_name_dns1123(str(label_value))
        for label_name, label_value in six.iteritems(labels)
    }
    if is_verbose():
        logger.info("Build pod with kubernetes labels {}".format(labels))

    annotations = self.annotations.copy()
    if self.gcp_service_account_keys:
        annotations["iam.cloud.google.com/service-account"] = self.gcp_service_account_keys
    annotations["dbnd_tracker"] = task_run.task_tracker_url

    from dbnd_docker.kubernetes.vendorized_airflow.dbnd_extended_resources import (
        DbndExtendedResources,
    )

    resources = DbndExtendedResources(
        requests=self.requests,
        limits=self.limits,
        request_memory=self.request_memory,
        request_cpu=self.request_cpu,
        limit_memory=self.limit_memory,
        limit_cpu=self.limit_cpu,
    )
    env_vars = {
        ENV_DBND_POD_NAME: pod_name,
        ENV_DBND_POD_NAMESPACE: self.namespace,
        ENV_DBND_USER: task_run.task_run_env.user,
        ENV_DBND__ENV_IMAGE: image,
        ENV_DBND_ENV: task_run.run.env.task_name,
        ENV_DBND__ENV_MACHINE: "%s at %s" % (pod_name, self.namespace),
    }
    if AIRFLOW_VERSION_2:
        env_vars["AIRFLOW__CORE__TASK_RUNNER"] = "dbnd_airflow.compat.dbnd_task_runner.DbndStandardTaskRunner"
    if self.auto_remove:
        env_vars[ENV_DBND_AUTO_REMOVE_POD] = "True"
    env_vars[self._params.get_param_env_key(self, "in_cluster")] = "True"
    env_vars["AIRFLOW__KUBERNETES__IN_CLUSTER"] = "True"
    env_vars["DBND__RUN_INFO__SOURCE_VERSION"] = task_run.run.context.task_run_env.user_code_version
    env_vars["AIRFLOW__KUBERNETES__DAGS_IN_IMAGE"] = "True"
    if not get_dbnd_project_config().is_tracking_mode():
        env_vars[ENV_DBND__TRACKING] = "False"
    # we want that all next runs will be able to use the image that we have in our configuration
    env_vars.update(self._params.to_env_map(self, "container_repository", "container_tag"))
    # user-configured env vars and the run's spawn env win over the defaults above
    env_vars.update(self.env_vars)
    env_vars.update(task_run.run.get_context_spawn_env())

    secrets = self.get_secrets(include_system_secrets=include_system_secrets)

    if self.trap_exit_file_flag:
        # wrap the command so a marker file is touched when the container exits
        args = [
            textwrap.dedent(
                """
                trap "touch {trap_file}" EXIT
                {command}
                """.format(
                    trap_file=self.trap_exit_file_flag,
                    command=subprocess.list2cmdline(cmds),
                )
            )
        ]
        # we update cmd now
        cmds = ["/bin/bash", "-c"]

    if self.debug_with_command:
        # debug hook: replace the pod command entirely (logged loudly)
        logger.warning(
            "%s replacing pod %s command with '%s', original command=`%s`",
            task_run,
            pod_name,
            self.debug_with_command,
            subprocess.list2cmdline(cmds),
        )
        cmds = shlex.split(self.debug_with_command)

    base_pod = self._build_base_pod()

    pod = self._to_real_pod(
        cmds=cmds,
        args=args,
        namespace=self.namespace,
        name=pod_name,
        envs=env_vars,
        image=image,
        labels=labels,
        secrets=secrets,
        resources=resources,
        annotations=annotations,
    )

    # overlay our pod onto the base template; template values fill the gaps
    final_pod = reconcile_pods(base_pod, pod)

    return final_pod