def json_default(obj, safe=False):
    from dbnd._core.parameter.parameter_definition import ParameterDefinition

    if isinstance(obj, ParameterDefinition):
        return str(obj)

    if isinstance(obj, _FrozenOrderedDict):
        return obj.get_wrapped()

    from targets import Target

    if isinstance(obj, Target):
        return str(obj)

    if isinstance(obj, datetime.datetime):
        return obj.strftime("%Y-%m-%dT%H:%M:%SZ")
    elif isinstance(obj, datetime.date):
        return obj.strftime("%Y-%m-%d")

    if is_instance_by_class_name(obj, "int32") or is_instance_by_class_name(
        obj, "int64"
    ):
        return str(obj)

    if isinstance(obj, UUID):
        return str(obj)

    if safe:
        return str(obj)

    raise TypeError(repr(obj) + " is not JSON serializable")
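# A minimal usage sketch: json_default is intended as the `default` hook for
# json.dumps, assuming this module's imports (datetime, UUID, the dbnd helpers)
# are available. The payload values below are made up for illustration.
import datetime
import json
from uuid import UUID

payload = {
    "run_id": UUID("12345678-1234-5678-1234-567812345678"),
    "started": datetime.datetime(2021, 1, 1, 12, 0, 0),
}
# datetime -> "2021-01-01T12:00:00Z", UUID -> its string form; passing
# safe=True (e.g. via functools.partial) stringifies any unknown type instead
# of raising TypeError
print(json.dumps(payload, default=json_default))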
def wrap_operator_with_tracking_info(tracking_info, operator):
    # type: (Dict[str, str], Any) -> Optional[ContextManager]
    """Wrap the operator with the relevant tracking method, if such a method is found."""
    for class_name, tracking_wrapper in _EXECUTE_TRACKING.items():
        if is_instance_by_class_name(operator, class_name):
            return tracking_wrapper(operator, tracking_info)
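# A minimal usage sketch: the function returns None when no wrapper matches the
# operator's class, so callers must guard the context manager. `operator` and
# `execution_context` are hypothetical stand-ins for an Airflow operator and
# its execute() context.
ctx = wrap_operator_with_tracking_info(tracking_info, operator)
if ctx is not None:
    with ctx:
        operator.execute(execution_context)
else:
    operator.execute(execution_context)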
def add_tracking_to_submit_task(tracking_info, operator):
    """Wrap the operator with the relevant tracking method, if such a method is found."""
    for class_name, tracking_wrapper in _EXECUTE_TRACKING.items():
        if is_instance_by_class_name(operator, class_name):
            tracking_wrapper(operator, tracking_info)
            break
def _track_task(task):
    if should_not_track(task):
        return

    if is_instance_by_class_name(task, "SubDagOperator"):
        # we do not track the execute of a SubDag, only its tasks
        track_dag(task.subdag)
    else:
        track_operator(task)
def build_run_time_airflow_task(af_context):
    # type: (AirflowTaskContext) -> (AirflowOperatorRuntimeTask, str, UpdateSource)
    if af_context.context:
        # we are in the execute entry point, so the task name is <task>__execute
        task_family = "%s__execute" % af_context.task_id
        airflow_operator = af_context.context["task_instance"].task

        tracked_function = None
        if is_instance_by_class_name(airflow_operator, "PythonOperator"):
            tracked_function = airflow_operator.python_callable

        # find the template fields of the operator
        user_params = get_flatten_operator_params(airflow_operator)
        if tracked_function:
            user_params["function_name"] = tracked_function.__name__

        # create params definitions for the operator's fields
        params_def = {
            name: parameter[type(value)].build_parameter("inline")
            for name, value in user_params.items()
        }

        task_class = build_dynamic_task_class(
            AirflowOperatorRuntimeTask, task_family, params_def,
        )

        if tracked_function:
            import inspect

            task_class.task_definition.task_source_code = inspect.getsource(
                tracked_function
            )
            task_class.task_definition.task_module_code = inspect.getsource(
                inspect.getmodule(tracked_function)
            )
    else:
        # if this is an inline run-time task, we name it after the script which ran it
        task_family = sys.argv[0].split(os.path.sep)[-1]
        task_class = build_dynamic_task_class(AirflowOperatorRuntimeTask, task_family)

        module_code = get_user_module_code()
        task_class.task_definition.task_module_code = module_code
        task_class.task_definition.task_source_code = module_code
        user_params = {}

    root_task = task_class(
        dag_id=af_context.dag_id,
        execution_date=af_context.execution_date,
        task_version="%s:%s" % (af_context.task_id, af_context.execution_date),
        **user_params
    )

    job_name = "{}.{}".format(af_context.dag_id, af_context.task_id)
    source = UpdateSource.airflow_tracking
    return root_task, job_name, source
def _track_task(task):
    from airflow.operators.subdag_operator import SubDagOperator

    if should_not_track(task):
        return

    track_operator(task)
    if is_instance_by_class_name(task, SubDagOperator.__name__):
        # we also track the subdag's inner tasks
        track_dag(task.subdag)
def t(obj):
    # recursive helper: a closure over traverse()'s convert_f, filter_none,
    # filter_empty and convert_types arguments
    if convert_types and isinstance(obj, convert_types):
        return convert_f(obj)

    if isinstance(obj, Mapping):
        # noinspection PyArgumentList
        converted = ((k, t(v)) for k, v in six.iteritems(obj))
        if filter_none:
            converted = ((k, v) for k, v in converted if v is not None)
        new_obj = obj.__class__(converted)
        if filter_empty and not new_obj:
            return None
        return new_obj

    if isinstance(obj, six.string_types):
        return convert_f(obj)

    if "pyspark" in str(type(obj)):
        return convert_f(obj)

    try:
        target_no_traverse = hasattr(obj, "target_no_traverse") and getattr(
            obj, "target_no_traverse", None
        )
    except ValueError:  # SPARK OBJECTS do not support hasattr
        target_no_traverse = True

    # objects flagged with target_no_traverse are returned without traversal
    if target_no_traverse:
        return convert_f(obj)

    list_obj_constructor = None
    if isinstance(obj, (list, tuple, set)):
        list_obj_constructor = obj.__class__
    elif is_instance_by_class_name(obj, "DataFrame"):
        pass
    else:
        try:
            iter(obj)  # noqa: F841
            list_obj_constructor = list
        except TypeError:
            pass

    # we can parse and reconstruct list objects
    if list_obj_constructor:
        converted = (t(r) for r in obj)
        if filter_none:
            converted = filter(lambda x: x is not None, converted)
        new_obj = list_obj_constructor(converted)
        if filter_empty and not new_obj:
            return None
        return new_obj

    # it's a simple object, so apply the function
    return convert_f(obj)
def _track_task(task):
    from airflow.operators.subdag_operator import SubDagOperator

    if should_not_track(task):
        return

    if is_instance_by_class_name(task, SubDagOperator.__name__):
        # we do not track the execute of a SubDag, only its tasks
        track_dag(task.subdag)
    else:
        track_operator(task)
def add_tracking_to_submit_task(tracking_info, operator):
    for class_name, tracking_wrapper in _EXECUTE_TRACKING.items():
        if is_instance_by_class_name(operator, class_name):
            tracking_wrapper(operator, tracking_info)
            break
def should_flatten(operator, attr_name):
    flatten_config = get_settings().tracking.flatten_operator_fields
    for op_name in flatten_config:
        if is_instance_by_class_name(operator, op_name):
            return attr_name in flatten_config[op_name]
    return False
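# The lookup above implies tracking.flatten_operator_fields maps an operator
# class name to a collection of attribute names; a hypothetical value for
# illustration only:
flatten_operator_fields = {
    "EmrAddStepsOperator": {"steps"},
    "DataProcPySparkOperator": {"arguments"},
}
# With such a config, should_flatten(operator, "steps") is True only when the
# operator's class matches "EmrAddStepsOperator".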
def get_tracking_wrapper(task):
    for class_name, tracking_wrapper in _TRACKING.items():
        if is_instance_by_class_name(task, class_name):
            return tracking_wrapper
    raise KeyError
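# A minimal usage sketch: a miss raises a bare KeyError, so lookups are
# typically guarded. The wrapper's call signature is assumed for illustration.
try:
    wrapper = get_tracking_wrapper(task)
except KeyError:
    wrapper = None  # no tracking registered for this operator type
if wrapper is not None:
    wrapper(task)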
def test_is_instance_by_class_name(obj, name, expected):
    assert is_instance_by_class_name(obj, name) == expected
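# The (obj, name, expected) signature suggests pytest parametrization; one
# plausible (hypothetical) table of cases, assuming the helper compares against
# the name of the object's class:
import pytest

@pytest.mark.parametrize(
    "obj, name, expected",
    [
        (1, "int", True),
        (1, "str", False),
        ("abc", "str", True),
    ],
)
def test_is_instance_by_class_name(obj, name, expected):
    assert is_instance_by_class_name(obj, name) == expected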
def traverse(
    struct,
    convert_f=f_identity,
    filter_none=False,
    filter_empty=False,
    convert_types=None,
    traverse_path=None,
):
    """
    Maps all Tasks in a structured data object to their .output().

    :param convert_types: non-basic types to apply convert_f to
    :param filter_none: remove None values from structures
    """
    obj = struct
    if traverse_path is None:
        traverse_path = []  # first invocation of the function

    t = functools.partial(
        traverse,
        convert_f=convert_f,
        filter_none=filter_none,
        filter_empty=filter_empty,
        convert_types=convert_types,
    )

    if convert_types and isinstance(obj, convert_types):
        return convert_f(obj)

    if isinstance(obj, Mapping):
        # noinspection PyArgumentList
        converted = (
            (k, t(v, traverse_path=traverse_path + [k])) for k, v in six.iteritems(obj)
        )
        if filter_none:
            converted = ((k, v) for k, v in converted if v is not None)
        new_obj = obj.__class__(converted)
        if filter_empty and not new_obj:
            return None
        return new_obj

    if isinstance(obj, six.string_types):
        return convert_f(obj)

    if "pyspark" in str(type(obj)):
        return convert_f(obj)

    try:
        target_no_traverse = hasattr(obj, "target_no_traverse") and getattr(
            obj, "target_no_traverse", None
        )
    except ValueError:  # SPARK OBJECTS do not support hasattr
        target_no_traverse = True

    # objects flagged with target_no_traverse are returned without traversal
    if target_no_traverse:
        return convert_f(obj)

    list_obj_constructor = None
    if isinstance(obj, (list, tuple, set)):
        list_obj_constructor = obj.__class__
    elif is_instance_by_class_name(obj, "DataFrame"):
        pass
    else:
        try:
            iter(obj)  # noqa: F841
            list_obj_constructor = list
        except TypeError:
            pass

    # we can parse and reconstruct list objects
    if list_obj_constructor:
        converted = (
            t(r, traverse_path=traverse_path + [i]) for i, r in enumerate(obj)
        )
        if filter_none:
            converted = filter(lambda x: x is not None, converted)
        new_obj = list_obj_constructor(converted)
        if filter_empty and not new_obj:
            return None
        return new_obj

    # it's a simple object, so apply the function
    return convert_f(obj)
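# A minimal usage sketch of traverse, assuming the default f_identity returns
# its argument unchanged: filter_none drops None leaves, and filter_empty then
# drops containers that became empty. The data below is illustrative only.
data = {"a": [1, None, 2], "b": {"c": None}, "s": "keep"}
result = traverse(data, filter_none=True, filter_empty=True)
# result == {"a": [1, 2], "s": "keep"}  ("b" became empty and was dropped)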
def build_run_time_airflow_task(af_context, root_task_name):
    # type: (AirflowTaskContext, Optional[str]) -> Tuple[TrackingTask, str, UpdateSource, UUID]
    if af_context.context:
        # we are in the execute entry point
        task_family = af_context.task_id
        airflow_operator = af_context.context["task_instance"].task

        # find the template fields of the operator
        user_params = get_flatten_operator_params(airflow_operator)

        source_code = NO_SOURCE_CODE
        if is_instance_by_class_name(airflow_operator, "PythonOperator"):
            tracked_function = airflow_operator.python_callable
            user_params["function_name"] = tracked_function.__name__
            source_code = TaskSourceCode.from_callable(tracked_function)
    else:
        # if this is an inline run-time task, we name it after the script which ran it.
        # If we ever get here, root_task_name will be just "airflow", since this is what we pass.
        task_family = get_task_family_for_inline_script(
            af_context.task_id, root_task_name
        )
        source_code = TaskSourceCode.from_callstack()
        user_params = {}

    user_params.update(
        dag_id=af_context.dag_id,
        execution_date=af_context.execution_date,
        task_version="%s:%s" % (af_context.task_id, af_context.execution_date),
    )

    # just a placeholder name
    task_passport = TaskPassport.from_module(task_family)
    task_definition_uid = get_task_def_uid(
        af_context.dag_id,
        task_family,
        "{}{}".format(
            source_md5(source_code.task_source_code),
            source_md5(source_code.task_module_code),
        ),
    )

    root_task = TrackingTask.for_user_params(
        task_definition_uid=task_definition_uid,
        task_name=task_family,
        task_passport=task_passport,
        source_code=source_code,
        user_params=user_params,
    )  # type: TrackingTask

    root_task.ctrl.task_repr.task_functional_call = ""
    root_task.ctrl.task_repr.task_command_line = generate_airflow_cmd(
        dag_id=af_context.dag_id,
        task_id=af_context.task_id,
        execution_date=af_context.execution_date,
        is_root_task=False,
    )

    root_run_uid = get_job_run_uid(
        airflow_instance_uid=af_context.airflow_instance_uid,
        dag_id=af_context.dag_id,
        execution_date=af_context.execution_date,
    )
    root_task.ctrl.force_task_run_uid = get_task_run_uid(
        run_uid=root_run_uid, dag_id=af_context.dag_id, task_id=task_family
    )

    job_name = af_context.dag_id
    source = UpdateSource.airflow_tracking
    return root_task, job_name, source, root_run_uid