Example #1
def json_default(obj, safe=False):
    from dbnd._core.parameter.parameter_definition import ParameterDefinition

    if isinstance(obj, ParameterDefinition):
        return str(obj)

    if isinstance(obj, _FrozenOrderedDict):
        return obj.get_wrapped()

    from targets import Target

    if isinstance(obj, Target):
        return str(obj)

    if isinstance(obj, datetime.datetime):
        return obj.strftime("%Y-%m-%dT%H:%M:%SZ")
    elif isinstance(obj, datetime.date):
        return obj.strftime("%Y-%m-%d")

    # integer scalar types matched by name (e.g. numpy int32/int64), avoiding a hard numpy import
    if is_instance_by_class_name(obj, "int32") or is_instance_by_class_name(obj, "int64"):
        return str(obj)

    if isinstance(obj, UUID):
        return str(obj)
    if safe:
        return str(obj)

    raise TypeError(repr(obj) + " is not JSON serializable")
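
This function is shaped as a `default=` hook for `json.dumps`. A minimal usage sketch (the payload values are illustrative):

import datetime
import json
import uuid

payload = {"when": datetime.datetime.utcnow(), "run_uid": uuid.uuid4()}
# safe=True falls back to str() for unknown types instead of raising TypeError
print(json.dumps(payload, default=lambda o: json_default(o, safe=True)))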
Example #2
def wrap_operator_with_tracking_info(tracking_info, operator):
    # type: (Dict[str, str], Any) -> Optional[ContextManager]
    """
    Wrap the operator with the relevant tracking context manager, if one is registered for its class; return None otherwise.
    """
    for class_name, tracking_wrapper in _EXECUTE_TRACKING.items():
        if is_instance_by_class_name(operator, class_name):
            return tracking_wrapper(operator, tracking_info)
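
Given the Optional[ContextManager] return type, a caller would presumably enter the wrapper around the operator's execute. A sketch, assuming `operator`, `context` and `tracking_info` come from the surrounding Airflow integration:

wrapper = wrap_operator_with_tracking_info(tracking_info, operator)
if wrapper is not None:
    with wrapper:  # run execute() inside the tracking scope
        result = operator.execute(context)
else:
    result = operator.execute(context)  # no tracking wrapper registered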
Example #3
def add_tracking_to_submit_task(tracking_info, operator):
    """
    Apply the relevant tracking wrapper to the operator, if one is registered for its class.
    """
    for class_name, tracking_wrapper in _EXECUTE_TRACKING.items():
        if is_instance_by_class_name(operator, class_name):
            tracking_wrapper(operator, tracking_info)
            break
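
Both helpers above iterate over _EXECUTE_TRACKING, a mapping from operator class names to wrapper callables. Its real contents live in dbnd; a hypothetical, runnable sketch of its shape:

def _noop_wrapper(operator, tracking_info):
    # placeholder body: a real wrapper would inject tracking_info into the
    # operator's submit command or environment
    pass

# class names are matched via is_instance_by_class_name, so the Airflow
# operator classes themselves never need to be imported here
_EXECUTE_TRACKING = {
    "SparkSubmitOperator": _noop_wrapper,  # hypothetical entry
    "EmrAddStepsOperator": _noop_wrapper,  # hypothetical entry
}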
Example #4
def _track_task(task):
    if should_not_track(task):
        return

    if is_instance_by_class_name(task, "SubDagOperator"):
        # we do not track the SubDagOperator's own execute, only its inner tasks
        track_dag(task.subdag)
    else:
        track_operator(task)
Example #5
def build_run_time_airflow_task(af_context):
    # type: (AirflowTaskContext) -> Tuple[AirflowOperatorRuntimeTask, str, UpdateSource]
    if af_context.context:
        # we are in the execute entry point, so the task name is <task>__execute
        task_family = "%s__execute" % af_context.task_id

        airflow_operator = af_context.context["task_instance"].task

        tracked_function = None
        if is_instance_by_class_name(airflow_operator, "PythonOperator"):
            tracked_function = airflow_operator.python_callable

        # find the template fields of the operators
        user_params = get_flatten_operator_params(airflow_operator)
        if tracked_function:
            user_params["function_name"] = tracked_function.__name__

        # create params definitions for the operator's fields
        params_def = {
            name: parameter[type(value)].build_parameter("inline")
            for name, value in user_params.items()
        }

        task_class = build_dynamic_task_class(
            AirflowOperatorRuntimeTask,
            task_family,
            params_def,
        )

        if tracked_function:
            import inspect

            task_class.task_definition.task_source_code = inspect.getsource(
                tracked_function)
            task_class.task_definition.task_module_code = inspect.getsource(
                inspect.getmodule(tracked_function))

    else:
        # if this is an inline run-time task, we name it after the script which ran it
        task_family = sys.argv[0].split(os.path.sep)[-1]
        task_class = build_dynamic_task_class(AirflowOperatorRuntimeTask,
                                              task_family)
        module_code = get_user_module_code()
        task_class.task_definition.task_module_code = module_code
        task_class.task_definition.task_source_code = module_code
        user_params = {}

    root_task = task_class(dag_id=af_context.dag_id,
                           execution_date=af_context.execution_date,
                           task_version="%s:%s" %
                           (af_context.task_id, af_context.execution_date),
                           **user_params)

    job_name = "{}.{}".format(af_context.dag_id, af_context.task_id)
    source = UpdateSource.airflow_tracking
    return root_task, job_name, source
Example #6
def _track_task(task):
    from airflow.operators.subdag_operator import SubDagOperator

    if should_not_track(task):
        return

    track_operator(task)
    if is_instance_by_class_name(task, SubDagOperator.__name__):
        # we also track the subdag's inner tasks
        track_dag(task.subdag)
Example #7
    # the inner recursive helper of traverse() (see Example #13); it closes over
    # convert_f, convert_types, filter_none and filter_empty from the enclosing call
    def t(obj):
        if convert_types and isinstance(obj, convert_types):
            return convert_f(obj)

        if isinstance(obj, Mapping):
            # noinspection PyArgumentList
            converted = ((k, t(v)) for k, v in six.iteritems(obj))
            if filter_none:
                converted = ((k, v) for k, v in converted if v is not None)
            new_obj = obj.__class__(converted)
            if filter_empty and not new_obj:
                return None
            return new_obj

        if isinstance(obj, six.string_types):
            return convert_f(obj)

        if "pyspark" in str(type(obj)):
            return convert_f(obj)

        try:
            target_no_traverse = hasattr(obj, "target_no_traverse") and getattr(
                obj, "target_no_traverse", None
            )
        except ValueError:
            # Spark objects do not support hasattr
            target_no_traverse = True
        # only skip traversal when the flag is an explicit boolean True;
        # `x is bool` would compare against the type object and never fire
        if isinstance(target_no_traverse, bool) and target_no_traverse:
            return convert_f(obj)

        list_obj_constructor = None
        if isinstance(obj, (list, tuple, set)):
            list_obj_constructor = obj.__class__
        elif is_instance_by_class_name(obj, "DataFrame"):
            pass
        else:
            try:
                iter(obj)  # noqa: F841
                list_obj_constructor = list
            except TypeError:
                pass

        # we can traverse and reconstruct the list-like object
        if list_obj_constructor:
            converted = (t(r) for r in obj)
            if filter_none:
                converted = filter(lambda x: x is not None, converted)
            new_obj = list_obj_constructor(converted)
            if filter_empty and not new_obj:
                return None
            return new_obj

        # it's a plain object; apply the conversion function
        return convert_f(obj)
Example #8
def _track_task(task):
    from airflow.operators.subdag_operator import SubDagOperator

    if should_not_track(task):
        return

    if is_instance_by_class_name(task, SubDagOperator.__name__):
        # we do not track the SubDagOperator's own execute, only its inner tasks
        track_dag(task.subdag)
    else:
        track_operator(task)
Example #9
def add_tracking_to_submit_task(tracking_info, operator):
    for class_name, tracking_wrapper in _EXECUTE_TRACKING.items():
        if is_instance_by_class_name(operator, class_name):
            tracking_wrapper(operator, tracking_info)
            break
Example #10
def should_flatten(operator, attr_name):
    flatten_config = get_settings().tracking.flatten_operator_fields
    for op_name in flatten_config:
        if is_instance_by_class_name(operator, op_name):
            return attr_name in flatten_config[op_name]
    return False
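
For context, `flatten_operator_fields` maps operator class names to the attribute names whose values should be flattened. A hypothetical value (the real one comes from the dbnd tracking settings; the operator/field pairs here are assumptions):

flatten_config = {
    "PythonOperator": {"op_kwargs"},
    "BashOperator": {"env"},
}
# with this config, should_flatten(python_operator, "op_kwargs") -> True
# and should_flatten(python_operator, "op_args") -> False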
Example #11
def get_tracking_wrapper(task):
    for class_name, tracking_wrapper in _TRACKING.items():
        if is_instance_by_class_name(task, class_name):
            return tracking_wrapper
    raise KeyError("no tracking wrapper registered for %s" % type(task).__name__)
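
Every example in this listing leans on is_instance_by_class_name, which matches an object against a class *name* so that optional dependencies (Airflow operators, numpy scalars, Spark types) never have to be imported. A minimal sketch of such a helper, assuming it walks the MRO (this is an assumption, not the dbnd source):

def is_instance_by_class_name(obj, class_name):
    # compare class names along the MRO so subclasses match as well,
    # without ever importing the target class
    return any(cls.__name__ == class_name for cls in type(obj).__mro__)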
Example #12
def test_is_instance_by_class_name(obj, name, expected):
    assert is_instance_by_class_name(obj, name) == expected
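
The bare test body implies a parametrized test. A sketch of how the cases might be supplied with pytest (the case values are illustrative and assume the MRO-walking behavior sketched above):

import pytest

@pytest.mark.parametrize(
    "obj, name, expected",
    [
        (1, "int", True),
        (1, "str", False),
        (ValueError("boom"), "Exception", True),  # base-class names match too
    ],
)
def test_is_instance_by_class_name(obj, name, expected):
    assert is_instance_by_class_name(obj, name) == expected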
Example #13
def traverse(
    struct,
    convert_f=f_identity,
    filter_none=False,
    filter_empty=False,
    convert_types=None,
    traverse_path=None,
):
    """
    Maps all Tasks in a structured data object to their .output().
    :param convert_types non basic types to apply convert_f
    :param filter_none  remove None from structures
    """
    obj = struct

    if traverse_path is None:
        traverse_path = []  # first invocation of the function

    t = functools.partial(
        traverse,
        convert_f=convert_f,
        filter_none=filter_none,
        filter_empty=filter_empty,
        convert_types=convert_types,
    )

    if convert_types and isinstance(obj, convert_types):
        return convert_f(obj)

    if isinstance(obj, Mapping):
        # noinspection PyArgumentList
        converted = (
            (k, t(v, traverse_path=traverse_path + [k])) for k, v in six.iteritems(obj)
        )
        if filter_none:
            converted = ((k, v) for k, v in converted if v is not None)
        new_obj = obj.__class__(converted)
        if filter_empty and not new_obj:
            return None
        return new_obj

    if isinstance(obj, six.string_types):
        return convert_f(obj)

    if "pyspark" in str(type(obj)):
        return convert_f(obj)

    try:
        target_no_traverse = hasattr(obj, "target_no_traverse") and getattr(
            obj, "target_no_traverse", None
        )
    except ValueError:
        # Spark objects do not support hasattr
        target_no_traverse = True
    # only skip traversal when the flag is an explicit boolean True;
    # `x is bool` would compare against the type object and never fire
    if isinstance(target_no_traverse, bool) and target_no_traverse:
        return convert_f(obj)

    list_obj_constructor = None
    if isinstance(obj, (list, tuple, set)):
        list_obj_constructor = obj.__class__
    elif is_instance_by_class_name(obj, "DataFrame"):
        pass
    else:
        try:
            iter(obj)  # noqa: F841
            list_obj_constructor = list
        except TypeError:
            pass

    # we can traverse and reconstruct the list-like object
    if list_obj_constructor:
        converted = (t(r, traverse_path=traverse_path + [i]) for i, r in enumerate(obj))
        if filter_none:
            converted = filter(lambda x: x is not None, converted)
        new_obj = list_obj_constructor(converted)
        if filter_empty and not new_obj:
            return None
        return new_obj

    # it's a plain object; apply the conversion function
    return convert_f(obj)
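
A small usage example, assuming the default f_identity simply returns its argument:

data = {"a": [1, None, 2], "b": None, "c": {}}
cleaned = traverse(data, filter_none=True, filter_empty=True)
assert cleaned == {"a": [1, 2]}  # Nones dropped, the empty mapping "c" removed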
Example #14
def build_run_time_airflow_task(af_context, root_task_name):
    # type: (AirflowTaskContext, Optional[str]) -> Tuple[TrackingTask, str, UpdateSource, UUID]
    if af_context.context:
        # we are in the execute entry point; here the task is named after the Airflow task_id
        task_family = af_context.task_id

        airflow_operator = af_context.context["task_instance"].task

        # find the template fields of the operators
        user_params = get_flatten_operator_params(airflow_operator)

        source_code = NO_SOURCE_CODE
        if is_instance_by_class_name(airflow_operator, "PythonOperator"):
            tracked_function = airflow_operator.python_callable
            user_params["function_name"] = tracked_function.__name__
            source_code = TaskSourceCode.from_callable(tracked_function)
    else:
        # if this is an inline run-time task, we name it after the script which ran it
        # if we ever get here, root_task_name will be just "airflow", since that is what we pass
        task_family = get_task_family_for_inline_script(
            af_context.task_id, root_task_name)
        source_code = TaskSourceCode.from_callstack()
        user_params = {}

    user_params.update(
        dag_id=af_context.dag_id,
        execution_date=af_context.execution_date,
        task_version="%s:%s" % (af_context.task_id, af_context.execution_date),
    )

    # just a placeholder name
    task_passport = TaskPassport.from_module(task_family)
    task_definition_uid = get_task_def_uid(
        af_context.dag_id,
        task_family,
        "{}{}".format(
            source_md5(source_code.task_source_code),
            source_md5(source_code.task_module_code),
        ),
    )
    root_task = TrackingTask.for_user_params(
        task_definition_uid=task_definition_uid,
        task_name=task_family,
        task_passport=task_passport,
        source_code=source_code,
        user_params=user_params,
    )  # type: TrackingTask

    root_task.ctrl.task_repr.task_functional_call = ""
    root_task.ctrl.task_repr.task_command_line = generate_airflow_cmd(
        dag_id=af_context.dag_id,
        task_id=af_context.task_id,
        execution_date=af_context.execution_date,
        is_root_task=False,
    )

    root_run_uid = get_job_run_uid(
        airflow_instance_uid=af_context.airflow_instance_uid,
        dag_id=af_context.dag_id,
        execution_date=af_context.execution_date,
    )
    root_task.ctrl.force_task_run_uid = get_task_run_uid(
        run_uid=root_run_uid, dag_id=af_context.dag_id, task_id=task_family)

    job_name = af_context.dag_id
    source = UpdateSource.airflow_tracking
    return root_task, job_name, source, root_run_uid
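
A hedged sketch of how this builder might be invoked from the tracking entry point; the AirflowTaskContext constructor is assumed from the attributes the function reads (dag_id, task_id, execution_date, context, airflow_instance_uid):

af_context = AirflowTaskContext(  # hypothetical construction
    dag_id="my_dag",
    task_id="my_task",
    execution_date="2021-01-01T00:00:00+00:00",
    airflow_instance_uid="instance-uid",  # placeholder value
    context=None,  # no Airflow context -> the inline run-time branch above
)
root_task, job_name, source, run_uid = build_run_time_airflow_task(af_context, "airflow")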