Example #1
def set_dbnd_config_from_airflow_connections():
    from dbnd._core.configuration.dbnd_config import config

    all_config_layers_names = {
        layer.name for layer in config.config_layer.get_all_layers()
    }

    # apply the Airflow-connection layer only if it is not already present
    if AIRFLOW_DBND_CONNECTION_SOURCE not in all_config_layers_names:
        json_config = get_dbnd_json_config_from_airflow_connections()
        if not json_config:
            return False

        from dbnd._core.configuration.config_value import ConfigValuePriority

        config.set_values(
            config_values=json_config,
            priority=ConfigValuePriority.NORMAL,
            source=AIRFLOW_DBND_CONNECTION_SOURCE,
        )
        logger.debug(
            "Databand config was set using {0} connection.".format(
                DATABAND_AIRFLOW_CONN_ID
            )
        )
    return True
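
A minimal usage sketch, assuming the dbnd Airflow integration is installed and that logger and DATABAND_AIRFLOW_CONN_ID are the module-level names used in the excerpt (the bootstrap function name below is hypothetical):

def bootstrap_tracking_from_airflow():
    # True: the config layer already existed or was just loaded from the
    # Airflow connection. False: no JSON config could be read from it.
    if not set_dbnd_config_from_airflow_connections():
        logger.warning(
            "No Databand config found in Airflow connection %s",
            DATABAND_AIRFLOW_CONN_ID,
        )

The boolean return lets callers distinguish "connection missing or empty" (False) from "config already applied or applied now" (True) and fall back to file- or environment-based configuration if needed.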
Example #2
    def start(
        self,
        root_task_name,
        in_memory=True,
        run_uid=None,
        airflow_context=False,
        job_name=None,
    ):
        if try_get_databand_context():
            # a context already exists; nothing to start
            return

        if not airflow_context and not self._atexit_registered:
            atexit.register(self.stop)
            if is_airflow_enabled():
                from airflow.settings import dispose_orm

                # prevent Airflow's ORM teardown from running at interpreter exit
                atexit.unregister(dispose_orm)
        c = {
            "run": {
                "skip_completed": False
            },  # no completion check: the script runs with task_version="now"
            "task": {
                "task_in_memory_outputs": in_memory
            },  # do not save any outputs
        }
        config.set_values(config_values=c, override=True, source="dbnd_start")
        context_kwargs = {"name": "airflow"} if airflow_context else {}
        # create databand context
        dc = self._enter_cm(
            new_dbnd_context(**context_kwargs))  # type: DatabandContext

        root_task = _build_inline_root_task(root_task_name,
                                            airflow_context=airflow_context)
        # create databand run
        dr = self._enter_cm(
            new_databand_run(
                context=dc,
                task_or_task_name=root_task,
                run_uid=run_uid,
                existing_run=False,
                job_name=job_name,
            ))  # type: DatabandRun

        if run_uid:
            root_task_run_uid = get_task_run_uid(run_uid, root_task_name)
        else:
            root_task_run_uid = None
        dr._init_without_run(root_task_run_uid=root_task_run_uid)

        self._start_taskrun(dr.driver_task_run)
        self._start_taskrun(dr.root_task_run)
        return dr
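
A hedged usage sketch: the class that defines start() is not shown in the excerpt, so manager below stands in for an instance of it (a hypothetical name):

# start() returns None when a DatabandContext already exists; otherwise
# it creates and returns a new DatabandRun for the inline root task.
run = manager.start("my_root_task", job_name="nightly_job")
if run is not None:
    print("started a new DatabandRun")

Note that atexit.register(self.stop) makes cleanup automatic for plain scripts, while in an Airflow context that hook is skipped, so stop is presumably invoked explicitly elsewhere.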
Example #3
def _set_tracking_config_overide(airflow_context=None):
    # Create a proper DatabandContext so we can create other objects.
    # There should be no orchestration tasks here; still, disable any
    # orchestration side effects just in case.
    config_for_tracking = {
        "run": {
            "skip_completed": False,
            "skip_completed_on_run": False,
            "validate_task_inputs": False,
            "validate_task_outputs": False,
        },  # no completion check: the script runs with task_version="now"
        "task": {
            "task_in_memory_outputs": True
        },  # do not save any outputs
        "core": {
            "tracker_raise_on_error": False
        },  # do not fail on tracker errors
    }
    if airflow_context:
        import pytz

        task_target_date = pendulum.parse(airflow_context.execution_date,
                                          tz=pytz.UTC).date()
        use_dbnd_log = override_airflow_log_system_for_tracking()
        if use_dbnd_log is not None:
            config_for_tracking["log"] = {"disabled": not use_dbnd_log}

        config_for_tracking["task"]["task_target_date"] = task_target_date

    return config.set_values(
        config_values=config_for_tracking,
        priority=ConfigValuePriority.OVERRIDE,
        source="dbnd_tracking_config",
    )
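
Because these values are set with ConfigValuePriority.OVERRIDE, they take precedence over layers added at NORMAL priority, such as the Airflow-connection layer from Example #1. A short sketch of the intended call order (an assumption, inferred from the excerpts above):

# apply the tracking overrides before creating the context,
# so every object built inside it sees the overridden values
_set_tracking_config_overide(airflow_context=airflow_context)
with new_dbnd_context(name="airflow") as dc:
    ...  # tracking objects are created here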
Example #4
def set_tracking_config_overide(airflow_context=None, use_dbnd_log=None):
    # 1. create proper DatabandContext so we can create other objects
    track_with_cache = config.getboolean("run", "tracking_with_cache")
    config_for_tracking = {
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },  # no completion check: the script runs with task_version="now"
        "task": {
            "task_in_memory_outputs": not track_with_cache
        },  # do not save any outputs
        "core": {
            "tracker_raise_on_error": False
        },  # do not fail on tracker errors
    }
    if airflow_context:
        import pytz

        task_target_date = pendulum.parse(airflow_context.execution_date,
                                          tz=pytz.UTC).date()
        use_dbnd_log = override_airflow_log_system_for_tracking()
        config_for_tracking["task"]["task_target_date"] = task_target_date

    if use_dbnd_log is not None:
        config_for_tracking["log"] = {"disabled": not use_dbnd_log}
    return config.set_values(config_values=config_for_tracking,
                             override=True,
                             source="dbnd_tracking_config")
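
Unlike Example #3, this variant derives all skip/validate flags from the single run.tracking_with_cache boolean. A hedged sketch of flipping that switch before calling the function (the source label is illustrative):

# enable cache-aware tracking: completed tasks are skipped and task
# outputs are persisted instead of being kept in memory
config.set_values(
    config_values={"run": {"tracking_with_cache": True}},
    override=True,
    source="my_setup",  # hypothetical source label
)
set_tracking_config_overide(airflow_context=airflow_context)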
Example #5
def dbnd_tracking_start(job_name=None,
                        run_name=None,
                        project_name=None,
                        conf=None):
    """
    This function is used for tracking standalone Python scripts and should be called at the beginning of the script.

    When the script execution ends, dbnd_tracking_stop will be called automatically; there is no need to add it manually.

    Args:
        job_name: Name of the pipeline
        run_name: Name of the run
        project_name: Name of the project
        conf: Configuration dict with values for Databand configurations
    """
    if not conf:
        conf = {}

    if run_name:
        conf.setdefault("run", {}).setdefault("name", run_name)

    # do not apply our logger to a python script by default
    conf.setdefault("log", {}).setdefault("disabled", True)

    if conf["log"]["disabled"]:
        _configure_tracking_logging(conf)

    # We use print here rather than log because the dbnd logger might be set to Warning (the default), and we want to
    # inform the user that tracking has started without alarming them with a Warning or Error message.
    # This should become a logger info message once tracking and orchestration are split.
    print(
        "Databand Tracking Started {version}".format(version=dbnd.__version__))

    if conf:
        config.set_values(
            config_values=conf,
            priority=ConfigValuePriority.OVERRIDE,
            source="dbnd_tracking_start",
        )

    if job_name is None:
        job_name = try_get_script_name()

    return tracking_start_base(job_name=job_name, project_name=project_name)
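
A minimal script-tracking sketch, assuming dbnd_tracking_start is importable from the dbnd package as the public entry point (the job, run, and project names are illustrative):

from dbnd import dbnd_tracking_start

dbnd_tracking_start(
    job_name="daily_etl",  # defaults to the script name when omitted
    run_name="run_2024_01_01",
    project_name="analytics",
)

# ... the rest of the script runs as usual; per the docstring above,
# dbnd_tracking_stop is invoked automatically when the script exits.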
Example #6
def set_tracking_config_overide(use_dbnd_log=None):
    # 1. create proper DatabandContext so we can create other objects
    track_with_cache = config.getboolean("run", "tracking_with_cache")
    config_for_airflow = {
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },  # no completion check: the script runs with task_version="now"
        "task": {
            "task_in_memory_outputs": not track_with_cache
        },  # do not save any outputs
        "core": {
            "tracker_raise_on_error": False
        },  # do not fail on tracker errors
    }
    if use_dbnd_log is not None:
        config_for_airflow["log"] = {"disabled": not use_dbnd_log}
    config.set_values(config_values=config_for_airflow,
                      override=True,
                      source="dbnd_tracking_config")

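
Compared with Example #4, this variant drops the Airflow handling and keeps only the use_dbnd_log switch. A one-line sketch of silencing dbnd's log handler while keeping tracking active:

# log.disabled is set to True when use_dbnd_log is False
set_tracking_config_overide(use_dbnd_log=False)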