Example #1
    def send_lineage(
        operator: "BaseOperator",
        inlets: Optional[List] = None,  # unused
        outlets: Optional[List] = None,  # unused
        context: Optional[Dict] = None,
    ) -> None:
        config = get_lineage_config()

        try:
            # This is necessary to avoid issues with circular imports.
            from airflow.lineage import prepare_lineage

            from datahub_provider.hooks.datahub import AIRFLOW_1

            # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
            # convert to the newer version. This code path will only be triggered
            # when 2.x receives a 1.10.x inlet/outlet config.
            needs_repeat_preparation = False
            if (
                not AIRFLOW_1
                and isinstance(operator._inlets, list)
                and len(operator._inlets) == 1
                and isinstance(operator._inlets[0], dict)
            ):
                from airflow.lineage import AUTO

                operator._inlets = [
                    # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
                    *operator._inlets[0].get(
                        "datasets", []
                    ),  # assumes these are attr-annotated
                    *operator._inlets[0].get("task_ids", []),
                    *([AUTO] if operator._inlets[0].get("auto", False) else []),
                ]
                needs_repeat_preparation = True
            if (
                not AIRFLOW_1
                and isinstance(operator._outlets, list)
                and len(operator._outlets) == 1
                and isinstance(operator._outlets[0], dict)
            ):
                operator._outlets = [*operator._outlets[0].get("datasets", [])]
                needs_repeat_preparation = True
            if needs_repeat_preparation:
                # Rerun the lineage preparation routine, now that the old format
                # has been translated to the new one.
                prepare_lineage(lambda self, ctx: None)(operator, context)

            context = context or {}  # ensure not None to satisfy mypy
            send_lineage_to_datahub(
                config, operator, operator.inlets, operator.outlets, context
            )
        except Exception as e:
            if config.graceful_exceptions:
                operator.log.error(e)
                operator.log.info(
                    "Supressing error because graceful_exceptions is set")
            else:
                raise
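
The central trick in Example #1 is translating the Airflow 1.10.x dict-style lineage declaration into the flat 2.x list form before re-running prepare_lineage. Below is a minimal standalone sketch of just that translation; the inlet values are made up, and the AUTO string stands in for the airflow.lineage.AUTO sentinel:

# Hedged sketch of the 1.10.x -> 2.x inlet translation above; sample values are hypothetical.
AUTO = "auto"  # stand-in for the airflow.lineage.AUTO sentinel

old_style = {"datasets": ["dataset_a"], "task_ids": ["task_x"], "auto": True}

new_style = [
    *old_style.get("datasets", []),  # attr-annotated entities pass through as-is
    *old_style.get("task_ids", []),  # task_id references stay as strings
    *([AUTO] if old_style.get("auto", False) else []),
]

assert new_style == ["dataset_a", "task_x", "auto"]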
Example #2
    def test_lineage_is_sent_to_backend(self, mock_get_backend):
        class TestBackend(LineageBackend):
            def send_lineage(self,
                             operator,
                             inlets=None,
                             outlets=None,
                             context=None):
                assert len(inlets) == 1
                assert len(outlets) == 1

        func = mock.Mock()
        func.__name__ = 'foo'

        mock_get_backend.return_value = TestBackend()

        dag = DAG(dag_id='test_lineage_is_sent_to_backend',
                  start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(task_id='task1')

        file1 = File("/tmp/some_file")

        op1.inlets.append(file1)
        op1.outlets.append(file1)

        ctx1 = {
            "ti": TI(task=op1, execution_date=DEFAULT_DATE),
            "execution_date": DEFAULT_DATE
        }

        prep = prepare_lineage(func)
        prep(op1, ctx1)
        post = apply_lineage(func)
        post(op1, ctx1)
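
A detail worth noting in this test: the mock is given a real __name__ before being handed to the lineage decorators, because prepare_lineage and apply_lineage wrap the callable with functools.wraps, which copies __name__ and fails on a bare Mock. A minimal sketch of the pattern the tests rely on:

from unittest import mock

from airflow.lineage import apply_lineage, prepare_lineage

func = mock.Mock()
func.__name__ = "foo"  # functools.wraps inside the decorators needs this attribute

pre_execute = prepare_lineage(func)   # resolves the operator's declared inlets/outlets
post_execute = apply_lineage(func)    # hands the resolved lineage to the configured backend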
Example #3
def test_lineage_backend(mock_emit, inlets, outlets):
    DEFAULT_DATE = days_ago(2)

    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps({"graceful_exceptions": False}),
        },
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_pull", autospec=True
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_push", autospec=True
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend",
                  start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1",
                inlets=inlets,
                outlets=outlets,
            )

        ti = TI(task=op1, execution_date=DEFAULT_DATE)
        ctx1 = {
            "dag": dag,
            "task": op1,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op1, ctx1)
        post = apply_lineage(func)
        post(op1, ctx1)

        # Verify that the inlets and outlets are registered and recognized by
        # Airflow correctly, or that our lineage backend forces them to be.
        assert len(op1.inlets) == 1
        assert len(op1.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op1.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op1.outlets))

        # Check that the right things were emitted.
        mock_emit.assert_called_once()
        assert len(mock_emit.call_args[0][0]) == 4
        assert all(mce.validate() for mce in mock_emit.call_args[0][0])
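
The magic number 4 in the emission assertion lines up with the MCE batch built by the backend's send_lineage (shown in full in Example #9): one DataFlow MCE, one DataJob MCE, one lineage MCE per outlet, and one status MCE per inlet that forces upstream materialization. A hedged sketch of unpacking it:

# With one inlet and one outlet, the emitted batch decomposes as below; this
# mirrors the mces list constructed in Example #9.
flow_mce, job_mce, lineage_mce, materialize_inlet_mce = mock_emit.call_args[0][0]
assert all(
    mce.validate() for mce in (flow_mce, job_mce, lineage_mce, materialize_inlet_mce)
)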
Example #4
def test_lineage_backend(mock_emit, mock_xcom_push, inlets, outlets):
    DEFAULT_DATE = days_ago(2)

    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub.integrations.airflow.DatahubAirflowLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
        },
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1",
                inlets=inlets,
                outlets=outlets,
            )

        ti = TI(task=op1, execution_date=DEFAULT_DATE)
        ctx1 = {
            "dag": dag,
            "task": op1,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op1, ctx1)
        post = apply_lineage(func)
        post(op1, ctx1)

        # Verify that the inlets and outlets are registered and recognized by
        # Airflow correctly, or that our lineage backend forces them to be.
        assert len(op1.inlets) == 1
        assert len(op1.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op1.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op1.outlets))

        # Verify xcom push calls are correct.
        # Two calls, one for inlets and the other for outlets.
        assert mock_xcom_push.call_count == 2

        # Check that the right things were emitted.
        mock_emit.assert_called_once()
        assert len(mock_emit.call_args[0][0]) == 4
        assert all(mce.validate() for mce in mock_emit.call_args[0][0])
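
The xcom_push call count of 2 reflects Airflow's lineage machinery persisting the resolved inlets and outlets on the task instance, one push for each. A quick, purely illustrative way to see what was pushed (the exact XCom keys are Airflow internals):

# Hedged sketch: dump the two recorded pushes, one for inlets and one for outlets.
for call in mock_xcom_push.call_args_list:
    print(call)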
Example #5
def test_lineage_backend(mock_emit):
    # Airflow 2.x does not have lineage backend support merged back in yet.
    # As such, we must protect these imports.
    from airflow.lineage import apply_lineage, prepare_lineage

    from datahub.integrations.airflow.entities import Dataset

    DEFAULT_DATE = days_ago(2)

    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub.integrations.airflow.DatahubAirflowLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
        },
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend",
                  start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(task_id="task1")

        upstream = Dataset("snowflake", "mydb.schema.tableConsumed")
        downstream = Dataset("snowflake", "mydb.schema.tableProduced")

        op1.inlets.append(upstream)
        op1.outlets.append(downstream)

        ti = TI(task=op1, execution_date=DEFAULT_DATE)
        ctx1 = {
            "dag": dag,
            "task": op1,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op1, ctx1)
        post = apply_lineage(func)
        post(op1, ctx1)

        mock_emit.assert_called_once()
        assert len(mock_emit.call_args[0][0]) == 4
        assert all(mce.validate() for mce in mock_emit.call_args[0][0])
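
The snowflake Dataset entities used here correspond to DataHub dataset URNs, which is how they surface in the assertions of Example #12. A minimal sketch using DataHub's URN builder (assuming the datahub.emitter.mce_builder module, whose env argument defaults to "PROD"; how the Airflow integration converts its entities internally may differ):

import datahub.emitter.mce_builder as builder

urn = builder.make_dataset_urn("snowflake", "mydb.schema.tableConsumed")
assert urn == (
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
)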
Example #6
    def test_lineage(self, _get_backend):
        backend = mock.Mock()
        send_mock = mock.Mock()
        backend.send_lineage = send_mock

        _get_backend.return_value = backend

        dag = DAG(
            dag_id='test_prepare_lineage',
            start_date=DEFAULT_DATE
        )

        f1 = File("/tmp/does_not_exist_1")
        f2 = File("/tmp/does_not_exist_2")
        f3 = File("/tmp/does_not_exist_3")

        with dag:
            op1 = DummyOperator(task_id='leave1',
                                inlets={"datasets": [f1, ]},
                                outlets={"datasets": [f2, ]})
            op2 = DummyOperator(task_id='leave2')
            op3 = DummyOperator(task_id='upstream_level_1',
                                inlets={"auto": True},
                                outlets={"datasets": [f3, ]})
            op4 = DummyOperator(task_id='upstream_level_2')
            op5 = DummyOperator(task_id='upstream_level_3',
                                inlets={"task_ids": ["leave1", "upstream_level_1"]})

            op1.set_downstream(op3)
            op2.set_downstream(op3)
            op3.set_downstream(op4)
            op4.set_downstream(op5)

        ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
        ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
        ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
        ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}

        func = mock.Mock()
        func.__name__ = 'foo'

        # prepare with manual inlets and outlets
        prep = prepare_lineage(func)
        prep(op1, ctx1)

        self.assertEqual(len(op1.inlets), 1)
        self.assertEqual(op1.inlets[0], f1)

        self.assertEqual(len(op1.outlets), 1)
        self.assertEqual(op1.outlets[0], f2)

        # post process with no backend
        post = apply_lineage(func)
        post(op1, ctx1)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()

        prep(op2, ctx2)
        self.assertEqual(len(op2.inlets), 0)
        post(op2, ctx2)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()

        prep(op3, ctx3)
        self.assertEqual(len(op3.inlets), 1)
        self.assertEqual(op3.inlets[0].qualified_name, f2.qualified_name)
        post(op3, ctx3)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()

        # skip 4

        prep(op5, ctx5)
        self.assertEqual(len(op5.inlets), 2)
        post(op5, ctx5)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()
Example #7
    def test_lineage_auto_branching(self, _get_backend):
        # Tests the ability of the auto feature to skip operators that do not
        # affect lineage state.
        # DAG diagram:
        #  1--->2---->4
        #       ▼     ▲
        #       3-----+
        backend = mock.Mock()
        send_mock = mock.Mock()
        backend.send_lineage = send_mock

        _get_backend.return_value = backend

        dag = DAG(
            dag_id='test_prepare_lineage_auto_branching',
            start_date=DEFAULT_DATE
        )

        f1 = File("/tmp/does_not_exist_1")

        with dag:
            op1 = DummyOperator(task_id='leave1')
            op2 = DummyOperator(task_id='branch_1', outlets={"datasets": [f1, ]})
            op3 = DummyOperator(task_id='branch_2')
            op4 = DummyOperator(task_id='upstream_level_2', inlets={"auto": True})

            op1.set_downstream(op2)
            op2.set_downstream(op3)
            op2.set_downstream(op4)
            op3.set_downstream(op4)

        ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
        ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
        ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
        ctx4 = {"ti": TI(task=op4, execution_date=DEFAULT_DATE)}

        func = mock.Mock()
        func.__name__ = 'foo'

        # prepare with manual inlets and outlets
        prep = prepare_lineage(func)
        prep(op1, ctx1)

        self.assertEqual(len(op1.inlets), 0)

        # post process with no backend
        post = apply_lineage(func)
        post(op1, ctx1)
        send_mock.reset_mock()

        prep(op2, ctx2)
        self.assertEqual(len(op2.inlets), 0)
        post(op2, ctx2)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()

        prep(op3, ctx3)
        self.assertEqual(len(op3.inlets), 0)
        post(op3, ctx3)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()

        prep(op4, ctx4)
        self.assertEqual(len(op4.inlets), 1)
        self.assertEqual(op4.inlets[0].name, f1.name)
        post(op4, ctx4)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()
Example #8
    def test_lineage_complicated_dag(self, _get_backend):
        # Tests the ability of the auto feature to skip operators that do not
        # affect lineage state, while still collecting data from multiple outlet
        # sources. Note that when a task declares no outlets, the auto feature
        # keeps traversing the DAG until it finds upstream input sources.

        # DAG diagram:
        # 1-----------+
        #             |
        #             ▼
        #             4 ----------+
        #             ▲           ▼
        #             |           5+-------->6
        # 2-----------+           ▲
        #                         |
        #                         |
        #                         |
        # 3-----------------------+

        backend = mock.Mock()
        send_mock = mock.Mock()
        backend.send_lineage = send_mock

        _get_backend.return_value = backend

        dag = DAG(
            dag_id='test_prepare_lineage_auto_complicated_dag',
            start_date=DEFAULT_DATE
        )

        f1 = File("/tmp/does_not_exist_1")
        f2 = File("/tmp/does_not_exist_2")
        f3 = File("/tmp/does_not_exist_3")

        with dag:
            op1 = DummyOperator(task_id='leave1',
                                outlets={"datasets": [f1, ]},
                                inlets={"auto": True})
            op2 = DummyOperator(task_id='leave2',
                                outlets={"datasets": [f2, ]})
            op3 = DummyOperator(task_id='leave3',
                                outlets={"datasets": [f3, ]})
            op4 = DummyOperator(task_id='upstream_level_1')
            op5 = DummyOperator(task_id='upstream_level_2', inlets={"auto": True})
            op6 = DummyOperator(task_id='upstream_level_3', inlets={"auto": True})

            op1.set_downstream(op4)
            op2.set_downstream(op4)
            op3.set_downstream(op5)
            op4.set_downstream(op5)
            op5.set_downstream(op6)

        ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
        ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
        ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
        ctx4 = {"ti": TI(task=op4, execution_date=DEFAULT_DATE)}
        ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}
        ctx6 = {"ti": TI(task=op6, execution_date=DEFAULT_DATE)}

        func = mock.Mock()
        func.__name__ = 'foo'

        # prepare with manual inlets and outlets
        prep = prepare_lineage(func)
        prep(op1, ctx1)

        self.assertEqual(len(op1.outlets), 1)
        self.assertEqual(op1.outlets[0], f1)
        self.assertEqual(len(op1.inlets), 0)

        # post process with no backend
        post = apply_lineage(func)
        post(op1, ctx1)

        prep(op2, ctx2)
        self.assertEqual(len(op2.outlets), 1)
        post(op2, ctx2)

        prep(op3, ctx3)
        self.assertEqual(len(op3.outlets), 1)
        post(op3, ctx3)

        prep(op4, ctx4)
        self.assertEqual(len(op4.inlets), 0)
        post(op4, ctx4)

        prep(op5, ctx5)
        self.assertEqual(len(op5.inlets), 3)
        self.assertEqual({file.qualified_name for file in op5.inlets}, {'file:///tmp/does_not_exist_1',
                                                                        'file:///tmp/does_not_exist_2',
                                                                        'file:///tmp/does_not_exist_3'})
        post(op5, ctx5)

        prep(op6, ctx6)
        self.assertEqual(len(op6.inlets), 3)
        self.assertEqual({file.qualified_name for file in op6.inlets}, {'file:///tmp/does_not_exist_1',
                                                                        'file:///tmp/does_not_exist_2',
                                                                        'file:///tmp/does_not_exist_3'})
        post(op6, ctx6)
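
Taken together, the three tests above pin down the "auto" semantics: walk upstream through the DAG, collect each upstream task's declared outlets, and keep walking past tasks that declare none. Below is a simplified model of that traversal; it is an illustrative sketch, not Airflow's implementation, and it omits the de-duplication Airflow applies to the collected inlets:

def resolve_auto_inlets(task):
    # Collect outlets from upstream tasks, recursing past tasks without outlets.
    inlets = []
    for upstream in task.upstream_list:
        if upstream.outlets:
            inlets.extend(upstream.outlets)
        else:
            inlets.extend(resolve_auto_inlets(upstream))
    return inlets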
Example #9
    def send_lineage(
        operator: "BaseOperator",
        inlets: Optional[List] = None,
        outlets: Optional[List] = None,
        context: Optional[Dict] = None,
    ) -> None:
        # This is necessary to avoid issues with circular imports.
        from airflow.lineage import prepare_lineage
        from airflow.serialization.serialized_objects import (
            SerializedBaseOperator,
            SerializedDAG,
        )

        from datahub.integrations.airflow.hooks import AIRFLOW_1

        # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
        # convert to the newer version. This code path will only be triggered
        # when 2.x receives a 1.10.x inlet/outlet config.
        needs_repeat_preparation = False
        if (
            not AIRFLOW_1
            and isinstance(operator._inlets, list)
            and len(operator._inlets) == 1
            and isinstance(operator._inlets[0], dict)
        ):
            from airflow.lineage import AUTO

            operator._inlets = [
                # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
                *operator._inlets[0].get(
                    "datasets", []
                ),  # assumes these are attr-annotated
                *operator._inlets[0].get("task_ids", []),
                *([AUTO] if operator._inlets[0].get("auto", False) else []),
            ]
            needs_repeat_preparation = True
        if (
            not AIRFLOW_1
            and isinstance(operator._outlets, list)
            and len(operator._outlets) == 1
            and isinstance(operator._outlets[0], dict)
        ):
            operator._outlets = [*operator._outlets[0].get("datasets", [])]
            needs_repeat_preparation = True
        if needs_repeat_preparation:
            # Rerun the lineage preparation routine, now that the old format has been translated to the new one.
            prepare_lineage(lambda self, ctx: None)(operator, context)

        context = context or {}  # ensure not None to satisfy mypy

        dag: "DAG" = context["dag"]
        task = context["task"]

        # TODO: capture context
        # context dag_run
        # task_instance: "TaskInstance" = context["task_instance"]
        # TODO: capture raw sql from db operators

        flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

        base_url = conf.get("webserver", "base_url")
        flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
        job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
        # operator.log.info(f"{flow_url=}")
        # operator.log.info(f"{job_url=}")
        # operator.log.info(f"{dag.get_serialized_fields()=}")
        # operator.log.info(f"{task.get_serialized_fields()=}")
        # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

        flow_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedDAG.serialize_dag(dag).items()
        }
        for key in dag.get_serialized_fields():
            if key not in flow_property_bag:
                flow_property_bag[key] = repr(getattr(dag, key))
        job_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
        }
        for key in task.get_serialized_fields():
            if key not in job_property_bag:
                job_property_bag[key] = repr(getattr(task, key))
        # operator.log.info(f"{flow_property_bag=}")
        # operator.log.info(f"{job_property_bag=}")

        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")

        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=f"airflow_{tag}")
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")

        flow_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    models.DataFlowInfoClass(
                        name=dag.dag_id,
                        description=f"{dag.description}\n\n{dag.doc_md or ''}",
                        customProperties=flow_property_bag,
                        externalUrl=flow_url,
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        job_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataJobSnapshotClass(
                urn=job_urn,
                aspects=[
                    models.DataJobInfoClass(
                        name=task.task_id,
                        type=models.AzkabanJobTypeClass.COMMAND,
                        description=None,
                        customProperties=job_property_bag,
                        externalUrl=job_url,
                    ),
                    models.DataJobInputOutputClass(
                        inputDatasets=_entities_to_urn_list(inlets or []),
                        outputDatasets=_entities_to_urn_list(outlets or []),
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        lineage_mces = [
            builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
            for outlet in _entities_to_urn_list(outlets or [])
        ]

        force_upstream_materialization = [
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=inlet,
                    aspects=[
                        models.StatusClass(removed=False),
                    ],
                )
            )
            for inlet in _entities_to_urn_list(inlets or [])
        ]

        hook = make_emitter_hook()

        mces = [
            flow_mce,
            job_mce,
            *lineage_mces,
            *force_upstream_materialization,
        ]
        operator.log.info(
            "DataHub lineage backend - emitting metadata:\n"
            + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
        )
        hook.emit_mces(mces)
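
The flow and job URNs constructed at the top of this function are plain string templates over the orchestrator, dag_id, and task_id. A short sketch with the identifiers used by these tests (make_data_flow_urn defaults its cluster argument to "prod"), matching the URN strings asserted in Example #12:

import datahub.emitter.mce_builder as builder

flow_urn = builder.make_data_flow_urn("airflow", "test_lineage_is_sent_to_backend")
job_urn = builder.make_data_job_urn_with_flow(flow_urn, "task2")

assert flow_urn == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
assert job_urn == (
    "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
)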
Example #10
def test_lineage_backend(mock_emit, inlets, outlets):
    DEFAULT_DATE = days_ago(2)

    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps({"graceful_exceptions": False}),
        },
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_pull"
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_push"
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend",
                  start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow <= 2.1 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if any(
            airflow.version.version.startswith(prefix)
            for prefix in ["1.", "2.0.", "2.1."]
        ):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
        else:
            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized by
        # Airflow correctly, or that our lineage backend forces them to be.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        mock_emit.assert_called_once()
        assert len(mock_emit.call_args[0][0]) == 4
        assert all(mce.validate() for mce in mock_emit.call_args[0][0])
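
A side note on the version gate above: prefix matching on version strings is easy to get wrong, and the trailing dots in "1.", "2.0.", "2.1." matter, since "2.10.0".startswith("2.1") is also true. Example #12 parses the version instead; a hedged sketch of the equivalent check:

import airflow
import packaging.version

AIRFLOW_VERSION = packaging.version.parse(airflow.version.version)
# True on Airflow builds that still require execution_date for TaskInstance.
needs_execution_date = AIRFLOW_VERSION < packaging.version.parse("2.2.0")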
Example #12
def test_lineage_backend_capture_executions(mock_emit, inlets, outlets):
    DEFAULT_DATE = datetime.datetime(2020, 5, 17)
    mock_emitter = Mock()
    mock_emit.return_value = mock_emitter
    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps(
                {"graceful_exceptions": False, "capture_executions": True}
            ),
        },
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_pull"
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_push"
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend",
                  start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow < 2.2 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if AIRFLOW_VERSION < packaging.version.parse("2.2.0"):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
            # Ignoring the type here because DagRun state is just a string in Airflow 1.
            dag_run = DagRun(
                state="success", run_id=f"scheduled_{DEFAULT_DATE}"
            )  # type: ignore
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE

        else:
            from airflow.utils.state import DagRunState

            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
            dag_run = DagRun(state=DagRunState.SUCCESS,
                             run_id=f"scheduled_{DEFAULT_DATE}")
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE

        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "dag_run": dag_run,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized by
        # Airflow correctly, or that our lineage backend forces them to be.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        assert mock_emitter.emit.call_count == 17
        # Running further checks based on the Python version, because the `args`
        # attribute of mock calls only exists on Python 3.8+.
        if sys.version_info >= (3, 8):
            assert mock_emitter.method_calls[0].args[0].aspectName == "dataFlowInfo"
            assert (
                mock_emitter.method_calls[0].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[1].args[0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[1].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[2].args[0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[2].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[3].args[0].aspectName == "dataJobInfo"
            assert (
                mock_emitter.method_calls[3].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert (mock_emitter.method_calls[4].args[0].aspectName ==
                    "dataJobInputOutput")
            assert (
                mock_emitter.method_calls[4].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0] ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0] ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0]
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )

            assert mock_emitter.method_calls[5].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[5].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )

            assert mock_emitter.method_calls[6].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[6].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )

            assert mock_emitter.method_calls[7].args[0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[7].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert mock_emitter.method_calls[8].args[0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[8].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert (mock_emitter.method_calls[9].args[0].aspectName ==
                    "dataProcessInstanceProperties")
            assert (
                mock_emitter.method_calls[9].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")

            assert (mock_emitter.method_calls[10].args[0].aspectName ==
                    "dataProcessInstanceRelationships")
            assert (
                mock_emitter.method_calls[10].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[11].args[0].aspectName ==
                    "dataProcessInstanceInput")
            assert (
                mock_emitter.method_calls[11].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[12].args[0].aspectName ==
                    "dataProcessInstanceOutput")
            assert (
                mock_emitter.method_calls[12].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert mock_emitter.method_calls[13].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[13].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[14].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[14].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert (mock_emitter.method_calls[15].args[0].aspectName ==
                    "dataProcessInstanceRunEvent")
            assert (
                mock_emitter.method_calls[15].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[16].args[0].aspectName ==
                    "dataProcessInstanceRunEvent")
            assert (
                mock_emitter.method_calls[16].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")