def test_kafka_sink_write(self, mock_k_callback, mock_producer, mock_context):
    """Writing one record polls the producer and produces exactly once.

    The produce call must carry the MCE as its value, a truthy key, and the
    delivery callback taken from the mocked Kafka callback object.
    """
    producer = mock_producer.return_value
    k_callback = mock_k_callback.return_value
    callback = MagicMock(spec=WriteCallback)

    sink_config = {"connection": {"bootstrap": "foobar:9092"}}
    kafka_sink = DatahubKafkaSink.create(sink_config, mock_context)

    upstream_urns = [
        builder.make_dataset_urn("bigquery", "upstream1"),
        builder.make_dataset_urn("bigquery", "upstream2"),
    ]
    downstream_urn = builder.make_dataset_urn("bigquery", "downstream1")
    mce = builder.make_lineage_mce(upstream_urns, downstream_urn)

    envelope = RecordEnvelope(record=mce, metadata={})
    kafka_sink.write_record_async(envelope, callback)

    # The producer is expected to be polled as part of the write.
    producer.poll.assert_called_once()

    # The Kafka callback wrapper must have been built from this envelope/callback.
    self.validate_kafka_callback(mock_k_callback, envelope, callback)

    # confluent_kafka.Producer.produce must have received the right arguments.
    producer.produce.assert_called_once()
    _, produce_kwargs = producer.produce.call_args
    assert produce_kwargs["value"] == mce
    assert produce_kwargs["key"]  # a Kafka key must be supplied
    delivery_handler = produce_kwargs["on_delivery"]
    assert delivery_handler == k_callback.kafka_callback
def test_can_add_aspect():
    """A lineage MCE wraps a dataset snapshot and only accepts dataset aspects."""
    upstream_urns = [
        builder.make_dataset_urn("bigquery", "upstream1"),
        builder.make_dataset_urn("bigquery", "upstream2"),
    ]
    downstream_urn = builder.make_dataset_urn("bigquery", "downstream")
    dataset_mce: MetadataChangeEventClass = builder.make_lineage_mce(
        upstream_urns, downstream_urn
    )

    snapshot = dataset_mce.proposedSnapshot
    assert isinstance(snapshot, DatasetSnapshotClass)

    # Dataset-level aspects are accepted ...
    assert builder.can_add_aspect(dataset_mce, DatasetPropertiesClass)
    assert builder.can_add_aspect(dataset_mce, OwnershipClass)
    # ... while a data-flow aspect is rejected for this MCE.
    assert not builder.can_add_aspect(dataset_mce, DataFlowInfoClass)
def test_datahub_lineage_operator(mock_emit):
    """Executing the emitter operator against a patched REST connection emits."""
    with patch_airflow_connection(datahub_rest_connection_config) as config:
        conn_id = config.conn_id
        lineage = builder.make_lineage_mce(
            [
                builder.make_dataset_urn("snowflake", "mydb.schema.tableA"),
                builder.make_dataset_urn("snowflake", "mydb.schema.tableB"),
            ],
            builder.make_dataset_urn("snowflake", "mydb.schema.tableC"),
        )
        task = DatahubEmitterOperator(
            task_id="emit_lineage",
            datahub_conn_id=conn_id,
            mces=[lineage],
        )

        # Run while the connection patch is still active.
        task.execute(None)

        mock_emit.assert_called()
def test_datahub_lineage_operator(mock_hook):
    """Executing the emitter operator builds the hook and emits MCEs once."""
    source_urns = [
        builder.make_dataset_urn("snowflake", "mydb.schema.tableA"),
        builder.make_dataset_urn("snowflake", "mydb.schema.tableB"),
    ]
    target_urn = builder.make_dataset_urn("snowflake", "mydb.schema.tableC")
    lineage = builder.make_lineage_mce(source_urns, target_urn)

    task = DatahubEmitterOperator(
        task_id="emit_lineage",
        datahub_rest_conn_id=datahub_rest_connection_config.conn_id,
        mces=[lineage],
    )
    task.execute(None)

    # The hook class must have been instantiated, and its instance used once.
    mock_hook.assert_called()
    hook_instance = mock_hook.return_value
    hook_instance.emit_mces.assert_called_once()
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Optional[Dict] = None,
) -> None:
    """Emit DataHub metadata describing one Airflow task run.

    Builds a data-flow MCE for the DAG, a data-job MCE for the task, and one
    lineage MCE per outlet (each listing every inlet as upstream), logs them
    as JSON via ``operator.log`` and sends them through the emitter hook.

    Args:
        operator: Operator whose logger is used for the informational message.
        inlets: Upstream entities; ``None`` is treated as an empty list.
        outlets: Downstream entities; ``None`` is treated as an empty list.
        context: Task context; the keys ``"dag"``, ``"task"`` and ``"ts"``
            are read from it.

    Raises:
        KeyError: If ``context`` is missing (or lacks) ``"dag"``, ``"task"``
            or ``"ts"``.
    """
    context = context or {}  # ensure not None to satisfy mypy
    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: verify if task and operator are the same?
    # TODO: use dag serialization to just save the whole thing.
    # TODO: save context.get("conf")
    # TODO: save DAG tags
    # TODO: save context.get("dag_run")
    # TODO: save all the data from task_instance (context["task_instance"])
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    # Convert the entities once and reuse the URN lists below instead of
    # re-converting the inlets for every outlet.
    input_urns = _entities_to_urn_list(inlets or [])
    output_urns = _entities_to_urn_list(outlets or [])

    # "ts" is parsed as a timestamp string and converted to epoch milliseconds.
    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                # The owner is wrapped in a user URN, like the audit-stamp
                # actor below, rather than passed as a bare name.
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp,
            actor=builder.make_user_urn("airflow"),
        ),
    )

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    # A missing doc_md must not show up as the text "None".
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                ),
                ownership,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,  # TODO: add datajob description
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=input_urns,
                    outputDatasets=output_urns,
                ),
                ownership,
            ],
        )
    )

    # One lineage event per outlet, each with all inlets as upstreams.
    lineage_mces = [
        builder.make_lineage_mce(input_urns, outlet) for outlet in output_urns
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )

    hook.emit_mces(mces)
dag_id=DAG_ID, default_args=default_args, description= "An example DAG demonstrating lineage emission within an Airflow DAG.", schedule_interval=None, start_date=days_ago(1), catchup=False, dagrun_timeout=timedelta(minutes=5), tags=["datahub demo"], ) as dag: emit_lineage_task = DatahubEmitterOperator( task_id="emit_lineage", datahub_conn_id="datahub_rest", mces=[ builder.make_lineage_mce(upstream_urns=[ builder.make_dataset_urn("glue", "mydb.tableA"), builder.make_dataset_urn("glue", "mydb.tableB"), ], downstream_urn=builder.make_dataset_urn( "glue", "mydb.tableC", )) ], ) get_airflow_cfg_operator = PythonOperator( task_id="get_airflow_cfg_task", python_callable=print_airflow_cfg) get_print_env_vars_operator = PythonOperator( task_id="get_print_env_vars_task", python_callable=print_env_vars) chain(emit_lineage_task, get_airflow_cfg_operator, get_print_env_vars_operator)
try: from airflow.operators.dummy import DummyOperator except ModuleNotFoundError: from airflow.operators.dummy_operator import DummyOperator import datahub.emitter.mce_builder as builder from datahub_provider import get_provider_info from datahub_provider.entities import Dataset from datahub_provider.hooks.datahub import DatahubKafkaHook, DatahubRestHook from datahub_provider.operators.datahub import DatahubEmitterOperator lineage_mce = builder.make_lineage_mce( [ builder.make_dataset_urn("bigquery", "upstream1"), builder.make_dataset_urn("bigquery", "upstream2"), ], builder.make_dataset_urn("bigquery", "downstream1"), ) datahub_rest_connection_config = Connection( conn_id="datahub_rest_test", conn_type="datahub_rest", host="http://test_host:8080/", extra=None, ) datahub_kafka_connection_config = Connection( conn_id="datahub_kafka_test", conn_type="datahub_kafka", host="test_broker:9092", extra=json.dumps({
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Dict = None,
) -> None:
    """Emit DataHub metadata for an Airflow task, translating legacy lineage config.

    When not running on Airflow 1 and the operator's ``_inlets`` / ``_outlets``
    hold a single dict (the 1.10.x format), they are rewritten into the list
    format and lineage preparation is rerun. Afterwards a data-flow MCE, a
    data-job MCE, one lineage MCE per outlet and one status MCE per inlet are
    built, logged as JSON and sent through the emitter hook.

    ``context`` must provide the keys ``"dag"``, ``"task"`` and ``"ts"``.
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.lineage import prepare_lineage
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    from datahub.integrations.airflow.hooks import AIRFLOW_1

    def _repr_properties(serialized, source) -> Dict[str, str]:
        # repr() every serialized value, then fill in any serialized field
        # the serializer left out by reading it straight off the object.
        bag: Dict[str, str] = {name: repr(val) for name, val in serialized.items()}
        for name in source.get_serialized_fields():
            if name not in bag:
                bag[name] = repr(getattr(source, name))
        return bag

    # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
    # convert to the newer version. This code path will only be triggered
    # when 2.x receives a 1.10.x inlet/outlet config.
    needs_repeat_preparation = False

    if not AIRFLOW_1:
        raw_inlets = operator._inlets
        if (
            isinstance(raw_inlets, list)
            and len(raw_inlets) == 1
            and isinstance(raw_inlets[0], dict)
        ):
            from airflow.lineage import AUTO

            # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
            legacy_inlets = raw_inlets[0]
            converted = list(
                legacy_inlets.get("datasets", [])
            )  # assumes these are attr-annotated
            converted += list(legacy_inlets.get("task_ids", []))
            if legacy_inlets.get("auto", False):
                converted.append(AUTO)
            operator._inlets = converted
            needs_repeat_preparation = True

    if not AIRFLOW_1:
        raw_outlets = operator._outlets
        if (
            isinstance(raw_outlets, list)
            and len(raw_outlets) == 1
            and isinstance(raw_outlets[0], dict)
        ):
            operator._outlets = list(raw_outlets[0].get("datasets", []))
            needs_repeat_preparation = True

    if needs_repeat_preparation:
        # Rerun the lineage preparation routine, now that the old format has
        # been translated to the new one.
        noop_prepared = prepare_lineage(lambda self, ctx: None)
        noop_prepared(operator, context)

    context = context or {}  # ensure not None to satisfy mypy
    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context (dag_run, task_instance)
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"

    flow_property_bag = _repr_properties(SerializedDAG.serialize_dag(dag), dag)
    job_property_bag = _repr_properties(
        SerializedBaseOperator.serialize_operator(task), task
    )

    # "ts" is parsed and converted to epoch milliseconds for the audit stamp.
    parsed_ts = dateutil.parser.parse(context["ts"])
    timestamp = int(parsed_ts.timestamp() * 1000)

    dag_owner = models.OwnerClass(
        owner=builder.make_user_urn(dag.owner),
        type=models.OwnershipTypeClass.DEVELOPER,
        source=models.OwnershipSourceClass(
            type=models.OwnershipSourceTypeClass.SERVICE,
            url=dag.filepath,
        ),
    )
    ownership = models.OwnershipClass(
        owners=[dag_owner],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )

    tag_associations = []
    for tag in dag.tags or []:
        tag_associations.append(models.TagAssociationClass(tag=f"airflow_{tag}"))
    tags = models.GlobalTagsClass(tags=tag_associations)

    flow_info = models.DataFlowInfoClass(
        name=dag.dag_id,
        description=f"{dag.description}\n\n{dag.doc_md or ''}",
        customProperties=flow_property_bag,
        externalUrl=flow_url,
    )
    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[flow_info, ownership, tags],
        )
    )

    job_info = models.DataJobInfoClass(
        name=task.task_id,
        type=models.AzkabanJobTypeClass.COMMAND,
        description=None,
        customProperties=job_property_bag,
        externalUrl=job_url,
    )
    job_io = models.DataJobInputOutputClass(
        inputDatasets=_entities_to_urn_list(inlets or []),
        outputDatasets=_entities_to_urn_list(outlets or []),
    )
    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[job_info, job_io, ownership, tags],
        )
    )

    # One lineage event per outlet, each listing every inlet as upstream.
    lineage_mces = []
    for outlet in _entities_to_urn_list(outlets or []):
        lineage_mces.append(
            builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        )

    # A non-removed status aspect is emitted for each inlet dataset.
    force_upstream_materialization = []
    for inlet in _entities_to_urn_list(inlets or []):
        force_upstream_materialization.append(
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=inlet,
                    aspects=[models.StatusClass(removed=False)],
                )
            )
        )

    hook = make_emitter_hook()

    mces = [flow_mce, job_mce]
    mces.extend(lineage_mces)
    mces.extend(force_upstream_materialization)

    serialized = "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    operator.log.info("DataHub lineage backend - emitting metadata:\n" + serialized)

    hook.emit_mces(mces)
), some_other_table AS ( SELECT id, some_column FROM `mydb.schema.tableB` ) SELECT * FROM some_table LEFT JOIN some_other_table ON some_table.unique_id=some_other_table.id""" transformation_task = SnowflakeOperator( task_id="snowflake_transformation", dag=dag, snowflake_conn_id="snowflake_default", sql=sql, ) emit_lineage_task = DatahubEmitterOperator( task_id="emit_lineage", datahub_conn_id="datahub_rest_default", mces=[ builder.make_lineage_mce( upstream_urns=[ builder.make_dataset_urn("snowflake", "mydb.schema.tableA"), builder.make_dataset_urn("snowflake", "mydb.schema.tableB"), ], downstream_urn=builder.make_dataset_urn( "snowflake", "mydb.schema.tableC" ), ) ], ) transformation_task >> emit_lineage_task
), some_other_table AS ( SELECT id, some_column FROM `mydb.schema.tableB` ) SELECT * FROM some_table LEFT JOIN some_other_table ON some_table.unique_id=some_other_table.id""" transformation_task = SnowflakeOperator( task_id="snowflake_transformation", dag=dag, snowflake_conn_id="snowflake_default", sql=sql, ) emit_lineage_task = DatahubEmitterOperator( task_id="emit_lineage", datahub_rest_conn_id="datahub_rest_default", mces=[ builder.make_lineage_mce( [ builder.make_dataset_urn("snowflake", "mydb.schema.tableA"), builder.make_dataset_urn("snowflake", "mydb.schema.tableB"), ], builder.make_dataset_urn("snowflake", "mydb.schema.tableC"), ) ], ) transformation_task >> emit_lineage_task
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Dict = None,
) -> None:
    """Build and emit DataHub metadata for the DAG and task found in ``context``.

    Emits, in order: a data-flow MCE (DAG), a data-job MCE (task), one lineage
    MCE per outlet, and one status MCE per inlet. The events are logged as
    JSON through ``operator.log`` before being handed to the emitter hook.

    ``context`` must provide the keys ``"dag"``, ``"task"`` and ``"ts"``;
    ``inlets`` / ``outlets`` of ``None`` are treated as empty lists.
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    context = context or {}  # ensure not None to satisfy mypy
    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context (dag_run, task_instance)
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"

    # DAG properties: repr of every serialized value, plus any serialized
    # field that the serializer output did not already contain.
    flow_property_bag: Dict[str, str] = {}
    for field, value in SerializedDAG.serialize_dag(dag).items():
        flow_property_bag[field] = repr(value)
    for field in dag.get_serialized_fields():
        if field in flow_property_bag:
            continue
        flow_property_bag[field] = repr(getattr(dag, field))

    # Task properties, gathered the same way.
    job_property_bag: Dict[str, str] = {}
    for field, value in SerializedBaseOperator.serialize_operator(task).items():
        job_property_bag[field] = repr(value)
    for field in task.get_serialized_fields():
        if field in job_property_bag:
            continue
        job_property_bag[field] = repr(getattr(task, field))

    # Epoch milliseconds derived from the context's "ts" string.
    epoch_seconds = dateutil.parser.parse(context["ts"]).timestamp()
    timestamp = int(epoch_seconds * 1000)

    owner_source = models.OwnershipSourceClass(
        type=models.OwnershipSourceTypeClass.SERVICE,
        url=dag.filepath,
    )
    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=owner_source,
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )

    dag_tags = dag.tags or []
    tags = models.GlobalTagsClass(
        tags=[models.TagAssociationClass(tag=f"airflow_{name}") for name in dag_tags]
    )

    flow_snapshot = models.DataFlowSnapshotClass(
        urn=flow_urn,
        aspects=[
            models.DataFlowInfoClass(
                name=dag.dag_id,
                description=f"{dag.description}\n\n{dag.doc_md or ''}",
                customProperties=flow_property_bag,
                externalUrl=flow_url,
            ),
            ownership,
            tags,
        ],
    )
    flow_mce = models.MetadataChangeEventClass(proposedSnapshot=flow_snapshot)

    job_snapshot = models.DataJobSnapshotClass(
        urn=job_urn,
        aspects=[
            models.DataJobInfoClass(
                name=task.task_id,
                type=models.AzkabanJobTypeClass.COMMAND,
                description=None,
                customProperties=job_property_bag,
                externalUrl=job_url,
            ),
            models.DataJobInputOutputClass(
                inputDatasets=_entities_to_urn_list(inlets or []),
                outputDatasets=_entities_to_urn_list(outlets or []),
            ),
            ownership,
            tags,
        ],
    )
    job_mce = models.MetadataChangeEventClass(proposedSnapshot=job_snapshot)

    # Every outlet gets a lineage event whose upstreams are all the inlets.
    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), downstream)
        for downstream in _entities_to_urn_list(outlets or [])
    ]

    # Each inlet dataset also gets a status aspect with removed=False.
    force_upstream_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=upstream,
                aspects=[models.StatusClass(removed=False)],
            )
        )
        for upstream in _entities_to_urn_list(inlets or [])
    ]

    hook = make_emitter_hook()

    mces = [flow_mce, job_mce] + lineage_mces + force_upstream_materialization

    json_lines = [json.dumps(event.to_obj()) for event in mces]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n" + "\n".join(json_lines)
    )

    hook.emit_mces(mces)