def generate_dataflow(
    cluster: str,
    dag: "DAG",
    capture_owner: bool = True,
    capture_tags: bool = True,
) -> DataFlow:
    """
    Generate a DataFlow object from an Airflow DAG.

    :param cluster: str - name of the cluster
    :param dag: DAG - the Airflow DAG to describe
    :param capture_owner: when True and the DAG has an owner, add it to the
        flow's owners
    :param capture_tags: when True and the DAG has tags, add them to the
        flow's tags
    :return: DataFlow - the generated dataflow
    """
    # Imported at call time rather than at module import.
    from airflow.serialization.serialized_objects import SerializedDAG

    # A DAG without a description carries None there; fall back to "" so the
    # literal text "None" never ends up in the emitted description.
    description = f"{dag.description or ''}\n\n{dag.doc_md or ''}"
    data_flow = DataFlow(
        cluster=cluster,
        id=dag.dag_id,
        orchestrator="airflow",
        description=description,
    )

    # Start from the serialized DAG, then fill in any serialized field that
    # the serializer left out, reading it straight off the DAG object.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    # Only this fixed set of keys is published as custom properties.
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    data_flow.properties = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    base_url = conf.get("webserver", "base_url")
    data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}"

    if capture_owner and dag.owner:
        data_flow.owners.add(dag.owner)

    if capture_tags and dag.tags:
        data_flow.tags.update(dag.tags)

    return data_flow
def test_task_group_serialization(self):
    """
    Check that TaskGroups survive serialization/deserialization.
    """
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.utils.task_group import TaskGroup

    start = datetime(2020, 1, 1)
    with DAG("test_task_group_serialization", start_date=start) as dag:
        first = DummyOperator(task_id="task1")
        with TaskGroup("group234") as outer_group:
            _ = DummyOperator(task_id="task2")

            with TaskGroup("group34") as inner_group:
                _ = DummyOperator(task_id="task3")
                _ = DummyOperator(task_id="task4")

        last = DummyOperator(task_id="task5")
        first >> outer_group
        inner_group >> last

    # The dict form has to satisfy the serialization schema.
    SerializedDAG.validate_schema(SerializedDAG.to_dict(dag))

    # JSON round trip.
    from_json = SerializedDAG.from_json(SerializedDAG.to_json(dag))
    self.validate_deserialized_dag(from_json, dag)

    # Dict round trip; the top-level group must keep the same child keys.
    encoded = SerializedDAG.serialize_dag(dag)
    round_tripped = SerializedDAG.deserialize_dag(encoded)

    restored_children = round_tripped.task_group.children
    assert restored_children
    assert restored_children.keys() == dag.task_group.children.keys()

    def verify(node):
        try:
            members = node.children.values()
        except AttributeError:
            # No children attribute: treat the node as a task and compare it
            # with the original task after its own serialization round trip.
            reference = SerializedBaseOperator.serialize_operator(
                dag.get_task(node.task_id)
            )
            reference = SerializedBaseOperator.deserialize_operator(reference)
            reference = SerializedBaseOperator.serialize_operator(reference)
            assert node
            assert SerializedBaseOperator.serialize_operator(node) == reference
        else:
            for member in members:
                verify(member)

    verify(round_tripped.task_group)
def test_edge_info_serialization(self):
    """
    Check that edge_info survives serialization/deserialization.
    """
    from airflow.operators.dummy import DummyOperator
    from airflow.utils.edgemodifier import Label

    start = datetime(2020, 1, 1)
    with DAG("test_edge_info_serialization", start_date=start) as dag:
        upstream = DummyOperator(task_id="task1")
        downstream = DummyOperator(task_id="task2")
        upstream >> Label("test label") >> downstream  # pylint: disable=W0106

    # The dict form has to satisfy the serialization schema.
    SerializedDAG.validate_schema(SerializedDAG.to_dict(dag))

    # JSON round trip.
    from_json = SerializedDAG.from_json(SerializedDAG.to_json(dag))
    self.validate_deserialized_dag(from_json, dag)

    # Dict round trip must keep the edge metadata unchanged.
    encoded = SerializedDAG.serialize_dag(dag)
    round_tripped = SerializedDAG.deserialize_dag(encoded)

    assert round_tripped.edge_info == dag.edge_info
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    """
    Emit DataHub metadata describing the current Airflow task and its DAG.

    Builds the following metadata change events and emits them through the
    hook returned by ``config.make_emitter_hook()``:

    * a DataFlow snapshot for the DAG,
    * a DataJob snapshot for the task, including its input/output datasets
      and its upstream jobs,
    * one Dataset snapshot carrying ``Status(removed=False)`` per inlet and
      outlet.

    Upstream jobs are resolved with subdags in mind: an upstream task that
    wraps a subdag is replaced by that subdag's leaf tasks, and a task with
    no upstreams inside a subdag is linked to the parent-DAG tasks that
    trigger the subdag.

    :param config: supplies ``cluster``, ``capture_ownership_info``,
        ``capture_tags_info`` and ``make_emitter_hook()``
    :param operator: operator whose logger and ``doc*`` attributes are read
    :param inlets: input entities of the task (``None`` is treated as empty)
    :param outlets: output entities of the task (``None`` is treated as empty)
    :param context: task context; must contain ``"dag"`` and ``"task"``
    :raises KeyError: if ``context`` lacks ``"dag"`` or ``"task"``
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # resolve URNs for upstream nodes in subdags upstream of the current task.
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_subdag = dag.task_dict[upstream_task_id].subdag

        # if upstream task is not a subdag, then skip it
        if upstream_subdag is None:
            continue

        # else, link the leaf tasks of the upstream subdag as upstream tasks
        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster
        )

        for (
            upstream_subdag_task_id,
            upstream_subdag_task,
        ) in upstream_subdag.task_dict.items():
            # if subdag task is a leaf task, then link it as an upstream task
            if not upstream_subdag_task._downstream_task_ids:
                upstream_subdag_task_urns.append(
                    builder.make_data_job_urn_with_flow(
                        upstream_subdag_flow_urn, upstream_subdag_task_id
                    )
                )

    # resolve URNs for upstream nodes that trigger the subdag containing the
    # current task (if it is in a subdag at all).
    upstream_subdag_triggers: List[str] = []

    # subdags are always named with 'parent.child' style or Airflow won't run them
    # add connection from subdag trigger(s) if subdag task has no upstreams
    if (
        dag.is_subdag
        and dag.parent_dag is not None
        and not task._upstream_task_ids
    ):
        # filter through the parent dag's tasks and find the subdag trigger(s)
        subdags = [
            x for x in dag.parent_dag.task_dict.values() if x.subdag is not None
        ]
        matched_subdags = [
            x
            for x in subdags
            if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        # NOTE(review): raises IndexError if no parent task wraps this subdag.
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn(
            "airflow", dag.parent_dag.dag_id, config.cluster
        )

        # iterate through the parent dag's tasks and find the ones that trigger the subdag
        for upstream_task_id, upstream_task in dag.parent_dag.task_dict.items():
            # if the task triggers the subdag, link it to this node in the subdag
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(
                    builder.make_data_job_urn_with_flow(
                        parent_dag_urn, upstream_task_id
                    )
                )

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"

    # Start from the serialized form, then fill in any serialized field the
    # serializer left out, reading it straight off the object.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))

    # Only these fixed sets of keys are published as custom properties.
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")
            ),
        )
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=builder.make_tag_urn(tag))
                for tag in (dag.tags or [])
            ]
        )
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    # A DAG without a description carries None there; fall
                    # back to "" so the text "None" is never published.
                    description=f"{dag.description or ''}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    # exclude subdag operator tasks since these are not emitted, resulting in empty metadata
    upstream_tasks = (
        [
            builder.make_data_job_urn_with_flow(flow_urn, task_id)
            for task_id in task.upstream_task_ids
            if dag.task_dict[task_id].subdag is None
        ]
        + upstream_subdag_task_urns
        + upstream_subdag_triggers
    )

    # The doc* attributes are only read when AIRFLOW_1 is false.
    job_doc = (
        (
            operator.doc
            or operator.doc_md
            or operator.doc_json
            or operator.doc_yaml
            or operator.doc_rst
        )
        if not AIRFLOW_1
        else None
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=job_doc,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    # One Status(removed=False) snapshot per inlet and outlet dataset.
    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Optional[Dict] = None,
) -> None:
    """
    Emit DataHub metadata for the current Airflow task, its DAG and lineage.

    On Airflow 2.x, an operator that still carries an Airflow 1.10.x style
    inlet/outlet configuration (a single dict inside a list) is translated
    to the newer list form and the lineage preparation routine is re-run.

    The function then builds and emits, through ``make_emitter_hook()``:

    * a DataFlow snapshot for the DAG,
    * a DataJob snapshot for the task with its input/output datasets,
    * one lineage event per outlet, with all inlets as its upstreams,
    * one Dataset snapshot carrying ``Status(removed=False)`` per inlet.

    :param operator: operator whose ``_inlets``/``_outlets`` may be rewritten
        and whose logger is used
    :param inlets: input entities of the task (``None`` is treated as empty)
    :param outlets: output entities of the task (``None`` is treated as empty)
    :param context: task context; must contain ``"dag"``, ``"task"`` and
        ``"ts"``
    :raises KeyError: if ``context`` is missing or lacks one of those keys
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.lineage import prepare_lineage
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    from datahub.integrations.airflow.hooks import AIRFLOW_1

    # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
    # convert to the newer version. This code path will only be triggered
    # when 2.x receives a 1.10.x inlet/outlet config.
    needs_repeat_preparation = False
    if (
        not AIRFLOW_1
        and isinstance(operator._inlets, list)
        and len(operator._inlets) == 1
        and isinstance(operator._inlets[0], dict)
    ):
        from airflow.lineage import AUTO

        operator._inlets = [
            # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
            *operator._inlets[0].get(
                "datasets", []
            ),  # assumes these are attr-annotated
            *operator._inlets[0].get("task_ids", []),
            *([AUTO] if operator._inlets[0].get("auto", False) else []),
        ]
        needs_repeat_preparation = True
    if (
        not AIRFLOW_1
        and isinstance(operator._outlets, list)
        and len(operator._outlets) == 1
        and isinstance(operator._outlets[0], dict)
    ):
        operator._outlets = [*operator._outlets[0].get("datasets", [])]
        needs_repeat_preparation = True
    if needs_repeat_preparation:
        # Rerun the lineage preparation routine, now that the old format has been translated to the new one.
        # NOTE(review): the `inlets`/`outlets` arguments used below are not
        # refreshed after this re-run - confirm whether the operator's
        # re-prepared values should be used instead.
        prepare_lineage(lambda self, ctx: None)(operator, context)

    context = context or {}  # ensure not None to satisfy mypy

    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"

    # Start from the serialized form, then fill in any serialized field the
    # serializer left out, reading it straight off the object.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))

    # context["ts"] is parsed as a timestamp and converted to epoch millis.
    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )

    tags = models.GlobalTagsClass(
        tags=[
            models.TagAssociationClass(tag=f"airflow_{tag}")
            for tag in (dag.tags or [])
        ]
    )

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    # A DAG without a description carries None there; fall
                    # back to "" so the text "None" is never published.
                    description=f"{dag.description or ''}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                ownership,
                tags,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
                tags,
            ],
        )
    )

    # One lineage event per outlet, each listing every inlet as upstream.
    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    # One Status(removed=False) snapshot per inlet dataset.
    force_upstream_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=inlet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for inlet in _entities_to_urn_list(inlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
        *force_upstream_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    """
    Emit DataHub metadata describing the current Airflow task and its DAG.

    Builds the following metadata change events and emits them through the
    hook returned by ``config.make_emitter_hook()``:

    * a DataFlow snapshot for the DAG,
    * a DataJob snapshot for the task, including its input/output datasets
      and the jobs for its upstream task ids,
    * one Dataset snapshot carrying ``Status(removed=False)`` per inlet and
      outlet.

    :param config: supplies ``cluster``, ``capture_ownership_info``,
        ``capture_tags_info`` and ``make_emitter_hook()``
    :param operator: operator whose logger is used
    :param inlets: input entities of the task (``None`` is treated as empty)
    :param outlets: output entities of the task (``None`` is treated as empty)
    :param context: task context; must contain ``"dag"`` and ``"task"``, and
        ``"ts"`` when ownership info is captured
    :raises KeyError: if ``context`` lacks a required key
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"

    # Start from the serialized form, then fill in any serialized field the
    # serializer left out, reading it straight off the object.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))

    # Only these fixed sets of keys are published as custom properties.
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        # context["ts"] is parsed as a timestamp and converted to epoch millis.
        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=builder.make_tag_urn(tag))
                for tag in (dag.tags or [])
            ]
        )
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    # A DAG without a description carries None there; fall
                    # back to "" so the text "None" is never published.
                    description=f"{dag.description or ''}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=[
                        builder.make_data_job_urn_with_flow(flow_urn, task_id)
                        for task_id in task.upstream_task_ids
                    ],
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    # One Status(removed=False) snapshot per inlet and outlet dataset.
    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Optional[Dict] = None,
) -> None:
    """
    Emit DataHub metadata for the current Airflow task, its DAG and lineage.

    Builds and emits, through ``make_emitter_hook()``:

    * a DataFlow snapshot for the DAG,
    * a DataJob snapshot for the task with its input/output datasets,
    * one lineage event per outlet, with all inlets as its upstreams,
    * one Dataset snapshot carrying ``Status(removed=False)`` per inlet.

    :param operator: operator whose logger is used
    :param inlets: input entities of the task (``None`` is treated as empty)
    :param outlets: output entities of the task (``None`` is treated as empty)
    :param context: task context; must contain ``"dag"``, ``"task"`` and
        ``"ts"``
    :raises KeyError: if ``context`` is missing or lacks one of those keys
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    context = context or {}  # ensure not None to satisfy mypy

    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"

    # Start from the serialized form, then fill in any serialized field the
    # serializer left out, reading it straight off the object.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))

    # context["ts"] is parsed as a timestamp and converted to epoch millis.
    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )

    tags = models.GlobalTagsClass(
        tags=[
            models.TagAssociationClass(tag=f"airflow_{tag}")
            for tag in (dag.tags or [])
        ]
    )

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    # A DAG without a description carries None there; fall
                    # back to "" so the text "None" is never published.
                    description=f"{dag.description or ''}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                ownership,
                tags,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
                tags,
            ],
        )
    )

    # One lineage event per outlet, each listing every inlet as upstream.
    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    # One Status(removed=False) snapshot per inlet dataset.
    force_upstream_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=inlet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for inlet in _entities_to_urn_list(inlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
        *force_upstream_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)