def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    actor: str = make_user_urn("datahub"),
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Build an MCE that records ``upstream_urns`` as upstreams of ``downstream_urn``.

    Args:
        upstream_urns: URNs of the datasets feeding the downstream dataset.
        downstream_urn: URN of the dataset the lineage aspect is attached to.
        actor: URN written into the audit stamp of every upstream edge.
        lineage_type: Lineage type applied to every upstream edge.

    Returns:
        A change event whose dataset snapshot carries a single
        ``UpstreamLineageClass`` aspect with one entry per upstream URN.
    """
    # Sample the clock once so that every edge carries the same audit time.
    stamped_at = get_sys_time()

    # Each edge gets its own AuditStampClass instance (no shared object).
    upstream_edges = [
        UpstreamClass(
            auditStamp=AuditStampClass(time=stamped_at, actor=actor),
            dataset=source_urn,
            type=lineage_type,
        )
        for source_urn in upstream_urns
    ]

    lineage_aspect = UpstreamLineageClass(upstreams=upstream_edges)
    snapshot = DatasetSnapshotClass(urn=downstream_urn, aspects=[lineage_aspect])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def get_lineage_if_enabled(
    self, mce: MetadataChangeEventClass
) -> Optional[MetadataChangeProposalWrapper]:
    """Return an ``upstreamLineage`` proposal tying the MCE's dataset to its S3 location.

    Lineage is only produced when ``source_config.emit_s3_lineage`` is set and
    the dataset's properties aspect has a ``Location`` custom property that
    starts with ``s3://``.

    Args:
        mce: Change event whose snapshot URN and dataset properties are read.

    Returns:
        The lineage proposal, or ``None`` when lineage is disabled, the
        properties aspect or its ``Location`` entry is missing, or the
        location is not an S3 path.
    """
    if not self.source_config.emit_s3_lineage:
        return None

    # The storage location lives in the dataset properties aspect.
    dataset_properties: Optional[DatasetPropertiesClass] = (
        mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
    )
    if not dataset_properties:
        return None
    if "Location" not in dataset_properties.customProperties:
        return None

    location = dataset_properties.customProperties["Location"]
    if not location.startswith("s3://"):
        return None

    s3_dataset_urn = make_s3_urn(location, self.source_config.env)
    table_urn = mce.proposedSnapshot.urn

    if self.source_config.glue_s3_lineage_direction == "upstream":
        # S3 is the upstream: the aspect is attached to the MCE's dataset.
        target_urn, source_urn = table_urn, s3_dataset_urn
    else:
        # Any other direction: the aspect is attached to the S3 dataset,
        # with the MCE's dataset as its upstream.
        target_urn, source_urn = s3_dataset_urn, table_urn

    upstream_lineage = UpstreamLineageClass(
        upstreams=[
            UpstreamClass(
                dataset=source_urn,
                type=DatasetLineageTypeClass.COPY,
            )
        ]
    )
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=target_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="upstreamLineage",
        aspect=upstream_lineage,
    )
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build the ``upstreamLineage`` proposal for a table from the collected lineage metadata.

    The dataset key name is split on ``.`` into exactly three parts
    (project, dataset, table); any other shape raises ``ValueError`` from the
    unpacking.

    Args:
        dataset_urn: URN of the dataset whose upstreams are looked up.

    Returns:
        An UPSERT proposal carrying the upstream lineage, or ``None`` when no
        lineage metadata is loaded, the URN yields no dataset key, the table
        has no lineage entry, or no upstream tables are found.
    """
    if self.lineage_metadata is None:
        logger.debug("No lineage metadata so skipping getting mcp")
        return None

    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.debug(f"No dataset_key for {dataset_urn} so skipping getting mcp")
        return None

    project_id, dataset_name, tablename = dataset_key.name.split(".")
    bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
    table_key = str(bq_table)
    if table_key not in self.lineage_metadata:
        return None

    upstream_list: List[UpstreamClass] = []
    # Iterate in sorted order so the same set of upstreams always produces
    # the same aspect, rather than a new one whenever the order differs.
    for upstream_table in sorted(
        self.get_upstream_tables(table_key, tables_seen=[])
    ):
        qualified_name = "{project}.{database}.{table}".format(
            project=upstream_table.project,
            database=upstream_table.dataset,
            table=upstream_table.table,
        )
        upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
            self.platform,
            qualified_name,
            self.config.platform_instance,
            self.config.env,
        )
        upstream_edge = UpstreamClass(
            upstream_urn,
            DatasetLineageTypeClass.TRANSFORMED,
        )

        if self.config.upstream_lineage_in_report:
            # Record the upstream in the report under the downstream table's key.
            seen_upstreams: Set = self.report.upstream_lineage.get(table_key, set())
            seen_upstreams.add(str(upstream_table))
            self.report.upstream_lineage[table_key] = seen_upstreams

        upstream_list.append(upstream_edge)

    if not upstream_list:
        return None

    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineageClass(upstreams=upstream_list),
    )
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Build an MCE that records ``upstream_urns`` as upstreams of ``downstream_urn``.

    Args:
        upstream_urns: URNs of the datasets feeding the downstream dataset.
        downstream_urn: URN of the dataset the lineage aspect is attached to.
        lineage_type: Lineage type applied to every upstream edge.

    Returns:
        A change event whose dataset snapshot carries a single
        ``UpstreamLineageClass`` aspect with one entry per upstream URN.
    """
    upstream_edges = [
        UpstreamClass(dataset=source_urn, type=lineage_type)
        for source_urn in upstream_urns
    ]
    lineage_aspect = UpstreamLineageClass(upstreams=upstream_edges)
    snapshot = DatasetSnapshotClass(urn=downstream_urn, aspects=[lineage_aspect])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def create_lineage_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
    """Turn a directive into an MCE describing the lineage of its table.

    Args:
        directive: Supplies ``table`` (the downstream dataset name) and
            ``depends_on`` (the upstream dataset names).

    Returns:
        A change event whose dataset snapshot holds one ``UpstreamLineageClass``
        aspect with a TRANSFORMED edge per entry in ``directive.depends_on``.
    """
    downstream_urn = dataset_name_to_urn(directive.table)

    upstream_edges = [
        UpstreamClass(
            dataset=dataset_name_to_urn(dependency),
            type=DatasetLineageTypeClass.TRANSFORMED,
            auditStamp=AuditStampClass(
                # Current time in epoch milliseconds, sampled per edge.
                time=int(time.time() * 1000),
                actor="urn:li:corpuser:datahub",
            ),
        )
        for dependency in directive.depends_on
    ]

    snapshot = DatasetSnapshotClass(
        urn=downstream_urn,
        aspects=[UpstreamLineageClass(upstreams=upstream_edges)],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build the ``upstreamLineage`` proposal for a table from the collected lineage metadata.

    The dataset key name is split on ``.`` into exactly three parts
    (project, dataset, table); any other shape raises ``ValueError`` from the
    unpacking.

    Args:
        dataset_urn: URN of the dataset whose upstreams are looked up.

    Returns:
        An UPSERT proposal carrying the upstream lineage, or ``None`` when no
        lineage metadata is loaded, the URN yields no dataset key, the table
        has no lineage entry, or its entry lists no upstreams.
    """
    if self.lineage_metadata is None:
        return None

    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None

    project_id, dataset_name, tablename = dataset_key.name.split(".")
    bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
    table_key = str(bq_table)
    if table_key not in self.lineage_metadata:
        return None

    def _to_upstream(ref_name):
        # Parse the stored reference and wrap it as a TRANSFORMED upstream edge.
        upstream_table = BigQueryTableRef.from_string_name(ref_name)
        return UpstreamClass(
            mce_builder.make_dataset_urn(
                self.platform,
                "{project}.{database}.{table}".format(
                    project=upstream_table.project,
                    database=upstream_table.dataset,
                    table=upstream_table.table,
                ),
                self.config.env,
            ),
            DatasetLineageTypeClass.TRANSFORMED,
        )

    # Iterate in sorted order so the same set of upstreams always produces
    # the same aspect, rather than a new one whenever the order differs.
    upstream_list: List[UpstreamClass] = [
        _to_upstream(ref_name)
        for ref_name in sorted(self.lineage_metadata[table_key])
    ]
    if not upstream_list:
        return None

    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineageClass(upstreams=upstream_list),
    )
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build a COPY lineage proposal for a table whose schema appears in ``catalog_metadata``.

    The dataset key name is split on ``.`` and its first three parts are read
    as database, schema and table; fewer than three parts raises
    ``IndexError``. The upstream is ``<external_database>.<table>`` on the
    platform that ``eskind_to_platform`` maps the schema's ``eskind`` to.

    Args:
        dataset_urn: URN of the dataset to attach the lineage to.

    Returns:
        An UPSERT ``upstreamLineage`` proposal, or ``None`` when the URN yields
        no dataset key or the database/schema pair is not in
        ``catalog_metadata``.
    """
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None

    name_parts = dataset_key.name.split(".")
    # Index access (not unpacking) so names with extra dotted parts still work.
    db_name, schemaname, tablename = name_parts[0], name_parts[1], name_parts[2]

    if db_name not in self.catalog_metadata:
        return None
    schemas = self.catalog_metadata[db_name]
    if schemaname not in schemas:
        return None
    external_db_params = schemas[schemaname]

    upstream_urn = mce_builder.make_dataset_urn(
        self.eskind_to_platform[external_db_params["eskind"]],
        "{database}.{table}".format(
            database=external_db_params["external_database"],
            table=tablename,
        ),
        self.config.env,
    )
    upstream_lineage = UpstreamLineageClass(
        upstreams=[UpstreamClass(upstream_urn, DatasetLineageTypeClass.COPY)]
    )
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=upstream_lineage,
    )