def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    actor: str = make_user_urn("datahub"),
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Build an MCE that attaches upstream lineage to ``downstream_urn``.

    Args:
        upstream_urns: URNs of the datasets feeding the downstream dataset.
        downstream_urn: URN used for the proposed dataset snapshot.
        actor: Actor recorded in the audit stamp of every upstream entry.
        lineage_type: Lineage type assigned to every upstream entry.

    Returns:
        A ``MetadataChangeEventClass`` whose snapshot carries a single
        ``UpstreamLineageClass`` aspect with one entry per upstream URN.
    """
    # The timestamp is read once so that all upstream entries share it.
    stamped_at = get_sys_time()

    edges = []
    for source_urn in upstream_urns:
        edge = UpstreamClass(
            auditStamp=AuditStampClass(time=stamped_at, actor=actor),
            dataset=source_urn,
            type=lineage_type,
        )
        edges.append(edge)

    snapshot = DatasetSnapshotClass(
        urn=downstream_urn,
        aspects=[UpstreamLineageClass(upstreams=edges)],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def get_lineage_if_enabled(
    self, mce: MetadataChangeEventClass
) -> Optional[MetadataChangeProposalWrapper]:
    """Return an ``upstreamLineage`` proposal between the MCE's dataset and S3.

    A proposal is produced only when ``source_config.emit_s3_lineage`` is set,
    the MCE carries a ``DatasetPropertiesClass`` aspect with a ``"Location"``
    custom property, and that location starts with ``"s3://"``. Otherwise
    ``None`` is returned.

    When ``source_config.glue_s3_lineage_direction`` equals ``"upstream"`` the
    S3 dataset is recorded as upstream of the MCE's dataset; for any other
    value the MCE's dataset is recorded as upstream of the S3 dataset.
    """
    if not self.source_config.emit_s3_lineage:
        return None

    props: Optional[DatasetPropertiesClass] = mce_builder.get_aspect_if_available(
        mce, DatasetPropertiesClass
    )
    if not props or "Location" not in props.customProperties:
        return None

    location = props.customProperties["Location"]
    if not location.startswith("s3://"):
        return None

    s3_dataset_urn = make_s3_urn(location, self.source_config.env)
    table_urn = mce.proposedSnapshot.urn

    # Pick which side of the edge is the source and which one owns the aspect.
    if self.source_config.glue_s3_lineage_direction == "upstream":
        source_urn, owner_urn = s3_dataset_urn, table_urn
    else:
        # The aspect is attached to the S3 dataset, pointing back at the table.
        source_urn, owner_urn = table_urn, s3_dataset_urn

    lineage = UpstreamLineageClass(
        upstreams=[
            UpstreamClass(
                dataset=source_urn,
                type=DatasetLineageTypeClass.COPY,
            )
        ]
    )
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=owner_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="upstreamLineage",
        aspect=lineage,
    )
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build the ``upstreamLineage`` proposal for ``dataset_urn``, if any.

    Returns ``None`` when no lineage metadata is loaded, when the URN cannot
    be converted to a dataset key, when the table has no entry in
    ``self.lineage_metadata``, or when no upstream tables are found.

    When ``config.upstream_lineage_in_report`` is set, each upstream table is
    also recorded in ``self.report.upstream_lineage`` under the table's name.
    """
    if self.lineage_metadata is None:
        logger.debug("No lineage metadata so skipping getting mcp")
        return None

    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.debug(f"No dataset_key for {dataset_urn} so skipping getting mcp")
        return None

    project_id, dataset_name, tablename = dataset_key.name.split(".")
    table_ref = str(BigQueryTableRef(project_id, dataset_name, tablename))
    if table_ref not in self.lineage_metadata:
        return None

    edges: List[UpstreamClass] = []
    # Upstreams are visited in sorted order so that the same lineage always
    # yields the same aspect, whatever order it was discovered in.
    for parent in sorted(self.get_upstream_tables(table_ref, tables_seen=[])):
        parent_urn = mce_builder.make_dataset_urn_with_platform_instance(
            self.platform,
            "{project}.{database}.{table}".format(
                project=parent.project,
                database=parent.dataset,
                table=parent.table,
            ),
            self.config.platform_instance,
            self.config.env,
        )
        edge = UpstreamClass(parent_urn, DatasetLineageTypeClass.TRANSFORMED)

        if self.config.upstream_lineage_in_report:
            recorded: Set = self.report.upstream_lineage.get(table_ref, set())
            recorded.add(str(parent))
            self.report.upstream_lineage[table_ref] = recorded

        edges.append(edge)

    if not edges:
        return None

    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineageClass(upstreams=edges),
    )
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Build an MCE that attaches upstream lineage to ``downstream_urn``.

    Args:
        upstream_urns: URNs of the datasets feeding the downstream dataset.
        downstream_urn: URN used for the proposed dataset snapshot.
        lineage_type: Lineage type assigned to every upstream entry.

    Returns:
        A ``MetadataChangeEventClass`` whose snapshot carries a single
        ``UpstreamLineageClass`` aspect with one entry per upstream URN.
    """
    upstream_edges = [
        UpstreamClass(dataset=source_urn, type=lineage_type)
        for source_urn in upstream_urns
    ]
    lineage_aspect = UpstreamLineageClass(upstreams=upstream_edges)
    snapshot = DatasetSnapshotClass(
        urn=downstream_urn,
        aspects=[lineage_aspect],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def create_lineage_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
    """Build a lineage MCE for ``directive.table`` from ``directive.depends_on``.

    Each dependency becomes a ``TRANSFORMED`` upstream entry whose audit stamp
    carries the current time in milliseconds and the ``datahub`` corp user.
    """
    downstream_urn = dataset_name_to_urn(directive.table)

    edges = []
    for dependency in directive.depends_on:
        dependency_urn = dataset_name_to_urn(dependency)
        # The clock is read for every entry, so stamps may differ slightly.
        stamp = AuditStampClass(
            time=int(time.time() * 1000),
            actor="urn:li:corpuser:datahub",
        )
        edges.append(
            UpstreamClass(
                dataset=dependency_urn,
                type=DatasetLineageTypeClass.TRANSFORMED,
                auditStamp=stamp,
            )
        )

    snapshot = DatasetSnapshotClass(
        urn=downstream_urn,
        aspects=[UpstreamLineageClass(upstreams=edges)],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
    """Build the ``upstreamLineage`` proposal for ``dataset_urn``, if any.

    Args:
        dataset_urn: URN of the dataset whose upstream lineage is requested.

    Returns:
        A ``(mcp, properties)`` tuple. ``mcp`` is ``None`` when the URN cannot
        be converted to a dataset key or when no upstreams are known for it.
        ``properties`` is always ``None`` here: this method never collects any
        custom properties, and the slot is kept only so callers can continue
        to unpack a two-element tuple.
    """
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None

    # Populate lazily and only once. Testing for ``None`` rather than
    # falsiness matters: a populated-but-empty map is a valid result and must
    # not trigger another ``_populate_lineage()`` on every call.
    # NOTE(review): assumes ``_lineage_map`` is ``None`` until populated, as
    # the assert below also implies - confirm against the initializer.
    if self._lineage_map is None:
        self._populate_lineage()
    assert self._lineage_map is not None

    upstream_lineage: List[UpstreamClass] = []
    if dataset_key.name in self._lineage_map:
        item = self._lineage_map[dataset_key.name]
        upstream_lineage = [
            UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    upstream.platform.value,
                    upstream.path,
                    self.config.platform_instance,
                    self.config.env,
                ),
                type=item.dataset_lineage_type,
            )
            for upstream in item.upstreams
        ]

    if not upstream_lineage:
        return None, None

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=upstream_lineage),
    )
    return mcp, None
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build the ``upstreamLineage`` proposal for ``dataset_urn``, if any.

    Returns ``None`` when no lineage metadata is loaded, when the URN cannot
    be converted to a dataset key, when the table has no entry in
    ``self.lineage_metadata``, or when that entry lists no upstream tables.
    """
    if self.lineage_metadata is None:
        return None

    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None

    project_id, dataset_name, tablename = dataset_key.name.split(".")
    table_ref = str(BigQueryTableRef(project_id, dataset_name, tablename))
    if table_ref not in self.lineage_metadata:
        return None

    # Reference names are sorted first so that the same lineage always yields
    # the same aspect, whatever order it was recorded in.
    parents = (
        BigQueryTableRef.from_string_name(ref_name)
        for ref_name in sorted(self.lineage_metadata[table_ref])
    )
    edges: List[UpstreamClass] = [
        UpstreamClass(
            mce_builder.make_dataset_urn(
                self.platform,
                "{project}.{database}.{table}".format(
                    project=parent.project,
                    database=parent.dataset,
                    table=parent.table,
                ),
                self.config.env,
            ),
            DatasetLineageTypeClass.TRANSFORMED,
        )
        for parent in parents
    ]
    if not edges:
        return None

    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineageClass(upstreams=edges),
    )
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build a ``COPY`` lineage proposal from the catalog metadata, if any.

    The dataset key name is split on ``"."`` into database, schema and table.
    When ``self.catalog_metadata`` has an entry for that database and schema,
    the returned proposal points at ``<external_database>.<table>`` on the
    platform looked up from the entry's ``"eskind"``. Otherwise ``None`` is
    returned.
    """
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None

    name_parts = dataset_key.name.split(".")
    db_name, schemaname, tablename = name_parts[0], name_parts[1], name_parts[2]

    if (
        db_name not in self.catalog_metadata
        or schemaname not in self.catalog_metadata[db_name]
    ):
        return None

    external_db_params = self.catalog_metadata[db_name][schemaname]
    external_urn = mce_builder.make_dataset_urn(
        self.eskind_to_platform[external_db_params["eskind"]],
        "{database}.{table}".format(
            database=external_db_params["external_database"],
            table=tablename,
        ),
        self.config.env,
    )
    lineage = UpstreamLineageClass(
        upstreams=[UpstreamClass(external_urn, DatasetLineageTypeClass.COPY)]
    )
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=lineage,
    )
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
    """Build the ``upstreamLineage`` proposal and extra properties for a dataset.

    Upstreams come from two places: the lazily populated ``self._lineage_map``
    entry for the dataset, and a ``COPY`` edge derived from
    ``self.catalog_metadata`` when the dataset's database and schema have an
    entry there.

    Returns:
        A ``(mcp, properties)`` tuple. ``mcp`` is ``None`` when the URN cannot
        be converted to a dataset key or no upstreams were found.
        ``properties`` is set only when failed parser queries were captured
        for the dataset (requires ``config.capture_lineage_query_parser_failures``).
        When upstreams exist, their URNs are also stored in
        ``self.report.upstream_lineage`` under ``dataset_urn``.
    """
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None

    if self._lineage_map is None:
        logger.debug("Populating lineage")
        self._populate_lineage()
    assert self._lineage_map is not None

    def _instance_for(platform_name):
        # Look up the platform instance; None when no map is configured.
        instance_map = self.config.platform_instance_map
        return instance_map.get(platform_name) if instance_map else None

    edges: List[UpstreamClass] = []
    extra_props: Dict[str, str] = {}

    if dataset_key.name in self._lineage_map:
        item = self._lineage_map[dataset_key.name]

        if (
            self.config.capture_lineage_query_parser_failures
            and item.query_parser_failed_sqls
        ):
            failed_sqls = ",".join(item.query_parser_failed_sqls)
            extra_props["lineage_sql_parser_failed_queries"] = failed_sqls

        for upstream in item.upstreams:
            edges.append(
                UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        upstream.platform.value,
                        upstream.path,
                        platform_instance=_instance_for(upstream.platform.value),
                        env=self.config.env,
                    ),
                    type=item.dataset_lineage_type,
                )
            )

    name_parts = dataset_key.name.split(".")
    db_name, schemaname, tablename = name_parts[0], name_parts[1], name_parts[2]

    if (
        db_name in self.catalog_metadata
        and schemaname in self.catalog_metadata[db_name]
    ):
        external_db_params = self.catalog_metadata[db_name][schemaname]
        upstream_platform = self.eskind_to_platform[external_db_params["eskind"]]
        external_urn = mce_builder.make_dataset_urn_with_platform_instance(
            upstream_platform,
            "{database}.{table}".format(
                database=external_db_params["external_database"],
                table=tablename,
            ),
            platform_instance=_instance_for(upstream_platform),
            env=self.config.env,
        )
        edges.append(UpstreamClass(external_urn, DatasetLineageTypeClass.COPY))

    properties = (
        DatasetPropertiesClass(customProperties=extra_props) if extra_props else None
    )

    if not edges:
        return None, properties

    self.report.upstream_lineage[dataset_urn] = [edge.dataset for edge in edges]

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=edges),
    )
    return mcp, properties
def _process_table(
    self,
    dataset_name: str,
    inspector: Inspector,
    schema: str,
    table: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
    """Yield the work units describing a single table.

    In order, the generator yields:
      1. an ``upstreamLineage`` work unit, only when ``get_table_properties``
         returns a location URN;
      2. whatever ``add_table_to_schema_container`` yields;
      3. the ``SqlWorkUnit`` wrapping the dataset snapshot (status, dataset
         properties and schema metadata aspects);
      4. the data platform instance aspect, when one is returned;
      5. a ``subTypes`` work unit with type name ``"table"``;
      6. whatever ``_get_domain_wu`` yields.

    Args:
        dataset_name: Name used for the dataset URN and as the snapshot work
            unit id.
        inspector: Inspector used to read columns, keys and table properties.
        schema: Schema containing the table.
        table: Table name; also used as the dataset properties ``name``.
        sql_config: Configuration forwarded to ``_get_domain_wu``.
    """
    columns = self._get_columns(dataset_name, inspector, schema, table)
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.platform,
        dataset_name,
        self.config.platform_instance,
        self.config.env,
    )
    # The snapshot starts with a Status aspect marking the dataset as not
    # removed; further aspects are appended below before it is emitted.
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[StatusClass(removed=False)],
    )
    # With stateful ingestion configured, record this table's URN in the
    # current checkpoint state (when a checkpoint exists).
    if self.is_stateful_ingestion_configured():
        cur_checkpoint = self.get_current_checkpoint(
            self.get_default_ingestion_job_id())
        if cur_checkpoint is not None:
            checkpoint_state = cast(BaseSQLAlchemyCheckpointState,
                                    cur_checkpoint.state)
            checkpoint_state.add_table_urn(dataset_urn)
    description, properties, location_urn = self.get_table_properties(
        inspector, schema, table)
    dataset_properties = DatasetPropertiesClass(
        name=table,
        description=description,
        customProperties=properties,
    )
    dataset_snapshot.aspects.append(dataset_properties)
    # A truthy location URN is emitted as a COPY-type upstream of this table.
    if location_urn:
        external_upstream_table = UpstreamClass(
            dataset=location_urn,
            type=DatasetLineageTypeClass.COPY,
        )
        lineage_mcpw = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_snapshot.urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=[external_upstream_table]),
        )
        lineage_wu = MetadataWorkUnit(
            id=
            f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
            mcp=lineage_mcpw,
        )
        # NOTE(review): unlike ``wu`` below, this work unit is not passed to
        # ``self.report.report_workunit`` - confirm that this is intended.
        yield lineage_wu
    pk_constraints: dict = inspector.get_pk_constraint(table, schema)
    foreign_keys = self._get_foreign_keys(dataset_urn, inspector, schema,
                                          table)
    schema_fields = self.get_schema_fields(dataset_name, columns,
                                           pk_constraints)
    schema_metadata = get_schema_metadata(
        self.report,
        dataset_name,
        self.platform,
        columns,
        pk_constraints,
        foreign_keys,
        schema_fields,
    )
    dataset_snapshot.aspects.append(schema_metadata)
    db_name = self.get_db_name(inspector)
    yield from self.add_table_to_schema_container(dataset_urn, db_name,
                                                  schema)
    # The snapshot now holds status, properties and schema metadata; wrap it
    # in an MCE, report the work unit, then emit it.
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = SqlWorkUnit(id=dataset_name, mce=mce)
    self.report.report_workunit(wu)
    yield wu
    dpi_aspect = self.get_dataplatform_instance_aspect(
        dataset_urn=dataset_urn)
    if dpi_aspect:
        yield dpi_aspect
    # NOTE(review): this work unit is also yielded without being passed to
    # ``self.report.report_workunit`` - confirm that this is intended.
    subtypes_aspect = MetadataWorkUnit(
        id=f"{dataset_name}-subtypes",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["table"]),
        ),
    )
    yield subtypes_aspect
    yield from self._get_domain_wu(
        dataset_name=dataset_name,
        entity_urn=dataset_urn,
        entity_type="dataset",
        sql_config=sql_config,
    )