def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream():
    """Check upstream resolution through a chain of two intermediate tables.

    Lineage wired up by the test (``b`` and ``d`` live in underscore-prefixed
    datasets, which the test expects the source to treat as temporary)::

        a <- b <- c
             b <- d <- e

    The upstreams reported for ``a`` must therefore be exactly ``c`` and ``e``.
    """
    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.source.sql.bigquery import BigQueryConfig, BigQuerySource
    from datahub.ingestion.source.usage.bigquery_usage import BigQueryTableRef

    a: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="a"
    )
    b: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="_temp-dataset", table="b"
    )
    c: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="c"
    )
    d: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="_test-dataset", table="d"
    )
    e: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="e"
    )

    config = BigQueryConfig.parse_obj(
        {
            "project_id": "test-project",
        }
    )
    source = BigQuerySource(config=config, ctx=PipelineContext(run_id="test"))
    source.lineage_metadata = {
        str(a): {str(b)},
        str(b): {str(c), str(d)},
        str(d): {str(e)},
    }

    upstreams = source.get_upstream_tables(str(a), [])

    # list.sort() sorts in place and returns None, so the previous
    # `list(upstreams).sort() == [c, e].sort()` compared None with None and
    # could never fail. Compare as sets: order-independent and really checked.
    assert set(upstreams) == {c, e}
def test_simple_upstream_table_generation():
    """A single direct lineage edge yields exactly that one upstream table."""
    downstream: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="a"
    )
    upstream: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="b"
    )

    source = BigQuerySource(
        config=BigQueryConfig.parse_obj({"project_id": "test-project"}),
        ctx=PipelineContext(run_id="test"),
    )
    # Lineage maps a table's string name to the names of its upstream tables.
    source.lineage_metadata = {str(downstream): {str(upstream)}}

    resolved = source.get_upstream_tables(str(downstream), [])

    assert list(resolved) == [upstream]
def normalise_dataset_name(self, dataset_name: str) -> str:
    """Return ``dataset_name`` with its table part passed through ``remove_extras()``.

    Args:
        dataset_name: A dotted ``project.schema.table`` name.

    Returns:
        ``project.schema.table`` where the project and schema parts are kept
        as given and the table part is the ``table`` attribute of the
        ``BigQueryTableRef`` after ``remove_extras()``.

    Raises:
        ValueError: If ``dataset_name`` does not split into exactly three
            dot-separated parts.
    """
    project_id, schema, table = dataset_name.split(".")
    spec = {"projectId": project_id, "datasetId": schema, "tableId": table}
    cleaned_ref = BigQueryTableRef.from_spec_obj(spec).remove_extras()
    trimmed_table_name = cleaned_ref.table
    return f"{project_id}.{schema}.{trimmed_table_name}"
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build the ``upstreamLineage`` change proposal for ``dataset_urn``.

    Args:
        dataset_urn: URN of the dataset whose upstream lineage is wanted.

    Returns:
        An UPSERT ``MetadataChangeProposalWrapper`` carrying the upstream
        lineage aspect, or ``None`` when there is no lineage metadata, the
        URN cannot be converted to a dataset key, the table has no lineage
        entry, or the entry lists no upstreams.
    """
    if self.lineage_metadata is None:
        return None

    key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if key is None:
        return None

    project_id, dataset_name, tablename = key.name.split(".")
    table_key = str(BigQueryTableRef(project_id, dataset_name, tablename))
    if table_key not in self.lineage_metadata:
        return None

    upstream_list: List[UpstreamClass] = []
    # Iterate in sorted order so that identical lineage always serialises the
    # same way; otherwise a different ordering would create a new aspect in
    # the backend even though nothing changed.
    for ref_name in sorted(self.lineage_metadata[table_key]):
        ref = BigQueryTableRef.from_string_name(ref_name)
        upstream_urn = mce_builder.make_dataset_urn(
            self.platform,
            "{project}.{database}.{table}".format(
                project=ref.project,
                database=ref.dataset,
                table=ref.table,
            ),
            self.config.env,
        )
        upstream_list.append(
            UpstreamClass(upstream_urn, DatasetLineageTypeClass.TRANSFORMED)
        )

    if not upstream_list:
        return None

    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineageClass(upstreams=upstream_list),
    )
def test_simple_upstream_table_generation():
    """A single direct lineage edge yields exactly that one upstream table."""
    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.source.sql.bigquery import BigQueryConfig, BigQuerySource
    from datahub.ingestion.source.usage.bigquery_usage import BigQueryTableRef

    downstream: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="a"
    )
    upstream: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="b"
    )

    source = BigQuerySource(
        config=BigQueryConfig.parse_obj({"project_id": "test-project"}),
        ctx=PipelineContext(run_id="test"),
    )
    # Lineage maps a table's string name to the names of its upstream tables.
    source.lineage_metadata = {str(downstream): {str(upstream)}}

    resolved = source.get_upstream_tables(str(downstream), [])

    assert list(resolved) == [upstream]
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build the ``upstreamLineage`` change proposal for ``dataset_urn``.

    Upstreams come from ``get_upstream_tables``. When
    ``config.upstream_lineage_in_report`` is set, each upstream is also
    recorded in ``report.upstream_lineage`` under the table's string name.

    Args:
        dataset_urn: URN of the dataset whose upstream lineage is wanted.

    Returns:
        An UPSERT ``MetadataChangeProposalWrapper`` carrying the upstream
        lineage aspect, or ``None`` when there is no lineage metadata, the
        URN cannot be converted to a dataset key, the table has no lineage
        entry, or no upstreams were resolved.
    """
    if self.lineage_metadata is None:
        logger.debug("No lineage metadata so skipping getting mcp")
        return None

    key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if key is None:
        logger.debug(
            f"No dataset_key for {dataset_urn} so skipping getting mcp")
        return None

    project_id, dataset_name, tablename = key.name.split(".")
    table_key = str(BigQueryTableRef(project_id, dataset_name, tablename))
    if table_key not in self.lineage_metadata:
        return None

    upstream_list: List[UpstreamClass] = []
    # Iterate in sorted order so that identical lineage always serialises the
    # same way; otherwise a different ordering would create a new aspect in
    # the backend even though nothing changed.
    resolved = self.get_upstream_tables(table_key, tables_seen=[])
    for upstream_table in sorted(resolved):
        upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
            self.platform,
            "{project}.{database}.{table}".format(
                project=upstream_table.project,
                database=upstream_table.dataset,
                table=upstream_table.table,
            ),
            self.config.platform_instance,
            self.config.env,
        )
        upstream_entry = UpstreamClass(
            upstream_urn, DatasetLineageTypeClass.TRANSFORMED
        )

        if self.config.upstream_lineage_in_report:
            # Read-modify-write the per-table set of reported upstream names.
            reported: Set = self.report.upstream_lineage.get(table_key, set())
            reported.add(str(upstream_table))
            self.report.upstream_lineage[table_key] = reported

        upstream_list.append(upstream_entry)

    if not upstream_list:
        return None

    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineageClass(upstreams=upstream_list),
    )
def get_identifier(
    self,
    *,
    schema: str,
    entity: str,
    inspector: Inspector,
    **kwargs: Any,
) -> str:
    """Return the ``project.schema.table`` identifier for an entity.

    Args:
        schema: Used as the ``datasetId`` and as the middle part of the result.
        entity: Used as the ``tableId`` when building the table reference.
        inspector: Inspector from which the project id is obtained; must be
            truthy.
        **kwargs: Accepted but not used by this implementation.

    Returns:
        The project id, the schema, and the ``table`` attribute of the
        ``BigQueryTableRef`` built from them, joined with dots.
    """
    assert inspector
    project_id = self._get_project_id(inspector)
    spec = {"projectId": project_id, "datasetId": schema, "tableId": entity}
    table_name = BigQueryTableRef.from_spec_obj(spec).table
    return f"{project_id}.{schema}.{table_name}"
def get_upstream_tables(
    self, bq_table: str, tables_seen: Optional[List[str]] = None
) -> Set[BigQueryTableRef]:
    """Resolve the non-temporary upstream tables of ``bq_table``.

    Walks ``self.lineage_metadata`` starting at ``bq_table``. Upstreams that
    are not temporary are returned directly. Temporary upstreams are replaced
    by their own upstreams, recursively; a temporary table without a lineage
    entry of its own contributes nothing.

    Args:
        bq_table: String name of the table to look up in the lineage map.
        tables_seen: Names of temporary tables already visited. The list is
            extended in place and shared across the recursion so cycles are
            cut. Defaults to a fresh list for each top-level call.

    Returns:
        The set of resolved upstream table references.

    Raises:
        AssertionError: If ``self.lineage_metadata`` is falsy.
        KeyError: If ``bq_table`` has no entry in ``self.lineage_metadata``.
    """
    # A literal `[]` default would be created once and shared by every call
    # that omits the argument, so tables seen in one call would be wrongly
    # skipped in later, unrelated calls.
    if tables_seen is None:
        tables_seen = []

    upstreams: Set[BigQueryTableRef] = set()
    assert self.lineage_metadata
    for ref_table in self.lineage_metadata[str(bq_table)]:
        upstream_table = BigQueryTableRef.from_string_name(ref_table)
        if upstream_table.is_temporary_table():
            # Make sure we don't process a table twice and don't get into a
            # recursive loop.
            if ref_table in tables_seen:
                logger.debug(
                    f"Skipping table {ref_table} because it was seen already"
                )
                continue
            tables_seen.append(ref_table)
            if ref_table in self.lineage_metadata:
                upstreams = upstreams.union(
                    self.get_upstream_tables(ref_table, tables_seen=tables_seen)
                )
        else:
            upstreams.add(upstream_table)
    return upstreams
def test_remove_extras(test_input, expected):
    """``remove_extras()`` must turn the table name ``test_input`` into ``expected``.

    NOTE(review): the arguments are presumably supplied by a pytest
    parametrization that is not visible here; confirm against the decorator.
    """
    cleaned = BigQueryTableRef(
        "test_project", "test_dataset", test_input
    ).remove_extras()
    assert cleaned.table == expected