def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build the ``upstreamLineage`` change proposal for one dataset.

    Args:
        dataset_urn: URN of the dataset whose upstream lineage is wanted.

    Returns:
        An UPSERT ``MetadataChangeProposalWrapper`` carrying the
        ``upstreamLineage`` aspect, or ``None`` when no lineage metadata is
        loaded, the URN cannot be converted to a dataset key, the table has
        no lineage entry, or its entry lists no upstream tables.

    Raises:
        ValueError: if the dataset key name does not split into exactly
            three dot-separated parts (project, dataset, table).
    """
    lineage = self.lineage_metadata
    if lineage is None:
        return None

    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None

    project_id, dataset_name, tablename = dataset_key.name.split(".")
    table_key = str(BigQueryTableRef(project_id, dataset_name, tablename))
    if table_key not in lineage:
        return None

    # Walk the upstream names in sorted order: the same lineage emitted in a
    # different order would otherwise produce a distinct aspect in the backend.
    parsed_refs = (
        BigQueryTableRef.from_string_name(name) for name in sorted(lineage[table_key])
    )
    upstream_list: List[UpstreamClass] = [
        UpstreamClass(
            mce_builder.make_dataset_urn(
                self.platform,
                "{project}.{database}.{table}".format(
                    project=ref.project,
                    database=ref.dataset,
                    table=ref.table,
                ),
                self.config.env,
            ),
            DatasetLineageTypeClass.TRANSFORMED,
        )
        for ref in parsed_refs
    ]
    if not upstream_list:
        return None

    return MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineageClass(upstreams=upstream_list),
    )
def get_upstream_tables(
    self, bq_table: str, tables_seen: Optional[List[str]] = None
) -> Set[BigQueryTableRef]:
    """Collect the upstream tables of ``bq_table``, expanding temporary ones.

    Each table listed for ``bq_table`` in ``self.lineage_metadata`` is
    inspected. A table for which ``is_temporary_table()`` is false is added
    to the result directly. A temporary table is not added itself; if it has
    its own lineage entry, this method recurses into it and merges what it
    finds.

    Args:
        bq_table: Key of the table to look up in ``self.lineage_metadata``.
        tables_seen: Names of temporary tables already visited. It is
            appended to in place and shared with recursive calls. Leave it
            unset for a top-level call; a fresh list is then created so that
            state never leaks between independent calls.

    Returns:
        The set of upstream table references that were reached.

    Raises:
        AssertionError: if ``self.lineage_metadata`` is falsy.
        KeyError: if ``bq_table`` has no entry in ``self.lineage_metadata``.
    """
    # A list literal as the default would be created once and shared by every
    # call, so tables marked as seen in one call would be skipped in the next.
    if tables_seen is None:
        tables_seen = []

    upstreams: Set[BigQueryTableRef] = set()
    assert self.lineage_metadata
    for ref_table in self.lineage_metadata[str(bq_table)]:
        upstream_table = BigQueryTableRef.from_string_name(ref_table)
        if not upstream_table.is_temporary_table():
            upstreams.add(upstream_table)
            continue

        # Never process a temporary table twice; this also stops the
        # recursion from looping forever on cyclic lineage.
        if ref_table in tables_seen:
            logger.debug(f"Skipping table {ref_table} because it was seen already")
            continue
        tables_seen.append(ref_table)

        if ref_table in self.lineage_metadata:
            upstreams |= self.get_upstream_tables(ref_table, tables_seen=tables_seen)
    return upstreams