Пример #1
0
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    actor: str = make_user_urn("datahub"),
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    sys_time = get_sys_time()

    mce = MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=downstream_urn,
            aspects=[
                UpstreamLineageClass(
                    upstreams=[
                        UpstreamClass(
                            auditStamp=AuditStampClass(
                                time=sys_time,
                                actor=actor,
                            ),
                            dataset=upstream_urn,
                            type=lineage_type,
                        )
                        for upstream_urn in upstream_urns
                    ]
                )
            ],
        )
    )
    return mce
Пример #2
0
 def get_lineage_if_enabled(
     self, mce: MetadataChangeEventClass
 ) -> Optional[MetadataChangeProposalWrapper]:
     if self.source_config.emit_s3_lineage:
         # extract dataset properties aspect
         dataset_properties: Optional[
             DatasetPropertiesClass] = mce_builder.get_aspect_if_available(
                 mce, DatasetPropertiesClass)
         if dataset_properties and "Location" in dataset_properties.customProperties:
             location = dataset_properties.customProperties["Location"]
             if location.startswith("s3://"):
                 s3_dataset_urn = make_s3_urn(location,
                                              self.source_config.env)
                 if self.source_config.glue_s3_lineage_direction == "upstream":
                     upstream_lineage = UpstreamLineageClass(upstreams=[
                         UpstreamClass(
                             dataset=s3_dataset_urn,
                             type=DatasetLineageTypeClass.COPY,
                         )
                     ])
                     mcp = MetadataChangeProposalWrapper(
                         entityType="dataset",
                         entityUrn=mce.proposedSnapshot.urn,
                         changeType=ChangeTypeClass.UPSERT,
                         aspectName="upstreamLineage",
                         aspect=upstream_lineage,
                     )
                     return mcp
                 else:
                     # Need to mint the s3 dataset with upstream lineage from it to glue
                     upstream_lineage = UpstreamLineageClass(upstreams=[
                         UpstreamClass(
                             dataset=mce.proposedSnapshot.urn,
                             type=DatasetLineageTypeClass.COPY,
                         )
                     ])
                     mcp = MetadataChangeProposalWrapper(
                         entityType="dataset",
                         entityUrn=s3_dataset_urn,
                         changeType=ChangeTypeClass.UPSERT,
                         aspectName="upstreamLineage",
                         aspect=upstream_lineage,
                     )
                     return mcp
     return None
Пример #3
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        if self.lineage_metadata is None:
            logger.debug("No lineage metadata so skipping getting mcp")
            return None
        dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if dataset_key is None:
            logger.debug(
                f"No dataset_key for {dataset_urn} so skipping getting mcp")
            return None
        project_id, dataset_name, tablename = dataset_key.name.split(".")
        bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
        if str(bq_table) in self.lineage_metadata:
            upstream_list: List[UpstreamClass] = []
            # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend
            # even if the lineage is same but the order is different.
            for upstream_table in sorted(
                    self.get_upstream_tables(str(bq_table), tables_seen=[])):
                upstream_table_class = UpstreamClass(
                    mce_builder.make_dataset_urn_with_platform_instance(
                        self.platform,
                        "{project}.{database}.{table}".format(
                            project=upstream_table.project,
                            database=upstream_table.dataset,
                            table=upstream_table.table,
                        ),
                        self.config.platform_instance,
                        self.config.env,
                    ),
                    DatasetLineageTypeClass.TRANSFORMED,
                )
                if self.config.upstream_lineage_in_report:
                    current_lineage_map: Set = self.report.upstream_lineage.get(
                        str(bq_table), set())
                    current_lineage_map.add(str(upstream_table))
                    self.report.upstream_lineage[str(
                        bq_table)] = current_lineage_map
                upstream_list.append(upstream_table_class)

            if upstream_list:
                upstream_lineage = UpstreamLineageClass(
                    upstreams=upstream_list)
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                return mcp
        return None
Пример #4
0
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    mce = MetadataChangeEventClass(proposedSnapshot=DatasetSnapshotClass(
        urn=downstream_urn,
        aspects=[
            UpstreamLineageClass(upstreams=[
                UpstreamClass(
                    dataset=upstream_urn,
                    type=lineage_type,
                ) for upstream_urn in upstream_urns
            ])
        ],
    ))
    return mce
Пример #5
0
def create_lineage_aspect_mce(
        directive: Directive) -> MetadataChangeEventClass:
    return MetadataChangeEventClass(proposedSnapshot=DatasetSnapshotClass(
        urn=dataset_name_to_urn(directive.table),
        aspects=[
            UpstreamLineageClass(upstreams=[
                UpstreamClass(
                    dataset=dataset_name_to_urn(upstream),
                    type=DatasetLineageTypeClass.TRANSFORMED,
                    auditStamp=AuditStampClass(
                        time=int(time.time() * 1000),
                        actor="urn:li:corpuser:datahub",
                    ),
                ) for upstream in directive.depends_on
            ])
        ],
    ))
Пример #6
0
    def get_lineage_mcp(
        self, dataset_urn: str
    ) -> Tuple[Optional[MetadataChangeProposalWrapper],
               Optional[DatasetPropertiesClass]]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None, None

        if not self._lineage_map:
            self._populate_lineage()
        assert self._lineage_map is not None

        upstream_lineage: List[UpstreamClass] = []
        custom_properties: Dict[str, str] = {}

        if dataset_key.name in self._lineage_map:
            item = self._lineage_map[dataset_key.name]
            for upstream in item.upstreams:
                upstream_table = UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        upstream.platform.value,
                        upstream.path,
                        self.config.platform_instance,
                        self.config.env,
                    ),
                    type=item.dataset_lineage_type,
                )
                upstream_lineage.append(upstream_table)

        properties = None
        if custom_properties:
            properties = DatasetPropertiesClass(
                customProperties=custom_properties)

        if not upstream_lineage:
            return None, properties

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=upstream_lineage),
        )

        return mcp, properties
Пример #7
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        if self.lineage_metadata is None:
            return None
        dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if dataset_key is None:
            return None
        project_id, dataset_name, tablename = dataset_key.name.split(".")
        bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
        if str(bq_table) in self.lineage_metadata:
            upstream_list: List[UpstreamClass] = []
            # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend
            # even if the lineage is same but the order is different.
            for ref_table in sorted(self.lineage_metadata[str(bq_table)]):
                upstream_table = BigQueryTableRef.from_string_name(ref_table)
                upstream_table_class = UpstreamClass(
                    mce_builder.make_dataset_urn(
                        self.platform,
                        "{project}.{database}.{table}".format(
                            project=upstream_table.project,
                            database=upstream_table.dataset,
                            table=upstream_table.table,
                        ),
                        self.config.env,
                    ),
                    DatasetLineageTypeClass.TRANSFORMED,
                )
                upstream_list.append(upstream_table_class)

            if upstream_list:
                upstream_lineage = UpstreamLineageClass(
                    upstreams=upstream_list)
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                return mcp
        return None
Пример #8
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None

        dataset_params = dataset_key.name.split(".")
        db_name = dataset_params[0]
        schemaname = dataset_params[1]
        tablename = dataset_params[2]
        if db_name in self.catalog_metadata:
            if schemaname in self.catalog_metadata[db_name]:
                external_db_params = self.catalog_metadata[db_name][schemaname]
                upstream_lineage = UpstreamLineageClass(upstreams=[
                    UpstreamClass(
                        mce_builder.make_dataset_urn(
                            self.eskind_to_platform[
                                external_db_params["eskind"]],
                            "{database}.{table}".format(
                                database=external_db_params[
                                    "external_database"],
                                table=tablename,
                            ),
                            self.config.env,
                        ),
                        DatasetLineageTypeClass.COPY,
                    )
                ])
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                return mcp
        return None
Пример #9
0
    def get_lineage_mcp(
        self, dataset_urn: str
    ) -> Tuple[Optional[MetadataChangeProposalWrapper],
               Optional[DatasetPropertiesClass]]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None, None

        if self._lineage_map is None:
            logger.debug("Populating lineage")
            self._populate_lineage()
        assert self._lineage_map is not None

        upstream_lineage: List[UpstreamClass] = []
        custom_properties: Dict[str, str] = {}

        if dataset_key.name in self._lineage_map:
            item = self._lineage_map[dataset_key.name]
            if (self.config.capture_lineage_query_parser_failures
                    and item.query_parser_failed_sqls):
                custom_properties[
                    "lineage_sql_parser_failed_queries"] = ",".join(
                        item.query_parser_failed_sqls)
            for upstream in item.upstreams:
                upstream_table = UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        upstream.platform.value,
                        upstream.path,
                        platform_instance=self.config.platform_instance_map.
                        get(upstream.platform.value)
                        if self.config.platform_instance_map else None,
                        env=self.config.env,
                    ),
                    type=item.dataset_lineage_type,
                )
                upstream_lineage.append(upstream_table)

        dataset_params = dataset_key.name.split(".")
        db_name = dataset_params[0]
        schemaname = dataset_params[1]
        tablename = dataset_params[2]
        if db_name in self.catalog_metadata:
            if schemaname in self.catalog_metadata[db_name]:
                external_db_params = self.catalog_metadata[db_name][schemaname]
                upstream_platform = self.eskind_to_platform[
                    external_db_params["eskind"]]
                catalog_upstream = UpstreamClass(
                    mce_builder.make_dataset_urn_with_platform_instance(
                        upstream_platform,
                        "{database}.{table}".format(
                            database=external_db_params["external_database"],
                            table=tablename,
                        ),
                        platform_instance=self.config.platform_instance_map.
                        get(upstream_platform)
                        if self.config.platform_instance_map else None,
                        env=self.config.env,
                    ),
                    DatasetLineageTypeClass.COPY,
                )
                upstream_lineage.append(catalog_upstream)

        properties = None
        if custom_properties:
            properties = DatasetPropertiesClass(
                customProperties=custom_properties)

        if upstream_lineage:
            self.report.upstream_lineage[dataset_urn] = [
                u.dataset for u in upstream_lineage
            ]
        else:
            return None, properties

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=upstream_lineage),
        )

        return mcp, properties
Пример #10
0
    def _process_table(
        self,
        dataset_name: str,
        inspector: Inspector,
        schema: str,
        table: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
        columns = self._get_columns(dataset_name, inspector, schema, table)
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            dataset_name,
            self.config.platform_instance,
            self.config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[StatusClass(removed=False)],
        )
        if self.is_stateful_ingestion_configured():
            cur_checkpoint = self.get_current_checkpoint(
                self.get_default_ingestion_job_id())
            if cur_checkpoint is not None:
                checkpoint_state = cast(BaseSQLAlchemyCheckpointState,
                                        cur_checkpoint.state)
                checkpoint_state.add_table_urn(dataset_urn)

        description, properties, location_urn = self.get_table_properties(
            inspector, schema, table)
        dataset_properties = DatasetPropertiesClass(
            name=table,
            description=description,
            customProperties=properties,
        )
        dataset_snapshot.aspects.append(dataset_properties)

        if location_urn:
            external_upstream_table = UpstreamClass(
                dataset=location_urn,
                type=DatasetLineageTypeClass.COPY,
            )
            lineage_mcpw = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_snapshot.urn,
                aspectName="upstreamLineage",
                aspect=UpstreamLineage(upstreams=[external_upstream_table]),
            )
            lineage_wu = MetadataWorkUnit(
                id=
                f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
                mcp=lineage_mcpw,
            )
            yield lineage_wu

        pk_constraints: dict = inspector.get_pk_constraint(table, schema)
        foreign_keys = self._get_foreign_keys(dataset_urn, inspector, schema,
                                              table)
        schema_fields = self.get_schema_fields(dataset_name, columns,
                                               pk_constraints)
        schema_metadata = get_schema_metadata(
            self.report,
            dataset_name,
            self.platform,
            columns,
            pk_constraints,
            foreign_keys,
            schema_fields,
        )
        dataset_snapshot.aspects.append(schema_metadata)
        db_name = self.get_db_name(inspector)
        yield from self.add_table_to_schema_container(dataset_urn, db_name,
                                                      schema)
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
        dpi_aspect = self.get_dataplatform_instance_aspect(
            dataset_urn=dataset_urn)
        if dpi_aspect:
            yield dpi_aspect
        subtypes_aspect = MetadataWorkUnit(
            id=f"{dataset_name}-subtypes",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["table"]),
            ),
        )
        yield subtypes_aspect

        yield from self._get_domain_wu(
            dataset_name=dataset_name,
            entity_urn=dataset_urn,
            entity_type="dataset",
            sql_config=sql_config,
        )