示例#1
0
文件: enrich.py 项目: zmyer/datahub
def create_editable_schema_info_aspect_mce(
    directive: Directive,
) -> MetadataChangeEventClass:
    """Build an MCE carrying an empty editable-schema aspect for a dataset.

    The dataset URN is derived from ``directive.table``. The aspect's
    ``created`` and ``lastModified`` stamps are each read from the clock at
    call time (epoch milliseconds) and attributed to the ``datahub`` user.
    No per-field info is attached.
    """

    def _stamp_now() -> AuditStampClass:
        # time.time() returns seconds; the stamp carries milliseconds.
        return AuditStampClass(
            time=int(time.time() * 1000),
            actor="urn:li:corpuser:datahub",
        )

    target_urn = dataset_name_to_urn(directive.table)
    created_stamp = _stamp_now()
    modified_stamp = _stamp_now()
    schema_aspect = EditableSchemaMetadataClass(
        created=created_stamp,
        lastModified=modified_stamp,
        editableSchemaFieldInfo=[],
    )
    snapshot = DatasetSnapshotClass(urn=target_urn, aspects=[schema_aspect])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
示例#2
0
def make_glossary_terms_aspect_from_urn_list(
        term_urns: List[str]) -> GlossaryTerms:
    """Wrap a list of glossary-term URNs in a ``GlossaryTerms`` aspect.

    Every URN is asserted to start with ``urn:li:glossaryTerm:``. The
    aspect's audit stamp uses the current time in epoch milliseconds and the
    ``datahub`` user as actor.
    """
    assert all(urn.startswith("urn:li:glossaryTerm:") for urn in term_urns)

    associations = [GlossaryTermAssociationClass(urn) for urn in term_urns]
    stamp = AuditStampClass(
        time=int(time.time() * 1000),
        actor="urn:li:corpuser:datahub",
    )
    return GlossaryTerms(associations, stamp)
示例#3
0
def create_metadata_work_unit(timestamp):
    """Build the fixed work unit for the ``datalake_grilled.Barbeque`` table.

    ``timestamp`` is used as the ``time`` of every audit stamp in the
    produced aspects. The returned ``MetadataWorkUnit`` wraps an MCE whose
    dataset snapshot holds, in order: status, ownership, dataset properties
    and schema metadata.
    """
    etl_stamp_args = {"time": timestamp, "actor": "urn:li:corpuser:etl"}

    snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    status = Status(removed=False)

    ownership = OwnershipClass(
        owners=[
            OwnerClass(
                owner="urn:li:corpuser:Susan",
                type=OwnershipTypeClass.DATAOWNER,
            )
        ],
        lastModified=AuditStampClass(
            time=timestamp,
            actor="urn:li:corpuser:datahub",
        ),
    )

    properties = DatasetPropertiesClass(description="Grilled Food")

    # The schema consists of a single nullable numeric column.
    size_field = SchemaField(
        fieldPath="Size",
        nativeDataType="int",
        type=SchemaFieldDataType(type=NumberTypeClass()),
        description="Maximum attendees permitted",
        nullable=True,
        recursive=False,
    )
    schema = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=[size_field],
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(**etl_stamp_args),
        lastModified=AuditStamp(**etl_stamp_args),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )

    for aspect in (status, ownership, properties, schema):
        snapshot.aspects.append(aspect)

    return MetadataWorkUnit(
        id="glue-datalake_grilled.Barbeque",
        mce=MetadataChangeEvent(proposedSnapshot=snapshot),
    )
示例#4
0
    def _make_dashboard_and_chart_mces(
        self, looker_dashboard: LookerDashboard
    ) -> List[MetadataChangeEvent]:
        """Turn one Looker dashboard into chart MCEs plus a dashboard MCE.

        One chart MCE is built per dashboard element via
        ``self._make_chart_mce``. The dashboard snapshot receives, in order,
        a dashboard-info aspect (which lists the chart URNs), an ownership
        aspect naming the configured actor as data owner, and a status aspect
        mirroring ``looker_dashboard.is_deleted``.

        Returns:
            The chart MCEs followed by the dashboard MCE.
        """
        actor = self.source_config.actor
        sys_time = get_sys_time()

        chart_mces = []
        for element in looker_dashboard.dashboard_elements:
            chart_mces.append(self._make_chart_mce(element))

        dashboard_urn = f"urn:li:dashboard:({self.source_config.platform_name},{looker_dashboard.get_urn_dashboard_id()})"
        snapshot = DashboardSnapshot(urn=dashboard_urn, aspects=[])

        audit_stamps = ChangeAuditStamps(
            created=AuditStamp(time=sys_time, actor=actor),
            lastModified=AuditStamp(time=sys_time, actor=actor),
        )

        # A missing description is emitted as an empty string.
        description = looker_dashboard.description
        if description is None:
            description = ""

        info_aspect = DashboardInfoClass(
            description=description,
            title=looker_dashboard.title,
            charts=[chart_mce.proposedSnapshot.urn for chart_mce in chart_mces],
            lastModified=audit_stamps,
            dashboardUrl=looker_dashboard.url(self.source_config.base_url),
        )
        snapshot.aspects.append(info_aspect)

        ownership_aspect = OwnershipClass(
            owners=[OwnerClass(owner=actor, type=OwnershipTypeClass.DATAOWNER)],
            lastModified=AuditStampClass(time=sys_time, actor=actor),
        )
        snapshot.aspects.append(ownership_aspect)

        snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

        return [*chart_mces, MetadataChangeEvent(proposedSnapshot=snapshot)]
示例#5
0
def create_lineage_aspect_mce(
        directive: Directive) -> MetadataChangeEventClass:
    """Build an MCE declaring the upstream lineage of ``directive.table``.

    Each name in ``directive.depends_on`` becomes a TRANSFORMED upstream
    whose audit stamp is read from the clock (epoch milliseconds) when that
    entry is built, with the ``datahub`` user as actor.
    """
    downstream_urn = dataset_name_to_urn(directive.table)

    upstream_edges = []
    for upstream in directive.depends_on:
        edge = UpstreamClass(
            dataset=dataset_name_to_urn(upstream),
            type=DatasetLineageTypeClass.TRANSFORMED,
            auditStamp=AuditStampClass(
                time=int(time.time() * 1000),
                actor="urn:li:corpuser:datahub",
            ),
        )
        upstream_edges.append(edge)

    snapshot = DatasetSnapshotClass(
        urn=downstream_urn,
        aspects=[UpstreamLineageClass(upstreams=upstream_edges)],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
示例#6
0
 def generate_ownership_aspect(self):
     """Return a one-element list holding this object's ownership aspect.

     Every entry of ``self.owners`` (none when it is falsy) becomes a
     DEVELOPER owner whose ownership source is of type SERVICE. The
     ``lastModified`` stamp has time 0 and uses the user URN built from
     ``self.orchestrator`` as actor.
     """
     owner_entries = []
     for owner in self.owners or []:
         # The source `url` is left unset (a `dag.filepath` value was
         # previously commented out here).
         source = OwnershipSourceClass(type=OwnershipSourceTypeClass.SERVICE)
         owner_entries.append(
             OwnerClass(
                 owner=builder.make_user_urn(owner),
                 type=OwnershipTypeClass.DEVELOPER,
                 source=source,
             )
         )

     modified_stamp = AuditStampClass(
         time=0,
         actor=builder.make_user_urn(self.orchestrator),
     )
     return [OwnershipClass(owners=owner_entries, lastModified=modified_stamp)]
示例#7
0
    def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
        """Attach the configured glossary terms to a dataset MCE.

        MCEs whose proposed snapshot is not a ``DatasetSnapshotClass`` are
        returned untouched. Otherwise the terms returned by
        ``self.config.get_terms_to_add`` are appended to the glossary-terms
        aspect obtained from ``builder.get_or_add_aspect`` (presumably the
        existing aspect when one is present, else the empty default passed
        here; confirm against the builder).

        Returns:
            The same ``mce`` object that was passed in.
        """
        if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
            return mce
        terms_to_add = self.config.get_terms_to_add(mce.proposedSnapshot)
        if terms_to_add:
            terms = builder.get_or_add_aspect(
                mce,
                GlossaryTermsClass(
                    terms=[],
                    auditStamp=AuditStampClass(
                        time=builder.get_sys_time(),
                        # The user entity type is spelled `corpuser`
                        # (all lower case), matching the other actor URNs
                        # used throughout this code.
                        actor="urn:li:corpuser:restEmitter",
                    ),
                ),
            )
            terms.terms.extend(terms_to_add)

        return mce
示例#8
0
 def get_owner(time: int) -> OwnershipClass:
     """Build the ownership aspect from the enclosing ``table`` mapping.

     When ``table`` has a truthy ``"Owner"`` entry it becomes the single
     data owner; otherwise the owner list is empty. ``time`` is used as the
     ``lastModified`` stamp time, with the ``datahub`` user as actor.
     """
     owner_name = table.get("Owner")
     owner_entries = (
         [
             OwnerClass(
                 owner=f"urn:li:corpuser:{owner_name}",
                 type=OwnershipTypeClass.DATAOWNER,
             )
         ]
         if owner_name
         else []
     )
     modified_stamp = AuditStampClass(
         time=time,
         actor="urn:li:corpuser:datahub",
     )
     return OwnershipClass(owners=owner_entries, lastModified=modified_stamp)
示例#9
0
def create_ownership_aspect_mce(
        directive: Directive) -> MetadataChangeEventClass:
    """Build an MCE declaring the data owners of ``directive.table``.

    Each entry of ``directive.owners`` is cleaned with ``clean_owner_name``,
    converted with ``owner_name_to_urn`` and registered as a DATAOWNER. The
    ``lastModified`` stamp carries the current time in epoch milliseconds
    and the ``datahub`` user as actor.
    """
    dataset_urn = dataset_name_to_urn(directive.table)

    owner_entries = []
    for owner in directive.owners:
        owner_urn = owner_name_to_urn(clean_owner_name(owner))
        owner_entries.append(
            OwnerClass(owner=owner_urn, type=OwnershipTypeClass.DATAOWNER)
        )

    ownership = OwnershipClass(
        owners=owner_entries,
        lastModified=AuditStampClass(
            time=int(time.time() * 1000),
            actor="urn:li:corpuser:datahub",
        ),
    )
    snapshot = DatasetSnapshotClass(urn=dataset_urn, aspects=[ownership])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
示例#10
0
    def init_dataset(
        self, endpoint_k: str, endpoint_dets: dict
    ) -> Tuple[DatasetSnapshot, str]:
        """Create the dataset snapshot describing one API endpoint.

        Args:
            endpoint_k: Endpoint path. Its first character is dropped and
                every ``/`` is replaced by ``.`` to form the dataset name;
                one trailing ``.`` is removed, and an empty result becomes
                ``"root"``.
            endpoint_dets: Endpoint details; the ``"description"`` and
                ``"tags"`` entries are read.

        Returns:
            ``(dataset_snapshot, dataset_name)``. The snapshot carries, in
            order, a properties aspect, a global-tags aspect and an
            institutional-memory aspect linking to the endpoint URL.

        Raises:
            KeyError: If ``endpoint_dets`` lacks ``"description"`` or
                ``"tags"``.
        """
        config = self.config

        dataset_name = endpoint_k[1:].replace("/", ".")
        if not dataset_name:
            dataset_name = "root"
        elif dataset_name.endswith("."):
            dataset_name = dataset_name[:-1]

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
            aspects=[],
        )

        # adding description
        dataset_properties = DatasetPropertiesClass(
            description=endpoint_dets["description"], customProperties={}
        )
        dataset_snapshot.aspects.append(dataset_properties)

        # adding tags
        tag_urns = [make_tag_urn(t) for t in endpoint_dets["tags"]]
        tag_associations = [TagAssociationClass(t) for t in tag_urns]
        dataset_snapshot.aspects.append(GlobalTagsClass(tag_associations))

        # the link will appear in the "documentation"
        link_url = clean_url(config.url + self.url_basepath + endpoint_k)
        link_description = "Link to call for the dataset."
        # Audit stamp times are expressed in milliseconds since the epoch;
        # time.time() returns seconds, so scale before truncating.
        creation = AuditStampClass(
            time=int(time.time() * 1000),
            actor="urn:li:corpuser:etl",
            impersonator=None,
        )
        link_metadata = InstitutionalMemoryMetadataClass(
            url=link_url, description=link_description, createStamp=creation
        )
        inst_memory = InstitutionalMemoryClass([link_metadata])
        dataset_snapshot.aspects.append(inst_memory)

        return dataset_snapshot, dataset_name
示例#11
0
    def transform_one(
            self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
        """Attach the configured owners to a dataset MCE.

        MCEs that do not carry a ``DatasetSnapshotClass`` are returned as-is,
        as are dataset MCEs for which the config yields no owners. Otherwise
        the owners are appended to the ownership aspect obtained from
        ``builder.get_or_add_aspect``.

        Returns:
            The same ``mce`` object that was passed in.
        """
        snapshot = mce.proposedSnapshot
        if not isinstance(snapshot, DatasetSnapshotClass):
            return mce

        owners_to_add = self.config.get_owners_to_add(snapshot)
        if not owners_to_add:
            return mce

        # Empty default handed to the builder; stamped with the current
        # system time and the configured default actor.
        default_ownership = OwnershipClass(
            owners=[],
            lastModified=AuditStampClass(
                time=builder.get_sys_time(),
                actor=self.config.default_actor,
            ),
        )
        ownership = builder.get_or_add_aspect(mce, default_ownership)
        ownership.owners.extend(owners_to_add)
        return mce
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    actor: str = make_user_urn("datahub"),
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Build an MCE linking ``downstream_urn`` to each of ``upstream_urns``.

    All upstream entries share one system timestamp, read once at call time,
    and are stamped with ``actor`` and typed with ``lineage_type``.
    """
    sys_time = get_sys_time()

    upstream_edges = []
    for upstream_urn in upstream_urns:
        upstream_edges.append(
            UpstreamClass(
                auditStamp=AuditStampClass(time=sys_time, actor=actor),
                dataset=upstream_urn,
                type=lineage_type,
            )
        )

    lineage_aspect = UpstreamLineageClass(upstreams=upstream_edges)
    snapshot = DatasetSnapshotClass(urn=downstream_urn, aspects=[lineage_aspect])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
示例#13
0
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# The dataset to annotate and the glossary term to attach to it.
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)

# time=0 records that the moment the term was applied is unknown.
# Use int(time.time() * 1000) instead to stamp it with the time this script runs.
unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")

# Build a brand new terms aspect holding just the one association.
terms_aspect = GlossaryTermsClass(
    terms=[term_association_to_add],
    auditStamp=unknown_audit_stamp,
)

# Wrap the aspect in an UPSERT proposal for the dataset.
event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="glossaryTerms",
    aspect=terms_aspect,
)

# Send the proposal to the GMS REST endpoint.
gms_endpoint = "http://localhost:8080"
rest_emitter = DatahubRestEmitter(gms_server=gms_endpoint)
rest_emitter.emit(event)
示例#14
0
                    fieldInfo.globalTags.tags.append(tag_association_to_add)
                    need_write = True
            else:
                fieldInfo.globalTags = tags_aspect_to_set
                need_write = True

    if not field_match:
        # this field isn't present in the editable schema metadata aspect, add it
        field_info = field_info_to_set
        current_editable_schema_metadata.editableSchemaFieldInfo.append(field_info)
        need_write = True

else:
    # create a brand new editable schema metadata aspect
    now = int(time.time() * 1000)  # milliseconds since epoch
    current_timestamp = AuditStampClass(time=now, actor="urn:li:corpuser:ingestion")
    current_editable_schema_metadata = EditableSchemaMetadataClass(
        editableSchemaFieldInfo=[field_info_to_set],
        created=current_timestamp,
    )
    need_write = True

if need_write:
    # Something changed (or the aspect is new): upsert the editable schema
    # metadata aspect for the dataset.
    graph.emit(
        MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="editableSchemaMetadata",
            aspect=current_editable_schema_metadata,
        )
    )
示例#15
0
    def process_dataflow_node(
        self,
        node: Dict[str, Any],
        flow_urn: str,
        new_dataset_ids: List[str],
        new_dataset_mces: List[MetadataChangeEvent],
        s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
    ) -> Dict[str, Any]:
        """Assign a URN to one dataflow node and register new S3 datasets.

        ``DataSource``/``DataSink`` nodes are datasets: a node whose args name
        a ``database`` and ``table_name`` gets a Glue dataset URN, and a node
        whose ``connection_type`` is ``"s3"`` gets an S3 dataset URN plus a
        new dataset MCE. Any other node type is a transformation and gets a
        data-job URN under ``flow_urn``.

        Args:
            node: Node description; ``NodeType``, ``Id`` and (for dataset
                nodes) ``Args`` are read. Each arg's ``Value`` is parsed as
                JSON.
            flow_urn: URN of the flow that transformation nodes belong to.
            new_dataset_ids: Appended to with ``"<NodeType>-<Id>"`` for every
                S3 dataset created here.
            new_dataset_mces: Appended to with the MCE of every S3 dataset
                created here.
            s3_formats: Looked up by S3 path (first five characters and one
                trailing slash removed). When more than one format is
                recorded for a path, the node's format is appended to the
                dataset name.

        Returns:
            A copy of ``node`` extended with ``urn`` and empty
            ``inputDatajobs``, ``inputDatasets`` and ``outputDatasets``
            lists.

        Raises:
            ValueError: If a dataset node is neither a Glue table nor an S3
                location.
        """
        node_type = node["NodeType"]

        if node_type not in ("DataSource", "DataSink"):
            # anything that is not a dataset is a transformation job
            node_urn = mce_builder.make_data_job_urn_with_flow(
                flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}')

        else:
            node_args = {}
            for arg in node["Args"]:
                node_args[arg["Name"]] = json.loads(arg["Value"])

            if {"database", "table_name"} <= node_args.keys():
                # Glue table: no MCE is created here because the table is
                # already covered when Glue tables themselves are ingested.
                full_table_name = f"{node_args['database']}.{node_args['table_name']}"
                node_urn = f"urn:li:dataset:(urn:li:dataPlatform:glue,{full_table_name},{self.env})"

            elif node_args.get("connection_type") != "s3":
                raise ValueError(
                    f"Unrecognized Glue data object type: {node_args}")

            else:
                # drop the S3 prefix (s3://) and a single trailing slash
                s3_name = node_args["connection_options"]["path"][5:]
                if s3_name.endswith("/"):
                    s3_name = s3_name[:-1]

                # disambiguate by format only when the path has several
                has_multiple_formats = len(s3_formats[s3_name]) > 1
                if has_multiple_formats:
                    node_urn = f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name}_{node_args.get('format')},{self.env})"
                else:
                    node_urn = (
                        f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{self.env})"
                    )

                snapshot = DatasetSnapshot(urn=node_urn, aspects=[])
                snapshot.aspects.append(Status(removed=False))

                ownership = OwnershipClass(
                    owners=[],
                    lastModified=AuditStampClass(
                        time=mce_builder.get_sys_time(),
                        actor="urn:li:corpuser:datahub",
                    ),
                )
                snapshot.aspects.append(ownership)

                # every node argument is recorded as a stringified property
                properties = DatasetPropertiesClass(
                    customProperties={
                        key: str(value) for key, value in node_args.items()
                    },
                    tags=[],
                )
                snapshot.aspects.append(properties)

                new_dataset_mces.append(
                    MetadataChangeEvent(proposedSnapshot=snapshot))
                new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

        enriched = dict(node)
        enriched["urn"] = node_urn
        # the three lists below are filled in after traversing edges
        enriched["inputDatajobs"] = []
        enriched["inputDatasets"] = []
        enriched["outputDatasets"] = []
        return enriched
                description=
                "This is the zipcode of the address. Specified using extended form and limited to addresses in the United States",
                recursive=False,  # Unused field, can omit
                # It is rare to attach tags to fields as part of the technical schema unless you are purely reflecting state that exists in the source system.
                # For an editable (in UI) version of this, use the editableSchemaMetadata aspect
                globalTags=GlobalTagsClass(
                    tags=[TagAssociationClass(tag=make_tag_urn("location"))]),
                # It is rare to attach glossary terms to fields as part of the technical schema unless you are purely reflecting state that exists in the source system.
                # For an editable (in UI) version of this, use the editableSchemaMetadata aspect
                glossaryTerms=GlossaryTermsClass(
                    terms=[
                        GlossaryTermAssociationClass(
                            urn=make_term_urn("Classification.PII"))
                    ],
                    auditStamp=
                    AuditStampClass(  # represents the time when this term was attached to this field?
                        time=
                        0,  # time in milliseconds, leave as 0 if no time of association is known
                        actor=
                        "urn:li:corpuser:ingestion",  # if this is a system provided tag, use a bot user id like ingestion
                    ),
                ),
            )
        ],
    ),
)

# Send the event built above to the GMS REST endpoint.
gms_server = "http://localhost:8080"
rest_emitter = DatahubRestEmitter(gms_server=gms_server)
rest_emitter.emit(event)