Example #1
    def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
        mcp = MetadataChangeProposalWrapper(
            entityType="dataflow",
            entityUrn=str(self.urn),
            aspectName="dataFlowInfo",
            aspect=DataFlowInfoClass(
                name=self.name if self.name is not None else self.id,
                description=self.description,
                customProperties=self.properties,
                externalUrl=self.url,
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
        yield mcp

        for owner in self.generate_ownership_aspect():
            mcp = MetadataChangeProposalWrapper(
                entityType="dataflow",
                entityUrn=str(self.urn),
                aspectName="ownership",
                aspect=owner,
                changeType=ChangeTypeClass.UPSERT,
            )
            yield mcp

        for tag in self.generate_tags_aspect():
            mcp = MetadataChangeProposalWrapper(
                entityType="dataflow",
                entityUrn=str(self.urn),
                aspectName="globalTags",
                aspect=tag,
                changeType=ChangeTypeClass.UPSERT,
            )
            yield mcp
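The MCPs yielded by generate_mcp() can be sent to DataHub through a REST emitter. A minimal sketch, assuming a DataFlow entity instance named flow (an object of the class that defines generate_mcp()) and a GMS endpoint at http://localhost:8080; both names are illustrative assumptions, not part of the example above:

from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter("http://localhost:8080")

# `flow` is a hypothetical instance of the entity class shown above;
# each yielded aspect (dataFlowInfo, ownership, globalTags) is upserted separately.
for mcp in flow.generate_mcp():
    emitter.emit_mcp(mcp)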
Example #2
    def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataFlow workunit for a Glue job.

        Parameters
        ----------
            flow_urn:
                URN for the flow
            job:
                Job object from get_all_jobs()
        """
        mce = MetadataChangeEventClass(
            proposedSnapshot=DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    DataFlowInfoClass(
                        name=job["Name"],
                        description=job["Description"],
                        # specify a few Glue-specific properties
                        customProperties={
                            "role": job["Role"],
                            "created": str(job["CreatedOn"]),
                            "modified": str(job["LastModifiedOn"]),
                            "command": job["Command"]["ScriptLocation"],
                        },
                    ),
                ],
            )
        )

        return MetadataWorkUnit(id=job["Name"], mce=mce)
Example #3
    def generate_mce(self) -> MetadataChangeEventClass:
        flow_mce = MetadataChangeEventClass(
            proposedSnapshot=DataFlowSnapshotClass(
                urn=str(self.urn),
                aspects=[
                    DataFlowInfoClass(
                        name=self.id,
                        description=self.description,
                        customProperties=self.properties,
                        externalUrl=self.url,
                    ),
                    *self.generate_ownership_aspect(),
                    *self.generate_tags_aspect(),
                ],
            ))

        return flow_mce
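Example #3 returns a single MCE rather than a stream of MCPs, so the whole snapshot (info, ownership, and tags) can be emitted in one call. A short sketch under the same illustrative assumptions (a flow instance and a local GMS endpoint):

from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter("http://localhost:8080")

# `flow` is a hypothetical instance of the entity class shown above.
emitter.emit_mce(flow.generate_mce())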
Example #4
    def get_dataflow_wu(self, flow_urn: str,
                        job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataFlow workunit for a Glue job.

        Parameters
        ----------
            flow_urn:
                URN for the flow
            job:
                Job object from get_all_jobs()
        """

        region = self.source_config.aws_region

        custom_props = {
            "role": job["Role"],
        }

        if job.get("CreatedOn") is not None:
            custom_props["created"] = str(job["CreatedOn"])

        if job.get("LastModifiedOn") is not None:
            custom_props["modified"] = str(job["LastModifiedOn"])

        command = job.get("Command", {}).get("ScriptLocation")
        if command is not None:
            custom_props["command"] = command

        mce = MetadataChangeEventClass(proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job.get("Description"),
                    externalUrl=f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job['Name']}/graph",
                    # specify a few Glue-specific properties
                    customProperties=custom_props,
                ),
            ],
        ))

        return MetadataWorkUnit(id=job["Name"], mce=mce)
Example #5
    def construct_flow_workunits(
        self,
        flow_urn: str,
        flow_name: str,
        external_url: str,
        flow_properties: Optional[Dict[str, str]] = None,
    ) -> Iterable[MetadataWorkUnit]:
        mcp = MetadataChangeProposalWrapper(
            entityType="dataFlow",
            entityUrn=flow_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataFlowInfo",
            aspect=DataFlowInfoClass(
                name=flow_name,
                customProperties=flow_properties,
                externalUrl=external_url,
            ),
        )
        for proposal in [mcp]:
            wu = MetadataWorkUnit(
                id=f"{NIFI}.{flow_name}.{proposal.aspectName}", mcp=proposal
            )
            self.report.report_workunit(wu)
            yield wu
Example #6
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:

        jobs = self.get_all_jobs()

        processed_jobs: Dict[str, SageMakerJob] = {}

        # first pass: process jobs and collect datasets used
        for job in jobs:

            job_type = SAGEMAKER_JOB_TYPES[job["type"]]
            job_name = job[job_type.list_name_key]

            job_details = self.get_job_details(job_name, job["type"])

            processed_job = getattr(self, job_type.processor)(job_details)
            processed_jobs[processed_job.job_snapshot.urn] = processed_job

        all_datasets = {}

        # second pass:
        #   - move output jobs to inputs
        #   - aggregate i/o datasets
        for job_urn in sorted(processed_jobs):
            processed_job = processed_jobs[job_urn]

            for output_job_urn in processed_job.output_jobs:
                processed_jobs[output_job_urn].input_jobs.add(job_urn)

            all_datasets.update(processed_job.input_datasets)
            all_datasets.update(processed_job.output_datasets)

        # yield datasets
        for dataset_urn, dataset in all_datasets.items():

            dataset_snapshot = DatasetSnapshot(
                urn=dataset_urn,
                aspects=[],
            )
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={k: str(v) for k, v in dataset.items()},
                    tags=[],
                )
            )
            dataset_mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            dataset_wu = MetadataWorkUnit(
                id=dataset_urn,
                mce=dataset_mce,
            )
            self.report.report_dataset_scanned()
            self.report.report_workunit(dataset_wu)
            yield dataset_wu

        # third pass: construct and yield MCEs
        for job_urn in sorted(processed_jobs):

            processed_job = processed_jobs[job_urn]
            job_snapshot = processed_job.job_snapshot

            flow_urn = make_sagemaker_flow_urn(
                processed_job.job_type, processed_job.job_name, self.env
            )

            # create flow for each job
            flow_mce = MetadataChangeEvent(
                proposedSnapshot=DataFlowSnapshotClass(
                    urn=flow_urn,
                    aspects=[
                        DataFlowInfoClass(
                            name=processed_job.job_name,
                        ),
                    ],
                )
            )
            flow_wu = MetadataWorkUnit(
                id=flow_urn,
                mce=flow_mce,
            )
            self.report.report_workunit(flow_wu)
            yield flow_wu

            job_snapshot.aspects.append(
                DataJobInputOutputClass(
                    inputDatasets=sorted(list(processed_job.input_datasets.keys())),
                    outputDatasets=sorted(list(processed_job.output_datasets.keys())),
                    inputDatajobs=sorted(list(processed_job.input_jobs)),
                )
            )

            job_mce = MetadataChangeEvent(proposedSnapshot=job_snapshot)
            job_wu = MetadataWorkUnit(
                id=job_urn,
                mce=job_mce,
            )
            self.report.report_job_scanned()
            self.report.report_workunit(job_wu)
            yield job_wu
Example #7
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DataFlowInfoClass,
    DataJobInfoClass,
)

# Construct the DataJobInfo aspect with the job -> flow lineage.
dataflow_urn = builder.make_data_flow_urn(orchestrator="airflow",
                                          flow_id="flow_old_api",
                                          cluster="prod")

dataflow_info = DataFlowInfoClass(name="LowLevelApiFlow")

dataflow_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataflow",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataflow_urn,
    aspectName="dataFlowInfo",
    aspect=dataflow_info,
)

datajob_info = DataJobInfoClass(name="My Job 1",
                                type="AIRFLOW",
                                flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(orchestrator="airflow",