Example #1
    def get_datajob_wu(self, node: Dict[str, Any],
                       job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataJob workunit for a component (node) in a Glue job.

        Parameters
        ----------
            node:
                Node from process_dataflow_graph()
            job:
                Job object from get_all_jobs()
        """
        mce = MetadataChangeEventClass(proposedSnapshot=DataJobSnapshotClass(
            urn=node["urn"],
            aspects=[
                DataJobInfoClass(
                    name=f"{job['Name']}:{node['NodeType']}-{node['Id']}",
                    type="GLUE",
                    customProperties={
                        **{x["Name"]: x["Value"]
                           for x in node["Args"]},
                        "transformType": node["NodeType"],
                        "nodeId": node["Id"],
                    },
                ),
                DataJobInputOutputClass(
                    inputDatasets=node["inputDatasets"],
                    outputDatasets=node["outputDatasets"],
                    inputDatajobs=node["inputDatajobs"],
                ),
            ],
        ))

        return MetadataWorkUnit(id=f'{job["Name"]}-{node["Id"]}', mce=mce)
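
For context, a minimal sketch of the inputs this method expects; the field names come from the code above, while the URN and values are made up and `source` stands for the enclosing Glue source instance:

node = {
    "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,my_job,PROD),transform-1)",
    "Id": "transform-1",
    "NodeType": "Transform",
    "Args": [{"Name": "mapping", "Value": "see-glue-console"}],
    "inputDatasets": [],
    "outputDatasets": [],
    "inputDatajobs": [],
}
job = {"Name": "my_job"}

wu = source.get_datajob_wu(node, job)  # `source`: hypothetical Glue source instance
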
Example #2
    def generate_data_input_output_mcp(
            self) -> Iterable[MetadataChangeProposalWrapper]:
        mcp = MetadataChangeProposalWrapper(
            entityType="datajob",
            entityUrn=str(self.urn),
            aspectName="dataJobInputOutput",
            aspect=DataJobInputOutputClass(
                inputDatasets=[str(urn) for urn in self.inlets],
                outputDatasets=[str(urn) for urn in self.outlets],
                inputDatajobs=[str(urn) for urn in self.upstream_urns],
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
        yield mcp

        # Force entity materialization: emit a status aspect for every
        # input/output dataset so the referenced entities exist in DataHub
        # even if nothing else has ingested them yet.
        for iolet in self.inlets + self.outlets:
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=str(iolet),
                aspectName="status",
                aspect=StatusClass(removed=False),
                changeType=ChangeTypeClass.UPSERT,
            )

            yield mcp
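
One way to consume this generator is to push each proposal straight to a DataHub GMS instance (a sketch: `datajob` is assumed to be an instance of the class defining this method, and the server address is a placeholder):

from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
for mcp in datajob.generate_data_input_output_mcp():
    emitter.emit(mcp)  # emits the lineage aspect, then a status aspect per iolet
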
Example #3
    def construct_job_workunits(
        self,
        job_urn: str,
        job_name: str,
        external_url: str,
        job_type: str,
        description: Optional[str],
        job_properties: Optional[Dict[str, str]] = None,
        inlets: List[str] = [],
        outlets: List[str] = [],
        inputJobs: List[str] = [],
        status: Optional[str] = None,
    ) -> Iterable[MetadataWorkUnit]:
        if job_properties:
            job_properties = {k: v for k, v in job_properties.items() if v is not None}

        mcp = MetadataChangeProposalWrapper(
            entityType="dataJob",
            entityUrn=job_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataJobInfo",
            aspect=DataJobInfoClass(
                name=job_name,
                type=job_type,
                description=description,
                customProperties=job_properties,
                externalUrl=external_url,
                status=status,
            ),
        )

        wu = MetadataWorkUnit(
            id=f"{NIFI}.{job_name}.{mcp.aspectName}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        yield wu

        # Sort for deterministic output without mutating the caller's lists
        # (or the shared mutable default arguments) in place.
        inlets = sorted(inlets)
        outlets = sorted(outlets)
        inputJobs = sorted(inputJobs)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataJob",
            entityUrn=job_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataJobInputOutput",
            aspect=DataJobInputOutputClass(
                inputDatasets=inlets, outputDatasets=outlets, inputDatajobs=inputJobs
            ),
        )

        wu = MetadataWorkUnit(
            id=f"{NIFI}.{job_name}.{mcp.aspectName}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        yield wu
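
A hypothetical call site inside the NiFi source's own workunit generator; the processor name, URL, and dataset URN below are illustrative only:

import datahub.emitter.mce_builder as builder

job_urn = builder.make_data_job_urn(
    orchestrator="nifi", flow_id="root_process_group", job_id="FetchS3Object"
)
yield from self.construct_job_workunits(
    job_urn=job_urn,
    job_name="FetchS3Object",
    external_url="https://localhost:8443/nifi/",
    job_type="PROCESSOR",
    description=None,
    inlets=["urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/raw,PROD)"],
)
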
Example #4
    def get_datajob_wu(
        self, node: Dict[str, Any], job: Dict[str, Any]
    ) -> MetadataWorkUnit:
        """
        Generate a DataJob workunit for a component (node) in a Glue job.

        Parameters
        ----------
            node:
                Node from process_dataflow_graph()
            job:
                Job object from get_all_jobs()
        """

        region = self.source_config.aws_region

        mce = MetadataChangeEventClass(
            proposedSnapshot=DataJobSnapshotClass(
                urn=node["urn"],
                aspects=[
                    DataJobInfoClass(
                        name=f"{job['Name']}:{node['NodeType']}-{node['Id']}",
                        type="GLUE",
                        # there's no way to view an individual job node by link, so just show the graph
                        externalUrl=f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job['Name']}/graph",
                        customProperties={
                            **{x["Name"]: x["Value"] for x in node["Args"]},
                            "transformType": node["NodeType"],
                            "nodeId": node["Id"],
                        },
                    ),
                    DataJobInputOutputClass(
                        inputDatasets=node["inputDatasets"],
                        outputDatasets=node["outputDatasets"],
                        inputDatajobs=node["inputDatajobs"],
                    ),
                ],
            )
        )

        return MetadataWorkUnit(id=f'{job["Name"]}-{node["Id"]}', mce=mce)
Example #5
    def generate_mce(self) -> MetadataChangeEventClass:
        job_mce = MetadataChangeEventClass(
            proposedSnapshot=DataJobSnapshotClass(
                urn=str(self.urn),
                aspects=[
                    DataJobInfoClass(
                        name=self.name if self.name is not None else self.id,
                        type=AzkabanJobTypeClass.COMMAND,
                        description=self.description,
                        customProperties=self.properties,
                        externalUrl=self.url,
                    ),
                    DataJobInputOutputClass(
                        inputDatasets=[str(urn) for urn in self.inlets],
                        outputDatasets=[str(urn) for urn in self.outlets],
                        inputDatajobs=[str(urn) for urn in self.upstream_urns],
                    ),
                    *self.generate_ownership_aspect(),
                    *self.generate_tags_aspect(),
                ],
            )
        )

        return job_mce
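
If the MCE needs to flow through an ingestion pipeline rather than being emitted directly, it can be wrapped in a workunit, mirroring Examples #1 and #4 (a sketch; `datajob` is an instance of the class above):

wu = MetadataWorkUnit(id=str(datajob.urn), mce=datajob.generate_mce())
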
Example #6
# Note that bar2 is an input as well as an output dataset, but some fields are inputs while other fields are outputs.
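
# The snippet relies on two URN helpers that are not shown here. A minimal
# sketch, assuming the datasets live on a postgres platform; the
# `fineGrainedLineages` list used below is likewise assumed to be built
# elsewhere from FineGrainedLineageClass objects.

import datahub.emitter.mce_builder as builder


def datasetUrn(tbl: str) -> str:
    return builder.make_dataset_urn(platform="postgres", name=tbl)


def fldUrn(tbl: str, fld: str) -> str:
    return builder.make_schema_field_urn(parent_urn=datasetUrn(tbl), field_path=fld)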

dataJobInputOutput = DataJobInputOutputClass(
    inputDatasets=[datasetUrn("bar2"),
                   datasetUrn("bar3"),
                   datasetUrn("bar4")],
    outputDatasets=[datasetUrn("bar"), datasetUrn("bar2")],
    inputDatajobs=None,
    inputDatasetFields=[
        fldUrn("bar2", "c1"),
        fldUrn("bar2", "c2"),
        fldUrn("bar2", "c3"),
        fldUrn("bar3", "c1"),
        fldUrn("bar3", "c2"),
        fldUrn("bar4", "c1"),
    ],
    outputDatasetFields=[
        fldUrn("bar", "c1"),
        fldUrn("bar", "c2"),
        fldUrn("bar", "c3"),
        fldUrn("bar", "c4"),
        fldUrn("bar", "c5"),
        fldUrn("bar", "c6"),
        fldUrn("bar", "c7"),
        fldUrn("bar", "c9"),
        fldUrn("bar2", "c9"),
    ],
    fineGrainedLineages=fineGrainedLineages,
)

dataJobLineageMcp = MetadataChangeProposalWrapper(
    entityType="datajob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=jobUrn,  # assumed: URN of the data job that owns this lineage
    aspectName="dataJobInputOutput",
    aspect=dataJobInputOutput,
)
Example #7
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:

        jobs = self.get_all_jobs()

        processed_jobs: Dict[str, SageMakerJob] = {}

        # first pass: process jobs and collect datasets used
        for job in jobs:

            job_type = SAGEMAKER_JOB_TYPES[job["type"]]
            job_name = job[job_type.list_name_key]

            job_details = self.get_job_details(job_name, job["type"])

            processed_job = getattr(self, job_type.processor)(job_details)
            processed_jobs[processed_job.job_snapshot.urn] = processed_job

        all_datasets = {}

        # second pass:
        #   - move output jobs to inputs
        #   - aggregate i/o datasets
        for job_urn in sorted(processed_jobs):
            processed_job = processed_jobs[job_urn]

            for output_job_urn in processed_job.output_jobs:
                # invert the edge: the job listed as an output gains this
                # job as one of its inputs
                processed_jobs[output_job_urn].input_jobs.add(job_urn)

            all_datasets.update(processed_job.input_datasets)
            all_datasets.update(processed_job.output_datasets)

        # yield datasets
        for dataset_urn, dataset in all_datasets.items():

            dataset_snapshot = DatasetSnapshot(
                urn=dataset_urn,
                aspects=[],
            )
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={k: str(v) for k, v in dataset.items()},
                    tags=[],
                )
            )
            dataset_mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            dataset_wu = MetadataWorkUnit(
                id=dataset_urn,
                mce=dataset_mce,
            )
            self.report.report_dataset_scanned()
            self.report.report_workunit(dataset_wu)
            yield dataset_wu

        # third pass: construct and yield MCEs
        for job_urn in sorted(processed_jobs):

            processed_job = processed_jobs[job_urn]
            job_snapshot = processed_job.job_snapshot

            flow_urn = make_sagemaker_flow_urn(
                processed_job.job_type, processed_job.job_name, self.env
            )

            # create flow for each job
            flow_mce = MetadataChangeEvent(
                proposedSnapshot=DataFlowSnapshotClass(
                    urn=flow_urn,
                    aspects=[
                        DataFlowInfoClass(
                            name=processed_job.job_name,
                        ),
                    ],
                )
            )
            flow_wu = MetadataWorkUnit(
                id=flow_urn,
                mce=flow_mce,
            )
            self.report.report_workunit(flow_wu)
            yield flow_wu

            job_snapshot.aspects.append(
                DataJobInputOutputClass(
                    inputDatasets=sorted(processed_job.input_datasets),
                    outputDatasets=sorted(processed_job.output_datasets),
                    inputDatajobs=sorted(processed_job.input_jobs),
                )
            )

            job_mce = MetadataChangeEvent(proposedSnapshot=job_snapshot)
            job_wu = MetadataWorkUnit(
                id=job_urn,
                mce=job_mce,
            )
            self.report.report_job_scanned()
            self.report.report_workunit(job_wu)
            yield job_wu
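
The second pass above inverts job-to-job edges: if job A lists job B among its output jobs, then B must gain A as an input job. A self-contained toy illustration (the URNs are made up):

output_jobs = {"urn:jobA": {"urn:jobB"}, "urn:jobB": set()}  # A feeds B
input_jobs = {"urn:jobA": set(), "urn:jobB": set()}

for job_urn, outputs in output_jobs.items():
    for output_job_urn in outputs:
        input_jobs[output_job_urn].add(job_urn)  # B records A as an input

assert input_jobs["urn:jobB"] == {"urn:jobA"}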