def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
    mcp = MetadataChangeProposalWrapper(
        entityType="dataflow",
        entityUrn=str(self.urn),
        aspectName="dataFlowInfo",
        aspect=DataFlowInfoClass(
            name=self.name if self.name is not None else self.id,
            description=self.description,
            customProperties=self.properties,
            externalUrl=self.url,
        ),
        changeType=ChangeTypeClass.UPSERT,
    )
    yield mcp

    for owner in self.generate_ownership_aspect():
        mcp = MetadataChangeProposalWrapper(
            entityType="dataflow",
            entityUrn=str(self.urn),
            aspectName="ownership",
            aspect=owner,
            changeType=ChangeTypeClass.UPSERT,
        )
        yield mcp

    for tag in self.generate_tags_aspect():
        mcp = MetadataChangeProposalWrapper(
            entityType="dataflow",
            entityUrn=str(self.urn),
            aspectName="globalTags",
            aspect=tag,
            changeType=ChangeTypeClass.UPSERT,
        )
        yield mcp
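# A minimal sketch of consuming generate_mcp() above: each yielded proposal upserts one
# aspect (info, ownership, tags) separately. `flow` stands in for an instance of the
# entity class that defines the method, and the GMS URL is an illustrative default.
from datahub.emitter.rest_emitter import DatahubRestEmitter

def emit_flow_mcps(flow) -> None:
    emitter = DatahubRestEmitter("http://localhost:8080")
    for mcp in flow.generate_mcp():
        emitter.emit_mcp(mcp)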
def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataFlow workunit for a Glue job.

    Parameters
    ----------
    flow_urn:
        URN for the flow
    job:
        Job object from get_all_jobs()
    """
    mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job["Description"],
                    # specify a few Glue-specific properties
                    customProperties={
                        "role": job["Role"],
                        "created": str(job["CreatedOn"]),
                        "modified": str(job["LastModifiedOn"]),
                        "command": job["Command"]["ScriptLocation"],
                    },
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=job["Name"], mce=mce)
def generate_mce(self) -> MetadataChangeEventClass:
    flow_mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=str(self.urn),
            aspects=[
                DataFlowInfoClass(
                    name=self.id,
                    description=self.description,
                    customProperties=self.properties,
                    externalUrl=self.url,
                ),
                *self.generate_ownership_aspect(),
                *self.generate_tags_aspect(),
            ],
        )
    )
    return flow_mce
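# The MCE counterpart of the MCP sketch above: one snapshot carries the info, ownership,
# and tag aspects together and is emitted in a single call. `flow` and the GMS URL are
# again illustrative stand-ins.
from datahub.emitter.rest_emitter import DatahubRestEmitter

def emit_flow_mce(flow) -> None:
    emitter = DatahubRestEmitter("http://localhost:8080")
    emitter.emit_mce(flow.generate_mce())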
def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataFlow workunit for a Glue job.

    Parameters
    ----------
    flow_urn:
        URN for the flow
    job:
        Job object from get_all_jobs()
    """
    region = self.source_config.aws_region

    custom_props = {
        "role": job["Role"],
    }
    if job.get("CreatedOn") is not None:
        custom_props["created"] = str(job["CreatedOn"])
    if job.get("LastModifiedOn") is not None:
        custom_props["modified"] = str(job["LastModifiedOn"])

    command = job.get("Command", {}).get("ScriptLocation")
    if command is not None:
        custom_props["command"] = command

    mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job.get("Description"),
                    externalUrl=f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job['Name']}/graph",
                    # specify a few Glue-specific properties
                    customProperties=custom_props,
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=job["Name"], mce=mce)
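# Illustrative shape of the `job` dict this method expects (the structure returned by
# Glue's get_jobs API, which get_all_jobs() is expected to yield); every value below is
# made up for the example.
example_job = {
    "Name": "nyc-taxi-etl",
    "Description": "Daily taxi trip aggregation",
    "Role": "arn:aws:iam::123456789012:role/GlueJobRole",
    "CreatedOn": "2021-06-01 12:00:00",
    "LastModifiedOn": "2021-06-10 08:30:00",
    "Command": {
        "Name": "glueetl",
        "ScriptLocation": "s3://my-bucket/scripts/nyc_taxi_etl.py",
    },
}
# wu = source.get_dataflow_wu(flow_urn, example_job)  # `source` and `flow_urn` are assumed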
def construct_flow_workunits(
    self,
    flow_urn: str,
    flow_name: str,
    external_url: str,
    flow_properties: Optional[Dict[str, str]] = None,
) -> Iterable[MetadataWorkUnit]:
    mcp = MetadataChangeProposalWrapper(
        entityType="dataFlow",
        entityUrn=flow_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="dataFlowInfo",
        aspect=DataFlowInfoClass(
            name=flow_name,
            customProperties=flow_properties,
            externalUrl=external_url,
        ),
    )
    for proposal in [mcp]:
        wu = MetadataWorkUnit(
            id=f"{NIFI}.{flow_name}.{proposal.aspectName}",
            mcp=proposal,
        )
        self.report.report_workunit(wu)
        yield wu
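# Sketch of how the NiFi source might call construct_flow_workunits from its own
# get_workunits(). `source` stands in for the source instance; the flow id, name, URL,
# and properties are all made-up illustrative values.
import datahub.emitter.mce_builder as builder

def emit_nifi_flow(source) -> None:
    flow_urn = builder.make_data_flow_urn(
        orchestrator="nifi", flow_id="root-process-group", cluster="prod"
    )
    for wu in source.construct_flow_workunits(
        flow_urn=flow_urn,
        flow_name="root-process-group",
        external_url="https://nifi.example.com/nifi/",
        flow_properties={"clustered": "true"},
    ):
        print(wu.id)  # e.g. "nifi.root-process-group.dataFlowInfo"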
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    jobs = self.get_all_jobs()

    processed_jobs: Dict[str, SageMakerJob] = {}

    # first pass: process jobs and collect datasets used
    for job in jobs:
        job_type = SAGEMAKER_JOB_TYPES[job["type"]]
        job_name = job[job_type.list_name_key]

        job_details = self.get_job_details(job_name, job["type"])

        processed_job = getattr(self, job_type.processor)(job_details)
        processed_jobs[processed_job.job_snapshot.urn] = processed_job

    all_datasets = {}

    # second pass:
    #   - move output jobs to inputs
    #   - aggregate i/o datasets
    for job_urn in sorted(processed_jobs):
        processed_job = processed_jobs[job_urn]

        # register the current job as an input of each downstream job it outputs to
        for output_job_urn in processed_job.output_jobs:
            processed_jobs[output_job_urn].input_jobs.add(job_urn)

        all_datasets.update(processed_job.input_datasets)
        all_datasets.update(processed_job.output_datasets)

    # yield datasets
    for dataset_urn, dataset in all_datasets.items():
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )
        dataset_snapshot.aspects.append(
            DatasetPropertiesClass(
                customProperties={k: str(v) for k, v in dataset.items()},
                tags=[],
            )
        )
        dataset_mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        dataset_wu = MetadataWorkUnit(
            id=dataset_urn,
            mce=dataset_mce,
        )
        self.report.report_dataset_scanned()
        self.report.report_workunit(dataset_wu)
        yield dataset_wu

    # third pass: construct and yield MCEs
    for job_urn in sorted(processed_jobs):
        processed_job = processed_jobs[job_urn]
        job_snapshot = processed_job.job_snapshot

        flow_urn = make_sagemaker_flow_urn(
            processed_job.job_type, processed_job.job_name, self.env
        )

        # create flow for each job
        flow_mce = MetadataChangeEvent(
            proposedSnapshot=DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    DataFlowInfoClass(
                        name=processed_job.job_name,
                    ),
                ],
            )
        )
        flow_wu = MetadataWorkUnit(
            id=flow_urn,
            mce=flow_mce,
        )
        self.report.report_workunit(flow_wu)
        yield flow_wu

        job_snapshot.aspects.append(
            DataJobInputOutputClass(
                inputDatasets=sorted(processed_job.input_datasets.keys()),
                outputDatasets=sorted(processed_job.output_datasets.keys()),
                inputDatajobs=sorted(processed_job.input_jobs),
            )
        )

        job_mce = MetadataChangeEvent(proposedSnapshot=job_snapshot)
        job_wu = MetadataWorkUnit(
            id=job_urn,
            mce=job_mce,
        )
        self.report.report_job_scanned()
        self.report.report_workunit(job_wu)
        yield job_wu
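# Standalone illustration of the DataJobInputOutput aspect appended in the third pass:
# it links a job to its input/output datasets and to upstream jobs. URNs are made up.
from datahub.metadata.schema_classes import DataJobInputOutputClass

example_io = DataJobInputOutputClass(
    inputDatasets=[
        "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/train.csv,PROD)"
    ],
    outputDatasets=[
        "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/model.tar.gz,PROD)"
    ],
    inputDatajobs=[],
)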
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DataFlowInfoClass,
    DataJobInfoClass,
)

# Construct the DataJobInfo aspect with the job -> flow lineage.
dataflow_urn = builder.make_data_flow_urn(
    orchestrator="airflow", flow_id="flow_old_api", cluster="prod"
)
dataflow_info = DataFlowInfoClass(name="LowLevelApiFlow")

dataflow_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataflow",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataflow_urn,
    aspectName="dataFlowInfo",
    aspect=dataflow_info,
)

datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information
# associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        # job_id is an illustrative placeholder value
        orchestrator="airflow", flow_id="flow_old_api", job_id="job_1", cluster="prod"
    ),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)
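# A plausible continuation using the DatahubRestEmitter imported above: emit both
# proposals to a DataHub GMS over REST. The endpoint URL is an illustrative default.
emitter = DatahubRestEmitter("http://localhost:8080")
emitter.emit_mcp(dataflow_info_mcp)
emitter.emit_mcp(datajob_info_mcp)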