def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataFlow workunit for a Glue job.

    Parameters
    ----------
        flow_urn:
            URN for the flow
        job:
            Job object from get_all_jobs()
    """

    mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job["Description"],
                    # specify a few Glue-specific properties
                    customProperties={
                        "role": job["Role"],
                        "created": str(job["CreatedOn"]),
                        "modified": str(job["LastModifiedOn"]),
                        "command": job["Command"]["ScriptLocation"],
                    },
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=job["Name"], mce=mce)
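# Illustrative sketch, not from the source: the shape of the boto3 Glue job
# object this first version assumes. Every key is accessed directly, so a job
# missing e.g. "Description" or "Command" raises a KeyError, which motivates
# the null-safe rewrite further below. All values here are made up.
import datetime

_example_glue_job = {
    "Name": "my-etl-job",
    "Description": "Example Glue job",
    "Role": "arn:aws:iam::123456789012:role/glue-role",
    "CreatedOn": datetime.datetime(2021, 6, 1),
    "LastModifiedOn": datetime.datetime(2021, 6, 15),
    "Command": {"ScriptLocation": "s3://my-bucket/scripts/my-etl-job.py"},
}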
def generate_mce(self) -> MetadataChangeEventClass:
    flow_mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=str(self.urn),
            aspects=[
                DataFlowInfoClass(
                    name=self.id,
                    description=self.description,
                    customProperties=self.properties,
                    externalUrl=self.url,
                ),
                *self.generate_ownership_aspect(),
                *self.generate_tags_aspect(),
            ],
        )
    )
    return flow_mce
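# Hedged sketch, not the library's actual implementation: the two helpers
# splatted into `aspects` above are assumed to be methods on the same entity
# class, each yielding at most one aspect built from owner/tag fields.
# The field names self.owners and self.tags are hypothetical.
from typing import Iterable

from datahub.metadata.schema_classes import (
    GlobalTagsClass,
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
    TagAssociationClass,
)

def generate_ownership_aspect(self) -> Iterable[OwnershipClass]:
    # wrap each owner URN in an OwnerClass and emit a single Ownership aspect
    yield OwnershipClass(
        owners=[
            OwnerClass(owner=owner_urn, type=OwnershipTypeClass.DEVELOPER)
            for owner_urn in self.owners
        ]
    )

def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]:
    # emit a single GlobalTags aspect referencing each tag URN
    yield GlobalTagsClass(
        tags=[TagAssociationClass(tag=tag_urn) for tag_urn in self.tags]
    )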
def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataFlow workunit for a Glue job.

    Parameters
    ----------
        flow_urn:
            URN for the flow
        job:
            Job object from get_all_jobs()
    """

    region = self.source_config.aws_region

    custom_props = {
        "role": job["Role"],
    }
    if job.get("CreatedOn") is not None:
        custom_props["created"] = str(job["CreatedOn"])
    if job.get("LastModifiedOn") is not None:
        custom_props["modified"] = str(job["LastModifiedOn"])

    command = job.get("Command", {}).get("ScriptLocation")
    if command is not None:
        custom_props["command"] = command

    mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job.get("Description"),
                    externalUrl=f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job['Name']}/graph",
                    # specify a few Glue-specific properties
                    customProperties=custom_props,
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=job["Name"], mce=mce)
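# Usage sketch, assumed for illustration: unlike the first version, this
# null-safe rewrite tolerates a job object that omits the optional fields.
# A minimal job with only the required keys now produces a valid workunit
# instead of raising KeyError. The `source` instance and flow URN below are
# hypothetical.
_minimal_job = {
    "Name": "bare-job",
    "Role": "arn:aws:iam::123456789012:role/glue-role",
    # no Description, CreatedOn, LastModifiedOn, or Command
}
# wu = source.get_dataflow_wu(
#     flow_urn="urn:li:dataFlow:(glue,bare-job,PROD)", job=_minimal_job
# )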
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    jobs = self.get_all_jobs()

    processed_jobs: Dict[str, SageMakerJob] = {}

    # first pass: process jobs and collect datasets used
    for job in jobs:
        job_type = SAGEMAKER_JOB_TYPES[job["type"]]
        job_name = job[job_type.list_name_key]

        job_details = self.get_job_details(job_name, job["type"])

        processed_job = getattr(self, job_type.processor)(job_details)
        processed_jobs[processed_job.job_snapshot.urn] = processed_job

    all_datasets = {}

    # second pass:
    #   - move output jobs to inputs
    #   - aggregate i/o datasets
    for job_urn in sorted(processed_jobs):
        processed_job = processed_jobs[job_urn]

        # register this job as an input of each of its downstream jobs
        for output_job_urn in processed_job.output_jobs:
            processed_jobs[output_job_urn].input_jobs.add(job_urn)

        all_datasets.update(processed_job.input_datasets)
        all_datasets.update(processed_job.output_datasets)

    # yield datasets
    for dataset_urn, dataset in all_datasets.items():
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )
        dataset_snapshot.aspects.append(
            DatasetPropertiesClass(
                customProperties={k: str(v) for k, v in dataset.items()},
                tags=[],
            )
        )

        dataset_mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        dataset_wu = MetadataWorkUnit(
            id=dataset_urn,
            mce=dataset_mce,
        )

        self.report.report_dataset_scanned()
        self.report.report_workunit(dataset_wu)
        yield dataset_wu

    # third pass: construct and yield MCEs
    for job_urn in sorted(processed_jobs):
        processed_job = processed_jobs[job_urn]
        job_snapshot = processed_job.job_snapshot

        flow_urn = make_sagemaker_flow_urn(
            processed_job.job_type, processed_job.job_name, self.env
        )

        # create flow for each job
        flow_mce = MetadataChangeEvent(
            proposedSnapshot=DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    DataFlowInfoClass(
                        name=processed_job.job_name,
                    ),
                ],
            )
        )
        flow_wu = MetadataWorkUnit(
            id=flow_urn,
            mce=flow_mce,
        )
        self.report.report_workunit(flow_wu)
        yield flow_wu

        job_snapshot.aspects.append(
            DataJobInputOutputClass(
                inputDatasets=sorted(processed_job.input_datasets.keys()),
                outputDatasets=sorted(processed_job.output_datasets.keys()),
                inputDatajobs=sorted(processed_job.input_jobs),
            )
        )

        job_mce = MetadataChangeEvent(proposedSnapshot=job_snapshot)
        job_wu = MetadataWorkUnit(
            id=job_urn,
            mce=job_mce,
        )

        self.report.report_job_scanned()
        self.report.report_workunit(job_wu)
        yield job_wu
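# Hedged sketch, an assumption rather than the helper's confirmed definition:
# make_sagemaker_flow_urn is assumed to build a standard DataFlow URN with
# "sagemaker" as the orchestrator and "<job_type>:<job_name>" as the flow id,
# following DataHub's urn:li:dataFlow:(orchestrator,flow_id,cluster)
# convention.
def make_sagemaker_flow_urn(job_type: str, job_name: str, env: str) -> str:
    return f"urn:li:dataFlow:(sagemaker,{job_type}:{job_name},{env})"

# e.g. make_sagemaker_flow_urn("training", "my-model", "PROD")
#   -> "urn:li:dataFlow:(sagemaker,training:my-model,PROD)"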