def commit_checkpoints(
    self, job_checkpoints: Dict[JobId, DatahubIngestionCheckpointClass]
) -> None:
    for job_name, checkpoint in job_checkpoints.items():
        # Emit the ingestion state for each job
        logger.info(
            f"Committing ingestion checkpoint for pipeline:'{checkpoint.pipelineName}',"
            f" instance:'{checkpoint.platformInstanceId}', job:'{job_name}'"
        )

        datajob_urn = builder.make_data_job_urn(
            self.orchestrator_name,
            checkpoint.pipelineName,
            job_name,
        )

        self.graph.emit_mcp(
            MetadataChangeProposalWrapper(
                entityType="dataJob",
                entityUrn=datajob_urn,
                aspectName="datahubIngestionCheckpoint",
                aspect=checkpoint,
                changeType=ChangeTypeClass.UPSERT,
            )
        )

        logger.info(
            f"Committed ingestion checkpoint for pipeline:'{checkpoint.pipelineName}',"
            f" instance:'{checkpoint.platformInstanceId}', job:'{job_name}'"
        )
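# Call-site sketch (illustrative only): `provider` and `checkpoint` stand in for a
# configured checkpointing provider instance and a previously built
# DatahubIngestionCheckpointClass; the job id is an arbitrary example value.
provider.commit_checkpoints(
    {JobId("common_ingest_from_sql_source"): checkpoint}
)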
def get_data_job_urn(
    orchestrator: str,
    pipeline_name: str,
    job_name: JobId,
    platform_instance_id: str,
) -> str:
    """
    Standardizes datajob urn minting for all ingestion job state providers.
    """
    return builder.make_data_job_urn(
        orchestrator, f"{pipeline_name}_{platform_instance_id}", job_name
    )
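# Usage sketch (illustrative only): the orchestrator, pipeline, and instance values
# below are assumptions rather than values taken from a real recipe. Folding the
# platform instance id into the flow id keeps checkpoints from different instances
# of the same pipeline from colliding.
example_urn = get_data_job_urn(
    orchestrator="datahub",
    pipeline_name="my_sql_pipeline",
    job_name=JobId("common_ingest_from_sql_source"),
    platform_instance_id="my_sql_instance",
)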
def get_latest_checkpoint(
    self,
    pipeline_name: str,
    platform_instance_id: str,
    job_name: JobId,
) -> Optional[DatahubIngestionCheckpointClass]:
    logger.info(
        f"Querying for the latest ingestion checkpoint for pipelineName:'{pipeline_name}',"
        f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}'"
    )

    data_job_urn = builder.make_data_job_urn(
        self.orchestrator_name, pipeline_name, job_name
    )

    latest_checkpoint: Optional[
        DatahubIngestionCheckpointClass
    ] = self.graph.get_latest_timeseries_value(
        entity_urn=data_job_urn,
        aspect_name="datahubIngestionCheckpoint",
        filter_criteria_map={
            "pipelineName": pipeline_name,
            "platformInstanceId": platform_instance_id,
        },
        aspect_type=DatahubIngestionCheckpointClass,
    )

    if latest_checkpoint:
        logger.info(
            f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
            f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}' found with start_time:"
            f" {datetime.fromtimestamp(latest_checkpoint.timestampMillis/1000, tz=timezone.utc)} and a"
            f" bucket duration of {latest_checkpoint.eventGranularity}."
        )
        return latest_checkpoint
    else:
        logger.info(
            f"No committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
            f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}' found"
        )

    return None
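# Usage sketch (assumed wiring): `provider` stands in for a configured checkpointing
# provider instance; the names passed below are illustrative, not real recipe values.
last_checkpoint = provider.get_latest_checkpoint(
    pipeline_name="my_sql_pipeline",
    platform_instance_id="my_sql_instance",
    job_name=JobId("common_ingest_from_sql_source"),
)
if last_checkpoint is not None:
    # A stateful source would typically deserialize last_checkpoint.state here to
    # decide which work can be skipped or which stale entities should be removed.
    print(last_checkpoint.timestampMillis)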
fldUrn("bar3", "c2"), fldUrn("bar4", "c1"), ], outputDatasetFields=[ fldUrn("bar", "c1"), fldUrn("bar", "c2"), fldUrn("bar", "c3"), fldUrn("bar", "c4"), fldUrn("bar", "c5"), fldUrn("bar", "c6"), fldUrn("bar", "c7"), fldUrn("bar", "c9"), fldUrn("bar2", "c9"), ], fineGrainedLineages=fineGrainedLineages, ) dataJobLineageMcp = MetadataChangeProposalWrapper( entityType="dataJob", changeType=ChangeTypeClass.UPSERT, entityUrn=builder.make_data_job_urn("spark", "Flow1", "Task1"), aspectName="dataJobInputOutput", aspect=dataJobInputOutput, ) # Create an emitter to the GMS REST API. emitter = DatahubRestEmitter("http://localhost:8080") # Emit metadata! emitter.emit_mcp(dataJobLineageMcp)
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.datajob import DataJobInfoClass
from datahub.metadata.schema_classes import ChangeTypeClass

# Construct the DataJobInfo aspect with the job -> flow lineage.
dataflow_urn = builder.make_data_flow_urn(
    orchestrator="airflow", flow_id="flow1", cluster="prod"
)

datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job1", cluster="prod"
    ),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(datajob_info_mcp)