def emit_event():
    """Emit one COMPLETE run event for the ``emit_event.wait-for-me`` job.

    The client comes from ``OpenLineageClient.from_environment()``, the run id
    is a freshly generated UUID and the job namespace is read from the
    ``OPENLINEAGE_NAMESPACE`` environment variable. The event carries no
    input or output datasets.
    """
    client = OpenLineageClient.from_environment()

    # Build the pieces in the same order the event constructor consumes them.
    event_time = datetime.datetime.now().isoformat()
    run = Run(runId=str(uuid.uuid4()))
    job = Job(
        namespace=os.getenv('OPENLINEAGE_NAMESPACE'),
        name='emit_event.wait-for-me',
    )

    event = RunEvent(RunState.COMPLETE, event_time, run, job, _PRODUCER, [], [])
    client.emit(event)
def _run(self,
         validation_result_suite: ExpectationSuiteValidationResult,
         validation_result_suite_identifier: ValidationResultIdentifier,
         data_asset: GEDataset,
         expectation_suite_identifier=None,
         checkpoint_identifier=None,
         payload=None):
    """Turn a validation result into an OpenLineage COMPLETE event.

    Input datasets are collected from ``data_asset`` when it is a
    ``SqlAlchemyDataset`` or a ``PandasDataset``; any other asset type yields
    no inputs. The event is emitted through ``self.openlineage_client`` only
    when ``self.do_publish`` is truthy.

    ``expectation_suite_identifier``, ``checkpoint_identifier`` and
    ``payload`` are accepted but not used by this implementation.

    Returns the event converted to a dict via ``Serde.to_dict``.
    """
    # Initialize logger here so that the action is serializable until it actually runs
    logger_name = f"{self.__class__.__module__}.{self.__class__.__name__}"
    self.log = logging.getLogger(logger_name)

    # Collect input datasets according to the kind of data asset validated.
    if isinstance(data_asset, SqlAlchemyDataset):
        datasets = self._fetch_datasets_from_sql_source(
            data_asset, validation_result_suite)
    elif isinstance(data_asset, PandasDataset):
        datasets = self._fetch_datasets_from_pandas_source(
            data_asset, validation_result_suite)
    else:
        datasets = []

    # Run facets: optional parent run link, then the validation suite metadata.
    run_facets = {}
    if self.parent_run_id is not None:
        run_facets["parentRun"] = ParentRunFacet.create(
            self.parent_run_id,
            self.parent_job_namespace,
            self.parent_job_name)
    run_facets["great_expectations_meta"] = GreatExpectationsRunFacet(
        **validation_result_suite.meta)

    # Job facets are only attached when the corresponding setting is present.
    job_facets = {}
    if self.job_description:
        job_facets["documentation"] = DocumentationJobFacet(self.job_description)
    if self.code_location:
        job_facets["sourceCodeLocation"] = SourceCodeLocationJobFacet(
            "", self.code_location)

    # Without an explicit job name, derive one from the suite name and batch.
    job_name = self.job_name
    if job_name is None:
        suite_name = validation_result_suite.meta["expectation_suite_name"]
        job_name = suite_name + '.' + validation_result_suite_identifier.batch_identifier

    event_time = datetime.now().isoformat()
    run = Run(runId=str(self.run_id), facets=run_facets)
    job = Job(self.namespace, job_name, facets=job_facets)
    run_event = RunEvent(
        eventType=RunState.COMPLETE,
        eventTime=event_time,
        run=run,
        job=job,
        inputs=datasets,
        outputs=[],
        producer="https://github.com/OpenLineage/OpenLineage/tree/$VERSION/integration/common/openlineage/provider/great_expectations"  # noqa
    )

    if self.do_publish:
        self.openlineage_client.emit(run_event)

    # Great expectations tries to append stuff here, so we need to make it a dict
    return Serde.to_dict(run_event)
def _build_run(
        run_id: str,
        parent_run_id: Optional[str] = None,
        job_name: Optional[str] = None,
        nominal_start_time: Optional[str] = None,
        nominal_end_time: Optional[str] = None,
        custom_facets: Optional[Dict[str, Type[BaseFacet]]] = None
) -> Run:
    """Assemble a ``Run`` with whichever run facets apply.

    Args:
        run_id: Identifier passed straight through to ``Run``.
        parent_run_id: When truthy, a ``parentRun`` facet is added that points
            at this run id under ``_DAG_NAMESPACE`` with ``job_name``.
        job_name: Job name used for the ``parentRun`` facet.
        nominal_start_time: When truthy, a ``nominalTime`` facet is added.
        nominal_end_time: End of the nominal window; only used together with
            ``nominal_start_time``.
        custom_facets: Extra facets merged in last, so they override the
            built-in ``nominalTime`` / ``parentRun`` entries on key clashes.

    Returns:
        A ``Run`` for ``run_id`` carrying the collected facets.
    """
    facets = {}

    if nominal_start_time:
        facets["nominalTime"] = NominalTimeRunFacet(
            nominal_start_time, nominal_end_time)

    if parent_run_id:
        facets["parentRun"] = ParentRunFacet.create(
            parent_run_id, _DAG_NAMESPACE, job_name)

    # Merged last on purpose: caller-supplied facets take precedence.
    if custom_facets:
        facets.update(custom_facets)

    return Run(run_id, facets)
def test_client_sends_proper_json_with_minimal_event():
    """A minimal START event is POSTed to the lineage endpoint as the expected JSON."""
    http_session = MagicMock()
    client = OpenLineageClient(url="http://example.com", session=http_session)

    # Only the required event fields are given; inputs, outputs and facets
    # are expected to show up as empty containers in the serialized body.
    minimal_event = RunEvent(
        RunState.START,
        "2020-01-01",
        Run("69f4acab-b87d-4fc0-b27b-8ea950370ff3"),
        Job("openlineage", "job"),
        "producer",
    )
    client.emit(minimal_event)

    expected_body = (
        '{"eventTime": "2020-01-01", "eventType": "START", "inputs": [], "job": '
        '{"facets": {}, "name": "job", "namespace": "openlineage"}, "outputs": [], '
        '"producer": "producer", "run": {"facets": {}, "runId": '
        '"69f4acab-b87d-4fc0-b27b-8ea950370ff3"}}'
    )
    http_session.post.assert_called_with(
        "http://example.com/api/v1/lineage",
        expected_body,
        timeout=5.0,
        verify=True,
    )
def test_openlineage_dag_adds_custom_facets(
        mock_get_or_create_openlineage_client,
        new_lineage_run_id,
        clear_db_airflow_dags,
):
    """Creating a DAG run emits one START event that includes the Airflow custom facets."""
    # Start from a clean extractor state with no mapping for the fixture operator.
    openlineage.airflow.dag.extractors.clear()
    openlineage.airflow.dag.extractor_mapper.extractors.pop('TestFixtureDummyOperator', None)

    dag = DAG(
        DAG_ID,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION,
    )

    # Mock the openlineage client method calls
    client_mock = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = client_mock

    lineage_run_id = str(uuid.uuid4())
    job_id = f"{DAG_ID}.{TASK_ID_COMPLETED}"
    new_lineage_run_id.return_value = lineage_run_id

    # Add task that will be marked as completed
    task_will_complete = DummyOperator(task_id=TASK_ID_COMPLETED, dag=dag)
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Start run
    dag.create_dagrun(
        run_id=DAG_RUN_ID,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
    )

    # Expected START event, built piece by piece.
    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    expected_run_facets = {
        "nominalTime": NominalTimeRunFacet(start_time, end_time),
        "parentRun": ParentRunFacet.create(
            runId=DAG_RUN_ID,
            namespace=DAG_NAMESPACE,
            name=job_id,
        ),
        "airflow_runArgs": AirflowRunArgsRunFacet(False),
        "airflow_version": AirflowVersionRunFacet(
            operator="airflow.operators.dummy_operator.DummyOperator",
            taskInfo=mock.ANY,
            airflowVersion=AIRFLOW_VERSION,
            openlineageAirflowVersion=OPENLINEAGE_AIRFLOW_VERSION,
        ),
    }
    expected_job_facets = {
        "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
        "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location),
    }
    expected_event = RunEvent(
        eventType=RunState.START,
        eventTime=mock.ANY,
        run=Run(lineage_run_id, expected_run_facets),
        job=Job("default", job_id, expected_job_facets),
        producer=PRODUCER,
        inputs=[],
        outputs=[],
    )

    # Assert emit calls
    client_mock.emit.assert_called_once_with(expected_event)
def test_openlineage_dag_with_extract_on_complete(
        job_id_mapping,
        mock_get_or_create_openlineage_client,
        get_custom_facets,
        new_lineage_run_id,
        clear_db_airflow_dags,
        session=None):
    """START is emitted on DAG run creation; COMPLETE carries the extractor's datasets."""
    # --- test setup

    # Register the on-complete dummy extractor for the fixture operator used below.
    openlineage.airflow.dag.extractors.clear()
    extractor_registry = openlineage.airflow.dag.extractor_mapper.extractors
    extractor_registry[TestFixtureDummyOperator.__name__] = TestFixtureDummyExtractorOnComplete

    dag_id = 'test_openlineage_dag_with_extractor_on_complete'
    dag = DAG(
        dag_id,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION,
    )

    dag_run_id = 'test_openlineage_dag_with_extractor_run_id'
    run_id = str(uuid.uuid4())
    job_id = f"{dag_id}.{TASK_ID_COMPLETED}"

    # Mock the openlineage client method calls
    client_mock = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = client_mock
    get_custom_facets.return_value = {}
    new_lineage_run_id.return_value = run_id

    # Add task that will be marked as completed
    task_will_complete = TestFixtureDummyOperator(task_id=TASK_ID_COMPLETED, dag=dag)
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=dag_run_id,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
    )

    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    expected_start_event = RunEvent(
        eventType=RunState.START,
        eventTime=mock.ANY,
        run=Run(run_id, {
            "nominalTime": NominalTimeRunFacet(start_time, end_time),
            "parentRun": ParentRunFacet.create(
                runId=dag_run_id,
                namespace=DAG_NAMESPACE,
                name=job_id,
            ),
        }),
        job=Job("default", job_id, {
            "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
            "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location),
        }),
        producer=PRODUCER,
        inputs=[],
        outputs=[],
    )
    client_mock.emit.assert_has_calls([mock.call(expected_start_event)])

    # Forget the START call so the next assertion only sees completion events.
    client_mock.reset_mock()

    # --- Pretend complete the task
    job_id_mapping.pop.return_value = run_id

    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    dag.handle_callback(dagrun, success=True, session=session)

    expected_input = OpenLineageDataset(
        namespace='dummy://localhost:1234',
        name='schema.extract_on_complete_input1',
        facets={
            'dataSource': DataSourceDatasetFacet(
                name='dummy://localhost:1234',
                uri='dummy://localhost:1234?query_tag=asdf',
            ),
            'schema': SchemaDatasetFacet(
                fields=[
                    SchemaField(name='field1', type='text', description=''),
                    SchemaField(name='field2', type='text', description=''),
                ],
            ),
        },
    )
    expected_output = OpenLineageDataset(
        namespace='dummy://localhost:1234',
        name='extract_on_complete_output1',
        facets={
            'dataSource': DataSourceDatasetFacet(
                name='dummy://localhost:1234',
                uri='dummy://localhost:1234?query_tag=asdf',
            ),
        },
    )
    expected_complete_event = RunEvent(
        eventType=RunState.COMPLETE,
        eventTime=mock.ANY,
        run=Run(run_id),
        job=Job("default", job_id),
        producer=PRODUCER,
        inputs=[expected_input],
        outputs=[expected_output],
    )
    client_mock.emit.assert_has_calls([mock.call(expected_complete_event)])
def test_openlineage_dag(
        job_id_mapping,
        mock_get_or_create_openlineage_client,
        get_custom_facets,
        new_lineage_run_id,
        clear_db_airflow_dags,
        session=None
):
    """Check the events emitted for a DAG with one completing and one failing task.

    Creating the DAG run must emit a START event per task. After the first
    task is run, the second is stored as FAILED and the DAG callback fires
    with ``success=False``, a COMPLETE and a FAIL event must follow.
    """
    dag = DAG(
        DAG_ID,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )
    # (1) Mock the openlineage client method calls
    mock_ol_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_ol_client

    run_id_completed = str(uuid.uuid4())
    run_id_failed = str(uuid.uuid4())

    job_id_completed = f"{DAG_ID}.{TASK_ID_COMPLETED}"
    job_id_failed = f"{DAG_ID}.{TASK_ID_FAILED}"

    get_custom_facets.return_value = {}
    new_lineage_run_id.side_effect = [
        run_id_completed, run_id_failed, run_id_completed, run_id_failed
    ]

    # (2) Add task that will be marked as completed
    task_will_complete = DummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # (3) Add task that will be marked as failed
    task_will_fail = DummyOperator(
        task_id=TASK_ID_FAILED,
        dag=dag
    )
    # Derive the location from the failing task itself rather than from the
    # completing one, so the expectation stays tied to the right task.
    failed_task_location = get_location(task_will_fail.dag.fileloc)

    # (4) Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=DAG_RUN_ID,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # Assert emit calls
    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    emit_calls = [
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id_completed, {
                "nominalTime": NominalTimeRunFacet(start_time, end_time),
                "parentRun": ParentRunFacet.create(
                    runId=DAG_RUN_ID,
                    namespace=DAG_NAMESPACE,
                    name=job_id_completed
                )
            }),
            job=Job("default", job_id_completed, {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        )),
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id_failed, {
                "nominalTime": NominalTimeRunFacet(start_time, end_time),
                "parentRun": ParentRunFacet.create(
                    runId=DAG_RUN_ID,
                    namespace=DAG_NAMESPACE,
                    name=job_id_failed
                )
            }),
            job=Job("default", job_id_failed, {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", failed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ]
    # Log the names of the calls recorded on the client mock; lazy %-formatting
    # renders the list exactly as the former f-string did.
    log.info(
        "%s", [name for name, _args, _kwargs in mock_ol_client.mock_calls])
    mock_ol_client.emit.assert_has_calls(emit_calls)

    # (5) Start task that will be marked as completed
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # (6) Start task that will be marked as failed
    ti1 = TaskInstance(task=task_will_fail, execution_date=DEFAULT_DATE)
    ti1.state = State.FAILED
    session.add(ti1)
    session.commit()

    job_id_mapping.pop.side_effect = [run_id_completed, run_id_failed]

    dag.handle_callback(dagrun, success=False, session=session)

    # The terminal events are appended so the whole sequence is checked in order.
    emit_calls += [
        mock.call(RunEvent(
            eventType=RunState.COMPLETE,
            eventTime=mock.ANY,
            run=Run(run_id_completed),
            job=Job("default", job_id_completed),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        )),
        mock.call(RunEvent(
            eventType=RunState.FAIL,
            eventTime=mock.ANY,
            run=Run(run_id_failed),
            job=Job("default", job_id_failed),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ]
    mock_ol_client.emit.assert_has_calls(emit_calls)