def test_great_expectations_operator__checkpoint_config_with_substituted_batch_request_works_and_fails(
    in_memory_data_context_config, in_memory_checkpoint_config
):
    failing_batch_request = BatchRequest(
        **{
            "datasource_name": "my_datasource",
            "data_connector_name": "default_inferred_data_connector_name",
            "data_asset_name": "yellow_tripdata_sample_2019-02.csv",
            "data_connector_query": {"index": -1},
        }
    )
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_config=in_memory_data_context_config,
        checkpoint_config=in_memory_checkpoint_config,
        checkpoint_kwargs={"validations": [{"batch_request": failing_batch_request}]},
        fail_task_on_validation_failure=False,
    )
    result = operator.execute(context={})  # should fail the suite
    logger.info(result)
    assert result["success"] is False
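These tests assume a module-level preamble; a minimal sketch follows, in which the fixture bodies, the logger name, and the `ge_root_dir` layout are placeholders rather than the provider's actual test setup:

import logging
from pathlib import Path
from unittest import mock

import pytest
from airflow.exceptions import AirflowException
from great_expectations.core.batch import BatchRequest
from great_expectations.exceptions import CheckpointNotFoundError
from great_expectations_provider.operators.great_expectations import GreatExpectationsOperator

logger = logging.getLogger(__name__)

# Assumed layout: a `great_expectations` project dir next to this test module.
ge_root_dir = str(Path(__file__).parent / "great_expectations")


@pytest.fixture
def in_memory_data_context_config():
    # Placeholder: return a DataContextConfig for the sample taxi datasource.
    raise NotImplementedError


@pytest.fixture
def in_memory_checkpoint_config():
    # Placeholder: return a CheckpointConfig wired to the "taxi.demo" suite.
    raise NotImplementedError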
def test_great_expectations_operator__validation_failure_raises_exc():
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.fail.chk",
    )
    with pytest.raises(AirflowException):
        operator.execute(context={})
def test_great_expectations_operator__data_context_config_and_checkpoint_config_pass(
    in_memory_data_context_config, in_memory_checkpoint_config
):
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_config=in_memory_data_context_config,
        checkpoint_config=in_memory_checkpoint_config,
    )
    result = operator.execute(context={})
    logger.info(result)
    assert result["success"]
def test_great_expectations_operator__return_json_dict():
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.pass.chk",
        return_json_dict=True,
    )
    result = operator.execute(context={})
    logger.info(result)
    assert isinstance(result, dict)
    assert result["_success"]  # TODO: Update to "success" upon changes to `to_json_dict` in core GE
def test_great_expectations_operator__context_root_dir_and_checkpoint_name_pass():
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.pass.chk",
    )
    result = operator.execute(context={})
    logger.info(result)
    assert result["success"]
def test_great_expectations_operator__validation_failure_callback():
    my_callback = mock.MagicMock()
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.fail.chk",
        fail_task_on_validation_failure=False,
        validation_failure_callback=my_callback,
    )
    result = operator.execute(context={})
    assert result["success"] is False
    my_callback.assert_called_once_with(result)
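For reference, `validation_failure_callback` is invoked with the checkpoint result when validation fails; a minimal sketch of a concrete callback, where the alerting behavior is an assumption to be swapped for your own:

import logging


def notify_on_validation_failure(result):
    # `result` is the checkpoint result the operator passes on failure,
    # as exercised by my_callback.assert_called_once_with(result) above.
    # Hypothetical action: log a warning; replace with Slack/PagerDuty/etc.
    logging.getLogger("airflow.task").warning(
        "Great Expectations validation failed: %s", result
    )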
def test_great_expectations_operator__checkpoint_config_with_substituted_expectation_suite_works_and_fails(
    in_memory_data_context_config, in_memory_checkpoint_config
):
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_config=in_memory_data_context_config,
        checkpoint_config=in_memory_checkpoint_config,
        checkpoint_kwargs={"expectation_suite_name": "taxi.demo_fail"},
        fail_task_on_validation_failure=False,
    )
    result = operator.execute(context={})  # should fail the suite
    logger.info(result)
    assert result["success"] is False
def test_great_expectations_operator__validation_failure_logs_warning(caplog):
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.fail.chk",
        fail_task_on_validation_failure=False,
    )
    operator._log = logging.getLogger("my_test_logger")
    caplog.set_level(level="WARNING", logger="my_test_logger")
    caplog.clear()
    result = operator.execute(context={})
    assert result["success"] is False
    assert ("my_test_logger", logging.WARNING) in (
        (r.name, r.levelno) for r in caplog.records
    )
def test_great_expectations_operator__invalid_checkpoint_name():
    with pytest.raises(CheckpointNotFoundError):
        operator = GreatExpectationsOperator(
            task_id="task_id",
            checkpoint_name="invalid-checkpoint.name",
            data_context_root_dir=ge_root_dir,
        )
def test_great_expectations_operator__raises_error_without_checkpoint(
    in_memory_data_context_config,
):
    with pytest.raises(ValueError):
        operator = GreatExpectationsOperator(
            task_id="task_id",
            data_context_config=in_memory_data_context_config,
        )
def test_great_expectations_operator__raises_error_with_checkpoint_name_and_checkpoint_config(
    in_memory_data_context_config,
):
    with pytest.raises(ValueError):
        operator = GreatExpectationsOperator(
            task_id="task_id",
            data_context_config=in_memory_data_context_config,
            data_context_root_dir=ge_root_dir,
            checkpoint_name="taxi.pass.chk",
        )
def data():
    """Workflows to validate data and create features."""
    # Extract data from DWH, blob storage, etc.
    extract_data = BashOperator(
        task_id="extract_data",
        bash_command=f"cd {config.BASE_DIR} && dvc pull",
    )

    # Validate data
    validate_projects = GreatExpectationsOperator(
        task_id="validate_projects",
        checkpoint_name="projects",
        data_context_root_dir="great_expectations",
        fail_task_on_validation_failure=True,
    )
    validate_tags = GreatExpectationsOperator(
        task_id="validate_tags",
        checkpoint_name="tags",
        data_context_root_dir="great_expectations",
        fail_task_on_validation_failure=True,
    )

    # Compute features
    compute_features = PythonOperator(
        task_id="compute_features",
        python_callable=cli.compute_features,
        op_kwargs={"params_fp": Path(config.CONFIG_DIR, "params.json")},
    )

    # Cache (feature store, database, warehouse, etc.)
    END_TS = ""
    cache = BashOperator(
        task_id="cache_to_feature_store",
        bash_command=f"cd {config.BASE_DIR}/features && feast materialize-incremental {END_TS}",
    )

    # Task relationships
    extract_data >> [validate_projects, validate_tags] >> compute_features >> cache
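Since `data()` creates its operators without an explicit `dag=` argument, it presumably runs under Airflow's TaskFlow `@dag` decorator; a minimal registration sketch, in which the `dag_id`, schedule, and start date are assumptions:

from datetime import datetime

from airflow.decorators import dag


@dag(
    dag_id="data",              # assumed dag_id
    schedule_interval=None,     # assumed: manually triggered
    start_date=datetime(2021, 1, 1),
    catchup=False,
)
def data():
    ...  # body as defined above


data_dag = data()  # calling the decorated function registers the DAG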
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG(dag_id='example_great_expectations_dag',
         start_date=datetime(2021, 1, 1),
         max_active_runs=1,
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    ge_batch_kwargs_pass = GreatExpectationsOperator(
        task_id='ge_batch_kwargs_pass',
        expectation_suite_name='taxi.demo',
        batch_kwargs={
            'path': data_file,
            'datasource': 'data__dir'
        },
        data_context_root_dir=ge_root_dir,
    )

    # This runs an expectation suite against a data asset that passes the tests
    ge_batch_kwargs_list_pass = GreatExpectationsOperator(
        task_id='ge_batch_kwargs_list_pass',
        assets_to_validate=[{
            'batch_kwargs': {
                'path': data_file,
                'datasource': 'data__dir'
            },
            'expectation_suite_name': 'taxi.demo'
        }],
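The snippet above opens partway through a `default_args` dict; a hypothetical completion of its head, where every key except the two shown is an assumption:

default_args = {
    'owner': 'airflow',        # assumed
    'depends_on_past': False,  # assumed
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}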
def test_great_expectations_operator__raises_error_without_data_context():
    with pytest.raises(ValueError):
        operator = GreatExpectationsOperator(task_id="task_id", checkpoint_name="taxi.pass.chk")
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

data_context_dir = "/opt/data/great_expectations"

dag = DAG('great_expectations_validation',
          schedule_interval='@once',
          default_args=default_args,
          description='Validates data.')

t1 = GreatExpectationsOperator(task_id='ge_sqlite_test',
                               run_name="ge_sqlite_run",
                               checkpoint_name="sqlite",
                               data_context_root_dir=data_context_dir,
                               dag=dag,
                               fail_task_on_validation_failure=False,
                               validation_operator_name="ol_operator",
                               do_xcom_push=False)

t2 = GreatExpectationsOperator(task_id='ge_pandas_test',
                               run_name="ge_pandas_run",
                               checkpoint_name="pandas",
                               data_context_root_dir=data_context_dir,
                               dag=dag,
                               fail_task_on_validation_failure=False,
                               validation_operator_name="ol_operator",
                               do_xcom_push=False)

t3 = GreatExpectationsOperator(task_id='ge_bad_sqlite_test',
                               run_name="ge_bad_sqlite_run",
start_task = BashOperator(
    task_id='start',
    depends_on_past=False,
    bash_command=templated_command,
    params={
        'task_name': 'Start',
        'start_date': default_args['start_date']
    },
    dag=dag,
)

valid_prod_task = GreatExpectationsOperator(
    task_id='valid_products',
    expectation_suite_name='products',
    data_context_root_dir='/usr/src/challenge/great_expectations',
    batch_kwargs={
        'table': 'products',
        'datasource': 'challenge_src'
    },
    dag=dag)

valid_cust_task = GreatExpectationsOperator(
    task_id='valid_customers',
    expectation_suite_name='customers',
    data_context_root_dir='/usr/src/challenge/great_expectations',
    batch_kwargs={
        'table': 'test_customers',
        'datasource': 'challenge_src'
    },
    dag=dag)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG("kedro_ge_datascience",
         start_date=datetime(2021, 1, 1),
         max_active_runs=1,
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    ge_raw_checkpoint = GreatExpectationsOperator(
        task_id='ge_raw_checkpoint',
        expectation_suite_name='kedro.raw',
        batch_kwargs={
            'path': raw_data_file,
            'datasource': 'data__dir'
        },
        data_context_root_dir=ge_root_dir)

    ge_train_checkpoint = GreatExpectationsOperator(
        task_id='ge_train_checkpoint',
        expectation_suite_name='kedro.train',
        batch_kwargs={
            'path': train_data_file,
            'datasource': 'data__dir'
        },
        data_context_root_dir=ge_root_dir)

    ge_test_checkpoint = GreatExpectationsOperator(
        task_id='ge_test_checkpoint',
dag = DAG(dag_id="example_great_expectations_dag", default_args=default_args)

# This runs an expectation suite against a sample data asset. You may need to
# change these paths if you do not have your `data` directory living in a
# top-level `include` directory. Ensure the checkpoint yml files have the
# correct path to the data file.
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
data_file = os.path.join(base_path, "include", "data/yellow_tripdata_sample_2019-01.csv")
ge_root_dir = os.path.join(base_path, "include", "great_expectations")

ge_batch_kwargs_pass = GreatExpectationsOperator(
    task_id="ge_batch_kwargs_pass",
    expectation_suite_name="taxi.demo",
    batch_kwargs={"path": data_file, "datasource": "data__dir"},
    data_context_root_dir=ge_root_dir,
    dag=dag,
)

# This runs an expectation suite against a data asset that passes the tests
ge_batch_kwargs_list_pass = GreatExpectationsOperator(
    task_id="ge_batch_kwargs_list_pass",
    assets_to_validate=[
        {
            "batch_kwargs": {"path": data_file, "datasource": "data__dir"},
            "expectation_suite_name": "taxi.demo",
        }
    ],
    data_context_root_dir=ge_root_dir,
    dag=dag,
from include.great_expectations.object_configs.example_data_context_config import example_data_context_config
from include.great_expectations.object_configs.example_checkpoint_config import example_checkpoint_config

base_path = Path(__file__).parents[2]
data_dir = os.path.join(base_path, "include", "data")
ge_root_dir = os.path.join(base_path, "include", "great_expectations")

with DAG(
    dag_id="example_great_expectations_dag",
    start_date=datetime(2021, 12, 15),
    catchup=False,
    schedule_interval=None,
) as dag:
    ge_data_context_root_dir_with_checkpoint_name_pass = GreatExpectationsOperator(
        task_id="ge_data_context_root_dir_with_checkpoint_name_pass",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.pass.chk",
    )

    ge_data_context_root_dir_with_checkpoint_name_fail_validation_and_not_task = GreatExpectationsOperator(
        task_id="ge_data_context_root_dir_with_checkpoint_name_fail_validation_and_not_task",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.fail.chk",
        fail_task_on_validation_failure=False,
    )

    ge_checkpoint_kwargs_substitute_batch_request_fails_validation_but_not_task = GreatExpectationsOperator(
        task_id="ge_checkpoint_kwargs_substitute_batch_request_fails_validation_but_not_task",
        data_context_root_dir=ge_root_dir,
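The last task above is truncated; judging by the test snippets earlier in this section, its remaining kwargs presumably substitute a failing `BatchRequest` through `checkpoint_kwargs`. A hypothetical sketch mirroring those tests (the task name and all values are assumptions):

    ge_checkpoint_kwargs_substitution_sketch = GreatExpectationsOperator(
        task_id="ge_checkpoint_kwargs_substitution_sketch",  # hypothetical task
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.pass.chk",
        fail_task_on_validation_failure=False,
        checkpoint_kwargs={
            "validations": [
                {
                    "batch_request": BatchRequest(
                        datasource_name="my_datasource",
                        data_connector_name="default_inferred_data_connector_name",
                        data_asset_name="yellow_tripdata_sample_2019-02.csv",
                        data_connector_query={"index": -1},
                    )
                }
            ]
        },
    )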
from great_expectations_provider.operators.great_expectations_bigquery import GreatExpectationsBigQueryOperator

default_args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(1)
}

dag = DAG(dag_id='example_great_expectations_dag', default_args=default_args)

# This runs an expectation suite against a data asset that passes the tests
data_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'data/yellow_tripdata_sample_2019-01.csv')

ge_batch_kwargs_pass = GreatExpectationsOperator(
    task_id='ge_batch_kwargs_pass',
    expectation_suite_name='taxi.demo',
    batch_kwargs={
        'path': data_file,
        'datasource': 'data__dir'
    },
    dag=dag)

# This runs an expectation suite against a data asset that passes the tests
ge_batch_kwargs_list_pass = GreatExpectationsOperator(
    task_id='ge_batch_kwargs_list_pass',
    assets_to_validate=[{
        'batch_kwargs': {
            'path': data_file,
            'datasource': 'data__dir'
        },
        'expectation_suite_name': 'taxi.demo'
    }],
    dag=dag)
    catchup=False
) as dag:

    opr_run_pipeline = PythonOperator(
        task_id='run_pipeline',
        python_callable=run_adf_pipeline,
        op_kwargs={'pipeline_name': 'pipeline1', 'date': yesterday_date}
    )

    opr_download_data = PythonOperator(
        task_id='download_data',
        python_callable=get_azure_blob_files
    )

    opr_ge_check = GreatExpectationsOperator(
        task_id='ge_check',
        expectation_suite_name='azure.demo',
        checkpoint_name="azure.pass.chk",
        data_context=data_context
    )

    opr_send_email = EmailOperator(
        task_id='send_email',
        to='*****@*****.**',
        subject='Covid to S3 DAG',
        html_content='<p>The great expectations checks passed successfully.</p>'
    )

    opr_run_pipeline >> opr_download_data >> opr_ge_check >> opr_send_email
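Here `data_context` is passed as an in-memory context object rather than a root directory; a minimal sketch of how such a context might be built (this pattern is valid for GE 0.13-0.15; the in-memory store defaults are an assumption about this project's setup):

from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    InMemoryStoreBackendDefaults,
)

# Hypothetical in-memory project config; add your datasources as needed.
project_config = DataContextConfig(
    store_backend_defaults=InMemoryStoreBackendDefaults()
)
data_context = BaseDataContext(project_config=project_config)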
download_data = PythonOperator(
    task_id='download_data',
    python_callable=get_azure_blob_files,
    op_kwargs={
        'blobname': 'or/' + yesterday_date + '.csv',
        'output_filename': data_file_path + 'or_' + yesterday_date + '.csv'
    })

ge_check = GreatExpectationsOperator(
    task_id='ge_checkpoint',
    expectation_suite_name='azure.demo',
    batch_kwargs={
        'path': data_file_path + 'or_' + yesterday_date + '.csv',
        'datasource': 'data__dir'
    },
    data_context_root_dir=ge_root_dir)

send_email = EmailOperator(
    task_id='send_email',
    to='*****@*****.**',
    subject='Covid to S3 DAG',
    html_content='<p>The great expectations checks passed successfully.</p>'
)

run_pipeline >> download_data >> ge_check >> send_email
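The snippet relies on `yesterday_date`, `data_file_path`, and `ge_root_dir` being defined earlier in the file; a plausible sketch of those definitions, where the paths and date format are assumptions:

from datetime import datetime, timedelta

yesterday_date = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')  # assumed format
data_file_path = '/usr/local/airflow/data/'                                   # assumed path
ge_root_dir = '/usr/local/airflow/great_expectations'                         # assumed path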
dbt_seed = DbtSeedOperator(
    task_id='dbt_seed',
    dir=DBT_PROJECT_DIR,
    profiles_dir=DBT_ROOT_DIR,
    target=DBT_TARGET,
    dag=dag)

validate_load = GreatExpectationsOperator(
    task_id='validate_load',
    assets_to_validate=[{
        'batch_kwargs': {
            'datasource': 'spark-thrift-server',
            'schema': 'example',
            'table': 'taxi_zone_lookup',
            'data_asset_name': 'taxi_zone_lookup'
        },
        'expectation_suite_name': 'custom_sql_query.warning'
    }],
    data_context_root_dir=GE_ROOT_DIR,
    dag=dag)

dbt_run = DbtRunOperator(
    task_id='dbt_run',
    dir=DBT_PROJECT_DIR,
    profiles_dir=DBT_ROOT_DIR,
    target=DBT_TARGET,
    dag=dag)
ge_root_dir = os.path.join(base_path, "include", "great_expectations")


def load_source_data():
    # Implement load to database
    pass


def publish_to_prod():
    # Implement load to production database
    pass


task_validate_source_data = GreatExpectationsOperator(
    task_id="validate_source_data",
    checkpoint_name="source_data.chk",
    dag=dag,
    data_context_root_dir=ge_root_dir,
)

task_load_source_data = PythonOperator(
    task_id="load_source_data",
    python_callable=load_source_data,
    dag=dag,
)

task_validate_source_data_load = GreatExpectationsOperator(
    task_id="validate_source_data_load",
    checkpoint_name="source_data_load.chk",
    dag=dag,
    data_context_root_dir=ge_root_dir,
)
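The three tasks above suggest a validate-then-load-then-revalidate chain; a sketch of the dependency wiring (the exact edges, and where `publish_to_prod` eventually attaches, are assumptions):

task_validate_source_data >> task_load_source_data >> task_validate_source_data_load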
data_path = os.path.join(base_path, 'data/Telco/Telco-Customer-Churn.csv')

default_args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(1)
}

dag = DAG(dag_id='customer_churn', default_args=default_args, schedule_interval=None)

check_csv = GreatExpectationsOperator(
    task_id='validate_csv',
    expectation_suite_name="Telco-Customer-Churn.warning",
    batch_kwargs={
        'path': data_path,
        'datasource': 'Telco__dir'
    },
    # data_context_root_dir is an operator argument, not a batch_kwargs key
    data_context_root_dir=base_path,
    dag=dag)

preprocess = PythonOperator(
    task_id='preprocess_data',
    python_callable=preprocess_churn,
    op_kwargs={
        'data_path': data_path,
        'base_path': base_path
    },
    dag=dag)

train = PythonOperator(
    task_id='train_model',
    python_callable=train_model,