def test_great_expectations_operator__checkpoint_config_with_substituted_batch_request_works_and_fails(
        in_memory_data_context_config, in_memory_checkpoint_config):
    failing_batch_request = BatchRequest(
        **{
            "datasource_name": "my_datasource",
            "data_connector_name": "default_inferred_data_connector_name",
            "data_asset_name": "yellow_tripdata_sample_2019-02.csv",
            "data_connector_query": {
                "index": -1
            },
        })

    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_config=in_memory_data_context_config,
        checkpoint_config=in_memory_checkpoint_config,
        checkpoint_kwargs={
            "validations": [{
                "batch_request": failing_batch_request
            }]
        },
        fail_task_on_validation_failure=False,
    )
    result = operator.execute(context={})  # should fail the suite
    logger.info(result)
    assert result["success"] is False
def test_great_expectations_operator__validation_failure_raises_exc():
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.fail.chk",
    )
    with pytest.raises(AirflowException):
        operator.execute(context={})


def test_great_expectations_operator__data_context_config_and_checkpoint_config_pass(
        in_memory_data_context_config, in_memory_checkpoint_config):
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_config=in_memory_data_context_config,
        checkpoint_config=in_memory_checkpoint_config,
    )
    result = operator.execute(context={})
    logger.info(result)
    assert result["success"]
def test_great_expectations_operator__return_json_dict():
    operator = GreatExpectationsOperator(task_id="task_id",
                                         data_context_root_dir=ge_root_dir,
                                         checkpoint_name="taxi.pass.chk",
                                         return_json_dict=True)
    result = operator.execute(context={})
    logger.info(result)
    assert isinstance(result, dict)
    assert result["_success"]  # TODO: Update to "success" upon changes to `to_json_dict` in core GE


def test_great_expectations_operator__context_root_dir_and_checkpoint_name_pass():
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.pass.chk",
    )
    result = operator.execute(context={})
    logger.info(result)
    assert result["success"]
def test_great_expectations_operator__validation_failure_callback():
    my_callback = mock.MagicMock()
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.fail.chk",
        fail_task_on_validation_failure=False,
        validation_failure_callback=my_callback,
    )
    result = operator.execute(context={})
    assert result["success"] is False
    my_callback.assert_called_once_with(result)
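
# The test above only asserts that the callback is invoked with the checkpoint
# result. A hypothetical sketch of what a real validation_failure_callback
# could look like (not part of the provider; it only assumes the result object
# exposes to_json_dict(), the same method the return_json_dict test relies on,
# and reuses the module-level logging import):
def notify_on_validation_failure(result):
    summary = result.to_json_dict() if hasattr(result, "to_json_dict") else result
    logging.getLogger(__name__).warning(
        "Great Expectations validation failed: %s", summary)
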
def test_great_expectations_operator__checkpoint_config_with_substituted_expectation_suite_works_and_fails(
        in_memory_data_context_config, in_memory_checkpoint_config):
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_config=in_memory_data_context_config,
        checkpoint_config=in_memory_checkpoint_config,
        checkpoint_kwargs={"expectation_suite_name": "taxi.demo_fail"},
        fail_task_on_validation_failure=False,
    )
    result = operator.execute(context={})  # should fail the suite
    logger.info(result)
    assert result["success"] is False
def test_great_expectations_operator__validation_failure_logs_warning(caplog):
    operator = GreatExpectationsOperator(
        task_id="task_id",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.fail.chk",
        fail_task_on_validation_failure=False,
    )
    operator._log = logging.getLogger("my_test_logger")
    caplog.set_level(level="WARNING", logger="my_test_logger")
    caplog.clear()
    result = operator.execute(context={})
    assert result["success"] is False
    assert ("my_test_logger", logging.WARNING) in ((r.name, r.levelno)
                                                   for r in caplog.records)
def test_great_expectations_operator__invalid_checkpoint_name():
    with pytest.raises(CheckpointNotFoundError):
        operator = GreatExpectationsOperator(
            task_id="task_id",
            checkpoint_name="invalid-checkpoint.name",
            data_context_root_dir=ge_root_dir,
        )


def test_great_expectations_operator__raises_error_without_checkpoint(
        in_memory_data_context_config):
    with pytest.raises(ValueError):
        operator = GreatExpectationsOperator(
            task_id="task_id",
            data_context_config=in_memory_data_context_config,
        )


def test_great_expectations_operator__raises_error_with_checkpoint_name_and_checkpoint_config(
        in_memory_data_context_config):
    with pytest.raises(ValueError):
        operator = GreatExpectationsOperator(
            task_id="task_id",
            data_context_config=in_memory_data_context_config,
            data_context_root_dir=ge_root_dir,
            checkpoint_name="taxi.pass.chk",
        )
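
# The fixtures referenced throughout these tests (in_memory_data_context_config
# and in_memory_checkpoint_config) live in the provider's conftest. A minimal,
# hypothetical sketch of what such fixtures could look like -- the real ones
# also wire up a datasource for the taxi sample data:
import pytest
from great_expectations.data_context.types.base import (
    CheckpointConfig,
    DataContextConfig,
    InMemoryStoreBackendDefaults,
)


@pytest.fixture
def in_memory_data_context_config():
    # Assumption: an in-memory store backend is enough for these examples.
    return DataContextConfig(store_backend_defaults=InMemoryStoreBackendDefaults())


@pytest.fixture
def in_memory_checkpoint_config():
    # Assumption: a SimpleCheckpoint bound to the demo expectation suite.
    return CheckpointConfig(
        name="taxi.pass.chk",
        config_version=1,
        class_name="SimpleCheckpoint",
        expectation_suite_name="taxi.demo",
    )
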
Example 12

def data():
    """
    Workflows to validate data and create features.
    """

    # Extract data from DWH, blob storage, etc.
    extract_data = BashOperator(
        task_id="extract_data",
        bash_command=f"cd {config.BASE_DIR} && dvc pull",
    )

    # Validate data
    validate_projects = GreatExpectationsOperator(
        task_id="validate_projects",
        checkpoint_name="projects",
        data_context_root_dir="great_expectations",
        fail_task_on_validation_failure=True,
    )
    validate_tags = GreatExpectationsOperator(
        task_id="validate_tags",
        checkpoint_name="tags",
        data_context_root_dir="great_expectations",
        fail_task_on_validation_failure=True,
    )

    # Compute features
    compute_features = PythonOperator(
        task_id="compute_features",
        python_callable=cli.compute_features,
        op_kwargs={"params_fp": Path(config.CONFIG_DIR, "params.json")},
    )

    # Cache (feature store, database, warehouse, etc.)
    END_TS = ""
    cache = BashOperator(
        task_id="cache_to_feature_store",
        bash_command=f"cd {config.BASE_DIR}/features && feast materialize-incremental {END_TS}",
    )

    # Task relationships
    extract_data >> [validate_projects, validate_tags] >> compute_features >> cache
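
# A workflow function like data() above is normally wrapped in Airflow 2's
# @dag decorator and instantiated at module scope; that part falls outside
# this snippet. A minimal sketch of the usual pattern (the dag_id, schedule,
# and start_date here are assumptions, not taken from the original):
#
#     from airflow.decorators import dag
#
#     @dag(dag_id="dataops", schedule_interval=None,
#          start_date=datetime(2021, 1, 1), catchup=False)
#     def data():
#         ...
#
#     data_dag = data()
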
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG(dag_id='example_great_expectations_dag',
         start_date=datetime(2021, 1, 1),
         max_active_runs=1,
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    ge_batch_kwargs_pass = GreatExpectationsOperator(
        task_id='ge_batch_kwargs_pass',
        expectation_suite_name='taxi.demo',
        batch_kwargs={
            'path': data_file,
            'datasource': 'data__dir'
        },
        data_context_root_dir=ge_root_dir,
    )

    # This runs an expectation suite against a data asset that passes the tests
    ge_batch_kwargs_list_pass = GreatExpectationsOperator(
        task_id='ge_batch_kwargs_list_pass',
        assets_to_validate=[{
            'batch_kwargs': {
                'path': data_file,
                'datasource': 'data__dir'
            },
            'expectation_suite_name': 'taxi.demo'
    }],


def test_great_expectations_operator__raises_error_without_data_context():
    with pytest.raises(ValueError):
        operator = GreatExpectationsOperator(task_id="task_id",
                                             checkpoint_name="taxi.pass.chk")
Example 15

    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

data_context_dir = "/opt/data/great_expectations"

dag = DAG('great_expectations_validation',
          schedule_interval='@once',
          default_args=default_args,
          description='Validates data.')

t1 = GreatExpectationsOperator(task_id='ge_sqlite_test',
                               run_name="ge_sqlite_run",
                               checkpoint_name="sqlite",
                               data_context_root_dir=data_context_dir,
                               dag=dag,
                               fail_task_on_validation_failure=False,
                               validation_operator_name="ol_operator",
                               do_xcom_push=False)

t2 = GreatExpectationsOperator(task_id='ge_pandas_test',
                               run_name="ge_pandas_run",
                               checkpoint_name="pandas",
                               data_context_root_dir=data_context_dir,
                               dag=dag,
                               fail_task_on_validation_failure=False,
                               validation_operator_name="ol_operator",
                               do_xcom_push=False)

t3 = GreatExpectationsOperator(task_id='ge_bad_sqlite_test',
                               run_name="ge_bad_sqlite_run",
Example 16

start_task = BashOperator(
    task_id='start',
    depends_on_past=False,
    bash_command=templated_command,
    params={
        'task_name': 'Start',
        'start_date': default_args['start_date']
    },
    dag=dag,
)

valid_prod_task = GreatExpectationsOperator(
    task_id='valid_products',
    expectation_suite_name='products',
    data_context_root_dir='/usr/src/challenge/great_expectations',
    batch_kwargs={
        'table': 'products',
        'datasource': 'challenge_src'
    },
    dag=dag)

valid_cust_task = GreatExpectationsOperator(
    task_id='valid_customers',
    expectation_suite_name='customers',
    data_context_root_dir='/usr/src/challenge/great_expectations',
    batch_kwargs={
        'table': 'test_customers',
        'datasource': 'challenge_src'
    },
    dag=dag)
Example 17

    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG("kedro_ge_datascience",
         start_date=datetime(2021, 1, 1),
         max_active_runs=1,
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    ge_raw_checkpoint = GreatExpectationsOperator(
        task_id='ge_raw_checkpoint',
        expectation_suite_name='kedro.raw',
        batch_kwargs={
            'path': raw_data_file,
            'datasource': 'data__dir'
        },
        data_context_root_dir=ge_root_dir)

    ge_train_checkpoint = GreatExpectationsOperator(
        task_id='ge_train_checkpoint',
        expectation_suite_name='kedro.train',
        batch_kwargs={
            'path': train_data_file,
            'datasource': 'data__dir'
        },
        data_context_root_dir=ge_root_dir)

    ge_test_checkpoint = GreatExpectationsOperator(
        task_id='ge_test_checkpoint',
Example 18

dag = DAG(dag_id="example_great_expectations_dag", default_args=default_args)

# This runs an expectation suite against a sample data asset. You may need to change these paths if you do not have your `data`
# directory living in a top-level `include` directory. Ensure the checkpoint yml files have the correct path to the data file.
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
data_file = os.path.join(
    base_path, "include", "data/yellow_tripdata_sample_2019-01.csv"
)
ge_root_dir = os.path.join(base_path, "include", "great_expectations")


ge_batch_kwargs_pass = GreatExpectationsOperator(
    task_id="ge_batch_kwargs_pass",
    expectation_suite_name="taxi.demo",
    batch_kwargs={"path": data_file, "datasource": "data__dir"},
    data_context_root_dir=ge_root_dir,
    dag=dag,
)

# This runs an expectation suite against a data asset that passes the tests
ge_batch_kwargs_list_pass = GreatExpectationsOperator(
    task_id="ge_batch_kwargs_list_pass",
    assets_to_validate=[
        {
            "batch_kwargs": {"path": data_file, "datasource": "data__dir"},
            "expectation_suite_name": "taxi.demo",
        }
    ],
    data_context_root_dir=ge_root_dir,
    dag=dag,

from include.great_expectations.object_configs.example_data_context_config import example_data_context_config
from include.great_expectations.object_configs.example_checkpoint_config import example_checkpoint_config

base_path = Path(__file__).parents[2]
data_dir = os.path.join(base_path, "include", "data")

ge_root_dir = os.path.join(base_path, "include", "great_expectations")

with DAG(dag_id="example_great_expectations_dag",
         start_date=datetime(2021, 12, 15),
         catchup=False,
         schedule_interval=None) as dag:
    ge_data_context_root_dir_with_checkpoint_name_pass = GreatExpectationsOperator(
        task_id="ge_data_context_root_dir_with_checkpoint_name_pass",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.pass.chk",
    )

    ge_data_context_root_dir_with_checkpoint_name_fail_validation_and_not_task = GreatExpectationsOperator(
        task_id="ge_data_context_root_dir_with_checkpoint_name_fail_validation_and_not_task",
        data_context_root_dir=ge_root_dir,
        checkpoint_name="taxi.fail.chk",
        fail_task_on_validation_failure=False,
    )

    ge_checkpoint_kwargs_substitute_batch_request_fails_validation_but_not_task = GreatExpectationsOperator(
        task_id="ge_checkpoint_kwargs_substitute_batch_request_fails_validation_but_not_task",
        data_context_root_dir=ge_root_dir,

from great_expectations_provider.operators.great_expectations_bigquery import GreatExpectationsBigQueryOperator

default_args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(1)
}

dag = DAG(dag_id='example_great_expectations_dag', default_args=default_args)

# This runs an expectation suite against a data asset that passes the tests
data_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'data/yellow_tripdata_sample_2019-01.csv')
ge_batch_kwargs_pass = GreatExpectationsOperator(
    task_id='ge_batch_kwargs_pass',
    expectation_suite_name='taxi.demo',
    batch_kwargs={
        'path': data_file,
        'datasource': 'data__dir'
    },
    dag=dag)

# This runs an expectation suite against a data asset that passes the tests
ge_batch_kwargs_list_pass = GreatExpectationsOperator(
    task_id='ge_batch_kwargs_list_pass',
    assets_to_validate=[{
        'batch_kwargs': {
            'path': data_file,
            'datasource': 'data__dir'
        },
        'expectation_suite_name': 'taxi.demo'
    }],
    dag=dag)

         catchup=False
         ) as dag:

    opr_run_pipeline = PythonOperator(
        task_id='run_pipeline',
        python_callable=run_adf_pipeline,
        op_kwargs={'pipeline_name': 'pipeline1', 'date': yesterday_date}
    )

    opr_download_data = PythonOperator(
        task_id='download_data',
        python_callable=get_azure_blob_files
    )

    opr_ge_check = GreatExpectationsOperator(
        task_id='ge_check',
        expectation_suite_name='azure.demo',
        checkpoint_name="azure.pass.chk",
        data_context=data_context
    )

    opr_send_email = EmailOperator(
        task_id='send_email',
        to='*****@*****.**',
        subject='Covid to S3 DAG',
        html_content='<p>The Great Expectations checks passed successfully.</p>'
    )

    opr_run_pipeline >> opr_download_data >> opr_ge_check >> opr_send_email
          
Example 22

    download_data = PythonOperator(
        task_id='download_data',
        python_callable=get_azure_blob_files,
        op_kwargs={
            'blobname': 'or/' + yesterday_date + '.csv',
            'output_filename': data_file_path + 'or_' + yesterday_date + '.csv'
        })

    ge_check = GreatExpectationsOperator(
        task_id='ge_checkpoint',
        expectation_suite_name='azure.demo',
        batch_kwargs={
            'path': data_file_path + 'or_' + yesterday_date + '.csv',
            'datasource': 'data__dir'
        },
        data_context_root_dir=ge_root_dir)

    send_email = EmailOperator(
        task_id='send_email',
        to='*****@*****.**',
        subject='Covid to S3 DAG',
        html_content='<p>The Great Expectations checks passed successfully.</p>'
    )

    run_pipeline >> download_data >> ge_check >> send_email
Example 23

dbt_seed = DbtSeedOperator(task_id='dbt_seed',
                           dir=DBT_PROJECT_DIR,
                           profiles_dir=DBT_ROOT_DIR,
                           target=DBT_TARGET,
                           dag=dag)

validate_load = GreatExpectationsOperator(
    task_id='validate_load',
    assets_to_validate=[{
        'batch_kwargs': {
            'datasource': 'spark-thrift-server',
            'schema': 'example',
            'table': 'taxi_zone_lookup',
            'data_asset_name': 'taxi_zone_lookup'
        },
        'expectation_suite_name': 'custom_sql_query.warning'
    }],
    data_context_root_dir=GE_ROOT_DIR,
    dag=dag)

dbt_run = DbtRunOperator(task_id='dbt_run',
                         dir=DBT_PROJECT_DIR,
                         profiles_dir=DBT_ROOT_DIR,
                         target=DBT_TARGET,
                         dag=dag)
Example 24

ge_root_dir = os.path.join(base_path, "include", "great_expectations")


def load_source_data():
    # Implement load to database
    pass


def publish_to_prod():
    # Implement load to production database
    pass


task_validate_source_data = GreatExpectationsOperator(
    task_id="validate_source_data",
    checkpoint_name="source_data.chk",
    dag=dag,
    data_context_root_dir=ge_root_dir,
)

task_load_source_data = PythonOperator(
    task_id="load_source_data",
    python_callable=load_source_data,
    dag=dag,
)

task_validate_source_data_load = GreatExpectationsOperator(
    task_id="validate_source_data_load",
    checkpoint_name="source_data_load.chk",
    dag=dag,
    data_context_root_dir=ge_root_dir,
)
Example 25

data_path = os.path.join(base_path, 'data/Telco/Telco-Customer-Churn.csv')

default_args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(1)
}

dag = DAG(dag_id='customer_churn',
          default_args=default_args,
          schedule_interval=None)

check_csv = GreatExpectationsOperator(
    task_id='validate_csv',
    expectation_suite_name="Telco-Customer-Churn.warning",
    batch_kwargs={
        'path': data_path,
        'datasource': 'Telco__dir'
    },
    data_context_root_dir=base_path,
    dag=dag)

preproces = PythonOperator(task_id='preprocess_data',
                           python_callable=preprocess_churn,
                           op_kwargs={
                               'data_path': data_path,
                               'base_path': base_path
                           },
                           dag=dag)

train = PythonOperator(task_id='train_model',
                       python_callable=train_model,