def test_schema_validation():
    conn = {
        "source_type": "MSSQL",
        "host": SQL_SERVER_HOST,
        "user": SQL_SERVER_USER,
        "password": SQL_SERVER_PASSWORD,
        "port": 1433,
        "database": "guestbook",
    }
    config = {
        consts.CONFIG_SOURCE_CONN: conn,
        consts.CONFIG_TARGET_CONN: conn,
        consts.CONFIG_TYPE: "Schema",
        consts.CONFIG_SCHEMA_NAME: "dbo",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_FORMAT: "table",
    }

    validator = data_validation.DataValidation(config, verbose=True)
    df = validator.execute()
    for validation in df.to_dict(orient="records"):
        assert validation["validation_status"] == consts.VALIDATION_STATUS_SUCCESS

def test_count_validator(count_config):
    validator = data_validation.DataValidation(count_config, verbose=True)
    df = validator.execute()

    count_value = df[df["validation_name"] == "count"]["source_agg_value"].values[0]
    count_string_value = df[df["validation_name"] == "count_string_col"][
        "source_agg_value"
    ].values[0]
    avg_float_value = df[df["validation_name"] == "avg_float_col"][
        "source_agg_value"
    ].values[0]
    max_timestamp_value = df[df["validation_name"] == "max_timestamp_col"][
        "source_agg_value"
    ].values[0]
    min_int_value = df[df["validation_name"] == "min_int_col"][
        "source_agg_value"
    ].values[0]

    assert float(count_value) > 0
    assert float(count_string_value) > 0
    assert float(avg_float_value) > 0
    assert datetime.datetime.strptime(
        max_timestamp_value,
        "%Y-%m-%d %H:%M:%S%z",
    ) > datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    assert float(min_int_value) > 0

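# A minimal sketch of what the count_config fixture consumed above could
# return, not the repo's actual fixture: the column names string_col,
# float_col, timestamp_col, and int_col are assumptions inferred from the
# validation aliases, and CONN/consts are assumed to be in scope.
import pytest


@pytest.fixture
def count_config():
    return {
        consts.CONFIG_SOURCE_CONN: CONN,
        consts.CONFIG_TARGET_CONN: CONN,
        consts.CONFIG_TYPE: "Column",
        consts.CONFIG_SCHEMA_NAME: "public",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_AGGREGATES: [
            # COUNT(*) over the whole table
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: None,
                consts.CONFIG_TARGET_COLUMN: None,
                consts.CONFIG_FIELD_ALIAS: "count",
            },
            # Per-column aggregates matching the aliases asserted in the test
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: "string_col",
                consts.CONFIG_TARGET_COLUMN: "string_col",
                consts.CONFIG_FIELD_ALIAS: "count_string_col",
            },
            {
                consts.CONFIG_TYPE: "avg",
                consts.CONFIG_SOURCE_COLUMN: "float_col",
                consts.CONFIG_TARGET_COLUMN: "float_col",
                consts.CONFIG_FIELD_ALIAS: "avg_float_col",
            },
            {
                consts.CONFIG_TYPE: "max",
                consts.CONFIG_SOURCE_COLUMN: "timestamp_col",
                consts.CONFIG_TARGET_COLUMN: "timestamp_col",
                consts.CONFIG_FIELD_ALIAS: "max_timestamp_col",
            },
            {
                consts.CONFIG_TYPE: "min",
                consts.CONFIG_SOURCE_COLUMN: "int_col",
                consts.CONFIG_TARGET_COLUMN: "int_col",
                consts.CONFIG_FIELD_ALIAS: "min_int_col",
            },
        ],
        consts.CONFIG_FORMAT: "table",
    }
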
def test_count_validator():
    validator = data_validation.DataValidation(CONFIG_COUNT_VALID, verbose=True)
    df = validator.execute()

    count_value = df[df["validation_name"] == "count"]["source_agg_value"].values[0]
    count_tripduration_value = df[df["validation_name"] == "count_tripduration"][
        "source_agg_value"
    ].values[0]
    avg_tripduration_value = df[df["validation_name"] == "avg_tripduration"][
        "source_agg_value"
    ].values[0]
    max_birth_year_value = df[df["validation_name"] == "max_birth_year"][
        "source_agg_value"
    ].values[0]
    min_birth_year_value = df[df["validation_name"] == "min_birth_year"][
        "source_agg_value"
    ].values[0]

    assert float(count_value) > 0
    assert float(count_tripduration_value) > 0
    assert float(avg_tripduration_value) > 0
    assert float(max_birth_year_value) > 0
    assert float(min_birth_year_value) > 0
    assert (
        df["source_agg_value"].astype(float).sum()
        == df["target_agg_value"].astype(float).sum()
    )

def test_numeric_types():
    validator = data_validation.DataValidation(CONFIG_NUMERIC_AGG_VALID, verbose=True)
    df = validator.execute()
    for validation in df.to_dict(orient="records"):
        assert float(validation["source_agg_value"]) == float(
            validation["target_agg_value"]
        )

def test_mysql_count_invalid_host():
    try:
        data_validator = data_validation.DataValidation(
            CONFIG_COUNT_VALID,
            verbose=False,
        )
        df = data_validator.execute()
        assert df["source_agg_value"][0] == df["target_agg_value"][0]
    except exceptions.DataClientConnectionFailure:
        # Local testing will not work for MySQL
        pass

def test_schema_validation():
    try:
        data_validator = data_validation.DataValidation(
            CONFIG_SCHEMA_VALID,
            verbose=False,
        )
        df = data_validator.execute()
        for validation in df.to_dict(orient="records"):
            assert validation["validation_status"] == consts.VALIDATION_STATUS_SUCCESS
    except exceptions.DataClientConnectionFailure:
        # Local testing will not work for MySQL
        pass

def test_grouped_count_validator(grouped_config):
    validator = data_validation.DataValidation(grouped_config, verbose=True)
    df = validator.execute()
    rows = list(df[df["validation_name"] == "count"].iterrows())

    # Check that all partitions are unique.
    partitions = frozenset(df["group_by_columns"])
    assert len(rows) == len(partitions)
    assert len(rows) > 1
    assert df["source_agg_value"].sum() == df["target_agg_value"].sum()

    for _, row in rows:
        assert float(row["source_agg_value"]) > 0
        assert row["source_agg_value"] == row["target_agg_value"]

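# A minimal sketch of the grouped_config fixture consumed above, not the repo's
# actual fixture: the partition column "station_id" is a placeholder, and the
# grouped-column entry shape (consts.CONFIG_GROUPED_COLUMNS with a cast field)
# is an assumption about this library's config format.
@pytest.fixture
def grouped_config():
    return {
        consts.CONFIG_SOURCE_CONN: CONN,
        consts.CONFIG_TARGET_CONN: CONN,
        consts.CONFIG_TYPE: "Column",
        consts.CONFIG_SCHEMA_NAME: "public",
        consts.CONFIG_TABLE_NAME: "entries",
        # One result row is produced per distinct value of the grouped column,
        # which is what the partition-uniqueness assertions above rely on.
        consts.CONFIG_GROUPED_COLUMNS: [
            {
                consts.CONFIG_FIELD_ALIAS: "station_id",
                consts.CONFIG_SOURCE_COLUMN: "station_id",
                consts.CONFIG_TARGET_COLUMN: "station_id",
                consts.CONFIG_CAST: None,
            },
        ],
        consts.CONFIG_AGGREGATES: [
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: None,
                consts.CONFIG_TARGET_COLUMN: None,
                consts.CONFIG_FIELD_ALIAS: "count",
            },
        ],
        consts.CONFIG_FORMAT: "table",
    }
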
def test_postgres_count(cloud_sql):
    """Test count validation on Postgres instance"""
    config_count_valid = {
        # Postgres Specific Connection Config
        consts.CONFIG_SOURCE_CONN: CONN,
        consts.CONFIG_TARGET_CONN: CONN,
        # Validation Type
        consts.CONFIG_TYPE: "Column",
        # Configuration Required Depending on Validator Type
        consts.CONFIG_SCHEMA_NAME: "public",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_AGGREGATES: [
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: None,
                consts.CONFIG_TARGET_COLUMN: None,
                consts.CONFIG_FIELD_ALIAS: "count",
            },
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: "guestname",
                consts.CONFIG_TARGET_COLUMN: "guestname",
                consts.CONFIG_FIELD_ALIAS: "count_guestname",
            },
            {
                consts.CONFIG_TYPE: "sum",
                consts.CONFIG_SOURCE_COLUMN: "entryid",
                consts.CONFIG_TARGET_COLUMN: "entryid",
                consts.CONFIG_FIELD_ALIAS: "sum_entryid",
            },
        ],
        consts.CONFIG_FORMAT: "table",
    }

    data_validator = data_validation.DataValidation(
        config_count_valid,
        verbose=False,
    )
    df = data_validator.execute()
    assert df["source_agg_value"].equals(df["target_agg_value"])
    # Agg values are strings, so "28" sorts before "7" lexicographically.
    assert sorted(list(df["source_agg_value"])) == ["28", "7", "7"]

def test_sql_server_count(cloud_sql):
    """Test count validation on SQL Server instance"""
    conn = {
        "source_type": "MSSQL",
        "host": SQL_SERVER_HOST,
        "user": SQL_SERVER_USER,
        "password": SQL_SERVER_PASSWORD,
        "port": 1433,
        "database": "guestbook",
    }
    config_count_valid = {
        # SQL Server Specific Connection Config
        consts.CONFIG_SOURCE_CONN: conn,
        consts.CONFIG_TARGET_CONN: conn,
        # Validation Type
        consts.CONFIG_TYPE: "Column",
        # Configuration Required Depending on Validator Type
        consts.CONFIG_SCHEMA_NAME: "dbo",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_AGGREGATES: [
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: None,
                consts.CONFIG_TARGET_COLUMN: None,
                consts.CONFIG_FIELD_ALIAS: "count",
            },
        ],
        consts.CONFIG_FORMAT: "table",
    }

    data_validator = data_validation.DataValidation(
        config_count_valid,
        verbose=False,
    )
    df = data_validator.execute()
    assert df["source_agg_value"][0] == df["target_agg_value"][0]

def test_schema_validation(cloud_sql):
    """Test schema validation on Postgres instance"""
    config_schema_valid = {
        consts.CONFIG_SOURCE_CONN: CONN,
        consts.CONFIG_TARGET_CONN: CONN,
        consts.CONFIG_TYPE: "Schema",
        consts.CONFIG_SCHEMA_NAME: "public",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_FORMAT: "table",
    }

    data_validator = data_validation.DataValidation(
        config_schema_valid,
        verbose=False,
    )
    df = data_validator.execute()
    for validation in df.to_dict(orient="records"):
        assert validation["validation_status"] == consts.VALIDATION_STATUS_SUCCESS

def test_execute(module_under_test, fs):
    num_rows = 1
    source_data = _generate_fake_data(rows=num_rows, second_range=0)
    _create_table_file(SOURCE_TABLE_FILE_PATH, _get_fake_json_data(source_data))

    # Create target data with new field
    target_data = _generate_fake_data(
        rows=num_rows, second_range=0, rename_columns={"id": "id_new"}
    )
    _create_table_file(TARGET_TABLE_FILE_PATH, _get_fake_json_data(target_data))

    dv_client = data_validation.DataValidation(SAMPLE_SCHEMA_CONFIG, verbose=True)
    result_df = dv_client.schema_validator.execute()
    failures = result_df[
        result_df["validation_status"].str.contains(consts.VALIDATION_STATUS_FAIL)
    ]

    assert len(result_df) == len(source_data[0]) + 1
    assert result_df["source_agg_value"].astype(float).sum() == 7
    assert result_df["target_agg_value"].astype(float).sum() == 7
    assert result_df.labels[0] == SAMPLE_SCHEMA_CONFIG[consts.CONFIG_LABELS]
    assert failures["source_column_name"].to_list() == ["id", "N/A"]
    assert failures["target_column_name"].to_list() == ["N/A", "id_new"]

def validation_function(project):
    from data_validation import data_validation
    from data_validation.result_handlers import bigquery as bqhandler

    BQ_CONN = {"source_type": "BigQuery", "project_id": project}
    GROUPED_CONFIG_COUNT_VALID = {
        # BigQuery Specific Connection Config
        "source_conn": BQ_CONN,
        "target_conn": BQ_CONN,
        # Validation Type
        "type": "Column",
        # Configuration Required Depending on Validator Type
        "schema_name": "bigquery-public-data.new_york_citibike",
        "table_name": "citibike_trips",
        "aggregates": [
            {
                "field_alias": "count",
                # Tool will run a 'COUNT *' as the default aggregation
                "source_column": None,
                "target_column": None,
                "type": "count",
            },
        ],
    }

    handler = bqhandler.BigQueryResultHandler.get_handler_for_project(project)
    validator = data_validation.DataValidation(
        GROUPED_CONFIG_COUNT_VALID, verbose=True, result_handler=handler
    )
    validator.execute()

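# Hypothetical driver for the function above; "my-gcp-project" is a
# placeholder, and this assumes application default credentials with access to
# the public new_york_citibike dataset. Results are written to BigQuery via
# the handler rather than returned.
if __name__ == "__main__":
    validation_function("my-gcp-project")
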
def test_schema_validation():
    validator = data_validation.DataValidation(CONFIG_SCHEMA_VALIDATION, verbose=True)
    df = validator.execute()
    for validation in df.to_dict(orient="records"):
        assert validation["validation_status"] == consts.VALIDATION_STATUS_SUCCESS

def test_count_validator():
    validator = data_validation.DataValidation(TERADATA_CONFIG, verbose=True)
    df = validator.execute()
    assert int(df["source_agg_value"][0]) > 0
    assert df["source_agg_value"][0] == df["target_agg_value"][0]

"target_column": "tripduration", "type": "sum", }, { "field_alias": "max__tripduration", "source_column": "tripduration", "target_column": "tripduration", "type": "max", }, { "field_alias": "min__tripduration", "source_column": "tripduration", "target_column": "tripduration", "type": "min", }, { "field_alias": "avg__tripduration", "source_column": "tripduration", "target_column": "tripduration", "type": "avg", }, ], } result_handler = bqhandler.BigQueryResultHandler.get_handler_for_project( PROJECT_ID) validator = data_validation.DataValidation(CONFIG_COUNT_VALID, verbose=True, result_handler=result_handler) df = validator.execute()
def test_timestamp_aggs():
    validator = data_validation.DataValidation(CONFIG_TIMESTAMP_AGGS)
    df = validator.execute()
    for validation in df.to_dict(orient="records"):
        assert validation["source_agg_value"] == validation["target_agg_value"]

def validate(config):
    """Run Data Validation against the supplied config."""
    validator = data_validation.DataValidation(config)
    df = validator.execute()
    return _clean_dataframe(df)
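
# A minimal sketch of calling the validate() helper above, reusing the config
# shape seen throughout these tests; CONN, consts, and _clean_dataframe are
# assumed to be defined in this module, and the assertion assumes the cleaned
# frame keeps the source/target agg value columns.
def test_validate_count_smoke():
    config = {
        consts.CONFIG_SOURCE_CONN: CONN,
        consts.CONFIG_TARGET_CONN: CONN,
        consts.CONFIG_TYPE: "Column",
        consts.CONFIG_SCHEMA_NAME: "public",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_AGGREGATES: [
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: None,
                consts.CONFIG_TARGET_COLUMN: None,
                consts.CONFIG_FIELD_ALIAS: "count",
            },
        ],
        consts.CONFIG_FORMAT: "table",
    }
    df = validate(config)
    assert (df["source_agg_value"] == df["target_agg_value"]).all()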