def test_ge_generally(csvpath: Path):
    """Smoke-test the core great_expectations PandasDataset API on a chess CSV fixture."""
    # A PandasDataset is a pd.DataFrame, unless GE ever switches over to not subclassing.
    assert issubclass(PandasDataset, pd.DataFrame)
    dataset: PandasDataset = ge.read_csv(str(csvpath))

    # A column-existence expectation succeeds for a known column.
    existence: ExpectationSuiteValidationResult = dataset.expect_column_to_exist("opening_eco")
    assert existence.success

    # A column value drawn from a known set also passes.
    assert dataset.expect_column_values_to_be_in_set("opening_eco", ["A22"]).success

    # Columns carry the expected pandas dtypes.
    expected_types = {
        "opening_eco": "object",
        "black_rating": "int64",
        "rated": "bool",
        "created_at": "float64",
    }
    for col, et in expected_types.items():
        assert dataset.expect_column_values_to_be_of_type(col, et).success

    # There are a bunch of patzers in this dataset ;-) — harrygz in the test data
    # fixture has a rating of 2100 in order to simulate a failure.
    assert not dataset.expect_column_values_to_be_between(
        column="white_rating", min_value=500, max_value=2000
    ).success
    # Black is out of range too! ;-) (chess joke)
    assert not dataset.expect_column_values_to_be_between(
        column="black_rating", min_value=500, max_value=2000
    ).success

    # Great Expectations lets you "save" the suite of expectations called on a df
    # and reuse that suite generally — exercise that round trip.
    suite: ExpectationSuite = dataset.get_expectation_suite()
    assert len(suite.expectations) == 6
    assert suite.expectations[0].expectation_type == "expect_column_to_exist"

    # Persist the suite (keeping failed expectations) to a file...
    suite_path = str(csvpath.parent / "suite.json")
    dataset.save_expectation_suite(filepath=suite_path, discard_failed_expectations=False)

    # ...then load a fresh dataframe and validate it against the saved suite.
    reloaded: PandasDataset = ge.read_csv(str(csvpath))
    report: ExpectationValidationResult = reloaded.validate(
        expectation_suite=suite_path
    )
    assert not report.success
    assert len(report.results) == 8
    failed: ExpectationValidationResult = report.results[7]
    assert not failed.success
    assert failed.result["partial_unexpected_list"] == [2100, 2001]
def _open_file(self, file):
    """Open *file* via ge.read_csv, transparently handling gzip-compressed CSVs.

    :param file: path to the CSV (optionally .gz) file.
    :return: the great_expectations dataset produced by ge.read_csv.
    :raises RuntimeError: if the path is not an existing file.
    """
    # Guard clause first so the happy path is unindented.
    if not os.path.isfile(file):
        raise RuntimeError("File {0} doesn't exist".format(file))
    extension = self._get_file_extension(file)
    # Fixed: compare with the idiomatic `==` operator instead of calling
    # extension.__eq__(".gz") directly (dunder calls are non-idiomatic and can
    # return NotImplemented instead of a bool).
    if extension == ".gz":
        return ge.read_csv(filename=file, compression="gzip", names=self.columns)
    return ge.read_csv(filename=file, names=self.columns)
def execute_great_expectations_test_case(title, input_file_path, expectations_config_path):
    """Run SuperPipeline on an input file and validate its output with Great Expectations.

    :param title: human-readable name of the test case.
    :param input_file_path: path to the pipeline input file.
    :param expectations_config_path: path to the JSON expectations config for the output.
    :return: None. Asserts correctness of results via process_validation_results.
    """
    # Run the pipeline under test; its output lands next to this test module.
    out_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test_out.csv')
    SuperPipeline().process_file(input_file_path, out_file_path)

    # Load the output dataset into Great Expectations together with the
    # expectations read from the config file, then validate it.
    with open(expectations_config_path) as f:
        expectations_config = json.load(f)
    output_df = ge.read_csv(out_file_path, expectations_config=expectations_config)
    validation_results = output_df.validate(
        result_format="SUMMARY", catch_exceptions=True
    )['results']

    # Delegate the assertions: fails listing all unmet expectations, if any.
    process_validation_results(validation_results)
def test_validate_with_invalid_result(validate_result_dict):
    """Validation results containing exceptions should match the stored fixture."""
    with open(
        file_relative_path(__file__, "./test_sets/titanic_expectations.json")
    ) as f:
        my_expectation_suite = expectationSuiteSchema.loads(f.read())

    # Pin uuid generation so run identifiers are deterministic across runs.
    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
        my_df.set_default_expectation_argument("result_format", "COMPLETE")

        results = my_df.validate()  # catch_exceptions=True is default

        fixture_path = file_relative_path(
            __file__,
            "./test_sets/titanic_expected_data_asset_validate_results_with_exceptions.json",
        )
        with open(fixture_path) as f:
            expected_results = expectationSuiteValidationResultSchema.loads(f.read())

        # Strip environment-dependent metadata before comparing.
        del results.meta["great_expectations_version"]
        del results.meta["expectation_suite_meta"]["great_expectations_version"]
        for result in results.results:
            result.exception_info.pop("exception_traceback")

        assert results.to_json_dict() == expected_results.to_json_dict()
def __init__(self, *args, **kwargs):
    """Load the distributional-expectations fixture and its reference partitions."""
    # Explicit two-argument super kept for Python 2 compatibility (file still guards PY2).
    super(TestUtilMethods, self).__init__(*args, **kwargs)
    self.D = ge.read_csv(
        './tests/test_sets/distributional_expectations_data_base.csv')
    with open('./tests/test_sets/test_partitions.json', 'r') as file:
        self.test_partitions = json.load(file)
def test_execute_expectation_suite_failure():
    """A failing expectation should surface as success=False with the full EVR in metadata."""
    df = ge.read_csv(file_relative_path(__file__, './num_bad_data.csv'))
    validation = df.validate(
        expectation_suite=file_relative_path(__file__, 'num_expectations.json')
    )
    ers = expectation_result_list_from_validation(validation)

    # The first two expectations pass; the mean check fails on the bad data.
    assert ers[0].success is True
    assert ers[1].success is True
    assert ers[2].success is False

    expected_evr = {
        'success': False,
        'result': {
            'observed_value': -2.0,
            'element_count': 2,
            'missing_count': 0,
            'missing_percent': 0.0,
        },
        'exception_info': {
            'raised_exception': False,
            'exception_message': None,
            'exception_traceback': None,
        },
        'expectation_config': {
            'expectation_type': 'expect_column_mean_to_be_between',
            'kwargs': {'column': 'num1', 'min_value': 0, 'max_value': 10},
        },
    }
    assert ers[2].metadata_entries[0].entry_data.data == expected_evr
def test_infer_distribution_parameters(self):
    """infer_distribution_parameters derives sensible params for each supported distribution."""
    D = ge.read_csv(
        './tests/test_sets/fixed_distributional_test_dataset.csv')

    # Malformed `params` must be rejected.
    with self.assertRaises(TypeError):
        ge.dataset.util.infer_distribution_parameters(
            data=D.norm, distribution='norm', params=['wrong_param_format'])

    # normal: mean/std inferred from the sample; loc/scale defaulted.
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.norm_std, distribution='norm', params=None)
    self.assertEqual(t['mean'], D.norm_std.mean())
    self.assertEqual(t['std_dev'], D.norm_std.std())
    self.assertEqual(t['loc'], 0)
    self.assertEqual(t['scale'], 1)

    # beta: alpha/beta derived from the sample mean and std.
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.beta, distribution='beta')
    self.assertEqual(
        t['alpha'],
        (t['mean'] ** 2) * (((1 - t['mean']) / t['std_dev'] ** 2) - (1 / t['mean'])),
        "beta dist, alpha infer")
    self.assertEqual(
        t['beta'], t['alpha'] * ((1 / t['mean']) - 1), "beta dist, beta infer")

    # gamma: shape parameter equals the sample mean.
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.gamma, distribution='gamma')
    self.assertEqual(t['alpha'], D.gamma.mean())

    # uniform: min/max inferred from the sample range...
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.uniform, distribution='uniform')
    self.assertEqual(t['min'], min(D.uniform), "uniform, min infer")
    self.assertEqual(t['max'], max(D.uniform) - min(D.uniform),
                     "uniform, max infer")

    # ...or taken from explicit loc/scale overrides when provided.
    uni_loc = 5
    uni_scale = 10
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.uniform, distribution='uniform',
        params={'loc': uni_loc, 'scale': uni_scale})
    self.assertEqual(t['min'], uni_loc, "uniform, min infer")
    self.assertEqual(t['max'], uni_scale, "uniform, max infer")

    # Unknown distribution names are rejected.
    # (The original comment said "expon distribution", but this check exercises
    # an invalid distribution name.)
    with self.assertRaises(AttributeError):
        ge.dataset.util.infer_distribution_parameters(
            data=D.norm, distribution='fakedistribution')

    # chi2: degrees of freedom equal the sample mean.
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.chi2, distribution='chi2')
    self.assertEqual(t['df'], D.chi2.mean())
def titanic_validator(titanic_data_context_modular_api):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    titanic_df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))
    return get_pandas_runtime_validator(titanic_data_context_modular_api, titanic_df)
def test_validate(self):
    """Full validate run against the stored 2018-03-03 results, plus only_return_failures."""
    with open("./tests/test_sets/titanic_expectations.json") as f:
        my_expectations_config = json.load(f)
    my_df = ge.read_csv(
        "./tests/test_sets/Titanic.csv", expectations_config=my_expectations_config
    )
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    results = my_df.validate(catch_exceptions=False)

    with open('./tests/test_sets/expected_results_20180303.json') as f:
        expected_results = json.load(f)

    self.maxDiff = None
    assertDeepAlmostEqual(self, results, expected_results)

    # Mutate the results to confirm the comparison is actually sensitive.
    results[0] = {}
    self.assertNotEqual(results, expected_results)

    # only_return_failures keeps just the failing expectation while leaving the
    # "success" and "statistics" fields unaffected.
    validation_results = my_df.validate(only_return_failures=True)
    assertDeepAlmostEqual(
        self,
        validation_results,
        {
            "results": [
                {
                    "expectation_config": {
                        "expectation_type": "expect_column_values_to_be_in_set",
                        "kwargs": {"column": "PClass", "values_set": ["1st", "2nd", "3rd"], "result_format": "COMPLETE"},
                    },
                    "success": False,
                    "exception_info": {"exception_message": None,
                                       "exception_traceback": None,
                                       "raised_exception": False},
                    "result": {"partial_unexpected_index_list": [456],
                               "unexpected_count": 1,
                               "unexpected_list": ["*"],
                               "unexpected_percent": 0.0007616146230007616,
                               "element_count": 1313,
                               "missing_percent": 0.0,
                               "partial_unexpected_counts": [{"count": 1, "value": "*"}],
                               "partial_unexpected_list": ["*"],
                               "unexpected_percent_nonmissing": 0.0007616146230007616,
                               "missing_count": 0,
                               "unexpected_index_list": [456]},
                }
            ],
            "success": expected_results["success"],        # unaffected
            "statistics": expected_results["statistics"],  # unaffected
        },
    )
def test_snapshot_BasicDatasetProfiler_on_titanic():
    """
    A snapshot regression test for BasicDatasetProfiler: profile the Titanic
    dataset and compare the resulting EVRs to ones retrieved from a previously
    stored file.
    """
    df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))
    suite, evrs = df.profile(BasicDatasetProfiler)

    # BasicDatasetProfiler must add meta.columns with a single "description"
    # field for each column.
    assert "columns" in suite.meta
    for k, v in suite.meta["columns"].items():
        assert v == {"description": ""}

    # Note: the above already produces an EVR; rerunning isn't strictly necessary just for EVRs
    evrs = df.validate(result_format="SUMMARY")

    # THIS IS NOT DEAD CODE. UNCOMMENT TO SAVE A SNAPSHOT WHEN UPDATING THIS TEST
    # with open('tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json', 'w+') as file:
    #     json.dump(expectationSuiteValidationResultSchema.dump(evrs).data, file, indent=2)
    #
    # with open('tests/render/fixtures/BasicDatasetProfiler_evrs.json', 'w+') as file:
    #     json.dump(expectationSuiteValidationResultSchema.dump(evrs).data, file, indent=2)

    with open(
        file_relative_path(
            __file__, "../test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json"
        ),
        "r",
    ) as file:
        expected_evrs = expectationSuiteValidationResultSchema.load(
            json.load(file, object_pairs_hook=OrderedDict)).data

    # Python 2 does not guarantee value_counts order, which reorders items in the
    # partial_unexpected_value_counts list — drop them before the comparison.
    for result in evrs.results:
        if "partial_unexpected_counts" in result.result:
            result.result.pop("partial_unexpected_counts")
    for result in expected_evrs.results:
        if "partial_unexpected_counts" in result.result:
            result.result.pop("partial_unexpected_counts")

    # Version and RUN-ID differ per run — strip them before comparing.
    del expected_evrs.meta["great_expectations.__version__"]
    del evrs.meta["great_expectations.__version__"]
    del expected_evrs.meta["run_id"]
    del evrs.meta["run_id"]
    del evrs.meta["batch_kwargs"]["ge_batch_id"]

    # DISABLE TEST IN PY2 BECAUSE OF ORDER ISSUE AND NEAR-EOL
    if not PY2:
        assert expected_evrs == evrs
def taxi_validator_sqlalchemy(titanic_data_context_modular_api):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    taxi_df = ge.read_csv(
        file_relative_path(__file__, "../test_sets/yellow_tripdata_sample_2019-01.csv"),
        parse_dates=["pickup_datetime", "dropoff_datetime"],
    )
    return get_sqlalchemy_runtime_validator_postgresql(taxi_df)
def __init__(self, *args, **kwargs):
    """Load the distributional-expectations fixture and its reference partitions."""
    super().__init__(*args, **kwargs)
    self.D = ge.read_csv(
        file_relative_path(
            __file__, "../test_sets/distributional_expectations_data_base.csv"
        )
    )
    partitions_path = file_relative_path(__file__, "../test_sets/test_partitions.json")
    with open(partitions_path) as file:
        self.test_partitions = json.load(file)
def test_custom_class(self):
    # Verifies that a dataset_class passed to ge.read_csv (here CustomPandasDataset)
    # exposes its custom expectation (expect_column_values_to_be_prime) and that the
    # expectation reports all non-prime values in unexpected_list.
    script_path = os.path.dirname(os.path.realpath(__file__))
    df = ge.read_csv(
        script_path+'/test_sets/Titanic.csv',
        dataset_class=CustomPandasDataset
    )
    df.set_default_expectation_argument("result_format", "COMPLETE")
    # Every non-prime Age value in the Titanic fixture must appear in unexpected_list,
    # in row order. The literal below is the full expected snapshot.
    self.assertEqual(
        df.expect_column_values_to_be_prime(
            'Age')['result']['unexpected_list'],
        [30.0, 25.0, 0.92000000000000004, 63.0, 39.0, 58.0, 50.0, 24.0, 36.0, 26.0, 25.0, 25.0, 28.0, 45.0, 39.0,
         30.0, 58.0, 45.0, 22.0, 48.0, 44.0, 60.0, 45.0, 58.0, 36.0, 33.0, 36.0, 36.0, 14.0, 49.0, 36.0, 46.0,
         27.0, 27.0, 26.0, 64.0, 39.0, 55.0, 70.0, 69.0, 36.0, 39.0, 38.0, 27.0, 27.0, 4.0, 27.0, 50.0, 48.0,
         49.0, 48.0, 39.0, 36.0, 30.0, 24.0, 28.0, 64.0, 60.0, 49.0, 44.0, 22.0, 60.0, 48.0, 35.0, 22.0, 45.0,
         49.0, 54.0, 38.0, 58.0, 45.0, 46.0, 25.0, 21.0, 48.0, 49.0, 45.0, 36.0, 55.0, 52.0, 24.0, 16.0, 44.0,
         51.0, 42.0, 35.0, 35.0, 38.0, 35.0, 50.0, 49.0, 46.0, 58.0, 42.0, 40.0, 42.0, 55.0, 50.0, 16.0, 21.0,
         30.0, 15.0, 30.0, 46.0, 54.0, 36.0, 28.0, 65.0, 33.0, 44.0, 55.0, 36.0, 58.0, 64.0, 64.0, 22.0, 28.0,
         22.0, 18.0, 52.0, 46.0, 56.0, 33.0, 27.0, 55.0, 54.0, 48.0, 18.0, 21.0, 34.0, 40.0, 36.0, 50.0, 39.0,
         56.0, 28.0, 56.0, 56.0, 24.0, 18.0, 24.0, 45.0, 40.0, 6.0, 57.0, 32.0, 62.0, 54.0, 52.0, 62.0, 63.0,
         46.0, 52.0, 39.0, 18.0, 48.0, 49.0, 39.0, 46.0, 64.0, 60.0, 60.0, 55.0, 54.0, 21.0, 57.0, 45.0, 50.0,
         50.0, 27.0, 20.0, 51.0, 21.0, 36.0, 40.0, 32.0, 33.0, 30.0, 28.0, 18.0, 34.0, 32.0, 57.0, 18.0, 36.0,
         28.0, 51.0, 32.0, 28.0, 36.0, 4.0, 1.0, 12.0, 34.0, 26.0, 27.0, 15.0, 45.0, 40.0, 20.0, 25.0, 36.0,
         25.0, 42.0, 26.0, 26.0, 0.82999999999999996, 54.0, 44.0, 52.0, 30.0, 30.0, 27.0, 24.0, 35.0, 8.0, 22.0,
         30.0, 20.0, 21.0, 49.0, 8.0, 28.0, 18.0, 28.0, 22.0, 25.0, 18.0, 32.0, 18.0, 42.0, 34.0, 8.0, 21.0,
         38.0, 38.0, 35.0, 35.0, 38.0, 24.0, 16.0, 26.0, 45.0, 24.0, 21.0, 22.0, 34.0, 30.0, 50.0, 30.0, 1.0,
         44.0, 28.0, 6.0, 30.0, 45.0, 24.0, 24.0, 49.0, 48.0, 34.0, 32.0, 21.0, 18.0, 21.0, 52.0, 42.0, 36.0,
         21.0, 33.0, 34.0, 22.0, 45.0,
         30.0, 26.0, 34.0, 26.0, 22.0, 1.0, 25.0, 48.0, 57.0, 27.0, 30.0, 20.0, 45.0, 46.0, 30.0, 48.0, 54.0,
         64.0, 32.0, 18.0, 32.0, 26.0, 20.0, 39.0, 22.0, 24.0, 28.0, 50.0, 20.0, 40.0, 42.0, 21.0, 32.0, 34.0,
         33.0, 8.0, 36.0, 34.0, 30.0, 28.0, 0.80000000000000004, 25.0, 50.0, 21.0, 25.0, 18.0, 20.0, 30.0, 30.0,
         35.0, 22.0, 25.0, 25.0, 14.0, 50.0, 22.0, 27.0, 27.0, 30.0, 22.0, 35.0, 30.0, 28.0, 12.0, 40.0, 36.0,
         28.0, 32.0, 4.0, 36.0, 33.0, 32.0, 26.0, 30.0, 24.0, 18.0, 42.0, 16.0, 35.0, 16.0, 25.0, 18.0, 20.0,
         30.0, 26.0, 40.0, 24.0, 18.0, 0.82999999999999996, 20.0, 25.0, 35.0, 32.0, 20.0, 39.0, 39.0, 6.0, 38.0,
         9.0, 26.0, 4.0, 20.0, 26.0, 25.0, 18.0, 24.0, 35.0, 40.0, 38.0, 9.0, 45.0, 27.0, 20.0, 32.0, 33.0,
         18.0, 40.0, 26.0, 15.0, 45.0, 18.0, 27.0, 22.0, 26.0, 22.0, 20.0, 32.0, 21.0, 18.0, 26.0, 6.0, 9.0,
         40.0, 32.0, 26.0, 18.0, 20.0, 22.0, 22.0, 35.0, 21.0, 20.0, 18.0, 18.0, 38.0, 30.0, 21.0, 21.0, 21.0,
         24.0, 33.0, 33.0, 28.0, 16.0, 28.0, 24.0, 21.0, 32.0, 26.0, 18.0, 20.0, 24.0, 24.0, 36.0, 30.0, 22.0,
         35.0, 27.0, 30.0, 36.0, 9.0, 44.0, 45.0, 22.0, 30.0, 34.0, 28.0, 0.33000000000000002, 27.0, 25.0, 24.0,
         22.0, 21.0, 26.0, 33.0, 1.0, 0.17000000000000001, 25.0, 36.0, 36.0, 30.0, 26.0, 65.0, 42.0, 32.0, 30.0,
         24.0, 24.0, 24.0, 22.0, 18.0, 16.0, 45.0, 21.0, 18.0, 9.0, 48.0, 16.0, 25.0, 38.0, 22.0, 16.0, 33.0,
         9.0, 38.0, 40.0, 14.0, 16.0, 9.0, 10.0, 6.0, 40.0, 32.0, 20.0, 28.0, 24.0, 28.0, 24.0, 20.0, 45.0,
         26.0, 21.0, 27.0, 18.0, 26.0, 22.0, 28.0, 22.0, 27.0, 42.0, 27.0, 25.0, 27.0, 20.0, 48.0, 34.0, 22.0,
         33.0, 32.0, 26.0, 49.0, 1.0, 33.0, 4.0, 24.0, 32.0, 27.0, 21.0, 32.0, 20.0, 21.0, 30.0, 21.0, 22.0,
         4.0, 39.0, 20.0, 21.0, 44.0, 42.0, 21.0, 24.0, 25.0, 22.0, 22.0, 39.0, 26.0, 4.0, 22.0, 26.0, 1.5,
         36.0, 18.0, 25.0, 22.0, 20.0, 26.0, 22.0, 32.0, 21.0, 21.0, 36.0, 39.0, 25.0, 45.0, 36.0, 30.0, 20.0,
         21.0, 1.5, 25.0, 18.0, 63.0, 18.0, 15.0, 28.0, 36.0, 28.0, 10.0, 36.0, 30.0, 22.0, 14.0, 22.0, 51.0,
         18.0, 45.0, 28.0, 21.0, 27.0, 36.0, 27.0, 15.0, 27.0, 26.0, 22.0,
         24.0]
    )
    # A synthetic column of guaranteed primes must yield an empty unexpected_list.
    primes = [3, 5, 7, 11, 13, 17, 23, 31]
    df["primes"] = df.Age.map(lambda x: random.choice(primes))
    self.assertEqual(
        df.expect_column_values_to_be_prime(
            "primes")['result']['unexpected_list'],
        []
    )
def test_validate(self):
    """ColumnsExistProfiler seeds one expectation per column; re-adding one dedupes."""
    # NOTE(review): this loaded suite is never used below — it appears to only
    # prove the fixture parses; confirm before removing.
    with open("./tests/test_sets/titanic_expectations.json") as f:
        my_expectation_suite = json.load(f)

    my_df = ge.read_csv(
        "./tests/test_sets/Titanic.csv",
        profiler=ge.profile.ColumnsExistProfiler,
    )
    self.assertEqual(len(my_df.get_expectation_suite()['expectations']), 7)

    # For column_expectations, _append_expectation should only replace expectations
    # where the expectation_type AND the column match — so repeating an existing
    # column expectation must not grow the suite.
    my_df.expect_column_to_exist("PClass")
    self.assertEqual(len(my_df.get_expectation_suite()['expectations']), 7)
def test_full_oobe_flow():
    """End-to-end: profile, validate, render the profiling page, and spot-check the HTML."""
    df = ge.read_csv("examples/data/Titanic.csv")
    df.profile(BasicDatasetProfiler)
    evrs = df.validate()  # ["results"]

    rendered_json = ProfilingResultsPageRenderer().render(evrs)
    rendered_page = DefaultJinjaPageView().render(rendered_json)

    # Persist the rendered page for manual inspection.
    with open('./tests/render/output/test_full_oobe_flow.html', 'wb') as f:
        f.write(rendered_page.encode("utf-8"))

    # The output must be a complete HTML document.
    assert rendered_page[:15] == "<!DOCTYPE html>"
    assert rendered_page[-7:] == "</html>"
def test_validate(self):
    """Validate against the 2017-07-21 stored results and check only_return_failures."""
    with open("./tests/test_sets/titanic_expectations.json") as f:
        my_expectations_config = json.load(f)
    my_df = ge.read_csv(
        "./tests/test_sets/Titanic.csv",
        expectations_config=my_expectations_config)
    my_df.set_default_expectation_argument("output_format", "COMPLETE")

    results = my_df.validate(catch_exceptions=False)

    with open('./tests/test_sets/expected_results_20170721.json') as f:
        expected_results = json.load(f)

    self.maxDiff = None
    # NOTE(review): the original carried a "needs to be converted to unicode" note;
    # assertDeepAlmostEqual tolerates minor encoding/float differences here.
    assertDeepAlmostEqual(self, results, expected_results)

    # Mutate the results to confirm the comparison is actually sensitive.
    results[0] = {}
    self.assertNotEqual(results, expected_results)

    # only_return_failures should report just the single failing expectation.
    validation_results = my_df.validate(only_return_failures=True)
    assertDeepAlmostEqual(
        self,
        validation_results,
        {
            "results": [{
                "exception_traceback": None,
                "expectation_type": "expect_column_values_to_be_in_set",
                "success": False,
                "exception_list": ["*"],
                "raised_exception": False,
                "kwargs": {
                    "column": "PClass",
                    "output_format": "COMPLETE",
                    "values_set": ["1st", "2nd", "3rd"]
                },
                "exception_index_list": [456]
            }]
        })
def test_display_column_evrs_as_section():
    # TODO: We should add a fixture that contains EVRs
    df = ge.read_csv("./tests/test_sets/Titanic.csv")
    df.profile(BasicDatasetProfiler)
    evrs = df.validate(result_format="SUMMARY")  # ["results"]

    html_to_display = jux.display_column_evrs_as_section(
        evrs,
        "Name",
        include_styling=False,
        return_without_displaying=True,
    )
    print(html_to_display)

    # FIXME: This isn't a full snapshot test — only spot-checks rendered fragments.
    assert '<div id="section-1" class="ge-section container-fluid">' in html_to_display
    assert '<span class="badge badge-info" >Carlsson, Mr Frans Olof</span>' in html_to_display
    assert '<li class="list-group-item d-flex justify-content-between align-items-center" >expect_column_values_to_be_in_type_list <span class="badge badge-secondary badge-pill" >True</span></li>' in html_to_display
def get_ge_df(self, dataset_name, bucket_name=None, **kwargs):
    """Download an S3 CSV key to a temp file and load it as a great_expectations dataset.

    :param dataset_name: S3 key of the CSV object.
    :param bucket_name: bucket containing the key.
    :param kwargs: passed through to ge.read_csv.
    :return: the dataset produced by ge.read_csv.
    :raises AirflowException: if the key does not exist in the bucket.
    """
    if not self.check_for_key(dataset_name, bucket_name):
        # SECURITY FIX: the previous implementation fetched the AWS access key id
        # and secret access key here and wrote both to the INFO log. Credentials
        # must never be logged, so that code has been removed.
        raise AirflowException("The source key {0} does not exist in bucket {1}".format(dataset_name, bucket_name))

    s3_key_object = self.get_key(dataset_name, bucket_name)
    with NamedTemporaryFile("w") as temp_file:
        self.log.info("Temp dumping S3 file {0} contents to local {1} file".format(dataset_name, temp_file.name))
        s3_key_object.download_file(temp_file.name)
        temp_file.flush()
        # Read while the NamedTemporaryFile is still alive (deleted on close).
        return ge.read_csv(temp_file.name, **kwargs)
def test_full_oobe_flow():
    """End-to-end: profile, validate, render the profiling page, and spot-check the HTML."""
    df = ge.read_csv(file_relative_path(__file__, "../../examples/data/Titanic.csv"))
    df.data_asset_name = "my_datasource/my_generator/my_asset"
    df.profile(BasicDatasetProfiler)
    evrs = df.validate()  # results

    rendered_content = ProfilingResultsPageRenderer().render(evrs)
    rendered_page = DefaultJinjaPageView().render(rendered_content)

    # Persist the rendered page for manual inspection.
    with open(file_relative_path(__file__, './output/test_full_oobe_flow.html'), 'wb') as f:
        f.write(rendered_page.encode("utf-8"))

    # The output must be a complete HTML document.
    assert rendered_page[:15] == "<!DOCTYPE html>"
    assert rendered_page[-7:] == "</html>"
def test_validate_with_invalid_result_catch_exceptions_false(validate_result_dict):
    """With catch_exceptions=False, an invalid cached result raises InvalidCacheValueError."""
    with open(file_relative_path(__file__, "./test_sets/titanic_expectations.json")) as f:
        my_expectation_suite = expectationSuiteSchema.loads(f.read())

    # Pin uuid generation so run identifiers are deterministic.
    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
        my_df.set_default_expectation_argument("result_format", "COMPLETE")
        with pytest.raises(InvalidCacheValueError):
            my_df.validate(catch_exceptions=False)
def test_BasicDatasetProfiler_on_titanic():
    """
    A snapshot test for BasicDatasetProfiler: profile the Titanic dataset and
    compare the EVRs to ones retrieved from a previously stored file.
    """
    df = ge.read_csv("./tests/test_sets/Titanic.csv")
    suite, evrs = df.profile(BasicDatasetProfiler)

    # BasicDatasetProfiler must add meta.columns with a single "description"
    # field per column.
    print(json.dumps(suite["meta"], indent=2))
    assert "columns" in suite["meta"]
    for k, v in suite["meta"]["columns"].items():
        assert v == {"description": ""}

    # Note: the above already produces an EVR; rerunning isn't strictly necessary just for EVRs
    evrs = df.validate(result_format="SUMMARY")  # ["results"]

    # UNCOMMENT TO REGENERATE THE SNAPSHOTS WHEN UPDATING THIS TEST:
    # with open('tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json', 'w+') as file:
    #     file.write(json.dumps(evrs))
    #
    # with open('tests/render/fixtures/BasicDatasetProfiler_evrs.json', 'w+') as file:
    #     file.write(json.dumps(evrs))

    with open(
            'tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json',
            'r') as file:
        expected_evrs = json.load(file, object_pairs_hook=OrderedDict)

    # Metadata varies per run/version; compare the results only.
    expected_evrs.pop("meta")
    evrs.pop("meta")

    # Python 2 does not guarantee value_counts order, which reorders items in the
    # partial_unexpected_value_counts list — drop them before the comparison.
    for result in evrs["results"]:
        if "partial_unexpected_counts" in result["result"]:
            result["result"].pop("partial_unexpected_counts")
    for result in expected_evrs["results"]:
        if "partial_unexpected_counts" in result["result"]:
            result["result"].pop("partial_unexpected_counts")

    # DISABLE TEST IN PY2 BECAUSE OF ORDER ISSUE AND NEAR-EOL
    if not PY2:
        assertDeepAlmostEqual(expected_evrs, evrs)
def _get_dataframe(self):
    """Load the dataset as a great_expectations dataframe based on the configured connection.

    - No connection: read a local CSV file.
    - S3Hook: download the key via ExpectationS3CsvHook.
    - DbApiHook: query via ExpectationMySQLHook.

    :return: the loaded GE dataframe.
    :raises TypeError: for unsupported connection types (previously this fell
        through and silently returned None, deferring the failure to the caller).
    """
    if self.source_conn is None:
        # Use local file
        return ge.read_csv(self.dataset_name, **self.dataset_params)
    if isinstance(self.source_conn, S3Hook):
        hook = ExpectationS3CsvHook(aws_conn_id=self.source_conn_id)
        return hook.get_ge_df(self.dataset_name, self.source_bucket_name, **self.dataset_params)
    if isinstance(self.source_conn, DbApiHook):
        hook = ExpectationMySQLHook(mysql_conn_id=self.source_conn_id)
        return hook.get_ge_df(self.dataset_name, **self.dataset_params)
    # Fixed: fail loudly instead of returning None for unknown connection types.
    raise TypeError(
        "Unsupported source connection type: {0}".format(type(self.source_conn).__name__)
    )
def validate(parsed_args):
    """
    Read a dataset file and validate it using a config saved in another file.
    Uses parameters defined in the dispatch method.

    :param parsed_args: A Namespace object containing parsed arguments from the dispatch method.
    :return: The number of unsuccessful expectations
    """
    parsed_args = vars(parsed_args)
    data_set = parsed_args['dataset']
    expectations_config_file = parsed_args['expectations_config_file']

    # Fixed: open the JSON files with context managers so the handles are closed
    # promptly (previously json.load(open(...)) leaked the file objects).
    with open(expectations_config_file) as f:
        expectations_config = json.load(f)

    if parsed_args["evaluation_parameters"] is not None:
        with open(parsed_args["evaluation_parameters"]) as f:
            evaluation_parameters = json.load(f)
    else:
        evaluation_parameters = None

    if parsed_args["custom_dataset_module"]:
        # Make the custom module importable, then resolve the requested dataset class.
        sys.path.insert(0, os.path.dirname(parsed_args["custom_dataset_module"]))
        module_name = os.path.basename(
            parsed_args["custom_dataset_module"]).split('.')[0]
        custom_module = __import__(module_name)
        dataset_class = getattr(custom_module, parsed_args["custom_dataset_class"])
    else:
        dataset_class = PandasDataset

    df = read_csv(data_set, expectations_config=expectations_config,
                  dataset_class=dataset_class)
    result = df.validate(
        evaluation_parameters=evaluation_parameters,
        result_format=parsed_args["result_format"],
        catch_exceptions=parsed_args["catch_exceptions"],
        only_return_failures=parsed_args["only_return_failures"],
    )
    print(json.dumps(result, indent=2))
    return result['statistics']['unsuccessful_expectations']
def test_validate(self):
    """columns_exist autoinspection seeds 7 expectations; duplicates are not appended."""
    # NOTE(review): this loaded config is never used below — it appears to only
    # prove the fixture parses; confirm before removing.
    with open("./tests/test_sets/titanic_expectations.json") as f:
        my_expectations_config = json.load(f)

    my_df = ge.read_csv(
        "./tests/test_sets/Titanic.csv",
        autoinspect_func=columns_exist)
    self.assertEqual(
        len(my_df.get_expectations_config()['expectations']),
        7
    )

    # For column_expectations, _append_expectation should only replace expectations
    # where the expectation_type AND the column match — so repeating an existing
    # column expectation must not grow the config.
    my_df.expect_column_to_exist("PClass")
    self.assertEqual(
        len(my_df.get_expectations_config()['expectations']),
        7
    )
def test_dummy_ge():
    """create_expectation_result wraps a passing GE EVR into an ExpectationResult."""
    df = ge.read_csv(file_relative_path(__file__, './num.csv'))
    ge_evr = df.expect_column_mean_to_be_between('num1', 0, 10)
    er = create_expectation_result('expect_column_mean_to_be_between', ge_evr)

    assert er.label == 'expect_column_mean_to_be_between'
    assert er.success
    assert len(er.metadata_entries) == 1

    evr_entry = er.metadata_entries[0]
    assert evr_entry.label == 'evr'
    assert evr_entry.entry_data.data == {
        'success': True,
        'result': {
            'observed_value': 2.0,
            'element_count': 2,
            'missing_count': 0,
            'missing_percent': 0.0,
        },
    }
def test_display_column_evrs_as_section(empty_data_context):
    # TODO: We should add a fixture that contains EVRs
    df = ge.read_csv("./tests/test_sets/Titanic.csv")
    df.profile(BasicDatasetProfiler)
    evrs = df.validate(result_format="SUMMARY")  # ["results"]

    html_to_display = jux.display_column_evrs_as_section(
        evrs,
        "Name",
        include_styling=False,
        return_without_displaying=True,
    )
    print(html_to_display)

    # FIXME: This isn't a full snapshot test — only spot-checks rendered fragments.
    assert '<div id="section-1" class="ge-section container-fluid">' in html_to_display
    assert '<span class="badge badge-info" style="word-break:break-all;" >Carlsson, Mr Frans Olof</span>' in html_to_display
    # NOTE(review): the exact internal whitespace of this multi-line snippet could
    # not be recovered from the mangled source — verify against the renderer output.
    assert """\
<span class="cooltip" >
    Type: None
    <span class=top>
        expect_column_values_to_be_of_type
        <br>expect_column_values_to_be_in_type_list
    </span>
</span>""" in html_to_display
def test_validate_with_invalid_result_catch_exceptions_false(empty_data_context):
    """With catch_exceptions=False, an invalid cached result raises InvalidCacheValueError
    (while also warning that no great_expectations version was found)."""
    context: DataContext = empty_data_context
    with open(
        file_relative_path(__file__, "./test_sets/titanic_expectations.json")
    ) as f:
        my_expectation_suite_dict: dict = expectationSuiteSchema.loads(f.read())
    my_expectation_suite: ExpectationSuite = ExpectationSuite(
        **my_expectation_suite_dict, data_context=context
    )

    # Pin uuid generation so run identifiers are deterministic.
    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
        my_df.set_default_expectation_argument("result_format", "COMPLETE")
        with pytest.raises(InvalidCacheValueError):
            with pytest.warns(Warning, match=r"No great_expectations version found"):
                my_df.validate(catch_exceptions=False)
def test_BasicDatasetProfiler_on_titanic():
    """
    A snapshot test for BasicDatasetProfiler: profile the Titanic dataset and
    compare the EVRs to ones retrieved from a previously stored file.
    """
    df = ge.read_csv("./tests/test_sets/Titanic.csv")
    suite, evrs = df.profile(BasicDatasetProfiler)

    # Note: the above already produces an EVR; rerunning isn't strictly necessary just for EVRs
    evrs = df.validate(result_format="SUMMARY")  # ["results"]

    # UNCOMMENT TO REGENERATE THE SNAPSHOTS WHEN UPDATING THIS TEST:
    # with open('tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json', 'w+') as file:
    #     file.write(json.dumps(evrs))
    #
    # with open('tests/render/fixtures/BasicDatasetProfiler_evrs.json', 'w+') as file:
    #     file.write(json.dumps(evrs))

    with open(
            'tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json',
            'r') as file:
        expected_evrs = json.load(file, object_pairs_hook=OrderedDict)

    # Metadata varies per run/version; compare the results only.
    expected_evrs.pop("meta")
    evrs.pop("meta")

    # Python 2 does not guarantee value_counts order, which reorders items in the
    # partial_unexpected_value_counts list — drop them before the comparison.
    for result in evrs["results"]:
        if "partial_unexpected_counts" in result["result"]:
            result["result"].pop("partial_unexpected_counts")
    for result in expected_evrs["results"]:
        if "partial_unexpected_counts" in result["result"]:
            result["result"].pop("partial_unexpected_counts")

    assertDeepAlmostEqual(expected_evrs, evrs)
def validate_csv_using_greatexpectations(
    csv_path: InputPath(),
    expectation_suite_path: InputPath(),
    data_doc_path: OutputPath(),
):
    """Validate a CSV dataset against a Great Expectations suite and create Data Doc
    (a validation report). This component fails if validation is not successful.

    Annotations:
        authors: Yaroslav Beshta <*****@*****.**>, Anton Kiselev <*****@*****.**>

    Args:
        csv_path: Path to the CSV file with the dataset.
        expectation_suite_path: Path to Great Expectations expectation suite (in JSON format)
    """
    # Imports are local because this is a self-contained pipeline component body.
    import json
    import os
    import sys

    import great_expectations as ge
    from great_expectations.render import DefaultJinjaPageView
    from great_expectations.render.renderer import ValidationResultsPageRenderer

    with open(expectation_suite_path, 'r') as json_file:
        expectation_suite = json.load(json_file)

    dataset = ge.read_csv(csv_path, expectation_suite=expectation_suite)
    result = dataset.validate()

    # Render the validation results into an HTML Data Doc.
    document_model = ValidationResultsPageRenderer().render(result)
    os.makedirs(os.path.dirname(data_doc_path), exist_ok=True)
    with open(data_doc_path, 'w') as writer:
        writer.write(DefaultJinjaPageView().render(document_model))
    print(f'Saved: {data_doc_path}')

    # Surface validation failure as a non-zero exit so the pipeline step fails.
    if not result.success:
        sys.exit(1)
def validate(parsed_args):
    """
    Read a dataset file and validate it using a config saved in another file,
    printing the validation results as JSON.

    :param parsed_args: A Namespace object containing parsed arguments from the dispatch method.
    :return: None (results are printed to stdout).
    """
    parsed_args = vars(parsed_args)
    data_set = parsed_args['dataset']
    expectations_config_file = parsed_args['expectations_config_file']

    # Fixed: open the JSON files with context managers so the handles are closed
    # promptly (previously json.load(open(...)) leaked the file objects).
    with open(expectations_config_file) as f:
        expectations_config = json.load(f)

    if parsed_args["evaluation_parameters"] is not None:
        with open(parsed_args["evaluation_parameters"]) as f:
            evaluation_parameters = json.load(f)
    else:
        evaluation_parameters = None

    if parsed_args["custom_dataset_module"]:
        # Make the custom module importable, then resolve the requested dataset class.
        sys.path.insert(0, os.path.dirname(parsed_args["custom_dataset_module"]))
        module_name = os.path.basename(
            parsed_args["custom_dataset_module"]).split('.')[0]
        custom_module = __import__(module_name)
        dataset_class = getattr(custom_module, parsed_args["custom_dataset_class"])
    else:
        dataset_class = PandasDataset

    df = read_csv(data_set, expectations_config=expectations_config,
                  dataset_class=dataset_class)
    result = df.validate(
        evaluation_parameters=evaluation_parameters,
        result_format=parsed_args["result_format"],
        catch_exceptions=parsed_args["catch_exceptions"],
        only_return_failures=parsed_args["only_return_failures"],
    )
    print(json.dumps(result, indent=2))