def test_BasicDatasetProfiler_null_column():
    """
    The profiler should determine that null columns are of null cardinality
    and of null type, and should not generate expectations specific to type
    and cardinality categories. We verify this by running the basic profiler
    on a Pandas dataset with an all-null column and asserting the number of
    successful results for that column.
    """
    toy_dataset = PandasDataset({"x": [1, 2, 3], "y": [None, None, None]})
    suite = toy_dataset.get_expectation_suite(suppress_warnings=True)
    assert len(suite.expectations) == 0

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    def _successes_for(column):
        # Count successful validation results scoped to one column.
        return sum(
            1
            for result in evr_config["results"]
            if result.expectation_config["kwargs"].get("column") == column
            and result.success
        )

    # TODO: assert set - specific expectations
    assert _successes_for("y") == 4
    # The all-null column must succeed on fewer expectations than the
    # populated column.
    assert _successes_for("y") < _successes_for("x")
def test_BasicDatasetProfiler_null_column():
    """
    The profiler should determine that null columns are of null cardinality
    and of null type, and should not generate expectations specific to type
    and cardinality categories. We verify this by running the basic profiler
    on a Pandas dataset with an all-null column and asserting the number of
    successful results for that column.
    """
    toy_dataset = PandasDataset(
        {"x": [1, 2, 3], "y": [None, None, None]},
        data_asset_name="toy_dataset",
    )
    suite = toy_dataset.get_expectation_suite(suppress_warnings=True)
    assert len(suite["expectations"]) == 0

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    def _successful_results(column):
        # Successful validation results scoped to a single column.
        return [
            result
            for result in evr_config["results"]
            if result["expectation_config"]["kwargs"].get("column") == column
            and result["success"]
        ]

    # TODO: assert set - specific expectations
    assert len(_successful_results("y")) == 4
    # The all-null column must succeed on fewer expectations than the
    # populated column.
    assert len(_successful_results("y")) < len(_successful_results("x"))
def filesystem_csv_2(tmp_path_factory):
    """Create a temp directory seeded with a single CSV file and return its path."""
    base_dir = str(tmp_path_factory.mktemp("test_files"))

    # Put one small file in the directory so generators have something to find.
    PandasDataset({"x": [1, 2, 3]}).to_csv(
        os.path.join(base_dir, "f1.csv"), index=None
    )

    return base_dir
def test_BasicDatasetProfiler(mock_emit):
    """Profile a trivial dataset and verify the generated suite's metadata,
    notes, and expectation types, and that no usage-statistics events fire."""
    toy_dataset = PandasDataset({"x": [1, 2, 3]})
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True).expectations)
        == 0
    )

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    # Profiling must have populated the suite.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True).expectations)
        > 0
    )

    meta = expectations_config.meta
    assert "BasicDatasetProfiler" in meta
    assert set(meta["BasicDatasetProfiler"].keys()) == {
        "created_by",
        "created_at",
        "batch_kwargs",
    }

    assert "notes" in meta
    assert set(meta["notes"].keys()) == {"format", "content"}
    assert "To add additional notes" in meta["notes"]["content"][0]

    added_expectations = set()
    for exp in expectations_config.expectations:
        added_expectations.add(exp.expectation_type)
        # Every expectation the profiler adds is tagged with its confidence.
        assert "BasicDatasetProfiler" in exp.meta
        assert "confidence" in exp.meta["BasicDatasetProfiler"]

    expected_expectations = {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_ordered_list",
        "expect_column_values_to_be_in_set",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_type_list",
        "expect_column_values_to_be_unique",
    }
    assert expected_expectations.issubset(added_expectations)

    # Note 20211209 - Currently the only method called by the Profiler that is
    # instrumented for usage_statistics is ExpectationSuite's add_expectation().
    # It will not send a usage_stats event when called from a Profiler.
    # This number can change in the future if our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
def test_DataSetProfiler_methods():
    """DatasetProfiler.validate accepts datasets and rejects other inputs,
    and the abstract profile() must raise NotImplementedError."""
    toy_dataset = PandasDataset({"x": [1, 2, 3]})

    # Fix: avoid `== False` equality comparison (flake8 E712); a truthiness
    # check is the idiomatic and more robust assertion here.
    assert not DatasetProfiler.validate(1)
    assert DatasetProfiler.validate(toy_dataset)

    # The base class does not implement profiling itself.
    with pytest.raises(NotImplementedError):
        DatasetProfiler.profile(toy_dataset)
def test_BasicDatasetProfiler():
    """Profile a one-column dataset and verify the generated suite's
    asset name, metadata, notes, and expectation types."""
    toy_dataset = PandasDataset({"x": [1, 2, 3]}, data_asset_name="toy_dataset")
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True)["expectations"])
        == 0
    )

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    # Profiling must have populated the suite.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True)["expectations"])
        > 0
    )
    assert expectations_config["data_asset_name"] == "toy_dataset"

    meta = expectations_config["meta"]
    assert "BasicDatasetProfiler" in meta
    assert set(meta["BasicDatasetProfiler"].keys()) == {"created_by", "created_at"}

    assert "notes" in meta
    assert set(meta["notes"].keys()) == {"format", "content"}
    assert "To add additional notes" in meta["notes"]["content"][0]

    added_expectations = set()
    for exp in expectations_config["expectations"]:
        added_expectations.add(exp["expectation_type"])
        # Every expectation the profiler adds is tagged with its confidence.
        assert "BasicDatasetProfiler" in exp["meta"]
        assert "confidence" in exp["meta"]["BasicDatasetProfiler"]

    expected_expectations = {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_ordered_list",
        "expect_column_values_to_be_in_set",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_type_list",
        "expect_column_values_to_be_unique",
    }
    assert expected_expectations.issubset(added_expectations)
def test_ColumnsExistProfiler():
    """The ColumnsExistProfiler should emit exactly one expect_column_to_exist
    expectation for the single column in the dataset."""
    toy_dataset = PandasDataset({"x": [1, 2, 3]})

    expectations_config, evr_config = ColumnsExistProfiler.profile(toy_dataset)

    expectations = expectations_config["expectations"]
    assert len(expectations) == 1

    only_expectation = expectations[0]
    assert only_expectation["expectation_type"] == "expect_column_to_exist"
    assert only_expectation["kwargs"]["column"] == "x"
def test_expectation_suite_extract_false_many_results(test_df):
    """get_failed_validation_results should return a suite-level result
    containing only the expectations that failed."""
    test_ds = PandasDataset(data=test_df)

    # Three expectations; per the assertions below, two of them are expected
    # to fail against test_df (schema defined by the fixture — see conftest).
    test_ds.expect_column_values_to_be_of_type("col_1", "boolean")
    test_ds.expect_column_values_to_be_of_type("col_2", "object")
    test_ds.expect_column_values_to_be_null("col_1")

    result = test_ds.validate()
    failed_results = result.get_failed_validation_results()

    assert isinstance(failed_results, ExpectationSuiteValidationResult)
    assert failed_results.statistics["evaluated_expectations"] == 2
    assert result.statistics["evaluated_expectations"] == 3
def test_BasicDatasetProfiler():
    """Profile a trivial dataset and verify the generated suite's metadata,
    notes, and expectation types."""
    toy_dataset = PandasDataset({"x": [1, 2, 3]})

    suite_before = toy_dataset.get_expectation_suite(suppress_warnings=True)
    assert len(suite_before.expectations) == 0

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    suite_after = toy_dataset.get_expectation_suite(suppress_warnings=True)
    # Profiling must have populated the suite.
    assert len(suite_after.expectations) > 0

    meta = expectations_config.meta
    assert "BasicDatasetProfiler" in meta
    assert set(meta["BasicDatasetProfiler"].keys()) == {
        "created_by",
        "created_at",
        "batch_kwargs",
    }

    assert "notes" in meta
    assert set(meta["notes"].keys()) == {"format", "content"}
    assert "To add additional notes" in meta["notes"]["content"][0]

    added_expectations = set()
    for exp in expectations_config.expectations:
        added_expectations.add(exp.expectation_type)
        # Every expectation the profiler adds is tagged with its confidence.
        assert "BasicDatasetProfiler" in exp.meta
        assert "confidence" in exp.meta["BasicDatasetProfiler"]

    expected_expectations = {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_ordered_list",
        "expect_column_values_to_be_in_set",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_type_list",
        "expect_column_values_to_be_unique",
    }
    assert expected_expectations.issubset(added_expectations)
def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
    """Build a PandasDataset batch from ``batch_kwargs``.

    Two modes are supported:
      * file mode — ``batch_kwargs["path"]`` is read with a pandas reader,
        chosen from ``batch_kwargs["reader_method"]`` or guessed from the
        file extension when absent;
      * in-memory mode — ``batch_kwargs["df"]`` supplies the dataframe
        (or series) directly.

    Raises:
        BatchKwargsError: if neither ``path`` nor ``df`` is provided, or the
            reader method cannot be determined, is unknown, or is unsupported.
    """
    # NOTE(review): this mutates the caller's batch_kwargs dict in place
    # (update below, and the pop of "df" in the in-memory branch).
    batch_kwargs.update(kwargs)
    if "path" in batch_kwargs:
        # All remaining batch_kwargs entries are forwarded to the pandas
        # reader, so strip the keys that are not reader options.
        reader_options = batch_kwargs.copy()
        path = reader_options.pop(
            "path")  # We need to remove from the reader
        reader_options.pop("timestamp", "")  # ditto timestamp (but missing ok)
        reader_method = reader_options.pop("reader_method", None)
        if reader_method is None:
            # No explicit reader method: infer one from the file extension.
            reader_method = self._guess_reader_method_from_path(path)
            if reader_method is None:
                raise BatchKwargsError(
                    "Unable to determine reader for path: %s" % path,
                    batch_kwargs)
        else:
            # Explicit reader method names must map onto the ReaderMethods enum.
            try:
                reader_method = ReaderMethods[reader_method]
            except KeyError:
                raise BatchKwargsError(
                    "Unknown reader method: %s" % reader_method, batch_kwargs)

        # Dispatch on the resolved reader enum member.
        if reader_method == ReaderMethods.CSV:
            df = pd.read_csv(path, **reader_options)
        elif reader_method == ReaderMethods.parquet:
            df = pd.read_parquet(path, **reader_options)
        elif reader_method == ReaderMethods.excel:
            df = pd.read_excel(path, **reader_options)
        elif reader_method == ReaderMethods.JSON:
            df = pd.read_json(path, **reader_options)
        else:
            raise BatchKwargsError(
                "Unsupported reader: %s" % reader_method.name, batch_kwargs)
    elif "df" in batch_kwargs and isinstance(batch_kwargs["df"],
                                             (pd.DataFrame, pd.Series)):
        df = batch_kwargs.pop(
            "df")  # We don't want to store the actual dataframe in kwargs
        # Flag that this batch originated from an in-memory dataframe rather
        # than a file on disk.
        batch_kwargs["PandasInMemoryDF"] = True
    else:
        raise BatchKwargsError(
            "Invalid batch_kwargs: path or df is required for a PandasDatasource",
            batch_kwargs)

    return PandasDataset(df,
                         expectation_suite=expectation_suite,
                         data_context=self._data_context,
                         batch_kwargs=batch_kwargs)
# Sizing and range parameters for the sample datasets below.
N_ROWS = 1_000_000
COL_NAME = "x"
N = 10

INT_MIN = 0.0
INT_MAX = 10_000
FLOAT_MIN = 0.0
FLOAT_MAX = 10_000.0
DT_MIN = datetime(2020, 1, 1)
DT_MAX = datetime(2021, 1, 1)

# One single-column dataset per value kind: integers, floats, and ISO-format
# datetime strings (the datetime set is 100x smaller).
dataset_int = PandasDataset(
    {COL_NAME: pd.Series(np.random.randint(low=INT_MIN, high=INT_MAX, size=N_ROWS))}
)

dataset_float = PandasDataset(
    {COL_NAME: pd.Series(np.random.uniform(low=FLOAT_MIN, high=FLOAT_MAX, size=N_ROWS))}
)

dataset_dt = PandasDataset(
    {
        COL_NAME: tuple(
            datetime.fromtimestamp(stamp).isoformat()
            for stamp in np.random.randint(
                low=DT_MIN.timestamp(),
                high=DT_MAX.timestamp(),
                size=N_ROWS // 100,
            )
        )
    }
)