def _profile(cls, dataset, configuration=None):
    """Build an expectation suite by profiling ``dataset``.

    Args:
        dataset: the dataset/batch to profile; expectations are created on it
            as a side effect and it is mutated throughout.
        configuration: optional dict controlling what is profiled. Recognized
            keys (each pair is mutually exclusive):
            ``included_expectations``/``excluded_expectations`` and
            ``included_columns``/``excluded_columns``. The literal string
            ``"demo"`` short-circuits to a canned demo suite.

    Returns:
        The expectation suite built from the profiled dataset.

    Raises:
        ProfilerError: if both halves of a mutually exclusive configuration
            pair are supplied.
    """
    logger.debug(f"Running profiler with configuration: {configuration}")
    if configuration == "demo":
        return cls._demo_profile(dataset)

    existing_columns = dataset.get_table_columns()
    # Default: profile every column with every expectation type.
    selected_columns = existing_columns
    included_expectations = []
    excluded_expectations = []

    if configuration:
        if (
            "included_expectations" in configuration
            and "excluded_expectations" in configuration
        ):
            raise ProfilerError(
                "Please specify either `included_expectations` or `excluded_expectations`."
            )
        if "included_expectations" in configuration:
            included_expectations = configuration["included_expectations"]
            # Normalize "no expectations requested" (False/None/[]) to None so
            # the early-exit branch below can detect it.
            if included_expectations in [False, None, []]:
                included_expectations = None
            _check_that_expectations_are_available(dataset, included_expectations)
        if "excluded_expectations" in configuration:
            excluded_expectations = configuration["excluded_expectations"]
            if excluded_expectations in [False, None, []]:
                excluded_expectations = None
            _check_that_expectations_are_available(dataset, excluded_expectations)

        if (
            "included_columns" in configuration
            and "excluded_columns" in configuration
        ):
            raise ProfilerError(
                "Please specify either `excluded_columns` or `included_columns`."
            )
        elif "included_columns" in configuration:
            selected_columns = configuration["included_columns"]
            # Falsy values mean "profile no columns".
            if selected_columns in [False, None, []]:
                selected_columns = []
        elif "excluded_columns" in configuration:
            excluded_columns = configuration["excluded_columns"]
            # Falsy values mean "exclude nothing".
            if excluded_columns in [False, None, []]:
                excluded_columns = []
            # NOTE(review): set difference makes the column iteration order in
            # the profiling loop below nondeterministic — confirm acceptable.
            selected_columns = set(existing_columns) - set(excluded_columns)

    _check_that_columns_exist(dataset, selected_columns)
    if included_expectations is None:
        # Caller explicitly requested no expectations: return only the
        # column-description metadata with an empty expectation list.
        suite = cls._build_column_description_metadata(dataset)
        # remove column exist expectations
        suite.expectations = []
        return suite

    dataset.set_default_expectation_argument("catch_exceptions", False)
    dataset = cls._build_table_row_count_expectation(
        dataset,
        excluded_expectations=excluded_expectations,
        included_expectations=included_expectations,
    )
    dataset.set_config_value("interactive_evaluation", True)
    dataset = cls._build_table_column_expectations(
        dataset,
        excluded_expectations=excluded_expectations,
        included_expectations=included_expectations,
    )

    # Cache per-column cardinality/type lookups so each is computed only once.
    column_cache = {}
    if selected_columns:
        with tqdm(
            total=len(selected_columns), desc="Profiling Columns", delay=5
        ) as pbar:
            for column in selected_columns:
                pbar.set_postfix_str(column)
                cardinality = cls._get_column_cardinality_with_caching(
                    dataset, column, column_cache
                )
                column_type = cls._get_column_type_with_caching(
                    dataset, column, column_cache
                )
                if cardinality in [
                    ProfilerCardinality.TWO,
                    ProfilerCardinality.VERY_FEW,
                    ProfilerCardinality.FEW,
                ]:
                    cls._create_expectations_for_low_card_column(
                        dataset, column, column_cache
                    )
                elif cardinality in [
                    ProfilerCardinality.MANY,
                    ProfilerCardinality.VERY_MANY,
                    ProfilerCardinality.UNIQUE,
                ]:
                    # TODO we will want to finesse the number and types of
                    # expectations created here. The simple version is deny/allow list
                    # and the more complex version is desired per column type and
                    # cardinality. This deserves more thought on configuration.
                    dataset.expect_column_values_to_be_unique(column)

                    if column_type in [
                        ProfilerDataType.INT,
                        ProfilerDataType.FLOAT,
                    ]:
                        cls._create_expectations_for_numeric_column(dataset, column)
                    elif column_type in [ProfilerDataType.DATETIME]:
                        cls._create_expectations_for_datetime_column(
                            dataset,
                            column,
                            excluded_expectations=excluded_expectations,
                            included_expectations=included_expectations,
                        )
                    elif column_type in [ProfilerDataType.STRING]:
                        cls._create_expectations_for_string_column(
                            dataset,
                            column,
                            excluded_expectations=excluded_expectations,
                            included_expectations=included_expectations,
                        )
                    elif column_type in [ProfilerDataType.UNKNOWN]:
                        logger.debug(
                            f"Skipping expectation creation for column {column} of unknown type: {column_type}"
                        )
                pbar.update()

    if excluded_expectations:
        # NOTE: we reach into a private member here because of an expected future
        # refactor that will make the suite directly accessible
        dataset._expectation_suite.remove_all_expectations_of_type(
            excluded_expectations
        )
    if included_expectations:
        # Drop every expectation whose type the allow-list does not mention.
        for expectation in dataset.get_expectation_suite(
            discard_failed_expectations=False,
            suppress_logging=True,
        ).expectations:
            if expectation.expectation_type not in included_expectations:
                try:
                    dataset.remove_expectation(
                        ExpectationConfiguration(
                            expectation_type=expectation.expectation_type,
                            kwargs=expectation.kwargs,
                        ),
                        match_type="domain",
                        remove_multiple_matches=True,
                    )
                except ValueError:
                    logger.debug(
                        f"Attempted to remove {expectation}, which was not found."
                    )

    expectation_suite = cls._build_column_description_metadata(dataset)

    return expectation_suite
def _profile(cls, dataset, configuration=None):
    """Build an expectation suite by profiling ``dataset``.

    Args:
        dataset: the dataset/batch to profile; expectations are created on it
            as a side effect and it is mutated throughout.
        configuration: optional dict with mutually exclusive key pairs
            ``included_expectations``/``excluded_expectations`` and
            ``included_columns``/``excluded_columns``. The literal string
            ``"demo"`` short-circuits to a canned demo suite.

    Returns:
        The expectation suite built from the profiled dataset.

    Raises:
        ProfilerError: if both halves of a mutually exclusive configuration
            pair are supplied.
    """
    logger.debug(f"Running profiler with configuration: {configuration}")
    if configuration == "demo":
        return cls._demo_profile(dataset)

    existing_columns = dataset.get_table_columns()
    # Default: profile every column with every expectation type.
    selected_columns = existing_columns
    included_expectations = []
    excluded_expectations = []

    if configuration:
        if (
            "included_expectations" in configuration
            and "excluded_expectations" in configuration
        ):
            raise ProfilerError(
                "Please specify either `included_expectations` or `excluded_expectations`."
            )
        if "included_expectations" in configuration:
            included_expectations = configuration["included_expectations"]
            # Normalize "no expectations requested" (False/None/[]) to None so
            # the early-exit branch below can detect it.
            if included_expectations in [False, None, []]:
                included_expectations = None
            _check_that_expectations_are_available(dataset, included_expectations)
        if "excluded_expectations" in configuration:
            excluded_expectations = configuration["excluded_expectations"]
            # NOTE(review): unlike included_expectations above, this value is
            # not normalized for False/None/[] — confirm intentional.
            _check_that_expectations_are_available(dataset, excluded_expectations)

        if "included_columns" in configuration and "excluded_columns" in configuration:
            raise ProfilerError(
                "Please specify either `excluded_columns` or `included_columns`."
            )
        elif "included_columns" in configuration:
            selected_columns = configuration["included_columns"]
        elif "excluded_columns" in configuration:
            excluded_columns = configuration["excluded_columns"]
            # Falsy values mean "exclude nothing".
            if excluded_columns in [False, None, []]:
                excluded_columns = []
            # NOTE(review): set difference makes the column iteration order in
            # the profiling loop below nondeterministic — confirm acceptable.
            selected_columns = set(existing_columns) - set(excluded_columns)

    _check_that_columns_exist(dataset, selected_columns)
    if included_expectations is None:
        # Caller explicitly requested no expectations: return only the
        # column-description metadata with an empty expectation list.
        suite = cls._build_column_description_metadata(dataset)
        # remove column exist expectations
        suite.expectations = []
        return suite

    dataset.set_default_expectation_argument("catch_exceptions", False)
    dataset = cls._build_table_row_count_expectation(dataset, tolerance=0.1)
    dataset.set_config_value("interactive_evaluation", True)
    dataset = cls._build_table_column_expectations(dataset)

    # Cache per-column cardinality/type lookups so each is computed only once.
    column_cache = {}
    if selected_columns:
        for column in selected_columns:
            cardinality = cls._get_column_cardinality_with_caching(
                dataset, column, column_cache
            )
            column_type = cls._get_column_type_with_caching(
                dataset, column, column_cache
            )
            if cardinality in [
                ProfilerCardinality.TWO,
                ProfilerCardinality.VERY_FEW,
                ProfilerCardinality.FEW,
            ]:
                cls._create_expectations_for_low_card_column(
                    dataset, column, column_cache
                )
            elif cardinality in [
                ProfilerCardinality.MANY,
                ProfilerCardinality.VERY_MANY,
                ProfilerCardinality.UNIQUE,
            ]:
                # TODO we will want to finesse the number and types of
                # expectations created here. The simple version is blacklisting
                # and the more complex version is desired per column type and
                # cardinality. This deserves more thought on configuration.
                dataset.expect_column_values_to_be_unique(column)

                if column_type in [ProfilerDataType.INT, ProfilerDataType.FLOAT]:
                    cls._create_expectations_for_numeric_column(dataset, column)
                elif column_type in [ProfilerDataType.DATETIME]:
                    cls._create_expectations_for_datetime_column(dataset, column)
                elif column_type in [ProfilerDataType.STRING]:
                    cls._create_expectations_for_string_column(dataset, column)
                elif column_type in [ProfilerDataType.UNKNOWN]:
                    logger.debug(
                        f"Skipping expectation creation for column {column} of unknown type: {column_type}"
                    )

    if excluded_expectations:
        dataset = _remove_table_expectations(dataset, excluded_expectations)
        dataset = _remove_column_expectations(dataset, excluded_expectations)
    if included_expectations:
        # Drop every expectation whose type the allow-list does not mention.
        for expectation in dataset.get_expectation_suite().expectations:
            if expectation.expectation_type not in included_expectations:
                try:
                    dataset.remove_expectation(
                        expectation_type=expectation.expectation_type,
                        expectation_kwargs=expectation.kwargs,
                        column=expectation.kwargs.get("column", None),
                        remove_multiple_matches=True,
                    )
                except ValueError:
                    logger.debug(
                        f"Attempted to remove {expectation}, which was not found."
                    )

    expectation_suite = cls._build_column_description_metadata(dataset)

    return expectation_suite
def _create_expectations_for_numeric_column(
    cls, dataset, column, excluded_expectations=None, included_expectations=None
) -> None:
    """Profile a numeric column, adding value-range expectations to ``dataset``.

    For each of min/max/mean/median — when permitted by the deny/allow lists —
    the statistic is first observed with unbounded arguments, then a +/-1
    expectation is pinned around the observed value; NaN observations are
    skipped with a debug log. Finishes with a quantile expectation built from
    the observed quantiles. All expectations are recorded on ``dataset`` as a
    side effect.

    Args:
        dataset: dataset/batch being profiled; mutated in place.
        column: name of the numeric column to profile.
        excluded_expectations: optional deny-list of expectation names.
        included_expectations: optional allow-list of expectation names.
    """
    cls._create_non_nullity_expectations(
        dataset,
        column,
        excluded_expectations=excluded_expectations,
        included_expectations=included_expectations,
    )

    # Each block below runs only if its expectation is not denied and, when an
    # allow-list exists, is explicitly allowed.
    if (
        not excluded_expectations
        or "expect_column_min_to_be_between" not in excluded_expectations
    ) and (
        not included_expectations
        or "expect_column_min_to_be_between" in included_expectations
    ):
        observed_min = dataset.expect_column_min_to_be_between(
            column, min_value=None, max_value=None, result_format="SUMMARY"
        ).result["observed_value"]
        if not is_nan(observed_min):
            dataset.expect_column_min_to_be_between(
                column, min_value=observed_min - 1, max_value=observed_min + 1
            )
        else:
            logger.debug(
                f"Skipping expect_column_min_to_be_between because observed value is nan: {observed_min}"
            )

    if (
        not excluded_expectations
        or "expect_column_max_to_be_between" not in excluded_expectations
    ) and (
        not included_expectations
        or "expect_column_max_to_be_between" in included_expectations
    ):
        observed_max = dataset.expect_column_max_to_be_between(
            column, min_value=None, max_value=None, result_format="SUMMARY"
        ).result["observed_value"]
        if not is_nan(observed_max):
            dataset.expect_column_max_to_be_between(
                column, min_value=observed_max - 1, max_value=observed_max + 1
            )
        else:
            logger.debug(
                f"Skipping expect_column_max_to_be_between because observed value is nan: {observed_max}"
            )

    if (
        not excluded_expectations
        or "expect_column_mean_to_be_between" not in excluded_expectations
    ) and (
        not included_expectations
        or "expect_column_mean_to_be_between" in included_expectations
    ):
        observed_mean = dataset.expect_column_mean_to_be_between(
            column, min_value=None, max_value=None, result_format="SUMMARY"
        ).result["observed_value"]
        if not is_nan(observed_mean):
            dataset.expect_column_mean_to_be_between(
                column, min_value=observed_mean - 1, max_value=observed_mean + 1
            )
        else:
            logger.debug(
                f"Skipping expect_column_mean_to_be_between because observed value is nan: {observed_mean}"
            )

    if (
        not excluded_expectations
        or "expect_column_median_to_be_between" not in excluded_expectations
    ) and (
        not included_expectations
        or "expect_column_median_to_be_between" in included_expectations
    ):
        observed_median = dataset.expect_column_median_to_be_between(
            column, min_value=None, max_value=None, result_format="SUMMARY"
        ).result["observed_value"]
        if not is_nan(observed_median):
            dataset.expect_column_median_to_be_between(
                column, min_value=observed_median - 1, max_value=observed_median + 1
            )
        else:
            logger.debug(
                f"Skipping expect_column_median_to_be_between because observed value is nan: {observed_median}"
            )

    # Presumably indicates whether the backend supports approximate
    # (relative-error) quantile computation — confirm against the Dataset API.
    allow_relative_error: bool = dataset.attempt_allowing_relative_error()
    if (
        not excluded_expectations
        or "expect_column_quantile_values_to_be_between" not in excluded_expectations
    ) and (
        not included_expectations
        or "expect_column_quantile_values_to_be_between" in included_expectations
    ):
        # First pass: observe the quantile values with unbounded ranges.
        quantile_result = dataset.expect_column_quantile_values_to_be_between(
            column,
            quantile_ranges={
                "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                "value_ranges": [
                    [None, None],
                    [None, None],
                    [None, None],
                    [None, None],
                    [None, None],
                ],
            },
            allow_relative_error=allow_relative_error,
            result_format="SUMMARY",
            catch_exceptions=True,
        )
        if quantile_result.exception_info and (
            quantile_result.exception_info["exception_traceback"]
            or quantile_result.exception_info["exception_message"]
        ):
            # TODO quantiles are not implemented correctly on sqlite, and likely other sql dialects
            logger.debug(quantile_result.exception_info["exception_traceback"])
            logger.debug(quantile_result.exception_info["exception_message"])
        else:
            # Second pass: re-issue the expectation with concrete +/-1 ranges
            # around the observed quantiles. Interactive evaluation is toggled
            # off here, presumably so the expectation is recorded without being
            # re-executed — confirm.
            dataset.set_config_value("interactive_evaluation", False)
            dataset.expect_column_quantile_values_to_be_between(
                column,
                quantile_ranges={
                    "quantiles": quantile_result.result["observed_value"]["quantiles"],
                    "value_ranges": [
                        [v - 1, v + 1]
                        for v in quantile_result.result["observed_value"]["values"]
                    ],
                },
                allow_relative_error=allow_relative_error,
                catch_exceptions=True,
            )
            dataset.set_config_value("interactive_evaluation", True)
def _create_expectations_for_numeric_column(cls, dataset, column):
    """Profile a numeric column, adding value-range expectations to ``dataset``.

    For each of min/max/mean/median the statistic is first observed with
    unbounded arguments, then a +/-1 expectation is pinned around the observed
    value. Finishes with a quantile expectation built from the observed
    quantiles. All expectations are recorded on ``dataset`` as a side effect.

    Fixes over the previous revision:
    - NaN observations (e.g. from an all-null column) no longer produce
      meaningless ``nan - 1`` / ``nan + 1`` bounds; they are skipped with a
      debug log instead.
    - The ValidationResult returned by the bounded min/max calls is no longer
      rebound to ``value`` (it was dead and confusing).

    Args:
        dataset: dataset/batch being profiled; mutated in place.
        column: name of the numeric column to profile.
    """

    def _is_nan(value):
        # NaN is the only value not equal to itself; this avoids math.isnan(),
        # which raises TypeError for non-float observed values.
        return value != value

    cls._create_non_nullity_expectations(dataset, column)

    observed_min = dataset.expect_column_min_to_be_between(
        column, min_value=None, max_value=None, result_format="SUMMARY"
    ).result["observed_value"]
    if not _is_nan(observed_min):
        dataset.expect_column_min_to_be_between(
            column, min_value=observed_min - 1, max_value=observed_min + 1
        )
    else:
        logger.debug(
            f"Skipping expect_column_min_to_be_between because observed value is nan: {observed_min}"
        )

    observed_max = dataset.expect_column_max_to_be_between(
        column, min_value=None, max_value=None, result_format="SUMMARY"
    ).result["observed_value"]
    if not _is_nan(observed_max):
        dataset.expect_column_max_to_be_between(
            column, min_value=observed_max - 1, max_value=observed_max + 1
        )
    else:
        logger.debug(
            f"Skipping expect_column_max_to_be_between because observed value is nan: {observed_max}"
        )

    observed_mean = dataset.expect_column_mean_to_be_between(
        column, min_value=None, max_value=None, result_format="SUMMARY"
    ).result["observed_value"]
    if not _is_nan(observed_mean):
        dataset.expect_column_mean_to_be_between(
            column, min_value=observed_mean - 1, max_value=observed_mean + 1
        )
    else:
        logger.debug(
            f"Skipping expect_column_mean_to_be_between because observed value is nan: {observed_mean}"
        )

    observed_median = dataset.expect_column_median_to_be_between(
        column, min_value=None, max_value=None, result_format="SUMMARY"
    ).result["observed_value"]
    if not _is_nan(observed_median):
        dataset.expect_column_median_to_be_between(
            column, min_value=observed_median - 1, max_value=observed_median + 1
        )
    else:
        logger.debug(
            f"Skipping expect_column_median_to_be_between because observed value is nan: {observed_median}"
        )

    # First pass: observe the quantile values with unbounded ranges.
    result = dataset.expect_column_quantile_values_to_be_between(
        column,
        quantile_ranges={
            "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
            "value_ranges": [
                [None, None],
                [None, None],
                [None, None],
                [None, None],
                [None, None],
            ],
        },
        result_format="SUMMARY",
        catch_exceptions=True,
    )
    if result.exception_info and (
        result.exception_info["exception_traceback"]
        or result.exception_info["exception_message"]
    ):
        # TODO quantiles are not implemented correctly on sqlite, and likely other sql dialects
        logger.debug(result.exception_info["exception_traceback"])
        logger.debug(result.exception_info["exception_message"])
    else:
        # Second pass: re-issue the expectation with concrete +/-1 ranges
        # around the observed quantiles. Interactive evaluation is toggled off
        # here, presumably so the expectation is recorded without being
        # re-executed — confirm.
        dataset.set_config_value("interactive_evaluation", False)
        dataset.expect_column_quantile_values_to_be_between(
            column,
            quantile_ranges={
                "quantiles": result.result["observed_value"]["quantiles"],
                "value_ranges": [
                    [v - 1, v + 1]
                    for v in result.result["observed_value"]["values"]
                ],
            },
            catch_exceptions=True,
        )
        dataset.set_config_value("interactive_evaluation", True)