def test_warning_tf_multiple_dp_with_update(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
    })
    print('running dp1')
    profile1 = dp.Profiler(data, profiler_options=profile_options)

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
    })
    print('running dp2')
    profile2 = dp.Profiler(data, profiler_options=profile_options)
    profile1.update_profile(data)
def test_odd_merge_profile_list(self, mock_data_labeler, *mocks):
    """
    Tests merge_profile_list, a top-level function which takes in a list of
    profile objects, merges all the profiles together into one profile, and
    returns the single merged profile as the return value.

    The labeler object is removed prior to merge and added back to the
    single merged profile object.
    """
    self._setup_data_labeler_mock(mock_data_labeler)

    data = pd.DataFrame([1, 2, 3, 4, 5, 60, 1])
    profile_one = dp.Profiler(data[:2])
    profile_two = dp.Profiler(data[2:])
    profile_three = dp.Profiler(data[2:])

    list_of_profiles = [profile_one, profile_two, profile_three]
    single_profile = utils.merge_profile_list(
        list_of_profiles=list_of_profiles)
    single_report = single_profile.report()

    self.assertEqual(1, len(single_report["data_stats"]))
    self.assertEqual(1, single_report["global_stats"]["column_count"])
    self.assertEqual(12, single_report["global_stats"]["row_count"])

    self.assertEqual("int", single_report["data_stats"][0]["data_type"])
    self.assertEqual(
        1, single_report["data_stats"][0]["statistics"]["min"])
    self.assertEqual(
        60.0, single_report["data_stats"][0]["statistics"]["max"])
def test_sample_size_passed_to_profile(self, *mocks):
    update_mock = mocks[0]

    # data setup
    data = pd.DataFrame([0] * int(50e3))

    # option setup
    profiler_options = ProfilerOptions()
    profiler_options.structured_options.multiprocess.is_enabled = False
    profiler_options.set({'data_labeler.is_enabled': False})

    # test data size < min_sample_size = 5000 by default
    profiler = dp.Profiler(data[:1000], profiler_options=profiler_options)
    profiler._min_sample_size = 5000
    profiler._sampling_ratio = 0.2
    self.assertEqual(1000, update_mock.call_args[0][1])

    # test data size * 0.20 < min_sample_size < data size
    profiler = dp.Profiler(data[:10000], profiler_options=profiler_options)
    profiler._min_sample_size = 5000
    profiler._sampling_ratio = 0.2
    self.assertEqual(5000, update_mock.call_args[0][1])

    # test data size * 0.20 > min_sample_size
    profiler = dp.Profiler(data, profiler_options=profiler_options)
    profiler._min_sample_size = 5000
    profiler._sampling_ratio = 0.2
    self.assertEqual(10000, update_mock.call_args[0][1])
def test_null_calculation_with_differently_sampled_cols(self):
    opts = ProfilerOptions()
    opts.structured_options.multiprocess.is_enabled = False
    data = pd.DataFrame({"full": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "sparse": [1, None, 3, None, 5, None, 7, None, 9]})
    profile = dp.Profiler(data, samples_per_update=5, min_true_samples=5,
                          profiler_options=opts)
    # Rows 2, 4, 5, 6, 7 are sampled in the first column,
    # so only those rows should be considered for null calculations.
    # Within that subset, the second column is null only in rows 5 and 7,
    # so only 2 rows count toward row_has_null_count.
    self.assertEqual(0, profile.row_is_null_count)
    self.assertEqual(2, profile.row_has_null_count)
    # Accordingly, make sure the null-row ratios account for the fact that
    # only 5 total rows were sampled (5 in col 1, 9 in col 2)
    self.assertEqual(0, profile._get_row_is_null_ratio())
    self.assertEqual(0.4, profile._get_row_has_null_ratio())

    data2 = pd.DataFrame(
        {"sparse": [1, None, 3, None, 5, None, 7, None],
         "sparser": [1, None, None, None, None, None, None, 8]})
    profile2 = dp.Profiler(data2, samples_per_update=2, min_true_samples=2,
                           profiler_options=opts)
    # Rows are sampled as follows: [6, 5], [1, 4], [2, 3], [0, 7]
    # First column gets min true samples from ids 1, 4, 5, 6
    # Second column gets completely sampled (has a null in 1, 4, 5, 6)
    # Rows 1 and 5 are completely null; rows 4 and 6 are null only in col 2
    self.assertEqual(2, profile2.row_is_null_count)
    self.assertEqual(4, profile2.row_has_null_count)
    # Only 4 total rows sampled, ratio accordingly
    self.assertEqual(0.5, profile2._get_row_is_null_ratio())
    self.assertEqual(1, profile2._get_row_has_null_ratio())
def test_integrated_merge_diff_options(self):
    options = dp.ProfilerOptions()
    options.set({'data_labeler.is_enabled': False})
    data = pd.DataFrame([1, 2, 3, 4])
    profile1 = dp.Profiler(data, profiler_options=options)
    profile2 = dp.Profiler(data)
    with self.assertRaisesRegex(ValueError,
                                'Structured profilers were not setup with '
                                'the same options, hence they do not '
                                'calculate the same profiles and cannot be '
                                'added together.'):
        profile1 + profile2
def test_duplicate_column_names(self, *mocks):
    # validate works first
    valid_data = pd.DataFrame([[1, 2]], columns=['a', 'b'])
    profile = dp.Profiler(valid_data)
    self.assertIn('a', profile._profile)
    self.assertIn('b', profile._profile)

    # data has duplicate column names
    invalid_data = pd.DataFrame([[1, 2]], columns=['a', 'a'])
    with self.assertRaisesRegex(ValueError,
                                '`Profiler` does not currently support '
                                'data which contains columns with duplicate'
                                ' names.'):
        profile = dp.Profiler(invalid_data)
def test_stream_profilers(self, *mocks):
    data = pd.DataFrame([
        ['test1', 1.0],
        ['test2', None],
        ['test1', 1.0],
        [None, None],
        [None, 5.0],
        [None, 5.0],
        [None, None],
        ['test3', 7.0]])

    # check prior to update
    profiler = dp.Profiler(data[:3])
    self.assertEqual(1, profiler.row_has_null_count)
    self.assertEqual(0, profiler.row_is_null_count)
    self.assertEqual(3, profiler.total_samples)
    self.assertEqual(2, len(profiler.hashed_row_dict))

    # check after update
    profiler.update_profile(data[3:])

    self.assertIsNone(profiler.encoding)
    self.assertEqual(
        "<class 'pandas.core.frame.DataFrame'>", profiler.file_type)
    self.assertEqual(5, profiler.row_has_null_count)
    self.assertEqual(2, profiler.row_is_null_count)
    self.assertEqual(8, profiler.total_samples)
    self.assertEqual(5, len(profiler.hashed_row_dict))
def setUpClass(cls):
    test_dict = {
        '1': ['nan', 'null', None, None, ''],
        1: ['nan', 'None', 'null', None, ''],
    }
    test_dataset = pd.DataFrame(data=test_dict)
    cls.trained_schema = dp.Profiler(test_dataset, len(test_dataset))
def test_correct_total_sample_size_and_counts_and_mutability(self):
    data = [['test1', 1.0],
            ['test2', 2.0],
            ['test3', 3.0],
            [None, None],
            ['test5', 5.0],
            ['test6', 6.0],
            [None, None],
            ['test7', 7.0]]
    data = pd.DataFrame(data, columns=['NAME', 'VALUE'])
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})

    col_one_len = len(data['NAME'])
    col_two_len = len(data['VALUE'])

    # Test reloading data, ensuring immutable
    for i in range(2):

        # Profile Once
        data.index = pd.RangeIndex(0, 8)
        profile = dp.Profiler(data, profiler_options=profiler_options,
                              samples_per_update=2)

        # Profile Twice
        data.index = pd.RangeIndex(8, 16)
        profile.update_profile(data)

        # rows sampled are [5, 6], [13, 14] (0 index)
        self.assertEqual(16, profile.total_samples)
        self.assertEqual(4, profile._max_col_samples_used)
        self.assertEqual(2, profile.row_has_null_count)
        self.assertEqual(0.5, profile._get_row_has_null_ratio())
        self.assertEqual(2, profile.row_is_null_count)
        self.assertEqual(0.5, profile._get_row_is_null_ratio())
        self.assertEqual(0.4375, profile._get_unique_row_ratio())
        self.assertEqual(9, profile._get_duplicate_row_count())

        self.assertEqual(col_one_len, len(data['NAME']))
        self.assertEqual(col_two_len, len(data['VALUE']))
def test_save_and_load(self):
    datapth = "dataprofiler/tests/data/"
    test_files = ["csv/guns.csv", "csv/iris.csv"]

    def _clean_report(report):
        data_stats = report["data_stats"]
        for key in data_stats:
            stats = data_stats[key]["statistics"]
            if "histogram" in stats:
                if "bin_counts" in stats["histogram"]:
                    stats["histogram"]["bin_counts"] = \
                        stats["histogram"]["bin_counts"].tolist()
                if "bin_edges" in stats["histogram"]:
                    stats["histogram"]["bin_edges"] = \
                        stats["histogram"]["bin_edges"].tolist()
        return report

    for test_file in test_files:
        # Create Data and Profiler objects
        data = dp.Data(os.path.join(datapth, test_file))
        save_profile = dp.Profiler(data)

        # Save and Load profile with Mock IO
        with mock.patch('builtins.open') as m:
            mock_file = setup_save_mock_open(m)
            save_profile.save()

            mock_file.seek(0)
            load_profile = dp.Profiler.load("mock.pkl")

        # Check that reports are equivalent
        save_report = _clean_report(save_profile.report())
        load_report = _clean_report(load_profile.report())
        self.assertDictEqual(save_report, load_report)
def test_warning_tf_run_dp_multiple_times(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    for i in range(3):
        print('running dp =============================', i)
        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
        })

        profile = dp.Profiler(data, profiler_options=profile_options)
        results = profile.report()

        columns = []
        predictions = []
        for col in results['data_stats']:
            columns.append(col)
            predictions.append(results['data_stats'][col]['data_label'])
def _pandas(
    cls,
    execution_engine,
    metric_domain_kwargs,
    metric_value_kwargs,
    metrics,
    runtime_configuration,
):
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)

    first_profile = None
    try:
        first_profile_path = metric_value_kwargs["profile_path"]
        first_profile = dp.Profiler.load(first_profile_path)
    except FileNotFoundError:
        raise ValueError(
            "'profile_path' does not point to a valid DataProfiler stored profile."
        )

    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False
    new_profile = dp.Profiler(df, options=profiler_opts)

    # Results in diff of new_prof - first_prof
    # Values in this report indicate +/- change from old profile
    report_diff = new_profile.diff(first_profile)

    return report_diff
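# The metric above assumes a DataProfiler profile has already been serialized to
# the location given by "profile_path". A minimal sketch of producing such a
# baseline is shown below; the file names ("baseline.csv", "baseline_profile.pkl")
# are illustrative assumptions, not part of the metric.
import dataprofiler as dp
import pandas as pd

baseline_df = pd.read_csv("baseline.csv")  # hypothetical baseline dataset

opts = dp.ProfilerOptions()
opts.structured_options.multiprocess.is_enabled = False

# Profile the baseline once and persist it; the saved pickle is what the metric
# later reloads via dp.Profiler.load(profile_path).
baseline_profile = dp.Profiler(baseline_df, options=opts)
baseline_profile.save(filepath="baseline_profile.pkl")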
def test_text_data_raises_error(self):
    text_file_path = os.path.join(
        test_root_path, 'data', 'txt/sentence-10x.txt')
    with self.assertRaisesRegex(TypeError, 'Cannot provide TextData object'
                                           ' to Profiler'):
        profile = dp.Profiler(dp.Data(text_file_path))
def test_correct_rows_ingested(self):
    test_dict = {
        '1': ['nan', 'null', None, None, ''],
        1: ['nan', 'None', 'null', None, ''],
    }
    test_dataset = pd.DataFrame(data=test_dict)
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})
    trained_schema = dp.Profiler(test_dataset, len(test_dataset),
                                 profiler_options=profiler_options)

    self.assertCountEqual(['', 'nan', 'None', 'null'],
                          trained_schema.profile['1'].null_types)
    self.assertEqual(5, trained_schema.profile['1'].null_count)
    self.assertEqual({'': {4}, 'nan': {0}, 'None': {2, 3}, 'null': {1}},
                     trained_schema.profile['1'].null_types_index)
    self.assertCountEqual(['', 'nan', 'None', 'null'],
                          trained_schema.profile[1].null_types)
    self.assertEqual(5, trained_schema.profile[1].null_count)
    self.assertEqual({'': {4}, 'nan': {0}, 'None': {1, 3}, 'null': {2}},
                     trained_schema.profile[1].null_types_index)
def test_min_col_samples_used(self, *mocks):
    # No cols sampled since no cols to sample
    empty_df = pd.DataFrame([])
    empty_profile = dp.Profiler(empty_df)
    self.assertEqual(0, empty_profile._min_col_samples_used)

    # Every column fully sampled
    full_df = pd.DataFrame([[1, 2, 3],
                            [4, 5, 6],
                            [7, 8, 9]])
    full_profile = dp.Profiler(full_df)
    self.assertEqual(3, full_profile._min_col_samples_used)

    # First col sampled only twice, so that is min
    sparse_df = pd.DataFrame([[1, None, None],
                              [1, 1, None],
                              [1, None, 1]])
    sparse_profile = dp.Profiler(sparse_df, min_true_samples=2,
                                 samples_per_update=1)
    self.assertEqual(2, sparse_profile._min_col_samples_used)
def test_add_profilers(self, *mocks):
    data = pd.DataFrame([1, None, 3, 4, 5, None, 1])
    profile1 = dp.Profiler(data[:2])
    profile2 = dp.Profiler(data[2:])

    # test incorrect type
    with self.assertRaisesRegex(TypeError,
                                '`Profiler` and `int` are '
                                'not of the same profiler type.'):
        profile1 + 3

    # test mismatched profiles
    popped_profile = profile2._profile.pop(0)
    with self.assertRaisesRegex(ValueError,
                                'Profiles do not have the same schema.'):
        profile1 + profile2

    # test mismatched profiles due to options
    profile2._profile[0] = None
    with self.assertRaisesRegex(ValueError,
                                'The two profilers were not setup with the '
                                'same options, hence they do not calculate '
                                'the same profiles and cannot be added '
                                'together.'):
        profile1 + profile2

    # test success
    profile1._profile = dict(test=1)
    profile2._profile = dict(test=2)
    merged_profile = profile1 + profile2
    self.assertEqual(3, merged_profile._profile['test'])
    self.assertIsNone(merged_profile.encoding)
    self.assertEqual("<class 'pandas.core.frame.DataFrame'>",
                     merged_profile.file_type)
    self.assertEqual(2, merged_profile.row_has_null_count)
    self.assertEqual(2, merged_profile.row_is_null_count)
    self.assertEqual(7, merged_profile.total_samples)
    self.assertEqual(5, len(merged_profile.hashed_row_dict))

    # test success if drawn from multiple files
    profile2.encoding = 'test'
    profile2.file_type = 'test'
    merged_profile = profile1 + profile2
    self.assertEqual('multiple files', merged_profile.encoding)
    self.assertEqual('multiple files', merged_profile.file_type)
def test_correct_null_row_counts(self):
    file_path = os.path.join(test_root_path, 'data', 'csv/empty_rows.txt')
    data = pd.read_csv(file_path)
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})
    profile = dp.Profiler(data, profiler_options=profiler_options)
    self.assertEqual(2, profile.row_has_null_count)
    self.assertEqual(0.25, profile._get_row_has_null_ratio())
    self.assertEqual(2, profile.row_is_null_count)
    self.assertEqual(0.25, profile._get_row_is_null_ratio())

    file_path = os.path.join(test_root_path, 'data',
                             'csv/iris-with-null-rows.csv')
    data = pd.read_csv(file_path)
    profile = dp.Profiler(data, profiler_options=profiler_options)
    self.assertEqual(13, profile.row_has_null_count)
    self.assertEqual(13 / 24, profile._get_row_has_null_ratio())
    self.assertEqual(3, profile.row_is_null_count)
    self.assertEqual(3 / 24, profile._get_row_is_null_ratio())
def setUpClass(cls):
    cls.input_file_path = os.path.join(
        test_root_path, 'data', 'csv/aws_honeypot_marx_geo.csv')
    cls.aws_dataset = pd.read_csv(cls.input_file_path)
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})
    cls.trained_schema = dp.Profiler(cls.aws_dataset, len(cls.aws_dataset),
                                     profiler_options=profiler_options)
def test_sample_size_warning_in_the_profiler(self, *mocks):
    # structured data profile mock
    sdp_mock = mock.Mock()
    sdp_mock.clean_data_and_get_base_stats.return_value = (None, None)
    mocks[0].return_value = sdp_mock

    data = pd.DataFrame([1, None, 3, 4, 5, None])
    with self.assertWarnsRegex(UserWarning,
                               "The data will be profiled with a sample "
                               "size of 3. All statistics will be based on "
                               "this subsample and not the whole dataset."):
        profile1 = dp.Profiler(data, samples_per_update=3)
def test_data_label_assigned(self):
    # only use 5 samples
    trained_schema = dp.Profiler(self.aws_dataset, samples_per_update=5)
    report = trained_schema.report()
    has_non_null_column = False
    for key in report['data_stats']:
        # only test non-null columns
        if report['data_stats'][key]['data_type'] is not None:
            self.assertIsNotNone(report['data_stats'][key]['data_label'])
            has_non_null_column = True
    if not has_non_null_column:
        self.fail(
            "Dataset tested did not have a non-null column and therefore "
            "could not validate the test.")
def test_null_in_file(self):
    filename_null_in_file = os.path.join(
        test_root_path, 'data', 'csv/sparse-first-and-last-column.txt')
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})
    data = dp.Data(filename_null_in_file)
    profile = dp.Profiler(data, profiler_options=profiler_options)

    report = profile.report(report_options={"output_format": "pretty"})

    self.assertEqual(
        report['data_stats']['COUNT']['statistics']['null_types_index'],
        {'': '[2, 3, 4, 5, 7, 8]'}
    )
    self.assertEqual(
        report['data_stats'][' NUMBERS']['statistics']['null_types_index'],
        {'': '[5, 6, 8]', ' ': '[2, 4]'}
    )
def test_get_sample_size(self):
    data = pd.DataFrame([0] * int(50e3))

    # test data size < min_sample_size = 5000 by default
    profiler = dp.Profiler(pd.DataFrame([]))
    profiler._min_sample_size = 5000
    profiler._sampling_ratio = 0.2
    sample_size = profiler._get_sample_size(data[:1000])
    self.assertEqual(1000, sample_size)

    # test data size * 0.20 < min_sample_size < data size
    sample_size = profiler._get_sample_size(data[:10000])
    self.assertEqual(5000, sample_size)

    # test data size * 0.20 > min_sample_size
    sample_size = profiler._get_sample_size(data)
    self.assertEqual(10000, sample_size)

    # test data size * 0.50 > min_sample_size
    profiler._sampling_ratio = 0.5
    sample_size = profiler._get_sample_size(data)
    self.assertEqual(25000, sample_size)
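# The four cases above pin down the sampling rule: sample `_sampling_ratio` of the
# rows, but never fewer than `_min_sample_size` and never more than the data has.
# The helper below is a standalone restatement of that rule for illustration only,
# not the library's internal implementation.
def expected_sample_size(data_len, min_sample_size=5000, sampling_ratio=0.2):
    """Illustrative restatement of the sampling rule exercised by the test."""
    return min(data_len, max(min_sample_size, int(sampling_ratio * data_len)))

assert expected_sample_size(1000) == 1000    # smaller than the minimum: use all rows
assert expected_sample_size(10000) == 5000   # ratio undershoots: clamp to the minimum
assert expected_sample_size(50000) == 10000  # ratio exceeds the minimum: use the ratio
assert expected_sample_size(50000, sampling_ratio=0.5) == 25000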
def test_marginal_dist_detection():
    iris = datasets.load_iris()
    data = pd.DataFrame(
        data=np.c_[iris["data"], iris["target"]],
        columns=iris["feature_names"] + ["target"],
    )
    data.target = data.target.astype(int)

    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "data_labeler.is_enabled": False,
        "correlation.is_enabled": True,
        "structured_options.multiprocess.is_enabled": False,
    })

    profile = dp.Profiler(data, options=profile_options)
    report = profile.report()
    marginal_dist_list = detect_dist(report)

    assert len(marginal_dist_list) == len(
        report["data_stats"]
    ), "Length of distributions list must be equal to number of columns"

    for col_num, col in enumerate(report["data_stats"]):
        dist_name = marginal_dist_list[col_num]["dist"]
        assert hasattr(
            stats, dist_name
        ), "The detected distribution must be defined in scipy.stats"

        dist_method = getattr(stats, dist_name)
        if col["data_type"] == "float":
            assert isinstance(
                dist_method, stats.rv_continuous
            ), "Detected distribution must be continuous for columns with continuous random variables"
        else:
            assert isinstance(
                dist_method, stats.rv_discrete
            ), "Detected distribution must be discrete for columns with discrete random variables"
def test_no_tensorflow(self):
    import sys
    import importlib
    import types
    import pandas
    orig_import = __import__

    # necessary for any wrapper around the library to test if snappy caught
    # as an issue
    def import_mock(name, *args):
        if name == 'tensorflow':
            raise ImportError('test')
        return orig_import(name, *args)

    with mock.patch('builtins.__import__', side_effect=import_mock):
        with self.assertWarns(RuntimeWarning) as w:
            import dataprofiler
            df = pandas.DataFrame([[1, 2.0], [1, 2.2], [-1, 3]])
            profile = dataprofiler.Profiler(df)

    warning_msg = "Partial Profiler Failure"
    self.assertIn(warning_msg, str(w.warning))
def test_generating_report_ensure_no_error(self):
    file_path = os.path.join(test_root_path, 'data', 'csv/diamonds.csv')
    data = pd.read_csv(file_path)
    profile = dp.Profiler(data[:1000])
    readable_report = profile.report(
        report_options={"output_format": "compact"})
for sample_size in sample_sizes:

    # setup time dict
    print(f"Evaluating sample size: {sample_size}")
    df = data.data.sample(sample_size, replace=True).reset_index(drop=True)

    if PERCENT_TO_NAN:
        samples_to_nan = int(len(df) * PERCENT_TO_NAN / 100)
        for col_name in df:
            ind_to_nan = random.sample(list(df.index), samples_to_nan)
            df[col_name][ind_to_nan] = "None"

    # time profiling
    start_time = time.time()
    if ALLOW_SUBSAMPLING:
        profiler = dp.Profiler(df, options=options)
    else:
        profiler = dp.Profiler(df, samples_per_update=len(df), options=options)
    total_time = time.time() - start_time

    # get overall time for merging profiles
    start_time = time.time()
    try:
        merged_profile = profiler + profiler
    except ValueError:
        pass  # empty profile merge if 0 data
    merge_time = time.time() - start_time

    # get times for each profile in the columns
def test_string_index_doesnt_cause_error(self, *mocks):
    dp.Profiler(pd.DataFrame([[1, 2, 3]], index=["hello"]))
class ExpectProfileNumericColumnsDiffBetweenInclusiveThresholdRange(TableExpectation):
    """
    This expectation takes the difference report between the data it is called on and a DataProfiler
    profile of the same schema loaded from a provided path. This function builds upon the custom table
    expectations of Great Expectations. Each numerical column will be checked against a user provided
    dictionary of columns paired with dictionaries of statistics containing lower and upper bounds.
    It is expected that a statistics value for a given column is within the specified threshold, inclusive.

    Args:
        profile_path (str): A path to a saved DataProfiler profile object on the local filesystem.
        limit_check_report_keys (dict): A dict, containing column names as keys and dicts as values that
            contain statistics as keys and dicts as values containing two keys: "lower" denoting the lower
            bound for the threshold range, and "upper" denoting the upper bound for the threshold range.
        mostly (float - optional): a value indicating the lower bound percentage of successful values that
            must be present to evaluate to success=True.

    validator.expect_profile_numerical_columns_diff_between_threshold_range(
        profile_path="C:/path_to/my_profile.pkl",
        limit_check_report_keys={
            "column_one": {
                "min": {"lower": 2.0, "upper": 10.0},
            },
            "*": {
                "*": {"lower": 0, "upper": 100},
            },
        }
    )

    Note: In limit_check_report_keys, "*" in place of a column denotes a general operator in which the
    value it stores will be applied to every column in the data that has no explicit key. "*" in place
    of a statistic denotes a general operator in which the bounds it stores will be applied to every
    statistic for the given column that has no explicit key.
    """

    example_profile_data = [
        [2, 5, "10", "ten", 25],
        [4, 10, "20", "twenty", 50],
        [6, 15, "30", "thirty", 75],
        [8, 20, "40", "forty", 100],
        [10, 25, "50", "fifty", 125],
    ]
    example_profile_columns = [
        "by_2",
        "by_5",
        "str_by_10",
        "words_by_10",
        "by_25",
    ]

    df = pd.DataFrame(example_profile_data, columns=example_profile_columns)
    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False

    example_profile = dp.Profiler(df, options=profiler_opts)

    profile_path = (
        "/example_profiles/expect_profile_diff_less_than_threshold_profile.pkl"
    )

    dir_path = os.path.dirname(os.path.abspath(__file__))
    profile_path = dir_path + profile_path

    example_profile.save(filepath=profile_path)

    examples = [
        {
            "data": {
                "by_2": [4, 6, 8, 10, 12],
                "by_5": [10, 15, 20, 25, 30],
                "str_by_10": ["20", "30", "40", "50", "60"],
                "words_by_10": ["twenty", "thirty", "forty", "fifty", "sixty"],
                "by_25": [50, 75, 100, 125, 150],
            },
            "tests": [
                {
                    "title": "profile_min_delta_witin_threshold",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {
                                "min": {"lower": 0, "upper": 50},
                            },
                        },
                    },
                    "out": {"success": True},
                },
                {
                    "title": "profile_all_stats_beyond_delta_threshold",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": 0, "upper": 0}},
                            "by_2": {
                                "min": {"lower": -1, "upper": 1},
                            },
                        },
                    },
                    "out": {"success": False},
                },
                {
                    "title": "checking_single_failure_in_one_column",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": -25, "upper": 50}},
                            "by_2": {"min": {"lower": 0, "upper": 0}},
                        },
                    },
                    "out": {"success": False},
                },
                {
                    "title": "single_failure_still_mostly_successful",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": -25, "upper": 50}},
                            "by_2": {"min": {"lower": 0, "upper": 0}},
                        },
                        "mostly": 0.75,
                    },
                    "out": {"success": True},
                },
            ],
        },
    ]

    metric_dependencies = (
        "data_profiler.profile_numeric_columns_diff_between_inclusive_threshold_range",
    )

    success_keys = (
        "profile_path",
        "limit_check_report_keys",
        "numerical_diff_statistics",
        "mostly",
    )

    default_limit_check_report_keys = {
        "*": {
            "min": {"lower": 0, "upper": 0},
            "max": {"lower": 0, "upper": 0},
            "sum": {"lower": 0, "upper": 0},
            "mean": {"lower": 0, "upper": 0},
            "median": {"lower": 0, "upper": 0},
            "median_absolute_deviation": {"lower": 0, "upper": 0},
            "variance": {"lower": 0, "upper": 0},
            "stddev": {"lower": 0, "upper": 0},
            "unique_count": {"lower": 0, "upper": 0},
            "unique_ratio": {"lower": 0, "upper": 0},
            "gini_impurity": {"lower": 0, "upper": 0},
            "unalikeability": {"lower": 0, "upper": 0},
            "sample_size": {"lower": 0, "upper": 0},
            "null_count": {"lower": 0, "upper": 0},
        }
    }

    numerical_diff_statistics = list(default_limit_check_report_keys["*"].keys())

    default_kwarg_values = {
        "limit_check_report_keys": default_limit_check_report_keys,
        "numerical_diff_statistics": numerical_diff_statistics,
        "mostly": 1.0,
    }

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        delta_between_thresholds = metrics.get(
            "data_profiler.profile_numeric_columns_diff_between_inclusive_threshold_range"
        )
        mostly = self.get_success_kwargs().get(
            "mostly", self.default_kwarg_values.get("mostly")
        )

        unexpected_values = {}
        total_stats = 0.0
        failed_stats = 0.0
        for column, value in delta_between_thresholds.items():
            column_unexpected_values = {}
            for stat, val in value.items():
                if val is not True:
                    column_unexpected_values[stat] = val
                    failed_stats += 1.0
                total_stats += 1.0
            if column_unexpected_values != {}:
                unexpected_values[column] = column_unexpected_values

        successful_stats = total_stats - failed_stats
        percent_successful = successful_stats / total_stats

        success = percent_successful >= mostly

        results = {
            "success": success,
            "expectation_config": configuration,
            "result": {
                "unexpected_values": unexpected_values,
            },
        }
        return results

    library_metadata = {
        "requirements": ["dataprofiler", "tensorflow", "scikit-learn", "numpy"],
        "maturity": "experimental",  # "concept_only", "experimental", "beta", or "production"
        "tags": [
            "dataprofiler",
            "dataassistance",
        ],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@stevensecreti",  # Don't forget to add your github handle here!
        ],
    }