def test_data_labeler_toggle(self):
    """Disabling the data labeler option removes its profile entry."""
    src_column = self.aws_dataset.src
    options = StructuredOptions()
    options.data_labeler.is_enabled = False

    default_profile = StructuredDataProfile(
        src_column, sample_size=len(src_column))
    toggled_profile = StructuredDataProfile(
        src_column, sample_size=len(src_column), options=options)

    # Present by default, absent once the labeler is toggled off.
    self.assertIn('data_label_profile', default_profile.profiles)
    self.assertNotIn('data_label_profile', toggled_profile.profiles)
def test_null_count(self):
    """null_count reflects nulls in the sample; full sampling finds all."""
    series = pd.Series([1, float('nan')] * 10)

    # Subsample of the data: with this seed the sample contains 6 nulls.
    random.seed(0)
    subset_profile = StructuredDataProfile(series, sample_size=10)
    self.assertEqual(6, subset_profile.null_count)

    # Full sample size: every one of the 10 nulls is counted.
    full_profile = StructuredDataProfile(series, sample_size=len(series))
    self.assertEqual(10, full_profile.null_count)
def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
    """
    Tests that a column whose unique-value ratio stays under
    _CATEGORICAL_THRESHOLD_DEFAULT and which contains uppercase
    variants of its values is still identified as categorical.
    """
    # One more unique value than the classification maximum.
    num_unique_values = CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL + 1
    list_unique_values = [
        self.test_sentence + str(i + 1) for i in range(num_unique_values)
    ]
    # Enough repetitions to keep the unique ratio under the threshold.
    num_sentences = int(
        float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) + 2
    cat_sentence_list = list_unique_values * num_sentences
    # Replace the last three entries with uppercase variants to
    # exercise case handling in the categorical detection.
    cat_sentence_list[-1] = self.test_sentence_upper1 + str(num_sentences)
    cat_sentence_list[-2] = self.test_sentence_upper2 + \
        str(num_sentences - 1)
    cat_sentence_list[-3] = self.test_sentence_upper3 + \
        str(num_sentences - 2)
    len_unique = len(set(cat_sentence_list))
    cat_sentence_df = pd.Series(cat_sentence_list)
    column_profile = StructuredDataProfile(cat_sentence_df)
    cat_profiler = column_profile.profiles['data_stats_profile']._profiles[
        "category"]
    self.assertEqual(True, cat_profiler.is_match)
    # Every distinct value (including uppercase ones) is a category.
    self.assertEqual(len_unique, len(cat_profiler.categories))
def test_base_props(self):
    """
    Validates baseline properties of a StructuredDataProfile built from
    the aws_dataset `src` column: compiler instance types, registered
    sub-profile keys, null/sample counts, and that counts accumulate
    across repeated `update_profile` batch additions.
    """
    src_column = self.aws_dataset.src
    src_profile = StructuredDataProfile(
        src_column, sample_size=len(src_column))

    self.assertIsInstance(src_profile.profiles['data_type_profile'],
                          ColumnPrimitiveTypeProfileCompiler)
    self.assertIsInstance(src_profile.profiles['data_stats_profile'],
                          ColumnStatsProfileCompiler)

    data_types = ['int', 'float', 'datetime', 'text']
    six.assertCountEqual(
        self, data_types,
        list(src_profile.profiles['data_type_profile']._profiles.keys())
    )
    stats_types = ['category', 'order']
    six.assertCountEqual(
        self, stats_types,
        list(src_profile.profiles['data_stats_profile']._profiles.keys())
    )

    self.assertEqual(3, src_profile.null_count)
    self.assertEqual(2999, src_profile.sample_size)

    # Sum index counts across all null types; only the values are
    # needed, so iterate .values() instead of .items() with a manual
    # accumulator loop.
    total_nulls = sum(
        len(null_rows)
        for null_rows in src_profile.null_types_index.values())
    self.assertEqual(3, total_nulls)

    # test updated base props with batch addition: two more passes of
    # the same column triple the null and sample counts.
    src_profile.update_profile(src_column)
    src_profile.update_profile(src_column)
    self.assertEqual(3 * 3, src_profile.null_count)
    self.assertEqual(2999 * 3, src_profile.sample_size)
def test_get_base_props_and_clean_null_params(self): data = pd.Series([1, None, 3, 4, None, 6], index=['a', 'b', 'c', 'd', 'e', 'f']) # validate that if sliced data, still functional # previously `iloc` was used at: # `df_series = df_series.loc[sorted(true_sample_list)]` # which caused errors df_series, base_stats = \ StructuredDataProfile.get_base_props_and_clean_null_params( self=None, df_series=data[1:], sample_size=6, min_true_samples=0) # note data above is a subset `df_series=data[1:]`, 1.0 will not exist self.assertTrue(np.issubdtype(np.object_, df_series.dtype)) self.assertDictEqual( {'sample': ['4.0', '6.0', '3.0'], 'sample_size': 5, 'null_count': 2, 'null_types': dict(nan=['e', 'b'])}, base_stats)
def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self):
    """
    Tests that a column whose unique-value ratio exceeds the
    categorical threshold is identified as text, not category.
    """
    unique_count = \
        CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL + 1
    unique_sentences = [
        self.test_sentence + str(idx + 1) for idx in range(unique_count)
    ]
    # Too few repetitions: unique ratio lands above the threshold.
    repetitions = int(
        float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) - 1
    sentence_series = pd.Series(unique_sentences * repetitions)

    profile = StructuredDataProfile(sentence_series)
    category_profiler = \
        profile.profiles['data_stats_profile']._profiles["category"]
    self.assertEqual(False, category_profiler.is_match)
def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(
        self):
    """
    Tests that a column of long sentences is still categorical when the
    number of unique values stays below the classification maximum.
    """
    sentence_count = \
        CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL - 1
    sentences = [
        self.test_sentence_long + str(idx + 1)
        for idx in range(sentence_count)
    ]
    expected_unique = len(set(sentences))

    profile = StructuredDataProfile(pd.Series(sentences))
    category_profiler = \
        profile.profiles['data_stats_profile']._profiles["category"]
    self.assertEqual(True, category_profiler.is_match)
    self.assertEqual(expected_unique, len(category_profiler.categories))
def test_add_profilers(self, *mocks):
    """
    Tests `+` merging of two StructuredDataProfiles: type/name/option
    mismatch errors, a successful merge of samples and null stats, and
    how sampling properties combine (max size, max ratio via the larger
    min_sample_size holder, max min_true_samples).
    """
    data = pd.Series([1, None, 3, 4, 5, None])
    profile1 = StructuredDataProfile(data[:2])
    profile2 = StructuredDataProfile(data[2:])

    # test incorrect type
    with self.assertRaisesRegex(TypeError,
                                '`StructuredDataProfile` and `int` are '
                                'not of the same profiler type.'):
        profile1 + 3

    # test mismatched names
    profile1.name = 'profile1'
    profile2.name = 'profile2'
    with self.assertRaisesRegex(ValueError,
                                'Structured profile names are unmatched: '
                                'profile1 != profile2'):
        profile1 + profile2

    # test mismatched profiles due to options: make the two sides carry
    # different sub-profile keys so addition must refuse.
    profile2.name = 'profile1'
    profile1._profiles = dict(test1=mock.Mock())
    profile2.profiles.pop('data_label_profile')
    with self.assertRaisesRegex(ValueError,
                                'Structured profilers were not setup with '
                                'the same options, hence they do not '
                                'calculate the same profiles and cannot be '
                                'added together.'):
        profile1 + profile2

    # test success: identical keys merge; sub-profile values are summed.
    profile1.profiles = dict(test=1)
    profile2.profiles = dict(test=2)
    merged_profile = profile1 + profile2
    self.assertEqual(3, merged_profile.profiles['test'])
    self.assertEqual(['4.0', '5.0', '1.0', '3.0'], merged_profile.sample)
    self.assertEqual(6, merged_profile.sample_size)
    self.assertEqual(2, merged_profile.null_count)
    self.assertListEqual(['nan'], merged_profile.null_types)
    self.assertDictEqual({'nan': [1, 5]}, merged_profile.null_types_index)

    # test add with different sampling properties
    profile1._min_sample_size = 10
    profile2._min_sample_size = 100
    profile1._sampling_ratio = 0.5
    profile2._sampling_ratio = 0.3
    profile1._min_true_samples = 11
    profile2._min_true_samples = 1
    merged_profile = profile1 + profile2
    self.assertEqual(100, merged_profile._min_sample_size)
    self.assertEqual(0.5, merged_profile._sampling_ratio)
    self.assertEqual(11, merged_profile._min_true_samples)
def test_categorical_mapping(self):
    """
    Tests that across incremental updates the categorical profiler's
    categories plus the profile's recognized null types together cover
    every distinct stringified value, and that null-type index counts
    grow as expected.
    """
    df1 = pd.Series([
        "abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2",
        np.nan,
    ])
    df2 = pd.Series([
        "1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa", "b",
        "ee",
    ])
    df3 = pd.Series([
        "NaN", "b", "nan", "c", None,
    ])

    column_profile = StructuredDataProfile(df1)
    cat_profiler = column_profile.profiles['data_stats_profile']._profiles[
        "category"]

    # df1 alone: only np.nan is a null, indexed once under "nan".
    num_null_types = 1
    num_nan_count = 1
    categories = df1.apply(str).unique().tolist()
    six.assertCountEqual(
        self, categories,
        cat_profiler.categories + column_profile.null_types)
    self.assertEqual(num_null_types, len(column_profile.null_types))
    self.assertEqual(num_nan_count,
                     len(column_profile.null_types_index["nan"]))

    # After df2: "null", "NaN", "None" join as null types (4 total);
    # "nan" has been seen twice.
    num_null_types = 4
    num_nan_count = 2
    categories = pd.concat([df1, df2]).apply(str).unique().tolist()
    column_profile.update_profile(df2)
    six.assertCountEqual(
        self, categories,
        cat_profiler.categories + column_profile.null_types)
    self.assertEqual(num_null_types, len(column_profile.null_types))
    self.assertEqual(num_nan_count,
                     len(column_profile.null_types_index["nan"]))

    # After df3: no new null types, but a third "nan" occurrence; the
    # "NaN" bucket is tracked separately from "nan".
    num_null_types = 4
    num_nan_count = 3
    categories = pd.concat([df1, df2, df3]).apply(str).unique().tolist()
    column_profile.update_profile(df3)
    six.assertCountEqual(
        self, categories,
        cat_profiler.categories + column_profile.null_types)
    self.assertEqual(num_null_types, len(column_profile.null_types))
    self.assertEqual(num_nan_count,
                     len(column_profile.null_types_index["nan"]))
    self.assertNotEqual(num_nan_count,
                        len(column_profile.null_types_index["NaN"]))