def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
    """
    Check that a repeated column containing uppercase sentences is
    still reported as categorical.

    The column is made of
    MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL + 1 distinct
    sentences, repeated int(1 / CATEGORICAL_THRESHOLD_DEFAULT) + 2 times,
    with the final three entries swapped for uppercase sentences. The
    category profiler must match, and must report one category per
    distinct value in the column.
    """
    unique_total = (
        CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
        + 1
    )
    base_values = []
    for idx in range(unique_total):
        base_values.append(self.test_sentence + str(idx + 1))

    repeats = int(
        float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) + 2
    sentences = base_values * repeats

    # Overwrite the last three entries, walking backwards from the end;
    # the numeric suffix counts down from ``repeats``.
    uppercase_prefixes = (
        self.test_sentence_upper1,
        self.test_sentence_upper2,
        self.test_sentence_upper3,
    )
    for offset, prefix in enumerate(uppercase_prefixes):
        sentences[-(offset + 1)] = prefix + str(repeats - offset)

    expected_category_count = len(set(sentences))

    profile = StructuredColProfiler(pd.Series(sentences))
    stats_profile = profile.profiles['data_stats_profile']
    category_profile = stats_profile._profiles["category"]

    self.assertEqual(True, category_profile.is_match)
    self.assertEqual(expected_category_count,
                     len(category_profile.categories))
def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(
        self):
    """
    Check that a column of distinct long sentences is categorical when
    it holds fewer values than the unique-value limit.

    The column contains
    MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL - 1 long sentences,
    each with a different numeric suffix. The category profiler must
    match and report one category per distinct sentence.
    """
    sentence_total = (
        CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
        - 1
    )
    long_sentences = []
    for idx in range(sentence_total):
        long_sentences.append(self.test_sentence_long + str(idx + 1))

    expected_category_count = len(set(long_sentences))

    profile = StructuredColProfiler(pd.Series(long_sentences))
    stats_profile = profile.profiles['data_stats_profile']
    category_profile = stats_profile._profiles["category"]

    self.assertEqual(True, category_profile.is_match)
    self.assertEqual(expected_category_count,
                     len(category_profile.categories))
def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self):
    """
    Check that a column with too few repeats per unique value is not
    reported as categorical.

    The column is made of
    MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL + 1 distinct
    sentences, repeated only int(1 / CATEGORICAL_THRESHOLD_DEFAULT) - 1
    times. The category profiler is expected not to match.
    """
    unique_total = (
        CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
        + 1
    )
    base_values = []
    for idx in range(unique_total):
        base_values.append(self.test_sentence + str(idx + 1))

    repeats = int(
        float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) - 1
    sentences = base_values * repeats

    profile = StructuredColProfiler(pd.Series(sentences))
    stats_profile = profile.profiles['data_stats_profile']
    category_profile = stats_profile._profiles["category"]

    self.assertEqual(False, category_profile.is_match)
def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
    """
    Check that a column with enough repeats per unique value is
    reported as categorical.

    The column is made of
    MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL + 1 distinct
    sentences, repeated int(1 / CATEGORICAL_THRESHOLD_DEFAULT) + 2 times.
    The category profiler must match and report one category per
    distinct sentence.
    """
    unique_total = (
        CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
        + 1
    )
    base_values = []
    for idx in range(unique_total):
        base_values.append(self.test_sentence + str(idx + 1))

    repeats = (
        int(float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT)
        + 2
    )
    sentences = base_values * repeats
    expected_category_count = len(set(sentences))

    profile = StructuredColProfiler(pd.Series(sentences))
    stats_profile = profile.profiles["data_stats_profile"]
    category_profile = stats_profile._profiles["category"]

    self.assertEqual(True, category_profile.is_match)
    self.assertEqual(expected_category_count,
                     len(category_profile.categories))
def test_categorical_mapping(self):
    """
    Check category counts and null-type bookkeeping as a column profile
    is created from one series and then updated with two more.

    After each stage the test asserts that the distinct stringified
    values seen so far equal the profiler's categories plus the
    profile's null types, and checks the number of null types and the
    number of entries indexed under the "nan" null type.
    """
    df1 = pd.Series([
        "abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2", np.nan,
    ])
    df2 = pd.Series([
        "1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa", "b", "ee",
    ])
    df3 = pd.Series([
        "NaN", "b", "nan", "c", None,
    ])

    # Stage 1: profile built from df1 only.
    column_profile = StructuredColProfiler(df1)
    cat_profiler = column_profile.profiles['data_stats_profile']._profiles[
        "category"]
    # df1 holds a single np.nan, so one null type with one index entry
    # is expected.
    num_null_types = 1
    num_nan_count = 1
    categories = df1.apply(str).unique().tolist()
    six.assertCountEqual(
        self, categories,
        cat_profiler.categories + column_profile.null_types)
    self.assertEqual(num_null_types, len(column_profile.null_types))
    self.assertEqual(num_nan_count,
                     len(column_profile.null_types_index["nan"]))
    # Per-category occurrence counts for the non-null values of df1.
    expected = {
        "abcd": 2, "aa": 2, "b": 1, "4": 1, "3": 1, "2": 2, "dfd": 1
    }
    self.assertDictEqual(expected, cat_profiler._categories)

    # Stage 2: the same profile updated with df2.
    # NOTE(review): presumably the four null types are "nan", "null",
    # "NaN" and "None" -- confirm against the profiler's null handling.
    num_null_types = 4
    num_nan_count = 2
    categories = pd.concat([df1, df2]).apply(str).unique().tolist()
    column_profile.update_profile(df2)
    # Re-fetch the category profiler after the update.
    cat_profiler = column_profile.profiles['data_stats_profile']._profiles[
        "category"]
    six.assertCountEqual(
        self, categories,
        cat_profiler.categories + column_profile.null_types)
    self.assertEqual(num_null_types, len(column_profile.null_types))
    self.assertEqual(num_nan_count,
                     len(column_profile.null_types_index["nan"]))
    # Counts accumulate across updates: df2 adds one "aa", one "b" and
    # the new values "1", "ee" (twice), "ff" and "gg".
    expected = {
        "abcd": 2, "aa": 3, "b": 2, "4": 1, "3": 1, "2": 2, "dfd": 1,
        "1": 1, "ee": 2, "ff": 1, "gg": 1
    }
    self.assertDictEqual(expected, cat_profiler._categories)

    # Stage 3: the same profile updated with df3.
    num_null_types = 4
    num_nan_count = 3
    categories = pd.concat([df1, df2, df3]).apply(str).unique().tolist()
    column_profile.update_profile(df3)
    cat_profiler = column_profile.profiles['data_stats_profile']._profiles[
        "category"]
    six.assertCountEqual(
        self, categories,
        cat_profiler.categories + column_profile.null_types)
    self.assertEqual(num_null_types, len(column_profile.null_types))
    self.assertEqual(num_nan_count,
                     len(column_profile.null_types_index["nan"]))
    # The "NaN" entries are indexed separately from "nan", so their
    # count must differ from the "nan" count.
    self.assertNotEqual(num_nan_count,
                        len(column_profile.null_types_index["NaN"]))