def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
        """
        Check that a repeated column is still matched as categorical when
        some of its values contain uppercase letters.

        A pool of unique sentences is repeated
        int(1 / _CATEGORICAL_THRESHOLD_DEFAULT) + 2 times, then the last
        three entries are overwritten with the uppercase fixture sentences.
        The profiler must report a match and one category per distinct value.
        """
        pool_size = (
            CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
            + 1)
        sentence_pool = []
        for position in range(1, pool_size + 1):
            sentence_pool.append(self.test_sentence + str(position))

        repeats = 2 + int(
            float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT)
        column_values = sentence_pool * repeats

        # Overwrite the tail, walking backwards from the last element; the
        # numeric suffix counts down from the repeat count.
        uppercase_sentences = (
            self.test_sentence_upper1,
            self.test_sentence_upper2,
            self.test_sentence_upper3,
        )
        for offset, sentence in enumerate(uppercase_sentences):
            column_values[-(offset + 1)] = sentence + str(repeats - offset)

        expected_category_count = len(set(column_values))
        profile = StructuredColProfiler(pd.Series(column_values))
        stats_profile = profile.profiles['data_stats_profile']
        category_profile = stats_profile._profiles["category"]

        self.assertEqual(True, category_profile.is_match)
        self.assertEqual(expected_category_count,
                         len(category_profile.categories))
    def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(
            self):
        """
        Check that a column of distinct long sentences is matched as
        categorical when the number of distinct values is one below
        _MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL.

        No value is repeated; the profiler must report a match and one
        category per sentence.
        """
        sentence_count = (
            CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
            - 1)
        long_sentences = []
        for position in range(1, sentence_count + 1):
            long_sentences.append(self.test_sentence_long + str(position))

        expected_category_count = len(set(long_sentences))
        profile = StructuredColProfiler(pd.Series(long_sentences))
        stats_profile = profile.profiles['data_stats_profile']
        category_profile = stats_profile._profiles["category"]

        self.assertEqual(True, category_profile.is_match)
        self.assertEqual(expected_category_count,
                         len(category_profile.categories))
    def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self):
        """
        Check that a column is not matched as categorical when its pool of
        unique sentences is repeated too few times.

        The pool is repeated only
        int(1 / _CATEGORICAL_THRESHOLD_DEFAULT) - 1 times, and the category
        profile is expected to report is_match as False.
        """
        pool_size = (
            CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
            + 1)
        sentence_pool = []
        for position in range(1, pool_size + 1):
            sentence_pool.append(self.test_sentence + str(position))

        repeats = int(
            float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) - 1
        column_values = sentence_pool * repeats

        profile = StructuredColProfiler(pd.Series(column_values))
        stats_profile = profile.profiles['data_stats_profile']
        category_profile = stats_profile._profiles["category"]

        self.assertEqual(False, category_profile.is_match)
예제 #4
0
    def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
        """
        Check that a column is matched as categorical when its pool of
        unique sentences is repeated
        int(1 / _CATEGORICAL_THRESHOLD_DEFAULT) + 2 times.

        The profiler must report a match and one category per distinct value.
        """
        pool_size = 1 + (
            CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
        )
        sentence_pool = []
        for position in range(1, pool_size + 1):
            sentence_pool.append(self.test_sentence + str(position))

        repeats = 2 + int(
            float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT)
        column_values = sentence_pool * repeats

        expected_category_count = len(set(column_values))
        profile = StructuredColProfiler(pd.Series(column_values))
        stats_profile = profile.profiles["data_stats_profile"]
        category_profile = stats_profile._profiles["category"]

        self.assertEqual(True, category_profile.is_match)
        self.assertEqual(expected_category_count,
                         len(category_profile.categories))
    def test_categorical_mapping(self):
        """
        Tests category counts and null bookkeeping of a StructuredColProfiler
        across three successive batches (df1, then df2, then df3).

        After each batch the test checks that the stringified unique values
        seen so far equal the profiler's categories plus its null types, and
        checks the number of null types and the number of indices recorded
        under the "nan" null type. After the first two batches it also checks
        the exact per-category counts.
        """

        # Batch 1: ordinary strings plus a single np.nan.
        df1 = pd.Series([
            "abcd",
            "aa",
            "abcd",
            "aa",
            "b",
            "4",
            "3",
            "2",
            "dfd",
            "2",
            np.nan,
        ])
        # Batch 2: mixes new and repeated categories with the strings
        # "null", "NaN", "nan" and "None", none of which appear in the
        # expected category counts asserted after this batch.
        df2 = pd.Series([
            "1",
            "null",
            "ee",
            "NaN",
            "ff",
            "nan",
            "gg",
            "None",
            "aa",
            "b",
            "ee",
        ])
        # Batch 3: more "NaN"/"nan" strings, a real None, and one new
        # value ("c").
        df3 = pd.Series([
            "NaN",
            "b",
            "nan",
            "c",
            None,
        ])

        # --- Phase 1: profile built from df1 only ---
        column_profile = StructuredColProfiler(df1)
        cat_profiler = column_profile.profiles['data_stats_profile']._profiles[
            "category"]
        num_null_types = 1
        num_nan_count = 1
        # Stringify so that np.nan becomes "nan" and can be compared with
        # the profiler's string categories and null types.
        categories = df1.apply(str).unique().tolist()
        six.assertCountEqual(
            self, categories,
            cat_profiler.categories + column_profile.null_types)
        self.assertEqual(num_null_types, len(column_profile.null_types))
        self.assertEqual(num_nan_count,
                         len(column_profile.null_types_index["nan"]))
        expected = {
            "abcd": 2,
            "aa": 2,
            "b": 1,
            "4": 1,
            "3": 1,
            "2": 2,
            "dfd": 1
        }
        self.assertDictEqual(expected, cat_profiler._categories)
        # --- Phase 2: update the same profile with df2 ---
        num_null_types = 4
        num_nan_count = 2
        categories = pd.concat([df1, df2]).apply(str).unique().tolist()
        column_profile.update_profile(df2)
        # Re-fetch the category profile after the update.
        cat_profiler = column_profile.profiles['data_stats_profile']._profiles[
            "category"]
        six.assertCountEqual(
            self, categories,
            cat_profiler.categories + column_profile.null_types)
        self.assertEqual(num_null_types, len(column_profile.null_types))
        self.assertEqual(num_nan_count,
                         len(column_profile.null_types_index["nan"]))
        # Counts accumulate across df1 and df2 (e.g. "aa" 2 -> 3, "b" 1 -> 2).
        expected = {
            "abcd": 2,
            "aa": 3,
            "b": 2,
            "4": 1,
            "3": 1,
            "2": 2,
            "dfd": 1,
            "1": 1,
            "ee": 2,
            "ff": 1,
            "gg": 1
        }
        self.assertDictEqual(expected, cat_profiler._categories)

        # --- Phase 3: update the same profile with df3 ---
        # The number of null types is expected to stay at 4 while the "nan"
        # index grows to 3 entries.
        num_null_types = 4
        num_nan_count = 3
        categories = pd.concat([df1, df2, df3]).apply(str).unique().tolist()
        column_profile.update_profile(df3)
        cat_profiler = column_profile.profiles['data_stats_profile']._profiles[
            "category"]
        six.assertCountEqual(
            self, categories,
            cat_profiler.categories + column_profile.null_types)
        self.assertEqual(num_null_types, len(column_profile.null_types))
        self.assertEqual(num_nan_count,
                         len(column_profile.null_types_index["nan"]))
        # "NaN" is indexed separately from "nan": its index must not have
        # the same number of entries.
        self.assertNotEqual(num_nan_count,
                            len(column_profile.null_types_index["NaN"]))