Example no. 1
0
 def test_data_labeler_toggle(self):
     """Disabling the data labeler via options must drop its profile."""
     column = self.aws_dataset.src
     opts = StructuredOptions()
     opts.data_labeler.is_enabled = False

     # Build one profile with defaults and one with the labeler disabled.
     default_profile = StructuredDataProfile(column,
                                             sample_size=len(column))
     toggled_profile = StructuredDataProfile(column,
                                             sample_size=len(column),
                                             options=opts)

     # Only the default profile should carry the data-label profile.
     self.assertIn('data_label_profile', default_profile.profiles)
     self.assertNotIn('data_label_profile', toggled_profile.profiles)
Example no. 2
0
    def test_null_count(self):
        """null_count reflects the sampled subset first, then the full data."""
        series = pd.Series([1, float('nan')] * 10)

        # Subsample of 10 rows: with this seed, 6 of them are NaN.
        random.seed(0)
        partial_profile = StructuredDataProfile(series, sample_size=10)
        self.assertEqual(6, partial_profile.null_count)

        # Full sample: all 10 NaN entries are counted.
        full_profile = StructuredDataProfile(series, sample_size=len(series))
        self.assertEqual(10, full_profile.null_count)
Example no. 3
0
    def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
        """
        Columns whose unique-value ratio stays below the categorical
        threshold should be identified as categorical even when some
        values contain uppercase letters.
        """
        n_unique = (CategoricalColumn
                    ._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL + 1)
        base_values = [self.test_sentence + str(i + 1)
                       for i in range(n_unique)]
        repeats = int(
            1.0 / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) + 2
        sentences = base_values * repeats

        # Replace the last three entries with uppercase variants so the
        # column mixes cases while staying under the threshold.
        uppercase_variants = (self.test_sentence_upper1,
                              self.test_sentence_upper2,
                              self.test_sentence_upper3)
        for offset, upper in enumerate(uppercase_variants, start=1):
            sentences[-offset] = upper + str(repeats - offset + 1)

        expected_unique = len(set(sentences))
        profile = StructuredDataProfile(pd.Series(sentences))
        cat_profiler = (profile.profiles['data_stats_profile']
                        ._profiles["category"])
        self.assertEqual(True, cat_profiler.is_match)
        self.assertEqual(expected_unique, len(cat_profiler.categories))
Example no. 4
0
    def test_base_props(self):
        """Sanity-check compiler wiring, base counts, and batch updates."""
        column = self.aws_dataset.src
        profile = StructuredDataProfile(column, sample_size=len(column))

        # Both compilers are registered under their expected keys.
        self.assertIsInstance(profile.profiles['data_type_profile'],
                              ColumnPrimitiveTypeProfileCompiler)
        self.assertIsInstance(profile.profiles['data_stats_profile'],
                              ColumnStatsProfileCompiler)

        # Each compiler exposes exactly the expected sub-profiles.
        six.assertCountEqual(
            self, ['int', 'float', 'datetime', 'text'],
            list(profile.profiles['data_type_profile']._profiles.keys())
        )
        six.assertCountEqual(
            self, ['category', 'order'],
            list(profile.profiles['data_stats_profile']._profiles.keys())
        )

        self.assertEqual(3, profile.null_count)
        self.assertEqual(2999, profile.sample_size)

        # Indices recorded per null type must add up to null_count.
        total_nulls = sum(
            len(null_rows) for null_rows in profile.null_types_index.values())
        self.assertEqual(3, total_nulls)

        # Two extra batch updates triple every count (3 passes total).
        profile.update_profile(column)
        profile.update_profile(column)

        self.assertEqual(9, profile.null_count)
        self.assertEqual(8997, profile.sample_size)
Example no. 5
0
    def test_get_base_props_and_clean_null_params(self):
        """Sliced (non-zero-based) input must still be handled correctly."""
        data = pd.Series([1, None, 3, 4, None, 6],
                         index=['a', 'b', 'c', 'd', 'e', 'f'])

        # Regression check: sampled rows must be re-selected with `.loc`,
        # not `.iloc` -- `iloc` on a sliced series previously raised errors.
        cleaned, base_stats = \
            StructuredDataProfile.get_base_props_and_clean_null_params(
                self=None, df_series=data[1:], sample_size=6,
                min_true_samples=0)

        # Input was data[1:], so the value 1.0 cannot appear in the sample.
        self.assertTrue(np.issubdtype(np.object_, cleaned.dtype))
        expected = {
            'sample': ['4.0', '6.0', '3.0'],
            'sample_size': 5,
            'null_count': 2,
            'null_types': {'nan': ['e', 'b']},
        }
        self.assertDictEqual(expected, base_stats)
Example no. 6
0
    def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self):
        """
        Columns whose unique-value ratio exceeds the categorical threshold
        should be identified as text, not categorical.
        """
        n_unique = (CategoricalColumn
                    ._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL + 1)
        unique_values = [self.test_sentence + str(i + 1)
                         for i in range(n_unique)]
        # Too few repetitions: the unique ratio exceeds the threshold.
        repeats = int(
            1.0 / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) - 1
        series = pd.Series(unique_values * repeats)

        profile = StructuredDataProfile(series)
        cat_profiler = (profile.profiles['data_stats_profile']
                        ._profiles["category"])
        self.assertEqual(False, cat_profiler.is_match)
Example no. 7
0
    def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(
            self):
        """
        Columns with fewer unique long sentences than
        MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL should be
        identified as categorical regardless of sentence length.
        """
        count = (CategoricalColumn
                 ._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL - 1)
        sentences = [self.test_sentence_long + str(i + 1)
                     for i in range(count)]

        expected_unique = len(set(sentences))
        profile = StructuredDataProfile(pd.Series(sentences))
        cat_profiler = (profile.profiles['data_stats_profile']
                        ._profiles["category"])
        self.assertEqual(True, cat_profiler.is_match)
        self.assertEqual(expected_unique, len(cat_profiler.categories))
Example no. 8
0
    def test_add_profilers(self, *mocks):
        """Verify `+` on StructuredDataProfile: it must reject mismatched
        operand types, names, and option sets, and on success merge the
        profiles, samples, null bookkeeping, and sampling settings."""
        data = pd.Series([1, None, 3, 4, 5, None])
        profile1 = StructuredDataProfile(data[:2])
        profile2 = StructuredDataProfile(data[2:])

        # test incorrect type: adding a non-profile must raise TypeError
        with self.assertRaisesRegex(TypeError,
                                    '`StructuredDataProfile` and `int` are '
                                    'not of the same profiler type.'):
            profile1 + 3

        # test mismatched names: columns with different names cannot merge
        profile1.name = 'profile1'
        profile2.name = 'profile2'
        with self.assertRaisesRegex(ValueError,
                                    'Structured profile names are unmatched: '
                                    'profile1 != profile2'):
            profile1 + profile2

        # test mismatched profiles due to options: differing profile keys
        # (simulated by replacing/popping entries) must raise ValueError
        profile2.name = 'profile1'
        profile1._profiles = dict(test1=mock.Mock())
        profile2.profiles.pop('data_label_profile')
        with self.assertRaisesRegex(ValueError,
                                    'Structured profilers were not setup with '
                                    'the same options, hence they do not '
                                    'calculate the same profiles and cannot be '
                                    'added together.'):
            profile1 + profile2

        # test success: matching keys are summed; sample/null state merges
        profile1.profiles = dict(test=1)
        profile2.profiles = dict(test=2)
        merged_profile = profile1 + profile2
        self.assertEqual(3, merged_profile.profiles['test'])
        self.assertEqual(['4.0', '5.0', '1.0', '3.0'], merged_profile.sample)
        self.assertEqual(6, merged_profile.sample_size)
        self.assertEqual(2, merged_profile.null_count)
        self.assertListEqual(['nan'], merged_profile.null_types)
        self.assertDictEqual({'nan': [1, 5]}, merged_profile.null_types_index)

        # test add with different sampling properties -- per the assertions
        # below, the merge appears to take the max of each setting
        profile1._min_sample_size = 10
        profile2._min_sample_size = 100
        profile1._sampling_ratio = 0.5
        profile2._sampling_ratio = 0.3
        profile1._min_true_samples = 11
        profile2._min_true_samples = 1
        merged_profile = profile1 + profile2
        self.assertEqual(100, merged_profile._min_sample_size)
        self.assertEqual(0.5, merged_profile._sampling_ratio)
        self.assertEqual(11, merged_profile._min_true_samples)
Example no. 9
0
    def test_categorical_mapping(self):
        """Category and null-type bookkeeping stay consistent as the
        profile is updated with successive batches of data."""
        df1 = pd.Series([
            "abcd", "aa", "abcd", "aa", "b", "4",
            "3", "2", "dfd", "2", np.nan,
        ])
        df2 = pd.Series([
            "1", "null", "ee", "NaN", "ff", "nan",
            "gg", "None", "aa", "b", "ee",
        ])
        df3 = pd.Series(["NaN", "b", "nan", "c", None])

        profile = StructuredDataProfile(df1)
        cat_profiler = (profile.profiles['data_stats_profile']
                        ._profiles["category"])

        def check_state(expected_categories, n_null_types, n_nan):
            # Categories plus recognized null types must cover every
            # distinct stringified value seen so far.
            six.assertCountEqual(
                self, expected_categories,
                cat_profiler.categories + profile.null_types)
            self.assertEqual(n_null_types, len(profile.null_types))
            self.assertEqual(n_nan, len(profile.null_types_index["nan"]))

        # Initial batch: one null type ("nan" from np.nan), one occurrence.
        check_state(df1.apply(str).unique().tolist(), 1, 1)

        # Second batch adds string null spellings (null/NaN/nan/None).
        expected = pd.concat([df1, df2]).apply(str).unique().tolist()
        profile.update_profile(df2)
        check_state(expected, 4, 2)

        # Third batch adds another lowercase "nan" plus a true None.
        expected = pd.concat([df1, df2, df3]).apply(str).unique().tolist()
        profile.update_profile(df3)
        check_state(expected, 4, 3)
        # "NaN" (capitalized) is tracked separately from lowercase "nan".
        self.assertNotEqual(3, len(profile.null_types_index["NaN"]))