def test_base_case(self, mock_instance):
        """Check the profiler state after an update with an empty series.

        With no samples the test expects a sample size of zero, the labels
        ``["a", "b"]``, and ``None`` for every label-derived statistic in the
        reported profile.

        :param mock_instance: mock standing in for the data labeler; it is
            configured by ``_setup_data_labeler_mock`` (defined outside this
            view).
        """
        self._setup_data_labeler_mock(mock_instance)

        data = pd.Series([], dtype=object)
        profiler = DataLabelerColumn(data.name)

        # Deterministic clock: popping from the end of the list makes
        # successive time.time() calls return 1.0, 2.0, 3.0, 4.0.
        time_array = [float(i) for i in range(4, 0, -1)]
        with mock.patch('time.time', side_effect=lambda: time_array.pop()):
            profiler.update(data)

            self.assertEqual(0, profiler.sample_size)
            self.assertEqual(["a", "b"], profiler._possible_data_labels)
            # Identity checks: these must be exactly None, not merely
            # something that compares equal to None.
            self.assertIsNone(profiler.data_label)
            self.assertIsNone(profiler.avg_predictions)
            # The rest of this file already relies on Python 3-only unittest
            # APIs (assertRaisesRegex), so the six shim is unnecessary here.
            self.assertCountEqual(
                [
                    "data_label",
                    "avg_predictions",
                    "data_label_representation",
                    "times",
                ],
                list(profiler.profile.keys()),
            )
            self.assertEqual(
                {
                    "data_label": None,
                    "avg_predictions": None,
                    "data_label_representation": None,
                    # defaultdict(float) matches the factory the sibling
                    # tests compare against; equality ignores the factory.
                    "times": defaultdict(float),
                },
                profiler.profile,
            )
    def test_multi_labels(self, mock_instance):
        """Check the reported label when several labels receive votes.

        The mocked ``predict`` returns one-hot confidence rows that give
        4 votes to ``a``, 2 to ``b``, 3 to ``c`` and 1 to ``d``; the test
        expects the resulting ``data_label`` to be ``"a|c|b"``.

        :param mock_instance: mock whose ``return_value`` plays the role of
            the data labeler.
        """
        label_names = ["a", "b", "c", "d"]

        labeler = mock_instance.return_value
        labeler.model.num_labels = 4
        labeler.model.requires_zero_mapping = False
        labeler.label_mapping = {
            name: index for index, name in enumerate(label_names)
        }
        labeler.reverse_label_mapping = dict(enumerate(label_names))

        def fake_predict(data, *args, **kwargs):
            # Rows are grouped per label; counts [4, 2, 3, 1] => [a, b, c, d]
            confidence_rows = (
                [[1, 0, 0, 0]] * 4
                + [[0, 1, 0, 0]] * 2
                + [[0, 0, 1, 0]] * 3
                + [[0, 0, 0, 1]] * 1
            )
            return {'pred': None, 'conf': np.array(confidence_rows)}

        labeler.predict.side_effect = fake_predict

        column = pd.Series(['1'] * 10)
        profiler = DataLabelerColumn(column.name)
        profiler.update(column)

        self.assertEqual("a|c|b", profiler.data_label)
示例#3
0
    def test_data_label_low_accuracy(self, mock_instance):
        """Check the label reported when prediction confidences are low.

        ``predict`` is replaced by a stub that returns a confidence of 0.2
        for both labels; the test expects ``data_label`` to be
        ``"could not determine"``.

        :param mock_instance: mock standing in for the data labeler.
        """
        self._setup_data_labeler_mock(mock_instance)

        def low_confidence_predict(data, *args, **kwargs):
            predictions = np.array([[0, 0]])
            confidences = np.array([[0.2, 0.2]])
            return {'pred': predictions, 'conf': confidences}

        labeler = mock_instance.return_value
        labeler.predict.side_effect = low_confidence_predict

        column = pd.Series(['1'])
        profiler = DataLabelerColumn(column.name)
        profiler.update(column)

        self.assertEqual("could not determine", profiler.data_label)
示例#4
0
    def test_update(self, mock_instance):
        """Check the statistics exposed after a single three-sample update.

        :param mock_instance: mock standing in for the data labeler; it is
            configured by ``_setup_data_labeler_mock`` (defined outside this
            view).
        """
        self._setup_data_labeler_mock(mock_instance)

        column = pd.Series(['1', '2', '3'])
        profiler = DataLabelerColumn(column.name)
        profiler.update(column)

        # Expected per-label shares and vote counts for the three samples.
        expected_shares = {"a": 2/3, "b": 1/3}
        expected_ranks = {"a": 2, "b": 1}

        self.assertEqual(3, profiler.sample_size)
        self.assertEqual(["a", "b"], profiler._possible_data_labels)
        self.assertEqual("a", profiler.data_label)
        self.assertDictEqual(expected_shares, profiler.avg_predictions)
        self.assertDictEqual(expected_ranks, profiler.rank_distribution)
        self.assertDictEqual(expected_shares, profiler.label_representation)
    def test_update_reserve_label_mapping(self, mock_instance):
        """Check an update when the label mapping reserves index 0 for PAD.

        The mock is reconfigured with ``requires_zero_mapping = True`` and a
        three-entry mapping whose index 0 is ``"PAD"``; the test expects
        only ``"a"`` and ``"b"`` to appear in the profiler's results.

        :param mock_instance: mock standing in for the data labeler.
        """
        self._setup_data_labeler_mock(mock_instance)

        # Override the default mock configuration with a PAD-first mapping.
        labeler = mock_instance.return_value
        labeler.label_mapping = {"PAD": 0, "a": 1, "b": 2}
        labeler.reverse_label_mapping = {0: "PAD", 1: "a", 2: "b"}
        labeler.model.num_labels = 3
        labeler.model.requires_zero_mapping = True

        column = pd.Series(['1', '2', '3'])
        profiler = DataLabelerColumn(column.name)
        profiler.update(column)

        expected_shares = {"a": 2 / 3, "b": 1 / 3}
        expected_ranks = {"a": 2, "b": 1}

        self.assertEqual(["a", "b"], profiler._possible_data_labels)
        self.assertEqual("a", profiler.data_label)
        self.assertDictEqual(expected_shares, profiler.avg_predictions)
        self.assertDictEqual(expected_ranks, profiler.rank_distribution)
        self.assertDictEqual(expected_shares, profiler.label_representation)
示例#6
0
    def test_label_match(self, mock_instance):
        """
        Check that avg_predictions and label_representation use the same
        label names when several labels share one index.

        ``label_mapping`` maps six names onto four indices, while
        ``reverse_label_mapping`` names each index once; the test expects
        both statistics to be keyed by the reverse-mapping names.
        """
        labeler = mock_instance.return_value
        labeler.model.num_labels = 4
        labeler.reverse_label_mapping = {0: "a", 1: "c", 2: "e", 3: "f"}
        labeler.label_mapping = {
            "a": 0,
            "b": 1,
            "c": 1,
            "d": 2,
            "e": 2,
            "f": 3,
        }

        column = pd.Series(['1', '2', '3', '4', '5', '6'])
        profiler = DataLabelerColumn(column.name)
        # No update() is called; the sample size is set by hand before the
        # statistics are read.
        profiler.sample_size = 1

        zero_per_label = {"a": 0, "c": 0, "e": 0, "f": 0}

        self.assertEqual(["a", "c", "e", "f"], profiler._possible_data_labels)
        self.assertDictEqual(zero_per_label, profiler.label_representation)
        self.assertDictEqual(zero_per_label, profiler.avg_predictions)
    def test_profile(self, mock_instance):
        """Check the reported profile, including accumulated predict time.

        ``time.time`` is patched with a fake clock so the recorded
        ``data_labeler_predict`` time is expected to be exactly 1.0 after
        the first update and 2.0 after the second.

        :param mock_instance: mock standing in for the data labeler.
        """
        self._setup_data_labeler_mock(mock_instance)

        column = pd.Series(['1', '2', '3'])
        profiler = DataLabelerColumn(column.name)

        # Popping from the end of the list makes successive time.time()
        # calls return 1.0, 2.0, 3.0, 4.0.
        fake_clock = [float(tick) for tick in range(4, 0, -1)]
        with mock.patch('time.time', side_effect=lambda: fake_clock.pop()):
            # Before any update, no timings have been recorded.
            self.assertEqual(defaultdict(float), profiler.profile['times'])

            profiler.update(column)

            # After one update, the full profile (timing included) must match.
            self.assertDictEqual(
                {
                    "data_label": 'a',
                    "avg_predictions": dict(a=2 / 3, b=1 / 3),
                    "data_label_representation": dict(a=2 / 3, b=1 / 3),
                    "times": defaultdict(
                        float, {'data_labeler_predict': 1.0}),
                },
                profiler.profile,
            )

            # A second update must add to the recorded predict time.
            profiler.update(column)
            self.assertEqual(
                defaultdict(float, {'data_labeler_predict': 2.0}),
                profiler.profile['times'],
            )
    def test_diff(self, mock_instance):
        """Check ``diff`` between two profilers with hand-set statistics.

        The ``data_label``, ``avg_predictions`` and ``label_representation``
        attributes of ``DataLabelerColumn`` are patched out so that fixed
        values can be assigned on each instance before the diff is taken.

        :param mock_instance: mock standing in for the data labeler.
        """
        self._setup_data_labeler_mock(mock_instance)

        left = DataLabelerColumn("")
        right = DataLabelerColumn("")

        # Dotted path shared by the three patched attributes.
        column_class = ("dataprofiler.profilers.data_labeler_column_profile"
                        ".DataLabelerColumn.")

        with mock.patch(column_class + "data_label"), \
                mock.patch(column_class + "avg_predictions"), \
                mock.patch(column_class + "label_representation"):
            left.data_label = "a|b|c"
            left.avg_predictions = {"a": 0.25, "b": 0.0, "c": 0.75}
            left.label_representation = {"a": 0.15, "b": 0.01, "c": 0.84}

            right.data_label = "b|c|d"
            right.avg_predictions = {"a": 0.25, "b": 0.70, "c": 0.05}
            right.label_representation = {"a": 0.99, "b": 0.01, "c": 0.0}

            actual = left.diff(right)

            expected_label_diff = utils.find_diff_of_lists_and_sets(
                ['a', 'b', 'c'], ['b', 'c', 'd'])
            expected = {
                "data_label": expected_label_diff,
                "avg_predictions": {
                    "a": "unchanged",
                    "b": -0.70,
                    "c": 0.70,
                },
                "label_representation": {
                    "a": -0.84,
                    "b": "unchanged",
                    "c": 0.84,
                },
            }

            # Show the complete dictionary difference if the assertion fails.
            self.maxDiff = None
            self.assertDictEqual(expected, actual)
    def test_profile_merge_with_different_options(self, mock_instance):
        """Check that merging incompatible profilers raises AttributeError.

        Two cases are exercised: profilers built with different
        ``max_sample_size`` options, and profilers whose ``data_labeler``
        attributes differ.

        :param mock_instance: mock standing in for the data labeler.
        """
        self._setup_data_labeler_mock(mock_instance)

        merge_error = ("Cannot merge. The data labeler and/or the max "
                       "sample size are not the same for both column "
                       "profiles.")

        first_column = pd.Series(['1', '2', '3', '11'])
        second_column = pd.Series(['4', '5', '6', '7', '9', '10', '12'])

        # Case 1: the two profilers disagree on max_sample_size.
        first_options = DataLabelerOptions()
        first_options.max_sample_size = 20
        first = DataLabelerColumn(first_column.name, options=first_options)
        first.update(first_column)

        second_options = DataLabelerOptions()
        second_options.max_sample_size = 15
        second = DataLabelerColumn(second_column.name, options=second_options)
        second.update(second_column)

        with self.assertRaisesRegex(AttributeError, merge_error):
            first + second

        # Case 2: one profiler has its data labeler swapped for another
        # object.
        first = DataLabelerColumn(first_column.name)
        first.data_labeler = mock.MagicMock()
        second = DataLabelerColumn(second_column.name)

        with self.assertRaisesRegex(AttributeError, merge_error):
            first + second
 def test_data_labeler_column_with_wrong_options(self, *mocks):
     """Check that a non-DataLabelerOptions ``options`` value is rejected.

     Passing a plain string as ``options`` must raise ``ValueError`` with
     the message checked below.
     """
     expected_message = ("DataLabelerColumn parameter 'options' must "
                         "be of type DataLabelerOptions.")
     with self.assertRaisesRegex(ValueError, expected_message):
         DataLabelerColumn("Data_Labeler", options="wrong_data_type")
    def test_profile_merge(self, mock_instance):
        """Check that adding profilers merges statistics and keeps settings.

        Two profilers are updated with 4 and 7 samples, merged, and then the
        first profiler is merged in once more. After each merge the test
        checks the merged statistics and that the configuration attributes
        equal those of the first profiler. Finally it checks that merging
        profilers with different ``_top_k_voting`` raises ``ValueError``.

        :param mock_instance: mock standing in for the data labeler; it is
            configured by ``_setup_data_labeler_mock`` (defined outside this
            view).
        """
        self._setup_data_labeler_mock(mock_instance)

        data = pd.Series(['1', '2', '3', '11'])
        data2 = pd.Series(['4', '5', '6', '7', '9', '10', '12'])

        # Configuration attributes that must equal the source profiler's.
        carried_over_attributes = (
            "_possible_data_labels",
            "_top_k_voting",
            "_min_voting_prob",
            "_min_prob_differential",
            "_top_k_labels",
            "_min_top_label_prob",
            "_max_sample_size",
        )

        def assert_merged(merged, source, expected_profile,
                          expected_sum_predictions,
                          expected_rank_distribution):
            """Assert merged statistics and settings carried from source."""
            self.assertEqual(expected_profile, merged.profile)
            self.assertEqual(expected_sum_predictions,
                             merged._sum_predictions.tolist())
            self.assertEqual(expected_rank_distribution,
                             merged.rank_distribution)
            # The original test asserted the profile a second time here;
            # presumably to show that reading the statistics above does not
            # change it -- kept so no coverage is lost.
            self.assertEqual(expected_profile, merged.profile)
            self.assertEqual(source.data_labeler, merged.data_labeler)
            for attribute in carried_over_attributes:
                self.assertEqual(getattr(source, attribute),
                                 getattr(merged, attribute),
                                 msg=attribute)

        # Popping from the end of the list makes successive time.time()
        # calls return 1.0, 2.0, 3.0, 4.0.
        time_array = [float(i) for i in range(4, 0, -1)]
        with mock.patch('time.time', side_effect=lambda: time_array.pop()):
            profiler = DataLabelerColumn(data.name)
            profiler.update(data)

            profiler2 = DataLabelerColumn(data2.name)
            profiler2.update(data2)

            profiler3 = profiler + profiler2

            # Assert correct values
            assert_merged(
                profiler3,
                profiler,
                expected_profile={
                    "data_label": "a|b",
                    "avg_predictions": dict(a=54 / 99, b=45 / 99),
                    "data_label_representation": dict(a=54 / 99, b=45 / 99),
                    "times": defaultdict(
                        float, {'data_labeler_predict': 2.0}),
                },
                expected_sum_predictions=[6, 5],
                expected_rank_distribution={'a': 6, 'b': 5},
            )

            # Check adding even more profiles together
            profiler3 = profiler + profiler3

            # Assert only the proper changes have been made
            assert_merged(
                profiler3,
                profiler,
                expected_profile={
                    "data_label": "a|b",
                    "avg_predictions": dict(a=8 / 15, b=7 / 15),
                    "data_label_representation": dict(a=8 / 15, b=7 / 15),
                    "times": defaultdict(
                        float, {'data_labeler_predict': 3.0}),
                },
                expected_sum_predictions=[8, 7],
                expected_rank_distribution={'a': 8, 'b': 7},
            )

        # Check that error is thrown if profiles are unequal. The attribute
        # is changed outside the assertRaises block so that only the merge
        # itself can satisfy the expected ValueError.
        profiler._top_k_voting = 13
        with self.assertRaises(ValueError):
            profiler + profiler2