Exemplo n.º 1
0
 def _invalid_check(data):
     """Assert that validating ``data`` raises the expected TypeError.

     ``self`` is a free variable here (presumably bound by the enclosing
     test method).
     """
     expected_error = (
         "Data must be imported using the "
         "data_readers, pd.DataFrames, "
         "np.ndarrays, or lists."
     )
     with self.assertRaisesRegex(TypeError, expected_error):
         BaseDataLabeler._check_and_return_valid_data_format(data)
Exemplo n.º 2
0
 def _valid_check(data):
     """Validate ``data`` for prediction and return the converted result.

     Any exception raised by the validation is turned into a test failure
     whose message includes the original exception. ``self`` is a free
     variable here (presumably bound by the enclosing test method).
     """
     print("\tChecking data format: {}".format(str(type(data))))
     try:
         # only the call under test is guarded
         data = BaseDataLabeler._check_and_return_valid_data_format(
             data, fit_or_predict='predict')
     except Exception as e:
         # include the underlying error so a failing run is diagnosable
         self.fail(
             "Exception raised on input of accepted types: "
             "{!r}".format(e))
     return data
Exemplo n.º 3
0
    def test_load_with_components(self, *mocks):
        """Check that ``load_with_components`` uses the supplied components.

        Builds mock preprocessor, model and postprocessor objects, passes
        them to ``BaseDataLabeler.load_with_components`` and verifies the
        class of the result and that it exposes the same objects.
        """
        # each mock carries a distinct ``_parameters`` value so that a
        # mix-up between components would be detected
        preprocessor = mock.Mock(spec=data_processing.BaseDataPreprocessor)
        preprocessor._parameters = {"test": 1}
        postprocessor = mock.Mock(spec=data_processing.BaseDataPostprocessor)
        postprocessor._parameters = {"test": 2}
        model = mock.Mock(spec=BaseTrainableModel)
        model._parameters = {"test": 3}

        labeler = BaseDataLabeler.load_with_components(
            preprocessor=preprocessor,
            model=model,
            postprocessor=postprocessor,
        )

        self.assertIsInstance(labeler, BaseDataLabeler)
        self.assertEqual('CustomDataLabeler', labeler.__class__.__name__)

        # (attribute on the labeler, object passed in, expected parameters)
        expectations = (
            ('preprocessor', preprocessor, {"test": 1}),
            ('model', model, {"test": 3}),
            ('postprocessor', postprocessor, {"test": 2}),
        )
        for attr_name, component, parameters in expectations:
            self.assertEqual(component, getattr(labeler, attr_name))
            self.assertEqual(
                parameters, getattr(labeler, attr_name)._parameters)
Exemplo n.º 4
0
    def test_save_labeler(self, mock_load_data_labeler, mock_open,
                          mock_load_model):
        """Check what ``save_to_disk`` writes and which components it saves.

        Verifies the text captured from the mocked ``open``, the path and
        mode ``open`` was called with, and that the model, preprocessor and
        postprocessor each had ``save_to_disk`` called with the directory.
        """
        # setup mocks; the returned object's contents are inspected below
        captured_output = setup_save_mock_open(mock_open)

        labeler = BaseDataLabeler('fake_path')
        for attr_name in ('_model', '_preprocessor', '_postprocessor'):
            setattr(labeler, attr_name, mock.Mock())

        labeler.save_to_disk('test')

        expected_json = (
            '{"model": {"class": "Mock"}, "preprocessor": {"class": "Mock"}, '
            '"postprocessor": {"class": "Mock"}}'
        )
        self.assertEqual(expected_json, captured_output.getvalue())
        mock_open.assert_called_with('test/data_labeler_parameters.json', 'w')
        for attr_name in ('_model', '_preprocessor', '_postprocessor'):
            component = getattr(labeler, attr_name)
            component.save_to_disk.assert_called_with('test')

        # close mock
        # NOTE(review): close is invoked through the StringIO class rather
        # than on the instance -- presumably the helper replaces the
        # instance's ``close``; confirm before simplifying this call.
        StringIO.close(captured_output)
Exemplo n.º 5
0
 def _invalid_check(data):
     """Assert that validating ``data`` raises the expected TypeError.

     ``self`` is a free variable here (presumably bound by the enclosing
     test method).
     """
     expected_error = (
         "Data must either be imported using "
         "the data_readers or pd.DataFrame."
     )
     with self.assertRaisesRegex(TypeError, expected_error):
         BaseDataLabeler._check_and_return_valid_data_format(data)
Exemplo n.º 6
0
    def test_check_and_return_valid_data_format(self):
        """Exercise ``_check_and_return_valid_data_format`` end to end.

        Covers the two error cases (bad ``fit_or_predict`` value and an
        unsupported data type) and the conversion of 2-dimensional data,
        1-dimensional data, unstructured labels and data reader objects.
        """
        validate = BaseDataLabeler._check_and_return_valid_data_format

        def assert_converts(expected, raw, mode):
            # the validated output must equal the expected ndarray
            self.assertTrue(
                np.array_equal(expected, validate(raw, fit_or_predict=mode)))

        # test incorrect fit_or_predict value
        bad_mode_error = '`fit_or_predict` must equal `fit` or `predict`'
        with self.assertRaisesRegex(ValueError, bad_mode_error):
            validate([], 'oops')

        # test incorrect data type
        bad_type_error = ("Data must be imported using the"
                          " data_readers, pd.DataFrames, "
                          "np.ndarrays, or lists.")
        with self.assertRaisesRegex(TypeError, bad_type_error):
            validate('oops')

        # test proper conversion of 2 dimensional structured data
        two_dim = [["this", "is"], ["two", "dimensions"]]
        two_dim_pred = np.array(["this", "is", "two", "dimensions"])
        # for fit: the 2-D shape is expected to be kept
        assert_converts(np.array(two_dim), two_dim, 'fit')
        assert_converts(np.array(two_dim), pd.DataFrame(two_dim), 'fit')
        assert_converts(np.array(two_dim), np.array(two_dim), 'fit')
        # for predict: a flat array of all cells is expected
        assert_converts(two_dim_pred, two_dim, 'predict')
        assert_converts(two_dim_pred, pd.DataFrame(two_dim), 'predict')
        assert_converts(two_dim_pred, np.array(two_dim), 'predict')

        # test proper conversion of 1 dimensional data
        one_dim = ["this", "is", "one", "dimension"]
        one_dim_pred = np.array(one_dim)
        # for fit
        assert_converts(np.array(one_dim), one_dim, 'fit')
        assert_converts(np.array(one_dim), pd.Series(one_dim), 'fit')
        assert_converts(np.array(one_dim), np.array(one_dim), 'fit')
        # for predict
        # NOTE(review): the fit case above uses pd.Series while this case
        # uses pd.DataFrame -- confirm whether the difference is intended.
        assert_converts(one_dim_pred, one_dim, 'predict')
        assert_converts(one_dim_pred, pd.DataFrame(one_dim), 'predict')
        assert_converts(one_dim_pred, np.array(one_dim), 'predict')

        # test proper conversion of unstructured labels
        labels = [[(0, 4, "UNKNOWN"), (4, 10, "ADDRESS")],
                  [(0, 5, "SSN"), (5, 8, "UNKNOWN")]]
        validated_labels = validate(labels)
        self.assertIsInstance(validated_labels, np.ndarray)
        self.assertEqual(len(validated_labels), 2)
        first_row = validated_labels[0]
        self.assertEqual(len(first_row), 2)
        self.assertEqual(len(validated_labels[0][0]), 3)
        self.assertEqual(validated_labels[0][0][0], 0)
        self.assertEqual(validated_labels[0][1][1], 10)
        self.assertEqual(validated_labels[1][0][2], "SSN")

        # test proper conversion of data reader objects
        for reader_type in ["csv", "json", "parquet"]:
            reader = dp.Data(data=pd.DataFrame(two_dim),
                             data_type=reader_type)
            converted = validate(reader)
            self.assertTrue(np.array_equal(np.array(two_dim), converted))
 def setUpClass(cls) -> None:
     """Store the shared sample strings and load the regex data labeler."""
     samples = [
         '123 Fake St.', '1/2/2020', 'nice.', '4/3/22', 'abc', '333-44-2341'
     ]
     # reshape((-1, )) yields a one-dimensional array
     cls.data = np.array(samples).reshape((-1, ))

     labeler_path = os.path.join(default_labeler_dir, 'regex_model')
     cls.data_labeler = BaseDataLabeler.load_from_disk(labeler_path)
Exemplo n.º 8
0
 def setUpClass(cls) -> None:
     """Store the shared sample strings and load the regex data labeler."""
     samples = [
         "123 Fake St.", "1/2/2020", "nice.", "4/3/22", "abc", "333-44-2341"
     ]
     # reshape((-1, )) yields a one-dimensional array
     cls.data = np.array(samples).reshape((-1, ))

     labeler_path = os.path.join(default_labeler_dir, "regex_model")
     cls.data_labeler = BaseDataLabeler.load_from_disk(labeler_path)