示例#1
0
 def test_param_validation(self):
     """Check that valid parameters are kept and invalid ones rejected."""
     # A set of overrides the model is expected to accept unchanged.
     accepted = {
         "max_length": 10,
         "max_char_encoding_id": 11,
         "size_fc": [64, 64],
         "dropout": 0.9,
         "size_conv": 11,
         "default_label": "UNKNOWN",
         "num_fil": [48] * 2,
     }
     # Differs from the accepted set by a negative length, a string
     # encoding id, a scalar ``size_fc``, an ``optimizer`` entry and an
     # extra key; constructing a model with it must fail.
     rejected = {
         "max_length": -1,
         "max_char_encoding_id": "words",
         "size_fc": 5,
         "dropout": 0.9,
         "size_conv": 11,
         "optimizer": 6,
         "num_fil": [48] * 2,
         "fake_extra_param": "fails",
     }

     model = CharacterLevelCnnModel(parameters=accepted,
                                    label_mapping=self.label_mapping)
     model._construct_model()
     # The parameters stored on the model must equal the ones passed in.
     self.assertDictEqual(accepted, model._parameters)

     # The constructor itself is where the invalid set must be caught.
     with self.assertRaises(ValueError):
         CharacterLevelCnnModel(parameters=rejected,
                                label_mapping=self.label_mapping)
示例#2
0
 def test_param_validation(self):
     """Check that valid parameters are kept and invalid ones rejected."""
     # A set of overrides the model is expected to accept unchanged.
     accepted = {
         'max_length': 10,
         'max_char_encoding_id': 11,
         'size_fc': [64, 64],
         'dropout': 0.9,
         'size_conv': 11,
         'default_label': "BACKGROUND",
         'num_fil': [48] * 2,
     }
     # Differs from the accepted set by a negative length, a string
     # encoding id, a scalar ``size_fc``, an ``optimizer`` entry and an
     # extra key; constructing a model with it must fail.
     rejected = {
         'max_length': -1,
         'max_char_encoding_id': "words",
         'size_fc': 5,
         'dropout': 0.9,
         'size_conv': 11,
         'optimizer': 6,
         'num_fil': [48] * 2,
         'fake_extra_param': "fails",
     }

     model = CharacterLevelCnnModel(parameters=accepted,
                                    label_mapping=self.label_mapping)
     model._construct_model()
     # The parameters stored on the model must equal the ones passed in.
     self.assertDictEqual(accepted, model._parameters)

     # The constructor itself is where the invalid set must be caught.
     with self.assertRaises(ValueError):
         CharacterLevelCnnModel(parameters=rejected,
                                label_mapping=self.label_mapping)
示例#3
0
    def test_input_encoding(self):
        """Check the character encoding layer's output for a short string.

        The expected ids are each character's code point plus one
        (``'t'`` is 116 -> 117, ``'e'`` is 101 -> 102, ...), followed by
        zeros up to ``max_len``.
        """
        cnn_model = CharacterLevelCnnModel(self.label_mapping)

        input_str_tensor = tf.convert_to_tensor(['test'])
        max_char_encoding_id = 127
        max_len = 10

        # Take the first (and only) row of the encoded batch.
        encode_output = cnn_model._char_encoding_layer(input_str_tensor,
                                                       max_char_encoding_id,
                                                       max_len).numpy()[0]
        expected_output = [117, 102, 116, 117, 0, 0, 0, 0, 0, 0]
        # Compare position by position: assertCountEqual ignores ordering
        # and would also accept a scrambled or front-padded encoding.
        self.assertListEqual(expected_output, encode_output.tolist())
示例#4
0
    def test_validation_evaluate_and_classification_report(self, *mocks):
        """Validate on one labelled sample and inspect the returned report."""
        model = CharacterLevelCnnModel(self.label_mapping)
        model._construct_model()

        # One batch holding a single text sample and its label array, whose
        # last axis has one slot per index in the fixture label mapping.
        sample_text = np.array([['123 fake st']])
        last_axis = max(self.label_mapping.values()) + 1
        sample_labels = np.zeros((1, 3400, last_axis))
        val_gen = [[sample_text, sample_labels]]
        # Flag the first 11 positions as ADDRESS.
        address_index = self.label_mapping['ADDRESS']
        val_gen[0][1][:, :11, address_index] = 1

        f1, f1_report = model._validate_training(val_gen, 32, True, True)
        self.assertIsNotNone(f1)
        self.assertIsNotNone(f1_report)
        # The report's support for ADDRESS must match the 11 flagged slots.
        self.assertEqual(11, f1_report['ADDRESS']['support'])
示例#5
0
    def test_threshold_layer(self):
        """Check the argmax threshold layer with a threshold of zero.

        Every confidence row has a maximum of at least 0.0, so the test
        expects the layer to hand back the argmax indices unchanged.
        """
        cnn_model = CharacterLevelCnnModel(self.label_mapping)

        # One sample, three positions, four labels per position.
        confidences = tf.convert_to_tensor([[[0.0, 0.0, 1.0, 0.0],
                                             [0.0, 0.6, 0.4, 0.0],
                                             [0.9, 0.0, 0.0, 0.1]]])

        # Index of the largest confidence in each row above.
        argmax = tf.convert_to_tensor([[2, 1, 0]])
        expected_threshold_output = argmax.numpy()[0]
        num_labels = 4

        threshold_layer = cnn_model._argmax_threshold_layer(num_labels,
                                                            threshold=0.0,
                                                            default_ind=1)
        threshold_output = threshold_layer(argmax, confidences).numpy()[0]
        # Compare position by position: assertCountEqual ignores ordering
        # and would accept the right labels at the wrong positions.
        self.assertListEqual(expected_threshold_output.tolist(),
                             threshold_output.tolist())
示例#6
0
    def test_reverse_label_mapping(self, *mocks):
        """Check the index -> label mapping of a default-constructed model."""
        # Model built from the fixture mapping with default parameters.
        model = CharacterLevelCnnModel(self.label_mapping)

        # Expected label for each index, in index order. CITY must not
        # appear in the reverse mapping.
        names_by_index = [
            "PAD", "UNKNOWN", "ADDRESS", "BAN", "CREDIT_CARD",
            "EMAIL_ADDRESS", "UUID", "HASH_OR_KEY", "IPV4", "IPV6",
            "MAC_ADDRESS", "PERSON", "PHONE_NUMBER", "SSN", "URL",
            "DATETIME", "INTEGER_BIG",
        ]
        expected = dict(enumerate(names_by_index))

        self.assertDictEqual(expected, model.reverse_label_mapping)
示例#7
0
    def test_reverse_label_mapping(self, *mocks):
        """Check the index -> label mapping of a default-constructed model."""
        # Model built from the fixture mapping with default parameters.
        model = CharacterLevelCnnModel(self.label_mapping)

        # Expected label for each index, in index order. CITY must not
        # appear in the reverse mapping.
        names_by_index = (
            'PAD', 'BACKGROUND', 'ADDRESS', 'BAN', 'CREDIT_CARD',
            'EMAIL_ADDRESS', 'UUID', 'HASH_OR_KEY', 'IPV4', 'IPV6',
            'MAC_ADDRESS', 'PERSON', 'PHONE_NUMBER', 'SSN', 'URL',
            'DATETIME', 'INTEGER_BIG',
        )
        expected = {index: name for index, name in enumerate(names_by_index)}

        self.assertDictEqual(expected, model.reverse_label_mapping)
示例#8
0
    def test_labels(self, *mocks):
        """Check the ordered label list of a default-constructed model."""
        # Model built from the fixture mapping with default parameters.
        model = CharacterLevelCnnModel(self.label_mapping)

        # Order matters here: assertListEqual compares position by position.
        expected_labels = [
            "PAD", "UNKNOWN", "ADDRESS", "BAN", "CREDIT_CARD",
            "EMAIL_ADDRESS", "UUID", "HASH_OR_KEY", "IPV4", "IPV6",
            "MAC_ADDRESS", "PERSON", "PHONE_NUMBER", "SSN", "URL",
            "DATETIME", "INTEGER_BIG",
        ]

        self.assertListEqual(expected_labels, model.labels)
示例#9
0
    def test_save(self, mock_open, *mocks):
        """Save a model with custom parameters and check what was written."""
        # Presumably redirects the patched ``open`` into an in-memory
        # buffer that can be inspected afterwards -- confirm in the helper.
        written = setup_save_mock_open(mock_open)

        # Two parameters are overridden; the rest are left to the model.
        custom_parameters = {'max_char_encoding_id': 100, 'size_conv': 6}
        custom_mapping = {
            'PAD': 0,
            'CITY': 1,  # shares its index with BACKGROUND
            'BACKGROUND': 1,
            'ADDRESS': 2,
        }
        model = CharacterLevelCnnModel(custom_mapping, custom_parameters)
        # Stand in for the underlying network with a mock.
        model._model = mock.Mock()
        model._model_num_labels = 3
        model._model_default_ind = 1

        model.save_to_disk(".")

        # The buffer must hold the parameter JSON directly followed by the
        # label-mapping JSON.
        expected_parameters = (
            '{"max_char_encoding_id": 100, "size_conv": 6, "max_length": 3400, '
            '"dim_embed": 64, "size_fc": [96, 96], "dropout": 0.073, '
            '"default_label": "BACKGROUND", "num_fil": [48, 48, 48, 48], '
            '"pad_label": "PAD"}'
        )
        expected_mapping = (
            '{"PAD": 0, "CITY": 1, "BACKGROUND": 1, "ADDRESS": 2}'
        )
        self.assertEqual(expected_parameters + expected_mapping,
                         written.getvalue())

        # Close through the class rather than the instance attribute.
        StringIO.close(written)
示例#10
0
    def test_fit_and_predict_with_reset_weights(self, *mocks):
        """Fit with ``reset_weights=True`` and then run a prediction."""
        model = CharacterLevelCnnModel(self.label_mapping)

        # A single batch of one text sample and an all-zero label array; the
        # same object serves as both training and validation data.
        sample_text = np.array([['test']])
        last_axis = max(self.label_mapping.values()) + 1
        sample_labels = np.zeros((1, 3400, last_axis))
        train_batches = [[sample_text, sample_labels]]
        validation_batches = train_batches

        # fit() is expected to return exactly three values.
        _history, _f1, _f1_report = model.fit(train_batches,
                                              validation_batches,
                                              reset_weights=True)

        # Prediction takes batches holding only the text.
        predict_batches = [np.array([['test']])]
        model.predict(predict_batches)
示例#11
0
    def test_labels(self, *mocks):
        """Check the ordered label list of a default-constructed model."""
        # Model built from the fixture mapping with default parameters.
        model = CharacterLevelCnnModel(self.label_mapping)

        # Order matters here: assertListEqual compares position by position.
        expected_labels = [
            'PAD',
            'UNKNOWN',
            'ADDRESS',
            'BAN',
            'CREDIT_CARD',
            'EMAIL_ADDRESS',
            'UUID',
            'HASH_OR_KEY',
            'IPV4',
            'IPV6',
            'MAC_ADDRESS',
            'PERSON',
            'PHONE_NUMBER',
            'SSN',
            'URL',
            'DATETIME',
            'INTEGER_BIG',
        ]

        self.assertListEqual(expected_labels, model.labels)
示例#12
0
    def test_model_construct(self):
        """Build the default model and check layer and label counts."""
        model = CharacterLevelCnnModel(label_mapping=self.label_mapping)
        model._construct_model()
        # details() only has to run without raising.
        model.details()

        # Only the number of layers is compared below, not the names.
        expected_layers = [
            "input_1", "lambda", "embedding",
            "conv1d", "dropout", "batch_normalization",
            "conv1d_1", "dropout_1", "batch_normalization_1",
            "conv1d_2", "dropout_2", "batch_normalization_2",
            "conv1d_3", "dropout_3", "batch_normalization_3",
            "dense", "dropout_4",
            "dense_1", "dropout_5",
            "dense_2",
            "tf_op_layer_ArgMax",
            "thresh_arg_max_layer",
        ]
        actual_layers = [layer.name for layer in model._model.layers]
        self.assertEqual(len(expected_layers), len(actual_layers))
        self.assertEqual(17, model.num_labels)
示例#13
0
    def test_fit_and_predict_with_reset_weights(self, *mocks):
        """Fit with ``reset_weights=True``, expect log output, then predict."""
        model = CharacterLevelCnnModel(self.label_mapping)

        # A single batch of one text sample and an all-zero label array; the
        # same object serves as both training and validation data.
        sample_text = np.array([["test"]])
        last_axis = max(self.label_mapping.values()) + 1
        sample_labels = np.zeros((1, 3400, last_axis))
        train_batches = [[sample_text, sample_labels]]
        validation_batches = train_batches

        # Capture INFO-level records from the model's logger while fitting.
        logger_name = "DataProfiler.labelers.character_level_cnn_model"
        with self.assertLogs(logger_name, level="INFO") as captured:
            _history, _f1, _f1_report = model.fit(train_batches,
                                                  validation_batches,
                                                  reset_weights=True)

        # At least one record must have been emitted during fit.
        self.assertTrue(len(captured.output))

        # Prediction takes batches holding only the text.
        predict_batches = [np.array([["test"]])]
        model.predict(predict_batches)
示例#14
0
    def test_fit_and_predict_with_new_labels(self):
        """Fit while passing a label mapping, then predict on the text."""
        model = CharacterLevelCnnModel(self.label_mapping)

        # A single batch of one text sample and an all-zero label array; the
        # same object serves as both training and validation data.
        sample_text = np.array([['test']])
        last_axis = max(self.label_mapping.values()) + 1
        sample_labels = np.zeros((1, 3400, last_axis))
        train_batches = [[sample_text, sample_labels]]
        validation_batches = train_batches

        model._construct_model()

        # Hand the label mapping to fit() itself.
        _history, _f1, _f1_report = model.fit(
            train_batches,
            validation_batches,
            label_mapping=self.label_mapping,
        )

        # Predict using only the text portion of the batch.
        model.predict(train_batches[0][0])
示例#15
0
    def test_validation(self):
        """Run the validation step on a single all-zero labelled sample."""
        model = CharacterLevelCnnModel(label_mapping=self.label_mapping)
        model._construct_model()

        # One batch: a text sample and an all-zero label array.
        sample_text = np.array([['test']])
        last_axis = max(self.label_mapping.values()) + 1
        sample_labels = np.zeros((1, 3400, last_axis))
        validation_batches = [[sample_text, sample_labels]]

        # The call only has to complete; its result is not inspected.
        model._validate_training(
            validation_batches,
            batch_size_test=32,
            verbose_log=True,
            verbose_keras=False,
        )
示例#16
0
    def test_model_construct(self):
        """Build the default model and check layer and label counts."""
        model = CharacterLevelCnnModel(label_mapping=self.label_mapping)
        model._construct_model()
        # details() only has to run without raising.
        model.details()

        # Only the number of layers is compared below, not the names.
        expected_layers = [
            'input_1',
            'lambda',
            'embedding',
            'conv1d',
            'dropout',
            'batch_normalization',
            'conv1d_1',
            'dropout_1',
            'batch_normalization_1',
            'conv1d_2',
            'dropout_2',
            'batch_normalization_2',
            'conv1d_3',
            'dropout_3',
            'batch_normalization_3',
            'dense',
            'dropout_4',
            'dense_1',
            'dropout_5',
            'dense_2',
            'tf_op_layer_ArgMax',
            'thresh_arg_max_layer',
        ]
        actual_layers = [layer.name for layer in model._model.layers]
        self.assertEqual(len(expected_layers), len(actual_layers))
        self.assertEqual(17, model.num_labels)
示例#17
0
    def test_label_mapping(self, *mocks):
        """Check that the model exposes the mapping it was built with."""
        # Model built from the fixture mapping with default parameters.
        model = CharacterLevelCnnModel(self.label_mapping)

        expected_mapping = self.label_mapping
        self.assertDictEqual(expected_mapping, model.label_mapping)
示例#18
0
 def test_load(self, *mocks):
     """Load the bundled unstructured model and check the returned type."""
     # Named ``model_dir`` so the builtin ``dir`` is not shadowed.
     model_dir = os.path.join(_resource_labeler_dir, 'unstructured_model/')
     loaded_model = CharacterLevelCnnModel.load_from_disk(model_dir)
     self.assertIsInstance(loaded_model, CharacterLevelCnnModel)
示例#19
0
    def test_set_label_mapping(self, *mocks):
        """Exercise ``set_label_mapping`` with invalid and valid mappings."""
        # Model built from the fixture mapping with default parameters.
        model = CharacterLevelCnnModel(self.label_mapping)

        # A non-dict, non-list value must raise a TypeError.
        with self.assertRaisesRegex(
                TypeError, "Labels must either be a non-empty encoding dict "
                "which maps labels to index encodings or a list."):
            model.set_label_mapping(None)

        # Index zero given to a non-PAD label, then PAD given a non-zero
        # index: both must raise the same ValueError.
        for bad_mapping in ({'TEST': 0}, {'PAD': 1}):
            with self.assertRaisesRegex(ValueError,
                                        "`PAD` must map to index zero."):
                model.set_label_mapping(bad_mapping)

        # A mapping without the default label must be rejected.
        pad_only = {'PAD': 0}
        with self.assertRaisesRegex(
                ValueError, "The `default_label` of BACKGROUND must "
                "exist in the label mapping."):
            model.set_label_mapping(pad_only)

        # A mapping without PAD is accepted; the model's own mapping then
        # differs from the input until PAD: 0 is added to the input.
        without_pad = {
            'CITY': 1,  # shares its index with BACKGROUND
            'BACKGROUND': 1,
            'ADDRESS': 2,
        }
        model.set_label_mapping(without_pad)

        self.assertNotEqual(without_pad, model.label_mapping)
        without_pad['PAD'] = 0
        self.assertDictEqual(without_pad, model.label_mapping)

        # A mapping that already has PAD: 0 is stored as given.
        with_pad = {
            'PAD': 0,
            'CITY': 1,  # shares its index with BACKGROUND
            'BACKGROUND': 1,
            'ADDRESS': 2,
        }
        model.set_label_mapping(with_pad)
        self.assertDictEqual(with_pad, model.label_mapping)
示例#20
0
    def test_fit_and_predict_with_new_labels_set_via_method(self):
        """Swap label mappings through the setter, then fit and predict."""
        # Start from a three-label mapping and build the network for it.
        first_mapping = {"PAD": 0, "BACKGROUND": 1, "test3": 2}
        model = CharacterLevelCnnModel(first_mapping)
        model._construct_model()

        # Drop to two labels and rebuild, then switch to the fixture mapping.
        second_mapping = {"PAD": 0, "BACKGROUND": 1}
        model.set_label_mapping(second_mapping)
        model._reconstruct_model()
        model.set_label_mapping(self.label_mapping)

        # A single batch of one text sample and an all-zero label array; the
        # same object serves as both training and validation data.
        sample_text = np.array([['test']])
        last_axis = max(self.label_mapping.values()) + 1
        sample_labels = np.zeros((1, 3400, last_axis))
        train_batches = [[sample_text, sample_labels]]
        validation_batches = train_batches

        model._construct_model()

        # Apply the fixture mapping once more right before fitting.
        model.set_label_mapping(self.label_mapping)
        _history, _f1, _f1_report = model.fit(train_batches,
                                              validation_batches)

        # Predict using only the text portion of the batch.
        model.predict(train_batches[0][0])
示例#21
0
 def test_help(self, mock_stdout):
     """Check that ``help()`` prints the class name and a parameters section."""
     CharacterLevelCnnModel.help()
     # Both expected fragments must appear in the captured output.
     for fragment in ("CharacterLevelCnnModel", "Parameters"):
         self.assertIn(fragment, mock_stdout.getvalue())