Example #1
    def _testCreateVocabLayer(self, vocab_hub_url):
        """Saves a vocab layer, reloads it, and checks that outputs match"""
        layer = vocab_layer.create_vocab_layer(self.vocab_layer_param,
                                               vocab_hub_url)
        outputs = layer(self.inputs)
        tf.saved_model.save(layer, self.vocab_layer_dir)

        loaded_layer = vocab_layer.create_vocab_layer(None,
                                                      self.vocab_layer_dir)
        loaded_layer_outputs = loaded_layer(self.inputs)

        for k, v in outputs.items():
            self.assertAllEqual(v, loaded_layer_outputs[k])

        shutil.rmtree(self.vocab_layer_dir)
Example #2
 def testVocabLookup(self):
     """Tests vocab_lookup()"""
     vocab_layer_param = copy.copy(self.vocab_layer_param)
     layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
     outputs = layer.vocab_lookup(tf.sparse.from_dense([["hello",
                                                         "build"]]))
     self.assertAllEqual(tf.sparse.to_dense(outputs),
                         tf.convert_to_tensor([[0, 4]]))
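
Here 'hello' is out of vocabulary and maps to [UNK] (id 0), while 'build' maps to id 4, per the vocabulary order shown in testKeys() in Example #7.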
Example #3
    def testLength(self):
        """Tests length()"""
        vocab_layer_param = copy.copy(self.vocab_layer_param)
        inputs = copy.copy(self.inputs)
        inputs['min_len'] = 1
        inputs['max_len'] = 16
        inputs['num_cls'] = 0
        inputs['num_sep'] = 0

        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        outputs = layer(inputs)
        self.assertAllEqual(outputs[InternalFtrType.LENGTH],
                            tf.constant([2, 5]))

        inputs['num_cls'] = 1
        inputs['num_sep'] = 1
        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        outputs = layer(inputs)
        self.assertAllEqual(outputs[InternalFtrType.LENGTH],
                            tf.constant([4, 7]))
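
The expected lengths follow from the fixture sentences in Example #7: 'hello sent1' tokenizes to 2 tokens and 'build build build build sent2' to 5; adding one [CLS] and one [SEP] raises them to 4 and 7.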
Example #4
    def __init__(self, vocab_layer_param, vocab_hub_url, we_file, we_trainable,
                 num_units, name_prefix='w'):
        """ Initializes the embedding layer

        :param vocab_layer_param Parameters related to vocabulary layer initialization. If vocab_hub_url is empty/None, a new vocab layer will be constructed
          using this param
        :param vocab_hub_url Url to saved vocabulary layer. If empty string or None, no vocab layer will be loaded
        :param we_file Path to pretrained word embedding
        :param we_trainable Whether word embedding is trainable
        :param num_units Dimension of embedding
        :param name_prefix Prefix of embedding variables
        """
        super().__init__()
        self.vocab_layer = create_vocab_layer(vocab_layer_param, vocab_hub_url=vocab_hub_url)
        self._num_units = num_units
        self._vocab_size = self.vocab_layer.vocab_size()
        self._sep_id = self.vocab_layer.sep_id()

        self.embedding = init_word_embedding(self._vocab_size, num_units, we_trainable, we_file, name_prefix)
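
A minimal construction sketch for this layer. The class name WordEmbeddingLayer and every argument value below are illustrative assumptions; only the keyword names and their meanings come from the signature and docstring above.

layer = WordEmbeddingLayer(
    vocab_layer_param=vocab_layer_param,  # hypothetical param object from the surrounding setup
    vocab_hub_url='',                     # empty: build a new vocab layer from the param
    we_file='/path/to/pretrained_embeddings.txt',  # hypothetical embedding file path
    we_trainable=False,                   # keep the pretrained embedding frozen
    num_units=128,                        # embedding dimension
    name_prefix='w')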
Example #5
    def testConvertIdsToTexts(self):
        """Tests convert_ids_to_texts()"""
        vocab_layer_param = copy.copy(self.vocab_layer_param)
        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        inputs = self.inputs
        outputs = layer(inputs)
        expected_tokenized_result = tf.constant([[1, 0, 0, 2, 3, 3, 3],
                                                 [1, 4, 4, 4, 4, 0, 2]])
        expected_outputs = {
            InternalFtrType.LENGTH: tf.constant([4, 7]),
            InternalFtrType.TOKENIZED_IDS: expected_tokenized_result
        }

        for k, v in outputs.items():
            self.assertAllEqual(v, expected_outputs[k])

        expected_inverse_vocab_lookup_results = [
            b'[CLS] [UNK] [UNK] [SEP] [PAD] [PAD] [PAD]',
            b'[CLS] build build build build [UNK] [SEP]'
        ]
        self.assertAllEqual(
            layer.convert_ids_to_texts(expected_tokenized_result),
            expected_inverse_vocab_lookup_results)
Example #6
class TestPrefixSearch(TestCase):
    """ Unit test for prefix_search.py """
    min_len = 0
    max_len = 3
    num_cls = 1
    num_sep = 0
    searcher = prefix_search.PrefixSearcher(
        vocab_layer.create_vocab_layer(TestCase.vocab_layer_param, ''),
        min_len, max_len, num_cls, num_sep)

    def testPrefixSearch(self):
        """Tests PrefixSearcher on a list of prefixes"""
        prefix_list = [
            tf.constant('b'),
            tf.constant('build s'),
            tf.constant('h'),
            tf.constant(''),
            tf.constant('b '),
            tf.constant(' ')
        ]
        exist_prefix_list = [True, True, False, False, True, True]
        vocab_mask_list = [[False] * 4 + [True] + [False] * 11,
                           [False] * 12 + [True, True] + [False] * 2,
                           [False] * 16, [False] * 16, [True] * 16,
                           [True] * 16]
        length_list = [1, 2, 1, 0, 2, 1]

        assert (len(prefix_list) == len(exist_prefix_list) ==
                len(vocab_mask_list) == len(length_list)), \
            'Test input lists must have the same size'
        for prefix, exist_prefix, vocab_mask, length in zip(
                prefix_list, exist_prefix_list, vocab_mask_list, length_list):
            self._testPrefixSearch(prefix, exist_prefix, vocab_mask, length)

    def _testPrefixSearch(self, prefix, exist_prefix, vocab_mask, length):
        outputs = self.searcher(prefix)

        self.assertAllEqual(outputs[InternalFtrType.EXIST_PREFIX],
                            exist_prefix)
        self.assertAllEqual(outputs[InternalFtrType.COMPLETION_VOCAB_MASK],
                            vocab_mask)
        self.assertAllEqual(outputs[InternalFtrType.LENGTH], length)

    def testKeyValueArrayDict(self):
        """Tests KeyValueArrayDict lookup"""
        keys_list = [[1, 2, 3], [1, 2, 3], ['10', '2', '3']]
        values_list = [[[2, 3, 4], [5, 0], [1]], [[2, 3, 4], [5, 0], [1]],
                       [['2', '3', '4'], ['5', '0'], ['1']]]
        test_key_list = [2, -1, '2']
        default_values = [-1, -1, ""]
        expected_value_list = [
            tf.convert_to_tensor(
                [0, default_values[0], default_values[0], default_values[0],
                 default_values[0], 5], dtype='int32'),
            tf.convert_to_tensor(
                [0, default_values[1], default_values[1], default_values[1],
                 default_values[1], 5], dtype='int32'),
            tf.convert_to_tensor(
                ['0', default_values[2], default_values[2], default_values[2],
                 default_values[2], '5'], dtype='string')
        ]
        key_type_list = ['int32', 'int32', 'string']
        exist_prefix_list = [
            tf.convert_to_tensor(True),
            tf.convert_to_tensor(False),
            tf.convert_to_tensor(True)
        ]
        for keys, values, test_key, expected_value, default_value, key_type, exist_prefix in zip(
                keys_list, values_list, test_key_list, expected_value_list,
                default_values, key_type_list, exist_prefix_list):
            self._testKeyValueArrayDict(keys, values, test_key, expected_value,
                                        default_value, key_type, exist_prefix)

    def _testKeyValueArrayDict(self, keys, values, test_key, expected_value,
                               default_value, key_type, exist_prefix):
        table = prefix_search.KeyValueArrayDict(keys, values)
        outputs = table.lookup(tf.convert_to_tensor(test_key, dtype=key_type))
        self.assertAllEqual(outputs[InternalFtrType.EXIST_KEY], exist_prefix)
        if exist_prefix.numpy():
            self.assertAllEqual(
                tf.sparse.to_dense(outputs[InternalFtrType.COMPLETION_INDICES],
                                   default_value=default_value),
                expected_value)
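
A usage sketch of PrefixSearcher, mirroring the class-level setup above. The expected outputs for prefix 'b' are read off the first test case; mapping index 4 to 'build' is inferred from testKeys() in Example #7.

searcher = prefix_search.PrefixSearcher(
    vocab_layer.create_vocab_layer(TestCase.vocab_layer_param, ''),
    0, 3, 1, 0)  # min_len, max_len, num_cls, num_sep, as in the class setup
outputs = searcher(tf.constant('b'))
# outputs[InternalFtrType.EXIST_PREFIX]          -> True
# outputs[InternalFtrType.COMPLETION_VOCAB_MASK] -> True only at index 4 ('build')
# outputs[InternalFtrType.LENGTH]                -> 1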
Example #7
class TestVocabLayer(TestCase):
    """Tests vocab_layer.py """
    num_cls = 1
    num_sep = 1
    sentences = tf.constant(['hello sent1', 'build build build build sent2'])
    inputs = get_sorted_dict({
        'sentences': sentences,
        'num_cls': tf.constant(num_cls, dtype=tf.dtypes.int32),
        'num_sep': tf.constant(num_sep, dtype=tf.dtypes.int32),
        'min_len': tf.constant(DataSetup.min_len, dtype=tf.dtypes.int32),
        'max_len': tf.constant(DataSetup.max_len, dtype=tf.dtypes.int32)
    })
    layer = vocab_layer.create_vocab_layer(TestCase.vocab_layer_param, '')

    def testAddClsSep(self):
        """Tests add_cls_sep() """
        inputs = copy.copy(self.inputs)
        inputs['min_len'] = 6
        inputs['max_len'] = 7

        num_cls_lst = [1, 0, 1]
        num_sep_lst = [2, 2, 0]

        expected_output_lst = [
            tf.constant([
                self.CLS_ID, self.UNK_ID, self.UNK_ID, self.SEP_ID,
                self.SEP_ID, self.PAD_ID, self.PAD_ID
            ]),
            tf.constant([
                self.UNK_ID, self.UNK_ID, self.SEP_ID, self.SEP_ID,
                self.PAD_ID, self.PAD_ID, self.PAD_ID
            ]),
            tf.constant([
                self.CLS_ID, self.UNK_ID, self.UNK_ID, self.PAD_ID,
                self.PAD_ID, self.PAD_ID
            ]),
        ]

        for num_cls, num_sep, expected_output in zip(num_cls_lst, num_sep_lst,
                                                     expected_output_lst):
            inputs['num_cls'] = num_cls
            inputs['num_sep'] = num_sep
            self._testAddClsSep(inputs, self.layer, expected_output)

    def _testAddClsSep(self, inputs, layer, expected_output):
        outputs = layer(inputs)
        self.assertAllEqual(outputs[InternalFtrType.TOKENIZED_IDS][0],
                            expected_output)

    def testAdjustLen(self):
        """Tests adjust_len() """
        inputs = copy.copy(self.inputs)
        inputs['min_len'] = 12
        inputs['max_len'] = 16

        outputs = self.layer(inputs)
        shape = tf.shape(outputs[InternalFtrType.TOKENIZED_IDS])
        self.assertAllEqual(shape, tf.constant([2, 12]))

        inputs['min_len'] = 0
        inputs['max_len'] = 1
        outputs = self.layer(inputs)
        shape = tf.shape(outputs[InternalFtrType.TOKENIZED_IDS])
        self.assertAllEqual(shape, tf.constant([2, 1]))

    def testAdjustLenRight(self):
        """Tests adjust_len_right() """
        inputs = tf.convert_to_tensor([[2, 1]], dtype=tf.int32)
        min_len = tf.convert_to_tensor(1, dtype=tf.int32)
        max_len = tf.convert_to_tensor(1, dtype=tf.int32)
        outputs = self.layer.adjust_len_right(inputs, min_len, max_len)
        self.assertAllEqual(outputs, tf.convert_to_tensor([[1]]))

        inputs = tf.convert_to_tensor([[2, 1]], dtype=tf.int32)
        min_len = tf.convert_to_tensor(1, dtype=tf.int32)
        max_len = tf.convert_to_tensor(3, dtype=tf.int32)
        outputs = self.layer.adjust_len_right(inputs, min_len, max_len)
        self.assertAllEqual(outputs, tf.convert_to_tensor([[2, 1]]))

    def testKeys(self):
        """Tests keys()"""
        self.assertAllEqual(self.layer.keys(), [
            b'[UNK]', b'[CLS]', b'[SEP]', b'[PAD]', b'build', b'word',
            b'function', b'able', b'test', b'this', b'is', b'a', b'source',
            b'sentence', b'target', b'token'
        ])

    def testValues(self):
        """Tests values()"""
        self.assertAllEqual(self.layer.values(),
                            list(range(len(self.layer.keys()))))

    def testLength(self):
        """Tests length()"""
        vocab_layer_param = copy.copy(self.vocab_layer_param)
        inputs = copy.copy(self.inputs)
        inputs['min_len'] = 1
        inputs['max_len'] = 16
        inputs['num_cls'] = 0
        inputs['num_sep'] = 0

        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        outputs = layer(inputs)
        self.assertAllEqual(outputs[InternalFtrType.LENGTH],
                            tf.constant([2, 5]))

        inputs['num_cls'] = 1
        inputs['num_sep'] = 1
        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        outputs = layer(inputs)
        self.assertAllEqual(outputs[InternalFtrType.LENGTH],
                            tf.constant([4, 7]))

    def testVocabLayerApi(self):
        """Checks whether a given layer conforms to the smart compose vocab layer API"""
        layer: vocab_layer.VocabLayerBase = hub.load(self.vocab_hub_url)

        self.assertEqual(layer.vocab_size(), self.vocab_size)
        self.assertEqual(layer.pad_id(), self.PAD_ID)
        self.assertEqual(layer.sep_id(), self.SEP_ID)

    def testVocabLookup(self):
        """Tests vocab_lookup()"""
        vocab_layer_param = copy.copy(self.vocab_layer_param)
        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        outputs = layer.vocab_lookup(tf.sparse.from_dense([["hello",
                                                            "build"]]))
        self.assertAllEqual(tf.sparse.to_dense(outputs),
                            tf.convert_to_tensor([[0, 4]]))

    def testConvertIdsToTexts(self):
        """Tests convert_ids_to_texts()"""
        vocab_layer_param = copy.copy(self.vocab_layer_param)
        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        inputs = self.inputs
        outputs = layer(inputs)
        expected_tokenized_result = tf.constant([[1, 0, 0, 2, 3, 3, 3],
                                                 [1, 4, 4, 4, 4, 0, 2]])
        expected_outputs = {
            InternalFtrType.LENGTH: tf.constant([4, 7]),
            InternalFtrType.TOKENIZED_IDS: expected_tokenized_result
        }

        for k, v in outputs.items():
            self.assertAllEqual(v, expected_outputs[k])

        expected_inverse_vocab_lookup_results = [
            b'[CLS] [UNK] [UNK] [SEP] [PAD] [PAD] [PAD]',
            b'[CLS] build build build build [UNK] [SEP]'
        ]
        self.assertAllEqual(
            layer.convert_ids_to_texts(expected_tokenized_result),
            expected_inverse_vocab_lookup_results)

    def testCreateVocabLayer(self):
        """Tests create_vocab_layer() """
        for vocab_hub_url in ['', self.vocab_hub_url]:
            self._testCreateVocabLayer(vocab_hub_url)

    def _testCreateVocabLayer(self, vocab_hub_url):
        """Saves a vocab layer, reloads it, and checks that outputs match"""
        layer = vocab_layer.create_vocab_layer(self.vocab_layer_param,
                                               vocab_hub_url)
        outputs = layer(self.inputs)
        tf.saved_model.save(layer, self.vocab_layer_dir)

        loaded_layer = vocab_layer.create_vocab_layer(None,
                                                      self.vocab_layer_dir)
        loaded_layer_outputs = loaded_layer(self.inputs)

        for k, v in outputs.items():
            self.assertAllEqual(v, loaded_layer_outputs[k])

        shutil.rmtree(self.vocab_layer_dir)
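
For reference, the vocabulary fixture these tests imply. The values below are read off testKeys(), testValues(), and the special-token assertions above; the constant names are assumptions, since the actual TestCase/DataSetup definitions are not part of these examples.

VOCAB = [
    '[UNK]', '[CLS]', '[SEP]', '[PAD]', 'build', 'word', 'function', 'able',
    'test', 'this', 'is', 'a', 'source', 'sentence', 'target', 'token'
]
UNK_ID, CLS_ID, SEP_ID, PAD_ID = 0, 1, 2, 3  # ids follow vocabulary order
VOCAB_SIZE = len(VOCAB)                      # 16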