def _testCreateVocabLayer(self, vocab_hub_url):
    """Checks that a vocab layer produces the same outputs after a save/load round trip.

    A layer is built from ``self.vocab_layer_param`` and ``vocab_hub_url``, applied to
    ``self.inputs``, exported with ``tf.saved_model.save`` to ``self.vocab_layer_dir`` and
    re-created from that directory. Every output of the original layer must equal the
    output stored under the same key by the reloaded layer.

    :param vocab_hub_url Value passed to create_vocab_layer() as the location of a saved vocab layer
    """
    layer = vocab_layer.create_vocab_layer(self.vocab_layer_param, vocab_hub_url)
    outputs = layer(self.inputs)

    try:
        tf.saved_model.save(layer, self.vocab_layer_dir)

        # No layer param is given here, so the layer has to come from the exported directory.
        loaded_layer = vocab_layer.create_vocab_layer(None, self.vocab_layer_dir)
        loaded_layer_outputs = loaded_layer(self.inputs)

        for k, v in outputs.items():
            self.assertAllEqual(v, loaded_layer_outputs[k])
    finally:
        # Always remove the export, even when saving, loading or an assertion fails, so a
        # stale SavedModel cannot leak into later runs. ignore_errors keeps a missing
        # directory (e.g. save failed early) from masking the original failure.
        shutil.rmtree(self.vocab_layer_dir, ignore_errors=True)
def testVocabLookup(self):
    """Tests vocab_lookup()"""
    layer_param = copy.copy(self.vocab_layer_param)
    layer = vocab_layer.create_vocab_layer(layer_param, '')

    # vocab_lookup() is fed a sparse string tensor and its result is densified for comparison.
    sparse_tokens = tf.sparse.from_dense([["hello", "build"]])
    sparse_ids = layer.vocab_lookup(sparse_tokens)

    dense_ids = tf.sparse.to_dense(sparse_ids)
    expected_ids = tf.convert_to_tensor([[0, 4]])
    self.assertAllEqual(dense_ids, expected_ids)
def testLength(self):
    """Tests length()"""
    layer_param = copy.copy(self.vocab_layer_param)

    # Work on a copy so the shared class-level inputs are left untouched.
    inputs = copy.copy(self.inputs)
    inputs['min_len'] = 1
    inputs['max_len'] = 16

    # (num_cls, num_sep, expected LENGTH output for the two input sentences)
    scenarios = (
        (0, 0, [2, 5]),
        (1, 1, [4, 7]),
    )
    for cls_count, sep_count, expected_lengths in scenarios:
        inputs['num_cls'] = cls_count
        inputs['num_sep'] = sep_count
        # A fresh layer is created for every scenario.
        layer = vocab_layer.create_vocab_layer(layer_param, '')
        result = layer(inputs)
        self.assertAllEqual(result[InternalFtrType.LENGTH], tf.constant(expected_lengths))
def __init__(self, vocab_layer_param, vocab_hub_url, we_file, we_trainable, num_units, name_prefix='w'):
    """ Builds the vocab layer and the word embedding owned by this embedding layer

    :param vocab_layer_param Parameters used to construct a new vocab layer when vocab_hub_url is empty/None
    :param vocab_hub_url Url to a saved vocab layer. Nothing is loaded when it is an empty string or None
    :param we_file Path to pretrained word embedding
    :param we_trainable Whether word embedding is trainable
    :param num_units Dimension of embedding
    :param name_prefix Prefix of embedding variables
    """
    super().__init__()

    self.vocab_layer = create_vocab_layer(
        vocab_layer_param,
        vocab_hub_url=vocab_hub_url,
    )
    self._num_units = num_units

    # Vocabulary size and separator id are read from the vocab layer just created.
    vocab = self.vocab_layer
    self._vocab_size = vocab.vocab_size()
    self._sep_id = vocab.sep_id()

    self.embedding = init_word_embedding(
        self._vocab_size,
        num_units,
        we_trainable,
        we_file,
        name_prefix,
    )
def testConvertIdsToTexts(self):
    """Tests convert_ids_to_texts()"""
    layer_param = copy.copy(self.vocab_layer_param)
    layer = vocab_layer.create_vocab_layer(layer_param, '')
    actual_outputs = layer(self.inputs)

    # First check the forward pass: every output of the layer must match its expectation.
    token_ids = tf.constant([[1, 0, 0, 2, 3, 3, 3],
                             [1, 4, 4, 4, 4, 0, 2]])
    expected_by_key = {
        InternalFtrType.LENGTH: tf.constant([4, 7]),
        InternalFtrType.TOKENIZED_IDS: token_ids,
    }
    for key, actual in actual_outputs.items():
        self.assertAllEqual(actual, expected_by_key[key])

    # Then map the same ids back to text, one space-joined byte string per row.
    expected_texts = [
        b'[CLS] [UNK] [UNK] [SEP] [PAD] [PAD] [PAD]',
        b'[CLS] build build build build [UNK] [SEP]',
    ]
    recovered_texts = layer.convert_ids_to_texts(token_ids)
    self.assertAllEqual(recovered_texts, expected_texts)
class TestPrefixSearch(TestCase):
    """Unit test for prefix_search.py"""
    min_len = 0
    max_len = 3
    num_cls = 1
    num_sep = 0

    # One searcher is built at class-definition time and shared by all tests.
    searcher = prefix_search.PrefixSearcher(
        vocab_layer.create_vocab_layer(TestCase.vocab_layer_param, ''),
        min_len, max_len, num_cls, num_sep)

    def testPrefixSearch(self):
        """Runs the searcher on several prefixes and checks existence, completion mask and length"""
        mask_size = 16  # width of every expected mask; presumably the test vocabulary size

        def mask_with(*true_positions):
            # Boolean list of mask_size entries that is True only at the given positions.
            return [index in true_positions for index in range(mask_size)]

        prefixes = [tf.constant(text) for text in ('b', 'build s', 'h', '', 'b ', ' ')]
        expect_exists = [True, True, False, False, True, True]
        expect_masks = [
            mask_with(4),
            mask_with(12, 13),
            mask_with(),
            mask_with(),
            [True] * mask_size,
            [True] * mask_size,
        ]
        expect_lengths = [1, 2, 1, 0, 2, 1]

        columns = (prefixes, expect_exists, expect_masks, expect_lengths)
        assert len({len(column) for column in columns}) == 1, 'Test input list must have the same size'

        for case in zip(*columns):
            self._testPrefixSearch(*case)

    def _testPrefixSearch(self, prefix, exist_prefix, vocab_mask, length):
        """Checks the three searcher outputs for a single prefix"""
        result = self.searcher(prefix)
        expectations = (
            (InternalFtrType.EXIST_PREFIX, exist_prefix),
            (InternalFtrType.COMPLETION_VOCAB_MASK, vocab_mask),
            (InternalFtrType.LENGTH, length),
        )
        for key, expected in expectations:
            self.assertAllEqual(result[key], expected)

    def testKeyValueArrayDict(self):
        """Checks KeyValueArrayDict lookups for int and string keys, including a missing key"""

        def expected_row(first, fill, last, dtype):
            # Expected dense lookup result: `first`, four fill (default) values, then `last`.
            return tf.convert_to_tensor([first] + [fill] * 4 + [last], dtype=dtype)

        fills = [-1, -1, ""]
        all_keys = [[1, 2, 3], [1, 2, 3], ['10', '2', '3']]
        all_values = [
            [[2, 3, 4], [5, 0], [1]],
            [[2, 3, 4], [5, 0], [1]],
            [['2', '3', '4'], ['5', '0'], ['1']],
        ]
        queries = [2, -1, '2']
        expected_rows = [
            expected_row(0, fills[0], 5, 'int32'),
            expected_row(0, fills[1], 5, 'int32'),
            expected_row('0', fills[2], '5', 'string'),
        ]
        query_dtypes = ['int32', 'int32', 'string']
        # The second query (-1) is not among the keys, so it is expected to be missing.
        key_found = [tf.convert_to_tensor(flag) for flag in (True, False, True)]

        cases = zip(all_keys, all_values, queries, expected_rows, fills, query_dtypes, key_found)
        for keys, values, query, expected, fill, dtype, found in cases:
            self._testKeyValueArrayDict(keys, values, query, expected, fill, dtype, found)

    def _testKeyValueArrayDict(self, keys, values, test_key, expected_value, default_value, key_type,
                               exist_prefix):
        """Looks up one key and checks the existence flag and, when found, the dense values"""
        table = prefix_search.KeyValueArrayDict(keys, values)
        query = tf.convert_to_tensor(test_key, dtype=key_type)
        outputs = table.lookup(query)

        self.assertAllEqual(outputs[InternalFtrType.EXIST_KEY], exist_prefix)

        # Values are only compared for keys that are expected to exist.
        if not exist_prefix.numpy():
            return
        dense_values = tf.sparse.to_dense(outputs[InternalFtrType.COMPLETION_INDICES],
                                          default_value=default_value)
        self.assertAllEqual(dense_values, expected_value)
class TestVocabLayer(TestCase):
    """Tests vocab_layer.py"""
    num_cls = 1
    num_sep = 1
    sentences = tf.constant(['hello sent1', 'build build build build sent2'])

    # Default layer inputs shared by the tests; tests that change a field work on a copy.
    inputs = get_sorted_dict({
        'sentences': sentences,
        'num_cls': tf.constant(num_cls, dtype=tf.dtypes.int32),
        'num_sep': tf.constant(num_sep, dtype=tf.dtypes.int32),
        'min_len': tf.constant(DataSetup.min_len, dtype=tf.dtypes.int32),
        'max_len': tf.constant(DataSetup.max_len, dtype=tf.dtypes.int32)
    })

    # Layer built once at class-definition time from the shared vocab layer param.
    layer = vocab_layer.create_vocab_layer(TestCase.vocab_layer_param, '')

    def testAddClsSep(self):
        """Tests add_cls_sep() """
        inputs = copy.copy(self.inputs)
        inputs['min_len'] = 6
        inputs['max_len'] = 7

        num_cls_lst = [1, 0, 1]
        num_sep_lst = [2, 2, 0]
        expected_output_lst = [
            tf.constant([
                self.CLS_ID, self.UNK_ID, self.UNK_ID, self.SEP_ID, self.SEP_ID, self.PAD_ID, self.PAD_ID
            ]),
            tf.constant([
                self.UNK_ID, self.UNK_ID, self.SEP_ID, self.SEP_ID, self.PAD_ID, self.PAD_ID, self.PAD_ID
            ]),
            tf.constant([
                self.CLS_ID, self.UNK_ID, self.UNK_ID, self.PAD_ID, self.PAD_ID, self.PAD_ID
            ]),
        ]

        for num_cls, num_sep, expected_output in zip(num_cls_lst, num_sep_lst, expected_output_lst):
            inputs['num_cls'] = num_cls
            inputs['num_sep'] = num_sep
            self._testAddClsSep(inputs, self.layer, expected_output)

    def _testAddClsSep(self, inputs, layer, expected_output):
        """Checks the tokenized ids of the first sentence produced by `layer` for `inputs`"""
        outputs = layer(inputs)
        # Only row 0 of the batch is compared.
        self.assertAllEqual(outputs[InternalFtrType.TOKENIZED_IDS][0], expected_output)

    def testAdjustLen(self):
        """Tests adjust_len() """
        inputs = copy.copy(self.inputs)

        # (min_len, max_len, expected shape of the tokenized ids)
        for min_len, max_len, expected_shape in ((12, 16, [2, 12]), (0, 1, [2, 1])):
            inputs['min_len'] = min_len
            inputs['max_len'] = max_len
            outputs = self.layer(inputs)
            shape = tf.shape(outputs[InternalFtrType.TOKENIZED_IDS])
            self.assertAllEqual(shape, tf.constant(expected_shape))

    def testAdjustLenRight(self):
        """Tests adjust_len_right() """
        # (max_len, expected output) for the same [[2, 1]] input with min_len 1
        for max_len_value, expected in ((1, [[1]]), (3, [[2, 1]])):
            inputs = tf.convert_to_tensor([[2, 1]], dtype=tf.int32)
            min_len = tf.convert_to_tensor(1, dtype=tf.int32)
            max_len = tf.convert_to_tensor(max_len_value, dtype=tf.int32)
            outputs = self.layer.adjust_len_right(inputs, min_len, max_len)
            self.assertAllEqual(outputs, tf.convert_to_tensor(expected))

    def testKeys(self):
        """Tests keys()"""
        self.assertAllEqual(self.layer.keys(), [
            b'[UNK]', b'[CLS]', b'[SEP]', b'[PAD]', b'build', b'word', b'function', b'able',
            b'test', b'this', b'is', b'a', b'source', b'sentence', b'target', b'token'
        ])

    def testValues(self):
        """Tests values()"""
        # Values are expected to be consecutive ids starting at 0, one per key.
        self.assertAllEqual(self.layer.values(), list(range(len(self.layer.keys()))))

    def testLength(self):
        """Tests length()"""
        vocab_layer_param = copy.copy(self.vocab_layer_param)
        inputs = copy.copy(self.inputs)
        inputs['min_len'] = 1
        inputs['max_len'] = 16

        # (num_cls, num_sep, expected LENGTH output for the two sentences)
        for num_cls, num_sep, expected_length in ((0, 0, [2, 5]), (1, 1, [4, 7])):
            inputs['num_cls'] = num_cls
            inputs['num_sep'] = num_sep
            layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
            outputs = layer(inputs)
            self.assertAllEqual(outputs[InternalFtrType.LENGTH], tf.constant(expected_length))

    def testVocabLayerApi(self):
        """Checks whether a given layer conforms to the smart compose vocab layer API"""
        layer: vocab_layer.VocabLayerBase = hub.load(self.vocab_hub_url)
        self.assertEqual(layer.vocab_size(), self.vocab_size)
        self.assertEqual(layer.pad_id(), self.PAD_ID)
        self.assertEqual(layer.sep_id(), self.SEP_ID)

    def testVocabLookup(self):
        """Tests vocab_lookup()"""
        vocab_layer_param = copy.copy(self.vocab_layer_param)
        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        outputs = layer.vocab_lookup(tf.sparse.from_dense([["hello", "build"]]))
        self.assertAllEqual(tf.sparse.to_dense(outputs), tf.convert_to_tensor([[0, 4]]))

    def testConvertIdsToTexts(self):
        """Tests convert_ids_to_texts()"""
        vocab_layer_param = copy.copy(self.vocab_layer_param)
        layer = vocab_layer.create_vocab_layer(vocab_layer_param, '')
        outputs = layer(self.inputs)

        expected_tokenized_result = tf.constant([[1, 0, 0, 2, 3, 3, 3], [1, 4, 4, 4, 4, 0, 2]])
        expected_outputs = {
            InternalFtrType.LENGTH: tf.constant([4, 7]),
            InternalFtrType.TOKENIZED_IDS: expected_tokenized_result
        }
        for k, v in outputs.items():
            self.assertAllEqual(v, expected_outputs[k])

        expected_inverse_vocab_lookup_results = [
            b'[CLS] [UNK] [UNK] [SEP] [PAD] [PAD] [PAD]',
            b'[CLS] build build build build [UNK] [SEP]'
        ]
        self.assertAllEqual(
            layer.convert_ids_to_texts(expected_tokenized_result),
            expected_inverse_vocab_lookup_results)

    def testCreateVocabLayer(self):
        """Tests create_vocab_layer() """
        # Exercise both construction paths: no hub url, and the test hub url.
        for vocab_hub_url in ['', self.vocab_hub_url]:
            self._testCreateVocabLayer(vocab_hub_url)

    def _testCreateVocabLayer(self, vocab_hub_url):
        """Checks that a vocab layer produces the same outputs after a save/load round trip

        :param vocab_hub_url Value passed to create_vocab_layer() as the location of a saved vocab layer
        """
        layer = vocab_layer.create_vocab_layer(self.vocab_layer_param, vocab_hub_url)
        outputs = layer(self.inputs)

        try:
            tf.saved_model.save(layer, self.vocab_layer_dir)

            # No layer param is given here, so the layer has to come from the exported directory.
            loaded_layer = vocab_layer.create_vocab_layer(None, self.vocab_layer_dir)
            loaded_layer_outputs = loaded_layer(self.inputs)

            for k, v in outputs.items():
                self.assertAllEqual(v, loaded_layer_outputs[k])
        finally:
            # Always remove the export, even when saving, loading or an assertion fails, so a
            # stale SavedModel cannot leak into the next iteration or test run. ignore_errors
            # keeps a missing directory from masking the original failure.
            shutil.rmtree(self.vocab_layer_dir, ignore_errors=True)