def test_sequence_manual_steps():
    """Exercise TextToIntSequence step by step: fit, serialize, transform.

    Renamed from ``test_sequence``: a second ``test_sequence`` defined later
    in this module shadowed this definition, so pytest never collected it.
    """
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.',
    ]
    tokenize = preprocessor.TextToIntSequence()
    dataset = tf.data.Dataset.from_tensor_slices(texts)
    tokenize.set_hp(kerastuner.HyperParameters())

    # Fit the preprocessor on every element, then finalize its state.
    for x in dataset:
        tokenize.update(x)
    tokenize.finalize()

    # Round-trip config and weights to verify (de)serialization.
    tokenize.set_config(tokenize.get_config())
    weights = tokenize.get_weights()
    tokenize.clear_weights()
    tokenize.set_weights(weights)

    # Eager transform of each element.
    for a in dataset:
        tokenize.transform(a)

    def map_func(x):
        # py_function wraps the eager transform so it can run inside
        # the tf.data graph; output dtype must be declared explicitly.
        return tf.py_function(tokenize.transform, inp=[x], Tout=(tf.int64,))

    new_dataset = dataset.map(map_func)
    # Iterate once to force execution of the mapped pipeline.
    for _ in new_dataset:
        pass
    assert isinstance(new_dataset, tf.data.Dataset)
def test_sequence():
    """TextToIntSequence driven through the shared run_preprocessor helper."""
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.',
    ]
    source = tf.data.Dataset.from_tensor_slices(texts)
    result = run_preprocessor(
        preprocessor.TextToIntSequence(),
        source,
        common.generate_data(dtype='dataset'),
        tf.int64)
    assert isinstance(result, tf.data.Dataset)
def build(self, hp, inputs=None):
    """Build a text-processing subgraph on top of a TextNode input.

    Chooses (or uses the preset) vectorizer and wires the corresponding
    preprocessor + block chain onto the input node.

    Args:
        hp: kerastuner HyperParameters used to choose the vectorizer when
            ``self.vectorizer`` is not set.
        inputs: A single node.TextNode (or a nest-able container holding one).

    Returns:
        The output node of the assembled chain.

    Raises:
        ValueError: If the input node is not a node.TextNode.
    """
    input_node = nest.flatten(inputs)[0]
    # Validate BEFORE hp.Choice: registering a hyperparameter is a side
    # effect, and an invalid input should not pollute the search space.
    if not isinstance(input_node, node.TextNode):
        raise ValueError('The input_node should be a TextNode.')
    vectorizer = self.vectorizer or hp.Choice(
        'vectorizer', ['sequence', 'ngram'], default='sequence')
    output_node = input_node
    if vectorizer == 'ngram':
        output_node = preprocessor.TextToNgramVector()(output_node)
        output_node = block.DenseBlock()(output_node)
    else:
        output_node = preprocessor.TextToIntSequence()(output_node)
        output_node = block.EmbeddingBlock(
            pretraining=self.pretraining)(output_node)
        output_node = block.ConvBlock(separable=True)(output_node)
    return output_node