def test_ngram():
    """Exercise the full TextToNgramVector lifecycle on a tiny text dataset.

    Covers hyperparameter setup, incremental update, finalize,
    config/weights round-tripping, eager transform, and use inside a
    ``tf.data`` pipeline via ``tf.py_function``.
    """
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.',
    ]
    vectorizer = preprocessor.TextToNgramVector()
    dataset = tf.data.Dataset.from_tensor_slices(texts)
    vectorizer.set_hp(kerastuner.HyperParameters())

    # Fit the vectorizer one sample at a time, then freeze it.
    for sample in dataset:
        vectorizer.update(sample)
    vectorizer.finalize()

    # Round-trip config and weights to check (de)serialization works.
    vectorizer.set_config(vectorizer.get_config())
    saved_weights = vectorizer.get_weights()
    vectorizer.clear_weights()
    vectorizer.set_weights(saved_weights)

    # Eager transform of every sample should not raise.
    for sample in dataset:
        vectorizer.transform(sample)

    def map_func(x):
        # Wrap the Python-side transform so it can run inside the graph.
        return tf.py_function(vectorizer.transform, inp=[x], Tout=(tf.float64, ))

    new_dataset = dataset.map(map_func)
    # Force evaluation of the mapped pipeline.
    for _ in new_dataset:
        pass
    assert isinstance(new_dataset, tf.data.Dataset)
def test_ngram():
    """Run TextToNgramVector through the shared preprocessor harness."""
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.',
    ]
    source = tf.data.Dataset.from_tensor_slices(texts)
    result = run_preprocessor(
        preprocessor.TextToNgramVector(),
        source,
        common.generate_data(dtype='dataset'),
        tf.float32,
    )
    assert isinstance(result, tf.data.Dataset)
def build(self, hp, inputs=None): input_node = nest.flatten(inputs)[0] output_node = input_node vectorizer = self.vectorizer or hp.Choice( 'vectorizer', ['sequence', 'ngram'], default='sequence') if not isinstance(input_node, node.TextNode): raise ValueError('The input_node should be a TextNode.') if vectorizer == 'ngram': output_node = preprocessor.TextToNgramVector()(output_node) output_node = block.DenseBlock()(output_node) else: output_node = preprocessor.TextToIntSequence()(output_node) output_node = block.EmbeddingBlock( pretraining=self.pretraining)(output_node) output_node = block.ConvBlock(separable=True)(output_node) return output_node
def test_ngram_result():
    """Check TextToNgramVector agrees with sklearn's TfidfVectorizer.

    Both are fit on the same corpus with (1, 2)-grams; vocabularies must
    match and per-term tf-idf values must agree within a small tolerance.
    """
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.',
    ]
    # Reference implementation.
    sklearn_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    sklearn_vectorizer.fit(texts)
    sklearn_vec = sklearn_vectorizer.transform([texts[0]]).toarray()[0]

    # Implementation under test, fit sample-by-sample.
    dataset = tf.data.Dataset.from_tensor_slices(texts)
    ngram_vectorizer = preprocessor_module.TextToNgramVector(ngram_range=(1, 2))
    ngram_vectorizer.build(kerastuner.HyperParameters())
    for text in dataset:
        ngram_vectorizer.update([text])
    ngram_vectorizer.finalize()

    ngram_vec = [ngram_vectorizer.transform([text]) for text in dataset]

    # Vocabularies must be identical in size and content.
    assert len(ngram_vectorizer.vocabulary) == len(sklearn_vectorizer.vocabulary_)
    for term, index in ngram_vectorizer.vocabulary.items():
        assert term in sklearn_vectorizer.vocabulary_
        # Values for the first document must match within tolerance.
        assert (sklearn_vec[sklearn_vectorizer.vocabulary_[term]]
                - ngram_vec[0][index] <= 0.001)