def test_local_textset_integration(self):
    local_set = LocalTextSet(self.texts, self.labels)
    assert local_set.is_local()
    assert not local_set.is_distributed()
    assert local_set.get_texts() == self.texts
    assert local_set.get_labels() == self.labels
    tokenized = ChainedPreprocessing([Tokenizer(), Normalizer()])(local_set)
    word_index = tokenized.generate_word_index_map(max_words_num=10)
    transformed = ChainedPreprocessing(
        [WordIndexer(word_index), SequenceShaper(10), TextFeatureToSample()])(tokenized)
    assert transformed.is_local()
    word_index = transformed.get_word_index()
    assert len(word_index) == 10
    assert word_index["my"] == 1
    samples = transformed.get_samples()
    assert len(samples) == 3
    for sample in samples:
        assert sample.feature.shape[0] == 10

    # Saving and reloading the word index should reproduce identical samples.
    vocab_file = create_tmp_path() + ".txt"
    transformed.save_word_index(vocab_file)
    local_set2 = LocalTextSet(self.texts, self.labels)
    local_set2.load_word_index(vocab_file)
    transformed2 = local_set2.tokenize().normalize().word2idx()\
        .shape_sequence(10).generate_sample()
    samples2 = transformed2.get_samples()
    for s1, s2 in zip(samples, samples2):
        assert np.allclose(s1.feature.to_ndarray(), s2.feature.to_ndarray())
    os.remove(vocab_file)

    model = TextClassifier(5, self.glove_path, word_index, 10)
    model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
    tmp_log_dir = create_tmp_path()
    tmp_checkpoint_path = create_tmp_path()
    os.mkdir(tmp_checkpoint_path)
    model.set_tensorboard(tmp_log_dir, "textclassification")
    model.set_checkpoint(tmp_checkpoint_path)
    model.fit(transformed, batch_size=2, nb_epoch=2, validation_data=transformed)
    acc = model.evaluate(transformed, batch_size=2)
    res_set = model.predict(transformed, batch_per_thread=2)
    predicts = res_set.get_predicts()

    # Test for loaded model predict on TextSet
    tmp_path = create_tmp_path() + ".bigdl"
    model.save_model(tmp_path, over_write=True)
    loaded_model = TextClassifier.load_model(tmp_path)
    loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
    loaded_predicts = loaded_res_set.get_predicts()
    assert len(predicts) == len(loaded_predicts)
    for i in range(0, len(predicts)):
        # (uri, prediction)
        assert not predicts[i][0]
        assert not loaded_predicts[i][0]  # uri is not recorded and thus None
        assert len(predicts[i][1]) == 1
        assert len(loaded_predicts[i][1]) == 1
        assert predicts[i][1][0].shape == (5, )
        assert np.allclose(predicts[i][1][0], loaded_predicts[i][1][0])

    shutil.rmtree(tmp_log_dir)
    shutil.rmtree(tmp_checkpoint_path)
    os.remove(tmp_path)
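
# The fixtures referenced by these tests (self.texts, self.labels,
# self.glove_path) are defined elsewhere in the class. The following is a
# minimal sketch of what they are assumed to provide: three short texts in
# which "my" is the most frequent token (so word2idx assigns it index 1),
# one float label per text, and the path to a pretrained GloVe embedding
# file. The literal values and the resource path below are illustrative
# assumptions, not the real fixture data.
def setup_method(self, method):
    self.texts = ["hello my friend, please annotate my text",
                  "hello world, this is some sentence for my test",
                  "another my text for test"]
    self.labels = [0., 1., 1.]
    self.glove_path = os.path.join(os.path.dirname(__file__),
                                   "resources", "glove.6B.50d.txt")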
def test_local_textset_integration_build_model(self):
    # Variant of the integration test above: SequenceShaper runs during
    # tokenization (before word indexing) and the model comes from the
    # _build_model helper instead of TextClassifier.
    local_set = LocalTextSet(self.texts, self.labels)
    assert local_set.is_local()
    assert not local_set.is_distributed()
    assert local_set.get_texts() == self.texts
    assert local_set.get_labels() == self.labels
    tokenized = ChainedPreprocessing(
        [Tokenizer(), Normalizer(), SequenceShaper(10)])(local_set)
    word_index = tokenized.generate_word_index_map(max_words_num=10)
    transformed = ChainedPreprocessing(
        [WordIndexer(word_index), TextFeatureToSample()])(tokenized)
    assert transformed.is_local()
    word_index = transformed.get_word_index()
    assert len(word_index) == 10
    assert word_index["my"] == 1
    samples = transformed.get_samples()
    assert len(samples) == 3
    for sample in samples:
        assert sample.feature.shape[0] == 10

    model = TestTextSet._build_model(10)
    model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
    model.fit(transformed, batch_size=2, nb_epoch=2, validation_data=transformed)
    res_set = model.predict(transformed, batch_per_thread=2)
    predicts = res_set.get_predicts()
    for predict in predicts:
        assert len(predict) == 1
        assert predict[0].shape == (5, )
    acc = model.evaluate(transformed, batch_size=2)
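
# _build_model is a helper defined elsewhere in this class. The following is
# a minimal sketch of what it is assumed to look like: a small Keras-style
# model (zoo.pipeline.api.keras) ending in a 5-class softmax, which matches
# the (5,) prediction shape asserted above. The vocabulary and embedding
# sizes below are illustrative assumptions.
@staticmethod
def _build_model(sequence_length):
    from zoo.pipeline.api.keras.layers import Dense, Embedding, Flatten
    from zoo.pipeline.api.keras.models import Sequential
    model = Sequential()
    # An input_dim of 20 covers the 10 word indices (1..10) plus the
    # padding index 0 produced by SequenceShaper.
    model.add(Embedding(20, 10, input_shape=(sequence_length,)))
    model.add(Flatten())
    model.add(Dense(5, activation="softmax"))
    return model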