Example #1
    def test_local_textset_integration(self):
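        """End-to-end LocalTextSet pipeline: preprocess texts into samples,
        round-trip the word index through a vocabulary file, then train,
        evaluate, save, reload and predict with a GloVe-based TextClassifier.
        """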
        local_set = LocalTextSet(self.texts, self.labels)
        assert local_set.is_local()
        assert not local_set.is_distributed()
        assert local_set.get_texts() == self.texts
        assert local_set.get_labels() == self.labels
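        # Tokenize and normalize the texts, then build a word index over the
        # 10 most frequent words (the most frequent word gets index 1).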
        tokenized = ChainedPreprocessing([Tokenizer(), Normalizer()])(local_set)
        word_index = tokenized.generate_word_index_map(max_words_num=10)
        transformed = ChainedPreprocessing([WordIndexer(word_index), SequenceShaper(10),
                                            TextFeatureToSample()])(tokenized)
        assert transformed.is_local()
        word_index = transformed.get_word_index()
        assert len(word_index) == 10
        assert word_index["my"] == 1
        samples = transformed.get_samples()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 10

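        # Round-trip the vocabulary: save it, reload it into a fresh TextSet,
        # and check that the fluent API yields the same samples.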
        vocab_file = create_tmp_path() + ".txt"
        transformed.save_word_index(vocab_file)
        local_set2 = LocalTextSet(self.texts, self.labels)
        local_set2.load_word_index(vocab_file)
        transformed2 = local_set2.tokenize().normalize().word2idx()\
            .shape_sequence(10).generate_sample()
        samples2 = transformed2.get_samples()
        for s1, s2 in zip(samples, samples2):
            assert np.allclose(s1.feature.to_ndarray(), s2.feature.to_ndarray())
        os.remove(vocab_file)

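        # Train a TextClassifier on the samples, writing TensorBoard logs and
        # checkpoints to temporary directories.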
        model = TextClassifier(5, self.glove_path, word_index, 10)
        model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
        tmp_log_dir = create_tmp_path()
        tmp_checkpoint_path = create_tmp_path()
        os.mkdir(tmp_checkpoint_path)
        model.set_tensorboard(tmp_log_dir, "textclassification")
        model.set_checkpoint(tmp_checkpoint_path)
        model.fit(transformed, batch_size=2, nb_epoch=2, validation_data=transformed)
        acc = model.evaluate(transformed, batch_size=2)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts()

        # Test for loaded model predict on TextSet
        tmp_path = create_tmp_path() + ".bigdl"
        model.save_model(tmp_path, over_write=True)
        loaded_model = TextClassifier.load_model(tmp_path)
        loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
        loaded_predicts = loaded_res_set.get_predicts()
        assert len(predicts) == len(loaded_predicts)

        for i in range(len(predicts)):  # each entry is a (uri, prediction) pair
            assert not predicts[i][0]  # uri is not recorded and is thus None
            assert not loaded_predicts[i][0]
            assert len(predicts[i][1]) == 1
            assert len(loaded_predicts[i][1]) == 1
            assert predicts[i][1][0].shape == (5, )
            assert np.allclose(predicts[i][1][0], loaded_predicts[i][1][0])
        shutil.rmtree(tmp_log_dir)
        shutil.rmtree(tmp_checkpoint_path)
        os.remove(tmp_path)
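
Both examples assume fixtures from the surrounding test class, which the
snippets do not show. A minimal sketch of what that setup might look like
(the texts, labels and GloVe path below are illustrative assumptions, and
create_tmp_path is a hypothetical stand-in for the suite's own helper):

import os
import shutil
import tempfile

import numpy as np

# os, shutil and numpy are used by the test methods above. The real suite
# would also import the Analytics Zoo text APIs, e.g.:
# from zoo.feature.text import *
# from zoo.models.textclassification import TextClassifier


def create_tmp_path():
    # Hypothetical helper: hand back a fresh temporary path without creating
    # the file itself, so callers can append a suffix and write to it.
    tmp = tempfile.NamedTemporaryFile(prefix="textset_test_", delete=False)
    tmp.close()
    os.remove(tmp.name)
    return tmp.name


class TestTextSet:
    def setup_method(self, method):
        # Toy corpus in which "my" is the most frequent token, matching the
        # word_index["my"] == 1 assertion above; labels are made-up classes.
        self.texts = ["Hello my friend, please annotate my text",
                      "hello world, this is my sentence for my test",
                      "another text for my test"]
        self.labels = [0, 1, 1]
        # Path to a pretrained GloVe embedding file (not bundled here).
        self.glove_path = "/path/to/glove.6B.50d.txt"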
Example #2
    def test_local_textset_integration(self):
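        """Variant of Example #1: SequenceShaper runs before WordIndexer, and
        training uses a small custom model from _build_model instead of a
        pretrained-embedding TextClassifier.
        """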
        local_set = LocalTextSet(self.texts, self.labels)
        assert local_set.is_local()
        assert not local_set.is_distributed()
        assert local_set.get_texts() == self.texts
        assert local_set.get_labels() == self.labels
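        # Unlike Example #1, SequenceShaper(10) runs before WordIndexer here,
        # so sequences are padded/truncated to length 10 at the token level.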
        tokenized = ChainedPreprocessing([Tokenizer(), Normalizer(),
                                          SequenceShaper(10)])(local_set)
        word_index = tokenized.generate_word_index_map(max_words_num=10)
        transformed = ChainedPreprocessing([WordIndexer(word_index),
                                            TextFeatureToSample()])(tokenized)
        assert transformed.is_local()
        word_index = transformed.get_word_index()
        assert len(word_index) == 10
        assert word_index["my"] == 1
        samples = transformed.get_samples()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 10

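        # Train and evaluate a small custom model on the samples (a sketch of
        # _build_model follows this example).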
        model = TestTextSet._build_model(10)
        model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
        model.fit(transformed, batch_size=2, nb_epoch=2, validation_data=transformed)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts()
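        # Each prediction is a list holding a single array of the 5 class scores.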
        for predict in predicts:
            assert len(predict) == 1
            assert predict[0].shape == (5, )
        acc = model.evaluate(transformed, batch_size=2)
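
Example #2 calls a _build_model helper that is not shown. A plausible sketch
using the Analytics Zoo Keras-style API (the layer sizes are assumptions, not
the suite's actual values):

from zoo.pipeline.api.keras.layers import Dense, Embedding, Flatten
from zoo.pipeline.api.keras.models import Sequential


def _build_model(sequence_length):
    # Hypothetical stand-in for TestTextSet._build_model: a tiny classifier
    # over indexed word sequences. input_dim=11 covers word indices 1..10
    # plus 0 for padding; the final Dense matches the (5,) predictions
    # asserted above.
    model = Sequential()
    model.add(Embedding(11, 8, input_shape=(sequence_length,)))
    model.add(Flatten())
    model.add(Dense(5, activation="softmax"))
    return model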