Example #1
    def test_local_textset_integration(self):
        local_set = LocalTextSet(self.texts, self.labels)
        assert local_set.is_local()
        assert not local_set.is_distributed()
        assert local_set.get_texts() == self.texts
        assert local_set.get_labels() == self.labels
        # Tokenize and normalize the raw texts, then build a word index of the
        # 10 most frequent words.
        tokenized = ChainedPreprocessing([Tokenizer(), Normalizer()])(local_set)
        word_index = tokenized.generate_word_index_map(max_words_num=10)
        # Map words to indices, shape every sequence to length 10 and turn each
        # TextFeature into a Sample.
        transformed = ChainedPreprocessing([WordIndexer(word_index), SequenceShaper(10),
                                            TextFeatureToSample()])(tokenized)
        assert transformed.is_local()
        word_index = transformed.get_word_index()
        assert len(word_index) == 10
        assert word_index["my"] == 1
        samples = transformed.get_samples()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 10

        # Persist the word index, rebuild an equivalent pipeline from it and
        # verify that the two pipelines produce identical samples.
        vocab_file = create_tmp_path() + ".txt"
        transformed.save_word_index(vocab_file)
        local_set2 = LocalTextSet(self.texts, self.labels)
        local_set2.load_word_index(vocab_file)
        transformed2 = local_set2.tokenize().normalize().word2idx()\
            .shape_sequence(10).generate_sample()
        samples2 = transformed2.get_samples()
        for s1, s2 in zip(samples, samples2):
            assert np.allclose(s1.feature.to_ndarray(), s2.feature.to_ndarray())
        os.remove(vocab_file)

        # Train a TextClassifier on the transformed TextSet, with TensorBoard
        # logging and checkpointing enabled.
        model = TextClassifier(5, self.glove_path, word_index, 10)
        model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
        tmp_log_dir = create_tmp_path()
        tmp_checkpoint_path = create_tmp_path()
        os.mkdir(tmp_checkpoint_path)
        model.set_tensorboard(tmp_log_dir, "textclassification")
        model.set_checkpoint(tmp_checkpoint_path)
        model.fit(transformed, batch_size=2, nb_epoch=2, validation_data=transformed)
        acc = model.evaluate(transformed, batch_size=2)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts()

        # Test for loaded model predict on TextSet
        tmp_path = create_tmp_path() + ".bigdl"
        model.save_model(tmp_path, over_write=True)
        loaded_model = TextClassifier.load_model(tmp_path)
        loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
        loaded_predicts = loaded_res_set.get_predicts()
        assert len(predicts) == len(loaded_predicts)

        for i in range(0, len(predicts)):  # (uri, prediction)
            assert not predicts[i][0]
            assert not loaded_predicts[i][0]  # uri is not recorded and thus None
            assert len(predicts[i][1]) == 1
            assert len(loaded_predicts[i][1]) == 1
            assert predicts[i][1][0].shape == (5, )
            assert np.allclose(predicts[i][1][0], loaded_predicts[i][1][0])
        shutil.rmtree(tmp_log_dir)
        shutil.rmtree(tmp_checkpoint_path)
        os.remove(tmp_path)
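
The test above relies on a setUp fixture providing self.texts, self.labels and self.glove_path, which is not shown here. A minimal, hedged sketch of what such a fixture might look like: the three sentences are chosen so that "my" is the most frequent token and there are exactly three samples, as the assertions require, and the GloVe path is an illustrative assumption.

    # Hypothetical fixture sketch -- all concrete values are assumptions.
    def setUp(self):
        import os
        resource_path = os.path.join(os.path.split(__file__)[0], "../resources")
        self.texts = ["hello my friend, please annotate my text",
                      "hello world, this is just my second sentence",
                      "a third text to make three samples"]
        self.labels = [0.0, 0.0, 1.0]
        # A small pre-trained GloVe file, e.g. glove.6B.50d.txt
        self.glove_path = os.path.join(resource_path, "glove.6B/glove.6B.50d.txt")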
Example #2
        def input_fn(mode):
            import os
            resource_path = os.path.join(os.path.split(__file__)[0], "../resources")
            if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
                image_folder = os.path.join(resource_path, "cat_dog")
                image_set = ImageSet.read(image_folder, with_label=True, sc=self.sc,
                                          one_based_label=False)
                transformer = ChainedPreprocessing([ImageResize(256, 256),
                                                    ImageRandomCrop(224, 224, True),
                                                    ImageMatToTensor(format="NHWC"),
                                                    ImageSetToSample(input_keys=["imageTensor"],
                                                                     target_keys=["label"])])
                image_set = image_set.transform(transformer)
                dataset = TFDataset.from_image_set(image_set,
                                                   image=(tf.float32, [224, 224, 3]),
                                                   label=(tf.int32, [1]),
                                                   batch_size=8)
            else:
                image_folder = os.path.join(resource_path, "cat_dog/*/*")
                image_set = ImageSet.read(image_folder, with_label=False, sc=self.sc,
                                          one_based_label=False)
                transformer = ChainedPreprocessing([ImageResize(256, 256),
                                                    ImageRandomCrop(224, 224, True),
                                                    ImageMatToTensor(format="NHWC"),
                                                    ImageSetToSample(
                                                        input_keys=["imageTensor"])])
                image_set = image_set.transform(transformer)
                dataset = TFDataset.from_image_set(image_set,
                                                   image=(tf.float32, [224, 224, 3]),
                                                   batch_per_thread=8)

            return dataset
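
input_fn above presumes that self.sc already holds a SparkContext. A hedged sketch of how that context is typically created with Analytics Zoo's init_nncontext, and of how the two branches would be exercised; the app name and variable names are placeholders.

        # Hypothetical usage sketch -- init_nncontext yields the SparkContext
        # that the test class is assumed to store as self.sc during setup.
        from zoo.common.nncontext import init_nncontext

        sc = init_nncontext("input_fn sketch")
        train_dataset = input_fn(tf.estimator.ModeKeys.TRAIN)      # labelled, batch_size=8
        predict_dataset = input_fn(tf.estimator.ModeKeys.PREDICT)  # unlabelled, batch_per_thread=8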
Example #3
    def test_local_textset_integration(self):
        local_set = LocalTextSet(self.texts, self.labels)
        assert local_set.is_local()
        assert not local_set.is_distributed()
        assert local_set.get_texts() == self.texts
        assert local_set.get_labels() == self.labels
        tokenized = ChainedPreprocessing([Tokenizer(), Normalizer(),
                                          SequenceShaper(10)])(local_set)
        word_index = tokenized.generate_word_index_map(max_words_num=10)
        transformed = ChainedPreprocessing([WordIndexer(word_index),
                                            TextFeatureToSample()])(tokenized)
        assert transformed.is_local()
        word_index = transformed.get_word_index()
        assert len(word_index) == 10
        assert word_index["my"] == 1
        samples = transformed.get_samples()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 10

        model = TestTextSet._build_model(10)
        model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
        model.fit(transformed, batch_size=2, nb_epoch=2, validation_data=transformed)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts()
        for predict in predicts:
            assert len(predict) == 1
            assert predict[0].shape == (5, )
        acc = model.evaluate(transformed, batch_size=2)
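
This variant delegates model construction to TestTextSet._build_model, which is not shown. A hedged sketch of what that helper might look like, using the Keras-style API from zoo.pipeline.api.keras; the layer sizes and choices are assumptions, constrained only by the sequence length of 10 and the (5,) prediction shape asserted above.

    # Hypothetical sketch of the missing helper -- layer sizes are assumptions.
    @staticmethod
    def _build_model(sequence_length):
        from zoo.pipeline.api.keras.layers import Dense, Embedding, Flatten
        from zoo.pipeline.api.keras.models import Sequential
        model = Sequential()
        # Embed the (at most 10 distinct) word indices into 10-dimensional vectors.
        model.add(Embedding(20, 10, input_shape=(sequence_length,)))
        model.add(Flatten())
        model.add(Dense(5, activation="softmax"))  # 5 classes -> (5,) predictions
        return model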
Example #4
    def test_training_imageset(self):
        images = []
        labels = []
        for i in range(0, 32):
            features = np.random.uniform(0, 1, (200, 200, 3))
            label = np.array([2])
            images.append(features)
            labels.append(label)
        image_set = DistributedImageSet(self.sc.parallelize(images),
                                        self.sc.parallelize(labels))

        transformer = ChainedPreprocessing([
            ImageBytesToMat(),
            ImageResize(256, 256),
            ImageCenterCrop(224, 224),
            # Normalize with the standard ImageNet per-channel means and stds.
            ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
            ImageMatToTensor(),
            ImageSetToSample(target_keys=['label'])
        ])
        data = image_set.transform(transformer)

        model = Sequential()
        model.add(Convolution2D(1, 5, 5, input_shape=(3, 224, 224)))
        model.add(Reshape((1 * 220 * 220, )))
        model.add(Dense(20, activation="softmax"))
        model.compile(optimizer="sgd",
                      loss="sparse_categorical_crossentropy",
                      metrics=["accuracy"])
        model.fit(data, batch_size=8, nb_epoch=2, validation_data=data)
        result = model.predict(data, batch_per_thread=8)
        accuracy = model.evaluate(data, batch_size=8)
Example #5
 def input_fn(mode):
     if mode == tf.estimator.ModeKeys.TRAIN:
         image_set = self.get_raw_image_set(with_label=True)
         feature_set = FeatureSet.image_frame(
             image_set.to_image_frame())
         train_transformer = ChainedPreprocessing([
             ImageBytesToMat(),
             ImageResize(256, 256),
             ImageRandomCrop(224, 224),
             ImageRandomPreprocessing(ImageHFlip(), 0.5),
             ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224,
                                   0.225),
             ImageMatToTensor(to_RGB=True, format="NHWC"),
             ImageSetToSample(input_keys=["imageTensor"],
                              target_keys=["label"])
         ])
         feature_set = feature_set.transform(train_transformer)
         feature_set = feature_set.transform(ImageFeatureToSample())
         training_dataset = TFDataset.from_feature_set(
             feature_set,
             features=(tf.float32, [224, 224, 3]),
             labels=(tf.int32, [1]),
             batch_size=8)
         return training_dataset
     else:
         raise NotImplementedError
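
This example and the ones below call self.get_raw_image_set, which is not shown. A hedged sketch of that helper, modelled on the ImageSet.read call and resource layout from Example #2; the folder names and flags are assumptions.

 # Hypothetical sketch of the get_raw_image_set helper, modelled on Example #2.
 def get_raw_image_set(self, with_label):
     import os
     resource_path = os.path.join(os.path.split(__file__)[0], "../resources")
     image_folder = os.path.join(resource_path,
                                 "cat_dog" if with_label else "cat_dog/*/*")
     return ImageSet.read(image_folder, with_label=with_label, sc=self.sc,
                          one_based_label=False)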
Example #6
 def test_create_image_config(self):
     from zoo.models.image.common.image_config import ImageConfigure
     from zoo.feature.image.imagePreprocessing import ImageResize
     from zoo.feature.common import ChainedPreprocessing
     ImageConfigure(pre_processor=ImageResize(224, 224))
     ImageConfigure(pre_processor=ChainedPreprocessing(
         [ImageResize(224, 224),
          ImageResize(224, 224)]))
Example #7
 def create_image_set(self, with_label):
     image_set = self.get_raw_image_set(with_label)
     transformer = ChainedPreprocessing([ImageResize(256, 256),
                                         ImageRandomCrop(224, 224, True),
                                         ImageMatToTensor(format="NHWC"),
                                         ImageSetToSample(input_keys=["imageTensor"],
                                                          target_keys=["label"]
                                                          if with_label else None)])
     image_set = image_set.transform(transformer)
     return image_set
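
The resulting image_set would typically be wrapped into a TFDataset exactly as in Example #2. A hedged usage sketch follows; the method name and batch sizes are illustrative.

 # Hypothetical usage sketch, mirroring the TFDataset wiring from Example #2.
 def create_image_dataset(self, with_label):
     image_set = self.create_image_set(with_label)
     if with_label:
         return TFDataset.from_image_set(image_set,
                                         image=(tf.float32, [224, 224, 3]),
                                         label=(tf.int32, [1]),
                                         batch_size=8)
     return TFDataset.from_image_set(image_set,
                                     image=(tf.float32, [224, 224, 3]),
                                     batch_per_thread=8)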
Example #8
 def create_train_features_Set(self):
     image_set = self.get_raw_image_set(with_label=True)
     feature_set = FeatureSet.image_frame(image_set.to_image_frame())
     train_transformer = ChainedPreprocessing([
         ImageBytesToMat(),
         ImageResize(256, 256),
         ImageRandomCrop(224, 224),
         ImageRandomPreprocessing(ImageHFlip(), 0.5),
         ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
         ImageMatToTensor(to_RGB=True, format="NHWC"),
         ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"])
     ])
     feature_set = feature_set.transform(train_transformer)
     return feature_set
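
The feature set built above can then be fed to TFDataset.from_feature_set in the same way as Example #5. A hedged follow-up sketch; the method name and batch size are illustrative.

 # Hypothetical follow-up sketch, mirroring the TFDataset wiring from Example #5.
 def create_train_dataset(self):
     feature_set = self.create_train_features_Set()
     feature_set = feature_set.transform(ImageFeatureToSample())
     return TFDataset.from_feature_set(feature_set,
                                       features=(tf.float32, [224, 224, 3]),
                                       labels=(tf.int32, [1]),
                                       batch_size=8)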