def test_local_textset_integration(self):
    """End-to-end exercise of a LocalTextSet pipeline.

    Covers: tokenize/normalize preprocessing, word-index generation and
    save/load round-trip, sample generation, TextClassifier training with
    tensorboard/checkpoint hooks, evaluation, prediction, and predict
    parity after a model save/load round-trip.
    """
    local_set = LocalTextSet(self.texts, self.labels)
    assert local_set.is_local()
    assert not local_set.is_distributed()
    assert local_set.get_texts() == self.texts
    assert local_set.get_labels() == self.labels

    # Tokenize + normalize, then derive a capped word-index map.
    tokenized = ChainedPreprocessing([Tokenizer(), Normalizer()])(local_set)
    word_index = tokenized.generate_word_index_map(max_words_num=10)
    transformed = ChainedPreprocessing([
        WordIndexer(word_index),
        SequenceShaper(10),
        TextFeatureToSample(),
    ])(tokenized)
    assert transformed.is_local()

    word_index = transformed.get_word_index()
    assert len(word_index) == 10
    assert word_index["my"] == 1
    samples = transformed.get_samples()
    assert len(samples) == 3
    for sample in samples:
        # Sequences were shaped to a fixed length of 10.
        assert sample.feature.shape[0] == 10

    # Round-trip the vocabulary through a file and verify the fluent API
    # reproduces the same samples.
    vocab_file = create_tmp_path() + ".txt"
    transformed.save_word_index(vocab_file)
    local_set2 = LocalTextSet(self.texts, self.labels)
    local_set2.load_word_index(vocab_file)
    transformed2 = (local_set2.tokenize().normalize().word2idx()
                    .shape_sequence(10).generate_sample())
    samples2 = transformed2.get_samples()
    for original, reloaded in zip(samples, samples2):
        assert np.allclose(original.feature.to_ndarray(),
                           reloaded.feature.to_ndarray())
    os.remove(vocab_file)

    # Train a classifier on the preprocessed set with logging + checkpoints.
    model = TextClassifier(5, self.glove_path, word_index, 10)
    model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
    tmp_log_dir = create_tmp_path()
    tmp_checkpoint_path = create_tmp_path()
    os.mkdir(tmp_checkpoint_path)
    model.set_tensorboard(tmp_log_dir, "textclassification")
    model.set_checkpoint(tmp_checkpoint_path)
    model.fit(transformed, batch_size=2, nb_epoch=2,
              validation_data=transformed)
    acc = model.evaluate(transformed, batch_size=2)
    res_set = model.predict(transformed, batch_per_thread=2)
    predicts = res_set.get_predicts()

    # Test for loaded model predict on TextSet
    tmp_path = create_tmp_path() + ".bigdl"
    model.save_model(tmp_path, over_write=True)
    loaded_model = TextClassifier.load_model(tmp_path)
    loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
    loaded_predicts = loaded_res_set.get_predicts()
    assert len(predicts) == len(loaded_predicts)
    for predict, loaded_predict in zip(predicts, loaded_predicts):
        # Each entry is a (uri, prediction) pair.
        assert not predict[0]
        assert not loaded_predict[0]  # uri is not recorded and thus None
        assert len(predict[1]) == 1
        assert len(loaded_predict[1]) == 1
        assert predict[1][0].shape == (5, )
        assert np.allclose(predict[1][0], loaded_predict[1][0])

    shutil.rmtree(tmp_log_dir)
    shutil.rmtree(tmp_checkpoint_path)
    os.remove(tmp_path)
def input_fn(mode):
    """Build a TFDataset of cat/dog images for the given estimator mode.

    TRAIN/EVAL read labeled images batched by ``batch_size``; any other
    mode reads unlabeled images batched by ``batch_per_thread``.
    """
    import os
    resource_path = os.path.join(os.path.split(__file__)[0], "../resources")
    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        image_folder = os.path.join(resource_path, "cat_dog")
        image_set = ImageSet.read(image_folder,
                                  with_label=True,
                                  sc=self.sc,
                                  one_based_label=False)
        transformer = ChainedPreprocessing([
            ImageResize(256, 256),
            ImageRandomCrop(224, 224, True),
            ImageMatToTensor(format="NHWC"),
            ImageSetToSample(input_keys=["imageTensor"],
                             target_keys=["label"]),
        ])
        dataset = TFDataset.from_image_set(image_set.transform(transformer),
                                           image=(tf.float32, [224, 224, 3]),
                                           label=(tf.int32, [1]),
                                           batch_size=8)
    else:
        image_folder = os.path.join(resource_path, "cat_dog/*/*")
        image_set = ImageSet.read(image_folder,
                                  with_label=False,
                                  sc=self.sc,
                                  one_based_label=False)
        transformer = ChainedPreprocessing([
            ImageResize(256, 256),
            ImageRandomCrop(224, 224, True),
            ImageMatToTensor(format="NHWC"),
            ImageSetToSample(input_keys=["imageTensor"]),
        ])
        dataset = TFDataset.from_image_set(image_set.transform(transformer),
                                           image=(tf.float32, [224, 224, 3]),
                                           batch_per_thread=8)
    return dataset
def test_local_textset_integration(self):
    """Exercise a LocalTextSet through preprocessing, fit, predict, evaluate.

    The sequence shaping happens before word indexing here; samples are
    then fed to a small model built by ``TestTextSet._build_model``.
    """
    local_set = LocalTextSet(self.texts, self.labels)
    assert local_set.is_local()
    assert not local_set.is_distributed()
    assert local_set.get_texts() == self.texts
    assert local_set.get_labels() == self.labels

    # Tokenize, normalize and pad/truncate to length 10 up front.
    tokenized = ChainedPreprocessing([
        Tokenizer(),
        Normalizer(),
        SequenceShaper(10),
    ])(local_set)
    word_index = tokenized.generate_word_index_map(max_words_num=10)
    transformed = ChainedPreprocessing([
        WordIndexer(word_index),
        TextFeatureToSample(),
    ])(tokenized)
    assert transformed.is_local()

    word_index = transformed.get_word_index()
    assert len(word_index) == 10
    assert word_index["my"] == 1
    samples = transformed.get_samples()
    assert len(samples) == 3
    for sample in samples:
        assert sample.feature.shape[0] == 10

    model = TestTextSet._build_model(10)
    model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
    model.fit(transformed, batch_size=2, nb_epoch=2,
              validation_data=transformed)
    res_set = model.predict(transformed, batch_per_thread=2)
    predicts = res_set.get_predicts()
    for predict in predicts:
        # One prediction array of 5 class scores per text.
        assert len(predict) == 1
        assert predict[0].shape == (5, )
    acc = model.evaluate(transformed, batch_size=2)
def test_training_imageset(self):
    """Train, predict and evaluate a tiny conv net on a DistributedImageSet.

    Builds 32 synthetic 200x200x3 images (all labeled 2), runs them through
    the standard resize/crop/normalize pipeline, and fits a one-filter
    conv + dense model for two epochs.
    """
    # Synthetic fixture: 32 random images, every label fixed to 2.
    images = [np.random.uniform(0, 1, (200, 200, 3)) for _ in range(0, 32)]
    labels = [np.array([2]) for _ in range(0, 32)]
    image_set = DistributedImageSet(self.sc.parallelize(images),
                                    self.sc.parallelize(labels))
    # NOTE(review): pipeline starts with ImageBytesToMat although the
    # fixture holds raw ndarrays — presumably the distributed set encodes
    # them; confirm against DistributedImageSet's storage format.
    transformer = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageCenterCrop(224, 224),
        ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
        ImageMatToTensor(),
        ImageSetToSample(target_keys=['label']),
    ])
    data = image_set.transform(transformer)

    model = Sequential()
    model.add(Convolution2D(1, 5, 5, input_shape=(3, 224, 224)))
    model.add(Reshape((1 * 220 * 220, )))
    model.add(Dense(20, activation="softmax"))
    model.compile(optimizer="sgd",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    model.fit(data, batch_size=8, nb_epoch=2, validation_data=data)
    result = model.predict(data, batch_per_thread=8)
    accuracy = model.evaluate(data, batch_size=8)
def input_fn(mode):
    """Return a training TFDataset built from a FeatureSet of images.

    Only TRAIN is supported; any other estimator mode raises
    NotImplementedError.
    """
    if mode != tf.estimator.ModeKeys.TRAIN:
        raise NotImplementedError

    image_set = self.get_raw_image_set(with_label=True)
    feature_set = FeatureSet.image_frame(image_set.to_image_frame())
    # Standard augmentation + normalization pipeline for training.
    train_transformer = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageRandomCrop(224, 224),
        ImageRandomPreprocessing(ImageHFlip(), 0.5),
        ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
        ImageMatToTensor(to_RGB=True, format="NHWC"),
        ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"]),
    ])
    feature_set = feature_set.transform(train_transformer)
    feature_set = feature_set.transform(ImageFeatureToSample())
    return TFDataset.from_feature_set(feature_set,
                                      features=(tf.float32, [224, 224, 3]),
                                      labels=(tf.int32, [1]),
                                      batch_size=8)
def test_create_image_config(self):
    """ImageConfigure accepts a single preprocessor or a chained pipeline."""
    from zoo.models.image.common.image_config import ImageConfigure
    from zoo.feature.image.imagePreprocessing import ImageResize
    from zoo.feature.common import ChainedPreprocessing

    single = ImageResize(224, 224)
    ImageConfigure(pre_processor=single)

    chained = ChainedPreprocessing([ImageResize(224, 224),
                                    ImageResize(224, 224)])
    ImageConfigure(pre_processor=chained)
def create_image_set(self, with_label):
    """Read the raw image set and turn it into model-ready samples.

    :param with_label: when truthy, samples carry a "label" target key;
        otherwise only the image tensor is emitted.
    :return: the transformed image set.
    """
    raw_set = self.get_raw_image_set(with_label)
    target_keys = ["label"] if with_label else None
    pipeline = ChainedPreprocessing([
        ImageResize(256, 256),
        ImageRandomCrop(224, 224, True),
        ImageMatToTensor(format="NHWC"),
        ImageSetToSample(input_keys=["imageTensor"], target_keys=target_keys),
    ])
    return raw_set.transform(pipeline)
def create_train_features_Set(self):
    """Build a training FeatureSet from the labeled raw image set.

    Applies the augmentation pipeline (resize, random crop, random HFlip,
    channel normalize) and converts features to samples keyed on
    "imageTensor" / "label".
    """
    labeled_images = self.get_raw_image_set(with_label=True)
    feature_set = FeatureSet.image_frame(labeled_images.to_image_frame())
    train_transformer = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageRandomCrop(224, 224),
        ImageRandomPreprocessing(ImageHFlip(), 0.5),
        ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
        ImageMatToTensor(to_RGB=True, format="NHWC"),
        ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"]),
    ])
    return feature_set.transform(train_transformer)