Example #1
    def test_tfdataset_with_tf_data_dataset(self):
        dataset = tf.data.Dataset.from_tensor_slices(
            (np.random.randn(100, 28, 28, 1),
             np.random.randint(0, 10, size=(100,))))
        dataset = dataset.map(lambda feature, label:
                              (tf.to_float(feature), label))
        dataset = TFDataset.from_tf_data_dataset(dataset, batch_size=16)
        seq = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
            tf.keras.layers.Dense(10, activation="softmax")
        ])

        seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
        model = KerasModel(seq)
        model.fit(dataset)
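        # rebuild the dataset with batch_per_thread (the per-partition batch size
        # used for inference) instead of the training batch_size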
        dataset = tf.data.Dataset.from_tensor_slices(
            (np.random.randn(100, 28, 28, 1),
             np.random.randint(0, 10, size=(100,))))
        dataset = dataset.map(lambda feature, label:
                              (tf.to_float(feature), label))
        dataset = TFDataset.from_tf_data_dataset(dataset, batch_per_thread=16)
        model.evaluate(dataset)
        dataset = tf.data.Dataset.from_tensor_slices(
            np.random.randn(100, 28, 28, 1))
        dataset = dataset.map(lambda data: tf.to_float(data))
        dataset = TFDataset.from_tf_data_dataset(dataset, batch_per_thread=16)
        model.predict(dataset).collect()
Example #2
    def test_tfdataset_with_dataframe(self):
        rdd = self.sc.range(0, 1000)
        df = rdd.map(lambda x: (DenseVector(np.random.rand(20).astype(np.float)),
                                x % 10)).toDF(["feature", "label"])

        train_df, val_df = df.randomSplit([0.7, 0.3])
        dataset = TFDataset.from_dataframe(train_df,
                                           feature_cols=["feature"],
                                           labels_cols=["label"],
                                           batch_size=32,
                                           validation_df=val_df)

        seq = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(20, )),
            tf.keras.layers.Dense(10, activation="softmax")
        ])

        seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
        model = KerasModel(seq)
        model.fit(dataset)
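        # for prediction only feature_cols is needed; evaluation also needs
        # labels_cols, and both use batch_per_thread rather than batch_size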
        dataset = TFDataset.from_dataframe(val_df,
                                           feature_cols=["feature"],
                                           batch_per_thread=32)
        model.predict(dataset).collect()
        dataset = TFDataset.from_dataframe(val_df,
                                           feature_cols=["feature"],
                                           labels_cols=["label"],
                                           batch_per_thread=32)
        model.evaluate(dataset)
Example #3
        def input_fn(mode):
            import os
            resource_path = os.path.join(os.path.split(__file__)[0], "../resources")
            if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
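                # TRAIN/EVAL: read labelled images and batch with the global batch_size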
                image_folder = os.path.join(resource_path, "cat_dog")
                image_set = ImageSet.read(image_folder, with_label=True, sc=self.sc,
                                          one_based_label=False)
                transformer = ChainedPreprocessing([ImageResize(256, 256),
                                                    ImageRandomCrop(224, 224, True),
                                                    ImageMatToTensor(format="NHWC"),
                                                    ImageSetToSample(input_keys=["imageTensor"],
                                                                     target_keys=["label"])])
                image_set = image_set.transform(transformer)
                dataset = TFDataset.from_image_set(image_set,
                                                   image=(tf.float32, [224, 224, 3]),
                                                   label=(tf.int32, [1]),
                                                   batch_size=8)
            else:
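                # PREDICT: read unlabelled images and batch with batch_per_thread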
                image_folder = os.path.join(resource_path, "cat_dog/*/*")
                image_set = ImageSet.read(image_folder, with_label=False, sc=self.sc,
                                          one_based_label=False)
                transformer = ChainedPreprocessing([ImageResize(256, 256),
                                                    ImageRandomCrop(224, 224, True),
                                                    ImageMatToTensor(format="NHWC"),
                                                    ImageSetToSample(
                                                        input_keys=["imageTensor"])])
                image_set = image_set.transform(transformer)
                dataset = TFDataset.from_image_set(image_set,
                                                   image=(tf.float32, [224, 224, 3]),
                                                   batch_per_thread=8)

            return dataset
Example #4
 def input_fn(mode):
     x = np.random.rand(20, 10)
     y = np.random.randint(0, 10, (20, ))
     if mode == tf.estimator.ModeKeys.TRAIN:
         return TFDataset.from_ndarrays((x, y), batch_size=8)
     elif mode == tf.estimator.ModeKeys.EVAL:
         return TFDataset.from_ndarrays((x, y), batch_per_thread=1)
     else:
         return TFDataset.from_ndarrays(x, batch_per_thread=1)
Example #5
    def input_fn(mode):
        if mode == tf.estimator.ModeKeys.TRAIN:
            training_data = get_data("train")
            dataset = TFDataset.from_ndarrays(training_data, batch_size=320)
        elif mode == tf.estimator.ModeKeys.EVAL:
            testing_data = get_data("test")
            dataset = TFDataset.from_ndarrays(testing_data,
                                              batch_per_thread=80)
        else:
            images, _ = get_data("test")
            dataset = TFDataset.from_ndarrays(images, batch_per_thread=80)

        return dataset
Example #6
 def input_fn(mode):
     if mode == tf.estimator.ModeKeys.TRAIN:
         image_set = self.get_raw_image_set(with_label=True)
         feature_set = FeatureSet.image_frame(
             image_set.to_image_frame())
         train_transformer = ChainedPreprocessing([
             ImageBytesToMat(),
             ImageResize(256, 256),
             ImageRandomCrop(224, 224),
             ImageRandomPreprocessing(ImageHFlip(), 0.5),
             ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224,
                                   0.225),
             ImageMatToTensor(to_RGB=True, format="NHWC"),
             ImageSetToSample(input_keys=["imageTensor"],
                              target_keys=["label"])
         ])
         feature_set = feature_set.transform(train_transformer)
         feature_set = feature_set.transform(ImageFeatureToSample())
         training_dataset = TFDataset.from_feature_set(
             feature_set,
             features=(tf.float32, [224, 224, 3]),
             labels=(tf.int32, [1]),
             batch_size=8)
         return training_dataset
     else:
         raise NotImplementedError
Example #7
    def test_tfdataset_with_tf_data_dataset_which_requires_table(self):

        keys = [1, 0, -1]
        dataset = tf.data.Dataset.from_tensor_slices([1, 2, -1, 5] * 40)
        table = tf.contrib.lookup.HashTable(
            initializer=tf.contrib.lookup.KeyValueTensorInitializer(
                keys=keys, values=list(reversed(keys))),
            default_value=100)
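        # the table maps the keys [1, 0, -1] to the reversed values [-1, 0, 1];
        # any other key falls back to the default value 100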
        dataset = dataset.map(table.lookup)

        def transform(x):
            float_x = tf.to_float(x)
            return float_x, 1

        dataset = dataset.map(transform)
        dataset = TFDataset.from_tf_data_dataset(dataset, batch_size=16)
        seq = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=()),
            tf.keras.layers.Dense(10, activation="softmax")
        ])
        seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
        model = KerasModel(seq)
        model.fit(dataset)
Example #8
    def test_tf_optimizer_with_sparse_gradient_using_keras(self):
        import tensorflow as tf

        ids = np.random.randint(0, 10, size=[40])
        labels = np.random.randint(0, 5, size=[40])
        id_rdd = self.sc.parallelize(ids)
        label_rdd = self.sc.parallelize(labels)
        training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])

        dataset = TFDataset.from_rdd(training_rdd,
                                     names=["ids", "labels"],
                                     shapes=[[], []],
                                     types=[tf.int32, tf.int32],
                                     batch_size=8)
        from tensorflow.python.ops import variable_scope
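        # wrap variable creation to force use_resource=False, so the Keras layers
        # built inside the scope below use legacy (non-resource) TF variables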

        def variable_creator(**kwargs):
            kwargs["use_resource"] = False
            return variable_scope.default_variable_creator(None, **kwargs)

        getter = lambda next_creator, **kwargs: variable_creator(**kwargs)
        with variable_scope.variable_creator_scope(getter):
            words_input = tf.keras.layers.Input(shape=(), name='words_input')
            embedding_layer = tf.keras.layers.Embedding(input_dim=10,
                                                        output_dim=5,
                                                        name='word_embedding')
            word_embeddings = embedding_layer(words_input)
            embedding = tf.keras.layers.Flatten()(word_embeddings)
            output = tf.keras.layers.Dense(5, activation="softmax")(embedding)
            model = tf.keras.models.Model(inputs=[words_input],
                                          outputs=[output])
            model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy")\

        optimizer = TFOptimizer.from_keras(model, dataset)
        optimizer.optimize()
Example #9
 def input_fn(mode):
     if mode == tf.estimator.ModeKeys.PREDICT:
         # get the TFDataset
         image_dataset = TFDataset.from_ndarrays(image_array[None, ...])
         return image_dataset
     else:
         raise NotImplementedError
Example #10
    def test_control_inputs(self):

        features = np.random.randn(20, 10)
        labels = np.random.randint(0, 10, size=[20])
        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays((features, labels),
                                              batch_size=4,
                                              val_tensors=(features, labels))
            is_training = tf.placeholder(dtype=tf.bool, shape=())
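            # tensor_with_value below feeds is_training=True during training
            # and False during validation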
            feature_tensor, label_tensor = dataset.tensors
            features = tf.layers.dense(feature_tensor, 8)
            features = tf.layers.dropout(features, training=is_training)
            output = tf.layers.dense(features, 10)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                       labels=label_tensor))
            optimizer = TFOptimizer.from_loss(
                loss,
                Adam(),
                val_outputs=[output],
                val_labels=[label_tensor],
                val_method=Accuracy(),
                tensor_with_value={is_training: (True, False)},
                metrics={"loss": loss})
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
Example #11
 def test_tf_net_predict_dataset(self):
     tfnet_path = os.path.join(TestTF.resource_path, "tfnet")
     net = TFNet.from_export_folder(tfnet_path)
     dataset = TFDataset.from_ndarrays((np.random.rand(16, 4), ))
     output = net.predict(dataset)
     output = np.stack(output.collect())
     assert output.shape == (16, 2)
Example #12
    def test_tf_optimizer_metrics(self):

        features = np.random.randn(20, 10)
        labels = np.random.randint(0, 10, size=[20])
        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays((features, labels),
                                              batch_size=4,
                                              val_tensors=(features, labels))
            feature_tensor, label_tensor = dataset.tensors
            features = tf.layers.dense(feature_tensor, 8)
            output = tf.layers.dense(features, 10)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                       labels=label_tensor))
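            # per-prefix optimizers: variables under "dense/" are trained with Adam,
            # while "dense_1/" uses SGD with a zero learning rate and stays frozen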
            optimizer = TFOptimizer.from_loss(loss, {"dense/": Adam(1e-3), "dense_1/": SGD(0.0)},
                                              val_outputs=[output],
                                              val_labels=[label_tensor],
                                              val_method=Accuracy(), metrics={"loss": loss})
            initial_weights = optimizer.tf_model.training_helper_layer.get_weights()
            optimizer.optimize(end_trigger=MaxEpoch(1))
            updated_weights = optimizer.tf_model.training_helper_layer.get_weights()
            for i in [0, 1]:  # the weights and bias under the "dense/" scope should be updated
                assert not np.allclose(initial_weights[i], updated_weights[i])
            for i in [2, 3]:  # the weights and bias under the "dense_1/" scope should be unchanged
                assert np.allclose(initial_weights[i], updated_weights[i])
            optimizer.sess.close()
Example #13
    def test_tf_optimizer_with_sparse_gradient_using_keras(self):
        import tensorflow as tf

        ids = np.random.randint(0, 10, size=[40])
        labels = np.random.randint(0, 5, size=[40])
        id_rdd = self.sc.parallelize(ids)
        label_rdd = self.sc.parallelize(labels)
        training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])

        dataset = TFDataset.from_rdd(training_rdd,
                                     names=["ids", "labels"],
                                     shapes=[[], []],
                                     types=[tf.int32, tf.int32],
                                     batch_size=8)
        words_input = tf.keras.layers.Input(shape=(), name='words_input')
        embedding_layer = tf.keras.layers.Embedding(input_dim=10,
                                                    output_dim=5,
                                                    name='word_embedding')
        word_embeddings = embedding_layer(words_input)
        embedding = tf.keras.layers.Flatten()(word_embeddings)
        output = tf.keras.layers.Dense(5, activation="softmax")(embedding)
        model = tf.keras.models.Model(inputs=[words_input], outputs=[output])
        model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy")

        optimizer = TFOptimizer.from_keras(model, dataset)
        optimizer.optimize()
Example #14
    def input_fn(mode):

        if mode == tf.estimator.ModeKeys.TRAIN:
            # demo_small directory structure
            # \demo_small
            #    \cats
            #       cat images ...
            #    \dogs
            #       dog images ...
            image_set = ImageSet.read("./datasets/cat_dog/demo_small",
                                      sc=sc,
                                      with_label=True,
                                      one_based_label=False)
            train_transformer = ChainedPreprocessing([
                ImageBytesToMat(),
                ImageResize(256, 256),
                ImageRandomCrop(224, 224),
                ImageRandomPreprocessing(ImageHFlip(), 0.5),
                ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224,
                                      0.225),
                ImageMatToTensor(to_RGB=True, format="NHWC"),
                ImageSetToSample(input_keys=["imageTensor"],
                                 target_keys=["label"])
            ])
            feature_set = FeatureSet.image_frame(image_set.to_image_frame())
            feature_set = feature_set.transform(train_transformer)
            dataset = TFDataset.from_feature_set(feature_set,
                                                 features=(tf.float32,
                                                           [224, 224, 3]),
                                                 labels=(tf.int32, [1]),
                                                 batch_size=16)
        else:
            raise NotImplementedError

        return dataset
Example #15
def main(data_num):

    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
    images_data = (images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    labels_data = labels_data[:data_num].astype(np.int32)
    dataset = TFDataset.from_ndarrays((images_data, labels_data), batch_per_thread=20)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10, is_training=False)

    predictions = tf.to_int32(tf.argmax(logits, axis=1))
    correct = tf.expand_dims(tf.to_int32(tf.equal(predictions, labels)), axis=1)
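    # averaging the per-sample correctness flags over the test set gives the accuracy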

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "/tmp/lenet/model")

        predictor = TFPredictor(sess, [correct])

        accuracy = predictor.predict().mean()

        print("predict accuracy is %s" % accuracy)
Example #16
 def input_function():
     ds = tf.data.Dataset.from_tensor_slices((dict(data_df), ))
     ds = TFDataset.from_tf_data_dataset(
         dataset=ds,
         batch_size=batch_size,
         batch_per_thread=batch_per_thread)
     return ds
Example #17
        def input_fn(mode):
            np.random.seed(20)
            x = np.random.rand(20, 10)
            y = np.random.randint(0, 10, (20))

            rdd_x = self.sc.parallelize(x)
            rdd_y = self.sc.parallelize(y)

            rdd = rdd_x.zip(rdd_y)
            if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
                dataset = TFDataset.from_rdd(rdd,
                                             features=(tf.float32, [10]),
                                             labels=(tf.int32, []))
            else:
                dataset = TFDataset.from_rdd(rdd_x,
                                             features=(tf.float32, [10]))
            return dataset
Example #18
 def test_training_for_feature_set(self):
     model = self.create_image_model()
     feature_set = self.create_train_features_Set()
     training_dataset = TFDataset.from_feature_set(feature_set,
                                                   features=(tf.float32, [224, 224, 3]),
                                                   labels=(tf.int32, [1]),
                                                   batch_size=8)
     model.fit(training_dataset)
Example #19
    def test_predict_for_imageset(self):
        model = self.create_image_model()
        image_set = self.create_image_set(with_label=False)

        predict_dataset = TFDataset.from_image_set(image_set,
                                                   image=(tf.float32, [224, 224, 3]),
                                                   batch_per_thread=1)
        results = model.predict(predict_dataset).get_predict().collect()
        assert all(r[1] is not None for r in results)
Example #20
    def test_training_for_imageset(self):

        model = self.create_image_model()
        image_set = self.create_image_set(with_label=True)
        training_dataset = TFDataset.from_image_set(image_set,
                                                    image=(tf.float32, [224, 224, 3]),
                                                    label=(tf.int32, [1]),
                                                    batch_size=4)
        model.fit(training_dataset)
Example #21
    def test_evaluation_for_imageset(self):

        model = self.create_image_model()
        image_set = self.create_image_set(with_label=True)
        eval_dataset = TFDataset.from_image_set(image_set,
                                                image=(tf.float32, [224, 224, 3]),
                                                label=(tf.int32, [1]),
                                                batch_per_thread=1)

        model.evaluate(eval_dataset)
Example #22
def main(max_epoch):
    sc = init_nncontext()

    training_rdd = get_data_rdd("train", sc)
    testing_rdd = get_data_rdd("test", sc)

    dataset = TFDataset.from_rdd(training_rdd,
                                 features=(tf.float32, [28, 28, 1]),
                                 labels=(tf.int32, []),
                                 batch_size=320,
                                 val_rdd=testing_rdd)
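    # val_rdd supplies the validation data evaluated during training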

    model = tf.keras.Sequential(
        [tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
         tf.keras.layers.Dense(64, activation='relu'),
         tf.keras.layers.Dense(64, activation='relu'),
         tf.keras.layers.Dense(10, activation='softmax'),
         ]
    )

    model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    keras_model = KerasModel(model)

    keras_model.fit(dataset,
                    epochs=max_epoch,
                    distributed=True)

    eval_dataset = TFDataset.from_rdd(
        testing_rdd,
        features=(tf.float32, [28, 28, 1]),
        labels=(tf.int32, []), batch_per_thread=80)
    result = keras_model.evaluate(eval_dataset)

    print(result)
    # >> [0.08865142822265625, 0.9722]

    # the following assert is used for internal testing
    assert result['acc Top1Accuracy'] > 0.95

    model.save_weights("/tmp/mnist_keras.h5")
Example #23
    def input_fn(mode):
        if mode == tf.estimator.ModeKeys.TRAIN:
            training_rdd = get_data_rdd("train", sc)
            dataset = TFDataset.from_rdd(training_rdd,
                                         features=(tf.float32, [28, 28, 1]),
                                         labels=(tf.int32, []),
                                         batch_size=320)
        elif mode == tf.estimator.ModeKeys.EVAL:
            testing_rdd = get_data_rdd("test", sc)
            dataset = TFDataset.from_rdd(testing_rdd,
                                         features=(tf.float32, [28, 28, 1]),
                                         labels=(tf.int32, []),
                                         batch_size=320)
        else:
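            # PREDICT: drop the labels and batch with batch_per_thread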
            testing_rdd = get_data_rdd("test", sc).map(lambda x: x[0])
            dataset = TFDataset.from_rdd(testing_rdd,
                                         features=(tf.float32, [28, 28, 1]),
                                         batch_per_thread=80)

        return dataset
Example #24
        def create_ds(mode):
            if mode == "train":
                dataset = TFDataset.from_dataframe(train_df,
                                                   feature_cols=["feature"],
                                                   labels_cols=["label"],
                                                   batch_size=32,
                                                   validation_df=val_df)
            elif mode == "predict":
                dataset = TFDataset.from_dataframe(val_df,
                                                   feature_cols=["feature"],
                                                   batch_per_thread=32)
            elif mode == "evaluate":
                dataset = TFDataset.from_dataframe(val_df,
                                                   feature_cols=["feature"],
                                                   labels_cols=["label"],
                                                   batch_per_thread=32)
            else:
                raise ValueError("unrecognized mode: {}".format(mode))

            return dataset
Example #25
    def test_dataset_without_batch(self):
        x = np.random.rand(20, 10)
        y = np.random.randint(0, 2, (20))

        rdd_x = self.sc.parallelize(x)
        rdd_y = self.sc.parallelize(y)

        rdd = rdd_x.zip(rdd_y)

        dataset = TFDataset.from_rdd(rdd,
                                     features=(tf.float32, [10]),
                                     labels=(tf.int32, []),
                                     names=["features", "labels"],
                                     val_rdd=rdd)
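        # no batch_size is set, so fitting this dataset is expected to raise an error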

        keras_model = self.create_model()
        model = KerasModel(keras_model)
        self.intercept(
            lambda: model.fit(dataset), "The batch_size of TFDataset must be" +
            " specified when used in KerasModel fit.")

        dataset = TFDataset.from_rdd(
            rdd,
            features=(tf.float32, [10]),
            labels=(tf.int32, []),
            names=["features", "labels"],
        )
        self.intercept(
            lambda: model.evaluate(dataset),
            "The batch_per_thread of TFDataset must be " +
            "specified when used in KerasModel evaluate.")

        dataset = TFDataset.from_rdd(
            rdd_x,
            features=(tf.float32, [10]),
            names=["features", "labels"],
        )
        self.intercept(
            lambda: model.predict(dataset),
            "The batch_per_thread of TFDataset must be" +
            " specified when used in KerasModel predict.")
Example #26
    def create_predict_dataset(self):
        np.random.seed(20)
        x = np.random.rand(20, 10)

        rdd = self.sc.parallelize(x)

        rdd = rdd.map(lambda x: [x])

        dataset = TFDataset.from_rdd(rdd,
                                     features=(tf.float32, [10]),
                                     batch_per_thread=1)
        return dataset
Example #27
 def test_tfdataset_with_string_rdd(self):
     string_rdd = self.sc.parallelize(["123", "456"], 1)
     ds = TFDataset.from_string_rdd(string_rdd, batch_per_thread=1)
     input_tensor = tf.placeholder(dtype=tf.string, shape=(None, ))
     output_tensor = tf.string_to_number(input_tensor)
     with tf.Session() as sess:
         tfnet = TFNet.from_session(sess,
                                    inputs=[input_tensor],
                                    outputs=[output_tensor])
     result = tfnet.predict(ds).collect()
     assert result[0] == 123
     assert result[1] == 456
Example #28
    def create_predict_dataset(self):
        np.random.seed(20)
        x = np.random.rand(20, 10)

        rdd = self.sc.parallelize(x)

        rdd = rdd.map(lambda x: [x])

        dataset = TFDataset.from_rdd(rdd,
                                     names=["features"],
                                     shapes=[[10]],
                                     types=[tf.float32],
                                     batch_per_thread=1)
        return dataset
Example #29
    def input_fn():
        def map_func(data):
            image = data['image']
            label = data['label']
            one_hot_label = tf.one_hot(label, depth=10)
            noise = tf.random.normal(mean=0.0, stddev=1.0, shape=(NOISE_DIM,))
            generator_inputs = (noise, one_hot_label)
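            # scale pixel values from [0, 255] to [-1, 1] for the discriminator input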
            discriminator_inputs = ((tf.to_float(image) / 255.0) - 0.5) * 2
            return (generator_inputs, discriminator_inputs)

        ds = tfds.load("mnist", split="train")
        ds = ds.map(map_func)
        dataset = TFDataset.from_tf_data_dataset(ds, batch_size=36)
        return dataset
Example #30
    def create_evaluation_dataset(self):
        np.random.seed(20)
        x = np.random.rand(20, 10)
        y = np.random.randint(0, 2, (20))

        rdd_x = self.sc.parallelize(x)
        rdd_y = self.sc.parallelize(y)

        rdd = rdd_x.zip(rdd_y)

        dataset = TFDataset.from_rdd(rdd,
                                     features=(tf.float32, [10]),
                                     labels=(tf.int32, []),
                                     batch_per_thread=1)
        return dataset