예제 #1
0
    def test_tfdataset_with_dataframe(self):
        """Smoke-test fit / predict / evaluate on DataFrame-backed TFDatasets.

        Builds a DataFrame from ``self.sc.range(0, 1000)`` where each row has
        a random 20-element ``DenseVector`` feature and a label ``x % 10``,
        splits it with weights 0.7 / 0.3, then runs ``fit``, ``predict`` and
        ``evaluate`` on a ``KerasModel`` wrapping a small Keras classifier.

        The test contains no assertions: it passes when none of the calls
        raise.
        """
        rdd = self.sc.range(0, 1000)
        # np.float64 instead of np.float: the np.float alias (which was just
        # the builtin float, i.e. float64) was deprecated in NumPy 1.20 and
        # removed in 1.24, where it raises AttributeError.
        df = rdd.map(lambda x: (DenseVector(
            np.random.rand(20).astype(np.float64)), x % 10)).toDF(
                ["feature", "label"])

        train_df, val_df = df.randomSplit([0.7, 0.3])

        # Training dataset: carries labels and the validation split.
        dataset = TFDataset.from_dataframe(train_df,
                                           feature_cols=["feature"],
                                           labels_cols=["label"],
                                           batch_size=32,
                                           validation_df=val_df)

        # 20 input features -> 10-way softmax, matching the labels 0..9
        # produced by x % 10 above.
        seq = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(20, )),
            tf.keras.layers.Dense(10, activation="softmax")
        ])

        seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
        model = KerasModel(seq)
        model.fit(dataset)

        # Prediction dataset: features only, sized with batch_per_thread
        # rather than batch_size.
        dataset = TFDataset.from_dataframe(val_df,
                                           feature_cols=["feature"],
                                           batch_per_thread=32)
        # collect() pulls the prediction results back; they are discarded.
        model.predict(dataset).collect()

        # Evaluation dataset: features plus labels, again with
        # batch_per_thread.
        dataset = TFDataset.from_dataframe(val_df,
                                           feature_cols=["feature"],
                                           labels_cols=["label"],
                                           batch_per_thread=32)
        model.evaluate(dataset)
예제 #2
0
        def create_ds(mode):
            """Return the TFDataset matching ``mode``.

            ``train_df`` and ``val_df`` are not parameters; they are read
            from the surrounding scope.

            Args:
                mode: ``"train"``, ``"predict"`` or ``"evaluate"``.

            Returns:
                The result of ``TFDataset.from_dataframe`` for that mode.

            Raises:
                ValueError: if ``mode`` is none of the three values above.
            """
            if mode == "train":
                # Labelled training data with the validation split attached.
                return TFDataset.from_dataframe(
                    train_df,
                    feature_cols=["feature"],
                    labels_cols=["label"],
                    batch_size=32,
                    validation_df=val_df,
                )

            if mode == "predict":
                # Features only; no label columns are passed.
                return TFDataset.from_dataframe(
                    val_df,
                    feature_cols=["feature"],
                    batch_per_thread=32,
                )

            if mode == "evaluate":
                # Features and labels, both taken from val_df.
                return TFDataset.from_dataframe(
                    val_df,
                    feature_cols=["feature"],
                    labels_cols=["label"],
                    batch_per_thread=32,
                )

            raise ValueError("unrecognized mode: {}".format(mode))