def test_spark_ml_model(spark_context): df = to_data_frame(spark_context, x_train, y_train, categorical=True) test_df = to_data_frame(spark_context, x_test, y_test, categorical=True) sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) sgd_conf = optimizers.serialize(sgd) # Initialize Spark ML Estimator estimator = ElephasEstimator() estimator.set_keras_model_config(model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("categorical_crossentropy") estimator.set_metrics(['acc']) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.1) estimator.set_categorical_labels(True) estimator.set_nb_classes(nb_classes) # Fitting a model returns a Transformer pipeline = Pipeline(stages=[estimator]) fitted_pipeline = pipeline.fit(df) # Evaluate Spark model by evaluating the underlying model prediction = fitted_pipeline.transform(test_df) pnl = prediction.select("label", "prediction") pnl.show(100) prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction)) metrics = MulticlassMetrics(prediction_and_label) print(metrics.precision()) print(metrics.recall())
def test_regression_model(spark_context, regression_model, boston_housing_dataset): batch_size = 64 epochs = 10 x_train, y_train, x_test, y_test = boston_housing_dataset df = to_data_frame(spark_context, x_train, y_train) test_df = to_data_frame(spark_context, x_test, y_test) sgd = optimizers.SGD(lr=0.00001) sgd_conf = optimizers.serialize(sgd) estimator = ElephasEstimator() estimator.set_keras_model_config(regression_model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("mae") estimator.set_metrics(['mae']) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.01) estimator.set_categorical_labels(False) pipeline = Pipeline(stages=[estimator]) fitted_pipeline = pipeline.fit(df) prediction = fitted_pipeline.transform(test_df) pnl = prediction.select("label", "prediction") pnl.show(100) prediction_and_observations = pnl.rdd.map(lambda row: (row.label, row.prediction)) metrics = RegressionMetrics(prediction_and_observations) print(metrics.r2)
def test_custom_objects(spark_context, boston_housing_dataset): def custom_activation(x): return 2 * relu(x) model = Sequential() model.add(Dense(64, input_shape=(13, ))) model.add(Dense(64, activation=custom_activation)) model.add(Dense(1, activation='linear')) x_train, y_train, x_test, y_test = boston_housing_dataset df = to_data_frame(spark_context, x_train, y_train) test_df = to_data_frame(spark_context, x_test, y_test) sgd = optimizers.SGD(lr=0.00001) sgd_conf = optimizers.serialize(sgd) estimator = ElephasEstimator() estimator.set_keras_model_config(model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("mae") estimator.set_metrics(['mae']) estimator.set_epochs(10) estimator.set_batch_size(32) estimator.set_validation_split(0.01) estimator.set_categorical_labels(False) estimator.set_custom_objects({'custom_activation': custom_activation}) pipeline = Pipeline(stages=[estimator]) fitted_pipeline = pipeline.fit(df) prediction = fitted_pipeline.transform(test_df)
def test_functional_model(spark_context, classification_model_functional, mnist_data): batch_size = 64 epochs = 1 x_train, y_train, x_test, y_test = mnist_data x_train = x_train[:1000] y_train = y_train[:1000] df = to_data_frame(spark_context, x_train, y_train, categorical=True) test_df = to_data_frame(spark_context, x_test, y_test, categorical=True) sgd = optimizers.SGD() sgd_conf = optimizers.serialize(sgd) estimator = ElephasEstimator() estimator.set_keras_model_config(classification_model_functional.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("categorical_crossentropy") estimator.set_metrics(['acc']) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.1) estimator.set_categorical_labels(True) estimator.set_nb_classes(10) pipeline = Pipeline(stages=[estimator]) fitted_pipeline = pipeline.fit(df) prediction = fitted_pipeline.transform(test_df) pnl = prediction.select("label", "prediction") pnl.show(100) prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction)) metrics = MulticlassMetrics(prediction_and_label) print(metrics.accuracy)
def test_to_data_frame(spark_context): features = np.ones((2, 10)) labels = np.asarray([[2.0], [1.0]]) data_frame = adapter.to_data_frame( spark_context, features, labels, categorical=False) assert data_frame.count() == 2
def test_spark_ml_model_classification(spark_context, classification_model, mnist_data): batch_size = 64 nb_classes = 10 epochs = 1 x_train, y_train, x_test, y_test = mnist_data x_train = x_train[:1000] y_train = y_train[:1000] df = to_data_frame(spark_context, x_train, y_train, categorical=True) test_df = to_data_frame(spark_context, x_test, y_test, categorical=True) sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) sgd_conf = optimizers.serialize(sgd) # Initialize Spark ML Estimator estimator = ElephasEstimator() estimator.set_keras_model_config(classification_model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("categorical_crossentropy") estimator.set_metrics(['acc']) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.1) estimator.set_categorical_labels(True) estimator.set_nb_classes(nb_classes) # Fitting a model returns a Transformer pipeline = Pipeline(stages=[estimator]) fitted_pipeline = pipeline.fit(df) # Evaluate Spark model by evaluating the underlying model prediction = fitted_pipeline.transform(test_df) pnl = prediction.select("label", "prediction") pnl.show(100) # since prediction in a multiclass classification problem is a vector, we need to compute argmax # the casting to a double is just necessary for using MulticlassMetrics pnl = pnl.select( 'label', argmax('prediction').astype(DoubleType()).alias('prediction')) prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction)) metrics = MulticlassMetrics(prediction_and_label) print(metrics.accuracy)
def test_batch_predict_classes_probability(spark_context, classification_model, mnist_data): batch_size = 64 nb_classes = 10 epochs = 1 x_train, y_train, x_test, y_test = mnist_data x_train = x_train[:1000] y_train = y_train[:1000] df = to_data_frame(spark_context, x_train, y_train, categorical=True) test_df = to_data_frame(spark_context, x_test, y_test, categorical=True) sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) sgd_conf = optimizers.serialize(sgd) # Initialize Spark ML Estimator estimator = ElephasEstimator() estimator.set_keras_model_config(classification_model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("categorical_crossentropy") estimator.set_metrics(['acc']) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.1) estimator.set_categorical_labels(True) estimator.set_nb_classes(nb_classes) # Fitting a model returns a Transformer fitted_pipeline = estimator.fit(df) results = fitted_pipeline.transform(test_df) # Set inference batch size and do transform again on the same test_df inference_batch_size = int(len(y_test) / 10) fitted_pipeline.set_params(inference_batch_size=inference_batch_size) fitted_pipeline.set_params(outputCol="prediction_via_batch_inference") results_with_batch_prediction = fitted_pipeline.transform(results) # we should have an array of 10 elements in the prediction column, since we have 10 classes # and therefore 10 probabilities results_np = results_with_batch_prediction.take(1)[0] assert len(results_np.prediction) == 10 assert len(results_np.prediction_via_batch_inference) == 10 assert np.array_equal(results_np.prediction, results_np.prediction_via_batch_inference)
def test_df_to_simple_rdd(spark_context): features = np.ones((2, 10)) labels = np.asarray([[2.0], [1.0]]).reshape((2,)) data_frame = adapter.to_data_frame( spark_context, features, labels, categorical=False) rdd = adapter.df_to_simple_rdd(data_frame, False) assert rdd.count() == 2
def test_from_data_frame_cat(spark_context): features = np.ones((2, 10)) labels = np.asarray([[0, 0, 1.0], [0, 1.0, 0]]) data_frame = adapter.to_data_frame( spark_context, features, labels, categorical=True) x, y = adapter.from_data_frame(data_frame, categorical=True, nb_classes=3) assert features.shape == x.shape assert labels.shape == y.shape
def test_from_data_frame(spark_context): features = np.ones((2, 10)) labels = np.asarray([[2.0], [1.0]]).reshape((2,)) data_frame = adapter.to_data_frame( spark_context, features, labels, categorical=False) x, y = adapter.from_data_frame(data_frame, categorical=False) assert features.shape == x.shape assert labels.shape == y.shape
def test_serialization_transformer_and_predict(spark_context, classification_model, mnist_data): _, _, x_test, y_test = mnist_data df = to_data_frame(spark_context, x_test, y_test, categorical=True) transformer = ElephasTransformer( weights=classification_model.get_weights(), model_type=ModelType.CLASSIFICATION) transformer.set_keras_model_config(classification_model.to_yaml()) transformer.save("test.h5") loaded_transformer = load_ml_transformer("test.h5") loaded_transformer.transform(df)
def test_set_cols_deprecated(spark_context, regression_model, boston_housing_dataset): with pytest.deprecated_call(): batch_size = 64 epochs = 10 x_train, y_train, x_test, y_test = boston_housing_dataset df = to_data_frame(spark_context, x_train, y_train) df = df.withColumnRenamed('features', 'scaled_features') df = df.withColumnRenamed('label', 'ground_truth') test_df = to_data_frame(spark_context, x_test, y_test) test_df = test_df.withColumnRenamed('features', 'scaled_features') test_df = test_df.withColumnRenamed('label', 'ground_truth') sgd = optimizers.SGD(lr=0.00001) sgd_conf = optimizers.serialize(sgd) estimator = ElephasEstimator() estimator.set_keras_model_config(regression_model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.setFeaturesCol('scaled_features') estimator.setOutputCol('output') estimator.setLabelCol('ground_truth') estimator.set_mode("synchronous") estimator.set_loss("mae") estimator.set_metrics(['mae']) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.01) estimator.set_categorical_labels(False) pipeline = Pipeline(stages=[estimator]) fitted_pipeline = pipeline.fit(df) prediction = fitted_pipeline.transform(test_df) pnl = prediction.select("ground_truth", "output") pnl.show(100) prediction_and_observations = pnl.rdd.map( lambda row: (row['ground_truth'], row['output'])) metrics = RegressionMetrics(prediction_and_observations) print(metrics.r2)
def test_predict_classes_probability(spark_context, classification_model, mnist_data): batch_size = 64 nb_classes = 10 epochs = 1 x_train, y_train, x_test, y_test = mnist_data x_train = x_train[:1000] y_train = y_train[:1000] df = to_data_frame(spark_context, x_train, y_train, categorical=True) test_df = to_data_frame(spark_context, x_test, y_test, categorical=True) sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) sgd_conf = optimizers.serialize(sgd) # Initialize Spark ML Estimator estimator = ElephasEstimator() estimator.set_keras_model_config(classification_model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("categorical_crossentropy") estimator.set_metrics(['acc']) estimator.set_predict_classes(False) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.1) estimator.set_categorical_labels(True) estimator.set_nb_classes(nb_classes) # Fitting a model returns a Transformer pipeline = Pipeline(stages=[estimator]) fitted_pipeline = pipeline.fit(df) results = fitted_pipeline.transform(test_df) # we should have an array of 10 elements in the prediction column, since we have 10 classes # and therefore 10 probabilities assert len(results.take(1)[0].prediction) == 10
print(x_test.shape[0], 'test samples') model = Sequential() model.add(Dense(64, input_shape=(x_train.shape[1],))) model.add(Activation('relu')) model.add(Dense(64)) model.add(Activation('relu')) model.add(Dense(1)) # Create Spark context conf = SparkConf().setAppName('BostonHousing_Spark_MLP').setMaster('local[*]') sc = SparkContext(conf=conf) # Build RDD from numpy features and labels df = to_data_frame(sc, x_train, y_train) test_df = to_data_frame(sc, x_test, y_test) sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) sgd_conf = optimizers.serialize(sgd) # Initialize Spark ML Estimator estimator = ElephasEstimator() estimator.set_keras_model_config(model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("mae") estimator.set_metrics(['mse']) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.1)
model.add(Dense(128, 128)) model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(128, 10)) model.add(Activation('softmax')) # Compile model rms = RMSprop() model.compile(loss='categorical_crossentropy', optimizer=rms) # Create Spark context conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]') sc = SparkContext(conf=conf) # Build RDD from numpy features and labels df = to_data_frame(sc, X_train, Y_train, categorical=True) test_df = to_data_frame(sc, X_test, Y_test, categorical=True) # Initialize Spark ML Estimator estimator = ElephasEstimator(sc,model, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=8, categorical=True, nb_classes=nb_classes) # Fitting a model returns a Transformer fitted_model = estimator.fit(df) # Evaluate Spark model by evaluating the underlying model prediction = fitted_model.transform(df) pnl = prediction.select("label", "prediction") pnl.show(100) prediction_and_label= pnl.map(lambda row: (row.label, row.prediction))
model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(10)) model.add(Activation('softmax')) # Compile model adam = Adam() model.compile(loss='categorical_crossentropy', optimizer=adam) # Create Spark context conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]') sc = SparkContext(conf=conf) # Build RDD from numpy features and labels df = to_data_frame(sc, x_train, y_train, categorical=True) test_df = to_data_frame(sc, x_test, y_test, categorical=True) # Initialize Spark ML Estimator adadelta = elephas_optimizers.Adadelta() estimator = ElephasEstimator(sc, model, nb_epoch=nb_epoch, batch_size=batch_size, optimizer=adadelta, frequency='batch', mode='asynchronous', num_workers=2, verbose=0, validation_split=0.1, categorical=True,
model = Sequential() model.add(Dense(128, input_dim=784)) model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(10)) model.add(Activation('softmax')) # Create Spark context conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]') sc = SparkContext(conf=conf) # Build RDD from numpy features and labels df = to_data_frame(sc, x_train, y_train, categorical=True) test_df = to_data_frame(sc, x_test, y_test, categorical=True) sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) sgd_conf = optimizers.serialize(sgd) # Initialize Spark ML Estimator estimator = ElephasEstimator() estimator.set_keras_model_config(model.to_yaml()) estimator.set_optimizer_config(sgd_conf) estimator.set_mode("synchronous") estimator.set_loss("categorical_crossentropy") estimator.set_metrics(['acc']) estimator.set_epochs(epochs) estimator.set_batch_size(batch_size) estimator.set_validation_split(0.1)
model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(10)) model.add(Activation('softmax')) # Compile model adam = Adam() model.compile(loss='categorical_crossentropy', optimizer=adam) # Create Spark context conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]') sc = SparkContext(conf=conf) # Build RDD from numpy features and labels df = to_data_frame(sc, X_train, Y_train, categorical=True) test_df = to_data_frame(sc, X_test, Y_test, categorical=True) # Initialize Spark ML Estimator adadelta = elephas_optimizers.Adadelta() estimator = ElephasEstimator(sc, model, nb_epoch=nb_epoch, batch_size=batch_size, optimizer=adadelta, frequency='batch', mode='asynchronous', num_workers=2, verbose=0, validation_split=0.1, categorical=True,
import numpy as np # Change features to right shape def raise_dim(l): x = np.array(l) r = x[0::3].reshape(-1, 2) g = x[1::3].reshape(-1, 2) b = x[2::3].reshape(-1, 2) return np.expand_dims(np.array([r, g, b]), axis=0) f2 = np.array([raise_dim(x) for x in features]) lb_df = to_data_frame(sc, f2, labels, True) lb_df.printSchema() lb_df.show() ## Turn back mllib vectors #from pyspark.mllib import #to_mllib_vectors = F.udf(lambda x: #scaled_df.show() # Split in to train/test splits = lb_df.randomSplit([0.8, 0.2], 314) train_df = splits[0] test_df = splits[1] print(train_df.count())