def test_set_predict_classes_regression_warning(spark_context, regression_model):
    with pytest.warns(ElephasWarning):
        estimator = ElephasEstimator()
        estimator.set_loss("mae")
        estimator.set_metrics(['mae'])
        estimator.set_categorical_labels(False)
        estimator.set_predict_classes(True)
def test_serialization_estimator():
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_loss("categorical_crossentropy")
    estimator.save("test.h5")
    load_ml_estimator("test.h5")
def test_serialization_estimator(classification_model):
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_loss("categorical_crossentropy")
    estimator.save("test.h5")
    loaded_model = load_ml_estimator("test.h5")
    assert loaded_model.get_model().to_yaml() == classification_model.to_yaml()
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

sgd = optimizers.SGD(lr=0.01)
sgd_conf = optimizers.serialize(sgd)

# Initialize Elephas Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
estimator.set_metrics(['acc'])
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("index_category")
estimator.set_epochs(10)
estimator.set_batch_size(128)
estimator.set_num_workers(1)
estimator.set_verbosity(0)
estimator.set_validation_split(0.15)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Initialize Elephas Spark ML Estimator
adagrad = elephas_optimizers.Adagrad()
estimator = ElephasEstimator()
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("index_category")
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(adagrad.get_config())
estimator.set_nb_epoch(10)
estimator.set_batch_size(128)
estimator.set_num_workers(4)
estimator.set_verbosity(0)
estimator.set_validation_split(0.15)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[string_indexer, scaler, estimator])
fitted_pipeline = pipeline.fit(train_df)
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Initialize Elephas Spark ML Estimator
adagrad = elephas_optimizers.Adagrad()
estimator = ElephasEstimator()
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("index_category")
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(adagrad.get_config())
estimator.set_nb_epoch(10)
estimator.set_batch_size(128)
estimator.set_num_workers(4)
estimator.set_verbosity(0)
estimator.set_validation_split(0.15)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[string_indexer, scaler, estimator])
fitted_pipeline = pipeline.fit(train_df)
elapsed_validation_VectorsCpu = time.clock() - startCpu

start = time.time()
startCpu = time.clock()

input_dim = train_df.select("features").first()[0].shape
logger.info(f"We have {num_classes} classes and {input_dim[0]} features")

model = get_model(train_df, input_dim)
model.compile(optimizer=optimizer, loss=mywloss, metrics=['accuracy'])

nadam = optimizers.nadam(lr=0.01)
opt_conf = optimizers.serialize(nadam)

# Initialize SparkML Estimator and set all relevant properties
estimator = ElephasEstimator()
estimator.setFeaturesCol("features")  # These two come directly from pyspark,
estimator.setLabelCol("target")       # hence the camel case. Sorry :)
estimator.set_keras_model_config(model.to_yaml())  # Provide serialized Keras model
estimator.set_categorical_labels(True)
estimator.set_nb_classes(num_classes)
estimator.set_num_workers(10)  # Adjust the number of workers to your cluster.
estimator.set_epochs(2)  # was max-epochs
estimator.set_batch_size(batch_size)  # was 128
estimator.set_verbosity(2)  # was 1
estimator.set_validation_split(0.15)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode("synchronous")
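# A possible next step (an assumption, not part of the original snippet): fit the
# configured estimator on the `train_df` used above and inspect a few predictions.
from pyspark.ml import Pipeline

fitted_pipeline = Pipeline(stages=[estimator]).fit(train_df)
pred_df = fitted_pipeline.transform(train_df)
pred_df.select("target", "prediction").show(10)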
def test_spark_ml_model_classification(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    # Evaluate Spark model by evaluating the underlying model
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
def test_regression_model(spark_context, regression_model, boston_housing_dataset):
    batch_size = 64
    epochs = 10

    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    test_df = to_data_frame(spark_context, x_test, y_test)

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)

    estimator = ElephasEstimator()
    estimator.set_keras_model_config(regression_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_observations = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = RegressionMetrics(prediction_and_observations)
    print(metrics.r2)
# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

# Initialize Spark ML Estimator
adadelta = elephas_optimizers.Adadelta()
estimator = ElephasEstimator(sc,
                             model,
                             nb_epoch=nb_epoch,
                             batch_size=batch_size,
                             optimizer=adadelta,
                             frequency='batch',
                             mode='asynchronous',
                             num_workers=2,
                             verbose=0,
                             validation_split=0.1,
                             categorical=True,
                             nb_classes=nb_classes)

# Fitting a model returns a Transformer
fitted_model = estimator.fit(df)

# Evaluate Spark model by evaluating the underlying model
prediction = fitted_model.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

prediction_and_label = pnl.map(lambda row: (row.label, row.prediction))
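# A possible evaluation step (an assumption, not part of the original snippet):
# score the (label, prediction) pairs with Spark's MulticlassMetrics.
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(prediction_and_label)
print(metrics.accuracy)  # on older Spark versions use metrics.precision() instead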
adam = Adam()
model.compile(loss='categorical_crossentropy', optimizer=adam)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

# Define elephas optimizer
adadelta = elephas_optimizers.Adadelta()

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(adadelta.get_config())
estimator.set_nb_epoch(nb_epoch)
estimator.set_batch_size(batch_size)
estimator.set_num_workers(1)
estimator.set_verbosity(0)
estimator.set_validation_split(0.1)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)

# Evaluate Spark model by evaluating the underlying model
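# Hypothetical evaluation sketch (assumes the fitted pipeline and `test_df` above):
# transform the test DataFrame and compare labels against predictions.
prediction = fitted_pipeline.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)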
def make_model(data):
    data.show()
    data = data.dropna()
    nb_classes = data.select("label").distinct().count()
    input_dim = len(data.select("features").first()[0])
    print(nb_classes, input_dim)

    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=100))
    # model.add(LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax'))
    # sgd = optimizers.SGD(lr=0.1)
    # model.compile(sgd, 'categorical_crossentropy', ['acc'])
    model.compile(loss='binary_crossentropy', optimizer='adam')
    # model.compile(loss='categorical_crossentropy', optimizer='adam')

    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')

    adam = optimizers.Adam(lr=0.01)
    opt_conf = optimizers.serialize(adam)

    estimator = ElephasEstimator()
    estimator.setFeaturesCol("features")
    estimator.setLabelCol("label")
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)
    estimator.set_num_workers(1)
    estimator.set_epochs(20)
    estimator.set_batch_size(128)
    estimator.set_verbosity(1)
    estimator.set_validation_split(0.15)
    estimator.set_optimizer_config(opt_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    # estimator = ElephasEstimator(model, epochs=20, batch_size=32, frequency='batch', mode='asynchronous', nb_classes=1)

    pipeline = Pipeline(stages=[estimator])
    # fitted_model = estimator.fit(data)
    # prediction = fitted_model.transform(data)
    fitted_pipeline = pipeline.fit(data)  # Fit model to data
    prediction = fitted_pipeline.transform(data)  # Evaluate on train data.
    # prediction = fitted_pipeline.transform(test_df)  # <-- The same code evaluates test data.

    pnl = prediction.select("text", "prediction")
    pnl.show(100)

    # MulticlassMetrics expects numeric (label, prediction) pairs, so evaluate on the
    # "label" column rather than the raw text.
    prediction_and_label = prediction.select("label", "prediction").rdd.map(
        lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.precision())

    pnl = prediction.select("label", "prediction")
    pnl.show(100)
def test_save_pipeline(spark_context, classification_model):
    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(10)
    estimator.set_batch_size(10)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    pipeline.save('tmp')
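# A possible round-trip check (an assumption, not in the original test): a pipeline
# saved this way can be read back with PySpark's standard loader.
from pyspark.ml import Pipeline

loaded_pipeline = Pipeline.load('tmp')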
def test_batch_predict_classes_probability(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    fitted_pipeline = estimator.fit(df)
    results = fitted_pipeline.transform(test_df)

    # Set inference batch size and do transform again on the same test_df
    inference_batch_size = int(len(y_test) / 10)
    fitted_pipeline.set_params(inference_batch_size=inference_batch_size)
    fitted_pipeline.set_params(outputCol="prediction_via_batch_inference")
    results_with_batch_prediction = fitted_pipeline.transform(results)

    # We should have an array of 10 elements in the prediction column, since we have 10 classes
    # and therefore 10 probabilities
    results_np = results_with_batch_prediction.take(1)[0]
    assert len(results_np.prediction) == 10
    assert len(results_np.prediction_via_batch_inference) == 10
    assert np.array_equal(results_np.prediction, results_np.prediction_via_batch_inference)
model.add(Dense(10))
model.add(Activation('softmax'))

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
sgd_conf = optimizers.serialize(sgd)

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
estimator.set_metrics(['acc'])
estimator.set_epochs(epochs)
estimator.set_batch_size(batch_size)
estimator.set_validation_split(0.1)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)
# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
df = to_data_frame(sc, X_train, Y_train, categorical=True)
test_df = to_data_frame(sc, X_test, Y_test, categorical=True)

# Initialize Spark ML Estimator
adadelta = elephas_optimizers.Adadelta()
estimator = ElephasEstimator(sc,
                             model,
                             nb_epoch=nb_epoch,
                             batch_size=batch_size,
                             optimizer=adadelta,
                             frequency='batch',
                             mode='asynchronous',
                             num_workers=2,
                             verbose=0,
                             validation_split=0.1,
                             categorical=True,
                             nb_classes=nb_classes)

# Fitting a model returns a Transformer
fitted_model = estimator.fit(df)

# Evaluate Spark model by evaluating the underlying model
prediction = fitted_model.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

prediction_and_label = pnl.map(lambda row: (row.label, row.prediction))
def test_set_cols(spark_context, regression_model, boston_housing_dataset):
    batch_size = 64
    epochs = 10

    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    df = df.withColumnRenamed('features', 'scaled_features')
    df = df.withColumnRenamed('label', 'ground_truth')
    test_df = to_data_frame(spark_context, x_test, y_test)
    test_df = test_df.withColumnRenamed('features', 'scaled_features')
    test_df = test_df.withColumnRenamed('label', 'ground_truth')

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)

    estimator = ElephasEstimator(labelCol='ground_truth', outputCol='output', featuresCol='scaled_features')
    estimator.set_keras_model_config(regression_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("ground_truth", "output")
    pnl.show(100)

    prediction_and_observations = pnl.rdd.map(
        lambda row: (row['ground_truth'], row['output']))
    metrics = RegressionMetrics(prediction_and_observations)
    print(metrics.r2)
def test_predict_classes_probability(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_predict_classes(False)
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    results = fitted_pipeline.transform(test_df)

    # We should have an array of 10 elements in the prediction column, since we have 10 classes
    # and therefore 10 probabilities
    assert len(results.take(1)[0].prediction) == 10
def test_custom_objects(spark_context, boston_housing_dataset):
    def custom_activation(x):
        return 2 * relu(x)

    model = Sequential()
    model.add(Dense(64, input_shape=(13,)))
    model.add(Dense(64, activation=custom_activation))
    model.add(Dense(1, activation='linear'))

    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    test_df = to_data_frame(spark_context, x_test, y_test)

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)

    estimator = ElephasEstimator()
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(10)
    estimator.set_batch_size(32)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)
    estimator.set_custom_objects({'custom_activation': custom_activation})

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

from elephas.ml_model import ElephasEstimator
from elephas import optimizers as elephas_optimizers

# Define elephas optimizer (which tells the model how to aggregate updates on the Spark master)
adadelta = elephas_optimizers.Adadelta()

# Initialize SparkML Estimator and set all relevant properties
estimator = ElephasEstimator()
estimator.setFeaturesCol("scaled_features")  # These two come directly from pyspark,
estimator.setLabelCol("index_category")      # hence the camel case. Sorry :)
estimator.set_keras_model_config(model.to_yaml())  # Provide serialized Keras model
estimator.set_optimizer_config(adadelta.get_config())  # Provide serialized Elephas optimizer
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(1)  # We just use one worker here. Feel free to adapt it.
estimator.set_nb_epoch(20)
estimator.set_batch_size(128)
estimator.set_verbosity(1)
estimator.set_validation_split(0.15)
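# Hypothetical continuation (an assumption based on the related snippets above,
# which chain a string_indexer and scaler stage with the estimator): fit the full
# preprocessing + training pipeline on `train_df` and obtain a Transformer.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[string_indexer, scaler, estimator])
fitted_pipeline = pipeline.fit(train_df)
prediction = fitted_pipeline.transform(train_df)
prediction.select("index_category", "prediction").show(10)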
def test_functional_model(spark_context, classification_model_functional, mnist_data):
    batch_size = 64
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD()
    sgd_conf = optimizers.serialize(sgd)

    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model_functional.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
model.add(Activation('relu'))
model.add(Dense(1))

# Create Spark context
conf = SparkConf().setAppName('BostonHousing_Spark_MLP').setMaster('local[*]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
df = to_data_frame(sc, x_train, y_train)
test_df = to_data_frame(sc, x_test, y_test)

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
sgd_conf = optimizers.serialize(sgd)

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("mae")
estimator.set_metrics(['mse'])
estimator.set_epochs(epochs)
estimator.set_batch_size(batch_size)
estimator.set_validation_split(0.1)
estimator.set_categorical_labels(False)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)

# Evaluate Spark model by evaluating the underlying model
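# Hypothetical evaluation sketch for the regression pipeline above (an assumption,
# not part of the original snippet): transform the held-out DataFrame and score the
# (label, prediction) pairs with Spark's RegressionMetrics.
from pyspark.mllib.evaluation import RegressionMetrics

prediction = fitted_pipeline.transform(test_df)
pnl = prediction.select("label", "prediction")
prediction_and_observations = pnl.rdd.map(lambda row: (row.label, row.prediction))
metrics = RegressionMetrics(prediction_and_observations)
print(metrics.r2)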