# Imports assumed from the surrounding test module:
from keras import optimizers
from pyspark.ml import Pipeline

from elephas.ml_model import ElephasEstimator
from elephas.ml.adapter import to_data_frame


def test_predict_classes_probability(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    results = fitted_pipeline.transform(test_df)

    # The prediction column should hold an array of 10 elements:
    # one probability per class.
    assert len(results.take(1)[0].prediction) == 10
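
# Not part of the original test -- a minimal sketch of collapsing the
# 10-element probability vector into a hard class label via argmax.
# Assumes the `results` DataFrame from above; the column name
# "predicted_class" is a hypothetical choice for illustration.
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

argmax_udf = udf(lambda probs: int(np.argmax(probs)), IntegerType())
labeled = results.withColumn("predicted_class", argmax_udf("prediction"))
labeled.select("label", "predicted_class").show(5)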
# Build DataFrames from numpy features and labels
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
sgd_conf = optimizers.serialize(sgd)

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
estimator.set_metrics(['acc'])
estimator.set_epochs(epochs)
estimator.set_batch_size(batch_size)
estimator.set_validation_split(0.1)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)

# Evaluate the Spark model by evaluating the underlying Keras model
prediction = fitted_pipeline.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
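
# Continuation sketch (not in the original excerpt): once MulticlassMetrics
# is constructed, aggregate scores can be read off directly. `accuracy` is
# order-invariant, so it is safe even though the pairs above are
# (label, prediction) rather than the (prediction, label) order the
# pyspark docs describe.
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())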
model.compile(loss='categorical_crossentropy', optimizer='adam')

sgd = optimizers.SGD(lr=0.01)
sgd_conf = optimizers.serialize(sgd)

# Initialize Elephas Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
estimator.set_metrics(['acc'])
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("index_category")
estimator.set_epochs(10)
estimator.set_batch_size(128)
estimator.set_num_workers(1)
estimator.set_verbosity(0)
estimator.set_validation_split(0.15)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[string_indexer, scaler, estimator])
fitted_pipeline = pipeline.fit(train_df)

# Evaluate Spark model
prediction = fitted_pipeline.transform(train_df)
pnl = prediction.select("index_category", "prediction")
pnl.show(100)
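
# Sketch of the preprocessing stages the pipeline above references.
# Assumption: the input column names "category" and "features" are
# illustrative; only the output names are fixed by the snippet.
from pyspark.ml.feature import StringIndexer, StandardScaler

string_indexer = StringIndexer(inputCol="category", outputCol="index_category")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")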
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='sgd')

# Model summary
model.summary()

# Initialize Spark ML Estimator and set its parameters
estimator = ElephasEstimator()
estimator.setFeaturesCol("features")
estimator.setLabelCol("label_index")
estimator.set_keras_model_config(model.to_yaml())
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(1)
estimator.set_epochs(25)
estimator.set_batch_size(64)
estimator.set_verbosity(1)
estimator.set_validation_split(0.10)
estimator.set_optimizer_config(sgd)
estimator.set_mode("synchronous")
estimator.set_loss("binary_crossentropy")
estimator.set_metrics(['acc'])

# Create the deep learning pipeline
dl_pipeline = Pipeline(stages=[estimator])
print(dl_pipeline)


def dl_pipeline_fit_score_results(dl_pipeline=dl_pipeline,
                                  train_data=train_data,
                                  test_data=test_data,
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

from pyspark.mllib.evaluation import MulticlassMetrics

# Initialize Elephas Spark ML Estimator
adagrad = elephas_optimizers.Adagrad()

estimator = ElephasEstimator()
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("index_category")
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(adagrad.get_config())
estimator.set_nb_epoch(10)
estimator.set_batch_size(128)
estimator.set_num_workers(4)
estimator.set_verbosity(0)
estimator.set_validation_split(0.15)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[string_indexer, scaler, estimator])
fitted_pipeline = pipeline.fit(train_df)

# Evaluate Spark model
prediction = fitted_pipeline.transform(train_df)
pnl = prediction.select("index_category", "prediction")
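
# Continuation sketch (assumption, mirroring the evaluation pattern in the
# other examples): map the selected columns into (prediction, label) pairs
# and hand them to MulticlassMetrics.
pnl.show(100)
prediction_and_label = pnl.rdd.map(lambda row: (row.prediction, row.index_category))
metrics = MulticlassMetrics(prediction_and_label)
print(metrics.accuracy)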
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build DataFrames from numpy features and labels
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

# Define Elephas optimizer
adadelta = elephas_optimizers.Adadelta()

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(adadelta.get_config())
estimator.set_nb_epoch(nb_epoch)
estimator.set_batch_size(batch_size)
estimator.set_num_workers(1)
estimator.set_verbosity(0)
estimator.set_validation_split(0.1)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)

# Evaluate the Spark model by evaluating the underlying Keras model
prediction = fitted_pipeline.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)
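
# Sketch (not in the original excerpt): a quick DataFrame-only accuracy
# check, assuming the prediction column holds a single class index here.
correct = prediction.filter(prediction.label == prediction.prediction).count()
total = prediction.count()
print("Accuracy: {:.4f}".format(correct / total))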
nadam = optimizers.Nadam(lr=0.01)
opt_conf = optimizers.serialize(nadam)

# Initialize Spark ML Estimator and set all relevant properties
estimator = ElephasEstimator()
estimator.setFeaturesCol("features")  # These two setters come directly from
estimator.setLabelCol("target")       # pyspark, hence the camelCase.
estimator.set_keras_model_config(model.to_yaml())  # Provide serialized Keras model
estimator.set_categorical_labels(True)
estimator.set_nb_classes(num_classes)
estimator.set_num_workers(10)  # Ten workers here; adapt this to your cluster.
estimator.set_epochs(2)
estimator.set_batch_size(batch_size)
estimator.set_verbosity(2)
estimator.set_validation_split(0.15)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode("synchronous")
estimator.set_loss(mywloss)  # Custom loss defined elsewhere, replacing plain categorical_crossentropy
estimator.set_metrics(['accuracy'])

# Note: time.clock() was removed in Python 3.8; on modern interpreters
# use time.process_time() instead.
buildModelElapsed = time.time() - start
buildModelElapseCpu = time.clock() - startCpu

start = time.time()
startCpu = time.clock()

pipeline = Pipeline(stages=[estimator])
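
# Continuation sketch (assumption: train_df exists as in the other
# examples): fit the pipeline and record the training time the same way
# the snippet times model construction.
fitted_pipeline = pipeline.fit(train_df)
trainElapsed = time.time() - start
trainElapsedCpu = time.clock() - startCpu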