Example #1
def test_spark_ml_model(spark_context):
    # x_train/y_train/x_test/y_test, model, epochs, batch_size and nb_classes
    # are provided by fixtures/setup not shown in this snippet
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    # Evaluate Spark model by evaluating the underlying model
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.precision())
    print(metrics.recall())
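These snippets come from the elephas test suite and example scripts, so they rely on imports and pytest fixtures defined outside the excerpts shown here. A rough sketch of the imports the newer-API examples assume is given below; exact module paths can differ between elephas and Keras releases, and `ModelType` (used in Example #11) is imported from wherever the installed elephas version exposes it.

import numpy as np
import pytest

from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics, RegressionMetrics
from pyspark.sql.types import DoubleType

# Older elephas examples import these from `keras` rather than `tensorflow.keras`
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.activations import relu

from elephas.ml_model import ElephasEstimator, ElephasTransformer, load_ml_transformer
from elephas.ml import adapter
from elephas.ml.adapter import to_data_frame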
Example #2
def test_regression_model(spark_context, regression_model,
                          boston_housing_dataset):
    batch_size = 64
    epochs = 10

    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    test_df = to_data_frame(spark_context, x_test, y_test)

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(regression_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_observations = pnl.rdd.map(lambda row:
                                              (row.label, row.prediction))
    metrics = RegressionMetrics(prediction_and_observations)
    print(metrics.r2)
Example #3
def test_custom_objects(spark_context, boston_housing_dataset):
    def custom_activation(x):
        return 2 * relu(x)

    model = Sequential()
    model.add(Dense(64, input_shape=(13, )))
    model.add(Dense(64, activation=custom_activation))
    model.add(Dense(1, activation='linear'))
    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    test_df = to_data_frame(spark_context, x_test, y_test)

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(10)
    estimator.set_batch_size(32)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)
    estimator.set_custom_objects({'custom_activation': custom_activation})

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
Example #4
def test_functional_model(spark_context, classification_model_functional,
                          mnist_data):
    batch_size = 64
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD()
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model_functional.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
Example #5
def test_to_data_frame(spark_context):
    features = np.ones((2, 10))
    labels = np.asarray([[2.0], [1.0]])

    data_frame = adapter.to_data_frame(
        spark_context, features, labels, categorical=False)
    assert data_frame.count() == 2
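For reference, the DataFrame returned by `to_data_frame` carries the data in `features` and `label` columns, which is what the `select("label", "prediction")` calls and the column renames in Example #12 rely on. A quick way to inspect it, assuming that schema:

    data_frame.printSchema()  # expected: features (vector), label (double)
    data_frame.show()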
Example #6
def test_spark_ml_model_classification(spark_context, classification_model,
                                       mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    # Evaluate Spark model by evaluating the underlying model
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    # the prediction in a multiclass classification problem is a probability vector, so we take its argmax;
    # casting to a double is only needed so that MulticlassMetrics accepts the column
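    # note: `argmax` below is a column-level helper imported/defined elsewhere in the test module;
    # it returns the index of the largest entry of the prediction vector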
    pnl = pnl.select(
        'label',
        argmax('prediction').astype(DoubleType()).alias('prediction'))
    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
Example #7
def test_batch_predict_classes_probability(spark_context, classification_model,
                                           mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    fitted_pipeline = estimator.fit(df)

    results = fitted_pipeline.transform(test_df)

    # Set inference batch size and do transform again on the same test_df
    inference_batch_size = int(len(y_test) / 10)
    fitted_pipeline.set_params(inference_batch_size=inference_batch_size)
    fitted_pipeline.set_params(outputCol="prediction_via_batch_inference")
    results_with_batch_prediction = fitted_pipeline.transform(results)
    # we should have an array of 10 elements in the prediction column, since we have 10 classes
    # and therefore 10 probabilities
    results_np = results_with_batch_prediction.take(1)[0]
    assert len(results_np.prediction) == 10
    assert len(results_np.prediction_via_batch_inference) == 10
    assert np.array_equal(results_np.prediction,
                          results_np.prediction_via_batch_inference)
Example #8
def test_df_to_simple_rdd(spark_context):
    features = np.ones((2, 10))
    labels = np.asarray([[2.0], [1.0]]).reshape((2,))

    data_frame = adapter.to_data_frame(
        spark_context, features, labels, categorical=False)

    rdd = adapter.df_to_simple_rdd(data_frame, False)
    assert rdd.count() == 2
Example #9
def test_from_data_frame_cat(spark_context):
    features = np.ones((2, 10))
    labels = np.asarray([[0, 0, 1.0], [0, 1.0, 0]])

    data_frame = adapter.to_data_frame(
        spark_context, features, labels, categorical=True)

    x, y = adapter.from_data_frame(data_frame, categorical=True, nb_classes=3)
    assert features.shape == x.shape
    assert labels.shape == y.shape
Example #10
def test_from_data_frame(spark_context):
    features = np.ones((2, 10))
    labels = np.asarray([[2.0], [1.0]]).reshape((2,))

    data_frame = adapter.to_data_frame(
        spark_context, features, labels, categorical=False)

    x, y = adapter.from_data_frame(data_frame, categorical=False)
    assert features.shape == x.shape
    assert labels.shape == y.shape
Example #11
def test_serialization_transformer_and_predict(spark_context,
                                               classification_model,
                                               mnist_data):
    _, _, x_test, y_test = mnist_data
    df = to_data_frame(spark_context, x_test, y_test, categorical=True)
    transformer = ElephasTransformer(
        weights=classification_model.get_weights(),
        model_type=ModelType.CLASSIFICATION)
    transformer.set_keras_model_config(classification_model.to_yaml())
    transformer.save("test.h5")
    loaded_transformer = load_ml_transformer("test.h5")
    loaded_transformer.transform(df)
Example #12
def test_set_cols_deprecated(spark_context, regression_model,
                             boston_housing_dataset):
    with pytest.deprecated_call():
        batch_size = 64
        epochs = 10

        x_train, y_train, x_test, y_test = boston_housing_dataset
        df = to_data_frame(spark_context, x_train, y_train)
        df = df.withColumnRenamed('features', 'scaled_features')
        df = df.withColumnRenamed('label', 'ground_truth')
        test_df = to_data_frame(spark_context, x_test, y_test)
        test_df = test_df.withColumnRenamed('features', 'scaled_features')
        test_df = test_df.withColumnRenamed('label', 'ground_truth')

        sgd = optimizers.SGD(lr=0.00001)
        sgd_conf = optimizers.serialize(sgd)
        estimator = ElephasEstimator()
        estimator.set_keras_model_config(regression_model.to_yaml())
        estimator.set_optimizer_config(sgd_conf)
        estimator.setFeaturesCol('scaled_features')
        estimator.setOutputCol('output')
        estimator.setLabelCol('ground_truth')
        estimator.set_mode("synchronous")
        estimator.set_loss("mae")
        estimator.set_metrics(['mae'])
        estimator.set_epochs(epochs)
        estimator.set_batch_size(batch_size)
        estimator.set_validation_split(0.01)
        estimator.set_categorical_labels(False)

        pipeline = Pipeline(stages=[estimator])
        fitted_pipeline = pipeline.fit(df)
        prediction = fitted_pipeline.transform(test_df)
        pnl = prediction.select("ground_truth", "output")
        pnl.show(100)

        prediction_and_observations = pnl.rdd.map(
            lambda row: (row['ground_truth'], row['output']))
        metrics = RegressionMetrics(prediction_and_observations)
        print(metrics.r2)
Example #13
def test_predict_classes_probability(spark_context, classification_model,
                                     mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_predict_classes(False)
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    results = fitted_pipeline.transform(test_df)
    # we should have an array of 10 elements in the prediction column, since we have 10 classes
    # and therefore 10 probabilities
    assert len(results.take(1)[0].prediction) == 10
Example #14
print(x_test.shape[0], 'test samples')


model = Sequential()
model.add(Dense(64, input_shape=(x_train.shape[1],)))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(1))

# Create Spark context
conf = SparkConf().setAppName('BostonHousing_Spark_MLP').setMaster('local[*]')
sc = SparkContext(conf=conf)

# Build DataFrames from numpy features and labels
df = to_data_frame(sc, x_train, y_train)
test_df = to_data_frame(sc, x_test, y_test)

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
sgd_conf = optimizers.serialize(sgd)

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("mae")
estimator.set_metrics(['mse'])
estimator.set_epochs(epochs)
estimator.set_batch_size(batch_size)
estimator.set_validation_split(0.1)
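The script above stops after configuring the estimator. A sketch of how it would typically continue, mirroring the regression tests earlier on this page (`epochs`, `batch_size`, and the train/test arrays are assumed to be defined earlier in the script):

estimator.set_categorical_labels(False)  # regression labels, as in Example #2

# Fit the pipeline on the training DataFrame and score the held-out DataFrame
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)
prediction = fitted_pipeline.transform(test_df)

pnl = prediction.select("label", "prediction")
prediction_and_observations = pnl.rdd.map(lambda row: (row.label, row.prediction))
metrics = RegressionMetrics(prediction_and_observations)
print(metrics.r2)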
Example #15
# NOTE: this example targets the legacy Keras 0.x / early elephas APIs
# (positional Dense(input_dim, output_dim), ElephasEstimator(sc, model, ...))
model.add(Dense(128, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 10))
model.add(Activation('softmax'))

# Compile model
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build DataFrames from numpy features and labels
df = to_data_frame(sc, X_train, Y_train, categorical=True)
test_df = to_data_frame(sc, X_test, Y_test, categorical=True)

# Initialize Spark ML Estimator
estimator = ElephasEstimator(sc, model, nb_epoch=nb_epoch, batch_size=batch_size,
                             verbose=0, validation_split=0.1, num_workers=8,
                             categorical=True, nb_classes=nb_classes)

# Fitting a model returns a Transformer
fitted_model = estimator.fit(df)

# Evaluate Spark model by evaluating the underlying model
prediction = fitted_model.transform(df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

prediction_and_label = pnl.map(lambda row: (row.label, row.prediction))
Example #16
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))


# Compile model
adam = Adam()
model.compile(loss='categorical_crossentropy', optimizer=adam)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build DataFrames from numpy features and labels
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

# Initialize Spark ML Estimator
adadelta = elephas_optimizers.Adadelta()
estimator = ElephasEstimator(sc,
                             model,
                             nb_epoch=nb_epoch,
                             batch_size=batch_size,
                             optimizer=adadelta,
                             frequency='batch',
                             mode='asynchronous',
                             num_workers=2,
                             verbose=0,
                             validation_split=0.1,
                             categorical=True,
                             nb_classes=nb_classes)  # remaining arguments truncated in the original snippet; nb_classes is assumed
Example #17
model = Sequential()
model.add(Dense(128, input_dim=784))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build DataFrames from numpy features and labels
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
sgd_conf = optimizers.serialize(sgd)

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
estimator.set_metrics(['acc'])
estimator.set_epochs(epochs)
estimator.set_batch_size(batch_size)
estimator.set_validation_split(0.1)
Example #18
File: ml_mlp.py  Project: nkhuyu/elephas
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

# Compile model
adam = Adam()
model.compile(loss='categorical_crossentropy', optimizer=adam)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build DataFrames from numpy features and labels
df = to_data_frame(sc, X_train, Y_train, categorical=True)
test_df = to_data_frame(sc, X_test, Y_test, categorical=True)

# Initialize Spark ML Estimator
adadelta = elephas_optimizers.Adadelta()
estimator = ElephasEstimator(sc,
                             model,
                             nb_epoch=nb_epoch,
                             batch_size=batch_size,
                             optimizer=adadelta,
                             frequency='batch',
                             mode='asynchronous',
                             num_workers=2,
                             verbose=0,
                             validation_split=0.1,
                             categorical=True,
                             nb_classes=nb_classes)  # remaining arguments truncated in the original snippet; nb_classes is assumed
Example #19
import numpy as np


# Change features to the right shape: split a flat, interleaved-RGB pixel list
# into its three channels and return a channels-first array of shape (1, 3, H, 2)
def raise_dim(flat_pixels):
    x = np.array(flat_pixels)
    r = x[0::3].reshape(-1, 2)
    g = x[1::3].reshape(-1, 2)
    b = x[2::3].reshape(-1, 2)
    return np.expand_dims(np.array([r, g, b]), axis=0)


f2 = np.array([raise_dim(x) for x in features])

lb_df = to_data_frame(sc, f2, labels, True)
lb_df.printSchema()
lb_df.show()

## Turn back mllib vectors
#from pyspark.mllib import
#to_mllib_vectors = F.udf(lambda x:

#scaled_df.show()

# Split into train/test sets
splits = lb_df.randomSplit([0.8, 0.2], 314)
train_df = splits[0]
test_df = splits[1]

print(train_df.count())
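From here the script would typically hand `train_df` to an ElephasEstimator and evaluate on `test_df`, in the same way as the earlier examples. A minimal sketch under that assumption (the Keras `model`, `nb_classes`, and the training settings are placeholders defined elsewhere in the script):

sgd_conf = optimizers.serialize(optimizers.SGD(lr=0.01))

estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())  # placeholder: Keras model built elsewhere
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
estimator.set_metrics(['acc'])
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)  # placeholder: number of label classes

fitted_pipeline = Pipeline(stages=[estimator]).fit(train_df)
predictions = fitted_pipeline.transform(test_df)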