예제 #1
0
def predictMain(modelName, sc):
    """Train the LSTM disk-failure model through Elephas and save its weights.

    The year and month are read from ``sys.argv[1]`` and ``sys.argv[2]``.
    Input locations come from the ``DATA_FILE_PATH`` and
    ``MODEL_CSV_FILEPATH`` environment variables; the weights are written
    below ``GATOR_SQUAD_HOME``. After training, a confusion matrix of the
    test labels against the predicted classes is printed.

    Args:
        modelName: Name used to build the per-model CSV filename and the
            weights filename (the original comments give "ST4000DM000"
            as an example).
        sc: Passed to ``to_simple_rdd`` and ``SparkModel`` (presumably the
            SparkContext -- confirm against the caller).

    Returns:
        None. Returns early, before any training, when the training split
        is empty.
    """
    # Number of past values handed to the model and to the train/test split.
    timeSteps = 30
    print("Going to Initialize the LSTM model")
    SMARTparameters = getSMARTParameters()
    print("The following are the SMART parameters:",SMARTparameters)
    # Initializing the DiskPrediction model (LSTM model).
    lstm = ls.cloudLSTM(timeSteps=timeSteps, parms=SMARTparameters)
    print("Initialized the Model")
    # The underlying LSTM model is what the SparkModel wrapper is built from.
    lstmModel = lstm.get_LSTM_Model()
    # Passed to lstm.train_test_split as the training fraction.
    trainSize = 0.2

    # NOTE(review): os.environ.get returns None when a variable is unset, in
    # which case the string concatenations below raise TypeError.
    inputFilePath = os.environ.get('DATA_FILE_PATH')
    year = sys.argv[1]
    month = sys.argv[2]
    # For E.g "/home/user/Desktop/Cloud/Test/2014/2014-11*.csv"
    inputFilePath = inputFilePath + str(year) + "/" + str(year) + "-" + str(month) + "*.csv"
    print("InputPath",inputFilePath)
    rd.generate_DataFrame(inputFilePath, SMARTparameters)
    # For E.g "/hadoop/elephas/Output/ST4000DM000.csv"
    inputCSVFilePath = os.environ.get('MODEL_CSV_FILEPATH') + str(modelName) + ".csv"

    # Features and label are read from the same per-model CSV file.
    modelFeatures = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=SMARTparameters)
    modelLabel = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=['failure'])

    # Replace Not-a-Number values in the input dataframes with 0.
    modelFeatures = modelFeatures.fillna(0)
    modelLabel = modelLabel.fillna(0)

    # Obtain the training and testing vectors.
    (feature_train, label_train), (feature_test, label_test) = lstm.train_test_split(
        modelFeatures, modelLabel, trainSize, timeSteps)

    # An empty training split is reported as "no failure elements"; nothing
    # can be trained in that case.
    if len(feature_train) == 0:
        print("DiskModel has no failure eleements. Training of the model cannot proceed!!")
        return

    # Initializing the Adam optimizer for Elephas.
    adam = elephas_optimizers.Adam()
    print("Adam Optimizer initialized")
    # Converting the training data to a Spark RDD.
    rddataset = to_simple_rdd(sc, feature_train, label_train)
    print("Training data converted into Resilient Distributed Dataset")
    # Initializing the SparkModel with optimizer, mode and number of workers.
    spark_model = SparkModel(sc, lstmModel, optimizer=adam, frequency='epoch',
                             mode='asynchronous', num_workers=2)
    print("Spark Model Initialized")
    # Initial training run of the model.
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
    # Evaluate on the held-out split; the result drives the retraining loop.
    score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    # NOTE(review): this retrains until the score exceeds 0.5 and has no
    # iteration cap; it also assumes evaluate() returns a value comparable
    # with a float -- confirm against the Elephas version in use.
    while score <= 0.5:
        spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
        print("LSTM model training done !!")
        score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    print("Saving weights!!")
    outFilePath = os.environ.get('GATOR_SQUAD_HOME')
    outFilePath = (outFilePath + "Weights/" + str(year) + "/" + str(month) + "/"
                   + str(modelName) + "_my_model_weights.h5")
    spark_model.save_weights(outFilePath)

    print("LSTM model testing commencing !!")
    predicted1 = spark_model.predict_classes(feature_test)
    # Cross-tabulate actual against predicted labels, with margin totals.
    df_confusion = pd.crosstab(label_test.flatten(), predicted1.flatten(),
                               rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
예제 #2
0
def test_training_classification(spark_context, mode, parameter_server_mode,
                                 mnist_data, classification_model):
    """Train a SparkModel on a classification task and compare it with Keras.

    The model is fitted on the first 1000 training samples, then its
    predictions and evaluation results are compared against predictions
    made from an RDD and against the underlying master network.
    """

    def predicted_labels(outputs):
        # Index of the largest entry in each prediction row.
        return [np.argmax(row) for row in outputs]

    # Basic training parameters, forwarded to fit() below.
    fit_options = dict(epochs=10, batch_size=64, verbose=0, validation_split=0.1)

    # Load the data and keep only the first 1000 training samples.
    x_train, y_train, x_test, y_test = mnist_data
    x_train, y_train = x_train[:1000], y_train[:1000]

    optimizer = SGD(lr=0.1)
    classification_model.compile(optimizer, 'categorical_crossentropy', ['acc'])

    # Build the RDD from the numpy features and labels.
    train_rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Wrap the Keras model; the port is drawn at random from 4000..4500.
    port = 4000 + random.randint(0, 500)
    spark_model = SparkModel(classification_model,
                             frequency='epoch',
                             mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=port)

    spark_model.fit(train_rdd, **fit_options)

    # Inference and evaluation on the trained Spark model.
    predictions = spark_model.predict(x_test)
    evals = spark_model.evaluate(x_test, y_test)

    # Supplying an RDD must give the same predicted classes as a numpy array.
    test_rdd = spark_context.parallelize(x_test)
    assert predicted_labels(predictions) == predicted_labels(
        spark_model.predict(test_rdd))

    # Calling predict on the Keras model directly must agree as well.
    assert predicted_labels(predictions) == predicted_labels(
        spark_model.master_network.predict(x_test))

    # Both evaluation results must match a direct evaluate on the Keras
    # model, which is re-run once per compared entry.
    for metric_index in (0, 1):
        assert isclose(
            evals[metric_index],
            spark_model.master_network.evaluate(x_test, y_test)[metric_index],
            abs_tol=0.01)
예제 #3
0
def test_training_regression(spark_context, mode, parameter_server_mode,
                             boston_housing_dataset, regression_model):
    """Train a SparkModel on a regression task and compare it with Keras.

    Predictions from a numpy array, from an RDD and from the master network
    must agree within a relative tolerance of 0.01, and the evaluation
    results must agree within an absolute tolerance of 0.01.
    """

    def pairwise_close(lhs, rhs):
        # True when every paired prediction is close (rtol=0.01).
        return all(np.isclose(a, b, 0.01) for a, b in zip(lhs, rhs))

    x_train, y_train, x_test, y_test = boston_housing_dataset
    train_rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Basic training parameters, forwarded to fit() below.
    fit_options = dict(epochs=10, batch_size=64, verbose=0, validation_split=0.1)

    optimizer = SGD(lr=0.0000001)
    regression_model.compile(optimizer, 'mse', ['mae'])

    # Wrap the Keras model; the port is drawn at random from 4000..4500.
    port = 4000 + random.randint(0, 500)
    spark_model = SparkModel(regression_model,
                             frequency='epoch',
                             mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=port)

    spark_model.fit(train_rdd, **fit_options)

    # Inference and evaluation on the trained Spark model.
    predictions = spark_model.predict(x_test)
    evals = spark_model.evaluate(x_test, y_test)

    # Supplying an RDD must give the same predictions as a numpy array.
    test_rdd = spark_context.parallelize(x_test)
    assert pairwise_close(predictions, spark_model.predict(test_rdd))

    # Calling predict on the Keras model directly must agree as well.
    assert pairwise_close(predictions,
                          spark_model.master_network.predict(x_test))

    # Both evaluation results must match a direct evaluate on the Keras
    # model, which is re-run once per compared entry.
    for metric_index in (0, 1):
        assert isclose(
            evals[metric_index],
            spark_model.master_network.evaluate(x_test, y_test)[metric_index],
            abs_tol=0.01)
예제 #4
0
def test_training_custom_activation(mode, spark_context):
    """Check that a SparkModel trains with a user-defined activation.

    The activation function is handed to SparkModel through
    ``custom_objects``; the test then asserts that predict and evaluate
    return truthy results.
    """

    def custom_activation(x):
        # Sigmoid shifted up by one.
        return sigmoid(x) + 1

    # Two single-unit dense layers; the first uses the custom activation.
    net = Sequential()
    net.add(Dense(1, input_dim=1, activation=custom_activation))
    net.add(Dense(1, activation='sigmoid'))

    optimizer = SGD(lr=0.1)
    net.compile(optimizer, 'binary_crossentropy', ['acc'])

    # Random 1-D features; the first half of the training labels is 1,
    # the rest (and all test labels) are 0.
    x_train = np.random.rand(1000)
    x_test = np.random.rand(100)
    y_train = np.concatenate((np.ones(500), np.zeros(500)))
    y_test = np.zeros(100)

    train_rdd = to_simple_rdd(spark_context, x_train, y_train)

    spark_model = SparkModel(
        net,
        frequency='epoch',
        mode=mode,
        custom_objects={'custom_activation': custom_activation},
    )
    spark_model.fit(train_rdd, epochs=1, batch_size=16, verbose=0,
                    validation_split=0.1)

    predictions = spark_model.predict(x_test)
    assert predictions
    evaluation = spark_model.evaluate(x_test, y_test)
    assert evaluation
import tensorflow as tf
# Load the MNIST splits and report their sizes.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print("length =  ( ", len(x_train), ", ", len(y_train), " )")
print("shape of the dataset = ", tf.shape(y_train))

# Flatten each image to 784 values, cast to float32 and divide by 255.
x_train = x_train.reshape(60000, 784).astype("float32") / 255
x_test = x_test.reshape(10000, 784).astype("float32") / 255
print(len(x_train), 'train samples')
print(len(x_test), 'test samples')

# Convert class vectors to binary class matrices.
nb_classes = 10
y_train, y_test = (to_categorical(y_train, nb_classes),
                   to_categorical(y_test, nb_classes))

# Distribute the training data as an RDD.
rdd = to_simple_rdd(sc, x_train, y_train)
print("rdd = ", rdd)

from elephas.spark_model import SparkModel

# Train asynchronously on two workers, then evaluate on the test split.
spark_model = SparkModel(
    model, frequency='epoch', mode='asynchronous', num_workers=2)
spark_model.fit(
    rdd, epochs=10, batch_size=32, verbose=0, validation_split=0.1)
score = spark_model.evaluate(x_test, y_test, verbose=2)
print('Test accuracy:', score)