import os
import sys

import pandas as pd

# Legacy (0.x) Elephas API: optimizer objects and SparkModel(sc, model, ...).
from elephas import optimizers as elephas_optimizers
from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd

# `ls` (cloudLSTM wrapper), `rd` (data-frame generator) and getSMARTParameters()
# are project-local modules assumed to be importable alongside this file.


def predictMain(modelName, sc):
    timeSteps = 30  # Number of past values used for training
    print("Going to initialize the LSTM model")
    SMARTparameters = getSMARTParameters()
    print("The following are the SMART parameters:", SMARTparameters)

    # Initializing the disk-prediction (LSTM) model
    lstm = ls.cloudLSTM(timeSteps=timeSteps, parms=SMARTparameters)
    print("Initialized the model")
    lstmModel = lstm.get_LSTM_Model()  # Keras model used to initialize the SparkModel class
    trainSize = 0.2  # Fraction of the input used for training
    acc = 0.0  # Model accuracy

    inputFilePath = os.environ.get('DATA_FILE_PATH')  # Input CSV directory from the environment
    year = sys.argv[1]   # Year from the command-line arguments
    month = sys.argv[2]  # Month from the command-line arguments
    # E.g. "/home/user/Desktop/Cloud/Test/2014/2014-11*.csv"
    inputFilePath = inputFilePath + str(year) + "/" + str(year) + "-" + str(month) + "*.csv"
    print("InputPath", inputFilePath)
    rd.generate_DataFrame(inputFilePath, SMARTparameters)

    # E.g. "/hadoop/elephas/Output/ST4000DM000.csv"
    inputCSVFilePath = os.environ.get('MODEL_CSV_FILEPATH') + str(modelName) + ".csv"
    modelFeatures = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=SMARTparameters)
    modelLabel = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=['failure'])

    # Removing NaN values from the input dataframes
    modelFeatures = modelFeatures.fillna(0)
    modelLabel = modelLabel.fillna(0)

    # Obtaining 3D training and testing vectors
    (feature_train, label_train), (feature_test, label_test) = lstm.train_test_split(
        modelFeatures, modelLabel, trainSize, timeSteps)

    # Check whether any failure cases exist in the data
    if len(feature_train) == 0:
        print("Disk model has no failure elements. Training of the model cannot proceed!")
        return

    # Initializing the Adam optimizer for Elephas
    adam = elephas_optimizers.Adam()
    print("Adam optimizer initialized")

    # Converting the training data into a Spark RDD
    rddataset = to_simple_rdd(sc, feature_train, label_train)
    print("Training data converted into a Resilient Distributed Dataset")

    # Initializing the SparkModel with optimizer, update frequency, mode and number of workers
    spark_model = SparkModel(sc, lstmModel, optimizer=adam,
                             frequency='epoch', mode='asynchronous', num_workers=2)
    print("Spark model initialized")

    # Initial training run of the model
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)

    # Retrain until the evaluation score exceeds 0.5, then save the weights
    score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)
    while score <= 0.5:
        spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
        print("LSTM model training done!")
        score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    print("Saving weights!")
    outFilePath = os.environ.get('GATOR_SQUAD_HOME')
    outFilePath = outFilePath + "Weights/" + str(year) + "/" + str(month) + "/" + str(modelName) + "_my_model_weights.h5"
    spark_model.save_weights(outFilePath)

    print("LSTM model testing commencing!")
    predicted1 = spark_model.predict_classes(feature_test)
    df_confusion = pd.crosstab(label_test.flatten(), predicted1.flatten(),
                               rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
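# --- Hypothetical driver (not in the original source): predictMain() expects a
# live SparkContext, the DATA_FILE_PATH / MODEL_CSV_FILEPATH / GATOR_SQUAD_HOME
# environment variables, and year/month as command-line arguments. A minimal
# sketch of how it might be invoked; the app name is an illustrative assumption.
from pyspark import SparkContext, SparkConf

if __name__ == '__main__':
    # e.g. run as: spark-submit predict.py 2014 11
    conf = SparkConf().setAppName('DiskFailurePrediction')
    sc = SparkContext(conf=conf)
    predictMain('ST4000DM000', sc)  # model name taken from the example paths above
    sc.stop()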
# Shared imports for the three tests below; the tensorflow.keras paths are an
# assumption and may be plain `keras` in older setups.
import random
from math import isclose

import numpy as np
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import sigmoid

from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd


def test_training_classification(spark_context, mode, parameter_server_mode, mnist_data, classification_model):
    # Define basic parameters
    batch_size = 64
    epochs = 10

    # Load data
    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]

    sgd = SGD(lr=0.1)
    classification_model.compile(sgd, 'categorical_crossentropy', ['acc'])

    # Build RDD from numpy features and labels
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Initialize SparkModel from Keras model and Spark context
    spark_model = SparkModel(classification_model,
                             frequency='epoch',
                             mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=4000 + random.randint(0, 500))

    # Train Spark model
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.1)

    # Run inference on the trained Spark model
    predictions = spark_model.predict(x_test)
    # Run evaluation on the trained Spark model
    evals = spark_model.evaluate(x_test, y_test)

    # Assert we get the same prediction results when supplying an RDD as when supplying a numpy array
    test_rdd = spark_context.parallelize(x_test)
    assert [np.argmax(x) for x in predictions] == [np.argmax(x) for x in spark_model.predict(test_rdd)]

    # Assert we get the same prediction results when calling predict on the Keras model directly
    assert [np.argmax(x) for x in predictions] == [
        np.argmax(x) for x in spark_model.master_network.predict(x_test)
    ]

    # Assert we get the same evaluation results when calling evaluate on the Keras model directly
    assert isclose(evals[0], spark_model.master_network.evaluate(x_test, y_test)[0], abs_tol=0.01)
    assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=0.01)
def test_training_regression(spark_context, mode, parameter_server_mode, boston_housing_dataset, regression_model):
    x_train, y_train, x_test, y_test = boston_housing_dataset
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Define basic parameters
    batch_size = 64
    epochs = 10
    sgd = SGD(lr=0.0000001)
    regression_model.compile(sgd, 'mse', ['mae'])
    spark_model = SparkModel(regression_model,
                             frequency='epoch',
                             mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=4000 + random.randint(0, 500))

    # Train Spark model
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.1)

    # Run inference on the trained Spark model
    predictions = spark_model.predict(x_test)
    # Run evaluation on the trained Spark model
    evals = spark_model.evaluate(x_test, y_test)

    # Assert we get the same prediction results when supplying an RDD as when supplying a numpy array
    test_rdd = spark_context.parallelize(x_test)
    assert all(np.isclose(x, y, 0.01) for x, y in zip(predictions, spark_model.predict(test_rdd)))

    # Assert we get the same prediction results when calling predict on the Keras model directly
    assert all(np.isclose(x, y, 0.01) for x, y in zip(predictions, spark_model.master_network.predict(x_test)))

    # Assert we get the same evaluation results when calling evaluate on the Keras model directly
    assert isclose(evals[0], spark_model.master_network.evaluate(x_test, y_test)[0], abs_tol=0.01)
    assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=0.01)
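# --- Hedged sketch (not in the original source) of the pytest fixtures the two
# tests above rely on. Fixture names come from the test signatures; the bodies
# and parameter values are assumptions about what a conftest.py might contain.
import pytest
from pyspark import SparkContext, SparkConf


@pytest.fixture(scope='session')
def spark_context():
    conf = SparkConf().setMaster('local[*]').setAppName('elephas-tests')
    sc = SparkContext(conf=conf)
    yield sc
    sc.stop()


@pytest.fixture(params=['synchronous', 'asynchronous'])
def mode(request):
    return request.param


@pytest.fixture(params=['http', 'socket'])
def parameter_server_mode(request):
    return request.param

# mnist_data, classification_model, boston_housing_dataset and regression_model
# would be defined similarly as data- and model-building fixtures.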
def test_training_custom_activation(mode, spark_context):
    def custom_activation(x):
        return sigmoid(x) + 1

    model = Sequential()
    model.add(Dense(1, input_dim=1, activation=custom_activation))
    model.add(Dense(1, activation='sigmoid'))
    sgd = SGD(lr=0.1)
    model.compile(sgd, 'binary_crossentropy', ['acc'])

    x_train = np.random.rand(1000)
    y_train = np.zeros(1000)
    x_test = np.random.rand(100)
    y_test = np.zeros(100)
    y_train[:500] = 1

    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # The custom activation must be registered via custom_objects so that the
    # workers can deserialize the model
    spark_model = SparkModel(model,
                             frequency='epoch',
                             mode=mode,
                             custom_objects={'custom_activation': custom_activation})
    spark_model.fit(rdd, epochs=1, batch_size=16, verbose=0, validation_split=0.1)

    # Smoke checks: prediction and evaluation should run and return results
    assert spark_model.predict(x_test)
    assert spark_model.evaluate(x_test, y_test)
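# --- Sketch (not in the original source): the MNIST script below uses `model`
# and `sc` without defining them. A minimal, illustrative setup is shown here;
# the network architecture and Spark configuration are assumptions.
from pyspark import SparkContext, SparkConf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

conf = SparkConf().setMaster('local[*]').setAppName('mnist_elephas')
sc = SparkContext(conf=conf)

# Simple MLP over flattened 28x28 MNIST images
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(784,)))
model.add(Dropout(0.2))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])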
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd

# `model` (a compiled Keras network) and `sc` (a SparkContext) are assumed to be
# defined before this point; see the sketch above.

(x_train, y_train), (x_test, y_test) = mnist.load_data()
print("length = (", len(x_train), ",", len(y_train), ")")
print("shape of the dataset =", tf.shape(y_train))

# Flatten the 28x28 images into 784-dimensional vectors and scale to [0, 1]
x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype("float32")
x_test = x_test.astype("float32")
x_train /= 255
x_test /= 255
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

nb_classes = 10
# Convert class vectors to binary class matrices
y_train = to_categorical(y_train, nb_classes)
y_test = to_categorical(y_test, nb_classes)

rdd = to_simple_rdd(sc, x_train, y_train)
print("rdd =", rdd)

spark_model = SparkModel(model, frequency='epoch', mode='asynchronous', num_workers=2)
spark_model.fit(rdd, epochs=10, batch_size=32, verbose=0, validation_split=0.1)

score = spark_model.evaluate(x_test, y_test, verbose=2)
print('Test accuracy:', score)  # evaluate() returns [loss, accuracy] when an accuracy metric is compiled in
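# --- Follow-up sketch (not in the original source): the trained weights live in
# spark_model.master_network, a plain Keras model (as the tests above also use),
# so standard Keras persistence applies. The file name is illustrative.
spark_model.master_network.save('mnist_mlp_elephas.h5')

# Later, reload without Spark for serving or offline inference:
# from tensorflow.keras.models import load_model
# model = load_model('mnist_mlp_elephas.h5')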