def predictMain(modelName, sc):
    """Train and evaluate a distributed LSTM disk-failure model for one disk model.

    Reads a year/month of SMART CSV data (paths built from the DATA_FILE_PATH /
    MODEL_CSV_FILEPATH environment variables plus sys.argv[1]/sys.argv[2]),
    trains an Elephas SparkModel until its evaluation score exceeds 0.5, saves
    the weights under GATOR_SQUAD_HOME, and prints a confusion matrix.

    Parameters
    ----------
    modelName : str
        Disk model identifier, e.g. "ST4000DM000"; selects the per-model CSV.
    sc : pyspark.SparkContext
        Active Spark context used for RDD conversion and distributed training.
    """
    timeSteps = 30  # number of past values used per training window
    print("Going to Initialize the LSTM model")
    SMARTparameters = getSMARTParameters()
    print("The following are the SMART parameters:", SMARTparameters)

    # Initializing the disk-prediction model (LSTM model)
    lstm = ls.cloudLSTM(timeSteps=timeSteps, parms=SMARTparameters)
    print("Initialized the Model")
    lstmModel = lstm.get_LSTM_Model()  # Keras model handed to SparkModel below

    trainSize = 0.2  # fraction of input used for training purposes
    acc = 0.0        # model accuracy

    # Build the input glob, e.g. "/home/user/Desktop/Cloud/Test/2014/2014-11*.csv"
    inputFilePath = os.environ.get('DATA_FILE_PATH')
    year = sys.argv[1]   # year from the command-line arguments
    month = sys.argv[2]  # month from the command-line arguments
    inputFilePath = inputFilePath + str(year) + "/" + str(year) + "-" + str(month) + "*.csv"
    print("InputPath", inputFilePath)
    rd.generate_DataFrame(inputFilePath, SMARTparameters)

    # Per-disk-model CSV, e.g. "/hadoop/elephas/Output/ST4000DM000.csv"
    inputCSVFilePath = os.environ.get('MODEL_CSV_FILEPATH') + str(modelName) + ".csv"
    modelFeatures = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=SMARTparameters)
    modelLabel = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=['failure'])

    # Removing Not-A-Number values from the input dataframes
    modelFeatures = modelFeatures.fillna(0)
    modelLabel = modelLabel.fillna(0)

    # Obtaining 3D training and testing vectors
    (feature_train, label_train), (feature_test, label_test) = \
        lstm.train_test_split(modelFeatures, modelLabel, trainSize, timeSteps)

    # Nothing to learn from when the data contains no failure cases
    if len(feature_train) == 0:
        print("DiskModel has no failure elements. Training of the model cannot proceed!!")
        return

    # Initializing the Adam optimizer for Elephas
    adam = elephas_optimizers.Adam()
    print("Adam Optimizer initialized")

    # Converting the training dataframes into a Spark RDD
    rddataset = to_simple_rdd(sc, feature_train, label_train)
    print("Training data converted into Resilient Distributed Dataset")

    # SparkModel with optimizer, master-worker mode and number of workers
    spark_model = SparkModel(sc, lstmModel, optimizer=adam,
                             frequency='epoch', mode='asynchronous', num_workers=2)
    print("Spark Model Initialized")

    # Initial training run, then keep retraining until the evaluation score
    # clears 0.5.
    # NOTE(review): if the model never improves, this loop does not terminate —
    # consider bounding the number of retraining rounds.
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
    score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)
    while score <= 0.5:
        spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
        print("LSTM model training done !!")
        score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    # Persist the learned weights under GATOR_SQUAD_HOME/Weights/<year>/<month>/
    print("Saving weights!!")
    outFilePath = os.environ.get('GATOR_SQUAD_HOME')
    outFilePath = (outFilePath + "Weights/" + str(year) + "/" + str(month) + "/" +
                   str(modelName) + "_my_model_weights.h5")
    spark_model.save_weights(outFilePath)

    # Test phase: predicted vs. actual labels as a confusion matrix
    print("LSTM model testing commencing !!")
    predicted1 = spark_model.predict_classes(feature_test)
    df_confusion = pd.crosstab(label_test.flatten(), predicted1.flatten(),
                               rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
from elephas.mllib.adapter import to_vector, from_vector
from pyspark import SparkContext, SparkConf

# Create Spark context
conf = SparkConf().setAppName('LSTM_Spark_MLP')
sc = SparkContext(conf=conf)


def deal_x(x):
    """Turn one parsed CSV row into a single LSTM input sample.

    Drops the first field (presumably an id/label column — verify against the
    z2.csv format) and reshapes the remaining values to (n-1, 1): one feature
    per timestep.
    """
    x = np.array(x)
    x_data = x[1:]
    # axis=1 yields shape (n-1, 1). The original used axis=2, which modern
    # NumPy rejects with AxisError for a 1-D array; older NumPy silently
    # clipped the out-of-range axis to the same (n-1, 1) result, so axis=1
    # preserves the original behaviour while fixing the crash.
    x_data = np.expand_dims(x_data, axis=1)
    return x_data


# Each line of the CSV becomes one prediction sample in the RDD
test_data = sc.textFile("output/data/z2.csv").map(
    lambda ln: deal_x([float(x) for x in ln.split(',')]))

# NOTE(review): load_model and elephas_optimizers are used here without a
# visible import — presumably imported earlier in the file; confirm.
model = load_model('model.h5')
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch',
                         mode='synchronous', num_workers=3)

# Test Spark model: write class predictions to the output path
spark_model.predict_classes(test_data, "output/data/prediction")
from elephas.utils.rdd_utils import to_labeled_point
from elephas.utils.rdd_utils import to_simple_rdd

# Convert the (features, labels) training arrays into a Spark RDD.
# NOTE(review): sc, model, features_train/labels_train and
# features_test/labels_test are defined elsewhere in the file — confirm.
lp_rdd = to_simple_rdd(sc, features_train, labels_train)
#print(lp_rdd.take(5))

from elephas.spark_model import SparkModel
from elephas import optimizers as elephas_optimizers

# Distributed training: Adagrad optimizer, asynchronous updates, 8 workers,
# 20 epochs with a 10% validation split.
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='asynchronous', num_workers=8)
spark_model.train(lp_rdd, nb_epoch=20, batch_size=32, verbose=0, validation_split=0.1)
print(spark_model)

# Predict classes on the held-out features and compare against ground truth.
prediction = spark_model.predict_classes(features_test)
print(prediction)
# NOTE(review): taking l[1] suggests each labels_test row is a pair/one-hot
# encoding — confirm the label format against the producer of labels_test.
truth = [l[1] for l in labels_test]

from sklearn.metrics import confusion_matrix
print(confusion_matrix(truth, prediction))