Example #1
def predictMain(modelName,sc):
    timeSteps = 30                                                                  # Number of past values used for training
    print("Going to initialize the LSTM model")
    SMARTparameters = getSMARTParameters()
    print("The following are the SMART parameters:", SMARTparameters)
    lstm = ls.cloudLSTM(timeSteps=timeSteps, parms=SMARTparameters)                 # Initializing the disk-prediction (LSTM) model
    print("Initialized the model")
    lstmModel = lstm.get_LSTM_Model()                                               # Obtaining the LSTM model used to initialize the SparkModel class
    trainSize = 0.2                                                                 # Fraction of the input used for training
    acc = 0.0                                                                       # Model accuracy
    inputFilePath = os.environ.get('DATA_FILE_PATH')                                # Input CSV file path from the environment
    year = sys.argv[1]                                                              # Year from the command-line arguments
    month = sys.argv[2]                                                             # Month from the command-line arguments
    inputFilePath = inputFilePath + str(year) + "/" + str(year) + "-" + str(month) + "*.csv"    # e.g. "/home/user/Desktop/Cloud/Test/2014/2014-11*.csv"
    print("InputPath", inputFilePath)
    rd.generate_DataFrame(inputFilePath,SMARTparameters)
    inputCSVFilePath = os.environ.get('MODEL_CSV_FILEPATH')+str(modelName)+".csv"    # For E.g "/hadoop/elephas/Output/ST4000DM000.csv"

    modelFeatures = pd.read_csv(filepath_or_buffer=inputCSVFilePath,usecols=SMARTparameters)
    modelLabel = pd.read_csv(filepath_or_buffer=inputCSVFilePath,usecols=['failure'])   #"/hadoop/elephas/Output/ST4000DM000.csv"

    # Replacing NaN (Not a Number) values in the input DataFrames with 0
    modelFeatures = modelFeatures.fillna(0)
    modelLabel = modelLabel.fillna(0)

    # Obtaining 3D training and testing vectors
    (feature_train, label_train), (feature_test, label_test) = lstm.train_test_split(modelFeatures,modelLabel,trainSize,timeSteps)

    # Check whether failure cases exist in the data
    if len(feature_train) == 0:
        print("DiskModel has no failure elements. Training of the model cannot proceed!!")
        return
    # Initializing the Adam Optimizer for Elephas
    adam = elephas_optimizers.Adam()
    print("Adam optimizer initialized")
    # Converting the DataFrame to a Spark RDD
    rddataset = to_simple_rdd(sc, feature_train, label_train)
    print("Training data converted into a Resilient Distributed Dataset")
    # Initializing the SparkModel with the optimizer, master-worker mode and number of workers
    spark_model = SparkModel(sc, lstmModel, optimizer=adam, frequency='epoch', mode='asynchronous', num_workers=2)
    print("Spark model initialized")
    # Initial training run of the model
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
    # Evaluating the model on the test data
    score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    while score <= 0.5:
        # Retrain on the input data set until the evaluation score exceeds 0.5
        spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
        print("LSTM model training done !!")
        score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)
    print("Saving weights!!")
    outFilePath=os.environ.get('GATOR_SQUAD_HOME')
    outFilePath=outFilePath+"Weights/"+str(year)+"/"+str(month)+"/"+str(modelName)+"_my_model_weights.h5"
    spark_model.save_weights(outFilePath)
    print("LSTM model testing commencing !!")
    predicted1 = spark_model.predict_classes(feature_test)
    df_confusion = pd.crosstab(label_test.flatten(), predicted1.flatten(), rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
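A minimal, hypothetical driver sketch for predictMain; the application name, master URL, and model name below are illustrative assumptions, and the helper modules and environment variables used above are expected to be configured already.

# Hypothetical driver (not part of the original example)
from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    conf = SparkConf().setAppName("DiskFailurePrediction").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    try:
        predictMain("ST4000DM000", sc)    # model name taken from the CSV path shown in the comments above
    finally:
        sc.stop()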
Example #2
def run_train(master_name, filename, outname):
    import pyspark
    conf = pyspark.SparkConf().setAppName("CRF").setMaster(master_name)
    sc = pyspark.SparkContext(conf=conf)
    tfile = sc.textFile(filename)
    dataset = textFileToDataset(tfile)
    indexer = Indexer()
    indexer.prepareIndexer(dataset, min_count=0)

    print('[Prepare Trainloader] {} samples'.format(dataset.count()))
    trainset = indexer.convertToElephasFormat(dataset)
    embedding_size = 128
    print('[Char count] {}'.format(len(indexer.chars)))

    crf_model = CRF(5, True, name='CRF')
    cnn_model = Sequential([
        Embedding(len(indexer.chars)+1, embedding_size),
        Conv1D(128, 3, activation='relu', padding='same',\
               kernel_constraint=maxnorm(1.0), name='conv1'),
        Conv1D(128, 3, activation='relu', padding='same',\
               kernel_constraint=maxnorm(1.0), name='conv2'),
        Dense(5),
        Lambda(lambda x:x)
        #crf_model
    ])
    '''
    embed=Embedding(len(Indexer._chars)+1, embedding_size)(inph)
    cnn=Conv1D(128, 3, activation='relu', padding='same')(embed)
    cnn=Conv1D(128, 3, activation='relu', padding='same')(cnn)
    tag_score=Dense(5)(cnn)
    '''
    crf_model.trans = cnn_model.layers[-1].add_weight(name='transM', \
                        shape=(crf_model.num_labels, crf_model.num_labels),\
                        initializer=glorot_normal())
    cnn_model.compile(loss=crf_model.loss,
                      optimizer='adam',
                      metrics=[crf_model.accuracy])
    cnn_model.summary()
    # momentum = 0., decay=0. nesterov=False
    optimizerE = elephas.optimizers.SGD(lr=0.0001,
                                        momentum=0.9,
                                        decay=0.7,
                                        nesterov=True)
    spark_model = SparkModel(sc, cnn_model, optimizer=optimizerE,\
                    frequency='epoch', mode='asynchronous', num_workers=2,\
                             ) #custom_objects={'CRF': crf_model})

    spark_model.train(trainset,
                      nb_epoch=2,
                      batch_size=200,
                      validation_split=0.3,
                      verbose=1)
    model = spark_model.master_network
    model.save(outname)
    print('Train Finish')
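A brief, hypothetical invocation of run_train; the Spark master URL and file names below are illustrative assumptions, not part of the original example.

# Hypothetical invocation (illustrative file names)
if __name__ == "__main__":
    run_train("local[4]", "data/train_corpus.txt", "crf_cnn_model.h5")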
Example #3
    print("Creating Training and Test Data")
    ((x_train, y_train), (x_test, y_test)) = train_test_split(testinput.fillna(0), testoutput.fillna(0), test_size=0.3)

    print("Training data : x")
    print(type(x_train))
    print(x_train)
    print("Training data : y")
    print(type(y_train))
    print(y_train)

    print("Test data : x")
    print(type(x_test))
    print(x_test)
    print("Test data : y")
    print(type(y_test))
    print(y_test)

    print("Converting training data to RDD")
    rddataset = to_simple_rdd(sc, x_train, y_train)

    print("Initializing Spark Model")
    sgd = elephas_optimizers.SGD()
    spark_model = SparkModel(sc, model, optimizer=sgd, frequency="epoch", mode="asynchronous", num_workers=2)

    print("Commencing training")
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
    # model.fit(x_train, y_train, nb_epoch=5, batch_size=32)
    print("Training completed")

    sc.stop()
Example #4
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adadelta')

## spark
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER_IP)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc,model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=24)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
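For reference, a hedged sketch of the MNIST-style data preparation that the example above appears to assume (Theano-style channel ordering, one-hot labels); the loading code itself is not part of the original snippet.

# Hypothetical data preparation for X_train/Y_train used above
from keras.datasets import mnist
from keras.utils import np_utils

(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype('float32') / 255
Y_train = np_utils.to_categorical(y_train, nb_classes)    # nb_classes as defined for the model above
Y_test = np_utils.to_categorical(y_test, nb_classes)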
Example #5
# Compile model
sgd = SGD(lr=0.1)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=2)

# Train Spark model
spark_model.train(rdd,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=2,
                  validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(x_test,
                                            y_test,
                                            show_accuracy=True,
                                            verbose=2)
print('Test accuracy:', score[1])
Example #6
batch_size = 100
# Accuracy records
stat_lines = []
adagrad = elephas_optimizers.Adagrad()
for i in range(0, 200):
    # Train Spark model
    # Initialize SparkModel from Keras model and Spark context
    spark_model = SparkModel(sc,
                             model,
                             mode='asynchronous',
                             frequency='epoch',
                             num_workers=1,
                             optimizer=adagrad)
    spark_model.train(rdd,
                      nb_epoch=num_epoch_in_one_step,
                      batch_size=batch_size,
                      verbose=0,
                      validation_split=0.1)
    score1 = model.evaluate(x_train, y_train, verbose=0)
    score2 = model.evaluate(x_test, y_test, verbose=0)
    print('#############################')
    print('Finished epochs', (i + 1) * num_epoch_in_one_step)
    print('Train accuracy:', score1[1])
    print('Test accuracy:', score2[1])
    print('#############################')
    stat_lines.append(
        str((i + 1) * 10) + ', ' + str(score1[1]) + ', ' + str(score2[1]))
    FileIO.write_lines_to_file('./cnn_1.log', stat_lines)
    if (i + 1) % 10 == 0 and i != 0:
        model.save('./models/cnn_1_' + str((i + 1) * 10) + 'ep.h5')
# sc.stop()
Example #7
model = Sequential()
model.add(Dense(784, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 10))
model.add(Activation('softmax'))

# Compile model
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc,model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=8)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
Example #8
from elephas.utils.rdd_utils import to_labeled_point
from elephas.utils.rdd_utils import to_simple_rdd
lp_rdd = to_simple_rdd(sc, features_train, labels_train)

#print(lp_rdd.take(5))

from elephas.spark_model import SparkModel
from elephas import optimizers as elephas_optimizers

adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=8)
spark_model.train(lp_rdd,
                  nb_epoch=20,
                  batch_size=32,
                  verbose=0,
                  validation_split=0.1)

print(spark_model)

prediction = spark_model.predict_classes(features_test)
print(prediction)
truth = [l[1] for l in labels_test]

from sklearn.metrics import confusion_matrix
print(confusion_matrix(truth, prediction))
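A small, optional follow-up sketch (not in the original snippet) that reports overall accuracy from the same truth and prediction arrays.

from sklearn.metrics import accuracy_score
print('Test accuracy:', accuracy_score(truth, prediction))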
Example #9
class KerasNeuralNetworkSpark(object):
    def __init__(self, layers, spark, batch_size=64, epoch=10, num_workers=2, predictionCol='prediction',
                 labelCol='target', featuresCol='feature'):
        self._batch_size = batch_size
        self._epoch = epoch
        self._model = None
        self._spark = spark
        self._labels = labelCol
        self._features = featuresCol
        self._prediction = predictionCol
        self._layers = layers
        self._worker_num = num_workers
        self._build_model()

    def _build_model(self):
        model = Sequential()
        adam = elephas_optimizers.Adam()
        layers = self._layers
        model.add(Dense(layers[1], input_dim=layers[0], init='normal', activation='relu'))
        for i in range(2, len(layers) - 1):
            model.add(Dense(layers[i], activation='relu'))

        model.add(Dense(layers[-1], activation='sigmoid'))
        self._model = SparkModel(self._spark.sparkContext, model,
                                 optimizer=adam,
                                 frequency='epoch',
                                 mode='asynchronous',
                                 master_loss='mse',
                                 num_workers=self._worker_num)

    def fit(self, df):
        if hasattr(self._model, 'server'):
            self._model.server.terminate()
        pdf = df.toPandas()

        rdd = to_simple_rdd(self._spark.sparkContext, pdf[self._features], pdf[self._labels])
        self._model.train(rdd, self._epoch, self._batch_size, 0, 0.1)

    def transform(self, df):
        pdf = df.toPandas()
        # df.write.save('test_df.parquet')
        pnparray = pdf[self._features].values
        container = np.zeros((pnparray.shape[0], len(pnparray[0])))
        for i in range(pnparray.shape[0]):
            container[i, :] = pnparray[i][:]
        result = self._model.predict(container)

        pdf[self._prediction] = result

        # import pickle
        # with open('ann_result.p', 'w') as f:
        #     pickle.dump(result, f)

        # result_df = pd.DataFrame(pdf
        new_df = self._spark.createDataFrame(pdf)
        # df.join(new_df)
        return new_df

    def stop_server(self):
        if hasattr(self._model, 'server') and hasattr(self._model.server, 'terminate'):
            self._model.server.terminate()
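A brief, hypothetical usage sketch for KerasNeuralNetworkSpark; the SparkSession setup, toy DataFrame, column names, and layer sizes are illustrative assumptions, not part of the original class.

# Hypothetical usage of the wrapper class above
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("keras_nn_spark_demo").getOrCreate()

# Toy DataFrame with a list-valued 'feature' column and a numeric 'target' column,
# matching the default featuresCol/labelCol of the wrapper
df = spark.createDataFrame(
    [([0.1, 0.2, 0.3], 0.0), ([0.9, 0.8, 0.7], 1.0)],
    ['feature', 'target'])

nn = KerasNeuralNetworkSpark(layers=[3, 8, 1], spark=spark, epoch=5, num_workers=2)
nn.fit(df)                  # trains the underlying SparkModel on an RDD built from the DataFrame
scored = nn.transform(df)   # returns a new DataFrame with a 'prediction' column added
nn.stop_server()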
Example #10
                                  # output signal. The activation function used here is ReLU.
model.add(Activation('relu'))
model.add(Dropout(0.5))           # dropout is then applied 

# finally, the 128 outputs of the previous FC layer are fully connected to nb_classes neurons,
# which are activated by a softmax function
model.add(Dense(nb_classes, W_regularizer=l2(0.01)))
model.add(Activation('softmax'))
# write the neural network model representation to a png image
#grapher.plot(model, 'nn_mnist.png')

model.compile(loss='categorical_crossentropy', optimizer='adadelta')
# alternative optimizers: 'sgd', 'adam', or 'adadelta'

## spark
conf = SparkConf().setAppName(APP_NAME) #.setMaster(MASTER_IP)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc,model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.15) # num_workers might not work in early spark version

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
Example #11
#early_stopping = EarlyStopping(monitor='val_acc', patience=5)
#print 'Start training...'
#model.fit( X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=verbose, callbacks=[checkpointer],validation_split=validation_split, shuffle=shuffle,show_accuracy=show_accuracy)

# Create Spark Context
conf = SparkConf().setAppName(MODEL)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark Context

rmsprop = elephas_optimizers.RMSprop()

spark_model = SparkModel(sc,\
                        model,\
                        optimizer=rmsprop,\
                        frequency='epoch',\
                        mode='asynchronous',\
                        num_workers=3)

spark_model.train(rdd,\
                    nb_epoch=nb_epoch,\
                    batch_size=batch_size,\
                    verbose=2,\
                    validation_split=validation_split)

spark_model.get_network().save_weights(MODEL_FILE_NAME)

Example #12
#---(i.e. in training each worker will train on part of the data)
rdd = to_simple_rdd(sc, X_train, y_train)

#---Initialize SparkModel from Keras model and Spark context
#---there are two optimizers needed:
sgd = SGD(lr=0.1)  #<---the master optimizer
adagrad = elephas_optimizers.Adagrad()  #<---the elephas optimizer
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=args.N_workers,
                         master_optimizer=sgd)

#---Train Spark model
spark_model.train(rdd,
                  nb_epoch=args.nb_epoch,
                  batch_size=args.batch_size,
                  verbose=1,
                  validation_split=0.25)

#---Evaluate Spark model by evaluating the underlying Keras master model
pred = spark_model.predict(X_test)
print(np.shape(pred))
print(np.shape(y_test))
acc = accuracy_score([np.argmax(y) for y in y_test],
                     [np.argmax(p) for p in pred])
print("--->test accuracy: ", acc)
print("--->number of workers: ", args.N_workers)
print("--->time: ", time.time() - start_time)
Example #13
model.add(Dense(784, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 10))
model.add(Activation('softmax'))

# Compile model
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)
rdd = rdd.repartition(8)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc,model)

# Train Spark model
spark_model.train(rdd, nb_epoch=20, batch_size=32, verbose=0, validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
Example #14
#checkpointer = ModelCheckpoint(filepath=MODEL_ROOT+MODEL+".h5", verbose=1, save_best_only=False)
#early_stopping = EarlyStopping(monitor='val_acc', patience=5)
#print 'Start training...'
#model.fit( X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=verbose, callbacks=[checkpointer],validation_split=validation_split, shuffle=shuffle,show_accuracy=show_accuracy)

# Create Spark Context
conf = SparkConf().setAppName(MODEL)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark Context

rmsprop = elephas_optimizers.RMSprop()

spark_model = SparkModel(sc,\
                        model,\
                        optimizer=rmsprop,\
                        frequency='epoch',\
                        mode='asynchronous',\
                        num_workers=3)

spark_model.train(rdd,\
                    nb_epoch=nb_epoch,\
                    batch_size=batch_size,\
                    verbose=2,\
                    validation_split=validation_split)

spark_model.get_network().save_weights(MODEL_FILE_NAME)
Example #15
    model.add(MaxPooling1D(pool_size=4))

    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

    model.add(Dense(modelpara_dict['Lable_num'], activation='softmax'))

    print(model.summary())

    sgd = SGD(lr=0.1)
    model.compile(loss='categorical_crossentropy', optimizer=sgd)
    adagrad = elephas_optimizers.Adagrad()
    spark_model = SparkModel(sc,
                             model,
                             optimizer=adagrad,
                             frequency='epoch',
                             mode='synchronous',
                             num_workers=3)

    # Train Spark model
    spark_model.train(train_data,
                      nb_epoch=1,
                      batch_size=32,
                      verbose=2,
                      validation_split=0.1)
    spark_model.master_network.save('model/' + modelname + '/' + modelname +
                                    '.h5')
# Evaluate Spark model by evaluating the underlying model
#score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
#print('Test accuracy:', score[1])
Example #16
    print('Test data : x')
    print(type(x_test))
    print(x_test)
    print('Test data : y')
    print(type(y_test))
    print(y_test)

    print('Converting training data to RDD')
    rddataset = to_simple_rdd(sc, x_train, y_train)

    print('Initializing Spark Model')
    sgd = elephas_optimizers.SGD()
    spark_model = SparkModel(sc,
                             model,
                             optimizer=sgd,
                             frequency='epoch',
                             mode='asynchronous',
                             num_workers=2)

    print('Commencing training')
    spark_model.train(rddataset,
                      nb_epoch=10,
                      batch_size=200,
                      verbose=1,
                      validation_split=0)
    #model.fit(x_train, y_train, nb_epoch=5, batch_size=32)
    print('Training completed')

    sc.stop()
Example #17
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

sgd = SGD(lr=0.1)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=2,
                         master_optimizer=sgd)

# Train Spark model
spark_model.train(rdd,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=2,
                  validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
print('Test accuracy:', score[1])
Example #18
model.compile(loss='categorical_crossentropy', optimizer=SGD())
model.summary()


# Create a Resilient Distributed Dataset (RDD) from training data

# TODO: get data
# TODO: is it possible to separate training data into multiple batches?

rdd = to_simple_rdd(sc, X_train, Y_train)


# Create the Elephas model instance
spark_model = SparkModel(sc,
                         model,
                         optimizer=elephas_optimizers.Adagrad(),
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=WORKERS)

# Train model
spark_model.train(rdd,
                  nb_epoch=EPOCHS,
                  batch_size=BATCH_SIZE,
                  verbose=False,
                  validation_split=VAL_SPLIT,
                  num_workers=WORKERS)
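A hedged follow-up sketch mirroring the evaluation step used in the other examples; it assumes X_test and Y_test were prepared the same way as X_train and Y_train.

# Hypothetical evaluation step (not in the original snippet)
score = spark_model.master_network.evaluate(X_test, Y_test, verbose=2)
print('Test score:', score)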