def main():
    gps_files = glob.glob('../data/prototype/**/gps_points.csv')
    trip_files = glob.glob('../data/prototype/**/gps_trips.csv')

    # Seed X/y with the first file, then stack the remaining files on top.
    file_results = process_file(trip_file=trip_files[0], gps_file=gps_files[0])
    seq_results = build_seq(input_df=file_results['df'], unique_trips=file_results['unique_trips'])
    X = seq_results['x']
    y = seq_results['y']

    print('Building training data from files..')
    for i in range(1, len(gps_files)):
        file_results = process_file(trip_file=trip_files[i], gps_file=gps_files[i])
        seq_results = build_seq(input_df=file_results['df'], unique_trips=file_results['unique_trips'])
        X = np.vstack((X, seq_results['x']))
        y = np.vstack((y, seq_results['y']))

    x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=1, train_size=0.8)
    rdd = to_simple_rdd(sc, x_train, y_train)

    model = build_model()
    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
    spark_model.fit(rdd, epochs=5, batch_size=32, verbose=0, validation_split=0.1)
    # model.fit(x_train, y_train, epochs=5, validation_data=(x_val, y_val))

    y_pred = spark_model.predict(x_val)
    acc = sum(np.argmax(y_pred, axis=1) == np.argmax(y_val, axis=1)) / y_pred.shape[0]
    print("Validation Accuracy: {number:.{digits}f}%".format(number=(acc * 100), digits=2))
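# A minimal alternative sketch (not from the original): repeated np.vstack copies the growing
# arrays on every iteration, so collecting the per-file arrays in lists and stacking once is
# usually cheaper. It assumes the same process_file/build_seq helpers used above.
import numpy as np

def build_training_arrays(trip_files, gps_files):
    xs, ys = [], []
    for trip_file, gps_file in zip(trip_files, gps_files):
        file_results = process_file(trip_file=trip_file, gps_file=gps_file)
        seq_results = build_seq(input_df=file_results['df'],
                                unique_trips=file_results['unique_trips'])
        xs.append(seq_results['x'])
        ys.append(seq_results['y'])
    # Stack all per-file arrays in a single pass.
    return np.vstack(xs), np.vstack(ys)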
def predictMain(modelName, sc):
    timeSteps = 30          # Number of past values used for training
    print "Going to Initialize the LSTM model"
    SMARTparameters = getSMARTParameters()
    print("The following are the SMART parameters:", SMARTparameters)
    lstm = ls.cloudLSTM(timeSteps=timeSteps, parms=SMARTparameters)  # Initializing the disk-prediction (LSTM) model
    print "Initialized the Model"
    lstmModel = lstm.get_LSTM_Model()  # LSTM model used to initialize the SparkModel class
    trainSize = 0.2         # Fraction of the input used for training
    acc = 0.0               # Model accuracy

    inputFilePath = os.environ.get('DATA_FILE_PATH')  # Input CSV file path from the environment
    year = sys.argv[1]      # Year from the command-line arguments
    month = sys.argv[2]     # Month from the command-line arguments
    inputFilePath = inputFilePath + str(year) + "/" + str(year) + "-" + str(month) + "*.csv"  # e.g. "/home/user/Desktop/Cloud/Test/2014/2014-11*.csv"
    print("InputPath", inputFilePath)
    rd.generate_DataFrame(inputFilePath, SMARTparameters)

    inputCSVFilePath = os.environ.get('MODEL_CSV_FILEPATH') + str(modelName) + ".csv"  # e.g. "/hadoop/elephas/Output/ST4000DM000.csv"
    modelFeatures = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=SMARTparameters)
    modelLabel = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=['failure'])

    # Removing NaN values from the input dataframes
    modelFeatures = modelFeatures.fillna(0)
    modelLabel = modelLabel.fillna(0)

    # Obtaining 3D training and testing vectors
    (feature_train, label_train), (feature_test, label_test) = lstm.train_test_split(modelFeatures, modelLabel, trainSize, timeSteps)

    # Check whether failure cases exist in the data
    if len(feature_train) == 0:
        print("DiskModel has no failure elements. Training of the model cannot proceed!!")
        return

    # Initializing the Adam optimizer for Elephas
    adam = elephas_optimizers.Adam()
    print "Adam Optimizer initialized"

    # Converting the training data to a Spark RDD
    rddataset = to_simple_rdd(sc, feature_train, label_train)
    print "Training data converted into Resilient Distributed Dataset"

    # Initializing the SparkModel with optimizer, master-worker mode and number of workers
    spark_model = SparkModel(sc, lstmModel, optimizer=adam, frequency='epoch', mode='asynchronous', num_workers=2)
    print "Spark Model Initialized"

    # Initial training run of the model
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
    score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    # Keep training until the evaluation score exceeds 0.5
    while score <= 0.5:
        spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
        print "LSTM model training done !!"
        score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    # Saving the model weights
    print "Saving weights!!"
    outFilePath = os.environ.get('GATOR_SQUAD_HOME')
    outFilePath = outFilePath + "Weights/" + str(year) + "/" + str(month) + "/" + str(modelName) + "_my_model_weights.h5"
    spark_model.save_weights(outFilePath)

    print "LSTM model testing commencing !!"
    predicted1 = spark_model.predict_classes(feature_test)
    df_confusion = pd.crosstab(label_test.flatten(), predicted1.flatten(), rownames=['Actual'], colnames=['Predicted'], margins=True)
    print df_confusion
def test_sequential_serialization(spark_context, classification_model):
    classification_model.compile(optimizer="sgd", loss="categorical_crossentropy", metrics=["acc"])
    spark_model = SparkModel(classification_model, frequency='epoch', mode='synchronous')
    spark_model.save("elephas_sequential.h5")
def run_train(master_name, filename, outname): import pyspark conf = pyspark.SparkConf().setAppName("CRF").setMaster(master_name) sc = pyspark.SparkContext(conf=conf) tfile = sc.textFile(filename) dataset = textFileToDataset(tfile) indexer = Indexer() indexer.prepareIndexer(dataset, min_count=0) print('[Prepare Trainloader] {} samples'.format(dataset.count())) trainset = indexer.convertToElephasFormat(dataset) embedding_size = 128 print('[Char account] {}'.format(len(indexer.chars))) crf_model = CRF(5, True, name='CRF') cnn_model = Sequential([ Embedding(len(indexer.chars)+1, embedding_size), Conv1D(128, 3, activation='relu', padding='same',\ kernel_constraint=maxnorm(1.0), name='conv1'), Conv1D(128, 3, activation='relu', padding='same',\ kernel_constraint=maxnorm(1.0), name='conv2'), Dense(5), Lambda(lambda x:x) #crf_model ]) ''' embed=Embedding(len(Indexer._chars)+1, embedding_size)(inph) cnn=Conv1D(128, 3, activation='relu', padding='same')(embed) cnn=Conv1D(128, 3, activation='relu', padding='same')(cnn) tag_score=Dense(5)(cnn) ''' crf_model.trans = cnn_model.layers[-1].add_weight(name='transM', \ shape=(crf_model.num_labels, crf_model.num_labels),\ initializer=glorot_normal()) cnn_model.compile(loss=crf_model.loss, optimizer='adam', metrics=[crf_model.accuracy]) cnn_model.summary() # momentum = 0., decay=0. nesterov=False optimizerE = elephas.optimizers.SGD(lr=0.0001, momentum=0.9, decay=0.7, nesterov=True) spark_model = SparkModel(sc, cnn_model, optimizer=optimizerE,\ frequency='epoch', mode='asynchronous', num_workers=2,\ ) #custom_objects={'CRF': crf_model}) spark_model.train(trainset, nb_epoch=2, batch_size=200, validation_split=0.3, verbose=1) model = spark_model.master_network model.save(outname) print('Train Finish')
def test_sync_mode(spark_context): # Define basic parameters batch_size = 64 nb_classes = 10 epochs = 10 # Load data (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train = x_train.reshape(60000, 784) x_test = x_test.reshape(10000, 784) x_train = x_train.astype("float32") x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 print(x_train.shape[0], 'train samples') print(x_test.shape[0], 'test samples') # Convert class vectors to binary class matrices y_train = np_utils.to_categorical(y_train, nb_classes) y_test = np_utils.to_categorical(y_test, nb_classes) model = Sequential() model.add(Dense(128, input_dim=784)) model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(10)) model.add(Activation('softmax')) sgd = SGD(lr=0.1) model.compile(sgd, 'categorical_crossentropy', ['acc']) # Build RDD from numpy features and labels rdd = to_simple_rdd(spark_context, x_train, y_train) # Initialize SparkModel from Keras model and Spark context spark_model = SparkModel(model, mode='synchronous') # Train Spark model spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.1) # Evaluate Spark model by evaluating the underlying model score = spark_model.master_network.evaluate(x_test, y_test, verbose=2) assert score[1] >= 0.70
def test_sequential_serialization(spark_context):
    # Spark context is provided by the spark_context pytest fixture.
    seq_model = Sequential()
    seq_model.add(Dense(128, input_dim=784))
    seq_model.add(Activation('relu'))
    seq_model.add(Dropout(0.2))
    seq_model.add(Dense(128))
    seq_model.add(Activation('relu'))
    seq_model.add(Dropout(0.2))
    seq_model.add(Dense(10))
    seq_model.add(Activation('softmax'))
    seq_model.compile(optimizer="sgd", loss="categorical_crossentropy", metrics=["acc"])

    spark_model = SparkModel(seq_model, frequency='epoch', mode='synchronous')
    spark_model.save("elephas_sequential.h5")
def test_model_serialization():
    # This returns a tensor
    inputs = Input(shape=(784,))

    # A layer instance is callable on a tensor and returns a tensor
    x = Dense(64, activation='relu')(inputs)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)

    # This creates a model that includes the Input layer and three Dense layers
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    spark_model = SparkModel(model, frequency='epoch', mode='synchronous')
    spark_model.save("elephas_model.h5")
def make_model(data):
    data.show()
    data = data.dropna()
    nb_classes = data.select("label").distinct().count()
    input_dim = len(data.select("features").first()[0])
    print(nb_classes, input_dim)

    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=100))
    # model.add(LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax'))
    # sgd = optimizers.SGD(lr=0.1)
    # model.compile(sgd, 'categorical_crossentropy', ['acc'])
    model.compile(loss='binary_crossentropy', optimizer='adam')
    # model.compile(loss='categorical_crossentropy', optimizer='adam')

    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')

    adam = optimizers.Adam(lr=0.01)
    opt_conf = optimizers.serialize(adam)

    estimator = ElephasEstimator()
    estimator.setFeaturesCol("features")
    estimator.setLabelCol("label")
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)
    estimator.set_num_workers(1)
    estimator.set_epochs(20)
    estimator.set_batch_size(128)
    estimator.set_verbosity(1)
    estimator.set_validation_split(0.15)
    estimator.set_optimizer_config(opt_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    # estimator = ElephasEstimator(model, epochs=20, batch_size=32, frequency='batch', mode='asynchronous', nb_classes=1)

    pipeline = Pipeline(stages=[estimator])
    # fitted_model = estimator.fit(data)
    # prediction = fitted_model.transform(data)
    fitted_pipeline = pipeline.fit(data)          # Fit model to data
    prediction = fitted_pipeline.transform(data)  # Evaluate on train data.
    # prediction = fitted_pipeline.transform(test_df)  # <-- The same code evaluates test data.

    pnl = prediction.select("text", "prediction")
    pnl.show(100)

    # Go through the underlying RDD; Spark DataFrames do not expose .map directly.
    prediction_and_label = pnl.rdd.map(lambda row: (row.text, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.precision())

    pnl = prediction.select("label", "prediction")
    pnl.show(100)
def train_elephas_model(x, y):
    model = models.Sequential()
    adam = optimizers.Adam(lr=0.01)

    # Input layer
    model.add(Dense(256, activation="relu", input_shape=(x.shape[1],)))
    model.add(Dropout(0.05))
    model.add(Dense(256, activation="relu", input_shape=(x.shape[1],)))
    model.add(Dropout(0.05))
    # Output layer
    model.add(Dense(1))

    model.compile(optimizer=adam, loss="mse", metrics=["mse"])
    model.summary()

    rdd = to_simple_rdd(sc, x, y)
    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
    # spark_model.fit(rdd, epochs=10, batch_size=64, verbose=1, validation_split=0.2)
    spark_model.fit(rdd, epochs=25, batch_size=64, verbose=1, validation_split=0.2)
    return spark_model
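# Hedged usage sketch (not from the original): how the SparkModel returned by
# train_elephas_model might be used for inference. `x_val` and `y_val` are assumed
# hold-out arrays shaped like the training data.
import numpy as np

spark_model = train_elephas_model(x, y)
y_pred = spark_model.predict(x_val)  # weights were trained distributed, prediction runs locally
mse = np.mean((y_pred.reshape(-1) - y_val.reshape(-1)) ** 2)
print("Hold-out MSE:", mse)
# The underlying Keras model is also available if the plain Keras API is preferred:
# spark_model.master_network.predict(x_val)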
def dist_training(n_iter):
    sbcnn = SBCNN_Model(field_size, bands, frames, num_channels, num_labels)
    sgd = SGD(lr=0.001, momentum=0.0, decay=0.0, nesterov=False)
    sbcnn.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=sgd)

    train_arr, train_labels_arr, test_arr, test_labels_arr = get_data()
    rdd = to_simple_rdd(sc, train_arr, train_labels_arr)

    spark_model = SparkModel(sbcnn, frequency='epoch', mode='asynchronous')
    spark_model.fit(rdd, epochs=n_iter, batch_size=32, verbose=0, validation_split=0.1)

    score = spark_model.master_network.evaluate(test_arr, test_labels_arr, verbose=2)
    print('Test accuracy:', score[1])
def test_sequential_serialization(spark_context):
    # Spark context is provided by the spark_context pytest fixture.
    # Define basic parameters
    batch_size = 64
    nb_classes = 10
    epochs = 1

    # Load data
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(60000, 784)
    x_test = x_test.reshape(10000, 784)
    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    seq_model = Sequential()
    seq_model.add(Dense(128, input_dim=784))
    seq_model.add(Activation('relu'))
    seq_model.add(Dropout(0.2))
    seq_model.add(Dense(128))
    seq_model.add(Activation('relu'))
    seq_model.add(Dropout(0.2))
    seq_model.add(Dense(10))
    seq_model.add(Activation('softmax'))
    seq_model.compile(optimizer="sgd", loss="categorical_crossentropy", metrics=["acc"])

    spark_model = SparkModel(seq_model, frequency='epoch', mode='synchronous')
    spark_model.save("elephas_sequential.h5")
def test_training_classification(spark_context, mode, parameter_server_mode, mnist_data, classification_model): # Define basic parameters batch_size = 64 epochs = 10 # Load data x_train, y_train, x_test, y_test = mnist_data x_train = x_train[:1000] y_train = y_train[:1000] sgd = SGD(lr=0.1) classification_model.compile(sgd, 'categorical_crossentropy', ['acc']) # Build RDD from numpy features and labels rdd = to_simple_rdd(spark_context, x_train, y_train) # Initialize SparkModel from keras model and Spark context spark_model = SparkModel(classification_model, frequency='epoch', mode=mode, parameter_server_mode=parameter_server_mode, port=4000 + random.randint(0, 500)) # Train Spark model spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.1) # run inference on trained spark model predictions = spark_model.predict(x_test) # run evaluation on trained spark model evals = spark_model.evaluate(x_test, y_test) # assert we can supply rdd and get same prediction results when supplying numpy array test_rdd = spark_context.parallelize(x_test) assert [np.argmax(x) for x in predictions ] == [np.argmax(x) for x in spark_model.predict(test_rdd)] # assert we get the same prediction result with calling predict on keras model directly assert [np.argmax(x) for x in predictions] == [ np.argmax(x) for x in spark_model.master_network.predict(x_test) ] # assert we get the same evaluation results when calling evaluate on keras model directly assert isclose(evals[0], spark_model.master_network.evaluate(x_test, y_test)[0], abs_tol=0.01) assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=0.01)
def test_training_regression(spark_context, mode, parameter_server_mode, boston_housing_dataset, regression_model): x_train, y_train, x_test, y_test = boston_housing_dataset rdd = to_simple_rdd(spark_context, x_train, y_train) # Define basic parameters batch_size = 64 epochs = 10 sgd = SGD(lr=0.0000001) regression_model.compile(sgd, 'mse', ['mae']) spark_model = SparkModel(regression_model, frequency='epoch', mode=mode, parameter_server_mode=parameter_server_mode, port=4000 + random.randint(0, 500)) # Train Spark model spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.1) # run inference on trained spark model predictions = spark_model.predict(x_test) # run evaluation on trained spark model evals = spark_model.evaluate(x_test, y_test) # assert we can supply rdd and get same prediction results when supplying numpy array test_rdd = spark_context.parallelize(x_test) assert all( np.isclose(x, y, 0.01) for x, y in zip(predictions, spark_model.predict(test_rdd))) # assert we get the same prediction result with calling predict on keras model directly assert all( np.isclose(x, y, 0.01) for x, y in zip( predictions, spark_model.master_network.predict(x_test))) # assert we get the same evaluation results when calling evaluate on keras model directly assert isclose(evals[0], spark_model.master_network.evaluate(x_test, y_test)[0], abs_tol=0.01) assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=0.01)
def test_training_custom_activation(mode, spark_context):
    def custom_activation(x):
        return sigmoid(x) + 1

    model = Sequential()
    model.add(Dense(1, input_dim=1, activation=custom_activation))
    model.add(Dense(1, activation='sigmoid'))
    sgd = SGD(lr=0.1)
    model.compile(sgd, 'binary_crossentropy', ['acc'])

    x_train = np.random.rand(1000)
    y_train = np.zeros(1000)
    x_test = np.random.rand(100)
    y_test = np.zeros(100)
    y_train[:500] = 1

    rdd = to_simple_rdd(spark_context, x_train, y_train)
    spark_model = SparkModel(model, frequency='epoch', mode=mode,
                             custom_objects={'custom_activation': custom_activation})
    spark_model.fit(rdd, epochs=1, batch_size=16, verbose=0, validation_split=0.1)
    assert spark_model.predict(x_test)
    assert spark_model.evaluate(x_test, y_test)
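# Hedged follow-up sketch (not from the original): the same custom_objects mapping is what
# plain Keras needs to load a saved model that uses the custom activation. The file name and
# the sigmoid import are illustrative assumptions.
from keras.backend import sigmoid
from keras.models import load_model

def custom_activation(x):
    return sigmoid(x) + 1

reloaded = load_model('elephas_custom_activation.h5',
                      custom_objects={'custom_activation': custom_activation})
reloaded.summary()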
input_shape=(modelpara_dict['Column_num'], 1))) model.add(MaxPooling1D(pool_size=4)) model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2)) model.add(Dense(modelpara_dict['Lable_num'], activation='softmax')) print(model.summary()) sgd = SGD(lr=0.1) model.compile(loss='categorical_crossentropy', optimizer=sgd) adagrad = elephas_optimizers.Adagrad() spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='synchronous', num_workers=3) # Train Spark model spark_model.train(train_data, nb_epoch=1, batch_size=32, verbose=2, validation_split=0.1) spark_model.master_network.save('model/' + modelname + '/' + modelname + '.h5') # Evaluate Spark model by evaluating the underlying model #score = spark_model.master_network.evaluate(x_test, y_test, verbose=2) #print('Test accuracy:', score[1])
print("Creating Training and Test Data") ((x_train, y_train), (x_test, y_test)) = train_test_split(testinput.fillna(0), testoutput.fillna(0), test_size=0.3) print("Training data : x") print(type(x_train)) print(x_train) print("Training data : y") print(type(y_train)) print(y_train) print("Test data : x") print(type(x_test)) print(x_test) print("Test data : y") print(type(y_test)) print(y_test) print("Converting training data to RDD") rddataset = to_simple_rdd(sc, x_train, y_train) print("Initializing SPark Model") sgd = elephas_optimizers.SGD() spark_model = SparkModel(sc, model, optimizer=sgd, frequency="epoch", mode="asynchronous", num_workers=2) print("Commencing training") spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0) # model.fit(x_train, y_train, nb_epoch=5, batch_size=32) print("Training completed") sc.stop()
#early_stopping = EarlyStopping(monitor='val_acc', patience=5) #print 'Start training...' #model.fit( X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=verbose, callbacks=[checkpointer],validation_split=validation_split, shuffle=shuffle,show_accuracy=show_accuracy) # Create Spark Context conf = SparkConf().setAppName(MODEL) sc = SparkContext(conf=conf) # Build RDD from numpy features and labels rdd = to_simple_rdd(sc, X_train, Y_train) # Initialize SparkModel from Keras model and Spark Context rmsprop = elephas_optimizers.RMSprop() spark_model = SparkModel(sc,\ model,\ optimizer=rmsprop,\ frequency='epoch',\ mode='asynchronous',\ num_workers=3) spark_model.train(rdd,\ nb_epoch=nb_epoch,\ batch_size=batch_size,\ verbose=2,\ validation_split=validation_split) spark_model.get_network().save_weights(MODEL_FILE_NAME)
class HAN(object): """ HAN model is implemented here. """ def __init__(self, text, labels, pretrained_embedded_vector_path, max_features, max_senten_len, max_senten_num, embedding_size, num_categories=None, validation_split=0.2, verbose=0): """Initialize the HAN module Keyword arguments: text -- list of the articles for training. labels -- labels corresponding the given `text`. pretrained_embedded_vector_path -- path of any pretrained vector max_features -- max features embeddeding matrix can have. To more checkout https://keras.io/layers/embeddings/ max_senten_len -- maximum sentence length. It is recommended not to use the maximum one but the one that covers 0.95 quatile of the data. max_senten_num -- maximum number of sentences. It is recommended not to use the maximum one but the one that covers 0.95 quatile of the data. embedding_size -- size of the embedding vector num_categories -- total number of categories. validation_split -- train-test split. verbose -- how much you want to see. """ try: self.verbose = verbose self.max_features = max_features self.max_senten_len = max_senten_len self.max_senten_num = max_senten_num self.embed_size = embedding_size self.validation_split = validation_split self.embedded_dir = pretrained_embedded_vector_path self.text = pd.Series(text) self.categories = pd.Series(labels) self.classes = self.categories.unique().tolist() # Initialize default hyperparameters # You can change it using `set_hyperparameters` function self.hyperparameters = { 'l2_regulizer': None, 'dropout_regulizer': None, 'rnn': LSTM, 'rnn_units': 150, 'dense_units': 200, 'activation': 'softmax', 'optimizer': 'adam', 'metrics': ['acc'], 'loss': 'categorical_crossentropy' } if num_categories is not None: assert (num_categories == len(self.classes)) assert (self.text.shape[0] == self.categories.shape[0]) self.data, self.labels = self.preprocessing() self.x_train, self.y_train, self.x_val, self.y_val = self.split_dataset( ) self.embedding_index = self.add_glove_model() self.set_model() except AssertionError: print('Input and label data must be of same size') # Implement this after you have seen all the different kinds of errors # try: # conf = SparkConf().setAppName('HANMusicClassifier').setMaster('') # self.sc = SparkContext(conf=conf) # except Error: conf = SparkConf().setAppName('HANMusicClassifier') self.sc = SparkContext(conf=conf) def set_hyperparameters(self, tweaked_instances): """Set hyperparameters of HAN model. 
Keywords arguemnts: tweaked_instances -- dictionary of all those keys you want to change """ for key, value in tweaked_instances.items(): if key in self.hyperparameters: self.hyperparameters[key] = value else: raise KeyError(key + ' does not exist in hyperparameters') self.set_model() def show_hyperparameters(self): """To check the values of all the current hyperparameters """ print('Hyperparameter\tCorresponding Value') for key, value in self.hyperparameters.items(): print(key, '\t\t', value) def clean_string(self, string): """ Tokenization/string cleaning for dataset Every dataset is lower cased except """ string = re.sub(r"\\", "", string) string = re.sub(r"\'", "", string) string = re.sub(r"\"", "", string) return string.strip().lower() def add_dataset(self, text, labels): try: self.text = pd.concat([self.text, pd.Series(text)]) self.categories = pd.concat([self.categories, pd.Series(labels)]) assert (len(self.classes) == self.categories.unique().tolist()) except AssertionError: print("New class cannot be added in this manner") def preprocessing(self): """Preprocessing of the text to make it more resonant for training """ paras = [] labels = [] texts = [] for idx in range(self.text.shape[0]): text = self.clean_string(self.text[idx]) texts.append(text) sentences = tokenize.sent_tokenize(text) paras.append(sentences) tokenizer = Tokenizer(num_words=self.max_features, oov_token=True) tokenizer.fit_on_texts(texts) data = np.zeros((len(texts), self.max_senten_num, self.max_senten_len), dtype='int32') for i, sentences in enumerate(paras): for j, sent in enumerate(sentences): if j < self.max_senten_num: wordTokens = text_to_word_sequence(sent) k = 0 for _, word in enumerate(wordTokens): if k < self.max_senten_len and word in tokenizer.word_index and tokenizer.word_index[ word] < self.max_features: data[i, j, k] = tokenizer.word_index[word] k = k + 1 self.word_index = tokenizer.word_index if self.verbose == 1: print('Total %s unique tokens.' 
% len(self.word_index)) labels = pd.get_dummies(self.categories) if self.verbose == 1: print('Shape of data tensor:', data.shape) print('Shape of labels tensor:', labels.shape) assert (len(self.classes) == labels.shape[1]) assert (data.shape[0] == labels.shape[0]) return data, labels def split_dataset(self): indices = np.arange(self.data.shape[0]) np.random.shuffle(indices) self.data = self.data[indices] self.labels = self.labels.iloc[indices] nb_validation_samples = int(self.validation_split * self.data.shape[0]) x_train = self.data[:-nb_validation_samples] y_train = self.labels[:-nb_validation_samples] x_val = self.data[-nb_validation_samples:] y_val = self.labels[-nb_validation_samples:] if self.verbose == 1: print( 'Number of positive and negative reviews in traing and validation set' ) print(y_train.columns.tolist()) print(y_train.sum(axis=0).tolist()) print(y_val.sum(axis=0).tolist()) return x_train, y_train, x_val, y_val def get_model(self): """ Returns the HAN model so that it can be used as a part of pipeline """ return self.model def add_glove_model(self): """ Read and save Pretrained Embedding model """ embeddings_index = {} try: f = open(self.embedded_dir) for line in f: values = line.split() word = values[0] coefs = np.asarray(values[1:], dtype='float32') assert (coefs.shape[0] == self.embed_size) embeddings_index[word] = coefs f.close() except OSError: print('Embedded file does not found') exit() except AssertionError: print( "Embedding vector size does not match with given embedded size" ) return embeddings_index def get_embedding_matrix(self): """ Returns Embedding matrix """ embedding_matrix = np.random.random( (len(self.word_index) + 1, self.embed_size)) absent_words = 0 for word, i in self.word_index.items(): embedding_vector = self.embedding_index.get(word) if embedding_vector is not None: # words not found in embedding index will be all-zeros. 
                embedding_matrix[i] = embedding_vector
            else:
                absent_words += 1
        if self.verbose == 1:
            print('Total absent words are', absent_words, 'which is',
                  "%0.2f" % (absent_words * 100 / len(self.word_index)),
                  '% of total words')
        return embedding_matrix

    def get_embedding_layer(self):
        """
        Returns Embedding layer
        """
        embedding_matrix = self.get_embedding_matrix()
        return Embedding(len(self.word_index) + 1,
                         self.embed_size,
                         weights=[embedding_matrix],
                         input_length=self.max_senten_len,
                         trainable=False)

    def set_model(self):
        """
        Set the HAN model according to the given hyperparameters
        """
        if self.hyperparameters['l2_regulizer'] is None:
            kernel_regularizer = None
        else:
            kernel_regularizer = regularizers.l2(
                self.hyperparameters['l2_regulizer'])
        if self.hyperparameters['dropout_regulizer'] is None:
            dropout_regularizer = 1
        else:
            dropout_regularizer = self.hyperparameters['dropout_regulizer']

        word_input = Input(shape=(self.max_senten_len, ), dtype='float32')
        word_sequences = self.get_embedding_layer()(word_input)
        word_lstm = Bidirectional(self.hyperparameters['rnn'](
            self.hyperparameters['rnn_units'],
            return_sequences=True,
            kernel_regularizer=kernel_regularizer))(word_sequences)
        word_dense = TimeDistributed(
            Dense(self.hyperparameters['dense_units'],
                  kernel_regularizer=kernel_regularizer))(word_lstm)
        word_att = AttentionWithContext()(word_dense)
        wordEncoder = Model(word_input, word_att)

        sent_input = Input(shape=(self.max_senten_num, self.max_senten_len),
                           dtype='float32')
        sent_encoder = TimeDistributed(wordEncoder)(sent_input)
        sent_lstm = Bidirectional(self.hyperparameters['rnn'](
            self.hyperparameters['rnn_units'],
            return_sequences=True,
            kernel_regularizer=kernel_regularizer))(sent_encoder)
        sent_dense = TimeDistributed(
            Dense(self.hyperparameters['dense_units'],
                  kernel_regularizer=kernel_regularizer))(sent_lstm)
        sent_att = Dropout(dropout_regularizer)(
            AttentionWithContext()(sent_dense))
        preds = Dense(len(self.classes))(sent_att)

        self.model = Model(sent_input, preds)
        self.model.compile(loss=self.hyperparameters['loss'],
                           optimizer=self.hyperparameters['optimizer'],
                           metrics=self.hyperparameters['metrics'])
        self.spark_model = SparkModel(self.model,
                                      frequency='epoch',
                                      mode='asynchronous')

    # Currently cannot plot learning curve
    def train_model(self, rdd, epochs, batch_size, verbose=1, validation_split=0.1):
        """Training the model

        rdd -- The actual data
        epochs -- Total number of epochs
        batch_size -- size of a batch
        verbose -- Whether or not we want verbose feedback
        validation_split -- What percentage of the data from the rdd is actually used as a validation set
        """
        # Train the underlying elephas SparkModel on the given RDD.
        self.spark_model.fit(rdd,
                             epochs=epochs,
                             batch_size=batch_size,
                             verbose=verbose,
                             validation_split=validation_split)

    def predict(self, rdd):
        return self.spark_model.predict(rdd)

    def plot_results(self):
        """
        Plotting learning curve of last trained model.
        """
        # summarize history for accuracy
        plt.subplot(211)
        plt.plot(self.history.history['acc'])
        plt.plot(self.history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        # summarize history for loss
        plt.subplot(212)
        plt.plot(self.history.history['val_loss'])
        plt.plot(self.history.history['loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
        time.sleep(10)
        plt.close()
model.add(Activation('relu')) model.add(Convolution2D(nb_filters, nb_conv, nb_conv)) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool))) model.add(Dropout(0.25)) model.add(Flatten()) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='adadelta') ## spark conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER_IP) sc = SparkContext(conf=conf) # Build RDD from numpy features and labels rdd = to_simple_rdd(sc, X_train, Y_train) # Initialize SparkModel from Keras model and Spark context spark_model = SparkModel(sc,model) # Train Spark model spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=24) # Evaluate Spark model by evaluating the underlying model score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2) print('Test accuracy:', score[1])
# score = model.evaluate(x_test, y_test, verbose=0) # # print('Test score:', score[0]) # print('Test accuracy:', score[1]) # Create Spark context conf = SparkConf().setAppName('Mnist_Spark_MLP') # .setMaster('local[8]') sc = SparkContext(conf=conf) # Build RDD from numpy features and labels # lp_rdd = to_labeled_point(sc, x_train, y_train, categorical=True) rdd = to_simple_rdd(sc, x_train, y_train) # Train Spark model spark_model = SparkModel(model, frequency='epoch', mode='asynchronous') spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.1) # Evaluate Spark model by evaluating the underlying model score = spark_model.master_network.evaluate(x_test, y_test, verbose=2) print('Test loss:', score[0]) print('Test accuracy:', score[1]) model_file = 'save/mlp.h5' import os if not os.path.exists("save/"):
model = Sequential() model.add(Dense(18, input_dim=26)) model.add(Activation('sigmoid')) model.add(Dense(6)) model.add(Activation('sigmoid')) model.add(Dense(1)) model.add(Activation('sigmoid')) spark = SparkSession.builder.appName('ElephasTest').getOrCreate() rdd = to_simple_rdd(spark.sparkContext, train, y_train) sgd = SGD(lr=0.1) adagrad = elephas_optimizers.Adagrad() spark_model = SparkModel(spark.sparkContext, model, optimizer=adagrad, frequency='epoch', mode='asynchronous', master_loss='mse', num_workers=2, master_optimizer=sgd) # Train Spark model spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=2, validation_split=0.1) # Evaluate Spark model by evaluating the underlying model score = spark_model.master_network.evaluate(test.values, y_test, verbose=2) print('Test accuracy:', score[1]) print spark_model.predict(test.values) print y_test
         .getOrCreate())
sc = spark.sparkContext

"""### Training Model"""

from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd

# Compile the model.
model_9.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build RDD from features and labels.
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context.
spark_model = SparkModel(model_9, frequency='epoch', mode='asynchronous', num_workers=3)

# Train the Spark model.
spark_model.fit(rdd, epochs=10, batch_size=32, verbose=1, validation_split=0.1)

score = spark_model.master_network.evaluate(x_test, y_test, verbose=1)
print('Test accuracy:', score)

"""### Predict and evaluate Model"""

"""### Save Model"""

import json

# Let's assume 'model_9' is the main model
model_json = model_9.to_json()
with open("model_in_json.json", "w") as json_file:
    json.dump(model_json, json_file)
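# Hedged follow-up sketch (not from the original): reloading the architecture saved above.
# json.dump() wrote the to_json() string, so it is read back with json.load() before
# model_from_json(); the weights file name is an illustrative assumption.
import json
from keras.models import model_from_json

with open("model_in_json.json") as json_file:
    loaded_json = json.load(json_file)
loaded_model = model_from_json(loaded_json)
# loaded_model.load_weights("model_9_weights.h5")  # assumes weights were saved separately
loaded_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])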
model = Sequential() model.add(Dense(784, 128)) model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(128, 128)) model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(128, 10)) model.add(Activation('softmax')) # Compile model rms = RMSprop() model.compile(loss='categorical_crossentropy', optimizer=rms) # Create Spark context conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]') sc = SparkContext(conf=conf) # Build RDD from numpy features and labels rdd = to_simple_rdd(sc, X_train, Y_train) # Initialize SparkModel from Keras model and Spark context spark_model = SparkModel(sc,model) # Train Spark model spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=8) # Evaluate Spark model by evaluating the underlying model score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2) print('Test accuracy:', score[1])
import tensorflow as tf (x_train, y_train), (x_test, y_test) = mnist.load_data() print("length = ( ", len(x_train), ", ", len(y_train), " )") print("shape of the dataset = ", tf.shape(y_train)) x_train = x_train.reshape(60000, 784) x_test = x_test.reshape(10000, 784) x_train = x_train.astype("float32") x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 print(x_train.shape[0], 'train samples') print(x_test.shape[0], 'test samples') nb_classes = 10 # Convert class vectors to binary class matrices y_train = to_categorical(y_train, nb_classes) y_test = to_categorical(y_test, nb_classes) rdd = to_simple_rdd(sc, x_train, y_train) print("rdd = ", rdd) from elephas.spark_model import SparkModel spark_model = SparkModel(model, frequency='epoch', mode='asynchronous', num_workers=2) spark_model.fit(rdd, epochs=10, batch_size=32, verbose=0, validation_split=0.1) score = spark_model.evaluate(x_test, y_test, verbose=2) print('Test accuracy:', score)
print("Test size: %d" % test_data.count()) # create model object model = Sequential() model.add(LSTM(128, activation="sigmoid", input_shape=(1, input_dim))) model.add(Dropout(0.2)) model.add(Dense(1)) metrics = ['MeanSquaredError', 'MeanAbsoluteError'] model.compile(loss='mean_squared_error', optimizer='adam', metrics=metrics) print(model.summary()) rdd = train_data.rdd.map(lambda x: (x[0].toArray().reshape(1, len(x[0])), x[1])) spark_model = SparkModel(model, frequency='epoch', mode='synchronous', metrics=metrics) start = time() spark_model.fit(rdd, epochs=1, batch_size=64, verbose=0, validation_split=0.1) fit_dt = time() - start print(f"Fit took: {fit_dt}") x_test = test_data.toPandas()['features'] x_test = np.asarray(test_data.rdd.map(lambda x: x[0].toArray()).collect()) x_test = x_test.reshape((x_test.shape[0], 1, x_test.shape[1])) y_test = test_data.toPandas()["Weighted_Price"].to_numpy() y_test = y_test.reshape((len(y_test), 1, 1))
class SparseGate(ModelFrame): def __init__(self, x_train, y_train, x_test, y_test, inputs, spark_context): ModelFrame.__init__(self, x_train, y_train, x_test, y_test, spark_context) self.gateModel = None self.inputs = inputs def gating_network(self): c1 = Conv2D(32, (3, 3), padding='same', kernel_regularizer=regularizers.l2(weight_decay), input_shape=self.x_train.shape[1:], name='gate1')(self.inputs) c2 = Activation('elu', name='gate2')(c1) c3 = BatchNormalization(name='gate3')(c2) c4 = Conv2D(32, (3, 3), padding='same', kernel_regularizer=regularizers.l2(weight_decay), name='gate4')(c3) c5 = Activation('elu', name='gate5')(c4) c6 = BatchNormalization(name='gate6')(c5) c7 = MaxPooling2D(pool_size=(2, 2), name='gate7')(c6) c8 = Dropout(0.2, name='gate26')(c7) c9 = Conv2D(32 * 2, (3, 3), name='gate8', padding='same', kernel_regularizer=regularizers.l2(weight_decay))(c8) c10 = Activation('elu', name='gate9')(c9) c11 = BatchNormalization(name='gate25')(c10) c12 = Conv2D(32 * 2, (3, 3), name='gate10', padding='same', kernel_regularizer=regularizers.l2(weight_decay))(c11) c13 = Activation('elu', name='gate11')(c12) c14 = BatchNormalization(name='gate12')(c13) c15 = MaxPooling2D(pool_size=(2, 2), name='gate13')(c14) c16 = Dropout(0.3, name='gate14')(c15) c25 = Flatten(name='gate23')(c16) c26 = Dense(5, name='gate24', activation='elu')(c25) model = Model(inputs=self.inputs, outputs=c26) return model def create_gate_model(self, expert_models): gate_network = self.gating_network() merged = Lambda(lambda x: K.tf.transpose( sum( K.tf.transpose(x[i]) * x[0][:, i - 1] for i in range( 1, len(x)))))([gate_network.layers[-1].output] + [m.layers[-1].output for m in expert_models]) b = Activation('softmax', name='gatex')(merged) model = Model(inputs=self.inputs, outputs=b) model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy']) return model def train_gate(self, datagen, weights_file): model = self.gateModel model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy']) print(model.summary()) self.gateModel = SparkModel(model, frequency='epoch', mode='asynchronous') score = self.gateModel.master_network.evaluate(self.x_test, self.y_test, verbose=2, batch_size=50) self.gateModel.fit(self.rdd, epochs=1, batch_size=50, verbose=1) self.gateModel = self.gateModel.master_network self.gateModel.save_weights(weights_file + '.hdf5') file = '../lib/output.txt' if os.path.exists(file): append_write = 'a' else: append_write = 'w' #score = self.gateModel.evaluate(self.x_test, self.y_test, verbose=2, batch_size=50) print("------------------------------") print("Score is:" + str(score[1])) print("-------------------------------") text_file = open(file, append_write) text_file.write("Score: %s" % score[1]) text_file.close() def load_gate_weights(self, model_old, weights_file='../lib/weights/moe_full.hdf5'): model_old.load_weights(weights_file) for l in self.gateModel.layers: for b in model_old.layers: if (l.name == b.name): l.set_weights(b.get_weights()) print("loaded gate layer " + str(l.name))
early_stop = EarlyStopping(monitor='val_loss', patience=4, verbose=1)

model = Sequential()
model.add(Dense(32, input_shape=(239, ), activation='tanh'))
model.add(Dense(1))
opt = Adam(lr=0.001)
model.compile(loss='mean_squared_error', optimizer=opt)
model.summary()

from elephas.utils.rdd_utils import to_simple_rdd
rdd = train_data.rdd

from elephas.spark_model import SparkModel
from elephas.optimizers import Adam as ElephasAdam  # avoid shadowing the Keras Adam used above

spark_model = SparkModel(model, frequency='epoch', mode='synchronous',
                         num_workers=4, elephas_optimizer=ElephasAdam())
spark_model.fit(rdd, epochs=20, batch_size=500, verbose=1, validation_split=0.1)
from elephas.mllib.adapter import to_vector, from_vector from pyspark import SparkContext, SparkConf # Create Spark context conf = SparkConf().setAppName('LSTM_Spark_MLP') sc = SparkContext(conf=conf) def deal_x(x): x = np.array(x) x_data = x[1:] x_data = np.expand_dims(x_data, axis=2) return x_data test_data = sc.textFile("output/data/z2.csv").map( lambda ln: deal_x([float(x) for x in ln.split(',')])) model = load_model('model.h5') adagrad = elephas_optimizers.Adagrad() spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='synchronous', num_workers=3) # Test Spark model spark_model.predict_classes(test_data, "output/data/prediction")
sc = SparkContext(conf=conf) sc.setLogLevel("ERROR") # Build RDD from numpy features and labels rdd = to_simple_rdd(sc, x_train, y_train) # Epoch Before Check Point num_epoch_in_one_step = 10 batch_size = 100 # Accuracy records stat_lines = [] adagrad = elephas_optimizers.Adagrad() for i in range(0, 200): # Train Spark model # Initialize SparkModel from Keras model and Spark context spark_model = SparkModel(sc, model, mode='asynchronous', frequency='epoch', num_workers=1, optimizer=adagrad) spark_model.train(rdd, nb_epoch=num_epoch_in_one_step, batch_size=batch_size, verbose=0, validation_split=0.1) score1 = model.evaluate(x_train, y_train, verbose=0) score2 = model.evaluate(x_test, y_test, verbose=0) print('#############################') print('Finished epochs', (i + 1) * num_epoch_in_one_step) print('Train accuracy:', score1[1]) print('Test accuracy:', score2[1]) print('#############################') stat_lines.append(
model.add(Activation('relu')) model.add(Dropout(0.2)) model.add(Dense(10)) model.add(Activation('softmax')) sgd = SGD(lr=0.1) # Build RDD from numpy features and labels rdd = to_simple_rdd(sc, x_train, y_train) # Initialize SparkModel from Keras model and Spark context adagrad = elephas_optimizers.Adagrad() spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='asynchronous', num_workers=2, master_optimizer=sgd) # Train Spark model spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=2, validation_split=0.1) # Evaluate Spark model by evaluating the underlying model score = spark_model.master_network.evaluate(x_test, y_test, verbose=2) print('Test accuracy:', score[1])
model.add(Dropout(0.2)) model.add(Dense(10)) model.add(Activation('softmax')) # Compile model sgd = SGD(lr=0.1) model.compile(loss='categorical_crossentropy', optimizer=sgd) # Build RDD from numpy features and labels rdd = to_simple_rdd(sc, x_train, y_train) # Initialize SparkModel from Keras model and Spark context adagrad = elephas_optimizers.Adagrad() spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='asynchronous', num_workers=2) # Train Spark model spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=2, validation_split=0.1) # Evaluate Spark model by evaluating the underlying model score = spark_model.master_network.evaluate(x_test, y_test, show_accuracy=True, verbose=2)
class KerasNeuralNetworkSpark(object): def __init__(self, layers, spark, batch_size=64, epoch=10, num_workers=2, predictionCol='prediction', labelCol='target', featuresCol='feature'): self._batch_size = batch_size self._epoch = epoch self._model = None self._spark = spark self._labels = labelCol self._features = featuresCol self._prediction = predictionCol self._layers = layers self._worker_num = num_workers self._build_model() def _build_model(self): model = Sequential() adam = elephas_optimizers.Adam() layers = self._layers model.add(Dense(layers[1], input_dim=layers[0], init='normal', activation='relu')) for i in range(2, len(layers) - 1): model.add(Dense(layers[i], activation='relu')) model.add(Dense(layers[-1], activation='sigmoid')) self._model = SparkModel(self._spark.sparkContext, model, optimizer=adam, frequency='epoch', mode='asynchronous', master_loss='mse', num_workers=self._worker_num) def fit(self, df): if hasattr(self._model, 'server'): self._model.server.terminate() pdf = df.toPandas() rdd = to_simple_rdd(self._spark.sparkContext, pdf[self._features], pdf[self._labels]) self._model.train(rdd, self._epoch, self._batch_size, 0, 0.1) def transform(self, df): pdf = df.toPandas() # df.write.save('test_df.parquet') pnparray = pdf[self._features].values container = np.zeros((pnparray.shape[0], len(pnparray[0]))) for i in range(pnparray.shape[0]): container[i, :] = pnparray[i][:] result = self._model.predict(container) pdf[self._prediction] = result # import pickle # with open('ann_result.p', 'w') as f: # pickle.dump(result, f) # result_df = pd.DataFrame(pdf new_df = self._spark.createDataFrame(pdf) # df.join(new_df) return new_df def stop_server(self): if hasattr(self._model, 'server') and hasattr(self._model.server, 'terminate'): self._model.server.terminate()
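# Hedged usage sketch (not from the original): constructing KerasNeuralNetworkSpark and
# running fit/transform on Spark DataFrames. The layer sizes and the train_df/score_df
# DataFrames are illustrative assumptions; by default the class reads a 'feature' column
# and a 'target' column and writes a 'prediction' column.
nn = KerasNeuralNetworkSpark(layers=[20, 64, 32, 1], spark=spark,
                             batch_size=64, epoch=10, num_workers=2,
                             featuresCol='feature', labelCol='target',
                             predictionCol='prediction')
nn.fit(train_df)                     # trains the underlying elephas SparkModel
scored_df = nn.transform(score_df)   # returns a DataFrame with the 'prediction' column added
nn.stop_server()                     # shut down the parameter server when done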
# output signal. Here the activation function is ReLU.
model.add(Activation('relu'))
model.add(Dropout(0.5))  # dropout is then applied

# finally, the 128 outputs of the previous FC layer are fully connected to nb_classes neurons,
# which are activated by a softmax function
model.add(Dense(nb_classes, W_regularizer=l2(0.01)))
model.add(Activation('softmax'))

# write the neural network model representation to a png image
# grapher.plot(model, 'nn_mnist.png')

model.compile(loss='categorical_crossentropy', optimizer='adadelta')
# model.compile(loss='categorical_crossentropy', optimizer='sgd' or 'adam' or 'adadelta')

## spark
conf = SparkConf().setAppName(APP_NAME)  # .setMaster(MASTER_IP)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc, model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
                  verbose=1, validation_split=0.15)  # num_workers might not work in early spark version

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
print('Test data : x')
print(type(x_test))
print(x_test)
print('Test data : y')
print(type(y_test))
print(y_test)

print('Converting training data to RDD')
rddataset = to_simple_rdd(sc, x_train, y_train)

print('Initializing Spark Model')
sgd = elephas_optimizers.SGD()
spark_model = SparkModel(sc, model, optimizer=sgd, frequency='epoch', mode='asynchronous', num_workers=2)

print('Commencing training')
spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
# model.fit(x_train, y_train, nb_epoch=5, batch_size=32)
print('Training completed')
sc.stop()
# Define elephas optimizer (which tells the model how to aggregate updates on the Spark master) adadelta = elephas_optimizers.Adadelta() from elephas.utils.rdd_utils import to_labeled_point from elephas.utils.rdd_utils import to_simple_rdd lp_rdd = to_simple_rdd(sc, features_train, labels_train) #print(lp_rdd.take(5)) from elephas.spark_model import SparkModel from elephas import optimizers as elephas_optimizers adagrad = elephas_optimizers.Adagrad() spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='asynchronous', num_workers=8) spark_model.train(lp_rdd, nb_epoch=20, batch_size=32, verbose=0, validation_split=0.1) print(spark_model) prediction = spark_model.predict_classes(features_test) print(prediction) truth = [l[1] for l in labels_test] from sklearn.metrics import confusion_matrix
model_config.input_split_index = encoder_train_input.shape[1] training_generator = WMTSequence(encoder_train_input, decoder_train_input, decoder_train_target, model_config) for raw_train_input, decoder_train_target in training_generator: encoder_train_input, decoder_train_input = raw_train_input train_input = np.hstack((encoder_train_input, decoder_train_input)) train_rdd = to_simple_rdd(sc, train_input, decoder_train_target) if args.ensemble: model = DistributedEnsembleSeq2Seq(model_config) else: model = DistributedSeq2Seq(model_config) spark_model = SparkModel(model.model, frequency='epoch', mode='synchronous', batch_size=args.batch_size, custom_objects={'EncoderSlice': EncoderSlice, 'DecoderSlice': DecoderSlice}) spark_model.fit(train_rdd, batch_size=model_config.batch_size, epochs=model_config.epochs, validation_split=0.0, verbose=1) model.evaluate(encoder_test_input, raw_test_target) else: training_generator = WMTSequence(encoder_train_input, decoder_train_input, decoder_train_target, model_config) validation_generator = WMTSequence(encoder_dev_input, decoder_dev_input, decoder_dev_target, model_config)
model.compile(loss='categorical_crossentropy', optimizer=SGD())
model.summary()

# Create a Resilient Distributed Dataset (RDD) from training data
# TODO: get data
# TODO: is it possible to separate training data into multiple batches?
rdd = to_simple_rdd(sc, X_train, Y_train)

# Create the Elephas model instance
spark_model = SparkModel(sc, model,
                         optimizer=elephas_optimizers.Adagrad(),
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=WORKERS)

# Train model
spark_model.train(rdd,
                  nb_epoch=EPOCHS,
                  batch_size=BATCH_SIZE,
                  verbose=False,
                  validation_split=VAL_SPLIT,
                  num_workers=WORKERS)
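# Hedged sketch (not from this snippet): the example above uses the older Elephas API
# (SparkModel(sc, model, optimizer=...) plus spark_model.train(..., nb_epoch=...)). With the
# newer API used by other snippets in this collection, the same run looks roughly like this;
# sc, model, X_train, Y_train and the EPOCHS/BATCH_SIZE/VAL_SPLIT/WORKERS constants are the
# names assumed above.
from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd

rdd = to_simple_rdd(sc, X_train, Y_train)
spark_model = SparkModel(model, frequency='epoch', mode='asynchronous', num_workers=WORKERS)
spark_model.fit(rdd, epochs=EPOCHS, batch_size=BATCH_SIZE,
                verbose=0, validation_split=VAL_SPLIT)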