def preparar_RDD(seq_len=0):
    from elephas.utils.rdd_utils import to_simple_rdd
    from os import rename as os_rename
    for nF in range(1, 99):  # 1,...,(n-1)
        fichtr = 'clicks_X_train_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv'
        if os_path_isfile(s_input_path + fichtr):
            print('Reading train+valid files ' + str(nF) + ' - numAds ' + str(seq_len) + '...')
            X_train = read_csv(s_input_path + 'clicks_X_train_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv', dtype=np_float64, header=None).values
            y_train = read_csv(s_input_path + 'clicks_y_train_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv', dtype=int, header=None).values
            X_valid = read_csv(s_input_path + 'clicks_X_valid_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv', dtype=np_float64, header=None).values
            y_valid = read_csv(s_input_path + 'clicks_y_valid_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv', dtype=int, header=None).values
            print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)
            X_train, y_train = mi_reshape(X_train, to_categorical(y_train), seq_len)
            X_valid, y_valid = mi_reshape(X_valid, to_categorical(y_valid), seq_len)
            X_train = np_concat((X_train, X_valid), axis=0)  # Include the validation set in the training set for Spark
            y_train = np_concat((y_train, y_valid), axis=0)  # Include the validation set in the training set for Spark
            print(X_train.shape, y_train.shape)
            print('Creating RDD (train+valid) ' + str(nF) + ' - numAds ' + str(seq_len) + '...')
            rdd_ini = to_simple_rdd(sc, X_train, y_train)
            # Convert each ndarray [i.e. array(...)] into a list [i.e. [...]]:
            rdd_lista = rdd_ini.map(lambda i: map(lambda s: s.tolist(), i))
            # And now save as text:
            rdd_lista.coalesce(numSparkWorkers, True).saveAsTextFile(s_spark_inputpath + 'clicks_train_seq' + str(seq_len) + '-' + str(nF) + '_rdd')  # Force saving in (at least) 4 chunks
            print('OK. RDD (train+valid) ' + str(nF) + ' - numAds ' + str(seq_len) + ' saved to HDFS.')
            os_rename(s_input_path + fichtr, s_input_path + 'ok_en_hdfs/' + 'clicks_X_train_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv')
def test_spark_model_end_to_end(spark_context):
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # sync epoch
    spark_model = SparkModel(model, frequency='epoch', mode='synchronous', num_workers=2)
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.1)
    score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
    print('Test accuracy:', score[1])

    # sync batch
    spark_model = SparkModel(model, frequency='batch', mode='synchronous', num_workers=2)
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.1)
    score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
    print('Test accuracy:', score[1])

    # async epoch
    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.1)
    score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
    print('Test accuracy:', score[1])

    # hogwild epoch
    spark_model = SparkModel(model, frequency='epoch', mode='hogwild')
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.1)
    score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
    print('Test accuracy:', score[1])
def main():
    gps_files = glob.glob('../data/prototype/**/gps_points.csv')
    trip_files = glob.glob('../data/prototype/**/gps_trips.csv')

    file_results = process_file(trip_file=trip_files[0], gps_file=gps_files[0])
    seq_results = build_seq(input_df=file_results['df'], unique_trips=file_results['unique_trips'])
    X = seq_results['x']
    y = seq_results['y']

    print('Building training data from files...')
    for i in range(1, len(gps_files)):
        file_results = process_file(trip_file=trip_files[i], gps_file=gps_files[i])
        seq_results = build_seq(input_df=file_results['df'], unique_trips=file_results['unique_trips'])
        X = np.vstack((X, seq_results['x']))
        y = np.vstack((y, seq_results['y']))

    x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=1, train_size=0.8)

    rdd = to_simple_rdd(sc, x_train, y_train)

    model = build_model()
    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
    spark_model.fit(rdd, epochs=5, batch_size=32, verbose=0, validation_split=0.1)
    # model.fit(x_train, y_train, epochs=5, validation_data=(x_val, y_val))

    y_pred = spark_model.predict(x_val)
    acc = sum(np.argmax(y_pred, axis=1) == np.argmax(y_val, axis=1)) / y_pred.shape[0]
    print("Validation Accuracy: {number:.{digits}f}%".format(number=(acc * 100), digits=2))
def fit(self, df):
    if hasattr(self._model, 'server'):
        self._model.server.terminate()
    pdf = df.toPandas()
    rdd = to_simple_rdd(self._spark.sparkContext, pdf[self._features], pdf[self._labels])
    self._model.train(rdd, self._epoch, self._batch_size, 0, 0.1)
def predictMain(modelName, sc):
    timeSteps = 30  # Number of past values used for training
    print "Going to Initialize the LSTM model"
    SMARTparameters = getSMARTParameters()
    print("The following are the SMART parameters:", SMARTparameters)
    lstm = ls.cloudLSTM(timeSteps=timeSteps, parms=SMARTparameters)  # Initializing the disk-prediction (LSTM) model
    print "Initialized the Model"
    lstmModel = lstm.get_LSTM_Model()  # Obtaining the LSTM model for initializing the SparkModel class
    trainSize = 0.2  # Fraction of input used for training
    acc = 0.0  # Model accuracy

    inputFilePath = os.environ.get('DATA_FILE_PATH')  # Get the input CSV file path from the environment
    year = sys.argv[1]   # Get the year from the command-line arguments
    month = sys.argv[2]  # Get the month from the command-line arguments
    inputFilePath = inputFilePath + str(year) + "/" + str(year) + "-" + str(month) + "*.csv"  # e.g. "/home/user/Desktop/Cloud/Test/2014/2014-11*.csv"
    print("InputPath", inputFilePath)
    rd.generate_DataFrame(inputFilePath, SMARTparameters)

    inputCSVFilePath = os.environ.get('MODEL_CSV_FILEPATH') + str(modelName) + ".csv"  # e.g. "/hadoop/elephas/Output/ST4000DM000.csv"
    modelFeatures = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=SMARTparameters)
    modelLabel = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=['failure'])

    # Removing NaN values from the input dataframes
    modelFeatures = modelFeatures.fillna(0)
    modelLabel = modelLabel.fillna(0)

    # Obtaining 3D training and testing vectors
    (feature_train, label_train), (feature_test, label_test) = lstm.train_test_split(modelFeatures, modelLabel, trainSize, timeSteps)

    # Check whether failure cases exist in the data
    if len(feature_train) == 0:
        print("DiskModel has no failure elements. Training of the model cannot proceed!!")
        return

    # Initializing the Adam optimizer for Elephas
    adam = elephas_optimizers.Adam()
    print "Adam Optimizer initialized"

    # Converting the training data to a Spark RDD
    rddataset = to_simple_rdd(sc, feature_train, label_train)
    print "Training data converted into Resilient Distributed Dataset"

    # Initializing the SparkModel with optimizer, master-worker mode and number of workers
    spark_model = SparkModel(sc, lstmModel, optimizer=adam, frequency='epoch', mode='asynchronous', num_workers=2)
    print "Spark Model Initialized"

    # Initial training run of the model
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
    # Saving the model
    score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)
    while score <= 0.5:
        # Training the input data set
        spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
        print "LSTM model training done !!"
        score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    print "Saving weights!!"
    outFilePath = os.environ.get('GATOR_SQUAD_HOME')
    outFilePath = outFilePath + "Weights/" + str(year) + "/" + str(month) + "/" + str(modelName) + "_my_model_weights.h5"
    spark_model.save_weights(outFilePath)

    print "LSTM model testing commencing !!"
    predicted1 = spark_model.predict_classes(feature_test)
    df_confusion = pd.crosstab(label_test.flatten(), predicted1.flatten(), rownames=['Actual'], colnames=['Predicted'], margins=True)
    print df_confusion
def __init__(self, x_train, y_train, x_test, y_test, spark_context=None):
    self.x_train = x_train
    self.y_train = y_train
    self.x_test = x_test
    self.y_test = y_test
    if spark_context is not None:
        self.rdd = to_simple_rdd(spark_context, x_train, y_train)
    else:
        self.rdd = None
def test_to_simple_rdd(spark_context):
    features = np.ones((5, 10))
    labels = np.ones((5,))
    rdd = rdd_utils.to_simple_rdd(spark_context, features, labels)
    assert rdd.count() == 5
    first = rdd.first()
    assert first[0].shape == (10,)
    assert first[1] == 1.0
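# The test above pins down what to_simple_rdd returns: an RDD with one
# (feature_row, label) pair per sample. A hand-rolled sketch of the same
# structure (for illustration only, not the library's implementation),
# reusing `spark_context`, `features` and `labels` from the test:
pairs = list(zip(features, labels))      # one (ndarray of shape (10,), 1.0) tuple per sample
rdd = spark_context.parallelize(pairs)   # same layout of data as to_simple_rdd's result
assert rdd.count() == 5
assert rdd.first()[0].shape == (10,)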
def test_training_classification(spark_context, mode, parameter_server_mode, mnist_data, classification_model):
    # Define basic parameters
    batch_size = 64
    epochs = 10

    # Load data
    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]

    sgd = SGD(lr=0.1)
    classification_model.compile(sgd, 'categorical_crossentropy', ['acc'])

    # Build RDD from numpy features and labels
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Initialize SparkModel from keras model and Spark context
    spark_model = SparkModel(classification_model, frequency='epoch', mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=4000 + random.randint(0, 500))

    # Train Spark model
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.1)

    # Run inference on trained spark model
    predictions = spark_model.predict(x_test)
    # Run evaluation on trained spark model
    evals = spark_model.evaluate(x_test, y_test)

    # Assert we can supply an RDD and get the same prediction results as when supplying a numpy array
    test_rdd = spark_context.parallelize(x_test)
    assert [np.argmax(x) for x in predictions] == [np.argmax(x) for x in spark_model.predict(test_rdd)]

    # Assert we get the same prediction result when calling predict on the keras model directly
    assert [np.argmax(x) for x in predictions] == [np.argmax(x) for x in spark_model.master_network.predict(x_test)]

    # Assert we get the same evaluation results when calling evaluate on the keras model directly
    assert isclose(evals[0], spark_model.master_network.evaluate(x_test, y_test)[0], abs_tol=0.01)
    assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=0.01)
def test_sync_mode(spark_context):
    # Define basic parameters
    batch_size = 64
    nb_classes = 10
    epochs = 10

    # Load data
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(60000, 784)
    x_test = x_test.reshape(10000, 784)
    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    model = Sequential()
    model.add(Dense(128, input_dim=784))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10))
    model.add(Activation('softmax'))

    sgd = SGD(lr=0.1)
    model.compile(sgd, 'categorical_crossentropy', ['acc'])

    # Build RDD from numpy features and labels
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Initialize SparkModel from Keras model and Spark context
    spark_model = SparkModel(model, mode='synchronous')

    # Train Spark model
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.1)

    # Evaluate Spark model by evaluating the underlying model
    score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
    assert score[1] >= 0.70
def test_training_regression(spark_context, mode, parameter_server_mode, boston_housing_dataset, regression_model):
    x_train, y_train, x_test, y_test = boston_housing_dataset
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Define basic parameters
    batch_size = 64
    epochs = 10
    sgd = SGD(lr=0.0000001)

    regression_model.compile(sgd, 'mse', ['mae'])
    spark_model = SparkModel(regression_model, frequency='epoch', mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=4000 + random.randint(0, 500))

    # Train Spark model
    spark_model.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.1)

    # Run inference on trained spark model
    predictions = spark_model.predict(x_test)
    # Run evaluation on trained spark model
    evals = spark_model.evaluate(x_test, y_test)

    # Assert we can supply an RDD and get the same prediction results as when supplying a numpy array
    test_rdd = spark_context.parallelize(x_test)
    assert all(np.isclose(x, y, 0.01) for x, y in zip(predictions, spark_model.predict(test_rdd)))

    # Assert we get the same prediction result when calling predict on the keras model directly
    assert all(np.isclose(x, y, 0.01) for x, y in zip(predictions, spark_model.master_network.predict(x_test)))

    # Assert we get the same evaluation results when calling evaluate on the keras model directly
    assert isclose(evals[0], spark_model.master_network.evaluate(x_test, y_test)[0], abs_tol=0.01)
    assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=0.01)
def dist_training(n_iter):
    sbcnn = SBCNN_Model(field_size, bands, frames, num_channels, num_labels)
    sgd = SGD(lr=0.001, momentum=0.0, decay=0.0, nesterov=False)
    sbcnn.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=sgd)

    train_arr, train_labels_arr, test_arr, test_labels_arr = get_data()

    rdd = to_simple_rdd(sc, train_arr, train_labels_arr)
    spark_model = SparkModel(sbcnn, frequency='epoch', mode='asynchronous')
    spark_model.fit(rdd, epochs=n_iter, batch_size=32, verbose=0, validation_split=0.1)

    score = spark_model.master_network.evaluate(test_arr, test_labels_arr, verbose=2)
    print('Test accuracy:', score[1])
def train_elephas_model(x, y):
    model = models.Sequential()
    sgd = optimizers.Adam(lr=0.01)

    # Input layer
    model.add(Dense(256, activation="relu", input_shape=(x.shape[1],)))
    model.add(Dropout(0.05))
    model.add(Dense(256, activation="relu", input_shape=(x.shape[1],)))
    model.add(Dropout(0.05))

    # Output layer
    model.add(Dense(1))

    model.compile(optimizer=sgd, loss="mse", metrics=["mse"])
    model.summary()

    rdd = to_simple_rdd(sc, x, y)

    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
    # spark_model.fit(rdd, epochs=10, batch_size=64, verbose=1, validation_split=0.2)
    spark_model.fit(rdd, epochs=25, batch_size=64, verbose=1, validation_split=0.2)

    return spark_model
print('Training data : x')
print(type(x_train))
print(x_train)
print('Training data : y')
print(type(y_train))
print(y_train)
print('Test data : x')
print(type(x_test))
print(x_test)
print('Test data : y')
print(type(y_test))
print(y_test)

print('Converting training data to RDD')
rddataset = to_simple_rdd(sc, x_train, y_train)

print('Initializing Spark Model')
sgd = elephas_optimizers.SGD()
spark_model = SparkModel(sc, model, optimizer=sgd, frequency='epoch', mode='asynchronous', num_workers=2)

print('Commencing training')
spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1,
y_test = np_utils.to_categorical(y_test, nb_classes)

model = Sequential()
model.add(Dense(128, input_dim=784))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

sgd = SGD(lr=0.1)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='asynchronous',
                         num_workers=2, master_optimizer=sgd)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
from pyspark.sql.window import Window

for i in range(2, 3):
    col1 = "Price%d" % (i - 1)
    col2 = "Price%d" % i
    w = Window().partitionBy().orderBy(col("Date_Time"))
    FinalDf = FinalDf.select("*", lag(col1).over(w).alias(col2)).na.drop()

FinalDf.show()
FinalDf = FinalDf.selectExpr('Date_Time', 'Sentiment_score', 'Price2 as Input_price', 'Price1 as Price')

# Reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_hours, n_features))
test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))

rdd = to_simple_rdd(sc, train_X, train_y)
rdd.count()

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
# from keras.models import model_from_yaml, slice_X
from keras.utils import np_utils
from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd
from elephas import optimizers as elephas_optimizers

model = Sequential()
model.add(LSTM(5, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
print("Creating Training and Test Data") ((x_train, y_train), (x_test, y_test)) = train_test_split(testinput.fillna(0), testoutput.fillna(0), test_size=0.3) print("Training data : x") print(type(x_train)) print(x_train) print("Training data : y") print(type(y_train)) print(y_train) print("Test data : x") print(type(x_test)) print(x_test) print("Test data : y") print(type(y_test)) print(y_test) print("Converting training data to RDD") rddataset = to_simple_rdd(sc, x_train, y_train) print("Initializing SPark Model") sgd = elephas_optimizers.SGD() spark_model = SparkModel(sc, model, optimizer=sgd, frequency="epoch", mode="asynchronous", num_workers=2) print("Commencing training") spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0) # model.fit(x_train, y_train, nb_epoch=5, batch_size=32) print("Training completed") sc.stop()
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adadelta')

## Spark
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER_IP)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc, model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=24)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
model = Sequential()
model.add(Dense(128, input_dim=784))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

# Compile model
sgd = SGD(lr=0.1)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='asynchronous', num_workers=2)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=2,
model = Sequential()
model.add(Dense(784, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 10))
model.add(Activation('softmax'))

# Compile model
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc, model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=8)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
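# The example above uses the legacy elephas interface: SparkModel(sc, model, ...),
# spark_model.train(...) and get_network(). Other snippets in this collection
# (e.g. the pytest examples) use the current interface, where the SparkContext is
# only needed to build the RDD. A minimal sketch of the equivalent run under that
# newer API, assuming the same compiled `model`, `sc`, `X_train`/`Y_train` and
# `X_test`/`Y_test` as above (and a Keras/TensorFlow version supported by recent elephas):
from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd

rdd = to_simple_rdd(sc, X_train, Y_train)
spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
spark_model.fit(rdd, epochs=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1)
score = spark_model.master_network.evaluate(X_test, Y_test, verbose=2)
print('Test accuracy:', score[1])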
# # Normalize the dataset
# scaler = MinMaxScaler(feature_range=(0, 1))
# inputdata = scaler.fit_transform(inputdata)

# Split into train and test sets
train_size = int(len(inputdata) * 0.8)
test_size = len(inputdata) - train_size
train, test = inputdata[0:train_size, :], inputdata[train_size:len(inputdata), :]

# Reshape into X=t and Y=t+1
look_back = 2
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

# Reshape input to be [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

rdd = to_simple_rdd(spark.sparkContext, trainX, trainY)

# Create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=300, batch_size=1, verbose=2)

# adam = elephas_optimizers.Adam()
#
# spark_model = SparkModel(spark.sparkContext, model, optimizer=adam, frequency='epoch', num_workers=2)
# spark_model.train(rdd, nb_epoch=50, batch_size=4, verbose=2, validation_split=0.1)

# Make predictions
labels = []
features = []
for message in consumer:
    # print(message.value)
    labels.append(message.value["label"])
    features.append(message.value["features"]["values"])

# Convert the collected lists to numpy arrays before building the RDD
features = np.array(features)
labels = np.array(labels)

model = Sequential()
model.add(Dense(2, input_dim=11))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer=SGD())

lp_rdd = to_simple_rdd(sc, features, labels, categorical=True)

spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
spark_model.fit(lp_rdd, epochs=20, batch_size=32, verbose=0, validation_split=0.1)
spark_model.save("model.h5")
y_train = transformer.fit_transform(train['Target'].values.reshape(-1, 1))
del train['Target']
y_test = transformer.transform(test['Target'].values.reshape(-1, 1))
del test['Target']

model = Sequential()
model.add(Dense(18, input_dim=26))
model.add(Activation('sigmoid'))
model.add(Dense(6))
model.add(Activation('sigmoid'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

spark = SparkSession.builder.appName('ElephasTest').getOrCreate()
rdd = to_simple_rdd(spark.sparkContext, train, y_train)

sgd = SGD(lr=0.1)
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(spark.sparkContext, model, optimizer=adagrad, frequency='epoch', mode='asynchronous',
                         master_loss='mse', num_workers=2, master_optimizer=sgd)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=2, validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
if args.single_threaded_worker:
    conf = SparkConf().setAppName('tardis').setMaster('local')
else:
    conf = SparkConf().setAppName('tardis').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf=conf)

generator_config = deepcopy(args)
generator_config.batch_size = 1024
generator_config.target_vocab = target_vocab
model_config.input_split_index = encoder_train_input.shape[1]

training_generator = WMTSequence(encoder_train_input, decoder_train_input, decoder_train_target, model_config)

for raw_train_input, decoder_train_target in training_generator:
    encoder_train_input, decoder_train_input = raw_train_input
    train_input = np.hstack((encoder_train_input, decoder_train_input))
    train_rdd = to_simple_rdd(sc, train_input, decoder_train_target)

    if args.ensemble:
        model = DistributedEnsembleSeq2Seq(model_config)
    else:
        model = DistributedSeq2Seq(model_config)

    spark_model = SparkModel(model.model, frequency='epoch', mode='synchronous', batch_size=args.batch_size,
                             custom_objects={'EncoderSlice': EncoderSlice, 'DecoderSlice': DecoderSlice})
    spark_model.fit(train_rdd, batch_size=model_config.batch_size, epochs=model_config.epochs,
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(2))
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Define elephas optimizer (which tells the model how to aggregate updates on the Spark master)
adadelta = elephas_optimizers.Adadelta()

from elephas.utils.rdd_utils import to_labeled_point
from elephas.utils.rdd_utils import to_simple_rdd

lp_rdd = to_simple_rdd(sc, features_train, labels_train)
# print(lp_rdd.take(5))

from elephas.spark_model import SparkModel
from elephas import optimizers as elephas_optimizers

adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='asynchronous', num_workers=8)
spark_model.train(lp_rdd, nb_epoch=20,