def train(self, dataframe, shuffle=False): """Applies model averaging to the model replicas distributed over the specified number of Spark executors. # Arguments dataframe: dataframe: A Spark Dataframe containing the training data. shuffle: boolean. Tells to shuffle the dataframe before training. Warning: this will tell Spark to shuffle all partitions over the network. It is recommended to shuffle the dataframe before training and store it. """ # Repartition the data in order to fit the number of workers. num_partitions = dataframe.rdd.getNumPartitions() # Check if the dataframe needs to be shuffled. if shuffle: dataframe = shuffle(dataframe) # Check if we need to repartition the dataframe. if num_partitions >= self.num_workers: dataframe = dataframe.coalesce(self.num_workers) else: dataframe = dataframe.repartition(self.num_workers) # Start the training procedure. self.record_training_start() for i in range(0, self.num_epoch): worker = self.allocate_worker() # Set the maximum number of mini-batches. worker.set_max_prefetch(self.max_mini_batches_prefetch) models = dataframe.rdd.mapPartitionsWithIndex( worker.train).collect() self.average_models(models) # End the training procedure. self.record_training_end() return deserialize_keras_model(self.master_model)
def train(self, dataframe, shuffle=False): """See kerasonspark.trainers.Trainer.train # Arguments dataframe: dataframe. A Spark Dataframe containing the training data. shuffle: boolean. Tells to shuffle the dataframe before training. Warning: this will tell Spark to shuffle all partitions over the network. It is recommended to shuffle the dataframe before training and store it. """ # Check if the data needs to be shuffled. if shuffle: dataframe = shuffle(dataframe) # Collect the dataframe on a single worker node. dataframe = dataframe.coalesce(1) # Cache the dataframe. dataframe.cache() # Allocate a worker. worker = self.allocate_worker() # Set the maximum number of mini-batches. worker.set_max_prefetch(self.max_mini_batches_prefetch) # Start recording training time. self.record_training_start() # Fetch the trained model. self.master_model = dataframe.rdd.mapPartitionsWithIndex( worker.train).collect()[0] # Stop recording of training time. self.record_training_end() return deserialize_keras_model(self.master_model)
def destroy_remote_job(self):
    # Ask the remote job server to tear down the job and return its results.
    address = self.address + '/api/destroy?secret=' + self.secret
    request = urllib2.Request(address)
    response = urllib2.urlopen(request)
    data = json.load(response)
    # The model and the training history arrive as hex-encoded pickles.
    model = unpickle_object(data['model'].decode('hex_codec'))
    self.trained_model = deserialize_keras_model(model)
    self.history = unpickle_object(data['history'].decode('hex_codec'))
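# The method above uses Python 2 idioms (urllib2 and the 'hex_codec' codec).
# Below is a small, self-contained sketch of the same hex-encoded-pickle
# transport in Python 3; the payload dict is illustrative only, while
# bytes.fromhex is the standard-library counterpart of .decode('hex_codec').
import pickle

payload = {'weights': [1.0, 2.0, 3.0]}
# What would travel inside the JSON response returned by the job server.
hex_string = pickle.dumps(payload).hex()
# Decode and unpickle on the client side.
restored = pickle.loads(bytes.fromhex(hex_string))
assert restored == payload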
def average_models(self, models):
    """Averages the specified list of Keras models, and assigns the averaged
    model as the master model.

    # Arguments
        models: list. A list of serialized Keras models.
    """
    num_models = len(models)
    # Get all weights of the models.
    for i in range(0, num_models):
        weights = np.asarray(deserialize_keras_model(models[i]).get_weights())
        self.parameter_buffer += weights
    # Average the parameters.
    self.parameter_buffer /= num_models
    # Assign the averaged weights to the master model.
    temp_model = deserialize_keras_model(self.master_model)
    temp_model.set_weights(self.parameter_buffer)
    self.master_model = serialize_keras_model(temp_model)
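# A self-contained numpy sketch of the layer-wise accumulate-and-divide
# averaging performed by average_models(); the two 'replica' weight lists
# below (a 2x2 kernel and a length-2 bias) hold illustrative values only.
import numpy as np

replicas = [
    [np.array([[1.0, 2.0], [3.0, 4.0]]), np.array([0.5, 0.5])],
    [np.array([[3.0, 2.0], [1.0, 0.0]]), np.array([1.5, 2.5])],
]

# Accumulate layer by layer, then divide by the number of replicas.
buffer = [np.zeros_like(w) for w in replicas[0]]
for weights in replicas:
    buffer = [acc + w for acc, w in zip(buffer, weights)]
averaged = [acc / len(replicas) for acc in buffer]

print(averaged[0])  # kernel average: every entry equals 2.0
print(averaged[1])  # bias average: [1.0, 1.5]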
def _predict(self, iterator):
    """Lambda method which will append a prediction column to the provided rows.

    # Arguments
        iterator: iterator. Spark Row iterator.
    """
    model = deserialize_keras_model(self.model)
    for row in iterator:
        # Collect the feature columns of the row into model inputs.
        features = [np.asarray([row[c]]) for c in self.features_column]
        prediction = model.predict(features)
        # Wrap the prediction in a dense vector and append it to the row.
        dense_prediction = DenseVector(prediction[0])
        new_row = new_dataframe_row(row, self.output_column, dense_prediction)
        yield new_row
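# new_dataframe_row is not shown in this section; the sketch below is an
# assumption about how such a helper can append one field to a pyspark Row,
# not the library's actual implementation.
from pyspark.sql import Row

def new_dataframe_row_sketch(old_row, column_name, value):
    # Rebuild the Row with the extra column appended.
    fields = old_row.asDict()
    fields[column_name] = value
    return Row(**fields)

# Example: append a prediction to a feature row.
row = Row(features=[0.1, 0.2], label=1)
print(new_dataframe_row_sketch(row, 'prediction', 0.87))
# Row(features=[0.1, 0.2], label=1, prediction=0.87)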
def prepare_model(self):
    """Prepares the model for training."""
    # Set the Keras directory.
    set_keras_base_directory()
    if K.backend() == 'tensorflow':
        # Set GPU option allow_growth to False for GPU-enabled TensorFlow.
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = False
        sess = tf.compat.v1.Session(config=config)
        K.set_session(sess)

    # Deserialize the Keras model and optimizer.
    self.model = deserialize_keras_model(self.model)
    self.optimizer = deserialize(self.optimizer)
    # Compile the model with the specified loss and optimizer.
    self.model.compile(loss=self.loss,
                       loss_weights=self.loss_weights,
                       optimizer=self.optimizer,
                       metrics=self.metrics)
def read_trained_model(self):
    # Read the pickled model that the remote job wrote under ~/models/<secret>.
    home = expanduser("~")
    with open(home + "/models/" + self.secret, "r") as f:
        self.trained_model = deserialize_keras_model(unpickle_object(f.read()))
def __init__(self, model):
    # Deserialize the Keras model so updates can be applied to it directly.
    self.model = deserialize_keras_model(model)
    # Counter tracking the number of model updates.
    self.num_updates = 1
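# The serialize_keras_model / deserialize_keras_model helpers used throughout
# these methods are not shown here. The pair below is a hedged sketch of one
# common way to implement them, via the model's JSON architecture plus its
# weight arrays; it is an assumption, not the library's actual code.
import pickle
from tensorflow.keras.models import model_from_json

def serialize_keras_model_sketch(model):
    # Capture the architecture as JSON and the weights as numpy arrays, then
    # pickle the pair so it can be shipped to Spark workers.
    return pickle.dumps({'architecture': model.to_json(),
                         'weights': model.get_weights()})

def deserialize_keras_model_sketch(blob):
    # Rebuild the model from its JSON architecture and restore its weights.
    state = pickle.loads(blob)
    model = model_from_json(state['architecture'])
    model.set_weights(state['weights'])
    return model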