Example No. 1
    def train(self, dataframe, shuffle=False):
        """Applies model averaging to the model replicas distributed over the specified
        number of Spark executors.

        # Arguments
            dataframe: dataframe. A Spark DataFrame containing the training data.
            shuffle: boolean. Whether to shuffle the dataframe before training.
                     Warning: this tells Spark to shuffle all partitions over
                     the network. It is therefore recommended to shuffle the
                     dataframe once beforehand and persist the result.
        """
        # Determine the current number of partitions in the dataframe.
        num_partitions = dataframe.rdd.getNumPartitions()
        # Shuffle the dataframe if requested. Note: the boolean `shuffle`
        # parameter shadows the shuffle() utility function, so that utility
        # must be imported under an alias (assumed here as shuffle_dataframe).
        if shuffle:
            dataframe = shuffle_dataframe(dataframe)
        # Repartition the dataframe to match the number of workers. Coalescing
        # avoids a full shuffle when reducing the partition count; a full
        # repartition is only needed to increase it.
        if num_partitions >= self.num_workers:
            dataframe = dataframe.coalesce(self.num_workers)
        else:
            dataframe = dataframe.repartition(self.num_workers)
        # Start the training procedure.
        self.record_training_start()
        for _ in range(self.num_epoch):
            worker = self.allocate_worker()
            # Set the maximum number of mini-batches.
            worker.set_max_prefetch(self.max_mini_batches_prefetch)
            models = dataframe.rdd.mapPartitionsWithIndex(
                worker.train).collect()
            self.average_models(models)
        # End the training procedure.
        self.record_training_end()

        return deserialize_keras_model(self.master_model)
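
A hedged usage sketch: the trainer class name (AveragingTrainer) and its
constructor arguments below are assumptions modeled on the attributes the
method uses (num_workers, num_epoch); they are not confirmed by the snippet.

    # Hypothetical invocation; the constructor signature is an assumption.
    trainer = AveragingTrainer(keras_model=model, num_workers=4, num_epoch=10)
    trained_model = trainer.train(training_dataframe, shuffle=False)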
Example No. 2
    def train(self, dataframe, shuffle=False):
        """See kerasonspark.trainers.Trainer.train

        # Arguments
            dataframe: dataframe. A Spark DataFrame containing the training data.
            shuffle: boolean. Whether to shuffle the dataframe before training.
                     Warning: this tells Spark to shuffle all partitions over
                     the network. It is therefore recommended to shuffle the
                     dataframe once beforehand and persist the result.
        """
        # Shuffle the dataframe if requested. As in the previous example, the
        # boolean `shuffle` parameter shadows the shuffle() utility, which is
        # assumed to be imported under the alias shuffle_dataframe.
        if shuffle:
            dataframe = shuffle_dataframe(dataframe)
        # Coalesce the dataframe into a single partition, so the complete
        # dataset is handled by one worker.
        dataframe = dataframe.coalesce(1)
        # Cache the dataframe.
        dataframe.cache()
        # Allocate a worker.
        worker = self.allocate_worker()
        # Set the maximum number of mini-batches.
        worker.set_max_prefetch(self.max_mini_batches_prefetch)
        # Start recording training time.
        self.record_training_start()
        # Fetch the trained model.
        self.master_model = dataframe.rdd.mapPartitionsWithIndex(
            worker.train).collect()[0]
        # Stop recording of training time.
        self.record_training_end()

        return deserialize_keras_model(self.master_model)
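
The coalesce(1) call is what makes this a single-machine trainer: it merges
all partitions into one without a network shuffle, so the entire dataset is
consumed by a single worker.train call. A minimal PySpark sketch of that
behavior:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master('local[4]').getOrCreate()
    df = spark.range(1000)  # toy dataframe spread over several partitions
    print(df.rdd.getNumPartitions())              # e.g. 4
    print(df.coalesce(1).rdd.getNumPartitions())  # 1: everything in one partition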
Example No. 3
    def destroy_remote_job(self):
        """Destroys the remote job and fetches the trained model and history.

        Note: this snippet targets Python 2; urllib2 and str.decode('hex_codec')
        are replaced by urllib.request and bytes.fromhex in Python 3 (see below).
        """
        # Ask the remote service to tear down the job, authenticating
        # with the shared secret.
        address = self.address + '/api/destroy?secret=' + self.secret
        request = urllib2.Request(address)
        response = urllib2.urlopen(request)
        # The response is JSON carrying hex-encoded, pickled payloads.
        data = json.load(response)
        model = unpickle_object(data['model'].decode('hex_codec'))
        self.trained_model = deserialize_keras_model(model)
        self.history = unpickle_object(data['history'].decode('hex_codec'))
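
For reference, a minimal Python 3 sketch of the same round trip, assuming the
helpers unpickle_object and deserialize_keras_model from the snippets above
are available; urllib2 and str.decode('hex_codec') are replaced by their
standard-library successors:

    import json
    import urllib.request

    def fetch_and_destroy(address, secret):
        # Hypothetical free-function port of destroy_remote_job above.
        url = address + '/api/destroy?secret=' + secret
        with urllib.request.urlopen(url) as response:
            data = json.load(response)
        # bytes.fromhex is the Python 3 replacement for decode('hex_codec').
        model = unpickle_object(bytes.fromhex(data['model']))
        history = unpickle_object(bytes.fromhex(data['history']))
        return deserialize_keras_model(model), history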
Example No. 4
    def average_models(self, models):
        """Averages the specified list of Keras models, and assigns the
        averaged model as the master model.

        # Arguments
            models: list. A list of serialized Keras models.
        """
        num_models = len(models)
        # Sum the weights of all replicas. This assumes self.parameter_buffer
        # was zero-initialized elsewhere with the same shape as the model's
        # weight arrays.
        for model in models:
            weights = np.asarray(deserialize_keras_model(model).get_weights())
            self.parameter_buffer += weights
        # Average the parameters.
        self.parameter_buffer /= num_models
        temp_model = deserialize_keras_model(self.master_model)
        temp_model.set_weights(self.parameter_buffer)
        self.master_model = serialize_keras_model(temp_model)
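
The averaging step itself is plain NumPy arithmetic. A self-contained sketch
of the same idea, operating directly on deserialized Keras models, could look
like this (average_weights is a hypothetical helper, not part of the library):

    import numpy as np

    def average_weights(models):
        # Element-wise average of the weight tensors of models that
        # share the same architecture.
        weight_sets = [model.get_weights() for model in models]
        return [np.mean(layer, axis=0) for layer in zip(*weight_sets)]

Averaging layer by layer also sidesteps building one ragged object array out
of differently shaped weight tensors, which recent NumPy versions reject.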
Example No. 5
    def _predict(self, iterator):
        """Lambda method which will append a prediction column to the provided rows.

        # Arguments:
            iterator: iterator. Spark Row iterator.
        """
        model = deserialize_keras_model(self.model)
        for row in iterator:
            # Build a single-sample input array for every feature column.
            features = [np.asarray([row[c]]) for c in self.features_column]
            prediction = model.predict(features)
            dense_prediction = DenseVector(prediction[0])
            new_row = new_dataframe_row(row, self.output_column,
                                        dense_prediction)
            yield new_row
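
A hedged usage sketch: an object exposing this _predict method would
typically be applied with mapPartitions. The Predictor class name and its
constructor arguments are assumptions matching the attributes _predict uses:

    # Hypothetical usage; Predictor is assumed to carry the attributes
    # model, features_column and output_column referenced in _predict.
    predictor = Predictor(model=serialized_model,
                          features_column=['feature_0', 'feature_1'],
                          output_column='prediction')
    predicted_rdd = dataframe.rdd.mapPartitions(predictor._predict)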
Example No. 6
    def prepare_model(self):
        """Prepares the model for training."""
        # Set the Keras directory.
        set_keras_base_directory()
        if K.backend() == 'tensorflow':
            # Disable the allow_growth GPU option for GPU-enabled TensorFlow,
            # claiming the full GPU memory up front.
            config = tf.compat.v1.ConfigProto()
            config.gpu_options.allow_growth = False
            # Use the compat.v1 session so it accepts the compat.v1 ConfigProto.
            sess = tf.compat.v1.Session(config=config)
            K.set_session(sess)

        # Deserialize the Keras model.
        self.model = deserialize_keras_model(self.model)
        self.optimizer = deserialize(self.optimizer)
        # Compile the model with the specified loss and optimizer.
        self.model.compile(loss=self.loss,
                           loss_weights=self.loss_weights,
                           optimizer=self.optimizer,
                           metrics=self.metrics)
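
The deserialize call on the optimizer is presumably keras.optimizers.deserialize,
which rebuilds an optimizer instance from its config dictionary; a small
round-trip sketch under that assumption:

    from keras.optimizers import Adam, deserialize, serialize

    config = serialize(Adam(lr=0.001))  # {'class_name': 'Adam', 'config': {...}}
    optimizer = deserialize(config)     # rebuilds an equivalent Adam instance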
Example No. 7
    def read_trained_model(self):
        """Reads the pickled, trained model from the user's home directory."""
        home = expanduser("~")
        # The stored object is pickled binary data, so read in binary mode.
        with open(home + "/models/" + self.secret, "rb") as f:
            self.trained_model = deserialize_keras_model(
                unpickle_object(f.read()))

    def __init__(self, model):
        # Deserialize the model once, so updates operate on a live instance.
        self.model = deserialize_keras_model(model)
        self.num_updates = 1
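
Every snippet above depends on serialize_keras_model / deserialize_keras_model,
whose implementation is not shown here. A plausible minimal version, assuming
the usual Keras JSON round trip (the library's actual internals may differ):

    import pickle
    from keras.models import model_from_json

    def serialize_keras_model(model):
        # Capture the architecture as JSON and the weights as raw arrays.
        return {'model': model.to_json(), 'weights': model.get_weights()}

    def deserialize_keras_model(serialized):
        # Rebuild the architecture, then restore the weights.
        model = model_from_json(serialized['model'])
        model.set_weights(serialized['weights'])
        return model

    def unpickle_object(blob):
        # Counterpart of the unpickle_object helper used above.
        return pickle.loads(blob)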