import os

import numpy as np

# BiasedLayer, Layer, LSTMLayerCache, LSTMLayerPendingUpdates and Logger are
# assumed to be provided elsewhere in the project.


class LSTMLayer(object):
    """
    represents a complete LSTM layer (resp. LSTM block) while offering an interface similar to that of the NeuralLayer class
    """
    def __init__(self, in_size, memory_size):
        """
        initialize the LSTM layer
        @in_size size of the expected input vectors
        @memory_size size of the memory vector (state)
        """
        concat_size = in_size + memory_size
        # create forget layer using sigmoid activation function
        self.forget_gate_layer = BiasedLayer(
                in_size=concat_size,  # size of input/last output
                out_size=memory_size,  # size of state
                activation_fn=Layer.activation_sigmoid,
                activation_fn_deriv=Layer.activation_sigmoid_deriv)
        # initialize forget gate layer biases to 1
        self.forget_gate_layer.biases = np.ones(memory_size)
        # create input gate layer using sigmoid activation function
        self.input_gate_layer = BiasedLayer(
                in_size=concat_size,  # size of input/last output
                out_size=memory_size,  # size of state
                activation_fn=Layer.activation_sigmoid,
                activation_fn_deriv=Layer.activation_sigmoid_deriv)
        # create update values layer using tanh activation function
        self.update_values_layer = BiasedLayer(
                in_size=concat_size,  # size of input/last output
                out_size=memory_size,  # size of state
                activation_fn=Layer.activation_tanh,
                activation_fn_deriv=Layer.activation_tanh_deriv)
        # create output gate layer using sigmoid activation function
        self.output_gate_layer = BiasedLayer(
                in_size=concat_size,  # size of input/last output
                out_size=memory_size,  # size of state
                activation_fn=Layer.activation_sigmoid,
                activation_fn_deriv=Layer.activation_sigmoid_deriv)
        # create container object for pending updates
        self.pending_updates = LSTMLayerPendingUpdates(in_size, memory_size)
        self.size = memory_size
        self.in_size = in_size

        # initialize cache chain (self.first_cache and self.last_cache are dummy caches)
        self.first_cache = LSTMLayerCache()
        self.first_cache.is_first_cache = True
        self.first_cache.state = np.zeros((self.size, 1))
        self.first_cache.output_values = np.zeros((self.size, 1))

        self.last_cache = LSTMLayerCache()
        self.first_cache.insert_after(self.last_cache)
        self.last_cache.is_last_cache = True

        self.caches = []

    @classmethod
    def get_convolutional_layer(cls, reference_layer):
        """returns a copy of the lstm layer which shares the same weights and biases"""
        conv_layer = LSTMLayer(
            in_size=reference_layer.in_size,
            memory_size=reference_layer.size
        )
        # let the new layer reuse the reference layer's gate and update-values layers
        conv_layer.input_gate_layer = reference_layer.input_gate_layer
        conv_layer.forget_gate_layer = reference_layer.forget_gate_layer
        conv_layer.output_gate_layer = reference_layer.output_gate_layer
        conv_layer.update_values_layer = reference_layer.update_values_layer

        return conv_layer

    def feed(self, input_data, time_steps=1):
        """
        calculate the output vector for a given input vector
        @input_data input vector for the current time step
        @time_steps number of feedforward results to be cached for use in backpropagation through time
        """
        assert time_steps > 0, "time_steps must be at least 1 (for the recurrent input)!"
        Logger.debug("feed(" + str(input_data) + ")")

        # concatenate input_vector with recurrent input (last output) vector
        concat_in = np.concatenate([input_data, self.last_cache.predecessor.output_values])
        # remove the oldest cache if the maximum number of time steps is already cached
        if time_steps <= len(self.caches):
            self.first_cache.successor.remove()
            self.caches.pop(0)

        # create new cache at end of cache list
        self.last_cache.insert_before(LSTMLayerCache())
        cache = self.last_cache.predecessor
        self.caches.append(cache)

        # cache input and concatenated input values
        cache.input_values = input_data
        cache.concatenated_input = concat_in

        # calculate and cache gate/update_values results
        cache.forget_gate_results = self.forget_gate_layer.feed(concat_in)
        cache.input_gate_results = self.input_gate_layer.feed(concat_in)
        cache.update_values_layer_results = self.update_values_layer.feed(concat_in)
        cache.output_gate_results = self.output_gate_layer.feed(concat_in)

        # calculate state update values
        update_values = np.multiply(
                cache.input_gate_results,
                cache.update_values_layer_results)

        # apply forget gate to the previous state and add the state update values
        cache.state = cache.predecessor.state * cache.forget_gate_results \
                      + update_values
        # calculate output from new state and output gate
        cache.output_values = Layer.activation_tanh(cache.state) * cache.output_gate_results

        # return calculated output vector
        return cache.output_values

    def learn_recursive(self, cache, deltas):
        """
        learn timesteps recursively
        @cache cache corresponding to the current time step
        @deltas deltas propagated back from the next layer, one per remaining time step
        """
        # terminate if there are no deltas or caches left
        if len(deltas) == 0 or cache.is_first_cache:
            return
        # get delta for current time step
        delta = deltas[-1]

        # accumulate the loss derivative with respect to the output across time steps (Constant Error Carousel)
        loss_output = delta + cache.successor.loss_output

        # retrieve the loss derivative with respect to the state from the following time step (t+1)
        last_loss_state = cache.successor.loss_state

        ### calculate deltas
        # the forward pass gates tanh(state) to produce the output, so the
        # derivative of that tanh enters the state and output gate deltas
        tanh_state = Layer.activation_tanh(cache.state)
        delta_state = cache.output_gate_results * loss_output \
                      * Layer.activation_tanh_deriv(tanh_state) + last_loss_state

        delta_output_gate = self.output_gate_layer.activation_deriv(
                cache.output_gate_results) * tanh_state * loss_output

        delta_input_gate = self.input_gate_layer.activation_deriv(
                cache.input_gate_results) * cache.update_values_layer_results * delta_state

        delta_update_values_layer = self.update_values_layer.activation_deriv(
                cache.update_values_layer_results) * cache.input_gate_results * delta_state

        delta_forget_gate = self.forget_gate_layer.activation_deriv(
                cache.forget_gate_results) * cache.predecessor.state * delta_state
        ###

        # retrieve concatenated input from cache
        concat_in = cache.concatenated_input

        # add weight adjustments to pending updates object
        self.pending_updates.input_gate_weights += \
            np.outer(delta_input_gate, concat_in)
        self.pending_updates.forget_gate_weights += \
            np.outer(delta_forget_gate, concat_in)
        self.pending_updates.output_gate_weights += \
            np.outer(delta_output_gate, concat_in)
        self.pending_updates.update_values_layer_weights += \
            np.outer(delta_update_values_layer, concat_in)

        # add bias adjustments to pending updates object
        self.pending_updates.input_gate_biases += np.ravel(delta_input_gate)
        self.pending_updates.forget_gate_biases += np.ravel(delta_forget_gate)
        self.pending_updates.output_gate_biases += np.ravel(delta_output_gate)
        self.pending_updates.update_values_layer_biases += np.ravel(delta_update_values_layer)

        # calculate loss derivative with respect to the concatenated input
        delta_concatenated_input = np.zeros_like(concat_in) + \
                                   np.dot(self.input_gate_layer.weights.T, delta_input_gate) + \
                                   np.dot(self.forget_gate_layer.weights.T, delta_forget_gate) + \
                                   np.dot(self.output_gate_layer.weights.T, delta_output_gate) + \
                                   np.dot(self.update_values_layer.weights.T, delta_update_values_layer)

        # save losses for the Constant Error Carousel and for the previous layer / time step
        cache.loss_state = delta_state * cache.forget_gate_results
        cache.loss_input = delta_concatenated_input[:self.in_size]
        cache.loss_output = delta_concatenated_input[self.in_size:]

        # recurse into the previous time step (t-1)
        return self.learn_recursive(cache.predecessor, deltas[:-1])

    def learn(self, deltas, learning_rate=0.001):
        """
        apply the learning algorithm using the deltas from the next layer
        @deltas deltas propagated back from the next layer
        @learning_rate step size used when applying the pending updates
        """
        Logger.debug("learn(" + str(deltas) + ")")
        # learn recursively over all caches (one per time step), starting with the last cache
        self.learn_recursive(self.last_cache.predecessor, deltas)
        # apply pending weight and bias updates
        self.apply_training(learning_rate)
        # collect the input losses as the deltas this layer passes on
        input_deltas = [cache.loss_input for cache in self.caches]
        return input_deltas

    def apply_training(self, learning_rate):
        """applies the calculated weight and bias updates and resets pending_updates object"""
        p_updates = self.pending_updates
        lr = learning_rate
        # subtract the updates, scaled by the learning rate, from the weight matrices and bias vectors
        self.forget_gate_layer.weights -= lr * p_updates.forget_gate_weights
        self.input_gate_layer.weights -= lr * p_updates.input_gate_weights
        self.update_values_layer.weights -= lr * p_updates.update_values_layer_weights
        self.output_gate_layer.weights -= lr * p_updates.output_gate_weights
        self.forget_gate_layer.biases -= lr * p_updates.forget_gate_biases
        self.input_gate_layer.biases -= lr * p_updates.input_gate_biases
        self.update_values_layer.biases -= lr * p_updates.update_values_layer_biases
        self.output_gate_layer.biases -= lr * p_updates.output_gate_biases
        # reset pending updates
        p_updates.reset()

        # clip weights and biases to [-5, 5] to keep the parameters bounded
        # (guards against exploding gradients)
        for params in [
                self.forget_gate_layer.weights,
                self.input_gate_layer.weights,
                self.update_values_layer.weights,
                self.output_gate_layer.weights,
                self.forget_gate_layer.biases,
                self.input_gate_layer.biases,
                self.update_values_layer.biases,
                self.output_gate_layer.biases]:
            np.clip(params, -5, 5, out=params)

    def save(self, directory):
        """save weights and biases to directory"""
        self.forget_gate_layer.save(os.path.join(directory, "forget_gate.npz"))
        self.input_gate_layer.save(os.path.join(directory, "input_gate.npz"))
        self.output_gate_layer.save(os.path.join(directory, "output_gate.npz"))
        self.update_values_layer.save(os.path.join(directory, "update_values_layer.npz"))

    def load(self, directory):
        """load weights and biases from directory"""
        self.forget_gate_layer.load(os.path.join(directory, "forget_gate.npz"))
        self.input_gate_layer.load(os.path.join(directory, "input_gate.npz"))
        self.output_gate_layer.load(os.path.join(directory, "output_gate.npz"))
        self.update_values_layer.load(os.path.join(directory, "update_values_layer.npz"))

    def visualize(self, path, layer_id):
        """generate visualization of weights and biases"""
        self.input_gate_layer.visualize(os.path.join(path, "LSTM" + str(layer_id), "obs_InputG_1_0.pgm"))
        self.forget_gate_layer.visualize(os.path.join(path, "LSTM" + str(layer_id), "obs_ForgetG_2_0.pgm"))
        self.output_gate_layer.visualize(os.path.join(path, "LSTM" + str(layer_id), "obs_OutputG_3_0.pgm"))
        self.update_values_layer.visualize(os.path.join(path, "LSTM" + str(layer_id), "obs_UpdateL_4_0.pgm"))

    def clear_cache(self):
        """clear all caches (i.e. state , error carousel, layer results)"""
        self.caches = []
        self.first_cache.successor = self.last_cache
        self.last_cache.predecessor = self.first_cache
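

# Minimal usage sketch (not part of the original listing). It assumes the
# helper classes referenced above (BiasedLayer, Layer, LSTMLayerCache,
# LSTMLayerPendingUpdates, Logger) are available, and that inputs and deltas
# are column vectors, following the (size, 1) convention of first_cache above.
if __name__ == "__main__":
    layer = LSTMLayer(in_size=4, memory_size=8)

    # feed a short sequence, caching up to 3 time steps for BPTT
    sequence = [np.random.rand(4, 1) for _ in range(3)]
    outputs = [layer.feed(x, time_steps=3) for x in sequence]

    # backpropagate one delta vector per cached time step and collect the
    # deltas this layer would pass on to the layer below it
    input_deltas = layer.learn([np.random.rand(8, 1) for _ in range(3)],
                               learning_rate=0.001)

    # drop the cached time steps before feeding an unrelated sequence
    layer.clear_cache()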