def visualize_states(hidden_states, updates, train_stream, valid_stream, args):

    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    filter_cells = VariableFilter(theano_name_regex="hidden_cells_.*")
    all_cells = filter_cells(hidden_states)
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not (has_indices(args.dataset)))

    # Compile the function
    logger.info("The compilation of the function has started")
    if args.rnn_type == "lstm" and args.visualize_cells:
        compiled = theano.function(inputs=ComputationGraph(all_cells).inputs,
                                   outputs=all_cells,
                                   givens=givens,
                                   updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    else:
        compiled = theano.function(inputs=ComputationGraph(all_states).inputs,
                                   outputs=all_states,
                                   givens=givens,
                                   updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))

    # Plot the function
    plot("hidden_state", train_stream, compiled, args)
    def __init__(self, cost, generation_length, dataset,
                 initial_text_length, softmax_sampling,
                 updates, ploting_path=None,
                 interactive_mode=False, **kwargs):
        self.generation_length = generation_length
        self.init_length = initial_text_length
        self.dataset = dataset
        self.output_size = get_output_size(dataset)
        self.ploting_path = ploting_path
        self.softmax_sampling = softmax_sampling
        self.interactive_mode = interactive_mode
        self.has_indices = has_indices(dataset)
        super(TextGenerationExtension, self).__init__(**kwargs)

        # Get presoft and its computation graph
        filter_presoft = VariableFilter(theano_name="presoft")
        presoft = filter_presoft(ComputationGraph(cost).variables)
        cg = ComputationGraph(presoft)

        # Handle the theano shared variables that allow carrying the hidden
        # state
        givens, f_updates = carry_hidden_state(updates, 1,
                                               reset=not(self.has_indices))

        # Compile the theano function
        self.generate = theano.function(inputs=cg.inputs, outputs=presoft,
                                        givens=givens, updates=f_updates)
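
For context, this is how the extension might be attached to a training loop. The sketch assumes TextGenerationExtension subclasses blocks.extensions.SimpleExtension, whose scheduling keywords (such as every_n_epochs) are the **kwargs absorbed above; algorithm, model and train_stream are assumed to exist in the surrounding script.

from blocks.main_loop import MainLoop

text_gen = TextGenerationExtension(
    cost=cost,
    generation_length=200,
    dataset=args.dataset,
    initial_text_length=50,
    softmax_sampling='argmax',
    updates=updates,
    every_n_epochs=1)  # standard SimpleExtension scheduling kwarg

main_loop = MainLoop(algorithm=algorithm, data_stream=train_stream,
                     model=model, extensions=[text_gen])
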
    def _compile(self, state_updates):
        """Compiles Theano functions.
        .. todo::
            The current compilation method does not account for updates
            attached to `ComputationGraph` elements. Compiling should
            be out-sourced to `ComputationGraph` to deal with it.
        """
        inputs = []
        outputs = []
        updates = OrderedDict()

        givens, f_updates = carry_hidden_state(
            state_updates,
            self.mini_batch_size,
            reset=not (has_indices(self.dataset)))

        if self.theano_buffer.accumulation_updates:
            updates.update(self.theano_buffer.accumulation_updates)
            if self.updates:
                updates.update(self.updates)
            inputs += self.theano_buffer.inputs
        inputs += self.monitored_quantities_buffer.inputs
        outputs = self.monitored_quantities_buffer.requires

        if inputs != []:
            self.unique_inputs = list(set(inputs))
            updates.update(f_updates)
            self._accumulate_fun = theano.function(self.unique_inputs,
                                                   outputs,
                                                   givens=givens,
                                                   updates=updates)
        else:
            self._accumulate_fun = None
def get_costs(presoft, args):

    if has_indices(args.dataset):
        # Targets: (Time X Batch)
        y = tensor.lmatrix('targets')
        y_mask = tensor.ones_like(y, dtype=floatX)
        y_mask = tensor.set_subtensor(
            y_mask[:args.context, :],
            tensor.zeros_like(y_mask[:args.context, :], dtype=floatX))

        time, batch, feat = presoft.shape
        cross_entropy = Softmax().categorical_cross_entropy(
            (y.flatten() * y_mask.reshape((batch * time, ))), (presoft.reshape(
                (batch * time, feat)) * y_mask.reshape((batch * time, 1))))

        # renormalization
        renormalized_cross_entropy = cross_entropy * (
            tensor.sum(tensor.ones_like(y_mask)) / tensor.sum(y_mask))

        # BPC: Bits Per Character
        unregularized_cost = renormalized_cross_entropy / tensor.log(2)
        unregularized_cost.name = "cross_entropy"

    else:
        # Targets: (Time X Batch X Features)
        y = tensor.tensor3('targets', dtype=floatX)
        y_mask = tensor.ones_like(y[:, :, 0], dtype=floatX)
        y_mask = tensor.set_subtensor(
            y_mask[:args.context, :],
            tensor.zeros_like(y_mask[:args.context, :], dtype=floatX))

        if args.used_inputs is not None:
            y_mask = tensor.set_subtensor(
                y_mask[:args.used_inputs, :],
                tensor.zeros_like(y_mask[:args.used_inputs, :], dtype=floatX))
        # SquaredError does not work on 3D tensor
        target = (y * y_mask.dimshuffle(0, 1, 'x'))
        values = (presoft[:-1, :, :] * y_mask.dimshuffle(0, 1, 'x'))

        target = target.reshape(
            (target.shape[0] * target.shape[1], target.shape[2]))

        values = values.reshape(
            (values.shape[0] * values.shape[1], values.shape[2]))

        unregularized_cost = SquaredError().apply(target, values)
        # renormalization
        unregularized_cost = unregularized_cost * (
            tensor.sum(tensor.ones_like(y_mask)) / tensor.sum(y_mask))
        unregularized_cost.name = "mean_squared_error"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = unregularized_cost + tensor.log(1)
    cost.name = "regularized_cost"
    return cost, unregularized_cost
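
Because the masked positions contribute zeros, multiplying by sum(ones_like(y_mask)) / sum(y_mask) turns the average over all positions back into an average over the unmasked ones, and dividing by log(2) converts nats into bits per character (BPC). A standalone numpy check of that arithmetic:

import numpy as np

y_mask = np.array([0., 0., 1., 1., 1.])        # first 2 steps are context
per_step = np.array([0., 0., 1.1, 1.1, 1.1])   # masked steps cost nothing
mean_all = per_step.mean()                     # diluted by the zeros
scale = y_mask.size / y_mask.sum()             # total / kept == 5 / 3
mean_kept = mean_all * scale                   # back to 1.1 nats per char
bits_per_char = mean_kept / np.log(2)          # nats -> bits (BPC)
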
def visualize_gates_soft(gate_values, hidden_states, updates, train_stream,
                         valid_stream, args):

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not (has_indices(args.dataset)))

    # Compile the function
    compiled = theano.function(inputs=ComputationGraph(gate_values).inputs,
                               outputs=gate_values,
                               givens=givens,
                               updates=f_updates,
                               mode=Mode(optimizer='fast_compile'))

    plot("gates_soft", train_stream, compiled, args)
def get_presoft(h, args):
    output_size = get_output_size(args.dataset)
    # If args.skip_connections: dim = args.layers * args.state_dim
    # else: dim = args.state_dim
    use_all_states = args.skip_connections or args.skip_output or (
        args.rnn_type in ["clockwork", "soft"])
    output_layer = Linear(
        input_dim=use_all_states * args.layers * args.state_dim +
        (1 - use_all_states) * args.state_dim,
        output_dim=output_size, name="output_layer")

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()
    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
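
The input_dim expression uses boolean arithmetic as a branch-free select between the two possible widths, which a quick check makes concrete:

layers, state_dim = 3, 100
for use_all_states in (False, True):
    dim = (use_all_states * layers * state_dim +
           (1 - use_all_states) * state_dim)
    print(use_all_states, dim)  # False -> 100, True -> 300
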
def plot(what, train_stream, compiled, args):
    # Plot the requested quantity for a few example sequences
    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0: args.visualize_length, 0:1]

        values = compiled(init_)

        layers = len(values)
        time = values[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        for d in range(layers):
            # Change the subplot
            plt.subplot(layers, 1, d + 1)

            # Plot only 10 dimensions of the hidden state
            for j in range(10):
                plt.plot(np.arange(time), values[d][:, 0, j])
            # plt.plot(
            #     np.arange(time), np.mean(np.abs(values[d][:, 0, :]), axis=1))

            # Add ticks for the x-axis
            plt.xticks(range(args.visualize_length), ticks)

            # Fancy options
            plt.grid(True)
            plt.title(what + "_of_layer_" + str(d))
        plt.tight_layout()

        # Either plot on the current display or save the plot into a file
        if args.local:
            plt.show()
        else:
            plt.savefig(
                args.save_path + "/visualize_" + what + '_' + str(num) + ".png")
            logger.info("Figure \"visualize_" + what + '_' + str(num) +
                        ".png\" saved at directory: " + args.save_path)
def visualize_gradients(hidden_states, updates, train_stream, valid_stream,
                        args):

    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    filter_cells = VariableFilter(theano_name_regex="hidden_cell_.*")
    all_cells = filter_cells(hidden_states)
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Get the variable on which we compute the gradients
    filter_pre_rnn = VariableFilter(theano_name_regex="pre_rnn.*")
    wrt = filter_pre_rnn(ComputationGraph(hidden_states).variables)
    wrt = sorted(wrt, key=lambda var: var.name[-1])
    len_wrt = len(wrt)

    # We have wrt = [pre_rnn] or [pre_rnn_0, pre_rnn_1, ...]

    # Assertion part
    assert len(all_states) == args.layers
    assert len(all_cells) == (args.layers * (args.rnn_type == "lstm"))
    if args.skip_connections:
        assert len_wrt == args.layers
    else:
        assert len_wrt == 1

    # Compute the gradients of states or cells
    if args.rnn_type == "lstm" and args.visualize_cells:
        states = all_cells
    else:
        states = all_states

    logger.info("The computation of the gradients has started")
    gradients = []
    for i, state in enumerate(states):
        gradients.extend(
            tensor.grad(tensor.mean(tensor.abs_(state[-1, 0, :])),
                        wrt[:i + 1]))
    # state[-1] means the gradient is taken at the last time-step only
    logger.info("The computation of the gradients is done")

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(
        updates, 1, reset=not (has_indices(args.dataset)))

    # Compile the function
    logger.info("The compilation of the function has started")
    compiled = theano.function(inputs=ComputationGraph(states).inputs,
                               outputs=gradients,
                               givens=givens,
                               updates=f_updates,
                               mode=Mode(optimizer='fast_compile'))
    logger.info("The function has been compiled")

    # Generate
    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0:args.visualize_length, 0:1]

        # [layers * len_wrt] [Time, 1, Hidden_dim]
        gradients = compiled(init_)

        if args.skip_connections:
            assert len(gradients) == (args.layers * (args.layers + 1)) // 2
        else:
            assert len(gradients) == args.layers

        time = gradients[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        # One row subplot for each variable wrt which we are computing
        # the gradients
        for var in range(len_wrt):
            plt.subplot(len_wrt, 1, var + 1)
            for d in range(args.layers - var):
                plt.plot(np.arange(time),
                         np.mean(np.abs(gradients[d][:, 0, :]), axis=1),
                         label="layer " + str(d + var))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.yscale('log')
            axes = plt.gca()
            axes.set_ylim([5e-20, 5e-1])
            plt.title("gradients plotting w.r.t pre_rrn" + str(var))
            plt.legend()
        plt.tight_layout()
        if args.local:
            plt.show()
        else:
            plt.savefig(args.save_path + "/visualize_gradients_" + str(num) +
                        ".png")
            logger.info("Figure \"visualize_gradients_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)
def visualize_gates_lstm(gate_values, hidden_states, updates,
                         train_stream, valid_stream,
                         args):

    in_gates = gate_values["in_gates"]
    out_gates = gate_values["out_gates"]
    forget_gates = gate_values["forget_gates"]

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not(has_indices(args.dataset)))

    generate_in = theano.function(inputs=ComputationGraph(in_gates).inputs,
                                  outputs=in_gates,
                                  givens=givens,
                                  updates=f_updates,
                                  mode=Mode(optimizer='fast_compile'))
    generate_out = theano.function(inputs=ComputationGraph(out_gates).inputs,
                                   outputs=out_gates,
                                   givens=givens,
                                   updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    generate_forget = theano.function(
        inputs=ComputationGraph(forget_gates).inputs,
        outputs=forget_gates,
        givens=givens,
        updates=f_updates,
        mode=Mode(optimizer='fast_compile'))

    # Generate
    epoch_iterator = valid_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0: args.visualize_length, 0:1]

        last_output_in = generate_in(init_)
        last_output_out = generate_out(init_)
        last_output_forget = generate_forget(init_)
        layers = len(last_output_in)

        time = last_output_in[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        for i in range(layers):

            plt.subplot(3, layers, 1 + i)
            plt.plot(np.arange(time), np.mean(
                np.abs(last_output_in[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("in_gate of layer " + str(i))

            plt.subplot(3, layers, layers + 1 + i)
            plt.plot(np.arange(time), np.mean(
                np.abs(last_output_out[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("out_gate of layer " + str(i))

            plt.subplot(3, layers, 2 * layers + 1 + i)
            plt.plot(np.arange(time), np.mean(
                np.abs(last_output_forget[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("forget_gate of layer " + str(i))
        if args.local:
            plt.show()
        else:
            plt.savefig(
                args.save_path + "/visualize_gates_" + str(num) + ".png")
            logger.info("Figure \"visualize_gates_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)
def visualize_generate(cost, hidden_states, updates,
                       train_stream, valid_stream,
                       args):

    use_indices = has_indices(args.dataset)
    output_size = get_output_size(args.dataset)

    # Get presoft and its computation graph
    filter_presoft = VariableFilter(theano_name="presoft")
    presoft = filter_presoft(ComputationGraph(cost).variables)[0]
    cg = ComputationGraph(presoft)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(updates, 1, reset=not(use_indices))

    # Compile the theano function
    compiled = theano.function(inputs=cg.inputs, outputs=presoft,
                               givens=givens, updates=f_updates)

    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        all_ = next(epoch_iterator)
        all_sequence = all_[0][:, 0:1]
        targets = all_[1][:, 0:1]

        # In the case of characters and text
        if use_indices:
            init_ = all_sequence[:args.initial_text_length]

            # Time X Features
            probability_array = np.zeros((0, output_size))
            generated_text = init_

            for i in range(args.generated_text_lenght):
                presoft = compiled(generated_text)
                # Get the last value of presoft
                last_presoft = presoft[-1:, 0, :]

                # Compute the probability distribution
                probabilities = softmax(last_presoft)
                # Store it in the list
                probability_array = np.vstack([probability_array,
                                               probabilities])

                # Sample a character out of the probability distribution
                argmax = (args.softmax_sampling == 'argmax')
                last_output_sample = sample(probabilities, argmax)[:, None, :]

                # Concatenate the new value to the text
                generated_text = np.vstack(
                    [generated_text, last_output_sample])

                ploting_path = None
                if args.save_path is not None:
                    ploting_path = os.path.join(
                        args.save_path, 'prob_plot.png')

                # Convert the indices into real characters
                whole_sentence = conv_into_char(
                    generated_text[:, 0], args.dataset)
                initial_sentence = whole_sentence[:init_.shape[0]]
                selected_sentence = whole_sentence[init_.shape[0]:]

                logger.info(''.join(initial_sentence) + '...')
                logger.info(''.join(whole_sentence))

                if ploting_path is not None:
                    probability_plot(probability_array, selected_sentence,
                                     args.dataset, ploting_path)

        # In the case of a real-valued dataset (e.g. sine waves)
        else:
            presoft = compiled(all_sequence)

            time_plot = presoft.shape[0] - 1

            plt.plot(np.arange(time_plot),
                     targets[:time_plot, 0, 0],
                     label="target")
            plt.plot(np.arange(time_plot), presoft[:time_plot, 0, 0],
                     label="predicted")
            plt.legend()
            plt.grid(True)
            plt.show()
def get_prernn(args):

    # time x batch
    x_mask = tensor.fmatrix('mask')

    # Compute the state dim
    if args.rnn_type == 'lstm':
        state_dim = 4 * args.state_dim
    else:
        state_dim = args.state_dim

    # Prepare the arguments for the fork
    output_names = []
    output_dims = []
    for d in range(args.layers):
        if d > 0:
            suffix = RECURRENTSTACK_SEPARATOR + str(d)
        else:
            suffix = ''
        if d == 0 or args.skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    # Prepare the brick to be forked (LookupTable or Linear)
    # Check if the dataset provides indices (in the case of a
    # fixed vocabulary, x is 2D tensor) or if it gives raw values
    # (x is 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)
        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)

    else:
        x = tensor.tensor3('features', dtype=floatX)
        if args.used_inputs is not None:
            x = tensor.set_subtensor(x[args.used_inputs:, :, :],
                                     tensor.zeros_like(x[args.used_inputs:,
                                                         :, :],
                                                       dtype=floatX))
        features = get_output_size(args.dataset)
        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)

        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define the fork
    fork = Fork(output_names=output_names, input_dim=features,
                output_dims=output_dims,
                prototype=forked)
    fork.initialize()

    # Apply the fork
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if args.skip_connections:
        for t in range(len(prernn)):
            prernn[t].name = "pre_rnn_" + str(t)
    else:
        prernn.name = "pre_rnn"

    return prernn, x_mask
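
The suffix loop above produces one fork output per layer, named to match the sequence inputs of a Blocks RecurrentStack (RECURRENTSTACK_SEPARATOR is '#' in Blocks), e.g. for three layers with skip connections:

from blocks.bricks.recurrent import RECURRENTSTACK_SEPARATOR

names = ["inputs" + (RECURRENTSTACK_SEPARATOR + str(d) if d > 0 else "")
         for d in range(3)]
# names == ['inputs', 'inputs#1', 'inputs#2']
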
def visualize_presoft(cost, hidden_states, updates,
                      train_stream, valid_stream,
                      args):

    filter_presoft = VariableFilter(theano_name="presoft")
    presoft = filter_presoft(ComputationGraph(cost).variables)[0]

    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Assertion part
    assert len(all_states) == args.layers

    logger.info("The computation of the gradients has started")
    gradients = []

    for i in range(args.visualize_length - args.context):
        gradients.extend(
            tensor.grad(tensor.mean(tensor.abs_(presoft[i, 0, :])),
                        all_states))
    logger.info("The computation of the gradients is done")

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not(has_indices(args.dataset)))

    # Compile the function
    logger.info("The compilation of the function has started")
    compiled = theano.function(inputs=ComputationGraph(presoft).inputs,
                               outputs=gradients,
                               givens=givens, updates=f_updates,
                               mode=Mode(optimizer='fast_compile'))
    logger.info("The function has been compiled")

    # Generate
    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][
            0: args.visualize_length, 0:1]

        hidden_state = compiled(init_)

        value_of_layer = {}
        for d in range(args.layers):
            value_of_layer[d] = 0

        for i in range(len(hidden_state) // args.layers):
            for d in range(args.layers):
                value_of_layer[d] += hidden_state[d + i * args.layers]

        time = hidden_state[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        for d in range(args.layers):
            plt.plot(
                np.arange(time),
                np.mean(np.abs(value_of_layer[d][:, 0, :]), axis=1),
                label="Layer " + str(d))
        plt.xticks(range(args.visualize_length), ticks)
        plt.grid(True)
        plt.title("hidden_state_of_layer_" + str(d))
        plt.legend()
        plt.tight_layout()
        if args.local:
            plt.show()
        else:
            plt.savefig(
                args.save_path + "/visualize_presoft_" + str(num) + ".png")
            logger.info("Figure \"visualize_presoft_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)
def visualize_jacobian(hidden_states, updates,
                       train_stream, valid_stream,
                       args):

    # Get all the hidden_states
    all_states = [
        var for var in hidden_states if re.match("hidden_state_.*", var.name)]
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    all_cells = [var for var in hidden_states if re.match(
        "hidden_cell_.*", var.name)]
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Get the variable on which we compute the gradients
    variables = ComputationGraph(hidden_states).variables
    wrt = [
        var for var in variables if
        (var.name is not None) and (re.match("pre_rnn.*", var.name))]
    wrt = sorted(wrt, key=lambda var: var.name[-1])
    len_wrt = len(wrt)
    # We have wrt = [pre_rnn] or [pre_rnn_0, pre_rnn_1, ...]

    # Assertion part
    assert len(all_states) == args.layers
    assert len(all_cells) == (args.layers * (args.rnn_type == "lstm"))
    if args.skip_connections:
        assert len_wrt == args.layers
    else:
        assert len_wrt == 1

    # Compute the gradients of states or cells
    if args.rnn_type == "lstm" and args.visualize_cells:
        states = all_cells
    else:
        states = all_states

    logger.info("The computation of the gradients has started")
    gradients = []
    for i, state in enumerate(states):
        gradients.append(
            tensor.grad(tensor.mean(tensor.abs_(
                state[-1])), state))
    # state[-1] means the gradient is taken at the last time-step only
    logger.info("The computation of the gradients is done")

    # Handle the theano shared variables for the state (an inline
    # version of carry_hidden_state)
    state_vars = [theano.shared(
        v[0:1, :].zeros_like().eval(), v.name + '-gen')
        for v, _ in updates]
    givens = [(v, x) for (v, _), x in zip(updates, state_vars)]
    f_updates = [(x, upd) for x, (_, upd) in zip(state_vars, updates)]

    # Compile the function
    logger.info("The compilation of the function has started")
    compiled = theano.function(inputs=ComputationGraph(states).inputs,
                               outputs=gradients,
                               givens=givens, updates=f_updates,
                               mode=Mode(optimizer='fast_compile'))
    logger.info("The function has been compiled")

    # Generate
    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][
            0: args.visualize_length, 0:1]

        # [layers * len_wrt] [Time, 1, Hidden_dim]
        gradients = compiled(init_)

        time = gradients[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        # One row subplot for each variable wrt which we are computing
        # the gradients
        for var in range(len_wrt):
            plt.subplot(len_wrt, 1, var + 1)
            for d in range(args.layers - var):
                plt.plot(
                    np.arange(time),
                    np.mean(np.abs(gradients[d][:, 0, :]), axis=1),
                    label="layer " + str(d + var))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.yscale('log')
            axes = plt.gca()
            axes.set_ylim([5e-20, 5e-1])
            plt.title("gradients plotting w.r.t pre_rrn" + str(var))
            plt.legend()
        plt.tight_layout()
        if args.local:
            plt.show()
        else:
            plt.savefig(
                args.save_path + "/visualize_jacobian_" + str(num) + ".png")
            logger.info("Figure \"visualize_jacobian_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)