def visualize_states(hidden_states, updates, train_stream, valid_stream,
                     args):
    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    filter_cells = VariableFilter(theano_name_regex="hidden_cells_.*")
    all_cells = filter_cells(hidden_states)
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not has_indices(args.dataset))

    # Compile the function
    logger.info("The compilation of the function has started")
    if args.rnn_type == "lstm" and args.visualize_cells:
        compiled = theano.function(inputs=ComputationGraph(all_cells).inputs,
                                   outputs=all_cells,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    else:
        compiled = theano.function(inputs=ComputationGraph(all_states).inputs,
                                   outputs=all_states,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))

    # Plot the function
    plot("hidden_state", train_stream, compiled, args)
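# carry_hidden_state itself is not defined in this section. Below is a
# minimal sketch of what it presumably returns, extrapolated from the
# inline version in visualize_jacobian further down: one shared variable
# per recurrent state, fed in through `givens` and written back through
# the returned updates. The `reset` behaviour is an assumption (zeroing
# the carried state so that each call starts fresh).
import theano


def carry_hidden_state(updates, batch_size, reset=False):
    state_vars = [theano.shared(v[0:batch_size, :].zeros_like().eval(),
                                v.name + '-gen')
                  for v, _ in updates]
    givens = [(v, x) for (v, _), x in zip(updates, state_vars)]
    if reset:
        # Assumed: write zeros back instead of the new state
        f_updates = [(x, x.zeros_like()) for x in state_vars]
    else:
        # Carry the state: write the updated value into the shared
        # variable for the next call
        f_updates = [(x, upd) for x, (_, upd) in zip(state_vars, updates)]
    return givens, f_updates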
def __init__(self, cost, generation_length, dataset,
             initial_text_length, softmax_sampling, updates,
             ploting_path=None, interactive_mode=False, **kwargs):
    self.generation_length = generation_length
    self.init_length = initial_text_length
    self.dataset = dataset
    self.output_size = get_output_size(dataset)
    self.ploting_path = ploting_path
    self.softmax_sampling = softmax_sampling
    self.interactive_mode = interactive_mode
    self.has_indices = has_indices(dataset)
    super(TextGenerationExtension, self).__init__(**kwargs)

    # Get presoft and its computation graph
    filter_presoft = VariableFilter(theano_name="presoft")
    presoft = filter_presoft(ComputationGraph(cost).variables)
    cg = ComputationGraph(presoft)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           reset=not self.has_indices)

    # Compile the theano function
    self.generate = theano.function(inputs=cg.inputs, outputs=presoft,
                                    givens=givens, updates=f_updates)
def _compile(self, state_updates):
    """Compiles Theano functions.

    .. todo::

        The current compilation method does not account for updates
        attached to `ComputationGraph` elements. Compiling should
        be out-sourced to `ComputationGraph` to deal with it.

    """
    inputs = []
    outputs = []
    # Use an OrderedDict from the start so that f_updates can always be
    # merged in below, even when there are no accumulation updates
    updates = OrderedDict()
    givens, f_updates = carry_hidden_state(
        state_updates, self.mini_batch_size,
        reset=not has_indices(self.dataset))

    if self.theano_buffer.accumulation_updates:
        updates.update(self.theano_buffer.accumulation_updates)
        if self.updates:
            updates.update(self.updates)

    inputs += self.theano_buffer.inputs
    inputs += self.monitored_quantities_buffer.inputs
    outputs = self.monitored_quantities_buffer.requires

    if inputs != []:
        self.unique_inputs = list(set(inputs))
        updates.update(f_updates)
        self._accumulate_fun = theano.function(self.unique_inputs,
                                               outputs,
                                               givens=givens,
                                               updates=updates)
    else:
        self._accumulate_fun = None
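# Note on the merge order above (illustrative snippet, not from the
# original code): dict.update lets later updates win on key collisions,
# so the state-carrying updates in f_updates take precedence over any
# accumulation update targeting the same shared variable.
from collections import OrderedDict

acc = OrderedDict([('state', 'accumulate'), ('count', 'increment')])
acc.update(OrderedDict([('state', 'carry')]))
print(acc)  # OrderedDict([('state', 'carry'), ('count', 'increment')])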
def get_costs(presoft, args):

    if has_indices(args.dataset):

        # Targets: (Time X Batch)
        y = tensor.lmatrix('targets')
        y_mask = tensor.ones_like(y, dtype=floatX)
        y_mask = tensor.set_subtensor(
            y_mask[:args.context, :],
            tensor.zeros_like(y_mask[:args.context, :], dtype=floatX))

        time, batch, feat = presoft.shape
        cross_entropy = Softmax().categorical_cross_entropy(
            (y.flatten() * y_mask.reshape((batch * time, ))),
            (presoft.reshape((batch * time, feat)) *
             y_mask.reshape((batch * time, 1))))

        # renormalization
        renormalized_cross_entropy = cross_entropy * (
            tensor.sum(tensor.ones_like(y_mask)) /
            tensor.sum(y_mask))

        # BPC: Bits Per Character
        unregularized_cost = renormalized_cross_entropy / tensor.log(2)
        unregularized_cost.name = "cross_entropy"

    else:
        # Targets: (Time X Batch X Features)
        y = tensor.tensor3('targets', dtype=floatX)
        y_mask = tensor.ones_like(y[:, :, 0], dtype=floatX)
        y_mask = tensor.set_subtensor(
            y_mask[:args.context, :],
            tensor.zeros_like(y_mask[:args.context, :], dtype=floatX))

        if args.used_inputs is not None:
            y_mask = tensor.set_subtensor(
                y_mask[:args.used_inputs, :],
                tensor.zeros_like(y_mask[:args.used_inputs, :],
                                  dtype=floatX))

        # SquaredError does not work on 3D tensors
        target = (y * y_mask.dimshuffle(0, 1, 'x'))
        values = (presoft[:-1, :, :] * y_mask.dimshuffle(0, 1, 'x'))
        target = target.reshape((target.shape[0] * target.shape[1],
                                 target.shape[2]))
        values = values.reshape((values.shape[0] * values.shape[1],
                                 values.shape[2]))
        unregularized_cost = SquaredError().apply(target, values)

        # renormalization
        unregularized_cost = unregularized_cost * (
            tensor.sum(tensor.ones_like(y_mask)) /
            tensor.sum(y_mask))
        unregularized_cost.name = "mean_squared_error"

    # TODO: add regularisation for the cost
    # the log(1) term (== 0) is here in order to differentiate the two
    # variables for monitoring
    cost = unregularized_cost + tensor.log(1)
    cost.name = "regularized_cost"

    return cost, unregularized_cost
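# A quick numeric sanity check of the renormalization above (standalone
# numpy sketch; the 1.2 nats value is purely hypothetical): masking the
# first args.context steps would otherwise dilute the mean cost, so it
# is rescaled by total_steps / unmasked_steps, and dividing by log(2)
# converts the cross-entropy from nats to bits per character.
import numpy as np

mask = np.ones(10)
mask[:2] = 0                    # two context steps are masked out
cross_entropy = 1.2             # hypothetical mean cost in nats
renormalized = cross_entropy * mask.size / mask.sum()  # scaled by 10 / 8
bpc = renormalized / np.log(2)  # bits per character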
def visualize_gates_soft(gate_values, hidden_states, updates,
                         train_stream, valid_stream, args):
    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not has_indices(args.dataset))

    # Compile the function
    compiled = theano.function(inputs=ComputationGraph(gate_values).inputs,
                               outputs=gate_values,
                               givens=givens, updates=f_updates,
                               mode=Mode(optimizer='fast_compile'))

    plot("gates_soft", train_stream, compiled, args)
def get_presoft(h, args):
    output_size = get_output_size(args.dataset)

    # If args.skip_connections: dim = args.layers * args.state_dim
    # else: dim = args.state_dim
    use_all_states = args.skip_connections or args.skip_output or (
        args.rnn_type in ["clockwork", "soft"])
    output_layer = Linear(
        input_dim=use_all_states * args.layers * args.state_dim +
        (1 - use_all_states) * args.state_dim,
        output_dim=output_size, name="output_layer")

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'

    return presoft
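# Worked example of the input_dim arithmetic above: the boolean
# use_all_states acts as 0 or 1, so the Linear brick receives either the
# concatenation of all layer states or only the last layer's state
# (layer/state sizes below are illustrative, not from the original code).
layers, state_dim = 3, 100
for use_all_states in (True, False):
    input_dim = (use_all_states * layers * state_dim +
                 (1 - use_all_states) * state_dim)
    print(use_all_states, input_dim)  # True -> 300, False -> 100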
def plot(what, train_stream, compiled, args):
    # states
    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0:args.visualize_length, 0:1]
        values = compiled(init_)
        layers = len(values)
        time = values[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        for d in range(layers):
            # Change the subplot
            plt.subplot(layers, 1, d + 1)

            # Plot only the first 10 units of the hidden state
            for j in range(10):
                plt.plot(np.arange(time), values[d][:, 0, j])
            # plt.plot(
            #     np.arange(time),
            #     np.mean(np.abs(values[d][:, 0, :]), axis=1))

            # Add ticks for the x axis
            plt.xticks(range(args.visualize_length), ticks)

            # Fancy options
            plt.grid(True)
            plt.title(what + "_of_layer_" + str(d))
        plt.tight_layout()

        # Either plot on the current display or save the plot into a file
        if args.local:
            plt.show()
        else:
            plt.savefig(args.save_path + "/visualize_" + what + '_' +
                        str(num) + ".png")
            logger.info("Figure \"visualize_" + what + '_' + str(num) +
                        ".png\" saved at directory: " + args.save_path)
def visualize_gradients(hidden_states, updates, train_stream, valid_stream,
                        args):
    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    filter_cells = VariableFilter(theano_name_regex="hidden_cell_.*")
    all_cells = filter_cells(hidden_states)
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Get the variables with respect to which we compute the gradients
    filter_pre_rnn = VariableFilter(theano_name_regex="pre_rnn.*")
    wrt = filter_pre_rnn(ComputationGraph(hidden_states).variables)
    wrt = sorted(wrt, key=lambda var: var.name[-1])
    len_wrt = len(wrt)
    # We have wrt = [pre_rnn] or [pre_rnn_0, pre_rnn_1, ...]

    # Assertion part
    assert len(all_states) == args.layers
    assert len(all_cells) == (args.layers * (args.rnn_type == "lstm"))
    if args.skip_connections:
        assert len_wrt == args.layers
    else:
        assert len_wrt == 1

    # Compute the gradients of states or cells
    if args.rnn_type == "lstm" and args.visualize_cells:
        states = all_cells
    else:
        states = all_states

    logger.info("The computation of the gradients has started")
    gradients = []
    for i, state in enumerate(states):
        # state[-1, 0, :]: the gradient is taken at the last time step
        gradients.extend(
            tensor.grad(tensor.mean(tensor.abs_(state[-1, 0, :])),
                        wrt[:i + 1]))
    logger.info("The computation of the gradients is done")

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(
        updates, 1, reset=not has_indices(args.dataset))

    # Compile the function
    logger.info("The compilation of the function has started")
    compiled = theano.function(inputs=ComputationGraph(states).inputs,
                               outputs=gradients,
                               givens=givens, updates=f_updates,
                               mode=Mode(optimizer='fast_compile'))
    logger.info("The function has been compiled")

    # Generate
    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0:args.visualize_length, 0:1]

        # [layers * len_wrt] arrays of shape [Time, 1, Hidden_dim]
        gradients = compiled(init_)
        if args.skip_connections:
            assert len(gradients) == (args.layers * (args.layers + 1)) // 2
        else:
            assert len(gradients) == args.layers

        time = gradients[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        # One row of subplots for each variable with respect to which
        # the gradients are computed
        for var in range(len_wrt):
            plt.subplot(len_wrt, 1, var + 1)
            for d in range(args.layers - var):
                plt.plot(np.arange(time),
                         np.mean(np.abs(gradients[d][:, 0, :]), axis=1),
                         label="layer " + str(d + var))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.yscale('log')
            axes = plt.gca()
            axes.set_ylim([5e-20, 5e-1])
            plt.title("gradients w.r.t. pre_rnn_" + str(var))
            plt.legend()
        plt.tight_layout()
        if args.local:
            plt.show()
        else:
            plt.savefig(args.save_path + "/visualize_gradients_" +
                        str(num) + ".png")
            logger.info("Figure \"visualize_gradients_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)
def visualize_gates_lstm(gate_values, hidden_states, updates,
                         train_stream, valid_stream, args):
    in_gates = gate_values["in_gates"]
    out_gates = gate_values["out_gates"]
    forget_gates = gate_values["forget_gates"]

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not has_indices(args.dataset))

    generate_in = theano.function(inputs=ComputationGraph(in_gates).inputs,
                                  outputs=in_gates,
                                  givens=givens, updates=f_updates,
                                  mode=Mode(optimizer='fast_compile'))
    generate_out = theano.function(inputs=ComputationGraph(out_gates).inputs,
                                   outputs=out_gates,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    generate_forget = theano.function(
        inputs=ComputationGraph(forget_gates).inputs,
        outputs=forget_gates,
        givens=givens, updates=f_updates,
        mode=Mode(optimizer='fast_compile'))

    # Generate
    epoch_iterator = valid_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0:args.visualize_length, 0:1]
        last_output_in = generate_in(init_)
        last_output_out = generate_out(init_)
        last_output_forget = generate_forget(init_)
        layers = len(last_output_in)
        time = last_output_in[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        for i in range(layers):
            plt.subplot(3, layers, 1 + i)
            plt.plot(np.arange(time),
                     np.mean(np.abs(last_output_in[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("in_gate of layer " + str(i))

            plt.subplot(3, layers, layers + 1 + i)
            plt.plot(np.arange(time),
                     np.mean(np.abs(last_output_out[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("out_gate of layer " + str(i))

            plt.subplot(3, layers, 2 * layers + 1 + i)
            plt.plot(np.arange(time),
                     np.mean(np.abs(last_output_forget[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("forget_gate of layer " + str(i))

        if args.local:
            plt.show()
        else:
            plt.savefig(args.save_path + "/visualize_gates_" + str(num) +
                        ".png")
            logger.info("Figure \"visualize_gates_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)
def visualize_generate(cost, hidden_states, updates, train_stream,
                       valid_stream, args):
    use_indices = has_indices(args.dataset)
    output_size = get_output_size(args.dataset)

    # Get presoft and its computation graph
    filter_presoft = VariableFilter(theano_name="presoft")
    presoft = filter_presoft(ComputationGraph(cost).variables)[0]
    cg = ComputationGraph(presoft)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           reset=not use_indices)

    # Compile the theano function
    compiled = theano.function(inputs=cg.inputs, outputs=presoft,
                               givens=givens, updates=f_updates)

    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        all_ = next(epoch_iterator)
        all_sequence = all_[0][:, 0:1]
        targets = all_[1][:, 0:1]

        # In the case of characters and text
        if use_indices:
            init_ = all_sequence[:args.initial_text_length]

            # Time X Features
            probability_array = np.zeros((0, output_size))
            generated_text = init_

            for i in range(args.generated_text_lenght):
                presoft = compiled(generated_text)

                # Get the last value of presoft
                last_presoft = presoft[-1:, 0, :]

                # Compute the probability distribution
                probabilities = softmax(last_presoft)
                # Store it in the list
                probability_array = np.vstack([probability_array,
                                               probabilities])

                # Sample a character out of the probability distribution
                argmax = (args.softmax_sampling == 'argmax')
                last_output_sample = sample(probabilities, argmax)[:, None, :]

                # Concatenate the new value to the text
                generated_text = np.vstack([generated_text,
                                            last_output_sample])

            ploting_path = None
            if args.save_path is not None:
                ploting_path = os.path.join(args.save_path, 'prob_plot.png')

            # Convert back into real characters
            whole_sentence = conv_into_char(generated_text[:, 0],
                                            args.dataset)
            initial_sentence = whole_sentence[:init_.shape[0]]
            selected_sentence = whole_sentence[init_.shape[0]:]
            logger.info(''.join(initial_sentence) + '...')
            logger.info(''.join(whole_sentence))

            if ploting_path is not None:
                probability_plot(probability_array, selected_sentence,
                                 args.dataset, ploting_path)

        # In the case of the sine wave dataset, for example
        else:
            presoft = compiled(all_sequence)
            time_plot = presoft.shape[0] - 1
            plt.plot(np.arange(time_plot), targets[:time_plot, 0, 0],
                     label="target")
            plt.plot(np.arange(time_plot), presoft[:time_plot, 0, 0],
                     label="predicted")
            plt.legend()
            plt.grid(True)
            plt.show()
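# The softmax and sample helpers used by visualize_generate are not
# shown in this section. Below is a plausible numpy sketch; the exact
# return shapes expected by the surrounding indexing
# (sample(...)[:, None, :]) may differ in the original helpers.
import numpy as np


def softmax(x):
    # Numerically stabilized softmax over the last axis
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e / np.sum(e, axis=-1, keepdims=True)


def sample(probabilities, argmax=False):
    # Either take the most likely symbol or draw one from the
    # distribution; returned as a 2D array of indices
    p = probabilities.ravel()
    if argmax:
        return np.array([[np.argmax(p)]])
    return np.array([[np.random.choice(len(p), p=p)]])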
def get_prernn(args):

    # time x batch
    x_mask = tensor.fmatrix('mask')

    # Compute the state dim
    if args.rnn_type == 'lstm':
        state_dim = 4 * args.state_dim
    else:
        state_dim = args.state_dim

    # Prepare the arguments for the fork
    output_names = []
    output_dims = []
    for d in range(args.layers):
        if d > 0:
            suffix = RECURRENTSTACK_SEPARATOR + str(d)
        else:
            suffix = ''
        if d == 0 or args.skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    # Prepare the brick to be forked (LookupTable or Linear)
    # Check if the dataset provides indices (in the case of a
    # fixed vocabulary, x is a 2D tensor) or if it gives raw values
    # (x is a 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)
        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)

    else:
        x = tensor.tensor3('features', dtype=floatX)
        if args.used_inputs is not None:
            x = tensor.set_subtensor(
                x[args.used_inputs:, :, :],
                tensor.zeros_like(x[args.used_inputs:, :, :],
                                  dtype=floatX))
        features = get_output_size(args.dataset)
        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)

        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define the fork
    fork = Fork(output_names=output_names, input_dim=features,
                output_dims=output_dims, prototype=forked)
    fork.initialize()

    # Apply the fork
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if args.skip_connections:
        for t in range(len(prernn)):
            prernn[t].name = "pre_rnn_" + str(t)
    else:
        prernn.name = "pre_rnn"

    return prernn, x_mask
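# A quick illustration of the fork output names built above, assuming
# RECURRENTSTACK_SEPARATOR is '#' as in blocks' RecurrentStack (the
# values below are illustrative):
SEP = '#'  # stands in for RECURRENTSTACK_SEPARATOR
layers, skip_connections = 3, True
names = ["inputs" + (SEP + str(d) if d > 0 else '')
         for d in range(layers) if d == 0 or skip_connections]
print(names)  # ['inputs', 'inputs#1', 'inputs#2']
# With skip_connections == False, only ['inputs'] is produced.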
def visualize_presoft(cost, hidden_states, updates, train_stream,
                      valid_stream, args):
    filter_presoft = VariableFilter(theano_name="presoft")
    presoft = filter_presoft(ComputationGraph(cost).variables)[0]

    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Assertion part
    assert len(all_states) == args.layers

    logger.info("The computation of the gradients has started")
    gradients = []
    for i in range(args.visualize_length - args.context):
        gradients.extend(
            tensor.grad(tensor.mean(tensor.abs_(presoft[i, 0, :])),
                        all_states))
    logger.info("The computation of the gradients is done")

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not has_indices(args.dataset))

    # Compile the function
    logger.info("The compilation of the function has started")
    compiled = theano.function(inputs=ComputationGraph(presoft).inputs,
                               outputs=gradients,
                               givens=givens, updates=f_updates,
                               mode=Mode(optimizer='fast_compile'))
    logger.info("The function has been compiled")

    # Generate
    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0:args.visualize_length, 0:1]
        hidden_state = compiled(init_)

        value_of_layer = {}
        for d in range(args.layers):
            value_of_layer[d] = 0

        # Integer division: the outputs come in groups of args.layers,
        # one group per considered time step
        for i in range(len(hidden_state) // args.layers):
            for d in range(args.layers):
                value_of_layer[d] += hidden_state[d + i * args.layers]

        time = hidden_state[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        for d in range(args.layers):
            plt.plot(np.arange(time),
                     np.mean(np.abs(value_of_layer[d][:, 0, :]), axis=1),
                     label="Layer " + str(d))
        plt.xticks(range(args.visualize_length), ticks)
        plt.grid(True)
        plt.title("hidden_state_of_layer_" + str(d))
        plt.legend()
        plt.tight_layout()
        if args.local:
            plt.show()
        else:
            plt.savefig(args.save_path + "/visualize_presoft_" + str(num) +
                        ".png")
            logger.info("Figure \"visualize_presoft_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)
def visualize_jacobian(hidden_states, updates, train_stream, valid_stream,
                       args):
    # Get all the hidden_states
    all_states = [var for var in hidden_states
                  if re.match("hidden_state_.*", var.name)]
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    all_cells = [var for var in hidden_states
                 if re.match("hidden_cell_.*", var.name)]
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Get the variables with respect to which we compute the gradients
    variables = ComputationGraph(hidden_states).variables
    wrt = [var for var in variables
           if (var.name is not None) and (re.match("pre_rnn.*", var.name))]
    wrt = sorted(wrt, key=lambda var: var.name[-1])
    len_wrt = len(wrt)
    # We have wrt = [pre_rnn] or [pre_rnn_0, pre_rnn_1, ...]

    # Assertion part
    assert len(all_states) == args.layers
    assert len(all_cells) == (args.layers * (args.rnn_type == "lstm"))
    if args.skip_connections:
        assert len_wrt == args.layers
    else:
        assert len_wrt == 1

    # Compute the gradients of states or cells
    if args.rnn_type == "lstm" and args.visualize_cells:
        states = all_cells
    else:
        states = all_states

    logger.info("The computation of the gradients has started")
    gradients = []
    for i, state in enumerate(states):
        # state[-1]: the gradient is taken at the last time step
        gradients.append(
            tensor.grad(tensor.mean(tensor.abs_(state[-1])), state))
    logger.info("The computation of the gradients is done")

    # Handle the theano shared variables for the state
    state_vars = [theano.shared(v[0:1, :].zeros_like().eval(),
                                v.name + '-gen')
                  for v, _ in updates]
    givens = [(v, x) for (v, _), x in zip(updates, state_vars)]
    f_updates = [(x, upd) for x, (_, upd) in zip(state_vars, updates)]

    # Compile the function
    logger.info("The compilation of the function has started")
    compiled = theano.function(inputs=ComputationGraph(states).inputs,
                               outputs=gradients,
                               givens=givens, updates=f_updates,
                               mode=Mode(optimizer='fast_compile'))
    logger.info("The function has been compiled")

    # Generate
    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0:args.visualize_length, 0:1]

        # [layers * len_wrt] arrays of shape [Time, 1, Hidden_dim]
        gradients = compiled(init_)

        time = gradients[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        # One row of subplots for each variable with respect to which
        # the gradients are computed
        for var in range(len_wrt):
            plt.subplot(len_wrt, 1, var + 1)
            for d in range(args.layers - var):
                plt.plot(np.arange(time),
                         np.mean(np.abs(gradients[d][:, 0, :]), axis=1),
                         label="layer " + str(d + var))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.yscale('log')
            axes = plt.gca()
            axes.set_ylim([5e-20, 5e-1])
            plt.title("gradients w.r.t. pre_rnn_" + str(var))
            plt.legend()
        plt.tight_layout()
        if args.local:
            plt.show()
        else:
            plt.savefig(args.save_path + "/visualize_jacobian_" +
                        str(num) + ".png")
            logger.info("Figure \"visualize_jacobian_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)