def initialize_rnn(rnn, args):
    """Attach initialization schemes to ``rnn`` and run its initialization.

    If parameters are about to be restored from ``args.load_path``, the
    weights get a cheap zero placeholder (they will be overwritten by the
    load); otherwise an orthogonal scheme is used. Biases are always zero.
    """
    loading_pretrained = args.load_path is not None
    rnn.weights_init = (initialization.Constant(0) if loading_pretrained
                        else initialization.Orthogonal())
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    """Build a character-level recurrent language model graph.

    :param hidden_dim: size of the recurrent hidden state
    :param vocab_dim: vocabulary size (embedding table length and output dim)
    :param mode: "lstm" for an LSTM transition, anything else for a
        SimpleRecurrent (tanh) transition
    :returns: tuple ``(cg, layers, y_hat, cost)`` where ``cg`` is the
        ComputationGraph of the cost, ``layers`` is ``(x, W, H, S, A, y)``,
        ``y_hat`` the softmax output and ``cost`` the mean cross-entropy.
    """
    # Symbolic inputs: integer matrices of input / target characters
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    # Embedding lookup table (vocab_dim entries of size hidden_dim)
    W = LookupTable(
        name="W1",
        #dim = hidden_dim*4,
        dim=hidden_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(hidden_dim, name='H',
                 weights_init=initialization.IsotropicGaussian(0.01),
                 biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight (plain tanh RNN)
        H = SimpleRecurrent(
            name="H", dim=hidden_dim, activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))
    # Output projection back to vocabulary size
    S = Linear(name="W2", input_dim=hidden_dim, output_dim=vocab_dim,
               weights_init=initialization.IsotropicGaussian(0.01),
               biases_init=initialization.Constant(0))
    A = NDimensionalSoftmax(name="softmax")
    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)  #[0]
    activations2 = S.apply(hiddens)
    # extra_ndim=1: softmax over the last axis of a 3D (time-major) tensor
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()
    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    layers = (x, W, H, S, A, y)
    return cg, layers, y_hat, cost
def __init__(self, input_dim, output_dim, hidden_size, init_ranges, **kwargs):
    """Two LinearMaxout hidden layers (with batch-norm between them), a
    final Linear layer and a Logistic output, chained as a Sequence.

    :param input_dim: dimensionality of the input
    :param output_dim: dimensionality of the output (pre-logistic)
    :param hidden_size: width of both hidden maxout layers
    :param init_ranges: three uniform-init widths, one per linear brick
    """
    linear1 = LinearMaxout(input_dim=input_dim, output_dim=hidden_size,
                           num_pieces=2, name='linear1')
    linear2 = LinearMaxout(input_dim=hidden_size, output_dim=hidden_size,
                           num_pieces=2, name='linear2')
    linear3 = Linear(input_dim=hidden_size, output_dim=output_dim)
    logistic = Logistic()
    bricks = [
        linear1, BatchNormalization(input_dim=hidden_size, name='bn2'),
        linear2, BatchNormalization(input_dim=hidden_size, name='bnl'),
        linear3, logistic
    ]
    # Only the linear bricks get explicit init; batch-norm keeps defaults
    for init_range, b in zip(init_ranges, (linear1, linear2, linear3)):
        b.biases_init = initialization.Constant(0)
        b.weights_init = initialization.Uniform(width=init_range)
    kwargs.setdefault('use_bias', False)
    super(ConcatenateClassifier, self).__init__(
        [b.apply for b in bricks], **kwargs)
def example2():
    """GRU demo: fork an input into linear/gate parts, run a GatedRecurrent,
    then repeat with the input first doubled by an identity*2 Linear layer.

    Prints the hidden-state sequences for a (dim, 1, dim) all-ones input.
    """
    x = tensor.tensor3('x')
    dim = 3

    # Fork produces the linear input (dim) and the gate inputs (2*dim)
    fork = Fork(input_dim=dim, output_dims=[dim, dim * 2],
                name='fork', output_names=["linear", "gates"],
                weights_init=initialization.Identity(),
                biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim,
                         weights_init=initialization.Identity(),
                         biases_init=Constant(0))
    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    # Identity(2) scales the identity matrix by 2, i.e. doubles the input
    doubler = Linear(input_dim=dim, output_dim=dim,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
def example():
    """ Simple reccurent example. Taken from :
    https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb

    Demonstrates three uses of SimpleRecurrent: plain application, applying
    it on a doubled input, and supplying an explicit initial state.
    """
    x = tensor.tensor3('x')

    # Identity activation + identity weights: states accumulate the input
    rnn = SimpleRecurrent(dim=3, activation=Identity(),
                          weights_init=initialization.Identity())
    rnn.initialize()
    h = rnn.apply(x)

    f = theano.function([x], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    # Identity(2) doubles the input before it enters the RNN
    doubler = Linear(input_dim=3, output_dim=3,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()
    h_doubler = rnn.apply(doubler.apply(x))

    f = theano.function([x], h_doubler)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    #Initial State
    h0 = tensor.matrix('h0')
    h = rnn.apply(inputs=x, states=h0)

    f = theano.function([x, h0], h)
    print(
        f(np.ones((3, 1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX)))
def initialize_lasthid(last_hid, matrixfile=None, max_dim=None):
    """Initialize the last hidden layer's weights with a fixed uniform
    matrix, optionally overwriting its first rows from a saved matrix.

    :param last_hid: a Linear-like brick exposing ``input_dim`` /
        ``output_dim`` and accepting ``weights_init`` / ``biases_init``
    :param matrixfile: optional path to a ``numpy.load``-able matrix whose
        first ``max_dim`` rows replace the random rows
    :param max_dim: number of rows to copy; defaults to the full length of
        the loaded matrix
    """
    # Fixed seed so the "random" part of the init is reproducible
    rng = numpy.random.RandomState(42)
    w = 0.08
    myarray = rng.uniform(-w, +w,
                          size=(last_hid.input_dim, last_hid.output_dim))
    print(myarray.shape)
    if matrixfile:
        typematrix = numpy.load(matrixfile)
        # FIX: identity comparison with None instead of `== None`
        if max_dim is None:
            max_dim = len(typematrix)
        print(typematrix.shape)
        myarray[0:max_dim, :] = typematrix[0:max_dim, :]
    print(myarray)
    # Constant(ndarray) installs the exact matrix as the weights
    last_hid.weights_init = initialization.Constant(myarray)
    last_hid.biases_init = initialization.Constant(0)
    last_hid.initialize()
def initialize2(brick, num_feature_maps):
    """Glorot-style uniform initialization for a convolutional brick.

    The bound is sqrt(6 / (fan_in + fan_out)) where fan_in is the receptive
    field size and fan_out is the receptive field size times the number of
    filters divided by ``num_feature_maps``. Biases start at zero.
    """
    receptive_field = numpy.prod(brick.filter_size)
    fan_in = receptive_field
    fan_out = receptive_field * brick.num_filters / num_feature_maps
    bound = numpy.sqrt(6. / (fan_in + fan_out))
    brick.biases_init = initialization.Constant(0)
    brick.weights_init = initialization.Uniform(width=bound)
    brick.initialize()
def initialize(to_init, width):
    """Apply Xavier-style parameter initialization to a single brick.

    Weights are drawn uniformly with the given width; biases start at zero.

    :param to_init: the brick to initialize
    :param width: width of the uniform distribution
    """
    to_init.biases_init = initialization.Constant(0)
    to_init.weights_init = initialization.Uniform(width=width)
    to_init.initialize()
def __init__(self, visual_dim, textual_dim, output_dim, hidden_size,
             init_ranges, **kwargs):
    """Fuse a visual and a textual input stream (batch-norm + LinearMaxout
    each) and classify the summed/combined representation with an
    MLPGenreClassifier.

    :param visual_dim: dimensionality of the visual features
    :param textual_dim: dimensionality of the textual features
    :param output_dim: number of output classes
    :param hidden_size: width of both maxout projections and the MLP
    :param init_ranges: five uniform-init widths: visual, textual, then the
        three linear layers of the classifier MLP
    """
    (visual_range, textual_range, linear_range_1, linear_range_2,
     linear_range_3) = init_ranges
    visual_layer = FeedforwardSequence([
        BatchNormalization(input_dim=visual_dim).apply,
        LinearMaxout(
            input_dim=visual_dim,
            output_dim=hidden_size,
            weights_init=initialization.Uniform(width=visual_range),
            use_bias=False,
            biases_init=initialization.Constant(0),
            num_pieces=2).apply
    ], name='visual_layer')
    textual_layer = FeedforwardSequence([
        BatchNormalization(input_dim=textual_dim).apply,
        LinearMaxout(
            input_dim=textual_dim,
            output_dim=hidden_size,
            weights_init=initialization.Uniform(width=textual_range),
            biases_init=initialization.Constant(0),
            use_bias=False,
            num_pieces=2).apply
    ], name='textual_layer')
    logistic_mlp = MLPGenreClassifier(
        hidden_size, output_dim, hidden_size,
        [linear_range_1, linear_range_2, linear_range_3])
    # logistic_mlp = Sequence([
    #    BatchNormalization(input_dim=hidden_size, name='bn1').apply,
    #    Linear(hidden_size, output_dim, name='linear_output', use_bias=False,
    #           weights_init=initialization.Uniform(width=linear_range_1)).apply,
    #    Logistic().apply
    # ], name='logistic_mlp')
    children = [visual_layer, textual_layer, logistic_mlp]
    kwargs.setdefault('use_bias', False)
    kwargs.setdefault('children', children)
    super(LinearSumClassifier, self).__init__(**kwargs)
def get_presoft(h, args): output_size = get_output_size(args.dataset) # If args.skip_connections: dim = args.layers * args.state_dim # else: dim = args.state_dim use_all_states = args.skip_connections or args.skip_output or ( args.rnn_type in ["clockwork", "soft"]) output_layer = Linear( input_dim=use_all_states * args.layers * args.state_dim + (1 - use_all_states) * args.state_dim, output_dim=output_size, name="output_layer") output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() presoft = output_layer.apply(h) if not has_indices(args.dataset): presoft = Tanh().apply(presoft) presoft.name = 'presoft' return presoft
def build_fork_lookup(vocab_size, args):
    """Build and compile a LookupTable-backed Fork over a small fixed
    configuration (3 layers with skip connections) for debugging.

    :param vocab_size: length of the embedding lookup table
    :param args: unused here (kept for interface parity with callers)
    :returns: a compiled theano function mapping 'features' to the list of
        forked 3D tensors (Batch X Time X embedding_dim)
    """
    x = tensor.lmatrix('features')
    virtual_dim = 6
    time_length = 5
    skip_connections = True
    layers = 3

    # Build the fork output names/dims: one input per layer when skip
    # connections are on, otherwise only the first layer.
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    print(output_names)
    print(output_dims)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)
    fork.initialize()

    f = theano.function([x], pre_rnn)
    return f
def initialize_inout(brick, fan_in, fan_out, seed=1):
    """Glorot-style uniform initialization with an explicit fan-in/fan-out.

    The uniform width is twice sqrt(6 / (fan_in + fan_out)); biases are
    zero-initialized.

    :param seed: unused (kept for backward compatibility of the signature)
    """
    glorot_bound = numpy.sqrt(6. / (fan_in + fan_out))
    brick.biases_init = initialization.Constant(0)
    brick.weights_init = initialization.Uniform(width=2 * glorot_bound)
    brick.initialize()
def initialize(to_init, rndstd=0.01):
    """Initialize every brick in ``to_init`` with a fixed-width uniform
    weight scheme (width 0.08) and zero biases.

    :param rndstd: unused — the uniform width is hard-coded to 0.08 (kept
        for backward compatibility of the signature)
    """
    for brick in to_init:
        brick.biases_init = initialization.Constant(0)
        brick.weights_init = initialization.Uniform(width=0.08)
        brick.initialize()
def build_model_hard(vocab_size, args, dtype=floatX):
    """Build a stacked RNN language model whose upper layers are
    HardGatedRecurrent transitions driven by per-layer gating MLPs.

    :param vocab_size: vocabulary size (lookup length and output dim)
    :param args: experiment arguments (context, state_dim, layers,
        skip_connections, mini_batch_size)
    :returns: ``(cost, cross_entropy, updates)`` where ``updates`` carries
        the hidden states across batches
    """
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model: one fork output per layer when skip connections are
    # on, otherwise only the bottom layer gets an input.
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Bottom layer is a plain tanh RNN; each upper layer is hard-gated,
    # with a Logistic MLP over the 2*state_dim concatenated inputs.
    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN: forked inputs plus one shared initial
    # state variable per layer (carried across batches via `updates`).
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    # NOTE(review): with layers == 1, h stays a list here and the
    # name/slice operations below would fail — the code appears to assume
    # layers > 1; confirm against callers.
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    # Drop the first `context` time steps before predicting
    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    # Report in bits rather than nats
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
from blocks.bricks.parallel import Fork

# Build a SequenceGenerator around a simple recurrent transition and
# generate a short sequence. NOTE(review): `dimension`, `SimpleRecurrent2`
# and `n_steps` are defined elsewhere in this file/scope — confirm.
transition = SimpleRecurrent2(dim=dimension, activation=Identity())
readout = Readout(
    readout_dim=dimension,
    source_names=transition.apply.states + ["feedback"],
    name="readout")
generator = SequenceGenerator(
    readout=readout,
    transition=transition,
    fork=Fork(['inputs'], prototype=Identity()),
    weights_init=initialization.Identity(1.),
    biases_init=initialization.Constant(0.),
    name="generator")
# Push the generator-level init config down to children, then override the
# inner transition's weights with a scaled identity before initializing.
generator.push_initialization_config()
#generator.fork.weights_init = initialization.Identity(1.)
generator.transition.transition.weights_init = initialization.Identity(2.)
generator.initialize()

results = generator.generate(n_steps=n_steps, batch_size=2, iterate=True,
                             return_initial_states=True)
results_cg = ComputationGraph(results)
results_tf = results_cg.get_theano_function()
# Index 1 of the generate outputs is taken as the generated sequence
generated_sequence_t = results_tf()[1]
def get_prernn(args):
    """Build the pre-RNN input pipeline: embed (or linearly project) the
    'features' input and fork it into one stream per layer.

    :param args: experiment arguments (rnn_type, state_dim, layers,
        skip_connections, dataset, mini_batch_size, used_inputs)
    :returns: ``(prernn, x_mask)`` — the forked input tensor(s) and the
        sequence mask (all-ones when the dataset has no mask)
    """
    # time x batch
    x_mask = tensor.fmatrix('mask')

    # Compute the state dim (LSTM inputs carry 4 gate pre-activations)
    if args.rnn_type == 'lstm':
        state_dim = 4 * args.state_dim
    else:
        state_dim = args.state_dim

    # Prepare the arguments for the fork
    output_names = []
    output_dims = []
    for d in range(args.layers):
        if d > 0:
            suffix = RECURRENTSTACK_SEPARATOR + str(d)
        else:
            suffix = ''
        if d == 0 or args.skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    # Prepare the brick to be forked (LookupTable or Linear)
    # Check if the dataset provides indices (in the case of a
    # fixed vocabulary, x is 2D tensor) or if it gives raw values
    # (x is 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)
        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)
    else:
        x = tensor.tensor3('features', dtype=floatX)
        # Zero out the time steps past args.used_inputs, if requested
        if args.used_inputs is not None:
            x = tensor.set_subtensor(
                x[args.used_inputs:, :, :],
                tensor.zeros_like(x[args.used_inputs:, :, :],
                                  dtype=floatX))
        features = get_output_size(args.dataset)
        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define the fork
    fork = Fork(output_names=output_names, input_dim=features,
                output_dims=output_dims, prototype=forked)
    fork.initialize()

    # Apply the fork
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if args.skip_connections:
        for t in range(len(prernn)):
            prernn[t].name = "pre_rnn_" + str(t)
    else:
        prernn.name = "pre_rnn"

    return prernn, x_mask
# Build a SequenceGenerator with trivial emitter/feedback bricks and
# generate one sequence. NOTE(review): `dimension`, `transition`,
# `TrivialEmitter2`, `TrivialFeedback` and `n_steps` come from the
# surrounding scope — confirm.
readout = Readout(
    readout_dim=dimension,
    source_names=['states', 'feedback'],
    emitter=TrivialEmitter2(readout_dim=dimension),
    feedback_brick=TrivialFeedback(output_dim=dimension),
    #merge = Merge(),
    post_merge=Identity(),
    merged_dim=dimension,
    name="readout")
generator = SequenceGenerator(
    readout=readout,
    transition=transition,
    fork=Fork(['inputs'], prototype=Identity()),
    weights_init=initialization.Identity(1.),
    biases_init=initialization.Constant(0.),
    name="generator")
# Override the inner transition's weights after pushing init config down
generator.push_initialization_config()
generator.transition.transition.weights_init = initialization.Identity(2.)
generator.initialize()

results = generator.generate(n_steps=n_steps, batch_size=1, iterate=True,
                             return_initial_states=True)
results_cg = ComputationGraph(results)
results_tf = results_cg.get_theano_function()
# Index 1 of the generate outputs is the generated sequence; reshape it
# into (n_steps + 1, dimension) — +1 for the returned initial state.
generated_sequence_t = results_tf()[1]
generated_sequence_t.shape = (n_steps + 1, dimension)
def build_fork_lookup(vocab_size, time_length, args):
    """Build and compile a LookupTable-backed Fork feeding a Clockwork
    RecurrentStack over a small fixed configuration (1 layer, no skip
    connections) for debugging.

    :param vocab_size: length of the embedding lookup table
    :param time_length: fork input dimension (sequence length)
    :param args: unused here (kept for interface parity with callers)
    :returns: ``(f_pre_rnn, f_h)`` — compiled functions for the forked
        inputs and the RNN hidden states
    """
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Note that this order of the periods makes faster modules flow in
    # slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn
    print(kwargs)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h
def build_model_soft(args, dtype=floatX): logger.info('Building model ...') # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn, x_mask = get_prernn(args) transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())] # Build the MLP dims = [2 * args.state_dim] activations = [] for i in range(args.mlp_layers): activations.append(Rectifier()) dims.append(args.state_dim) # Activation of the last layer of the MLP if args.mlp_activation == "logistic": activations.append(Logistic()) elif args.mlp_activation == "rectifier": activations.append(Rectifier()) elif args.mlp_activation == "hard_logistic": activations.append(HardLogistic()) else: assert False # Output of MLP has dimension 1 dims.append(1) for i in range(args.layers - 1): mlp = MLP(activations=activations, dims=dims, weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( SoftGatedRecurrent(dim=args.state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=args.skip_connections) initialize_rnn(rnn, args) # Prepare inputs and initial states for the RNN kwargs, inits = get_rnn_kwargs(pre_rnn, args) # Apply the RNN to the inputs h = rnn.apply(low_memory=True, mask=x_mask, **kwargs) # Now we have: # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...] # Extract gate_values gate_values = h[2::2] new_h = [h[0]] new_h.extend(h[1::2]) h = new_h # Now we have: # h = [state, state_1, state_2, ...] 
# gate_values = [gate_value_1, gate_value_2, gate_value_3] for i, gate_value in enumerate(gate_values): gate_value.name = "gate_value_" + str(i) # Save all the last states last_states = {} hidden_states = [] for d in range(args.layers): h[d] = h[d] * x_mask last_states[d] = h[d][-1, :, :] h[d].name = "hidden_state_" + str(d) hidden_states.append(h[d]) # Concatenate all the states if args.layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state_all" # The updates of the hidden states updates = [] for d in range(args.layers): updates.append((inits[0][d], last_states[d])) presoft = get_presoft(h, args) cost, cross_entropy = get_costs(presoft, args) return cost, cross_entropy, updates, gate_values, hidden_states
def build_model(args, dtype=floatX):
    """Build the Q-value network for the Rubik's-cube RL task: embed the
    sticker state and the action, feed their concatenation to an MLP and
    regress the reward with a squared-error cost.

    :param args: experiment arguments (embed_dim, cube_size, layers,
        units_per_layer, load_path)
    :returns: ``(Q, gradient_descent_step, params)`` — the compiled Q
        function, one SGD step function, and the model parameters
    """
    logger.info('Building model ...')

    # Variables of the model
    # the rubik's cube stickers
    x = tensor.bmatrix("x")
    # the action taken
    action = tensor.bmatrix("action")
    # y is the reward (Batch,)
    y = tensor.fvector("y")

    #####
    # LookupTable
    #####
    # 6 sticker colors; actions index 6 faces + cube_size + 3 extra entries
    lookup_x = LookupTable(length=6, dim=args.embed_dim)
    lookup_action = LookupTable(length=6 + args.cube_size + 3,
                                dim=args.embed_dim)

    lookup_x.name = "lookup_x"
    lookup_x.weights_init = initialization.IsotropicGaussian(0.1)
    lookup_x.biases_init = initialization.Constant(0)

    lookup_action.name = "lookup_action"
    lookup_action.weights_init = initialization.IsotropicGaussian(0.1)
    lookup_action.biases_init = initialization.Constant(0)

    lookup_x.initialize()
    lookup_action.initialize()

    x_embeded = lookup_x.apply(x)
    action_embeded = lookup_action.apply(action)

    #####
    # MLP
    #####
    # Make x_embeded and action_embeded 2D (flatten the embedding axes)
    x_embeded = x_embeded.reshape(
        (x_embeded.shape[0], x_embeded.shape[1] * x_embeded.shape[2]))
    action_embeded = action_embeded.reshape(
        (action_embeded.shape[0],
         action_embeded.shape[1] * action_embeded.shape[2]))

    # Concatenate inputs :
    mlp_input = tensor.concatenate((x_embeded, action_embeded), axis=1)

    # Bricks
    l = args.layers
    activations = []
    # first layer dimension
    dims = [args.embed_dim * (6 * (args.cube_size**2) + 3)]
    # every hidden layer dimension and activation function
    for _ in range(l):
        activations.append(Rectifier())
        dims.append(args.units_per_layer)
    # last layer dimension
    dims[-1] = 1

    mlp = MLP(activations=activations, dims=dims)

    y_hat = mlp.apply(mlp_input)

    # dimshuffle makes y a column so it broadcasts against y_hat
    cost = SquaredError().apply(y.dimshuffle(0, "x"), y_hat)
    cost.name = "mean_squared_error"

    # Initialization
    mlp.weights_init = initialization.IsotropicGaussian(0.1)
    mlp.biases_init = initialization.Constant(0)
    mlp.initialize()

    # Q function
    # Check if the parameters in this function will change through
    # the updates of the gradient descent
    Q = theano.function(inputs=[x, action], outputs=y_hat,
                        allow_input_downcast=True)

    # Cost, gradient and learning rate
    lr = tensor.scalar('lr')
    params = ComputationGraph(cost).parameters
    gradients = tensor.grad(cost, params)
    # Plain SGD updates: p <- p - lr * g
    updates = OrderedDict((p, p - lr * g) for p, g in zip(params, gradients))

    # Function to call to perfom a gradient descent on (y - Q)^2
    gradient_descent_step = theano.function(
        [x, action, y, lr], cost, updates=updates,
        allow_input_downcast=True)

    # Load the good parameters
    if args.load_path is not None:
        param_values = load_parameter_values(args.load_path)
        model = Model(cost)
        model.set_parameter_values(param_values)

    return Q, gradient_descent_step, params
from datasets import parrot_stream
from model import Parrot
from utils import train_parse

# Parse the training configuration and persist it next to the experiment.
args = train_parse()

exp_name = args.experiment_name
save_dir = args.save_dir

print("Saving config ...")
# FIX: open in binary mode — pickle streams are bytes, and text mode ('w')
# corrupts them on platforms that translate newlines.
with open(os.path.join(save_dir, 'config', exp_name + '.pkl'), 'wb') as f:
    cPickle.dump(args, f)
print("Finished saving.")

# Default initialization schemes shared by the model bricks
w_init = initialization.IsotropicGaussian(0.01)
b_init = initialization.Constant(0.)

train_stream = parrot_stream(
    args.dataset, args.use_speaker, ('train',), args.batch_size,
    noise_level=args.feedback_noise_level, labels_type=args.labels_type,
    seq_size=args.seq_size, raw_data=args.raw_output)

# Validation uses zero feedback noise unless noise is disabled entirely
if args.feedback_noise_level is None:
    val_noise_level = None
else:
    val_noise_level = 0.

valid_stream = parrot_stream(
    args.dataset, args.use_speaker, ('valid',), args.batch_size,
    noise_level=val_noise_level, labels_type=args.labels_type,
    seq_size=args.seq_size, raw_data=args.raw_output)
def build_model_lstm(vocab_size, args, dtype=floatX):
    """Build a stacked-LSTM language model.

    :param vocab_size: vocabulary size (lookup length and output dim)
    :param args: experiment arguments (context, state_dim, layers,
        skip_connections, mini_batch_size, load_path)
    :returns: ``(cost, cross_entropy, updates, gate_values)`` where
        ``updates`` carries hidden and cell states across batches and
        ``gate_values`` exposes the in/forget/out gate activations
    """
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # LSTM inputs carry the 4 concatenated gate pre-activations
    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model: one fork output per layer when skip connections are
    # on, otherwise only the bottom layer gets an input.
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [LSTM(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN: shared initial state AND cell per layer,
    # both carried across batches via `updates`.
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        init_cells[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
            name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]

    # Save the last states AND the last cells of each layer
    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states
    # BUG FIX: the cell update previously carried over last_states[d]
    # instead of last_cells[d], resetting the LSTM cells to hidden-state
    # values between batches.
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]

    # Extract the gate values (strided slices over the 5 outputs per layer)
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    # Keep only the states
    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    # Drop the first `context` time steps before predicting
    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    # Report in bits rather than nats
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    # Dont initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
from fuel.datasets import IterableDataset from blocks.filter import VariableFilter from blocks.roles import PARAMETER from collections import OrderedDict N_CLASSES = len(MORSE_CHR) x = T.ftensor3('x') input_layer = br.MLP(activations=[br.Rectifier()] * 2, dims=[CHUNK, 128, 128], name='input_layer', weights_init=blinit.Orthogonal(0.9), biases_init=blinit.Constant(0.0)) input_layer_app = input_layer.apply(x) input_layer.initialize() recurrent_layer = brrec.SimpleRecurrent(dim=128, activation=br.Rectifier(), name='recurrent_layer', weights_init=blinit.Orthogonal(0.01), biases_init=blinit.Constant(0.0)) state = T.fmatrix('state') recurrent_layer_app = recurrent_layer.apply(input_layer_app, state, iterate=False) recurrent_layer.initialize() output_layer = br.MLP(activations=[br.Rectifier()] * 1 + [None],
def initialize_identity(to_init):
    """Give every brick in ``to_init`` identity-initialized weights and
    zero-initialized biases, then initialize it."""
    for brick in to_init:
        brick.biases_init = initialization.Constant(0)
        brick.weights_init = initialization.Identity()
        brick.initialize()