def build_model(input_shape, num_hidden, num_output, grad_clipping):
    l_in = InputLayer(input_shape, name='l_in')
    l_lstm1 = LSTMLayer(
        l_in, name='l_lstm1', num_units=num_hidden,
        grad_clipping=grad_clipping, nonlinearity=tanh,
    )
    l_lstm2 = LSTMLayer(
        l_lstm1, name='l_lstm2', num_units=num_hidden,
        grad_clipping=grad_clipping, nonlinearity=tanh,
        only_return_final=True,
    )
    l_out = DenseLayer(l_lstm2, name='l_out', W=Normal(),
                       num_units=num_output, nonlinearity=softmax)
    layers = get_all_layers(l_out)
    return {layer.name: layer for layer in layers}
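# A minimal usage sketch for build_model. Hedged: assumes the same Lasagne
# imports the function itself relies on; the shapes and sizes below are
# illustrative assumptions, not values from the original code.
import theano
import lasagne

layers = build_model(input_shape=(None, 20, 8), num_hidden=64,
                     num_output=10, grad_clipping=100)
probs = lasagne.layers.get_output(layers['l_out'])
predict_fn = theano.function([layers['l_in'].input_var], probs)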
def nn_fn(self):
    l_in = InputLayer((None, self.max_length, self.emb_dim))
    l_mask = InputLayer((None, self.max_length))

    l_h = l_in
    l_h_all = []
    for h in range(self.rnn_depth):
        if self.rnn_bidirectional:
            l_fwd = LSTMLayer(l_h, num_units=self.rnn_hid_units,
                              mask_input=l_mask)
            l_bwd = LSTMLayer(l_h, num_units=self.rnn_hid_units,
                              mask_input=l_mask, backwards=True)
            l_h = ConcatLayer((l_fwd, l_bwd), axis=-1)
        else:
            l_h = LSTMLayer(l_h, num_units=self.rnn_hid_units,
                            mask_input=l_mask)
        l_h_all.append(l_h)

    l_h = SliceLayer(ElemwiseSumLayer(l_h_all), indices=-1, axis=1)

    for i in range(self.nn_dense_depth):
        l_h = DenseLayer(l_h, num_units=self.nn_dense_hid_units)

    l_mean = DenseLayer(l_h, self.z_dim, nonlinearity=None)
    l_cov = DenseLayer(l_h, self.z_dim, nonlinearity=softplus_safe)

    return (l_in, l_mask), (l_mean, l_cov)
def test_lstm_unroll_scan_fwd():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)
    l_mask_inp = InputLayer(in_shp[:2])

    x_in = np.random.random(in_shp).astype('float32')
    mask_in = np.ones(in_shp[:2]).astype('float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_lstm_scan = LSTMLayer(l_inp, num_units=num_units, backwards=False,
                            unroll_scan=False, mask_input=l_mask_inp)
    lasagne.random.get_rng().seed(1234)
    l_lstm_unrolled = LSTMLayer(l_inp, num_units=num_units, backwards=False,
                                unroll_scan=True, mask_input=l_mask_inp)
    output_scan = helper.get_output(l_lstm_scan)
    output_unrolled = helper.get_output(l_lstm_unrolled)

    output_scan_val = output_scan.eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})
    output_unrolled_val = output_unrolled.eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})

    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
def test_lstm_precompute():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)
    l_mask_inp = InputLayer(in_shp[:2])

    x_in = np.random.random(in_shp).astype('float32')
    mask_in = np.ones((num_batch, seq_len), dtype='float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_lstm_precompute = LSTMLayer(
        l_inp, num_units=num_units, precompute_input=True,
        mask_input=l_mask_inp)
    lasagne.random.get_rng().seed(1234)
    l_lstm_no_precompute = LSTMLayer(
        l_inp, num_units=num_units, precompute_input=False,
        mask_input=l_mask_inp)
    output_precompute = helper.get_output(l_lstm_precompute).eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})
    output_no_precompute = helper.get_output(l_lstm_no_precompute).eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})

    # Precomputing the input-to-hidden dot product must not change the output.
    np.testing.assert_almost_equal(output_precompute, output_no_precompute)
def build_convpool_lstm(input_vars, input_shape=None):
    """
    Builds the complete network with an LSTM layer to integrate time from
    sequences of EEG images.

    :param input_vars: list of EEG images (one image per time window)
    :param input_shape: shape of the input; input_shape[0] is the number of
        time windows
    :return: a pointer to the output of the last layer

    Note: grad_clip and num_classes are assumed to be module-level globals.
    """
    convnets = []
    W_init = None
    # Build parallel CNNs (one per time window) with shared weights.
    for i in range(input_shape[0]):
        if i == 0:
            convnet, W_init = build_cnn(input_vars[i], input_shape)
        else:
            convnet, _ = build_cnn(input_vars[i], input_shape, W_init)
        convnets.append(FlattenLayer(convnet))
    # At this point convnets shape is [numTimeWin][n_samples, features];
    # we want the shape to be [n_samples, features, numTimeWin].
    convpool = ConcatLayer(convnets)
    # convpool = ReshapeLayer(convpool, ([0], -1, numTimeWin))
    convpool = ReshapeLayer(
        convpool, ([0], input_shape[0], get_output_shape(convnets[0])[1]))
    # Input to LSTM should have the shape (batch size, SEQ_LENGTH, num_features).
    convpool = LSTMLayer(convpool, num_units=32, grad_clipping=grad_clip,
                         nonlinearity=lasagne.nonlinearities.sigmoid)
    # convpool = lasagne.layers.dropout(convpool, p=.3)
    convpool = LSTMLayer(convpool, num_units=32, grad_clipping=grad_clip,
                         nonlinearity=lasagne.nonlinearities.sigmoid)
    # After the LSTM layer you either need to reshape or slice it (depending on
    # whether you want to keep all predictions or just the last one).
    # http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html
    # https://github.com/Lasagne/Recipes/blob/master/examples/lstm_text_generation.py
    convpool = SliceLayer(convpool, -1, 1)  # Select the last prediction only.
    # A fully-connected layer of 256 units with 50% dropout on its inputs:
    convpool = DenseLayer(lasagne.layers.dropout(convpool, p=.5),
                          num_units=256,
                          nonlinearity=lasagne.nonlinearities.rectify)
    # And, finally, the output layer with 50% dropout on its inputs:
    convpool = DenseLayer(lasagne.layers.dropout(convpool, p=.5),
                          num_units=num_classes,
                          nonlinearity=lasagne.nonlinearities.softmax)
    return convpool
def create_blstm_dropout(input_vars, mask_vars, num_inputs, hidden_layer_size,
                         num_outputs, dropout=0.2, noise=0.2):
    network = InputLayer((None, None, num_inputs), input_vars)
    mask = InputLayer((None, None), mask_vars)
    batch_size_theano, seqlen, _ = network.input_var.shape

    network = GaussianNoiseLayer(network, sigma=noise)
    for i in range(4):
        forward = LSTMLayer(network, hidden_layer_size, mask_input=mask,
                            learn_init=True)
        backward = LSTMLayer(network, hidden_layer_size, mask_input=mask,
                             learn_init=True, backwards=True)
        network = DropoutLayer(
            GaussianNoiseLayer(ElemwiseSumLayer([forward, backward]), noise),
            dropout)

    network = ReshapeLayer(network, (-1, hidden_layer_size))
    network = DenseLayer(network, num_outputs, nonlinearity=softmax)
    network = ReshapeLayer(network, (batch_size_theano, seqlen, num_outputs))
    return network
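# Sketch of instantiating the stack above. Hedged: the tensor variables and
# sizes are illustrative assumptions, not values from the original code.
import theano.tensor as T

x_sym, m_sym = T.tensor3('x'), T.matrix('m')
blstm_net = create_blstm_dropout(x_sym, m_sym, num_inputs=40,
                                 hidden_layer_size=128, num_outputs=61)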
def __init__(self): print("Initialising network...") import theano import theano.tensor as T import lasagne from lasagne.layers import (InputLayer, LSTMLayer, ReshapeLayer, ConcatLayer, DenseLayer) theano.config.compute_test_value = 'raise' # Construct LSTM RNN: One LSTM layer and one dense output layer l_in = InputLayer(shape=input_shape) # setup fwd and bck LSTM layer. l_fwd = LSTMLayer( l_in, N_HIDDEN, backwards=False, learn_init=True, peepholes=True) l_bck = LSTMLayer( l_in, N_HIDDEN, backwards=True, learn_init=True, peepholes=True) # concatenate forward and backward LSTM layers concat_shape = (N_SEQ_PER_BATCH * SEQ_LENGTH, N_HIDDEN) l_fwd_reshape = ReshapeLayer(l_fwd, concat_shape) l_bck_reshape = ReshapeLayer(l_bck, concat_shape) l_concat = ConcatLayer([l_fwd_reshape, l_bck_reshape], axis=1) l_recurrent_out = DenseLayer(l_concat, num_units=N_OUTPUTS, nonlinearity=None) l_out = ReshapeLayer(l_recurrent_out, output_shape) input = T.tensor3('input') target_output = T.tensor3('target_output') # add test values input.tag.test_value = rand( *input_shape).astype(theano.config.floatX) target_output.tag.test_value = rand( *output_shape).astype(theano.config.floatX) print("Compiling Theano functions...") # Cost = mean squared error cost = T.mean((l_out.get_output(input) - target_output)**2) # Use NAG for training all_params = lasagne.layers.get_all_params(l_out) updates = lasagne.updates.nesterov_momentum(cost, all_params, LEARNING_RATE) # Theano functions for training, getting output, and computing cost self.train = theano.function( [input, target_output], cost, updates=updates, on_unused_input='warn', allow_input_downcast=True) self.y_pred = theano.function( [input], l_out.get_output(input), on_unused_input='warn', allow_input_downcast=True) self.compute_cost = theano.function( [input, target_output], cost, on_unused_input='warn', allow_input_downcast=True) print("Done initialising network.")
def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters,
                 gate_parameters, name):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5.,
        name='f_{}'.format(name))

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back = LSTMLayer(
        l_incoming, hidden_units, ingate=gate_parameters,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True,
        name='b_{}'.format(name))

    return l_lstm, l_lstm_back
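# A minimal sketch of wiring the returned pair into a single bidirectional
# representation. Hedged: layer shapes and unit counts are illustrative
# assumptions; passing None uses the default Gate() parameters above.
from lasagne.layers import InputLayer, ConcatLayer

l_seq = InputLayer((None, None, 30))
l_seq_mask = InputLayer((None, None))
l_f, l_b = create_blstm(l_seq, l_seq_mask, 64, None, None, 'lstm0')
l_bi = ConcatLayer([l_f, l_b], axis=2)  # (batch, seq_len, 2 * 64)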
def test_lstm_unroll_scan_bck():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    x = T.tensor3()
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)
    x_in = np.random.random(in_shp).astype('float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_lstm_scan = LSTMLayer(l_inp, num_units=num_units, backwards=True,
                            unroll_scan=False)
    lasagne.random.get_rng().seed(1234)
    l_lstm_unrolled = LSTMLayer(l_inp, num_units=num_units, backwards=True,
                                unroll_scan=True)
    output_scan = helper.get_output(l_lstm_scan, x)
    output_scan_unrolled = helper.get_output(l_lstm_unrolled, x)

    output_scan_val = output_scan.eval({x: x_in})
    output_unrolled_val = output_scan_unrolled.eval({x: x_in})

    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
def __init__(self, num_batch, max_len, n_features, hidden=[200, 200],
             **kwargs):
    self.num_batch = num_batch
    self.n_features = n_features
    self.max_len = max_len
    self.hidden = hidden
    rng = np.random.RandomState(123)
    self.drng = rng
    self.rng = RandomStreams(rng.randint(2**30))

    # params
    # initial_W = np.asarray(
    #     rng.uniform(low=1e-5, high=1,
    #                 size=(self.hidden[1], self.n_features)),
    #     dtype=theano.config.floatX)
    # self.W_y_theta = theano.shared(value=initial_W, name='W_y_theta',
    #                                borrow=True)
    # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa',
    #                                borrow=True)
    # self.b_y_theta = theano.shared(
    #     value=np.zeros(self.n_features, dtype=theano.config.floatX),
    #     borrow=True)
    # self.b_y_kappa = theano.shared(
    #     value=np.zeros(self.n_features, dtype=theano.config.floatX),
    #     name='b', borrow=True)

    # I could directly create the model here since it is fixed.
    self.l_in = InputLayer(shape=(self.num_batch, self.max_len,
                                  self.n_features))
    self.mask_input = InputLayer(shape=(self.num_batch, self.max_len))
    first_hidden = LSTMLayer(self.l_in, mask_input=self.mask_input,
                             num_units=hidden[0], nonlinearity=rectify)
    second_hidden = LSTMLayer(first_hidden, num_units=hidden[1],
                              nonlinearity=rectify)
    # Need some reshape voodoo: after the reshape we have
    # (batch * max_len) x features.
    l_shp = ReshapeLayer(second_hidden, (-1, hidden[1]))
    self.model = DenseLayer(l_shp, num_units=self.n_features,
                            nonlinearity=rectify)
def test_lstm_init_val_error():
    # Check that errors are raised when inits are non-matrix tensors.
    vector = T.vector()
    with pytest.raises(ValueError):
        l_rec = LSTMLayer(InputLayer((2, 2, 3)), 5, hid_init=vector)
    with pytest.raises(ValueError):
        l_rec = LSTMLayer(InputLayer((2, 2, 3)), 5, cell_init=vector)
def create_network(config, BATCH_SIZE):
    input_dim = config['input_dim']
    num_labels = config['num_labels']

    input_layer = InputLayer(shape=(BATCH_SIZE, input_dim // 2, 2))
    hidden_layer_1 = LSTMLayer(input_layer, 100)
    hidden_layer_2 = LSTMLayer(hidden_layer_1, 50, only_return_final=True)
    output_layer = DenseLayer(hidden_layer_2, num_units=num_labels,
                              nonlinearity=lasagne.nonlinearities.softmax)
    return locals()
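# Usage sketch: create_network returns locals(), so layers are looked up by
# variable name. Hedged: the config values below are illustrative assumptions.
import theano
import lasagne

net = create_network({'input_dim': 128, 'num_labels': 10}, BATCH_SIZE=32)
out = lasagne.layers.get_output(net['output_layer'])
predict_fn = theano.function([net['input_layer'].input_var], out)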
def build_lstm(input_vars, input_shape=None):
    '''
    1) InputLayer
    2) ReshapeLayer
    3) LSTM Layer 1
    4) LSTM Layer 2
    5) Slice Layer
    6) Fully Connected Layer 1 w/ dropout, rectify
    7) Fully Connected Layer 2 w/ dropout, softmax

    Note: num_input_channels, grad_clip and num_classes are assumed
    module-level globals.
    '''
    # Input to LSTM should have the shape (batch size, SEQ_LENGTH, num_features).
    network = InputLayer(shape=(input_shape[0], None, num_input_channels,
                                input_shape[-3], input_shape[-2],
                                input_shape[-1]),
                         input_var=input_vars)
    network = ReshapeLayer(network, ([0], [1], -1))
    network = DimshuffleLayer(network, (1, 0, 2))
    # network = ReshapeLayer(network, (-1, 128))
    # l_inp = InputLayer((None, None, num_inputs))
    l_lstm1 = LSTMLayer(network, num_units=128, grad_clipping=grad_clip,
                        nonlinearity=lasagne.nonlinearities.tanh)
    # Second LSTM layer.
    l_lstm2 = LSTMLayer(l_lstm1, num_units=128, grad_clipping=grad_clip,
                        nonlinearity=lasagne.nonlinearities.tanh)
    # After the LSTM layer you either need to reshape or slice it (depending on
    # whether you want to keep all predictions or just the last one).
    # http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html
    # https://github.com/Lasagne/Recipes/blob/master/examples/lstm_text_generation.py
    l_lstm_slice = SliceLayer(l_lstm2, -1, 1)  # Select the last prediction.
    # A fully-connected layer of 256 units with 50% dropout on its inputs:
    l_dense = DenseLayer(lasagne.layers.dropout(l_lstm_slice, p=.5),
                         num_units=256,
                         nonlinearity=lasagne.nonlinearities.rectify)
    # And, finally, the output layer with 50% dropout on its inputs:
    l_dense = DenseLayer(lasagne.layers.dropout(l_dense, p=.5),
                         num_units=num_classes,
                         nonlinearity=lasagne.nonlinearities.softmax)
    return l_dense
def build_lstm(input_layer):
    # network = sliding_window_input(input_layer)
    network = DimshuffleLayer(input_layer, (0, 1, 'x'))
    n_hidden = 50
    grad_clipping = 20
    network = LSTMLayer(network, num_units=n_hidden,
                        grad_clipping=grad_clipping, nonlinearity=tanh)
    network = LSTMLayer(network, num_units=n_hidden,
                        grad_clipping=grad_clipping, nonlinearity=tanh)
    network = SliceLayer(network, indices=-1, axis=1)
    # network = DenseLayer(network, num_units=256, nonlinearity=rectify)
    return network
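# Driving sketch: the DimshuffleLayer above expects a 2D (batch, time) input
# and appends a singleton feature axis. Hedged: the sequence length is an
# illustrative assumption.
from lasagne.layers import InputLayer

l_sig = InputLayer((None, 100))   # (batch, time)
l_feat = build_lstm(l_sig)        # (batch, 50) after the final slice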
def test_lstm_hid_init_layer_eval():
    # Test `hid_init` as a `Layer` with some dummy input. Compare the output of
    # a network with a `Layer` as input to `hid_init` to a network with a
    # `np.array` as input to `hid_init`.
    n_units = 7
    n_test_cases = 2
    in_shp = (n_test_cases, 2, 3)
    in_h_shp = (1, n_units)
    in_cell_shp = (1, n_units)

    # dummy inputs
    X_test = np.ones(in_shp, dtype=theano.config.floatX)
    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
    Xc_test = np.ones(in_cell_shp, dtype=theano.config.floatX)
    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))
    Xc_test_batch = np.tile(Xc_test, (n_test_cases, 1))

    # network with `Layer` initializer for hid_init
    l_inp = InputLayer(in_shp)
    l_inp_h = InputLayer(in_h_shp)
    l_inp_cell = InputLayer(in_cell_shp)
    l_rec_inp_layer = LSTMLayer(l_inp, n_units, hid_init=l_inp_h,
                                cell_init=l_inp_cell, nonlinearity=None)

    # network with `np.array` initializer for hid_init
    l_rec_nparray = LSTMLayer(l_inp, n_units, hid_init=Xh_test,
                              cell_init=Xc_test, nonlinearity=None)

    # copy network parameters from l_rec_inp_layer to l_rec_nparray
    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
    for k, v in l_rn_param.items():
        if k in l_il_param:
            v.set_value(l_il_param[k].get_value())

    # build the theano functions
    X = T.tensor3()
    Xh = T.matrix()
    Xc = T.matrix()
    output_inp_layer = lasagne.layers.get_output(
        l_rec_inp_layer, {l_inp: X, l_inp_h: Xh, l_inp_cell: Xc})
    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})

    # test both nets with dummy input
    output_val_inp_layer = output_inp_layer.eval(
        {X: X_test, Xh: Xh_test_batch, Xc: Xc_test_batch})
    output_val_nparray = output_nparray.eval({X: X_test})

    # check output given `Layer` is the same as with `np.array`
    assert np.allclose(output_val_inp_layer, output_val_nparray)
def build_discriminator(input_var=None, dim_h=128, n_steps=1):
    # N_WORDS, GRAD_CLIP, L_GEN and logger are assumed module-level globals.
    layer = InputLayer(shape=(None, None, N_WORDS), input_var=input_var)
    for i in range(n_steps):
        layer = LSTMLayer(layer, dim_h, grad_clipping=GRAD_CLIP,
                          nonlinearity=tanh)
    # Final LSTM layer.
    layer = LSTMLayer(layer, dim_h, grad_clipping=GRAD_CLIP,
                      nonlinearity=tanh)
    layer = ReshapeLayer(layer, (-1, dim_h))
    layer = DenseLayer(layer, 1, nonlinearity=None)
    layer = ReshapeLayer(layer, (-1, L_GEN))
    logger.debug('Discriminator output: {}'.format(layer.output_shape))
    return layer
def init_nn_structure(self, seq_length, pred_len):
    """
    Inits network structure

    :param seq_length: number of features
    :type seq_length: int
    :param pred_len: number of predicted values (target dimensionality)
    :type pred_len: int
    :return: None
    """
    self.iteration = 0
    theano_input = T.tensor3()
    theano_output = T.matrix()

    from lasagne.layers import (InputLayer, LSTMLayer, DenseLayer,
                                ExpressionLayer, ConcatLayer)
    from lasagne.nonlinearities import tanh

    model = {}
    model['input_layer'] = InputLayer((None, seq_length, 1),
                                      input_var=theano_input)
    lst_concat = []
    for i, key in enumerate(self.feature_dict.keys()):
        if self.feature_dict[key] is None or len(self.feature_dict[key]) == 0:
            continue
        # Bind `key` as a default argument: a plain `lambda X: ...` would
        # late-bind `key` and make every slice use the last key of the loop.
        model['input_slice_' + str(i)] = ExpressionLayer(
            model['input_layer'],
            lambda X, key=key: X[:, self.feature_dict[key], :])
        num_units = (self.num_lstm_units_large
                     if len(self.feature_dict[key]) > 10
                     else self.num_lstm_units_small)
        model['hidden_layer_' + str(i) + '_1'] = LSTMLayer(
            model['input_slice_' + str(i)], num_units,
            grad_clipping=self.grad_clip, nonlinearity=tanh)
        model['hidden_layer_' + str(i) + '_2'] = LSTMLayer(
            model['hidden_layer_' + str(i) + '_1'], num_units,
            grad_clipping=self.grad_clip, nonlinearity=tanh,
            only_return_final=True)
        lst_concat.append(model['hidden_layer_' + str(i) + '_2'])

    model['concatenate_hidden'] = ConcatLayer(lst_concat, axis=1)
    model['output_layer'] = DenseLayer(model['concatenate_hidden'], pred_len,
                                       nonlinearity=None)

    model_output = lasagne.layers.get_output(model['output_layer'])
    params = lasagne.layers.get_all_params(model['output_layer'],
                                           trainable=True)
    self.loss = lasagne.objectives.squared_error(model_output,
                                                 theano_output).mean()
    self.lr = theano.shared(np.array(self.learning_rate, dtype='float32'))
    self.updates = lasagne.updates.adam(self.loss, params,
                                        learning_rate=self.lr)
    self.l_out = model['output_layer']

    self.trainT = theano.function([theano_input, theano_output], self.loss,
                                  updates=self.updates)
    self.compute_cost = theano.function([theano_input, theano_output],
                                        self.loss)
    self.forecast = theano.function([theano_input], model_output)
def nn_fn(self):
    l_in_z = InputLayer((None, self.z_dim))
    l_in_x = InputLayer((None, self.max_length, self.emb_dim))

    l_in_z_reshape = ReshapeLayer(l_in_z, ([0], 1, [1]))
    l_in_z_rep = TileLayer(l_in_z_reshape, (1, self.max_length, 1))

    # Shift the input right by one step: pad at the front and drop the last
    # step, so position t only sees words before t.
    l_x_pre_pad = SliceLayer(PadLayer(l_in_x, [(1, 0), (0, 0)], batch_ndim=1),
                             indices=slice(0, -1), axis=1)
    l_in_x_pre_pad_drop = DropoutLayer(l_x_pre_pad, self.nn_word_drop,
                                       shared_axes=(-1,))

    l_concat = ConcatLayer((l_in_z_rep, l_in_x_pre_pad_drop), axis=-1)
    l_h = LSTMLayer(l_concat, num_units=self.nn_hid_units)

    if self.nn_skip:
        l_h = ConcatLayer((l_h, l_in_z_rep), axis=-1)

    l_out = DenseLayer(l_h, num_units=self.emb_dim, num_leading_axes=2,
                       nonlinearity=None)

    return (l_in_z, l_in_x), l_out
def test_lstm_grad(num_units):
    num_batch, seq_len, n_features = 5, 3, 10
    l_inp = InputLayer((num_batch, seq_len, n_features))
    l_lstm = LSTMLayer(l_inp, num_units=num_units)
    output = helper.get_output(l_lstm)
    g = T.grad(T.mean(output), lasagne.layers.get_all_params(l_lstm))
    assert isinstance(g, (list, tuple))
def _build(self, forget_bias=5.0, grad_clip=10.0):
    """Build architecture
    """
    network = InputLayer(shape=(None, self.seq_length, self.input_size),
                         name='input')
    self.input_var = network.input_var

    # Hidden layers
    tanh = lasagne.nonlinearities.tanh
    gate, constant = lasagne.layers.Gate, lasagne.init.Constant
    for _ in range(self.depth):
        network = LSTMLayer(network, self.width, nonlinearity=tanh,
                            grad_clipping=grad_clip,
                            forgetgate=gate(b=constant(forget_bias)))

    # Retain last-output state
    network = SliceLayer(network, -1, 1)

    # Output layer
    sigmoid = lasagne.nonlinearities.sigmoid
    loc_layer = DenseLayer(network, self.num_outputs * 2)
    conf_layer = DenseLayer(network, self.num_outputs,
                            nonlinearity=sigmoid)

    # Grab all layers into DAPs instance
    self.network = get_all_layers([loc_layer, conf_layer])

    # Get theano expression for outputs of DAPs model
    self.loc_var, self.conf_var = get_output([loc_layer, conf_layer],
                                             deterministic=True)
def build(timestep, vocab_size):
    # Input Layer
    l_in = InputLayer(shape=(None, timestep, vocab_size))
    # 2 Hidden LSTM Layers
    l_lstm1 = LSTMLayer(l_in, num_units=10, nonlinearity=rectify)
    l_lstm2 = LSTMLayer(l_lstm1, num_units=10, nonlinearity=rectify,
                        only_return_final=True)
    # Output Layer (W must be an initializer instance, i.e. GlorotNormal(),
    # not the GlorotNormal class itself)
    l_out = DenseLayer(l_lstm2, num_units=vocab_size, W=GlorotNormal(),
                       nonlinearity=softmax)
    return l_out
def get_decoder_1step_net(self, prev_state, emb_token):
    """
    Build the nn that represents one step of decoder application.
    :param prev_state: layer holding a matrix of shape
        (batch_size, HIDDEN_LAYER_DIMENSION), float values
    :param emb_token: layer holding the embedding of the previous token
    :return:
        l_dec: returns the new thought vector, matrix of shape
            (batch_size, HIDDEN_LAYER_DIMENSION)
        l_dist: returns the probability distribution of the next word,
            matrix of shape (batch_size, vocab_size)
    """
    l_dec = LSTMLayer(
        incoming=emb_token,
        num_units=HIDDEN_LAYER_DIMENSION,
        hid_init=prev_state,
        grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh,
        only_return_final=True,
        name="lstm_decoder")

    l_dec_long = ReshapeLayer(l_dec, shape=(-1, HIDDEN_LAYER_DIMENSION))

    l_dist = DenseLayer(
        incoming=l_dec_long,
        num_units=self.vocab_size,
        nonlinearity=lasagne.nonlinearities.softmax,
        name="dense_output_probas")

    return l_dec, l_dist
def build_temporal_model():
    net = {}
    net['input'] = InputLayer((None, 24, 2048))
    net['lstm1'] = LSTMLayer(net['input'], 256)
    net['fc'] = DenseLayer(net['lstm1'], num_units=12, nonlinearity=sigmoid)
    return net
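# Usage sketch for build_temporal_model (input: batches of 24-step, 2048-dim
# feature sequences, as the InputLayer above declares):
import theano
import lasagne

net = build_temporal_model()
probs = lasagne.layers.get_output(net['fc'])
predict_fn = theano.function([net['input'].input_var], probs)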
def lstm_layer(input, nunits, return_final, backwards=False, name='LSTM',
               mask=None):
    # `mask` is an optional mask layer for variable-length sequences.
    ingate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01),
                  b=init.Constant(0.0))
    forgetgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01),
                      b=init.Constant(5.0))
    cell = Gate(W_cell=None, nonlinearity=T.tanh,
                W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01))
    outgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01),
                   b=init.Constant(0.0))

    lstm = LSTMLayer(input, num_units=nunits, backwards=backwards,
                     peepholes=False, ingate=ingate, forgetgate=forgetgate,
                     cell=cell, outgate=outgate, name=name,
                     only_return_final=return_final, mask_input=mask)
    return lstm
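# Sketch of calling lstm_layer with an explicit mask layer (the mask argument
# is added above because the original body referenced it); shapes are
# illustrative assumptions.
from lasagne.layers import InputLayer

l_x = InputLayer((None, None, 10))
l_m = InputLayer((None, None))
l_last = lstm_layer(l_x, 128, return_final=True, mask=l_m)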
def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters,
                gate_parameters, name, use_peepholes=False):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units,
        # LSTMLayer's keyword is `peepholes`, not `use_peepholes`
        peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5.,
        name=name)
    return l_lstm
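# Sketch: stacking two create_lstm layers, peepholes enabled on the second
# only. Hedged: unit counts and names are illustrative assumptions.
from lasagne.layers import InputLayer

l_in0 = InputLayer((None, None, 39))
l_mask0 = InputLayer((None, None))
l1 = create_lstm(l_in0, l_mask0, 250, None, None, 'lstm1')
l2 = create_lstm(l1, l_mask0, 250, None, None, 'lstm2', use_peepholes=True)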
def build_lstm_decoder():
    net = collections.OrderedDict()
    net['sent_input'] = InputLayer((None, CFG['SEQUENCE LENGTH'] - 1),
                                   input_var=T.imatrix())
    net['word_emb'] = EmbeddingLayer(net['sent_input'],
                                     input_size=CFG['VOCAB SIZE'],
                                     output_size=CFG['EMBEDDING SIZE'])
    net['vis_input'] = InputLayer((None, CFG['VIS SIZE']),
                                  input_var=T.matrix())
    net['vis_emb'] = DenseLayer(net['vis_input'],
                                num_units=CFG['EMBEDDING SIZE'],
                                nonlinearity=lasagne.nonlinearities.identity)
    net['vis_emb_reshp'] = ReshapeLayer(net['vis_emb'],
                                        (-1, 1, CFG['EMBEDDING SIZE']))
    net['decoder_input'] = ConcatLayer([net['vis_emb_reshp'],
                                        net['word_emb']])
    net['feat_dropout'] = DropoutLayer(net['decoder_input'], p=0.5)
    net['mask_input'] = InputLayer((None, CFG['SEQUENCE LENGTH']))
    net['lstm'] = LSTMLayer(net['feat_dropout'],
                            num_units=CFG['EMBEDDING SIZE'],
                            mask_input=net['mask_input'], grad_clipping=5.)
    net['lstm_dropout'] = DropoutLayer(net['lstm'], p=0.5)
    net['lstm_reshp'] = ReshapeLayer(net['lstm_dropout'],
                                     (-1, CFG['EMBEDDING SIZE']))
    net['word_prob'] = DenseLayer(net['lstm_reshp'],
                                  num_units=CFG['VOCAB SIZE'] + 2,
                                  nonlinearity=softmax)
    net['sent_prob'] = ReshapeLayer(
        net['word_prob'],
        (-1, CFG['SEQUENCE LENGTH'], CFG['VOCAB SIZE'] + 2))
    return net
def build_rnn(conv_input_var, seq_input_var, conv_shape, word_dims, n_hid,
              lstm_layers):
    ret = {}
    ret['seq_input'] = seq_layer = InputLayer((None, None, word_dims),
                                              input_var=seq_input_var)
    batchsize, seqlen, _ = seq_layer.input_var.shape
    ret['seq_resh'] = seq_layer = ReshapeLayer(seq_layer,
                                               shape=(-1, word_dims))
    ret['seq_proj'] = seq_layer = DenseLayer(seq_layer, num_units=n_hid)
    ret['seq_resh2'] = seq_layer = ReshapeLayer(
        seq_layer, shape=(batchsize, seqlen, n_hid))
    ret['conv_input'] = conv_layer = InputLayer(conv_shape,
                                                input_var=conv_input_var)
    ret['conv_proj'] = conv_layer = DenseLayer(conv_layer, num_units=n_hid)
    ret['conv_resh'] = conv_layer = ReshapeLayer(conv_layer,
                                                 shape=([0], 1, -1))
    ret['input_concat'] = layer = ConcatLayer([conv_layer, seq_layer], axis=1)
    for lstm_layer_idx in xrange(lstm_layers):
        ret['lstm_{}'.format(lstm_layer_idx)] = layer = LSTMLayer(layer, n_hid)
    ret['out_resh'] = layer = ReshapeLayer(layer, shape=(-1, n_hid))
    ret['output_proj'] = layer = DenseLayer(layer, num_units=word_dims,
                                            nonlinearity=log_softmax)
    ret['output'] = layer = ReshapeLayer(
        layer, shape=(batchsize, seqlen + 1, word_dims))
    ret['output'] = layer = SliceLayer(layer, indices=slice(None, -1), axis=1)
    return ret
def __init__(self, vocab):
    ### THEANO GRAPH INPUT ###
    self.input_phrase = T.imatrix("encoder phrase tokens")
    ##########################

    self.l_in = InputLayer((None, None), self.input_phrase,
                           name='context input')
    self.l_mask = InputLayer((None, None),
                             T.neq(self.input_phrase, vocab.PAD_ix),
                             name='context mask')
    self.l_emb = EmbeddingLayer(self.l_in, vocab.n_tokens, Config.EMB_SIZE,
                                name="context embedding")
    self.l_lstm = LSTMLayer(self.l_emb, Config.N_LSTM_UNITS,
                            name='encoder_lstm',
                            grad_clipping=Config.LSTM_LAYER_GRAD_CLIP,
                            mask_input=self.l_mask,
                            only_return_final=True,
                            peepholes=False)
    self.output = self.l_lstm
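# Minimal sketch of compiling this encoder. Hedged: `Encoder` is a
# hypothetical name for the class holding the __init__ above, and `vocab`
# must expose n_tokens and PAD_ix as the constructor assumes.
import theano
import lasagne

encoder = Encoder(vocab)
final_state = lasagne.layers.get_output(encoder.output)
encode_fn = theano.function([encoder.input_phrase], final_state)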
def build_discriminator_lstm(params, gate_params, cell_params):
    from lasagne.layers import InputLayer, DenseLayer, concat
    from lasagne.layers.recurrent import LSTMLayer
    from lasagne.regularization import l2, regularize_layer_params
    # from layers import MinibatchLayer

    # input layers
    l_in = InputLayer(shape=params['input_shape'], name='d_in')
    l_mask = InputLayer(shape=params['mask_shape'], name='d_mask')

    # recurrent layers for bidirectional network
    l_forward = LSTMLayer(
        l_in, params['n_units'], grad_clipping=params['grad_clip'],
        ingate=gate_params, forgetgate=gate_params,
        cell=cell_params, outgate=gate_params,
        nonlinearity=params['non_linearities'][0],
        only_return_final=True, mask_input=l_mask)
    l_backward = LSTMLayer(
        l_in, params['n_units'], grad_clipping=params['grad_clip'],
        ingate=gate_params, forgetgate=gate_params,
        cell=cell_params, outgate=gate_params,
        nonlinearity=params['non_linearities'][1],
        only_return_final=True, mask_input=l_mask, backwards=True)

    # concatenate output of forward and backward layers
    l_concat = concat([l_forward, l_backward], axis=1)

    # minibatch layer on forward and backward layers
    # l_minibatch = MinibatchLayer(l_concat, num_kernels=100)

    # output layer
    l_out = DenseLayer(
        l_concat, num_units=params['n_output_units'],
        nonlinearity=params['non_linearities'][2])

    regularization = regularize_layer_params(
        l_out, l2) * params['regularization']

    class Discriminator:
        def __init__(self, l_in, l_mask, l_out):
            self.l_in = l_in
            self.l_mask = l_mask
            self.l_out = l_out
            self.regularization = regularization

    return Discriminator(l_in, l_mask, l_out)
def rnn_fn(self, max_length):
    l_in = InputLayer((None, max_length, self.vocab_size))
    l_mask = InputLayer((None, max_length))
    l_final = LSTMLayer(l_in, num_units=self.nn_rnn_hid_dim,
                        mask_input=l_mask, only_return_final=True)
    return l_final
def build_convpool_mix(input_vars, nb_classes, grad_clip=110, imsize=32,
                       n_colors=3, n_timewin=7):
    """
    Builds the complete network with LSTM and 1D-conv layers combined.

    :param input_vars: list of EEG images (one image per time window)
    :param nb_classes: number of classes
    :param grad_clip: the gradient messages are clipped to the given value
        during the backward pass.
    :param imsize: size of the input image (assumes a square input)
    :param n_colors: number of color channels in the image
    :param n_timewin: number of time windows in the snippet
    :return: a pointer to the output of last layer
    """
    convnets = []
    w_init = None
    # Build parallel CNNs (one per time window) with shared weights.
    for i in range(n_timewin):
        if i == 0:
            convnet, w_init = build_cnn(input_vars[i], imsize=imsize,
                                        n_colors=n_colors)
        else:
            convnet, _ = build_cnn(input_vars[i], w_init=w_init,
                                   imsize=imsize, n_colors=n_colors)
        convnets.append(FlattenLayer(convnet))
    # At this point convnets shape is [numTimeWin][n_samples, features];
    # we want the shape to be [n_samples, features, numTimeWin].
    convpool = ConcatLayer(convnets)
    convpool = ReshapeLayer(convpool, ([0], n_timewin,
                                       get_output_shape(convnets[0])[1]))
    reformConvpool = DimshuffleLayer(convpool, (0, 2, 1))
    # Input to the 1D conv layer should be
    # (batch_size, num_input_channels, input_length).
    conv_out = Conv1DLayer(reformConvpool, 64, 3)
    conv_out = FlattenLayer(conv_out)
    # Input to LSTM should have the shape (batch size, SEQ_LENGTH, num_features).
    lstm = LSTMLayer(convpool, num_units=128, grad_clipping=grad_clip,
                     nonlinearity=lasagne.nonlinearities.tanh)
    lstm_out = SliceLayer(lstm, -1, 1)
    # Merge 1D-conv and LSTM outputs.
    dense_input = ConcatLayer([conv_out, lstm_out])
    # A fully-connected layer of 512 units with 50% dropout on its inputs:
    convpool = DenseLayer(lasagne.layers.dropout(dense_input, p=.5),
                          num_units=512,
                          nonlinearity=lasagne.nonlinearities.rectify)
    # And, finally, the nb_classes-unit output layer:
    convpool = DenseLayer(convpool, num_units=nb_classes,
                          nonlinearity=lasagne.nonlinearities.softmax)
    return convpool
def __init__(self, number_words, num_hidden, seq_length, mb_size):
    self.mb_size = mb_size
    x = T.imatrix()  # sequence x minibatch x index
    one_hot_input = T.ftensor3()
    use_one_hot_input_flag = T.scalar()

    self.indices = x
    self.use_one_hot_input_flag = use_one_hot_input_flag
    self.one_hot_input = one_hot_input

    # Flag for input: one-hot or index. If index, compute one-hot and use
    # that. If one-hot, just use the one-hot input directly.

    # Time seq x examples x words
    target = T.ivector()

    # word_embeddings = theano.shared(
    #     np.random.normal(size=(number_words, 1, num_hidden)).astype('float32'))
    word_embeddings = theano.shared(
        np.random.normal(size=(number_words, num_hidden)).astype('float32'))

    feature_lst = []
    for i in range(0, seq_length):
        # feature = word_embeddings[x[:, i]]
        # Instead of indexing, multiply by a one-hot matrix:
        #   W: 30k x 400, one_hot: 128 x 30k, one_hot * W: 128 x 400
        one_hot_use = ifelse(use_one_hot_input_flag, one_hot_input[i],
                             T.extra_ops.to_one_hot(x[:, i], number_words))
        feature = T.reshape(T.dot(one_hot_use, word_embeddings),
                            (1, mb_size, num_hidden)).transpose(1, 0, 2)
        feature_lst.append(feature)

    # example x sequence_position x feature
    features = T.concatenate(feature_lst, 1)

    l_lstm_1 = LSTMLayer((seq_length, mb_size, num_hidden),
                         num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.tanh,
                         grad_clipping=100.0)
    l_lstm_2 = LSTMLayer((seq_length, mb_size, num_hidden * 2),
                         num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.tanh,
                         grad_clipping=100.0, backwards=True)
    l_lstm_3 = LSTMLayer((seq_length, mb_size, num_hidden * 2),
                         num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.tanh,
                         grad_clipping=100.0)

    lstm_1_out = l_lstm_1.get_output_for([features])
    lstm_2_out = l_lstm_2.get_output_for(
        [T.concatenate([lstm_1_out, features], axis=2)])
    lstm_3_out = l_lstm_3.get_output_for(
        [T.concatenate([lstm_2_out, features], axis=2)])

    final_out = T.mean(lstm_3_out, axis=1)
    # final_out = T.mean(features, axis=1)

    h_out_1 = DenseLayer((mb_size, num_hidden), num_units=2048,
                         nonlinearity=lasagne.nonlinearities.rectify)
    h_out_2 = DenseLayer((mb_size, 2048), num_units=2048,
                         nonlinearity=lasagne.nonlinearities.rectify)
    h_out_3 = DenseLayer((mb_size, 2048), num_units=1, nonlinearity=None)

    h_out_1_value = h_out_1.get_output_for(final_out)
    h_out_2_value = h_out_2.get_output_for(h_out_1_value)
    h_out_3_value = h_out_3.get_output_for(h_out_2_value)
    classification = T.nnet.sigmoid(h_out_3_value)

    self.loss = T.mean(T.nnet.binary_crossentropy(
        output=classification.flatten(), target=target))
    self.params = (lasagne.layers.get_all_params(h_out_1, trainable=True)
                   + lasagne.layers.get_all_params(h_out_3, trainable=True)
                   + [word_embeddings]
                   + lasagne.layers.get_all_params(l_lstm_1, trainable=True)
                   + lasagne.layers.get_all_params(l_lstm_2, trainable=True))
    self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
    self.params += lasagne.layers.get_all_params(l_lstm_3, trainable=True)

    all_grads = T.grad(self.loss, self.params)
    # Replace NaN gradients with zeros before clipping the global norm.
    for j in range(0, len(all_grads)):
        all_grads[j] = T.switch(T.isnan(all_grads[j]),
                                T.zeros_like(all_grads[j]), all_grads[j])
    scaled_grads = lasagne.updates.total_norm_constraint(all_grads, 5.0)
    updates = lasagne.updates.adam(scaled_grads, self.params)

    self.train_func = theano.function(
        inputs=[x, target, use_one_hot_input_flag, one_hot_input],
        outputs={'l': self.loss, 'c': classification,
                 'g_w': T.sum(T.sqr(T.grad(self.loss, word_embeddings)))},
        updates=updates)
    self.evaluate_func = theano.function(
        inputs=[x, use_one_hot_input_flag, one_hot_input],
        outputs={'c': classification})
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import (
        LSTMLayer, InputLayer, DenseLayer, NonlinearityLayer, ReshapeLayer,
        EmbeddingLayer, RecurrentLayer,
    )
    import theano
    import theano.tensor as T
    import numpy as np

    num_batch, input_seq_len = 1, 12
    num_classes = 5
    target_seq_len = 3
    num_rnn_units = 50

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype="int64")
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # The default blank symbol is the highest class index (5 in this case).
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype("int32")

    y = T.imatrix("phonemes")
    x = T.imatrix()  # batchsize, input_seq_len, features
    print "num_batch =", num_batch, "input_seq_len =", input_seq_len
    print "num_classes =", num_classes

    # Set up the Lasagne recurrent network. The outputs of the network are:
    # a) output_lin_ctc: the activation before softmax,
    #    shape (input_seq_len, batch_size, num_classes + 1)
    # b) output_softmax: the output after softmax,
    #    shape (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    netshape = lasagne.layers.get_output_shape(l_inp)
    print("Layer l_inp shape:")
    print(netshape)
    l_emb = EmbeddingLayer(
        l_inp, input_size=num_classes + 1, output_size=num_classes + 1,
        W=np.identity(num_classes + 1).astype("float32"))
    netshape = lasagne.layers.get_output_shape(l_emb)
    print("Layer l_emb shape:")
    print(netshape)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units)
    netshape = lasagne.layers.get_output_shape(l_rnn)
    print("Layer l_rnn shape:")
    print(netshape)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len,
                                     num_rnn_units))
    netshape = lasagne.layers.get_output_shape(l_rnn_shp)
    print("Layer l_rnn_shp shape:")
    print(netshape)
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank
    netshape = lasagne.layers.get_output_shape(l_out)
    print("Layer l_out shape:")
    print(netshape)
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len,
                                     num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_shp)
    print("Layer l_out_shp shape:")
    print(netshape)

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    # l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))
    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    netshape = lasagne.layers.get_output_shape(l_out_softmax)
    print("Layer l_out_softmax shape:")
    print(netshape)
    l_out_softmax_shp = ReshapeLayer(l_out_softmax, (num_batch, input_seq_len,
                                                     num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_softmax_shp)
    print("Layer l_out_softmax_shp shape:")
    print(netshape)

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn embedding
    print "x type:", type(x)
    print "x shape", x.shape
    print "y type:", type(y)
    print "y shape", y.shape

    ###############
    #  GRADIENTS  #
    ###############

    # The CTC cross entropy between y and the linear output of the network,
    # shape (num_batch, t, num_classes + 1); output_lin_ctc shape (1, 12, 6).
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)
    # Calculate the gradients of the CTC wrt. the linear output of the network.
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params,
                                      learning_rate=sh_lr)

    # x shape (1, 12); y shape (1, 3)
    train = theano.function(
        [x, y], [output_lin_ctc, output_softmax, cost, pseudo_cost],
        updates=updates)

    # Create test dataset.
    num_samples = 10
    np.random.seed(1234)

    # Create a simple dataset of the format
    #   input   [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,...,1,1,1,1]
    #   targets [5,2,3,...,1]
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]
        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]
        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output
    input_arr = np.concatenate([input_lst]).astype("int32")
    y_arr = np.concatenate([output_lst]).astype("int32")
    print "y_arr shape:", y_arr.shape

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype="float32")
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype="float32")

    for nn in range(1000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            print "x =", input_arr[idx]  # x shape (1, 12)
            print "x shape", input_arr[idx].shape
            print "y =", y_arr[idx]  # y shape (1, 3)
            print "y shape", y_arr[idx].shape
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            # testing.assert_almost_equal(pseudo_cost, pseudo_cost_old,
            #                             decimal=4)
            # testing.assert_array_almost_equal(pseudo_cost_val,
            #                                   pseudo_cost_old_val)
        if (nn + 1) % 20 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print "pred =", pred, "true =", true
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import (LSTMLayer, InputLayer, DenseLayer,
                                NonlinearityLayer, ReshapeLayer,
                                EmbeddingLayer, RecurrentLayer)
    import theano
    import theano.tensor as T
    import numpy as np

    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50
    input_seq_len += target_seq_len

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # The default blank symbol is the highest class index (10 in this case).
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()  # batchsize, input_seq_len, features

    # Set up the Lasagne recurrent network. The outputs of the network are:
    # a) output_lin_ctc: the activation before softmax,
    #    shape (input_seq_len, batch_size, num_classes + 1)
    # b) output_softmax: the output after softmax,
    #    shape (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp, input_size=num_classes + 1,
                           output_size=num_classes + 1,
                           W=np.identity(num_classes + 1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units, peepholes=True,
                      W_in_to_ingate=ini, W_hid_to_ingate=ini, b_ingate=zero,
                      W_in_to_forgetgate=ini, W_hid_to_forgetgate=ini,
                      b_forgetgate=zero,
                      W_in_to_cell=ini, W_hid_to_cell=ini, b_cell=zero,
                      W_in_to_outgate=ini, W_hid_to_outgate=ini,
                      b_outgate=zero,
                      cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell,
                      W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len,
                                     num_rnn_units))
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len,
                                     num_classes + 1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    # l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))
    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(l_out_softmax, (num_batch, input_seq_len,
                                                     num_classes + 1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn embedding
    print all_params

    ###############
    #  GRADIENTS  #
    ###############

    # The CTC cross entropy between y and the linear output of the network.
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)
    # Calculate the gradients of the CTC wrt. the linear output of the network.
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    # updates = lasagne.updates.sgd(pseudo_cost_grad, all_params,
    #                               learning_rate=sh_lr)
    # updates = lasagne.updates.apply_nesterov_momentum(updates, all_params,
    #                                                   momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params,
                                      learning_rate=sh_lr)

    train = theano.function(
        [x, y], [output_lin_ctc, output_softmax, cost, pseudo_cost],
        updates=updates)

    # Create test dataset.
    num_samples = 1000
    np.random.seed(1234)

    # Create a simple dataset of the format
    #   input   [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,...,1,1,1,1]
    #   targets [5,2,3,...,1]
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]
        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]
        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output
    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            # testing.assert_almost_equal(pseudo_cost, pseudo_cost_old,
            #                             decimal=4)
            # testing.assert_array_almost_equal(pseudo_cost_val,
            #                                   pseudo_cost_old_val)
        if (nn + 1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print pred, true