def test_ctc_symmetry_logscale():
    LENGTH = 5000
    BATCHES = 3
    CLASSES = 4
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX)
    Y_hat[:, :, 0] = .3
    Y_hat[:, :, 1] = .2
    Y_hat[:, :, 2] = .4
    Y_hat[:, :, 3] = .1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    # default blank symbol is the highest class index (3 in this case)
    Y = np.repeat(np.array([0, 1, 2, 1, 2, 0, 2, 2, 2]),
                  BATCHES).reshape((9, BATCHES))
    # the masks for this test should be all ones.
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)

    forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y,
                                    y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1],
                                     y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    testing.assert_almost_equal(forward_cost[0], backward_cost[0])
    assert not np.isnan(forward_cost[0])
    assert not np.isnan(backward_cost[0])
    assert not np.isinf(np.abs(forward_cost[0]))
    assert not np.isinf(np.abs(backward_cost[0]))
def test_ctc_exact():
    LENGTH = 4
    BATCHES = 1
    CLASSES = 2
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=False)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX)
    Y_hat[:, :, 0] = .7
    Y_hat[:, :, 1] = .3
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    # default blank symbol is the highest class index (1 in this case)
    Y = np.zeros((2, 1), dtype='int64')
    # the length-4 alignments that collapse to the target "00" are:
    # -0-0
    # 0-0-
    # 0--0
    # 0-00
    # 00-0
    answer = np.log(3 * (.3 * .7)**2 + 2 * .3 * .7**3)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)

    forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y,
                                    y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1],
                                     y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    assert not np.isnan(forward_cost[0])
    assert not np.isnan(backward_cost[0])
    assert not np.isinf(np.abs(forward_cost[0]))
    assert not np.isinf(np.abs(backward_cost[0]))
    testing.assert_almost_equal(-forward_cost[0], answer)
    testing.assert_almost_equal(-backward_cost[0], answer)
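# The closed-form `answer` above can be checked by brute force: enumerate every
# length-4 path over {class 0, blank}, keep the paths that collapse to the
# target "00" under the CTC rule (merge repeats, then drop blanks), and sum
# their probabilities. This is an independent sketch for illustration, not part
# of the ctc_cost API; it assumes blank == 1 and per-frame probabilities
# p(0) = .7, p(blank) = .3 as in the test.
from itertools import product, groupby

def brute_force_ctc_prob(target, probs, blank, length):
    total = 0.0
    for path in product(range(len(probs)), repeat=length):
        # CTC collapse: merge consecutive repeats, then remove blanks
        collapsed = [s for s, _ in groupby(path) if s != blank]
        if collapsed == list(target):
            p = 1.0
            for s in path:
                p *= probs[s]
            total += p
    return total

# np.log(brute_force_ctc_prob([0, 0], [.7, .3], blank=1, length=4))
# should agree with `answer` up to floating point error.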
def setup(self):
    # setup Lasagne recurrent network
    # The output from the network has shape
    # a) output_lin_ctc is the activation before softmax (input_seq_len, batch_size, num_classes + 1)
    # b) output_softmax is the output after softmax (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer(shape=(self.num_batch, self.input_seq_len,
                              self.num_inputs))
    l_mask = InputLayer(shape=(self.num_batch, self.input_seq_len))
    l_emb = EmbeddingLayer(l_inp, input_size=self.num_inputs,
                           output_size=self.num_features)
    l_rnn = LSTMLayer(l_inp, num_units=self.num_units, peepholes=True,
                      mask_input=l_mask)
    l_rnn_shp = ReshapeLayer(l_rnn, shape=(-1, self.num_units))
    l_out = DenseLayer(l_rnn_shp, num_units=self.num_outputs,
                       nonlinearity=identity)
    l_out_shp = ReshapeLayer(l_out, shape=(-1, self.input_seq_len,
                                           self.num_outputs))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(l_out, nonlinearity=softmax)
    l_out_softmax_shp = ReshapeLayer(l_out_softmax,
                                     shape=(-1, self.input_seq_len,
                                            self.num_outputs))

    # calculate grad and cost
    output_lin_ctc = get_output(l_out_shp, {l_inp: self.x, l_mask: self.mask_x})
    output_softmax = get_output(l_out_softmax_shp,
                                {l_inp: self.x, l_mask: self.mask_x})
    all_params = get_all_params(l_out_softmax_shp,
                                trainable=True)  # don't learn the embedding layer

    # the CTC cross entropy between y and the linear output of the network
    pseudo_cost = ctc_cost.pseudo_cost(self.y, output_lin_ctc,
                                       self.mask_y, self.mask_x)
    # calculate the gradients of the CTC wrt. the linear output of the network
    pseudo_grad = T.grad(pseudo_cost.sum() / self.num_batch, all_params)
    true_cost = ctc_cost.cost(self.y, output_softmax, self.mask_y, self.mask_x)
    cost = T.mean(true_cost)

    shared_lr = theano.shared(lasagne.utils.floatX(0.001))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=shared_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_grad, all_params,
                                      learning_rate=shared_lr)

    self.train = theano.function([self.x, self.mask_x, self.y, self.mask_y],
                                 [output_softmax, cost], updates=updates)
    self.test = theano.function([self.x, self.mask_x], [output_softmax])
def finite_diff(Y, Y_hat, Y_mask, Y_hat_mask, eps=1e-2, n_steps=None):
    y_hat = T.tensor3('features')
    y_hat_mask = T.matrix('features_mask')
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)
    get_cost = theano.function([y, y_hat, y_mask, y_hat_mask],
                               ctc_cost_t.sum())
    diff_grad = np.zeros_like(Y_hat)

    for grad, val in islice(izip(np.nditer(diff_grad, op_flags=['readwrite']),
                                 np.nditer(Y_hat, op_flags=['readwrite'])),
                            0, n_steps):
        val += eps
        error_inc = get_cost(Y, Y_hat, Y_mask, Y_hat_mask)
        val -= 2.0 * eps
        error_dec = get_cost(Y, Y_hat, Y_mask, Y_hat_mask)
        grad[...] = .5 * (error_inc - error_dec) / eps
        val += eps

    return diff_grad
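# A minimal sketch of how `finite_diff` above might be used to cross-check the
# symbolic gradient of the CTC cost against central differences. The helper
# name, shapes and tolerance are arbitrary illustrative choices; it assumes the
# same `ctc_cost`, `floatX`, `theano`/`T`, `np` and `testing` imports as the
# tests above, and with n_steps=None every entry of Y_hat is checked.
def check_ctc_grad(length=10, batches=2, classes=3, n_steps=None):
    y_hat = T.tensor3('features')
    y_hat_mask = T.matrix('features_mask')
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)
    # symbolic gradient wrt. the softmax outputs
    get_grad = theano.function([y, y_hat, y_mask, y_hat_mask],
                               T.grad(ctc_cost_t.sum(), y_hat))

    # random per-frame distributions and random labels (blank = classes - 1 excluded)
    Y_hat = np.random.dirichlet([1] * classes,
                                size=(length, batches)).astype(floatX)
    Y = np.random.randint(0, classes - 1, size=(4, batches)).astype('int64')
    Y_hat_mask = np.ones((length, batches), dtype=floatX)
    Y_mask = np.ones((4, batches), dtype=floatX)

    analytic = get_grad(Y, Y_hat, Y_mask, Y_hat_mask)
    numeric = finite_diff(Y, Y_hat, Y_mask, Y_hat_mask, eps=1e-2,
                          n_steps=n_steps)
    testing.assert_array_almost_equal(analytic, numeric, decimal=2)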
    net['conv4b'].b.tag.grad_scale = 2
    net['conv5a'].b.tag.grad_scale = 2
    net['conv5b'].b.tag.grad_scale = 2
    net['fc6-1'].b.tag.grad_scale = 2
    # net['fc7-1'].b.tag.grad_scale = 2
    # net['fc8-1'].b.tag.grad_scale = 2
    net['fc8-1'].W.tag.grad_scale = 10
    net['fc8-1'].b.tag.grad_scale = 20

output_train = lasagne.layers.get_output(net['prob'], deterministic=False)
output_eval = lasagne.layers.get_output(net['prob'], deterministic=True)

# compute the cost for training
output_flat = T.reshape(output_train, (num_batch, clip_length, num_classes))
#cost = T.mean(T.nnet.categorical_crossentropy(output_flat+TOL, sym_y.flatten()))
cost = T.mean(ctc_cost.cost(output_flat + TOL, sym_y))

# maybe it is necessary to add l2_penalty to the cost
regularizable_params = lasagne.layers.get_all_params(net['prob'],
                                                     regularizable=True)
print 'the regularizable_params are:'
for p in regularizable_params:
    print p.name
l2_w = 0.0005
all_layers = lasagne.layers.get_all_layers(net['prob'])
l2_penalty = lasagne.regularization.regularize_layer_params(
    all_layers, lasagne.regularization.l2) * l2_w
cost += l2_penalty

# compute the cost for evaluation
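# The `tag.grad_scale` attributes set above only annotate the parameters; they
# do nothing by themselves. A minimal sketch of how such tags are typically
# consumed, assuming the intent is Caffe-style per-parameter lr_mult: scale
# each parameter's gradient before building the updates. The optimizer and the
# learning rate (0.001) here are arbitrary illustrative choices.
params = lasagne.layers.get_all_params(net['prob'], trainable=True)
grads = theano.grad(cost, params)
scaled_grads = [g * getattr(p.tag, 'grad_scale', 1)
                for p, g in zip(params, grads)]
updates = lasagne.updates.momentum(scaled_grads, params,
                                   learning_rate=0.001, momentum=0.9)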
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import (LSTMLayer, InputLayer, DenseLayer,
                                NonlinearityLayer, ReshapeLayer,
                                EmbeddingLayer, RecurrentLayer)
    import theano
    import theano.tensor as T
    import numpy as np

    num_batch, input_seq_len = 1, 12
    num_classes = 5
    target_seq_len = 3
    num_rnn_units = 50

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype="int64")
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (num_classes here)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype("int32")

    y = T.imatrix("phonemes")
    x = T.imatrix()  # batchsize, input_seq_len, features
    print "num_batch =", num_batch, "input_seq_len =", input_seq_len
    print "num_classes =", num_classes

    # setup Lasagne Recurrent network
    # The output from the network has shape
    # a) output_lin_ctc is the activation before softmax (input_seq_len, batch_size, num_classes + 1)
    # b) output_softmax is the output after softmax (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    netshape = lasagne.layers.get_output_shape(l_inp)
    print("Layer l_inp shape:")
    print(netshape)
    l_emb = EmbeddingLayer(l_inp, input_size=num_classes + 1,
                           output_size=num_classes + 1,
                           W=np.identity(num_classes + 1).astype("float32"))
    netshape = lasagne.layers.get_output_shape(l_emb)
    print("Layer l_emb shape:")
    print(netshape)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units)
    netshape = lasagne.layers.get_output_shape(l_rnn)
    print("Layer l_rnn shape:")
    print(netshape)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    netshape = lasagne.layers.get_output_shape(l_rnn_shp)
    print("Layer l_rnn_shp shape:")
    print(netshape)
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank
    netshape = lasagne.layers.get_output_shape(l_out)
    print("Layer l_out shape:")
    print(netshape)
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_shp)
    print("Layer l_out_shp shape:")
    print(netshape)

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    # l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    netshape = lasagne.layers.get_output_shape(l_out_softmax)
    print("Layer l_out_softmax shape:")
    print(netshape)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_softmax_shp)
    print("Layer l_out_softmax_shp shape:")
    print(netshape)

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print "x type:", type(x)
    print "x shape", x.shape
    print "y type:", type(y)
    print "y shape", y.shape

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and the linear output of the network
    # (num_batch, t, class+1)
    # output_lin_ctc shape (1,12,6)
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)

    # calculate the gradients of the CTC wrt. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params,
                                      learning_rate=sh_lr)

    # x shape (1,12)
    # y shape (1,3)
    train = theano.function([x, y],
                            [output_lin_ctc, output_softmax, cost, pseudo_cost],
                            updates=updates)

    # Create test dataset
    num_samples = 10
    np.random.seed(1234)

    # create simple dataset of format
    # input   [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]
        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]
        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output
    input_arr = np.concatenate([input_lst]).astype("int32")
    y_arr = np.concatenate([output_lst]).astype("int32")
    print "y_arr shape:", y_arr.shape

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype="float32")
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype="float32")

    for nn in range(1000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            print "x=", input_arr[idx]  # x shape (1,12)
            print "x shape", input_arr[idx].shape
            print "y=", y_arr[idx]  # y shape (1,3)
            print "y shape", y_arr[idx].shape
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            # testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            # testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)
        if (nn + 1) % 20 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print "pred =", pred, "true =", true
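# `print_pred` in the test above is a greedy CTC decoder: take the per-frame
# argmax, merge consecutive repeats, then drop the blank symbol (here blank ==
# num_classes == 5). A tiny standalone illustration of that collapse rule; the
# frame sequence below is made up purely for illustration:
from itertools import groupby

frames = [2, 2, 5, 2, 5, 3, 3, 5, 5, 1, 1, 1]
blank = 5
# merge repeats with groupby, then drop blanks
decoded = "".join(str(s) for s, _ in groupby(frames) if s != blank)
# decoded == "2231": a repeated target symbol survives only when a blank
# separates its two runs, which is why the toy data generator in the test
# inserts the blank (num_classes) after every target class.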
# dims_bidir=conf.dims_transition, dims_top=[num_classes],
# bidir_trans=GatedRecurrent, bottom_activation=None)

# ******************* output *******************
y_hat = recognizer.apply(x, x_m)
y_hat.name = 'outputs'
y_hat_softmax = NDimensionalSoftmax().apply(y_hat, extra_ndim=y_hat.ndim - 2)
y_hat_softmax.name = 'outputs_softmax'

# There are two cost functions, one for training and one for monitoring:
# the training cost is more stable to compute gradients through and seems to
# be more memory efficient, but it does not compute the true cost.
if conf.task == 'CTC':
    cost_train = ctc.pseudo_cost(y, y_hat, y_m, x_m).mean()
    cost_train.name = "cost_train"
    cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean()
    cost_monitor.name = "cost_monitor"
elif conf.task == 'framewise':
    cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m)
    cost_train.name = 'cost'
    cost_monitor = cost_train
else:
    raise ValueError(conf.task)

recognizer.initialize()
cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m])
weights = VariableFilter(roles=[WEIGHT])(cg.variables)
cg = apply_noise(cg, weights, conf.weight_noise)

#************* training algorithm *************
def build_model(self, Dir_features, args):
    self._set_model_param(Dir_features)

    # try to scale the gradients at the level of individual parameters, like
    # caffe does; for now this only changes the sgd code path
    scale_grad = True
    scale_l2_w = False
    TOL = 1e-5
    sym_y = T.imatrix()

    # W is regularizable, b is not regularizable (correspondence with caffe)
    if scale_grad:
        self.net['conv1a'].b.tag.grad_scale = 2
        self.net['conv2a'].b.tag.grad_scale = 2
        self.net['conv3a'].b.tag.grad_scale = 2
        self.net['conv3b'].b.tag.grad_scale = 2
        self.net['conv4a'].b.tag.grad_scale = 2
        self.net['conv4b'].b.tag.grad_scale = 2
        self.net['conv5a'].b.tag.grad_scale = 2
        self.net['conv5b'].b.tag.grad_scale = 2
        self.net['fc6-1'].b.tag.grad_scale = 2
        self.net['fc8-1'].W.tag.grad_scale = 10
        self.net['fc8-1'].b.tag.grad_scale = 20

    output_train = lasagne.layers.get_output(self.net['prob'],
                                             deterministic=False)
    output_eval = lasagne.layers.get_output(self.net['prob'],
                                            deterministic=True)

    ##############
    # compute cost
    ##############
    # compute the cost for training
    output_flat = T.reshape(output_train, (self.batch_size, self.clip_length,
                                           self.num_classes))
    cost = T.mean(ctc_cost.cost(output_flat + TOL, sym_y))

    # maybe it is necessary to add l2_penalty to the cost
    regularizable_params = lasagne.layers.get_all_params(self.net['prob'],
                                                         regularizable=True)
    l2_w = 0.0005
    all_layers = lasagne.layers.get_all_layers(self.net['prob'])
    l2_penalty = lasagne.regularization.regularize_layer_params(
        all_layers, lasagne.regularization.l2) * l2_w
    cost += l2_penalty

    # compute the cost for evaluation
    output_eval_flat = T.reshape(output_eval, (self.num_batch_eval,
                                               self.clip_length,
                                               self.num_classes))
    cost_eval = T.mean(ctc_cost.cost(output_eval_flat + TOL, sym_y))

    trainable_params = lasagne.layers.get_all_params(self.net['prob'],
                                                     trainable=True)
    sh_lr = theano.shared(lasagne.utils.floatX(args.lr))

    ##################################################################
    # try to scale the gradients at the level of individual parameters,
    # like caffe does; for now this only changes the sgd code path
    ##################################################################
    if scale_grad:
        grads = theano.grad(cost, trainable_params)
        for idx, param in enumerate(trainable_params):
            grad_scale = getattr(param.tag, 'grad_scale', 1)
            if grad_scale != 1:
                grads[idx] *= grad_scale

    #################
    # compute updates
    #################
    # adam works with lr 0.001
    if args.optimizer == 'rmsprop':
        updates_opt = lasagne.updates.rmsprop(cost, trainable_params,
                                              learning_rate=sh_lr)
        updates = lasagne.updates.apply_momentum(updates_opt, trainable_params,
                                                 momentum=0.9)
    elif args.optimizer == 'adam':
        updates_opt = lasagne.updates.adam(cost, trainable_params,
                                           learning_rate=sh_lr)
        updates = lasagne.updates.apply_momentum(updates_opt, trainable_params,
                                                 momentum=0.9)
    elif args.optimizer == 'sgd':
        # Stochastic Gradient Descent (SGD) with momentum
        if scale_grad:
            updates = lasagne.updates.momentum(grads, trainable_params,
                                               learning_rate=sh_lr,
                                               momentum=0.9)
        else:
            updates = lasagne.updates.momentum(cost, trainable_params,
                                               learning_rate=sh_lr,
                                               momentum=0.9)
    elif args.optimizer == 'adadelta':
        updates_opt = lasagne.updates.adadelta(cost, trainable_params,
                                               learning_rate=sh_lr)
        updates = lasagne.updates.apply_momentum(updates_opt, trainable_params,
                                                 momentum=0.9)
    elif args.optimizer == 'adagrad':
        updates_opt = lasagne.updates.adagrad(cost, trainable_params,
                                              learning_rate=sh_lr)
        updates = lasagne.updates.apply_momentum(updates_opt, trainable_params,
                                                 momentum=0.9)

    #############################
    # set train and eval function
    #############################
    f_train = theano.function(
        [self.net['input'].input_var, sym_y, self.net['mask'].input_var],
        [cost, output_train], updates=updates)
    f_eval = theano.function(
        [self.net['input'].input_var, sym_y, self.net['mask'].input_var],
        [cost_eval, output_eval])

    return f_train, f_eval
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import LSTMLayer, InputLayer, DenseLayer,\
        NonlinearityLayer, ReshapeLayer, EmbeddingLayer, RecurrentLayer
    import theano
    import theano.tensor as T
    import numpy as np

    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50
    input_seq_len += target_seq_len

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (num_classes here)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()  # batchsize, input_seq_len, features

    # setup Lasagne Recurrent network
    # The output from the network has shape
    # a) output_lin_ctc is the activation before softmax (input_seq_len, batch_size, num_classes + 1)
    # b) output_softmax is the output after softmax (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp, input_size=num_classes + 1,
                           output_size=num_classes + 1,
                           W=np.identity(num_classes + 1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units, peepholes=True,
                      W_in_to_ingate=ini, W_hid_to_ingate=ini, b_ingate=zero,
                      W_in_to_forgetgate=ini, W_hid_to_forgetgate=ini,
                      b_forgetgate=zero, W_in_to_cell=ini, W_hid_to_cell=ini,
                      b_cell=zero, W_in_to_outgate=ini, W_hid_to_outgate=ini,
                      b_outgate=zero, cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell, W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes + 1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes + 1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print all_params

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and the linear output of the network
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)

    # calculate the gradients of the CTC wrt. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params,
                                      learning_rate=sh_lr)

    train = theano.function([x, y],
                            [output_lin_ctc, output_softmax, cost, pseudo_cost],
                            updates=updates)

    # Create test dataset
    num_samples = 1000
    np.random.seed(1234)

    # create simple dataset of format
    # input   [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]
        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]
        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output
    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)
        if (nn + 1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print pred, true
all_params = L.get_all_params(l_rnn_2, trainable=True)

# ## Costs, Gradients & Training Functions

# Cost functions
target_values = T.imatrix('target_output')
input_values = T.imatrix()

### Gradients ###
# pseudo cost - the CTC cross entropy between the targets and the linear
# output of the network - used for training
pseudo_cost = ctc_cost.pseudo_cost(target_values, output_lin_ctc)
pseudo_cost_grad = T.grad(pseudo_cost.sum() / batchsize, all_params)
pseudo_cost = pseudo_cost.mean()

# true cost
cost = ctc_cost.cost(target_values, network_output)
cost = cost.mean()

# Compute SGD updates for training
print("Computing updates ...")
updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, LEARNING_RATE)

# Theano functions for training and computing cost
print("Compiling functions ...")
train = theano.function([l_in.input_var, target_values],
                        [cost, pseudo_cost, network_output], updates=updates)
validate = theano.function([l_in.input_var, target_values],
                           [cost, network_output])
predict = theano.function([l_in.input_var], network_output)

# ## Network Training
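# The "Network Training" section itself is not included above. A minimal
# sketch of a loop that could drive the `train` / `validate` functions; the
# arrays X_train, y_train, X_val, y_val and the constant NUM_EPOCHS are
# placeholders assumed for illustration (not names defined in this snippet),
# and numpy is assumed imported as np.
for epoch in range(NUM_EPOCHS):
    train_costs = []
    for start in range(0, len(X_train), batchsize):
        x_batch = X_train[start:start + batchsize]
        y_batch = y_train[start:start + batchsize]
        # train returns [cost, pseudo_cost, network_output]
        batch_cost, batch_pseudo_cost, _ = train(x_batch, y_batch)
        train_costs.append(batch_cost)
    # validate returns [cost, network_output]
    val_cost, _ = validate(X_val, y_val)
    print("epoch %d  train cost %.4f  val cost %.4f"
          % (epoch, np.mean(train_costs), val_cost))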