def __init__(self, softmax=softmax):
    self.inpv = T.matrix('inpv')
    self.outv = T.imatrix('outv')  # indices
    self.ep = T.matrix('ep')       # reparameterization noise
    self.w = T.scalar('w')         # KL weight (warm-up)
    self.n = self.inpv.shape[0]

    self.enc_m = get_encoder()
    self.enc_s = get_encoder()
    self.dec = get_decoder()

    self.mu = get_output(self.enc_m, self.inpv)
    self.log_s = get_output(self.enc_s, self.inpv)
    self.log_v = 2 * self.log_s
    self.sigma = T.exp(self.log_s)
    self.var = T.exp(self.log_s * 2)
    self.z = self.mu + self.sigma * self.ep
    self.rec_linear = get_output(self.dec, self.z)
    self.rec_reshaped_ln = self.rec_linear.reshape((self.n * d2, 256))
    self.rec_reshaped = softmax(self.rec_reshaped_ln)
    self.out_onehot = T.extra_ops.to_one_hot(
        self.outv.reshape((self.n * d2,)), 256)

    # reconstruction term: categorical cross-entropy between the softmax
    # outputs and the one-hot targets (the original comment claimed squared
    # error, but cc is a cross-entropy)
    self.rec_losses_reshaped = cc(self.rec_reshaped, self.out_onehot)
    self.rec_losses = self.rec_losses_reshaped.reshape((self.n, d2)).sum(1)
    # elementwise KL(q(z|x) || N(0, I))
    self.klss = -0.5 * (1 + self.log_v) + 0.5 * (self.mu ** 2 + self.var)
    self.kls = self.klss.sum(1)
    self.rec_loss = self.rec_losses.mean()
    self.kl = self.kls.mean()
    self.loss = self.rec_loss + self.kl * self.w

    self.params = get_all_params(self.enc_m) + \
        get_all_params(self.enc_s) + \
        get_all_params(self.dec)
    self.updates = lasagne.updates.adam(self.loss, self.params, lr)

    print '\tgetting train func'
    self.train_func = theano.function(
        [self.inpv, self.outv, self.ep, self.w],
        [self.loss, self.rec_loss, self.kl],
        updates=self.updates)

    print '\tgetting other useful funcs'
    self.recon = theano.function(
        [self.inpv, self.ep],
        self.rec_reshaped.argmax(1).reshape((self.n, d2)))
    self.recon_ = theano.function(
        [self.inpv, self.ep],
        self.rec_reshaped.reshape((self.n, d2, 256)))
    self.project = theano.function([self.inpv, self.ep], self.z)
    self.get_mu = theano.function([self.inpv], self.mu)
    self.get_var = theano.function([self.inpv], self.var)
    self.get_klss = theano.function([self.inpv], self.klss)
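# Added sanity check (not from the original source): the elementwise KL term
# above matches the closed-form KL(N(mu, var) || N(0, 1))
#     0.5 * (mu**2 + var - 1 - log(var)),
# since log_v = log(var). A minimal numpy verification:
import numpy as np

mu, log_s = 0.3, -0.2
log_v, var = 2 * log_s, np.exp(2 * log_s)
klss = -0.5 * (1 + log_v) + 0.5 * (mu ** 2 + var)
assert np.isclose(klss, 0.5 * (mu ** 2 + var - 1 - np.log(var)))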
def step(hid_previous):
    tiled_hid_prev = T.tile(
        T.reshape(hid_previous, (-1, 1, 1, self.hid_state_size)),
        (1, C.shape[1], 1, 1))
    g = Ep_Gate(C_reshaped, tiled_hid_prev, tiled_q,
                self.Wb, self.W1, self.W2, self.b1, self.b2)
    g = T.reshape(g, (-1, C.shape[1]))
    g = T.switch(T.eq(input_sentence_mask, 1), g, np.float32(-np.inf))
    g = nonlin.softmax(g)
    e = T.sum(T.reshape(g, (g.shape[0], g.shape[1], 1)) * C, axis=1)

    input_n = e
    hid_input = T.dot(hid_previous, W_hid_stacked)
    input_n = T.dot(input_n, W_in_stacked) + b_stacked

    resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
    updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
    resetgate = self.nonlinearity_resetgate(resetgate)
    updategate = self.nonlinearity_updategate(updategate)

    hid_update_in = slice_w(input_n, 2)
    hid_update_hid = slice_w(hid_input, 2)
    hid_update = hid_update_in + resetgate * hid_update_hid
    hid_update = self.nonlinearity_hid(hid_update)

    # GRU interpolation; the original `updategate + hid_update` was a typo
    hid = (1 - updategate) * hid_previous + updategate * hid_update
    return (hid, g)
def get_output_for(self, inputs, **kwargs):
    s_hat_t = inputs[0]  # sentinel vector
    h_hat_t = inputs[1]  # hidden state
    H = inputs[2]        # visual features

    # z_t: attention logits over the visual features, shape (batch, num_inputs)
    zt = T.dot(
        self.nonlinearity(
            T.dot(H, self.W_v_to_attenGate) +
            T.dot(
                T.dot(h_hat_t, self.W_g_to_attenGate).dimshuffle(0, 1, 'x'),
                T.ones((1, self.num_inputs)))),
        self.W_h_to_attenGate)[:, :, 0]
    # v_t: logit for the sentinel vector
    vt = T.dot(
        self.nonlinearity(
            T.dot(s_hat_t, self.W_s_to_attenGate) +
            T.dot(h_hat_t, self.W_g_to_attenGate)),
        self.W_h_to_attenGate)
    alpha_hat_t = self.nonlinearity_atten(T.concatenate([zt, vt], axis=-1))

    # append the sentinel to the features and take the attention-weighted sum
    feature = T.concatenate(
        [H, s_hat_t.dimshuffle(0, 'x', 1)], axis=1).dimshuffle(2, 0, 1)
    c_hat_t = T.sum(alpha_hat_t * feature, axis=-1)
    out = T.dot((c_hat_t.T + h_hat_t), self.W_p)
    return nonlinearities.softmax(out)
def _create_iter_funcs(self):
    X = T.imatrix('X')
    Y = T.imatrix('Y')
    sx0, sx1 = X.shape  # input shape
    sy0, sy1 = Y.shape  # output shape
    nt = T.iscalar('num tokens')
    inputs = [X, Y, nt]

    output_layer = self.layers_.values()[-1]
    Y_flat = T.reshape(Y, (sy0 * sy1, 1)).flatten()

    # bs x time x num_tokens
    output_train = get_output(output_layer, X, deterministic=False)
    # bs * time x num_tokens
    output_train_flat = T.reshape(output_train[:, :sy1, :], (sx0 * sy1, nt))
    output_train_01 = softmax(output_train_flat)
    probs_train = output_train_01[T.arange(sx0 * sy1), Y_flat]
    loss_train = -T.mean(T.log(probs_train))

    # bs x time x num_tokens
    output_valid = get_output(output_layer, X, deterministic=True)
    # bs * time x num_tokens
    output_valid_flat = T.reshape(output_valid[:, :sy1, :], (sx0 * sy1, nt))
    output_valid_01 = softmax(output_valid_flat)
    probs_valid = output_valid_01[T.arange(sx0 * sy1), Y_flat]
    loss_valid = -T.mean(T.log(probs_valid))

    pred_reshape = T.reshape(output_valid, (sx0 * sx1, nt))
    pred_softmax = softmax(pred_reshape)
    pred_valid = T.reshape(pred_softmax, (sx0, sx1, nt))

    all_params = get_all_params(output_layer)
    updates = self.updater(loss_train, all_params)

    train_iter = theano.function(inputs, loss_train, updates=updates)
    valid_iter = theano.function(inputs, loss_valid)
    predict_iter = theano.function([X, nt], pred_valid)
    return train_iter, valid_iter, predict_iter
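# Added sketch (not from the original source): loss_train above picks each
# row's probability of its true token with advanced indexing, then averages
# the negative log. Numpy equivalent of output[T.arange(N), Y_flat]:
import numpy as np

probs = np.full((3, 5), 0.2)  # a uniform softmax over 5 tokens
y = np.array([0, 3, 4])       # true token ids
nll = -np.log(probs[np.arange(3), y]).mean()
assert np.isclose(nll, -np.log(0.2))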
def step(hid_previous, out_previous, *args):
    input_n = T.concatenate([out_previous, q], axis=1)
    hid_input = T.dot(hid_previous, W_hid_stacked)
    input_n = T.dot(input_n, W_in_stacked) + b_stacked

    resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
    updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
    resetgate = self.nonlinearity_resetgate(resetgate)
    updategate = self.nonlinearity_updategate(updategate)

    hid_update_in = slice_w(input_n, 2)
    hid_update_hid = slice_w(hid_input, 2)
    hid_update = hid_update_in + resetgate * hid_update_hid
    hid_update = self.nonlinearity_hid(hid_update)

    # GRU interpolation; the original `updategate + hid_update` was a typo
    hid = (1 - updategate) * hid_previous + updategate * hid_update
    out = nonlin.softmax(T.dot(hid, self.W))
    return (hid, out)
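# Added sanity check (not from the original source): the corrected GRU update
# is a convex combination, so the new hidden state stays between the previous
# state and the candidate; the original `updategate + hid_update` typo is
# unbounded and loses the gate's interpolation role.
import numpy as np

h_prev = np.array([0.5, -0.5])
h_cand = np.array([1.0, 1.0])
z = np.array([0.25, 0.75])  # update gate activations in (0, 1)
h_new = (1 - z) * h_prev + z * h_cand
assert np.all(h_new >= np.minimum(h_prev, h_cand))
assert np.all(h_new <= np.maximum(h_prev, h_cand))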
def get_output_for(self, inputs, **kwargs):
    input = inputs[0]
    mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    (d1, d2, d3) = input.shape

    # query projection: tanh(input . W1 + b1), masked over time
    # (the original also carried commented-out experiments: a second key
    # projection via W2/b2 and score scaling; omitted here)
    q = T.tensordot(input, self.W1, axes=[[2], [0]])
    q += self.b1.dimshuffle('x', 'x', 0)
    q = tanh(q)
    q *= mask.dimshuffle(0, 1, 'x')

    # pairwise scores (batch, time, time); zero the diagonal so a position
    # does not attend to itself
    out = T.batched_dot(q, q.dimshuffle(0, 2, 1))
    out *= (1 - T.eye(d2, d2))
    matrix = softmax(out.reshape((d1 * d2, d2))).reshape((d1, d2, d2))
    matrix *= mask.dimshuffle(0, 1, 'x')
    matrix *= mask.dimshuffle(0, 'x', 1)
    return matrix
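# Added sketch (not from the original source): the layer above zeroes the
# diagonal of the (batch, time, time) score matrix with (1 - eye) before the
# softmax, so no position attends to itself. Numpy illustration of that step:
import numpy as np

d2 = 4
scores = np.random.randn(2, d2, d2)
scores *= (1 - np.eye(d2))  # remove each position's self-link
assert np.allclose(scores[:, np.arange(d2), np.arange(d2)], 0)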
def get_output_for(self, inputs, **kwargs):
    input = inputs[0]
    original_shape = input.shape
    mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]

    # flatten (batch, time, features) -> (batch * time, features)
    input = input.reshape(
        (input.shape[0] * input.shape[1], input.shape[2]))

    # apply mask
    if mask is not None:
        mask = mask.reshape((mask.shape[0] * mask.shape[1], 1))
        input *= mask

    # compute g(W . ... g(W . g(W . x + b) + b) ... + b) . v
    activation = input
    for W, b in zip(self.W, self.b):
        activation = T.dot(activation, W) + b.dimshuffle('x', 0)
        activation = self.nonlinearity(activation)
    activation = T.dot(activation, self.v)

    # apply softmax, acquiring attention weights for each letter in each tweet
    activation = activation.reshape((original_shape[0], original_shape[1]))
    attention_w = nonlinearities.softmax(activation)
    attention_w = attention_w.reshape(
        (original_shape[0] * original_shape[1], 1))

    # weighted sum of the hidden states according to the attention weights
    context = input * attention_w
    context = context.reshape(original_shape)
    context = T.sum(context, axis=1)
    return context
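# Added sketch (not from the original source): the pooling above reduces a
# (batch, time, features) tensor to (batch, features) via a softmax-weighted
# sum over time. Numpy equivalent:
import numpy as np

h = np.random.randn(2, 5, 8)               # hidden states
scores = np.random.randn(2, 5)             # attention logits
w = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
context = (h * w[:, :, None]).sum(axis=1)  # shape (2, 8)
assert context.shape == (2, 8)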
def safe_softmax(x, eps=1e-6):
    """Prevent any output from becoming exactly 0 or 1."""
    x = softmax(x)
    x = T.maximum(x, eps)
    x = T.minimum(x, 1 - eps)
    return x
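# Added usage sketch (not from the original source), assuming the softmax and
# theano imports used by the snippet above: compile safe_softmax and check the
# clamp. Note that after clamping, rows no longer sum exactly to 1; that is
# the price of keeping log() finite.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
f = theano.function([x], safe_softmax(x))
out = f(np.array([[100., 0., -100.]], dtype=theano.config.floatX))
assert out.min() >= 1e-6 and out.max() <= 1 - 1e-6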
class_lab = T.batched_dot(
    T.reshape(output_before_softmax_lab,
              newshape=(args.batch_size, 2, num_classes)).dimshuffle(0, 2, 1),
    T.ones(shape=(args.batch_size, 2, 1))).dimshuffle(0, 1)
class_gen = T.batched_dot(
    T.reshape(output_before_softmax_gen,
              newshape=(args.batch_size, 2, num_classes)).dimshuffle(0, 2, 1),
    T.ones(shape=(args.batch_size, 2, 1))).dimshuffle(0, 1)

loss_gen_class = T.mean(categorical_crossentropy(
    predictions=softmax(class_gen), targets=labels_gen))
loss_gen_source = T.mean(categorical_crossentropy(
    predictions=softmax(source_gen),
    targets=T.zeros(shape=(args.batch_size,), dtype='int32')))
loss_lab_class = T.mean(categorical_crossentropy(
    predictions=softmax(class_lab), targets=labels))
loss_lab_source = T.mean(categorical_crossentropy(
    predictions=softmax(source_lab),
    targets=T.zeros(shape=(args.batch_size,), dtype='int32'))) + \
    T.mean(categorical_crossentropy(
        predictions=softmax(source_gen),
        targets=T.ones(shape=(args.batch_size,), dtype='int32')))

weight_gen_loss = th.shared(np.float32(0.))

# feature matching: compare mean discriminator features of real and generated
output_lab = ll.get_output(disc_layers[-2], x_lab, deterministic=False)
output_gen = ll.get_output(disc_layers[-2], gen_dat, deterministic=False)
m1 = T.mean(output_lab, axis=0)
m2 = T.mean(output_gen, axis=0)
feature_loss = T.mean(abs(m1 - m2))
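# Added sketch (not from the original source): feature_loss above is the
# feature-matching objective, the mean absolute gap between average
# discriminator features on real and generated batches. Numpy form:
import numpy as np

feat_real = np.random.randn(16, 64)  # stands in for output_lab
feat_gen = np.random.randn(16, 64)   # stands in for output_gen
feature_loss = np.abs(feat_real.mean(axis=0) - feat_gen.mean(axis=0)).mean()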
def get_output(a):
    return nonlin.softmax(T.dot(a, self.W))
def get_output_for(self, input, **kwargs):
    activation = T.dot(input, self.C)
    if self.b is not None:
        activation = activation + self.b.dimshuffle('x', 0)
    return nonlinearities.softmax(activation)
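# Added equivalence sketch (not from the original source): the layer above is
# a dense projection followed by a row-wise softmax. Standalone check with
# hypothetical weights C and bias b:
import numpy as np
import theano
import theano.tensor as T
from lasagne.nonlinearities import softmax

inp = T.matrix('inp')
C = theano.shared(np.random.randn(4, 3).astype(theano.config.floatX))
b = theano.shared(np.zeros(3, dtype=theano.config.floatX))
probs = softmax(T.dot(inp, C) + b.dimshuffle('x', 0))
f = theano.function([inp], probs)
p = f(np.random.randn(2, 4).astype(theano.config.floatX))
assert np.allclose(p.sum(axis=1), 1.0, atol=1e-5)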
def build_model(self, train_set_unlabeled, train_set_labeled, test_set,
                validation_set=None):
    """
    Build the auxiliary deep generative model from the initialized
    hyperparameters. Define the lower bound term and compile it into a
    training function.
    :param train_set_unlabeled: Unlabeled train set containing variables x, t.
    :param train_set_labeled: Labeled train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation functions and dicts of arguments.
    """
    super(CSDGM, self).build_model(train_set_unlabeled, test_set,
                                   validation_set)

    sh_train_x_l = theano.shared(
        np.asarray(train_set_labeled[0], dtype=theano.config.floatX),
        borrow=True)
    sh_train_t_l = theano.shared(
        np.asarray(train_set_labeled[1], dtype=theano.config.floatX),
        borrow=True)
    n = self.sh_train_x.shape[0].astype(
        theano.config.floatX)  # no. of data points
    n_l = sh_train_x_l.shape[0].astype(
        theano.config.floatX)  # no. of labeled data points

    # Define the layers for the density estimation used in the lower bound.
    l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu,
                                       self.l_qa_logvar)
    l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                       self.l_qz_logvar)
    l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)
    l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
    l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu,
                                       self.l_pa_logvar)

    l_x_in = ReshapeLayer(self.l_x_in, (-1, self.n_l * self.n_c))
    l_px = DimshuffleLayer(self.l_px, (0, 3, 1, 2, 4))
    l_px = ReshapeLayer(l_px, (-1, self.sym_samples, 1, self.n_c))
    if self.x_dist == 'bernoulli':
        l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'multinomial':
        l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
        l_log_px = ReshapeLayer(l_log_px, (-1, self.n_l, 1, 1, 1))
        l_log_px = MeanLayer(l_log_px, axis=1)
    elif self.x_dist == 'gaussian':
        l_px_mu = ReshapeLayer(
            DimshuffleLayer(self.l_px_mu, (0, 2, 3, 1, 4)),
            (-1, self.sym_samples, 1, self.n_l * self.n_c))
        l_px_logvar = ReshapeLayer(
            DimshuffleLayer(self.l_px_logvar, (0, 2, 3, 1, 4)),
            (-1, self.sym_samples, 1, self.n_l * self.n_c))
        l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)

    def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
        lb = log_px + log_py + \
            (log_pz + log_pa - log_qa - log_qz) * (1.1 - self.sym_warmup)
        return lb

    # Lower bound for labeled data
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy]
    inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
    out = get_output(out_layers, inputs,
                     batch_norm_update_averages=False,
                     batch_norm_use_averages=False)
    log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out

    # Prior p(y), expecting that all classes are evenly distributed
    py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
    log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l,
                       log_py_l, log_px_zy_l)
    lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
    # Scale the supervised cross entropy with the beta constant
    log_qy_ax_l *= (self.sym_beta * (n / n_l))
    # Collect the lower bound term and mean over sampling dimensions
    lb_l += log_qy_ax_l.mean(axis=(1, 2))

    # Lower bound for unlabeled data
    bs_u = self.sym_x_u.shape[0]
    # For the integrating-out approach, we repeat the input matrix x and
    # construct a (bs * n_y) x n_y target matrix. Example for a 3-class
    # problem and batch_size=2; 2D tensors of the form
    #   x_repeat                            t_repeat
    #   [[x[0,0], x[0,1], ..., x[0,n_x]]    [[1, 0, 0]
    #    [x[1,0], x[1,1], ..., x[1,n_x]]     [1, 0, 0]
    #    [x[0,0], x[0,1], ..., x[0,n_x]]     [0, 1, 0]
    #    [x[1,0], x[1,1], ..., x[1,n_x]]     [0, 1, 0]
    #    [x[0,0], x[0,1], ..., x[0,n_x]]     [0, 0, 1]
    #    [x[1,0], x[1,1], ..., x[1,n_x]]]    [0, 0, 1]]
    t_eye = T.eye(self.n_y, k=0)
    t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(
        bs_u, axis=1).reshape((-1, self.n_y))
    x_u = self.sym_x_u.reshape(
        (1, bs_u, self.n_l, self.n_c)).repeat(self.n_y, axis=0).reshape(
        (-1, self.n_l, self.n_c))

    # Since the expectation over a is outside the integration, we calculate
    # E_q(a|x) first
    a_x_u = get_output(self.l_qa, self.sym_x_u,
                       batch_norm_update_averages=True,
                       batch_norm_use_averages=False)
    a_x_u_rep = a_x_u.reshape(
        (1, bs_u * self.sym_samples, self.n_a)).repeat(
        self.n_y, axis=0).reshape((-1, self.n_a))
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
    inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
    out = get_output(out_layers, inputs,
                     batch_norm_update_averages=False,
                     batch_norm_use_averages=False)
    log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out

    # Prior p(y), expecting that all classes are evenly distributed
    py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
    log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u,
                       log_py_u, log_px_zy_u)
    lb_u = lb_u.reshape((self.n_y, 1, 1, bs_u)).transpose(
        3, 1, 2, 0).mean(axis=(1, 2))
    inputs = {self.l_x_in: self.sym_x_u,
              self.l_a_in: a_x_u.reshape((-1, self.n_a))}
    y_u = get_output(self.l_qy, inputs,
                     batch_norm_update_averages=True,
                     batch_norm_use_averages=False).mean(axis=(1, 2))
    y_u += 1e-8  # Ensure that we get no NaNs when calculating the entropy
    y_u /= T.sum(y_u, axis=1, keepdims=True)
    lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

    # Regularize with weight priors p(theta) = N(0, 1)
    weight_priors = 0.0
    for p in self.trainable_model_params:
        if 'W' not in str(p):
            continue
        weight_priors += log_normal(p, 0, 1).sum()

    # Collect the lower bound and scale it with the weight priors.
    elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
    lb_labeled = -lb_l.mean()
    lb_unlabeled = -lb_u.mean()
    log_px = log_px_zy_l.mean() + log_px_zy_u.mean()
    log_pz = log_pz_l.mean() + log_pz_u.mean()
    log_qz = log_qz_axy_l.mean() + log_qz_axy_u.mean()
    log_pa = log_pa_l.mean() + log_pa_u.mean()
    log_qa = log_qa_x_l.mean() + log_qa_x_u.mean()

    # Collect and clip gradients
    grads_collect = T.grad(elbo, self.trainable_model_params)
    params_collect = self.trainable_model_params
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    clip_grad, max_norm = 1, 5
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1, sym_beta2)

    # Training function
    indices = self._srng.choice(size=[self.sym_bs_l],
                                a=sh_train_x_l.shape[0], replace=False)
    x_batch_l = sh_train_x_l[indices]
    t_batch_l = sh_train_t_l[indices]
    x_batch_u = self.sh_train_x[self.batch_slice]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1,
                                        p=x_batch_u,
                                        dtype=theano.config.floatX)
        x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1,
                                        p=x_batch_l,
                                        dtype=theano.config.floatX)
    givens = {self.sym_x_l: x_batch_l,
              self.sym_x_u: x_batch_u,
              self.sym_t_l: t_batch_l}
    inputs = [self.sym_index, self.sym_batchsize, self.sym_bs_l,
              self.sym_beta, self.sym_lr, sym_beta1, sym_beta2,
              self.sym_samples, self.sym_warmup]
    outputs = [elbo, lb_labeled, lb_unlabeled, log_px, log_pz, log_qz,
               log_pa, log_qa]
    f_train = theano.function(inputs=inputs, outputs=outputs,
                              givens=givens, updates=updates)

    # Default training args. Note that these can be changed during or prior
    # to training.
    self.train_args['inputs']['batchsize_unlabeled'] = 100
    self.train_args['inputs']['batchsize_labeled'] = 100
    self.train_args['inputs']['beta'] = 0.1
    self.train_args['inputs']['learningrate'] = 3e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['inputs']['warmup'] = 0.1
    self.train_args['outputs']['lb'] = '%0.3f'
    self.train_args['outputs']['lb-l'] = '%0.3f'
    self.train_args['outputs']['lb-u'] = '%0.3f'
    self.train_args['outputs']['px'] = '%0.3f'
    self.train_args['outputs']['pz'] = '%0.3f'
    self.train_args['outputs']['qz'] = '%0.3f'
    self.train_args['outputs']['pa'] = '%0.3f'
    self.train_args['outputs']['qa'] = '%0.3f'

    # Validation and test functions
    y = get_output(self.l_qy, self.sym_x_l,
                   deterministic=True).mean(axis=(1, 2))
    class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
    givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
    f_test = theano.function(inputs=[self.sym_samples],
                             outputs=[class_err], givens=givens)

    # Test args. Note that these can be changed during or prior to training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['outputs']['test'] = '%0.2f%%'

    f_validate = None
    if validation_set is not None:
        givens = {self.sym_x_l: self.sh_valid_x,
                  self.sym_t_l: self.sh_valid_t}
        f_validate = theano.function(inputs=[self.sym_samples],
                                     outputs=[class_err], givens=givens)
        # Default validation args. Note that these can be changed during or
        # prior to training.
        self.validate_args['inputs']['samples'] = 1
        self.validate_args['outputs']['validation'] = '%0.2f%%'

    return (f_train, f_test, f_validate, self.train_args, self.test_args,
            self.validate_args)
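# Added sanity check (not from the original source): the uniform class prior
# p(y) = softmax(zeros) used above gives every label log-probability
# log(1/n_y), so -categorical_crossentropy(py, t) is a constant. Numpy:
import numpy as np

n_y = 3
py = np.full((1, n_y), 1.0 / n_y)  # softmax over zeros is uniform
t = np.array([[0.0, 1.0, 0.0]])    # any one-hot target
log_py = (t * np.log(py)).sum()    # equals -categorical_crossentropy
assert np.isclose(log_py, np.log(1.0 / n_y))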
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--num_epochs', type=int, default=1000,
                        help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=10,
                        help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100,
                        help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20,
                        help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0,
                        help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6,
                        help='weight for regularization')
    parser.add_argument('--delta', type=float, default=0.0,
                        help='weight for expectation-linear regularization')
    parser.add_argument('--regular', choices=['none', 'l2'],
                        help='regularization for training', required=True)
    parser.add_argument('--dropout', choices=['std', 'recurrent'],
                        help='dropout pattern')
    parser.add_argument('--schedule', nargs='+', type=int,
                        help='schedule for learning rate decay')
    parser.add_argument('--output_prediction', action='store_true',
                        help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()
    logger = get_logger("Sequence Labeling")
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    num_units = args.num_units
    num_filters = args.num_filters
    regular = args.regular
    grad_clipping = args.grad_clipping
    gamma = args.gamma
    delta = args.delta
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    schedule = args.schedule
    output_predict = args.output_prediction
    dropout = args.dropout
    p = 0.5

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = \
        data_utils.create_alphabets("data/alphabets/",
                                    [train_path, dev_path, test_path], 40000)
    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    num_labels = pos_alphabet.size() - 1

    logger.info("Reading Data")
    data_train = data_utils.read_data(train_path, word_alphabet,
                                      char_alphabet, pos_alphabet,
                                      type_alphabet)
    data_dev = data_utils.read_data(dev_path, word_alphabet, char_alphabet,
                                    pos_alphabet, type_alphabet)
    data_test = data_utils.read_data(test_path, word_alphabet, char_alphabet,
                                     pos_alphabet, type_alphabet)
    num_data = sum([len(bucket) for bucket in data_train])

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    mask_nr_var = T.matrix(name='masks_nr', dtype=theano.config.floatX)
    word_var = T.imatrix(name='inputs')
    char_var = T.itensor3(name='char-inputs')

    network = build_network(word_var, char_var, mask_var, word_alphabet,
                            char_alphabet, dropout, num_units, num_labels,
                            grad_clipping, num_filters, p)
    logger.info("Network structure: hidden=%d, filter=%d, dropout=%s"
                % (num_units, num_filters, dropout))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    num_tokens_nr = mask_nr_var.sum(dtype=theano.config.floatX)

    # get output of bi-lstm-cnn-crf, shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(network)
    energies_train_det = lasagne.layers.get_output(network,
                                                   deterministic=True)
    energies_eval = lasagne.layers.get_output(network, deterministic=True)

    loss_train_org = chain_crf_loss(energies_train, target_var,
                                    mask_var).mean()

    energy_shape = energies_train.shape
    # [batch, length, num_labels, num_labels] --> [batch*length, num_labels*num_labels]
    energies = T.reshape(energies_train,
                         (energy_shape[0] * energy_shape[1],
                          energy_shape[2] * energy_shape[3]))
    energies = nonlinearities.softmax(energies)
    energies_det = T.reshape(energies_train_det,
                             (energy_shape[0] * energy_shape[1],
                              energy_shape[2] * energy_shape[3]))
    energies_det = nonlinearities.softmax(energies_det)
    # [batch*length, num_labels*num_labels] --> [batch, length*num_labels*num_labels]
    energies = T.reshape(energies,
                         (energy_shape[0],
                          energy_shape[1] * energy_shape[2] * energy_shape[3]))
    energies_det = T.reshape(energies_det,
                             (energy_shape[0],
                              energy_shape[1] * energy_shape[2] * energy_shape[3]))

    loss_train_expect_linear = lasagne.objectives.squared_error(energies,
                                                                energies_det)
    loss_train_expect_linear = loss_train_expect_linear.sum(axis=1)
    loss_train_expect_linear = loss_train_expect_linear.mean()

    loss_train = loss_train_org + delta * loss_train_expect_linear
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(
            network, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    _, corr_train = chain_crf_accuracy(energies_train, target_var)
    corr_nr_train = (corr_train * mask_nr_var).sum(dtype=theano.config.floatX)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = chain_crf_accuracy(energies_eval, target_var)
    corr_nr_eval = (corr_eval * mask_nr_var).sum(dtype=theano.config.floatX)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = nesterov_momentum(loss_train, params=params,
                                learning_rate=learning_rate,
                                momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function(
        [word_var, char_var, target_var, mask_var, mask_nr_var],
        [loss_train, loss_train_org, loss_train_expect_linear, corr_train,
         corr_nr_train, num_tokens, num_tokens_nr],
        updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function(
        [word_var, char_var, target_var, mask_var, mask_nr_var],
        [corr_eval, corr_nr_eval, num_tokens, num_tokens_nr, prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: regularization: %s(%f), dropout: %s, delta: %.2f "
        "(#training data: %d, batch size: %d, clip: %.1f)..."
        % (regular, (0.0 if regular == 'none' else gamma), dropout, delta,
           num_data, batch_size, grad_clipping))

    num_batches = num_data / batch_size + 1
    dev_correct = 0.0
    dev_correct_nr = 0.0
    best_epoch = 0
    test_correct = 0.0
    test_correct_nr = 0.0
    test_total = 0
    test_total_nr = 0
    test_inst = 0
    lr = learning_rate
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_err_org = 0.0
        train_err_linear = 0.0
        train_corr = 0.0
        train_corr_nr = 0.0
        train_total = 0
        train_total_nr = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        for batch in xrange(1, num_batches + 1):
            wids, cids, pids, _, _, masks = data_utils.get_batch(data_train,
                                                                 batch_size)
            masks_nr = np.copy(masks)
            masks_nr[:, 0] = 0
            err, err_org, err_linear, corr, corr_nr, num, num_nr = train_fn(
                wids, cids, pids, masks, masks_nr)
            train_err += err * wids.shape[0]
            train_err_org += err_org * wids.shape[0]
            train_err_linear += err_linear * wids.shape[0]
            train_corr += corr
            train_corr_nr += corr_nr
            train_total += num
            train_total_nr += num_nr
            train_inst += wids.shape[0]
            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, acc(no root): %.2f%%, time left (estimated): %.2fs' % (
                batch, num_batches, train_err / train_inst,
                train_err_org / train_inst, train_err_linear / train_inst,
                train_corr * 100 / train_total,
                train_corr_nr * 100 / train_total_nr, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)

        # update training log after each epoch
        assert train_inst == num_batches * batch_size
        assert train_total == train_total_nr + train_inst
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, acc(no root): %.2f%%, time: %.2fs' % (
            train_inst, train_inst, train_err / train_inst,
            train_err_org / train_inst, train_err_linear / train_inst,
            train_corr * 100 / train_total,
            train_corr_nr * 100 / train_total_nr, time.time() - start_time)

        # evaluate performance on dev data
        dev_corr = 0.0
        dev_corr_nr = 0.0
        dev_total = 0
        dev_total_nr = 0
        dev_inst = 0
        for batch in data_utils.iterate_batch(data_dev, batch_size):
            wids, cids, pids, _, _, masks = batch
            masks_nr = np.copy(masks)
            masks_nr[:, 0] = 0
            corr, corr_nr, num, num_nr, predictions = eval_fn(
                wids, cids, pids, masks, masks_nr)
            dev_corr += corr
            dev_corr_nr += corr_nr
            dev_total += num
            dev_total_nr += num_nr
            dev_inst += wids.shape[0]
        assert dev_total == dev_total_nr + dev_inst
        print 'dev corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%%' % (
            dev_corr, dev_total, dev_corr * 100 / dev_total, dev_corr_nr,
            dev_total_nr, dev_corr_nr * 100 / dev_total_nr)

        if dev_correct_nr < dev_corr_nr:
            dev_correct = dev_corr
            dev_correct_nr = dev_corr_nr
            best_epoch = epoch

            # evaluate on test data when better performance detected
            test_corr = 0.0
            test_corr_nr = 0.0
            test_total = 0
            test_total_nr = 0
            test_inst = 0
            for batch in data_utils.iterate_batch(data_test, batch_size):
                wids, cids, pids, _, _, masks = batch
                masks_nr = np.copy(masks)
                masks_nr[:, 0] = 0
                corr, corr_nr, num, num_nr, predictions = eval_fn(
                    wids, cids, pids, masks, masks_nr)
                test_corr += corr
                test_corr_nr += corr_nr
                test_total += num
                test_total_nr += num_nr
                test_inst += wids.shape[0]
            # the original asserted `test_total + test_total_nr + test_inst`,
            # which is always truthy; the equality below matches the dev check
            assert test_total == test_total_nr + test_inst
            test_correct = test_corr
            test_correct_nr = test_corr_nr

        print "best dev corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            dev_correct, dev_total, dev_correct * 100 / dev_total,
            dev_correct_nr, dev_total_nr,
            dev_correct_nr * 100 / dev_total_nr, best_epoch)
        print "best test corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            test_correct, test_total, test_correct * 100 / test_total,
            test_correct_nr, test_total_nr,
            test_correct_nr * 100 / test_total_nr, best_epoch)

        if epoch in schedule:
            lr = lr * decay_rate
            updates = nesterov_momentum(loss_train, params=params,
                                        learning_rate=lr, momentum=momentum)
            train_fn = theano.function(
                [word_var, char_var, target_var, mask_var, mask_nr_var],
                [loss_train, loss_train_org, loss_train_expect_linear,
                 corr_train, corr_nr_train, num_tokens, num_tokens_nr],
                updates=updates)
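# Added sketch (not from the original source): loss_train_expect_linear above
# penalizes the gap between the dropout-on and deterministic softmax outputs,
# i.e. the squared L2 distance per example, averaged over the batch. Numpy:
import numpy as np

energies = np.random.rand(2, 6)      # stands in for the stochastic softmax
energies_det = np.random.rand(2, 6)  # stands in for the deterministic one
loss_expect_linear = ((energies - energies_det) ** 2).sum(axis=1).mean()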