def model(x, embedding_size, n_hidden):
    # Hidden and input weights
    U = shared_glorot_uniform((embedding_size, n_hidden), name="U")
    W = shared_glorot_uniform((n_hidden, n_hidden), name="W")
    bh = shared_zeros((n_hidden,), name="bh")

    # Output weights
    V = shared_glorot_uniform((n_hidden, embedding_size), name="V")
    by = shared_zeros((embedding_size,), name="by")

    params = [U, V, W, by, bh]

    def step(x_t, h_tm1):
        h_t = T.tanh(U[x_t] + T.dot(h_tm1, W) + bh)
        y_t = T.dot(h_t, V) + by
        return h_t, y_t

    h0 = shared_zeros((n_hidden,), name='h0')
    [h, y_pred], _ = theano.scan(step, sequences=x, outputs_info=[h0, None],
                                 truncate_gradient=10)

    model = T.nnet.softmax(y_pred)
    return model, params
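A minimal sketch of how this simple recurrent model might be compiled, assuming the shared_glorot_uniform and shared_zeros helpers above are importable and standard Theano is installed (the vocabulary and hidden sizes here are illustrative):

import theano
import theano.tensor as T

# x is a vector of symbol indices; the model emits a softmax
# distribution over the vocabulary at every timestep.
x = T.ivector('x')
y_out, params = model(x, embedding_size=100, n_hidden=50)

# Compile a function mapping an index sequence to the
# per-timestep output distributions.
predict = theano.function(inputs=[x], outputs=y_out)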
def model(inputs, _is_training, params, batch_size, hidden_size, drop_i,
          drop_s, init_scale, init_H_bias, tied_noise, _theano_rng):
    # Dropout noise for the inputs of the four LSTM gates.
    noise_i_for_i = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng)
    noise_i_for_f = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng) if not tied_noise else noise_i_for_i
    noise_i_for_c = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng) if not tied_noise else noise_i_for_i
    noise_i_for_o = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng) if not tied_noise else noise_i_for_i

    i_for_i = ifelse(_is_training, inputs * noise_i_for_i, inputs)
    i_for_f = ifelse(_is_training, inputs * noise_i_for_f, inputs)
    i_for_c = ifelse(_is_training, inputs * noise_i_for_c, inputs)
    i_for_o = ifelse(_is_training, inputs * noise_i_for_o, inputs)

    i_for_i = linear.model(i_for_i, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)
    i_for_f = linear.model(i_for_f, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)
    i_for_c = linear.model(i_for_c, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)
    i_for_o = linear.model(i_for_o, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)

    # Dropout noise for the recurrent hidden state.
    noise_s = get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng)
    if not tied_noise:
        noise_s = T.stack(noise_s,
                          get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng),
                          get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng),
                          get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng))

    def step(i_for_i_t, i_for_f_t, i_for_c_t, i_for_o_t, y_tm1, c_tm1, noise_s):
        noise_s_for_i = noise_s if tied_noise else noise_s[0]
        noise_s_for_f = noise_s if tied_noise else noise_s[1]
        noise_s_for_c = noise_s if tied_noise else noise_s[2]
        noise_s_for_o = noise_s if tied_noise else noise_s[3]

        s_lm1_for_i = ifelse(_is_training, y_tm1 * noise_s_for_i, y_tm1)
        s_lm1_for_f = ifelse(_is_training, y_tm1 * noise_s_for_f, y_tm1)
        s_lm1_for_c = ifelse(_is_training, y_tm1 * noise_s_for_c, y_tm1)
        s_lm1_for_o = ifelse(_is_training, y_tm1 * noise_s_for_o, y_tm1)

        # Input, forget, cell, and output gates.
        i_t = T.nnet.sigmoid(i_for_i_t + linear.model(s_lm1_for_i, params, hidden_size, hidden_size, init_scale))
        f_t = T.nnet.sigmoid(i_for_f_t + linear.model(s_lm1_for_f, params, hidden_size, hidden_size, init_scale))
        c_t = f_t * c_tm1 + i_t * T.tanh(i_for_c_t + linear.model(s_lm1_for_c, params, hidden_size, hidden_size, init_scale))
        o_t = T.nnet.sigmoid(i_for_o_t + linear.model(s_lm1_for_o, params, hidden_size, hidden_size, init_scale))
        return o_t * T.tanh(c_t), c_t

    y_0 = shared_zeros((batch_size, hidden_size), name='h0')
    c_0 = shared_zeros((batch_size, hidden_size), name='c0')
    [y, c], _ = theano.scan(step, sequences=[i_for_i, i_for_f, i_for_c, i_for_o],
                            outputs_info=[y_0, c_0], non_sequences=[noise_s])

    y_last = y[-1]
    sticky_state_updates = [(y_0, y_last)]

    return y, y_0, sticky_state_updates
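The get_dropout_noise helper used throughout these listings is not shown in this excerpt. A plausible implementation, following the usual variational-dropout convention of rescaling a Bernoulli mask by the keep probability (this sketch is an assumption, not the original helper):

import theano

def get_dropout_noise(shape, dropout_p, rng):
    # Hypothetical helper: draws a Bernoulli mask with keep
    # probability (1 - dropout_p), rescaled so that
    # E[input * mask] equals the input.
    keep_p = 1.0 - dropout_p
    mask = rng.binomial(size=shape, p=keep_p, dtype=theano.config.floatX)
    return mask / keep_p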
def get_rnn_params(number, n_visible, n_hidden_recurrent):
    w_in_update = shared_normal('w_in_update_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
    w_hidden_update = shared_normal('w_hidden_update_%d' % number, n_hidden_recurrent, n_hidden_recurrent, scale=0.0001)
    b_update = shared_zeros('b_update_%d' % number, n_hidden_recurrent)

    w_in_reset = shared_normal('w_in_reset_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
    w_hidden_reset = shared_normal('w_hidden_reset_%d' % number, n_hidden_recurrent, n_hidden_recurrent, scale=0.0001)
    b_reset = shared_zeros('b_reset_%d' % number, n_hidden_recurrent)

    w_in_hidden = shared_normal('w_in_hidden_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
    w_reset_hidden = shared_normal('w_reset_hidden_%d' % number, n_hidden_recurrent, n_hidden_recurrent, scale=0.0001)
    b_hidden = shared_zeros('b_hidden_%d' % number, n_hidden_recurrent)

    return [w_in_update, w_hidden_update, b_update,
            w_in_reset, w_hidden_reset, b_reset,
            w_in_hidden, w_reset_hidden, b_hidden]
def __init__(self, n_visible, n_hidden=150, n_hidden_recurrent=100,
             lr=0.001, l2_norm=None, l1_norm=None):
    (v, v_sample, cost, monitor, params, updates_train, v_t,
     updates_generate, n_steps) = build_rnnrbm(n_visible, n_hidden,
                                               n_hidden_recurrent, lr,
                                               l2_norm=l2_norm, l1_norm=l1_norm)

    for param in params:
        gradient = T.grad(cost, param, consider_constant=[v_sample])

        # Replace nan and inf gradient values
        not_finite = T.or_(T.isnan(gradient), T.isinf(gradient))
        gradient = T.switch(not_finite, 0.1 * param, gradient)

        # Gradient clipping (disabled)
        # max_grad = param * 1e-3
        # gradient = T.switch(T.gt(gradient, max_grad), max_grad, gradient)

        # Momentum (disabled)
        # velocity = shared_zeros('velocity_' + str(param.name), param.get_value(borrow=True).shape)
        # update = param - T.cast(lr, dtype=dtype) * gradient
        # x = momentum * velocity + update - param
        # updates_train[velocity] = x
        # updates_train[param] = momentum * x + update

        # RMSProp
        accu = shared_zeros('accu_' + str(param.name), param.get_value(borrow=True).shape)
        accu_new = 0.9 * accu + 0.1 * gradient ** 2
        updates_train[accu] = accu_new
        updates_train[param] = param - (lr * gradient / T.sqrt(accu_new + 1e-6))

    self.params = params
    self.train_function = theano.function([v], monitor, updates=updates_train)
    self.generate_function = theano.function([n_steps], v_t, updates=updates_generate)
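A hedged example of driving this wrapper, assuming the enclosing class is named RnnRbm (the class name is not shown in this excerpt) and that dataset is a list of float32 matrices of shape (timesteps, n_visible):

import numpy as np

# Hypothetical training loop; n_visible=88 would match a piano-roll
# encoding, but any visible dimension works.
rnnrbm = RnnRbm(n_visible=88)
for epoch in range(200):
    costs = [rnnrbm.train_function(sequence) for sequence in dataset]
    print('epoch %d, monitor %.4f' % (epoch, np.mean(costs)))

# Sample a new 50-step sequence from the trained model.
generated = rnnrbm.generate_function(50)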
def model(inputs, _is_training, params, batch_size, hidden_size,
          drop_i, drop_s, init_scale, init_H_bias, _theano_rng):
    # Dropout noise for the inputs.
    noise_i_for_H = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng)
    i_for_H = ifelse(_is_training, inputs * noise_i_for_H, inputs)
    i_for_H = linear.model(i_for_H, params, hidden_size, hidden_size,
                           init_scale, bias_init=init_H_bias)

    # Dropout noise for the recurrent hidden state.
    noise_s = get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng)

    def step(i_for_H_t, y_tm1, noise_s):
        s_lm1_for_H = ifelse(_is_training, y_tm1 * noise_s, y_tm1)
        return T.tanh(i_for_H_t + linear.model(s_lm1_for_H, params,
                                               hidden_size, hidden_size, init_scale))

    y_0 = shared_zeros((batch_size, hidden_size), name='h0')
    y, _ = theano.scan(step, sequences=i_for_H, outputs_info=[y_0],
                       non_sequences=[noise_s])

    y_last = y[-1]
    sticky_state_updates = [(y_0, y_last)]

    return y, y_0, sticky_state_updates
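The sticky_state_updates returned here implement a persistent hidden state: the final state of one minibatch seeds the next. The pattern can be shown in isolation with a toy Theano snippet (this is an illustration, not part of the model above):

import numpy as np
import theano
import theano.tensor as T

# A shared variable holds the running state; passing the update
# pair to theano.function makes the state persist across calls.
h0 = theano.shared(np.zeros((2, 3), dtype=theano.config.floatX), name='h0')
x = T.matrix('x')
h_new = T.tanh(x + h0)
step = theano.function([x], h_new, updates=[(h0, h_new)])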
def model(x, embedding_size, n_hidden):
    # Update gate weights
    W_xz = shared_glorot_uniform((embedding_size, n_hidden))
    W_hz = shared_glorot_uniform((n_hidden, n_hidden))
    b_z = shared_zeros((n_hidden,))

    # Reset gate weights
    W_xr = shared_glorot_uniform((embedding_size, n_hidden))
    W_hr = shared_glorot_uniform((n_hidden, n_hidden))
    b_r = shared_zeros((n_hidden,))

    # Hidden layer weights
    W_xh = shared_glorot_uniform((embedding_size, n_hidden))
    W_hh = shared_glorot_uniform((n_hidden, n_hidden))
    b_h = shared_zeros((n_hidden,))

    # Output weights
    W_y = shared_glorot_uniform((n_hidden, embedding_size), name="W_y")
    b_y = shared_zeros((embedding_size,), name="b_y")

    params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_y, b_y]

    def step(x_t, h_tm1):
        z_t = T.nnet.sigmoid(W_xz[x_t] + T.dot(W_hz, h_tm1) + b_z)
        r_t = T.nnet.sigmoid(W_xr[x_t] + T.dot(W_hr, h_tm1) + b_r)
        can_h_t = T.tanh(W_xh[x_t] + r_t * T.dot(W_hh, h_tm1) + b_h)
        h_t = (1 - z_t) * h_tm1 + z_t * can_h_t
        y_t = T.dot(h_t, W_y) + b_y
        return h_t, y_t

    h0 = shared_zeros((n_hidden,), name='h0')
    [h, y_pred], _ = theano.scan(step, sequences=x, outputs_info=[h0, None],
                                 truncate_gradient=10)

    model = T.nnet.softmax(y_pred)
    return model, params
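For reference, step implements the standard GRU update, where $z_t$ is the update gate, $r_t$ the reset gate, and $\tilde{h}_t$ the candidate state:

\begin{aligned}
z_t &= \sigma(W_{xz} x_t + W_{hz} h_{t-1} + b_z) \\
r_t &= \sigma(W_{xr} x_t + W_{hr} h_{t-1} + b_r) \\
\tilde{h}_t &= \tanh(W_{xh} x_t + r_t \odot W_{hh} h_{t-1} + b_h) \\
h_t &= (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t
\end{aligned}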
def build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent, lr,
                 l2_norm=None, l1_norm=None):
    # RBM parameters
    W = shared_normal('W', n_visible, n_hidden, scale=0.01)
    bv = shared_zeros('bv', n_visible)
    bh = shared_zeros('bh', n_hidden)

    # RNN -> RBM connections
    Wuh = shared_normal('Wuh', n_hidden_recurrent, n_hidden, scale=0.0001)
    Wuv = shared_normal('Wuv', n_hidden_recurrent, n_visible, scale=0.0001)

    params = [W, bv, bh, Wuh, Wuv]

    def get_rnn_params(number, n_visible, n_hidden_recurrent):
        w_in_update = shared_normal('w_in_update_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
        w_hidden_update = shared_normal('w_hidden_update_%d' % number, n_hidden_recurrent, n_hidden_recurrent, scale=0.0001)
        b_update = shared_zeros('b_update_%d' % number, n_hidden_recurrent)

        w_in_reset = shared_normal('w_in_reset_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
        w_hidden_reset = shared_normal('w_hidden_reset_%d' % number, n_hidden_recurrent, n_hidden_recurrent, scale=0.0001)
        b_reset = shared_zeros('b_reset_%d' % number, n_hidden_recurrent)

        w_in_hidden = shared_normal('w_in_hidden_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
        w_reset_hidden = shared_normal('w_reset_hidden_%d' % number, n_hidden_recurrent, n_hidden_recurrent, scale=0.0001)
        b_hidden = shared_zeros('b_hidden_%d' % number, n_hidden_recurrent)

        return [w_in_update, w_hidden_update, b_update,
                w_in_reset, w_hidden_reset, b_reset,
                w_in_hidden, w_reset_hidden, b_hidden]

    def build_rnn(params, v_t, u_tm1):
        w_in_update, w_hidden_update, b_update, \
            w_in_reset, w_hidden_reset, b_reset, \
            w_in_hidden, w_reset_hidden, b_hidden = params

        update_gate = T.nnet.sigmoid(T.dot(v_t, w_in_update) + T.dot(u_tm1, w_hidden_update) + b_update)
        reset_gate = T.nnet.sigmoid(T.dot(v_t, w_in_reset) + T.dot(u_tm1, w_hidden_reset) + b_reset)
        u_t_temp = T.tanh(T.dot(v_t, w_in_hidden) + T.dot(u_tm1 * reset_gate, w_reset_hidden) + b_hidden)
        u_t = (1 - update_gate) * u_t_temp + update_gate * u_tm1
        return u_t

    rnn_params_1 = get_rnn_params(1, n_visible, n_hidden_recurrent)
    rnn_params_2 = get_rnn_params(2, n_hidden_recurrent, n_hidden_recurrent)
    rnn_params_3 = get_rnn_params(3, n_hidden_recurrent, n_hidden_recurrent)
    params += rnn_params_1 + rnn_params_2 + rnn_params_3

    def build_rbm(v, W, bv, bh, k):
        def gibbs_step(v, binomial=False):
            mean_h = sigm(T.dot(v, W) + bh)
            h = rng.binomial(size=mean_h.shape, n=1, p=mean_h, dtype=dtype)
            mean_v = sigm(T.dot(h, W.T) + bv)
            v = rng.binomial(size=mean_v.shape, n=1, p=mean_v,
                             dtype=theano.config.floatX) if binomial else mean_v
            return mean_v, v

        # Run a k-step Gibbs chain starting from v.
        chain, updates = theano.scan(lambda v: gibbs_step(v)[1],
                                     outputs_info=[v], n_steps=k)
        v_sample = chain[-1]

        mean_v = gibbs_step(v_sample)[0]
        monitor = T.xlogx.xlogy0(v, mean_v) + T.xlogx.xlogy0(1 - v, 1 - mean_v)
        monitor = monitor.sum() / v.shape[0]

        def free_energy(v):
            return -(v * bv).sum() - T.log(1 + T.exp(T.dot(v, W) + bh)).sum()
        cost = (free_energy(v) - free_energy(v_sample)) / v.shape[0]

        return v_sample, cost, monitor, updates

    def recurrence(v_t, u1_tm1, u2_tm1):
        bv_t = bv + T.dot(u2_tm1, Wuv)
        bh_t = bh + T.dot(u2_tm1, Wuh)
        generate = v_t is None
        # In generation mode, sample the visible units from an RBM
        # conditioned on the current dynamic biases.
        if generate:
            v_t, _, _, updates = build_rbm(T.zeros((n_visible,)), W, bv_t, bh_t, k=15)
        u1_t = build_rnn(rnn_params_1, v_t, u1_tm1)
        u2_t = build_rnn(rnn_params_2, u1_t, u2_tm1)
        return ([v_t, u1_t, u2_t], updates) if generate else [u1_t, u2_t, bv_t, bh_t]

    v = T.matrix()

    # RNN initial values
    u1_0 = T.zeros((n_hidden_recurrent,))
    u2_0 = T.zeros((n_hidden_recurrent,))

    (_, _, bv_t, bh_t), updates_train = theano.scan(
        lambda v_t, u1_tm1, u2_tm1, *_: recurrence(v_t, u1_tm1, u2_tm1),
        sequences=v, outputs_info=[u1_0, u2_0, None, None])
    v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t, bh_t, k=20)
    updates_train.update(updates_rbm)

    n_steps = T.scalar(dtype='int32')
    (v_t, _, _), updates_generate = theano.scan(
        lambda u1_tm1, u2_tm1, *_: recurrence(None, u1_tm1, u2_tm1),
        outputs_info=[None, u1_0, u2_0], n_steps=n_steps)

    # L1 and L2 regularizers
    for param in rnn_params_1 + rnn_params_2 + rnn_params_3:
        if l2_norm is not None:
            cost += T.sum(param ** 2) * l2_norm * lr
        if l1_norm is not None:
            cost += T.sum(abs(param)) * l1_norm * lr

    return (v, v_sample, cost, monitor, params, updates_train, v_t,
            updates_generate, n_steps)
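The shared_normal and shared_zeros helpers used by build_rnnrbm are not defined in this excerpt. A plausible implementation matching the call convention here, name first and then the shape dimensions (this sketch is an assumption, not the original code):

import numpy as np
import theano

def shared_normal(name, *shape, scale=0.01):
    # Hypothetical helper: Gaussian-initialized shared variable.
    value = np.random.normal(scale=scale, size=shape)
    return theano.shared(value.astype(theano.config.floatX), name=name)

def shared_zeros(name, *shape):
    # Hypothetical helper: zero-initialized shared variable.
    return theano.shared(np.zeros(shape, dtype=theano.config.floatX), name=name)

Note that the other listings in this section pass the shape as a tuple and the name as a keyword, so they presumably rely on a different pair of helpers with the reversed signature.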
def model(inputs, _is_training, params, depth, batch_size, hidden_size,
          drop_i, drop_s, init_scale, init_T_bias, init_H_bias, tied_noise,
          _theano_rng):
    # Dropout noise for the inputs of the H and T transforms.
    noise_i_for_H = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng)
    noise_i_for_T = get_dropout_noise((batch_size, hidden_size), drop_i,
                                      _theano_rng) if not tied_noise else noise_i_for_H

    i_for_H = ifelse(_is_training, noise_i_for_H * inputs, inputs)
    i_for_T = ifelse(_is_training, noise_i_for_T * inputs, inputs)

    i_for_H = linear(i_for_H, params, in_size=hidden_size, out_size=hidden_size,
                     init_scale=init_scale, bias_init=init_H_bias)
    i_for_T = linear(i_for_T, params, in_size=hidden_size, out_size=hidden_size,
                     init_scale=init_scale, bias_init=init_T_bias)

    # Dropout noise for the recurrent hidden state.
    noise_s = get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng)
    if not tied_noise:
        noise_s = T.stack(noise_s,
                          get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng))

    def deep_step_fn(i_for_H_t, i_for_T_t, y_tm1, noise_s):
        tanh, sigm = T.tanh, T.nnet.sigmoid
        noise_s_for_H = noise_s if tied_noise else noise_s[0]
        noise_s_for_T = noise_s if tied_noise else noise_s[1]

        s_lm1 = y_tm1
        for l in range(depth):
            s_lm1_for_H = ifelse(_is_training, s_lm1 * noise_s_for_H, s_lm1)
            s_lm1_for_T = ifelse(_is_training, s_lm1 * noise_s_for_T, s_lm1)
            if l == 0:
                # On the first micro-timestep of each timestep the bias
                # terms are already summed into i_for_H_t and i_for_T_t.
                H = tanh(i_for_H_t + linear(s_lm1_for_H, params,
                                            in_size=hidden_size, out_size=hidden_size,
                                            init_scale=init_scale))
                Tr = sigm(i_for_T_t + linear(s_lm1_for_T, params,
                                             in_size=hidden_size, out_size=hidden_size,
                                             init_scale=init_scale))
            else:
                H = tanh(linear(s_lm1_for_H, params, in_size=hidden_size,
                                out_size=hidden_size, init_scale=init_scale,
                                bias_init=init_H_bias))
                Tr = sigm(linear(s_lm1_for_T, params, in_size=hidden_size,
                                 out_size=hidden_size, init_scale=init_scale,
                                 bias_init=init_T_bias))
            s_l = (H - s_lm1) * Tr + s_lm1
            s_lm1 = s_l

        y_t = s_l
        return y_t

    y_0 = shared_zeros((batch_size, hidden_size))
    y, _ = theano.scan(deep_step_fn, sequences=[i_for_H, i_for_T],
                       outputs_info=[y_0], non_sequences=[noise_s])

    y_last = y[-1]
    sticky_state_updates = [(y_0, y_last)]

    return y, y_0, sticky_state_updates
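The inner loop over depth follows the recurrent highway layer of Zilly et al.: each micro-timestep $l$ blends a candidate $H_l$ into the carried state through a transform gate $T_l$, with the carry gate fixed to $1 - T_l$. Writing $s_0 = h_{t-1}$ and letting the projected input enter only at $l = 1$:

\begin{aligned}
H_l &= \tanh\bigl(W_H x_t \,\mathbb{1}_{[l=1]} + R_{H} s_{l-1} + b_H\bigr) \\
T_l &= \sigma\bigl(W_T x_t \,\mathbb{1}_{[l=1]} + R_{T} s_{l-1} + b_T\bigr) \\
s_l &= s_{l-1} + T_l \odot (H_l - s_{l-1})
\end{aligned}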