def __init__(self, num_hidden, num_features, mb_size, hidden_state_features, target):

    self.mb_size = mb_size
    #self.seq_length = seq_length

    #using 0.8
    hidden_state_features = dropout(hidden_state_features, 1.0)

    gru_params_1 = init_tparams(
        param_init_gru(None, {}, prefix="gru1", dim=num_hidden, nin=num_features))
    gru_params_2 = init_tparams(
        param_init_gru(None, {}, prefix="gru2", dim=num_hidden, nin=num_hidden + num_features))

    # forward GRU over the hidden-state features, then a backward GRU over the
    # concatenation of its output with the original features
    gru_1_out = gru_layer(gru_params_1, hidden_state_features, None,
                          prefix='gru1', gradient_steps=100)[0]
    gru_2_out = gru_layer(gru_params_2,
                          T.concatenate([gru_1_out, hidden_state_features], axis=2),
                          None, prefix='gru2', backwards=True, gradient_steps=100)[0]

    self.gru_1_out = gru_1_out

    # average the recurrent states over time
    final_out_recc = T.mean(gru_2_out, axis=0)

    h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.rectify)
    h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.rectify)
    h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units=1, nonlinearity=None)

    h_out_1_value = dropout(h_out_1.get_output_for(final_out_recc), 1.0)
    h_out_2_value = dropout(h_out_2.get_output_for(h_out_1_value), 1.0)
    h_out_4_value = h_out_4.get_output_for(h_out_2_value)

    raw_y = T.clip(h_out_4_value, -10.0, 10.0)
    classification = T.nnet.sigmoid(raw_y)

    self.accuracy = T.mean(T.eq(target, T.gt(classification, 0.5).flatten()))

    # the first mb_size examples are treated as real, the rest as generated
    p_real = classification[0:mb_size]
    p_gen = classification[mb_size:mb_size * 2]

    self.d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
    self.d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()
    self.g_cost_real = bce(p_real, T.zeros(p_real.shape)).mean()
    self.g_cost_gen = bce(p_gen, T.ones(p_gen.shape)).mean()

    #self.g_cost = self.g_cost_gen
    self.g_cost = self.g_cost_real + self.g_cost_gen
    print "pulling both TF and PF together"

    self.d_cost = self.d_cost_real + self.d_cost_gen

    # if d_cost < 1.0, use g cost.
    # stop updating the discriminator once it is nearly perfect
    self.d_cost = T.switch(
        T.gt(self.accuracy, 0.95) * T.gt(p_real.mean(), 0.99) * T.lt(p_gen.mean(), 0.01),
        0.0, self.d_cost)

    '''
    gX = gen(Z, *gen_params)

    p_real = discrim(X, *discrim_params)
    p_gen = discrim(gX, *discrim_params)

    d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
    d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()
    g_cost_d = bce(p_gen, T.ones(p_gen.shape)).mean()

    d_cost = d_cost_real + d_cost_gen
    g_cost = g_cost_d

    cost = [g_cost, d_cost, g_cost_d, d_cost_real, d_cost_gen]

    d_updates = d_updater(discrim_params, d_cost)
    g_updates = g_updater(gen_params, g_cost)
    '''

    self.classification = classification

    self.params = []
    self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
    self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)
    self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
    #self.params += h_out_1.getParams() + h_out_2.getParams() + h_out_3.getParams()
    #self.params += lasagne.layers.get_all_params(h_initial_1, trainable=True)
    #self.params += lasagne.layers.get_all_params(h_initial_2, trainable=True)

    self.params += gru_params_1.values()
    self.params += gru_params_2.values()

    '''
    layerParams = c1.getParams()
    for paramKey in layerParams:
        self.params += [layerParams[paramKey]]

    layerParams = c2.getParams()
    for paramKey in layerParams:
        self.params += [layerParams[paramKey]]

    layerParams = c3.getParams()
    for paramKey in layerParams:
        self.params += [layerParams[paramKey]]
    '''

    #all_grads = T.grad(self.loss, self.params)
    #for j in range(0, len(all_grads)):
    #    all_grads[j] = T.switch(T.isnan(all_grads[j]), T.zeros_like(all_grads[j]), all_grads[j])
    #self.updates = lasagne.updates.adam(all_grads, self.params, learning_rate=0.0001, beta1=0.5)
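# Note (added, not from the original source): `bce` is used above but not defined
# in this excerpt. Since it is applied to the sigmoid output `classification`
# (probabilities in (0, 1)), a definition consistent with that usage is Theano's
# elementwise binary cross-entropy. This is an assumption about the missing
# helper, not the author's confirmed implementation; the commented-out lambda in
# the other discriminator below is instead the numerically stable form that
# expects raw logits.
import theano.tensor as T

def bce(output, target):
    # elementwise -(target*log(output) + (1 - target)*log(1 - output));
    # the callers above reduce it with .mean()
    return T.nnet.binary_crossentropy(output, target)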
def __init__(self, num_hidden, num_features, seq_length, mb_size, tf_states, rf_states):

    tf_states = T.specify_shape(tf_states, (seq_length, mb_size, num_features))
    rf_states = T.specify_shape(rf_states, (seq_length, mb_size, num_features))

    hidden_state_features = T.specify_shape(
        T.concatenate([tf_states, rf_states], axis=1),
        (seq_length, mb_size * 2, num_features))

    gru_params_1 = init_tparams(
        param_init_gru(None, {}, prefix="gru1", dim=num_hidden, nin=num_features))
    #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix="gru2", dim=num_hidden, nin=num_hidden + num_features))
    #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix="gru3", dim=num_hidden, nin=num_hidden + num_features))

    gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix='gru1')[0]
    #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis=2), None, prefix='gru2', backwards=True)[0]
    #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis=2), None, prefix='gru3')[0]

    final_out_recc = T.specify_shape(T.mean(gru_1_out, axis=0), (mb_size * 2, num_hidden))

    h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.rectify)
    #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units=1, nonlinearity=None)

    h_out_1_value = h_out_1.get_output_for(final_out_recc)
    h_out_4_value = h_out_4.get_output_for(h_out_1_value)

    raw_y = h_out_4_value
    #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
    classification = T.nnet.sigmoid(raw_y)

    # tf comes before rf.
    p_real = classification[:mb_size]
    p_gen = classification[mb_size:]

    #bce = lambda r, t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))

    self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
    self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
    self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()

    self.d_cost = self.d_cost_real + self.d_cost_gen
    self.g_cost = self.g_cost_d

    self.classification = classification

    self.params = []
    self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
    #self.params += lasagne.layers.get_all_params(h_out_3, trainable=True)
    #self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
    self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)

    self.params += gru_params_1.values()
    #self.params += gru_params_2.values()
    #self.params += gru_params_3.values()

    self.accuracy = (T.mean(T.eq(T.ones(p_real.shape).flatten(), T.gt(p_real, 0.5).flatten()))
                     + T.mean(T.eq(T.ones(p_gen.shape).flatten(), T.lt(p_gen, 0.5).flatten())))
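# Hypothetical usage sketch (assumed, not from the original code): compiling a
# discriminator update from the costs defined above with Adam, mirroring the
# commented-out lasagne.updates.adam call earlier. The class name
# `Discriminator`, the symbolic inputs, and the sizes below are illustrative
# assumptions; the generator side (trained with g_cost) is not shown in this
# excerpt.
import theano
import theano.tensor as T
import lasagne

tf_states_sym = T.tensor3('tf_states')   # (seq_length, mb_size, num_features)
rf_states_sym = T.tensor3('rf_states')

disc = Discriminator(num_hidden=1024, num_features=512, seq_length=30,
                     mb_size=64, tf_states=tf_states_sym, rf_states=rf_states_sym)

d_updates = lasagne.updates.adam(disc.d_cost, disc.params,
                                 learning_rate=0.0001, beta1=0.5)
train_discriminator = theano.function([tf_states_sym, rf_states_sym],
                                      [disc.d_cost, disc.accuracy],
                                      updates=d_updates)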
def build_encoder(tparams, options, trng, use_noise, x_mask=None, sampling=False):

    x = tensor.matrix('x', dtype='int64')
    x.tag.test_value = (numpy.random.rand(5, 10) * 100).astype('int64')

    # for the backward rnn, we just need to invert x
    xr = x[::-1]
    # difference here
    xr = x[:, ::-1]

    if x_mask is None:
        # at test time
        xr_mask = None
    else:
        xr_mask = x_mask[::-1]

    # number of timesteps and number of samples
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if sampling:
            if options['model_version'] < 0.1:
                rec_dropout = theano.shared(numpy.array([retain_probability_hidden] * 2, dtype='float32'))
                rec_dropout_r = theano.shared(numpy.array([retain_probability_hidden] * 2, dtype='float32'))
                emb_dropout = theano.shared(numpy.array([retain_probability_emb] * 2, dtype='float32'))
                emb_dropout_r = theano.shared(numpy.array([retain_probability_emb] * 2, dtype='float32'))
                source_dropout = theano.shared(numpy.float32(retain_probability_source))
            else:
                rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
                rec_dropout_r = theano.shared(numpy.array([1.] * 2, dtype='float32'))
                emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
                emb_dropout_r = theano.shared(numpy.array([1.] * 2, dtype='float32'))
                source_dropout = theano.shared(numpy.float32(1.))
        else:
            if options['model_version'] < 0.1:
                scaled = False
            else:
                scaled = True
            rec_dropout = shared_dropout_layer((2, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
            rec_dropout_r = shared_dropout_layer((2, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
            emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
            emb_dropout_r = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
            source_dropout = shared_dropout_layer((n_timesteps, n_samples, 1), use_noise, trng, retain_probability_source, scaled)
            source_dropout = tensor.tile(source_dropout, (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        rec_dropout_r = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout_r = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    # word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]  # differs here
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    if options['use_dropout']:
        emb *= source_dropout

    proj = gru_layer(tparams, emb, options,
                     prefix='encoder',
                     mask=x_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=profile)

    # word embedding for backward rnn (source)
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
    if options['use_dropout']:
        if sampling:
            embr *= source_dropout
        else:
            embr *= source_dropout[::-1]

    projr = gru_layer(tparams, embr, options,
                      prefix='encoder_r',
                      mask=xr_mask,
                      emb_dropout=emb_dropout_r,
                      rec_dropout=rec_dropout_r,
                      profile=profile)

    # context will be the concatenation of forward and backward rnns
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)

    return x, ctx
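# Hypothetical usage sketch (assumed, not from the original code): turning the
# encoder graph into a callable Theano function. `init_params`, `init_tparams`
# and the `options` dictionary are assumed to come from the surrounding
# Nematus-style setup; in sampling mode no mask input is required.
import numpy
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

trng = RandomStreams(1234)
use_noise = theano.shared(numpy.float32(0.))   # 0 = dropout disabled at test time

params = init_params(options)
tparams = init_tparams(params)

x, ctx = build_encoder(tparams, options, trng, use_noise, x_mask=None, sampling=True)
f_encode = theano.function([x], ctx, name='f_encode')

# ctx concatenates the forward and (time-reversed) backward encoder states
# along the last axis, giving 2 * options['dim'] features per position.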
def rnn_one_step(config, params, observed_sequence_last, observed_sequence_current,
                 use_samples, last_states, last_outputs, last_loss):

    mb_size = config['mb_size']
    num_hidden = config['num_hidden']

    last_states = T.specify_shape(last_states, (config['mb_size'], 2 * config['num_hidden']))
    last_outputs = T.specify_shape(last_outputs, (config['mb_size'],))

    obs_last = T.specify_shape(observed_sequence_last, (mb_size,)).reshape((mb_size, 1))
    obs_curr = T.specify_shape(observed_sequence_current, (mb_size,))

    # feed back the model's own previous output instead of the observed value
    # when use_samples is true
    obs_use = theano.ifelse.ifelse(use_samples, last_outputs.reshape((mb_size, 1)), obs_last)

    # hard-coded slice boundaries assume num_hidden == 1024
    last_states_1 = last_states[:, 0:1024]
    last_states_2 = last_states[:, 1024:2048]

    next_states_1 = T.specify_shape(
        gru_layer(params, state_below=obs_use, options=None, prefix='gru1',
                  mask=None, one_step=True, init_state=last_states_1, backwards=False)[0],
        (mb_size, num_hidden))
    next_states_2 = T.specify_shape(
        gru_layer(params, state_below=next_states_1, options=None, prefix='gru2',
                  mask=None, one_step=True, init_state=last_states_2, backwards=False)[0],
        (mb_size, num_hidden))

    h1 = T.specify_shape(
        fflayer(params, next_states_2, options=None, prefix='ff_h1',
                activ='lambda x: tensor.maximum(x,0.0)'),
        (mb_size, num_hidden))
    h2 = T.specify_shape(
        fflayer(params, h1, options=None, prefix='ff_h2',
                activ='lambda x: tensor.maximum(x,0.0)'),
        (mb_size, num_hidden))

    y = T.specify_shape(
        fflayer(params, h2, options=None, prefix='ff_1', activ='lambda x: x').flatten(),
        (mb_size,))

    #y = T.specify_shape(T.sum(next_states, axis = 1), (mb_size,))

    # per-example squared error against the current observation
    loss = T.sqr(y - obs_curr)

    obs_curr = T.specify_shape(observed_sequence_current, (mb_size,))

    next_outputs = y
    next_states = T.specify_shape(
        T.concatenate([next_states_1, next_states_2], axis=1),
        (mb_size, num_hidden * 2))

    return next_states, next_outputs, loss
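# Hypothetical sketch (assumed, not from the original code): unrolling
# rnn_one_step over a full sequence with theano.scan. `config`, `params`,
# `use_samples` and the (seq_length, mb_size) matrix `obs_seq` are assumed to be
# built elsewhere; the hard-coded slices above imply config['num_hidden'] == 1024.
import theano
import theano.tensor as T

def step(obs_last, obs_curr, prev_states, prev_outputs, prev_loss):
    # scan passes sequence slices first, then the recurrent outputs, in order
    return rnn_one_step(config, params, obs_last, obs_curr, use_samples,
                        prev_states, prev_outputs, prev_loss)

mb_size = config['mb_size']
num_hidden = config['num_hidden']

init_states = T.zeros((mb_size, 2 * num_hidden))
init_outputs = T.zeros((mb_size,))
init_loss = T.zeros((mb_size,))

(states_seq, outputs_seq, loss_seq), scan_updates = theano.scan(
    step,
    sequences=[obs_seq[:-1], obs_seq[1:]],   # predict element t from element t-1
    outputs_info=[init_states, init_outputs, init_loss])

total_loss = loss_seq.mean()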