def lstm_cell(x, h, c, name=None, reuse=False):
    """Advance an LSTM by a single timestep.

    Args:
        x: Input tensor for this timestep; its last dimension sets the
            input kernel's row count.
        h: Previous hidden state; its last dimension sets the cell width.
        c: Previous cell (memory) state.
        name: Optional variable scope name (defaults to "lstm").
        reuse: Whether to reuse variables already created in the scope.

    Returns:
        A `(h, c)` tuple with the new hidden state and new cell state.
    """
    input_dim = x.shape[-1].value
    hidden_dim = h.shape[-1].value
    gate_dim = hidden_dim * 4  # input, forget, output and candidate blocks

    with tf.variable_scope(name, default_name="lstm", values=[x, h, c],
                           reuse=reuse):
        input_kernel = tf.get_variable(
            "kernel/input", [input_dim, gate_dim], dtype=tf.float32,
            initializer=tf.orthogonal_initializer(1.0))
        hidden_kernel = tf.get_variable(
            "kernel/hidden", [hidden_dim, gate_dim], dtype=tf.float32,
            initializer=tf.orthogonal_initializer(1.0))
        bias = tf.get_variable(
            "bias", [gate_dim], dtype=tf.float32,
            initializer=tf.constant_initializer(0.0))

        pre_activations = (
            tf.matmul(x, input_kernel) + tf.matmul(h, hidden_kernel) + bias)
        in_pre, forget_pre, out_pre, cand_pre = tf.split(
            pre_activations, 4, axis=1)

        input_gate = tf.sigmoid(in_pre)
        # The constant +1.0 shifts the forget gate towards "remember".
        forget_gate = tf.sigmoid(forget_pre + 1.0)
        output_gate = tf.sigmoid(out_pre)
        candidate = tf.tanh(cand_pre)

        next_c = forget_gate * c + input_gate * candidate
        next_h = output_gate * tf.tanh(next_c)
        return next_h, next_c
def testGain(self):
    """Check that `gain` scales the orthogonal initializer's output.

    Two initializers are built with the same seed, one with the default
    gain and one with gain 3.14; dividing the second sample by 3.14 must
    reproduce the first.

    The previous version ended with `return np.allclose(...)`, which made
    the test pass unconditionally and left the loop after the first dtype.
    It now asserts for every dtype.
    """
    shape = (10, 10)
    # Scaling by the gain and dividing it back out costs about one ulp of
    # round-off, so float32 cannot meet the 1e-15 tolerance that the old
    # (never enforced) comparison used.
    dtype_tolerances = [(tf.float32, 1e-6), (tf.float64, 1e-12)]
    for dtype, tol in dtype_tolerances:
        init1 = tf.orthogonal_initializer(seed=1, dtype=dtype)
        init2 = tf.orthogonal_initializer(gain=3.14, seed=1, dtype=dtype)
        with self.test_session(graph=tf.Graph(), use_gpu=True):
            t1 = init1(shape).eval()
        with self.test_session(graph=tf.Graph(), use_gpu=True):
            t2 = init2(shape).eval()
        self.assertAllClose(t1, t2 / 3.14, rtol=tol, atol=tol)
def define_generator(
        self, z, out_dim=2, num_hidden_neuron=256, num_layers=2):
    """Build the generator network that maps `z` to a sample.

    Stacks `num_layers` fully connected hidden layers of width
    `num_hidden_neuron` (activation `self.leakyrelu`) followed by a linear
    output layer of width `out_dim`. All weights use an orthogonal
    initializer with gain 1.4.

    Returns:
        The output tensor of the final linear layer.
    """
    def _dense(inputs, units, activation):
        # A fresh initializer object per layer, as each layer had before.
        return fully_connected(
            inputs, units,
            activation_fn=activation,
            weights_initializer=tf.orthogonal_initializer(gain=1.4))

    with tf.variable_scope('generator'):
        net = z
        for _ in range(num_layers):
            net = _dense(net, num_hidden_neuron, self.leakyrelu)
        sample = _dense(net, out_dim, None)
    return sample
def recurrent_layer(tensor, cell=None, hidden_dims=128, sequence_length=None, decoder_fn=None, activation=tf.nn.tanh, initializer=tf.orthogonal_initializer(), initial_state=None, keep_prob=1.0, return_final_state=False, return_next_cell_input=True, **opts):
    """Run a recurrent cell over `tensor`, optionally through a decoder.

    Args:
        tensor: Input sequence passed to `tf.nn.dynamic_rnn` (ignored when
            `decoder_fn` is given).
        cell: RNN cell to use; a `BasicRNNCell(hidden_dims)` is created when
            omitted.
        hidden_dims: Width of the default cell.
        sequence_length: Forwarded to the RNN/decoder call.
        decoder_fn: When set, `seq2seq.dynamic_rnn_decoder` is used instead
            of `tf.nn.dynamic_rnn`.
        activation: Activation of the default cell.
        initializer: Accepted for interface compatibility; not used here.
        initial_state: Initial state for `tf.nn.dynamic_rnn`.
        keep_prob: Values below 1.0 wrap the cell in a `DropoutWrapper`.
        return_final_state: Return the final state instead of the outputs.
        return_next_cell_input: Accepted for interface compatibility; not
            used here.
        **opts: If `opts["name"]` is truthy the cell is added to the graph
            collection of that name.

    Returns:
        The final state if `return_final_state` is true, otherwise the
        per-step outputs.
    """
    if cell is None:
        # Alternative kept from the original:
        # tf.contrib.rnn.LSTMCell(hidden_dims, activation=activation)
        cell = tf.contrib.rnn.BasicRNNCell(hidden_dims, activation=activation)

    if keep_prob < 1.0:
        shared_keep_prob = _global_keep_prob(keep_prob)
        cell = tf.contrib.rnn.DropoutWrapper(
            cell, shared_keep_prob, shared_keep_prob)

    if opts.get("name"):
        tf.add_to_collection(opts.get("name"), cell)

    if decoder_fn is not None:
        # TODO: turn off sequence_length?
        outputs, final_state, _ = seq2seq.dynamic_rnn_decoder(
            cell, decoder_fn, inputs=None, sequence_length=sequence_length)
    else:
        outputs, final_state = tf.nn.dynamic_rnn(
            cell, tensor,
            sequence_length=sequence_length,
            initial_state=initial_state,
            dtype=tf.float32)

    return final_state if return_final_state else outputs
def make_tf_Linv(layer, V_shape, c_shape, lr, act=tf.nn.tanh):
    """Build the graph for layer-local training of the inverse map (V, c).

    The forward weights `W`/`b` are fed through placeholders; the loss is
    half the mean squared error between the input and its reconstruction
    `act(act(x W + b) V + c)`.

    Args:
        layer: Layer index, used in scope and summary names.
        V_shape: Shape of the inverse weight matrix V.
        c_shape: Shape of the inverse bias c.
        lr: Learning rate for RMSProp.
        act: Activation function.

    Returns:
        `(train_op, merged_summaries)`; the merged summaries contain the
        log-loss and gradient-norm scalars.
    """
    layer_tag = str(layer)
    with tf.name_scope('layer' + layer_tag + '_inv') as scope:
        V = tf.get_variable(
            scope + 'V', shape=V_shape, dtype=tf.float32,
            initializer=tf.orthogonal_initializer(0.95))
        c = tf.get_variable(
            scope + 'c', shape=c_shape, dtype=tf.float32,
            initializer=tf.constant_initializer(0.))

        W = tf.placeholder(
            tf.float32, shape=[V_shape[1], V_shape[0]], name='W')
        b = tf.placeholder(tf.float32, shape=[1, V_shape[0]], name='b')
        x_0 = tf.placeholder(
            tf.float32, shape=[None, V_shape[1]], name='input')

        fx = act(tf.matmul(x_0, W) + b)
        reconstruction = act(tf.matmul(fx, V) + c)
        loss = 0.5*tf.reduce_mean((reconstruction - x_0)**2, name='loss')

        loss_summary = tf.summary.scalar(
            'log_loss' + layer_tag, tf.log(loss))
        # Histograms are created but not part of the returned merge.
        tf.summary.histogram('V' + layer_tag, V)
        tf.summary.histogram('c' + layer_tag, c)

        opt = tf.train.RMSPropOptimizer(lr)
        gvs = opt.compute_gradients(loss, var_list=[V, c])

        grad_summaries = []
        for grad, var in gvs:
            # var.name = 'namescope/V:0' and we want just 'V'
            grad_summaries.append(
                tf.summary.scalar('norm_grad' + var.name[-3],
                                  tf.nn.l2_loss(grad)))

        clipped_gvs = []
        for grad, var in gvs:
            clipped_gvs.append((tf.clip_by_norm(grad, 100.), var))

        train_op = opt.apply_gradients(clipped_gvs)
        merged = tf.summary.merge([loss_summary] + grad_summaries)
        return train_op, merged
def __init__(self, params=params, dyn='FCC'):
    """Build the GAN graph, its gradient fields and a ready session.

    Args:
        params: Hyper-parameter dict; reads 'batch_size', 'z_dim', 'x_dim',
            'gen_learning_rate', 'disc_learning_rate', 'findiff_step' and
            'gamma'.
        dyn: Dynamics selector; 'FCC' binds `self.F` to `self.FCC`, anything
            else binds it to `self._F`.
    """
    tf.reset_default_graph()

    data = self.sample_mog(params['batch_size'])
    noise_dist = ds.Normal(tf.zeros(params['z_dim']),
                           tf.ones(params['z_dim']))
    noise = noise_dist.sample(params['batch_size'])

    # Construct generator and discriminator nets
    with slim.arg_scope([slim.fully_connected],
                        weights_initializer=tf.orthogonal_initializer(gain=1.4)):
        samples = self.generator(noise, output_dim=params['x_dim'])
        real_score = self.discriminator(data)
        fake_score = self.discriminator(samples, reuse=True)

    # Saddle objective
    real_term = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=real_score, labels=tf.ones_like(real_score))
    fake_term = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=fake_score, labels=tf.zeros_like(fake_score))
    loss = tf.reduce_mean(real_term + fake_term)

    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    gen_vars = tf.get_collection(trainable, "generator")
    disc_vars = tf.get_collection(trainable, "discriminator")

    def _static_shape(var):
        return tuple(var.get_shape().as_list())

    gen_shapes = [_static_shape(v) for v in gen_vars]
    disc_shapes = [_static_shape(v) for v in disc_vars]

    # Generator gradient (the generator ascends the discriminator's loss)
    g_opt = tf.train.GradientDescentOptimizer(
        learning_rate=params['gen_learning_rate'])
    g_grads = g_opt.compute_gradients(-loss, var_list=gen_vars)

    # Discriminator gradient
    d_opt = tf.train.GradientDescentOptimizer(
        learning_rate=params['disc_learning_rate'])
    d_grads = d_opt.compute_gradients(loss, var_list=disc_vars)

    # Squared Norm of Gradient: d/dx 1/2||F||^2 = J^T F
    grads_norm_sep = []
    for grad_and_var in g_grads + d_grads:
        grads_norm_sep.append(tf.reduce_sum(grad_and_var[0]**2))
    grads_norm = 0.5*tf.reduce_sum(grads_norm_sep)

    # Gradient of Squared Norm
    JTF = tf.gradients(grads_norm, xs=gen_vars+disc_vars)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    self.params = params
    self.data, self.samples = data, samples
    self.gen_vars, self.disc_vars = gen_vars, disc_vars
    self.gen_shapes, self.disc_shapes = gen_shapes, disc_shapes
    self.Fg, self.Fd = g_grads, d_grads
    self.JTF = JTF
    self.sess = sess
    self.findiff_step = params['findiff_step']
    self.gamma = params['gamma']
    self.dyn = dyn
    self.F = self.FCC if dyn == 'FCC' else self._F
def conv_layer(inputs, filters, kernel_size, strides, gain=1.0):
    """Apply a ReLU 2-D convolution with orthogonally initialized kernels.

    Args:
        inputs: Input tensor for `tf.layers.conv2d`.
        filters: Number of output filters.
        kernel_size: Convolution kernel size.
        strides: Single stride value used for both spatial dimensions.
        gain: Gain of the orthogonal kernel initializer.

    Returns:
        The output tensor of the convolution.
    """
    conv_kwargs = dict(
        inputs=inputs,
        filters=filters,
        kernel_size=kernel_size,
        strides=(strides, strides),
        activation=tf.nn.relu,
        kernel_initializer=tf.orthogonal_initializer(gain=gain),
    )
    return tf.layers.conv2d(**conv_kwargs)
def get_variable_initializer(hparams):
    """Return the variable initializer selected by `hparams.initializer`.

    Returns `None` when no initializer is configured. Otherwise the
    initializer gain is reported through `mlperf_log`, the choice is logged
    (graph mode only) and the matching TensorFlow initializer is returned.

    Raises:
        ValueError: If `hparams.initializer` names an unknown initializer.
    """
    if not hparams.initializer:
        return None

    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_INITIALIZER_GAIN,
        value=hparams.initializer_gain,
        hparams=hparams)

    if not tf.contrib.eager.in_eager_mode():
        tf.logging.info("Using variable initializer: %s", hparams.initializer)

    kind = hparams.initializer
    if kind == "orthogonal":
        return tf.orthogonal_initializer(gain=hparams.initializer_gain)
    if kind == "uniform":
        bound = 0.1 * hparams.initializer_gain
        return tf.random_uniform_initializer(-bound, bound)
    if kind == "normal_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="normal")
    if kind == "uniform_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="uniform")
    if kind == "xavier":
        return tf.contrib.layers.xavier_initializer()
    raise ValueError("Unrecognized initializer: %s" % hparams.initializer)
def define_discriminator(
        self, x, num_hidden_neuron=256, num_layers=2, reuse=False):
    """Build the discriminator: a real (1) vs fake (0) classifier for `x`.

    Stacks `num_layers` fully connected hidden layers (activation
    `self.leakyrelu`) and a single-unit linear output, all with orthogonal
    weights of gain 1.4.

    Args:
        x: Input tensor to classify.
        num_hidden_neuron: Width of each hidden layer.
        num_layers: Number of hidden layers.
        reuse: When true, reuse the variables of the 'discriminator' scope.

    Returns:
        `(logit, probability)` where `probability` is `sigmoid(logit)`.
    """
    def _dense(inputs, units, activation):
        return fully_connected(
            inputs, units,
            activation_fn=activation,
            weights_initializer=tf.orthogonal_initializer(gain=1.4))

    with tf.variable_scope('discriminator') as scope:
        if reuse:
            scope.reuse_variables()

        net = x
        for _ in range(num_layers):
            net = _dense(net, num_hidden_neuron, self.leakyrelu)

        # logistic regression output
        logit = _dense(net, 1, None)
        return logit, tf.nn.sigmoid(logit)
def define_graph(glove_embeddings_arr):
    """Define the TensorFlow graph of the review classifier.

    The model is: GloVe embedding lookup -> stacked GRU cells with output
    dropout -> attention -> linear layer with two outputs.

    The input placeholder has shape [batch_size, 40] because each review is
    restricted to its first 40 words. The following names are required:
        input placeholder: name="input_data"
        labels placeholder: name="labels"
        accuracy tensor: name="accuracy"
        loss tensor: name="loss"

    Returns:
        input placeholder, labels placeholder, dropout_keep_prob, optimizer,
        accuracy and loss tensors.
    """
    # Input data
    input_data = tf.placeholder(
        tf.int32, (batch_size, 40), name='input_data')
    labels = tf.placeholder(tf.float32, (batch_size, 2), name='labels')

    # Unlike a plain placeholder, this one falls back to 0.5 when not fed.
    dropout_keep_prob = tf.placeholder_with_default(0.5, shape=())

    # Embedding
    embedding_matrix = tf.Variable(
        tf.convert_to_tensor(glove_embeddings_arr, dtype=tf.float32))
    embedded_words = tf.nn.embedding_lookup(embedding_matrix, input_data)

    def rnn_cell():
        # One GRU cell with dropout applied to its outputs.
        return tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(rnn_size),
            output_keep_prob=dropout_keep_prob)

    with tf.variable_scope('init_name',
                           initializer=tf.orthogonal_initializer()):
        stacked_cell = tf.contrib.rnn.MultiRNNCell(
            [rnn_cell() for _ in range(rnn_layers)])
        outputs, final_state = tf.nn.dynamic_rnn(
            stacked_cell, embedded_words, dtype="float32")

    # Attention layer
    attention_output = attention(outputs, attention_size)

    # Fully connected layer
    attended_dim = attention_output.get_shape()[1].value
    W = tf.Variable(tf.truncated_normal([attended_dim, 2], stddev=0.1))
    b = tf.Variable(tf.constant(0., shape=[2]))
    logits = tf.squeeze(tf.nn.xw_plus_b(attention_output, W, b))

    # Cross-entropy loss
    per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=labels)
    loss = tf.reduce_mean(per_example_loss, name="loss")
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    predictions = tf.round(tf.sigmoid(logits))
    hits = tf.cast(tf.equal(predictions, labels), tf.float32)
    accuracy = tf.reduce_mean(hits, name="accuracy")

    return input_data, labels, dropout_keep_prob, optimizer, accuracy, loss
def _get_variable_initializer(hparams):
    """Return the TensorFlow initializer named by `hparams.initializer`.

    Supported names are "orthogonal", "uniform", "normal_unit_scaling" and
    "uniform_unit_scaling"; each is scaled by `hparams.initializer_gain`.

    Raises:
        ValueError: If the name is not one of the supported initializers.
    """
    kind = hparams.initializer
    if kind == "orthogonal":
        return tf.orthogonal_initializer(gain=hparams.initializer_gain)
    if kind == "uniform":
        bound = 0.1 * hparams.initializer_gain
        return tf.random_uniform_initializer(-bound, bound)
    if kind == "normal_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="normal")
    if kind == "uniform_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="uniform")
    raise ValueError("Unrecognized initializer: %s" % hparams.initializer)
def testShapesValues(self):
    """Check output shape and orthogonality of the orthogonal initializer.

    For several shapes and both float dtypes, the sampled tensor is
    flattened to 2-D (all leading dimensions merged) and the smaller of its
    two Gram matrices is compared with the identity.
    """
    shapes = [(10, 10), (10, 9, 8), (100, 5, 5), (50, 40), (40, 50)]
    for dtype in [tf.float32, tf.float64]:
        for shape in shapes:
            init = tf.orthogonal_initializer(dtype=dtype)
            with self.test_session(graph=tf.Graph(), use_gpu=True):
                # Check the shape
                sample = init(shape).eval()
                self.assertAllEqual(shape, sample.shape)

                # Check orthogonality by computing the inner product
                flat = sample.reshape(
                    (np.prod(sample.shape[:-1]), sample.shape[-1]))
                n_rows, n_cols = flat.shape[0], flat.shape[1]
                if n_rows > n_cols:
                    gram = np.dot(flat.T, flat)
                    identity = np.eye(n_cols)
                else:
                    gram = np.dot(flat, flat.T)
                    identity = np.eye(n_rows)
                self.assertAllClose(gram, identity)
def define_graph(glove_embeddings_arr):
    """Define the TensorFlow graph of the review classifier.

    The model is: GloVe embedding lookup -> stacked GRU cells with output
    dropout -> attention -> linear layer with two outputs.

    Returns:
        input placeholder, labels placeholder, keep_prob placeholder,
        optimizer, accuracy and loss tensors.
    """
    # Input data
    input_data = tf.placeholder(
        tf.int32, (batch_size, 40), name='input_data')
    labels = tf.placeholder(tf.float32, (batch_size, 2), name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    # Embedding (note the data structure: the array is converted to a
    # float32 tensor before being wrapped in a Variable)
    embedding_matrix = tf.Variable(
        tf.convert_to_tensor(glove_embeddings_arr, dtype=tf.float32))
    embedded_words = tf.nn.embedding_lookup(embedding_matrix, input_data)

    def rnn_cell():
        # One GRU cell with dropout applied to its outputs.
        return tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(rnn_size),
            output_keep_prob=keep_prob)

    with tf.variable_scope('init_name',
                           initializer=tf.orthogonal_initializer()):
        stacked_cell = tf.contrib.rnn.MultiRNNCell(
            [rnn_cell() for _ in range(rnn_layers)])
        outputs, final_state = tf.nn.dynamic_rnn(
            stacked_cell, embedded_words, dtype="float32")

    # Attention layer
    attention_output = attention(outputs, attention_size)

    # Fully connected layer
    attended_dim = attention_output.get_shape()[1].value
    W = tf.Variable(tf.truncated_normal([attended_dim, 2], stddev=0.1))
    b = tf.Variable(tf.constant(0., shape=[2]))
    logits = tf.squeeze(tf.nn.xw_plus_b(attention_output, W, b))

    # Cross-entropy loss
    per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=labels)
    loss = tf.reduce_mean(per_example_loss, name="loss")
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    predictions = tf.round(tf.sigmoid(logits))
    hits = tf.cast(tf.equal(predictions, labels), tf.float32)
    accuracy = tf.reduce_mean(hits, name="accuracy")

    return input_data, labels, keep_prob, optimizer, accuracy, loss
def get_variable_initializer(hparams):
    """Return the variable initializer selected by `hparams.initializer`.

    Returns `None` when no initializer is configured; otherwise logs the
    choice and returns the matching TensorFlow initializer.

    Raises:
        ValueError: If `hparams.initializer` names an unknown initializer.
    """
    if not hparams.initializer:
        return None

    tf.logging.info("Using variable initializer: %s", hparams.initializer)

    kind = hparams.initializer
    if kind == "orthogonal":
        return tf.orthogonal_initializer(gain=hparams.initializer_gain)
    if kind == "uniform":
        bound = 0.1 * hparams.initializer_gain
        return tf.random_uniform_initializer(-bound, bound)
    if kind == "normal_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="normal")
    if kind == "uniform_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="uniform")
    raise ValueError("Unrecognized initializer: %s" % hparams.initializer)
def __init__(self, component):
    """Initializes weights and layers.

    Validates that the component has no fixed features and exactly the two
    linked features 'sources' and 'targets', then creates the arc weight
    matrix, the source weight vector and the root vector.

    Args:
      component: Parent ComponentBuilderBase object.
    """
    super(BiaffineDigraphNetwork, self).__init__(component)

    check.Eq(len(self._fixed_feature_dims.items()), 0,
             'Expected no fixed features')
    check.Eq(len(self._linked_feature_dims.items()), 2,
             'Expected two linked features')
    for required_feature in ('sources', 'targets'):
        check.In(required_feature, self._linked_feature_dims,
                 'Missing required linked feature')

    self._source_dim = self._linked_feature_dims['sources']
    self._target_dim = self._linked_feature_dims['targets']

    self._weights = [
        tf.get_variable('weights_arc', [self._source_dim, self._target_dim],
                        tf.float32, tf.orthogonal_initializer()),
        tf.get_variable('weights_source', [self._source_dim], tf.float32,
                        tf.zeros_initializer()),
        tf.get_variable('root', [self._source_dim], tf.float32,
                        tf.zeros_initializer()),
    ]
    self._params.extend(self._weights)
    self._regularized_weights.extend(self._weights)

    # Add runtime hooks for pre-computed weights.
    for hook in (self._get_root_weights, self._get_root_bias):
        self._derived_params.append(hook)

    # Negative Layer.dim indicates that the dimension is dynamic.
    adjacency_layer = network_units.Layer(component, 'adjacency', -1)
    self._layers.append(adjacency_layer)
def make_tf_L(layer, W_shape, b_shape, lr, act=tf.nn.tanh):
    """Build the graph for layer-local training of W and b.

    The loss is half the mean squared error between `act(x W + b)` and the
    fed target `y`.

    TODO: implement initialization as input option

    Args:
        layer (int): Which layer; used in scope and summary names.
        W_shape: Shape of the weight matrix W.
        b_shape: Shape of the bias b.
        lr: Learning rate.
        act: Activation function.

    Returns:
        `(train_op, merged_summaries)`; the merged summaries contain the
        log-loss and gradient-norm scalars of this layer.
    """
    layer_tag = str(layer)
    with tf.name_scope('layer' + layer_tag + '_ff') as scope:
        W = tf.get_variable(
            scope + 'W', shape=W_shape, dtype=tf.float32,
            initializer=tf.orthogonal_initializer(0.95))
        b = tf.get_variable(
            scope + 'b', shape=b_shape, dtype=tf.float32,
            initializer=tf.constant_initializer(0.))

        x_0 = tf.placeholder(
            tf.float32, shape=[None, W_shape[0]], name='input')
        y = tf.placeholder(
            tf.float32, shape=[None, W_shape[1]], name='output')

        prediction = act(tf.matmul(x_0, W) + b)
        loss = 0.5*tf.reduce_mean((prediction - y)**2, name='loss')

        loss_summary = tf.summary.scalar(
            'log_loss' + layer_tag, tf.log(loss))
        # Histograms are created but not part of the returned merge.
        tf.summary.histogram('W' + layer_tag, W)
        tf.summary.histogram('b' + layer_tag, b)

        # Plain gradient descent is used here; the original author noted
        # that rmsprop works *way* better than adam for local loss
        # functions (unclear why) and left RMSPropOptimizer(lr) disabled.
        opt = tf.train.GradientDescentOptimizer(lr)
        gvs = opt.compute_gradients(loss, var_list=[W, b])

        grad_summaries = []
        for grad, var in gvs:
            # var.name = 'namescope/V:0' and we want just 'V'
            grad_summaries.append(
                tf.summary.scalar('norm_grad' + var.name[-3],
                                  tf.nn.l2_loss(grad)))

        # hmmmmmm. clip by norm value?
        clipped_gvs = []
        for grad, var in gvs:
            clipped_gvs.append((tf.clip_by_norm(grad, 100.), var))

        train_op = opt.apply_gradients(clipped_gvs)
        merged = tf.summary.merge([loss_summary] + grad_summaries)
        return train_op, merged
def create_gru_cell(self):
    """Return a GRU cell of `params.rnn_units` units with orthogonal kernels."""
    return tf.nn.rnn_cell.GRUCell(
        params.rnn_units,
        kernel_initializer=tf.orthogonal_initializer(),
    )
def get_rnn_cell():
    """Return a new orthogonally initialized LSTM cell of size `rnn_size`."""
    lstm = tf.contrib.rnn.LSTMCell(
        num_units=self.options['rnn_size'],
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer())
    return lstm
def fc_layer(inputs, units, activation_fn=tf.nn.relu, gain=1.0):
    """Apply a dense layer with orthogonally initialized kernels.

    Args:
        inputs: Input tensor for `tf.layers.dense`.
        units: Number of output units.
        activation_fn: Activation applied to the layer output.
        gain: Gain of the orthogonal kernel initializer.

    Returns:
        The output tensor of the dense layer.
    """
    dense_kwargs = dict(
        inputs=inputs,
        units=units,
        activation=activation_fn,
        kernel_initializer=tf.orthogonal_initializer(gain),
    )
    return tf.layers.dense(**dense_kwargs)
def cells(reuse=False):
    """Return an LSTM cell of `size_layer` units with orthogonal weights.

    Args:
        reuse: Forwarded to `LSTMCell` to control variable reuse.
    """
    lstm = tf.nn.rnn_cell.LSTMCell(
        size_layer,
        initializer=tf.orthogonal_initializer(),
        reuse=reuse)
    return lstm
def build_train(self):
    """Build the training graph: proposal module plus captioning module.

    The proposal module encodes the forward and backward video feature
    sequences with two LSTMs and scores `num_anchors` proposals per time
    step with a weighted sigmoid cross-entropy loss. The captioning module
    selects proposals via `proposal_caption_fw`/`proposal_caption_bw`,
    attends over each proposal's feature sequence and predicts the caption
    word by word with a multi-layer LSTM.

    Returns:
        (inputs, outputs): `inputs` maps names to the placeholders created
        here; `outputs` maps 'proposal_fw_loss', 'proposal_bw_loss',
        'proposal_loss', 'caption_loss', 'loss', 'reg_loss' and
        'n_proposals' to their tensors.
    """
    # this line of code is just a message to inform that batch size should be set to 1 only
    batch_size = 1
    inputs = {}
    outputs = {}

    #******************** Define Proposal Module ******************#

    ## dim1: batch, dim2: video sequence length, dim3: video feature dimension
    ## video feature sequence
    # forward video feature sequence
    video_feat_fw = tf.placeholder(
        tf.float32, [None, None, self.options['video_feat_dim']],
        name='video_feat_fw')
    inputs['video_feat_fw'] = video_feat_fw

    # backward video feature sequence
    video_feat_bw = tf.placeholder(
        tf.float32, [None, None, self.options['video_feat_dim']],
        name='video_feat_bw')
    inputs['video_feat_bw'] = video_feat_bw

    ## proposal data, densely annotated, in forward direction
    proposal_fw = tf.placeholder(tf.int32,
                                 [None, None, self.options['num_anchors']],
                                 name='proposal_fw')
    inputs['proposal_fw'] = proposal_fw

    ## proposal data, densely annotated, in backward direction
    proposal_bw = tf.placeholder(tf.int32,
                                 [None, None, self.options['num_anchors']],
                                 name='proposal_bw')
    inputs['proposal_bw'] = proposal_bw

    ## proposal to feed into captioning module, i choose high tiou proposals for training captioning module, forward pass
    proposal_caption_fw = tf.placeholder(tf.int32, [None, None],
                                         name='proposal_caption_fw')
    inputs['proposal_caption_fw'] = proposal_caption_fw

    ## proposal to feed into captioning module, i choose high tiou proposals for training captioning module, backward pass
    proposal_caption_bw = tf.placeholder(tf.int32, [None, None],
                                         name='proposal_caption_bw')
    inputs['proposal_caption_bw'] = proposal_caption_bw

    ## weighting for positive/negative labels (solve imbalance data problem)
    proposal_weight = tf.placeholder(tf.float32,
                                     [self.options['num_anchors'], 2],
                                     name='proposal_weight')
    inputs['proposal_weight'] = proposal_weight

    rnn_cell_video_fw = tf.contrib.rnn.LSTMCell(
        num_units=self.options['rnn_size'],
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer())
    rnn_cell_video_bw = tf.contrib.rnn.LSTMCell(
        num_units=self.options['rnn_size'],
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer())

    if self.options['rnn_drop'] > 0:
        print('using dropout in rnn!')

    # The dropout rate is fed at run time; keep probability is 1 - rnn_drop.
    rnn_drop = tf.placeholder(tf.float32)
    inputs['rnn_drop'] = rnn_drop

    rnn_cell_video_fw = tf.contrib.rnn.DropoutWrapper(
        rnn_cell_video_fw,
        input_keep_prob=1.0 - rnn_drop,
        output_keep_prob=1.0 - rnn_drop)
    rnn_cell_video_bw = tf.contrib.rnn.DropoutWrapper(
        rnn_cell_video_bw,
        input_keep_prob=1.0 - rnn_drop,
        output_keep_prob=1.0 - rnn_drop)

    with tf.variable_scope('proposal_module') as proposal_scope:

        '''video feature sequence encoding: forward pass '''
        with tf.variable_scope('video_encoder_fw') as scope:
            #sequence_length = tf.reduce_sum(video_feat_mask, axis=-1)
            sequence_length = tf.expand_dims(tf.shape(video_feat_fw)[1],
                                             axis=0)
            initial_state = rnn_cell_video_fw.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            rnn_outputs_fw, _ = tf.nn.dynamic_rnn(
                cell=rnn_cell_video_fw,
                inputs=video_feat_fw,
                sequence_length=sequence_length,
                initial_state=initial_state,
                dtype=tf.float32)

        # Flatten to one row per time step, rnn_size columns.
        rnn_outputs_fw_reshape = tf.reshape(rnn_outputs_fw,
                                            [-1, self.options['rnn_size']],
                                            name='rnn_outputs_fw_reshape')

        # predict proposal at each time step: use fully connected layer to output scores for every anchors
        with tf.variable_scope('predict_proposal_fw') as scope:
            logit_output_fw = tf.contrib.layers.fully_connected(
                inputs=rnn_outputs_fw_reshape,
                num_outputs=self.options['num_anchors'],
                activation_fn=None)

        '''video feature sequence encoding: backward pass '''
        with tf.variable_scope('video_encoder_bw') as scope:
            #sequence_length = tf.reduce_sum(video_feat_mask, axis=-1)
            sequence_length = tf.expand_dims(tf.shape(video_feat_bw)[1],
                                             axis=0)
            initial_state = rnn_cell_video_bw.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            rnn_outputs_bw, _ = tf.nn.dynamic_rnn(
                cell=rnn_cell_video_bw,
                inputs=video_feat_bw,
                sequence_length=sequence_length,
                initial_state=initial_state,
                dtype=tf.float32)

        rnn_outputs_bw_reshape = tf.reshape(rnn_outputs_bw,
                                            [-1, self.options['rnn_size']],
                                            name='rnn_outputs_bw_reshape')

        # predict proposal at each time step: use fully connected layer to output scores for every anchors
        with tf.variable_scope('predict_proposal_bw') as scope:
            logit_output_bw = tf.contrib.layers.fully_connected(
                inputs=rnn_outputs_bw_reshape,
                num_outputs=self.options['num_anchors'],
                activation_fn=None)

    # calculate multi-label loss: use weighted binary cross entropy objective
    proposal_fw_reshape = tf.reshape(proposal_fw,
                                     [-1, self.options['num_anchors']],
                                     name='proposal_fw_reshape')
    proposal_fw_float = tf.to_float(proposal_fw_reshape)
    proposal_bw_reshape = tf.reshape(proposal_bw,
                                     [-1, self.options['num_anchors']],
                                     name='proposal_bw_reshape')
    proposal_bw_float = tf.to_float(proposal_bw_reshape)

    # weighting positive samples
    weight0 = tf.reshape(proposal_weight[:, 0],
                         [-1, self.options['num_anchors']])
    # weighting negative samples
    weight1 = tf.reshape(proposal_weight[:, 1],
                         [-1, self.options['num_anchors']])

    # tile weight batch_size times
    weight0 = tf.tile(weight0, [tf.shape(logit_output_fw)[0], 1])
    weight1 = tf.tile(weight1, [tf.shape(logit_output_fw)[0], 1])

    # get weighted sigmoid xentropy loss
    # NOTE(review): only weight0 is passed as pos_weight; weight1 is built
    # above but never used in this function — confirm this is intended.
    loss_term_fw = tf.nn.weighted_cross_entropy_with_logits(
        targets=proposal_fw_float,
        logits=logit_output_fw,
        pos_weight=weight0)
    loss_term_bw = tf.nn.weighted_cross_entropy_with_logits(
        targets=proposal_bw_float,
        logits=logit_output_bw,
        pos_weight=weight0)

    loss_term_fw_sum = tf.reduce_sum(loss_term_fw,
                                     axis=-1,
                                     name='loss_term_fw_sum')
    loss_term_bw_sum = tf.reduce_sum(loss_term_bw,
                                     axis=-1,
                                     name='loss_term_bw_sum')

    # Normalize by (number of anchors * sequence length).
    proposal_fw_loss = tf.reduce_sum(loss_term_fw_sum) / (
        float(self.options['num_anchors']) *
        tf.to_float(tf.shape(video_feat_fw)[1]))
    proposal_bw_loss = tf.reduce_sum(loss_term_bw_sum) / (
        float(self.options['num_anchors']) *
        tf.to_float(tf.shape(video_feat_bw)[1]))
    proposal_loss = (proposal_fw_loss + proposal_bw_loss) / 2.

    # summary data, for visualization using Tensorboard
    tf.summary.scalar('proposal_fw_loss', proposal_fw_loss)
    tf.summary.scalar('proposal_bw_loss', proposal_bw_loss)
    tf.summary.scalar('proposal_loss', proposal_loss)

    # outputs from proposal module
    outputs['proposal_fw_loss'] = proposal_fw_loss
    outputs['proposal_bw_loss'] = proposal_bw_loss
    outputs['proposal_loss'] = proposal_loss

    #*************** Define Captioning Module *****************#

    ## caption data: densely annotate sentences for each time step of a video, use mask data to mask out time steps when no caption should be output
    caption = tf.placeholder(tf.int32,
                             [None, None, self.options['caption_seq_len']],
                             name='caption')
    caption_mask = tf.placeholder(
        tf.int32, [None, None, self.options['caption_seq_len']],
        name='caption_mask')
    inputs['caption'] = caption
    inputs['caption_mask'] = caption_mask

    proposal_caption_fw_reshape = tf.reshape(
        proposal_caption_fw, [-1], name='proposal_caption_fw_reshape')
    proposal_caption_bw_reshape = tf.reshape(
        proposal_caption_bw, [-1], name='proposal_caption_bw_reshape')

    # use correct or 'nearly correct' proposal output as input to the captioning module
    boolean_mask = tf.greater(proposal_caption_fw_reshape,
                              0,
                              name='boolean_mask')

    # guarantee that at least one pos has True value
    # (if the mask is all False, its last entry is replaced by True)
    boolean_mask = tf.cond(
        tf.equal(tf.reduce_sum(tf.to_int32(boolean_mask)), 0),
        lambda: tf.concat([boolean_mask[:-1], tf.constant([True])], axis=-1),
        lambda: boolean_mask)

    # select input video state
    feat_len = tf.shape(video_feat_fw)[1]
    forward_indices = tf.boolean_mask(tf.range(feat_len), boolean_mask)
    event_feats_fw = tf.boolean_mask(rnn_outputs_fw_reshape, boolean_mask)
    backward_indices = tf.boolean_mask(proposal_caption_bw_reshape,
                                       boolean_mask)
    event_feats_bw = tf.gather_nd(
        rnn_outputs_bw_reshape, tf.expand_dims(backward_indices, axis=-1))

    # Map indices into the backward sequence onto forward positions.
    start_ids = feat_len - 1 - backward_indices
    end_ids = forward_indices

    event_c3d_seq, _ = self.get_c3d_seq(video_feat_fw[0], start_ids,
                                        end_ids,
                                        self.options['max_proposal_len'])
    # NOTE(review): context_feats_fw / context_feats_bw are not read again
    # in this function — confirm whether they are still needed.
    context_feats_fw = tf.gather_nd(rnn_outputs_fw_reshape,
                                    tf.expand_dims(start_ids, axis=-1))
    context_feats_bw = tf.gather_nd(
        rnn_outputs_bw_reshape,
        tf.expand_dims(feat_len - 1 - end_ids, axis=-1))

    # proposal feature sequences
    proposal_feats = event_c3d_seq

    # corresponding caption ground truth (batch size = 1)
    caption_proposed = tf.boolean_mask(caption[0],
                                       boolean_mask,
                                       name='caption_proposed')
    caption_mask_proposed = tf.boolean_mask(caption_mask[0],
                                            boolean_mask,
                                            name='caption_mask_proposed')

    # the number of proposal-caption pairs for training
    n_proposals = tf.shape(caption_proposed)[0]

    # NOTE(review): rnn_cell_caption is built and wrapped with dropout but
    # the caption loop below only calls multi_rnn_cell_caption — confirm
    # whether this cell is dead code.
    rnn_cell_caption = tf.contrib.rnn.LSTMCell(
        num_units=self.options['rnn_size'],
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer())

    rnn_cell_caption = tf.contrib.rnn.DropoutWrapper(
        rnn_cell_caption,
        input_keep_prob=1.0 - rnn_drop,
        output_keep_prob=1.0 - rnn_drop)

    def get_rnn_cell():
        return tf.contrib.rnn.LSTMCell(
            num_units=self.options['rnn_size'],
            state_is_tuple=True,
            initializer=tf.orthogonal_initializer())

    # multi-layer LSTM
    multi_rnn_cell_caption = tf.contrib.rnn.MultiRNNCell(
        [get_rnn_cell() for _ in range(self.options['num_rnn_layers'])],
        state_is_tuple=True)

    caption_loss = 0
    with tf.variable_scope('caption_module') as caption_scope:

        # From here on batch_size is the (dynamic) number of proposals.
        batch_size = n_proposals

        # initialize memory cell and hidden output, note that the returned state is a tuple containing all states for each cell in MultiRNNCell
        state = multi_rnn_cell_caption.zero_state(batch_size=batch_size,
                                                  dtype=tf.float32)

        proposal_feats_reshape = tf.reshape(
            proposal_feats, [-1, self.options['video_feat_dim']],
            name='proposal_feats_reshape')

        event_hidden_feats = tf.concat([event_feats_fw, event_feats_bw],
                                       axis=-1)

        # Repeat each proposal's hidden features once per proposal step so
        # they line up row-for-row with proposal_feats_reshape.
        event_hidden_feats_tile = tf.tile(
            event_hidden_feats, [1, self.options['max_proposal_len']])
        event_hidden_feats_reshape = tf.reshape(
            event_hidden_feats_tile, [-1, 2 * self.options['rnn_size']])

        ''' The caption data should be prepared in equal length, namely, with length of 'caption_seq_len' ## use caption mask data to mask out loss from sequence after end of token (<END>) Only the first loop create variable, the other loops reuse them '''
        for i in range(self.options['caption_seq_len'] - 1):

            if i > 0:
                caption_scope.reuse_variables()

            # word embedding
            word_embed = self.build_caption_embedding(caption_proposed[:, i])

            # calculate attention over proposal feature elements
            # state[:, 1] return all hidden states for all cells in MultiRNNCell
            h_state = tf.concat([s[1] for s in state], axis=-1)
            h_state_tile = tf.tile(h_state,
                                   [1, self.options['max_proposal_len']])
            h_state_reshape = tf.reshape(h_state_tile, [
                -1, self.options['num_rnn_layers'] * self.options['rnn_size']
            ])

            feat_state_concat = tf.concat([
                proposal_feats_reshape, h_state_reshape,
                event_hidden_feats_reshape
            ],
                                          axis=-1,
                                          name='feat_state_concat')
            #feat_state_concat = tf.concat([tf.reshape(tf.tile(word_embed, [1, self.options['max_proposal_len']]), [-1, self.options['word_embed_size']]), proposal_feats_reshape, h_state_reshape, event_hidden_feats_reshape], axis=-1, name='feat_state_concat')

            # use a two-layer network to model attention over video feature sequence when predicting next word (dynamic)
            with tf.variable_scope('attention') as attention_scope:
                attention_layer1 = tf.contrib.layers.fully_connected(
                    inputs=feat_state_concat,
                    num_outputs=self.options['attention_hidden_size'],
                    activation_fn=tf.nn.tanh,
                    weights_initializer=tf.contrib.layers.xavier_initializer())
                attention_layer2 = tf.contrib.layers.fully_connected(
                    inputs=attention_layer1,
                    num_outputs=1,
                    activation_fn=None,
                    weights_initializer=tf.contrib.layers.xavier_initializer())

            # reshape to match
            attention_reshape = tf.reshape(
                attention_layer2, [-1, self.options['max_proposal_len']],
                name='attention_reshape')
            attention_score = tf.nn.softmax(attention_reshape,
                                            dim=-1,
                                            name='attention_score')
            attention = tf.reshape(
                attention_score, [-1, 1, self.options['max_proposal_len']],
                name='attention')

            # attended video feature
            attended_proposal_feat = tf.matmul(
                attention, proposal_feats, name='attended_proposal_feat')
            attended_proposal_feat_reshape = tf.reshape(
                attended_proposal_feat, [-1, self.options['video_feat_dim']],
                name='attended_proposal_feat_reshape')

            if self.options['no_context']:
                proposal_feats_full = attended_proposal_feat_reshape
            else:
                if self.options['context_gating']:
                    # model a gate to weight each element of context and feature
                    attended_proposal_feat_reshape = tf.nn.tanh(
                        attended_proposal_feat_reshape)
                    with tf.variable_scope('context_gating'):
                        ''' context_feats_transform = tf.contrib.layers.fully_connected( inputs=event_hidden_feats, num_outputs=self.options['video_feat_dim'], activation_fn=None, weights_initializer=tf.contrib.layers.xavier_initializer() ) '''
                        context_feats_transform = event_hidden_feats

                        proposal_feats_transform = tf.contrib.layers.fully_connected(
                            inputs=attended_proposal_feat_reshape,
                            num_outputs=2 * self.options['rnn_size'],
                            activation_fn=tf.nn.tanh,
                            weights_initializer=tf.contrib.layers.xavier_initializer())

                        # context gating
                        gate = tf.contrib.layers.fully_connected(
                            inputs=tf.concat([
                                word_embed, h_state, context_feats_transform,
                                proposal_feats_transform
                            ],
                                             axis=-1),
                            num_outputs=2 * self.options['rnn_size'],
                            activation_fn=tf.nn.sigmoid,
                            weights_initializer=tf.contrib.layers.xavier_initializer())
                        # The gate weights context features; its complement
                        # weights the attended proposal features.
                        gated_context_feats = tf.multiply(
                            context_feats_transform, gate)
                        gated_proposal_feats = tf.multiply(
                            proposal_feats_transform, 1. - gate)
                        proposal_feats_full = tf.concat(
                            [gated_context_feats, gated_proposal_feats],
                            axis=-1)
                else:
                    proposal_feats_full = tf.concat([
                        event_hidden_feats, attended_proposal_feat_reshape
                    ],
                                                    axis=-1)

            # proposal feature embedded into word space
            proposal_feat_embed = self.build_video_feat_embedding(
                proposal_feats_full)

            # get next state
            caption_output, state = multi_rnn_cell_caption(
                tf.concat([proposal_feat_embed, word_embed], axis=-1), state)

            # predict next word
            with tf.variable_scope('logits') as logits_scope:
                logits = tf.contrib.layers.fully_connected(
                    inputs=caption_output,
                    num_outputs=self.options['vocab_size'],
                    activation_fn=None)

            labels = caption_proposed[:, i + 1]  # predict next word

            # loss term
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=labels)
            output_mask = tf.to_float(caption_mask_proposed[:, i])
            loss = tf.reduce_sum(tf.multiply(loss, output_mask))

            caption_loss = caption_loss + loss

    # mean loss for each word
    # (the +1 in the denominator keeps the division away from zero)
    caption_loss = caption_loss / (tf.to_float(batch_size) * tf.to_float(
        tf.reduce_sum(caption_mask_proposed)) + 1)

    tf.summary.scalar('caption_loss', caption_loss)

    # L2 penalty over all trainable variables except the word embedding.
    # NOTE(review): reg_loss is returned but not added to total_loss here.
    reg_loss = tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if not v.name.startswith('caption_module/word_embed')
    ])
    total_loss = self.options[
        'weight_proposal'] * proposal_loss + self.options[
            'weight_caption'] * caption_loss
    tf.summary.scalar('total_loss', total_loss)

    outputs['caption_loss'] = caption_loss
    outputs['loss'] = total_loss
    outputs['reg_loss'] = reg_loss
    outputs['n_proposals'] = n_proposals

    return inputs, outputs
def conv(inputs, nf, ks, strides, gain=1.0):
    """Apply a ReLU 2-D convolution whose kernel is orthogonally initialised.

    Args:
        inputs: tensor forwarded to ``tf.layers.conv2d``.
        nf: number of filters.
        ks: kernel size.
        strides: single stride value, used for both entries of the stride pair.
        gain: scaling factor handed to ``tf.orthogonal_initializer``.

    Returns:
        The tensor produced by ``tf.layers.conv2d``.
    """
    kernel_init = tf.orthogonal_initializer(gain=gain)
    # ``layer_count`` is not a parameter; it is resolved from an enclosing
    # scope and picks the layer name. ``reuse=tf.AUTO_REUSE`` lets repeated
    # calls with the same name share variables.
    layer_name = "enc_net_layer%s" % (layer_count)
    stride_pair = (strides, strides)
    return tf.layers.conv2d(
        inputs=inputs,
        filters=nf,
        kernel_size=ks,
        strides=stride_pair,
        activation=tf.nn.relu,
        kernel_initializer=kernel_init,
        name=layer_name,
        reuse=tf.AUTO_REUSE)
def weight_variable(shape, name, init_method=None, dtype=tf.float32,
                    init_para=None, seed=1234, trainable=True):
    """Create (or fetch) a TensorFlow variable with the requested initialization.

    Args:
        shape: list of int, shape of the weights.
        name: name passed to ``tf.get_variable``.
        init_method: one of ``None``/``'zero'``, ``'normc'``, ``'normal'``,
            ``'truncated_normal'``, ``'uniform'``, ``'constant'``,
            ``'xavier'`` or ``'orthogonal'``.
        dtype: dtype handed to the initializer.
        init_para: dict of method-specific parameters. Keys read here:
            ``'stddev'`` (normc); ``'mean'`` and ``'stddev'`` (normal,
            truncated_normal); ``'minval'`` and ``'maxval'`` (uniform);
            ``'val'`` (constant); ``'uniform'`` (xavier). Not needed for
            zero / orthogonal initialization.
        seed: random seed for the stochastic initializers.
        trainable: forwarded to ``tf.get_variable``.

    Returns:
        The TensorFlow variable.

    Raises:
        ValueError: if ``init_method`` is not one of the supported names.
    """
    # The zero branch must be part of the same if/elif chain: as a separate
    # ``if`` it fell through to the final ``else`` and always raised.
    if init_method is None or init_method == 'zero':
        # tf.zeros_initializer is constructed from a dtype only; the shape is
        # supplied when the initializer is called below.
        initializer = tf.zeros_initializer(dtype=dtype)
    elif init_method == "normc":
        # normc_initializer (defined elsewhere) is called with the shape and
        # its result is used directly as the initial value.
        var = normc_initializer(shape, stddev=init_para['stddev'],
                                seed=seed, dtype=dtype)
        return tf.get_variable(initializer=var, name=name,
                               trainable=trainable)
    elif init_method == "normal":
        initializer = tf.random_normal_initializer(
            mean=init_para["mean"], stddev=init_para["stddev"],
            seed=seed, dtype=dtype)
    elif init_method == "truncated_normal":
        initializer = tf.truncated_normal_initializer(
            mean=init_para["mean"], stddev=init_para["stddev"],
            seed=seed, dtype=dtype)
    elif init_method == "uniform":
        initializer = tf.random_uniform_initializer(
            minval=init_para["minval"], maxval=init_para["maxval"],
            seed=seed, dtype=dtype)
    elif init_method == "constant":
        initializer = tf.constant_initializer(value=init_para["val"],
                                              dtype=dtype)
    elif init_method == "xavier":
        initializer = tf.contrib.layers.xavier_initializer(
            uniform=init_para['uniform'], seed=seed, dtype=dtype)
    elif init_method == 'orthogonal':
        initializer = tf.orthogonal_initializer(gain=1.0, seed=seed,
                                                dtype=dtype)
    else:
        raise ValueError("Unsupported initialization method!")

    # Materialise the initial value so get_variable infers the shape from it.
    var = tf.get_variable(initializer=initializer(shape), name=name,
                          trainable=trainable)
    return var
def lstm_cell():
    """Build one LSTMCell of ``n_hidden`` units with an orthogonal initializer.

    ``n_hidden`` and ``LSTMCell`` are resolved from the enclosing scope.
    """
    cell = LSTMCell(n_hidden, initializer=tf.orthogonal_initializer())
    return cell
def lstm_cell(self):
    """Return a new LSTM cell of ``self.cell_size`` units.

    The cell is given an orthogonal initializer.
    """
    new_cell = tf.nn.rnn_cell.LSTMCell(
        self.cell_size,
        initializer=tf.orthogonal_initializer())
    return new_cell
def train_rnn(monkey, beta0=0.0, beta1=0.0, beta2=0.0, stddev_state=0.0,
              stddev_out=0.0, activation='tanh', rnn_init='default',
              num_neurons=100, learning_rate=0.0001, num_iters=2000,
              save_model_path='./saves/', tb_path='./tensorboard/',
              load_prev=False, load_model_path=None):
    """Fit a noisy vanilla RNN with a linear readout to EMG data.

    Args:
        monkey: 'D' or 'C'; selects which .mat file is loaded.
        beta0: weight of the penalty on the squared RNN outputs (states).
        beta1: regularization hyperparameter for l2_loss(A).
        beta2: regularization hyperparameter for l2_loss(C).
        stddev_state: stddev of injected noise in state variable.
        stddev_out: scale of the noise added to the target outputs (it is
            multiplied by the variance of the padded data array).
        activation: nonlinearity for the RNN: 'tanh', 'linear', 'softplus',
            or any other value, which is handed to the cell unchanged (e.g. a
            callable such as ``lambda x: x``).
        rnn_init: 'orth', 'xavier', 'normal', or 'default'/None to keep the
            cell's own initializer.
        num_neurons: state dimension.
        learning_rate: learning rate for Adam.
        num_iters: training iterations.
        save_model_path: where to save the TF model using tf.train.Saver().
        tb_path: tensorboard path.
        load_prev: whether or not to load the previous TF variables.
        load_model_path: if load_prev=True, where to load the previous model.
            The restore is skipped when no checkpoint is found there.

    Returns:
        (y_tf, x_tf): the evaluated readout ``Y_hat`` and the RNN outputs,
        computed after training with the last feed dict.

    Raises:
        ValueError: for an unknown ``monkey`` or ``rnn_init``.
    """
    # TODO: just load *_preprocessed.mat data.
    if monkey == 'D':
        mat_name = 'drakeFeb.mat'
    elif monkey == 'C':
        mat_name = 'cousFeb.mat'
    else:
        raise ValueError("monkey must be 'D' or 'C', got %r" % (monkey,))
    try:
        # TODO: fix, '../' or './' depending on whether running from wrapper or not
        data = sio.loadmat('./' + mat_name)
    except IOError:
        data = sio.loadmat('../' + mat_name)

    # Set activation (unrecognised values, e.g. callables, pass through)
    if activation == 'tanh':
        activation = tf.tanh
    elif activation == 'linear':
        activation = tf.identity
    elif activation == 'softplus':
        activation = tf.nn.softplus

    # Preprocess data
    # NOTE(review): the key 'D' is used for both monkeys -- confirm that the
    # 'C' file stores its struct under the same name.
    emg = preprocess_array(data['D'][0, 0]['EMG'])
    time_axis, time_inds1, time_inds2 = get_time_axis(data['D'][0, 0]['KIN'])
    y_data1 = emg[time_axis]
    p = y_data1.shape[-1]

    # Build inputs
    m = 2
    u_data1 = create_input_array(y_data1.shape)

    # Augmented data
    # For regularizing the network -- it must fit actual and augmented data
    period = int(np.round(np.diff(time_inds2).mean()))
    y_cat1 = augmented_data(emg, time_inds1, period=period, tiles=10)
    y_cat1 = y_cat1[::25]
    y_cat2 = augmented_data(emg, time_inds2, period=period, tiles=10)
    y_cat2 = y_cat2[::25]
    u_cat1 = create_input_array(y_cat1.shape)
    u_cat2 = create_input_array(y_cat2.shape)

    # Three groups of four batch columns each, zero-padded to the longest one.
    sequence_length = [y_data1.shape[0], y_cat1.shape[0], y_cat2.shape[0]]
    y_data = np.zeros((np.max(sequence_length), 4*3, p))
    u_data = np.zeros((np.max(sequence_length), 4*3, m))
    y_data[:sequence_length[0], 0:4, :] = y_data1
    y_data[:sequence_length[1], 4:8, :] = y_cat1
    y_data[:sequence_length[2], 8:12, :] = y_cat2
    u_data[:sequence_length[0], 0:4, :] = u_data1
    u_data[:sequence_length[1], 4:8, :] = u_cat1
    u_data[:sequence_length[2], 8:12, :] = u_cat2
    total_data_points = np.sum([v*4 for v in sequence_length])

    # Tensorflow graph
    tf.reset_default_graph()
    #tf.set_random_seed(1234)
    n = num_neurons
    batch_size = y_data.shape[1]
    x0 = tf.Variable(tf.random_normal([batch_size, n], stddev=0.1), name='x0')
    C = tf.get_variable('C', shape=[n, p],
                        initializer=tf.contrib.layers.xavier_initializer())
    d = tf.get_variable('d', shape=[1, p],
                        initializer=tf.constant_initializer(0))
    U = tf.placeholder(tf.float32, [u_data.shape[0], batch_size, m], name='U')
    Y = tf.placeholder(tf.float32, [y_data.shape[0], batch_size, p], name='Y')
    noise_state = tf.placeholder(tf.float32, name='stddev_state')
    time_steps = tf.shape(U)[0]

    # set initializer for rnn matrix
    if rnn_init == 'orth':
        rnn_initializer = tf.orthogonal_initializer(0.95)
    elif rnn_init == 'xavier':
        rnn_initializer = tf.contrib.layers.xavier_initializer()
    elif rnn_init == 'normal':
        # The first positional argument of random_normal_initializer is the
        # mean, so the 1/sqrt(n) scale has to be passed as stddev explicitly.
        rnn_initializer = tf.random_normal_initializer(stddev=1/np.sqrt(n))
    elif rnn_init == 'default' or rnn_init is None:
        rnn_initializer = None
    else:
        raise ValueError("Unsupported rnn_init: %r" % (rnn_init,))

    # get a tf var scope to set the rnn initializer.
    if rnn_initializer is not None:
        with tf.variable_scope('RNN', initializer=rnn_initializer) as scope:
            pass
    else:
        scope = None

    #cell = tf.nn.rnn_cell.BasicRNNCell(n, activation=activation)
    cell = BasicRNNCellNoise(n, activation=activation, stddev=noise_state)
    output, state = tf.nn.dynamic_rnn(
        cell, U,
        sequence_length=(4*[sequence_length[0]] + 4*[sequence_length[1]]
                         + 4*[sequence_length[2]]),
        initial_state=x0, dtype=tf.float32, time_major=True, scope=scope)

    Y_hat = tf.reshape(output, (time_steps*batch_size, n))
    Y_hat = tf.matmul(Y_hat, C) + d
    Y_hat = tf.reshape(Y_hat, (time_steps, batch_size, p), name='Y_hat')

    # Get RNN variables
    with tf.variable_scope('RNN/BasicRNNCellNoise/Linear', reuse=True):
        # note: calling an initializer here will not give it a new one.
        Mat = tf.get_variable('Matrix')
        # Rows 0..m-1 of the fused matrix are gathered as B, rows m..m+n-1 as A.
        A = tf.gather(tf.get_variable('Matrix'), range(m, m+n))
        B = tf.gather(tf.get_variable('Matrix'), range(0, m))
        b = tf.get_variable('Bias')

    # Training ops
    # take L2 loss only over data points. note that dynamic_rnn zeros out
    # output, but not y_hat because we have the bias vector d
    cost_term0 = tf.reduce_sum((output[:sequence_length[0], :4, :])**2)
    cost_term0 += tf.reduce_sum((output[:sequence_length[1], 4:8, :])**2)
    cost_term0 += tf.reduce_sum((output[:sequence_length[2], 8:, :])**2)
    cost_term0 = beta0*0.5*cost_term0/total_data_points

    cost_term1 = tf.reduce_sum((Y_hat[:sequence_length[0], :4, :]
                                - Y[:sequence_length[0], :4, :])**2)
    cost_term1 += tf.reduce_sum((Y_hat[:sequence_length[1], 4:8, :]
                                 - Y[:sequence_length[1], 4:8, :])**2)
    cost_term1 += tf.reduce_sum((Y_hat[:sequence_length[2], 8:, :]
                                 - Y[:sequence_length[2], 8:, :])**2)
    cost_term1 = 0.5*cost_term1/total_data_points

    cost_term2 = beta1*tf.nn.l2_loss(A)
    cost_term3 = beta2*tf.nn.l2_loss(C)
    cost = cost_term0 + cost_term1 + cost_term2 + cost_term3

    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate)
    gvs = train_op.compute_gradients(cost)
    # var.name = 'namescope/V:0' and we want just 'V'
    sg = [tf.summary.scalar('norm_grad'+var.name[:-2], 2*tf.nn.l2_loss(grad))
          for grad, var in gvs]
    clipped_gvs = [(tf.clip_by_norm(grad, 100000.), var) for grad, var in gvs]
    sg_clip = [tf.summary.scalar('norm_grad_clipped'+var.name[:-2],
                                 2*tf.nn.l2_loss(grad))
               for grad, var in clipped_gvs]
    opt_op = train_op.apply_gradients(clipped_gvs)

    # Summary ops
    tf.summary.scalar('log_loss', tf.log(cost))
    tf.summary.scalar('log_cost0', tf.log(cost_term0))
    tf.summary.scalar('log_cost1', tf.log(cost_term1))
    tf.summary.scalar('log_cost2', tf.log(cost_term2))
    tf.summary.scalar('log_cost3', tf.log(cost_term3))
    merged_summary_op = tf.summary.merge_all()

    # Saver ops
    saver = tf.train.Saver()

    def make_feed_dict():
        # Fresh output noise is drawn on every call.
        noisy_y = (y_data
                   + np.random.randn(*y_data.shape)*y_data.var()*stddev_out)
        return {Y: noisy_y, U: u_data, noise_state: stddev_state}

    # Train
    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter(tb_path)
        sess.run(tf.global_variables_initializer())
        # Checkpoints written by newer Savers consist of '<path>.index' etc.
        # rather than a single file at '<path>', so probe for both layouts.
        if load_prev and load_model_path is not None and (
                os.path.exists(load_model_path)
                or os.path.exists(load_model_path + '.index')):
            saver.restore(sess, load_model_path)

        loss_val = float('nan')  # reported as-is when num_iters == 0
        feed_dict = None
        for i in range(num_iters):
            feed_dict = make_feed_dict()
            _, loss_val, summary_str = sess.run(
                [opt_op, cost, merged_summary_op], feed_dict=feed_dict)
            if i % 200 == 0:
                summary_writer.add_summary(summary_str, i)
            if i % 1000 == 0:
                # Single-argument print: same text under Python 2 and 3.
                print(' iter: {:04d}  Loss: {:.6f}'.format(i, loss_val))
        print(' iter: {:04d}  Loss: {:.6f}'.format(num_iters, loss_val))
        saver.save(sess, save_model_path)
        print(' Finished')

        # Simulate (reuses the last training feed, including its noise)
        if feed_dict is None:
            feed_dict = make_feed_dict()
        y_tf, x_tf = sess.run([Y_hat, output], feed_dict=feed_dict)
        summary_writer.close()

    return y_tf, x_tf
def _attention_step(self, doc):
    """Build the attention graph that turns one document into one embedding.

    The graph runs four multi-head attention stages in sequence: word
    self-attention, word target-attention (learned query ``word_Q``),
    sentence self-attention and sentence target-attention (learned query
    ``sent_Q``).

    Args:
        doc: 2-D tensor indexed as ``doc[line, word]``; zero entries are not
            counted as words (presumably integer word ids with 0 as padding
            -- confirm against the caller).

    Returns:
        The document embedding after the final dropout and squeeze.
    """
    n_heads = self.attention_heads
    attn_width = self.attention_size

    def _project(sequence):
        # Width-1 convolution producing ``attn_width`` channels.
        return tf.layers.conv1d(
            sequence, attn_width, 1, padding='same',
            activation=self.activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer())

    def _split_heads(tensor):
        # Cut the last axis into heads and stack the heads on axis 0.
        return tf.concat(tf.split(tensor, n_heads, axis=2), axis=0)

    def _merge_heads(tensor):
        # Inverse of _split_heads.
        return tf.concat(tf.split(tensor, n_heads, axis=0), axis=2)

    def _zero_where(gate, tensor):
        # Zero every position of ``tensor`` where ``gate`` is 0.
        return tf.where(tf.equal(gate, 0), tf.zeros_like(tensor), tensor)

    def _scaled_scores(query_h, key_h):
        raw = tf.matmul(query_h, tf.transpose(key_h, [0, 2, 1]))
        return raw / (key_h.get_shape().as_list()[-1]**0.5)

    def _suppress_zero_scores(scores):
        # Scores that are exactly 0 are replaced by -1000 before the softmax.
        return tf.where(tf.equal(scores, 0),
                        tf.ones_like(scores) * -1000, scores)

    # Trim the document to its non-empty lines and the longest line length.
    tokens_per_row = tf.math.count_nonzero(doc, 1)
    row_count = tf.math.count_nonzero(tokens_per_row)
    longest_row = tf.reduce_max(tokens_per_row)
    trimmed_doc = doc[:row_count, :longest_row]
    row_lengths = tokens_per_row[:row_count]

    # word embeddings
    embedding_table = tf.get_variable(
        'embeddings', initializer=self.embedding_matrix, dtype=tf.float32)
    word_embeds = tf.gather(embedding_table, trimmed_doc)
    word_embeds = tf.nn.dropout(word_embeds, self.dropout)

    # masking
    valid = tf.cast(tf.sequence_mask(row_lengths, longest_row), tf.float32)
    feature_mask = tf.tile(tf.expand_dims(valid, 2), [1, 1, attn_width])
    pair_mask = tf.tile(tf.expand_dims(valid, 2),
                        [n_heads, 1, longest_row])

    # word self attention
    query = _project(word_embeds)
    key = _project(word_embeds)
    value = _project(word_embeds)
    query = _zero_where(feature_mask, query)
    key = _zero_where(feature_mask, key)
    value = _zero_where(feature_mask, value)
    query_h = _split_heads(query)
    key_h = _split_heads(key)
    value_h = _split_heads(value)
    scores = _suppress_zero_scores(_scaled_scores(query_h, key_h))
    weights = tf.nn.dropout(tf.nn.softmax(scores), self.dropout)
    word_self = _zero_where(pair_mask, weights)
    attended = _merge_heads(tf.matmul(word_self, value_h))
    attended = _zero_where(feature_mask, attended)

    # word target attention
    word_query = tf.get_variable('word_Q', (1, 1, attn_width), tf.float32,
                                 tf.orthogonal_initializer())
    word_query = tf.tile(word_query, [row_count, 1, 1])
    query_h = _split_heads(word_query)
    key_h = _split_heads(attended)
    value_h = _split_heads(attended)
    scores = _suppress_zero_scores(_scaled_scores(query_h, key_h))
    word_target = tf.nn.dropout(tf.nn.softmax(scores), self.dropout)
    attended = _merge_heads(tf.matmul(word_target, value_h))
    sent_embeds = tf.transpose(attended, [1, 0, 2])
    sent_embeds = tf.nn.dropout(sent_embeds, self.dropout)

    # sent self attention
    query = _project(sent_embeds)
    key = _project(sent_embeds)
    value = _project(sent_embeds)
    query_h = _split_heads(query)
    key_h = _split_heads(key)
    value_h = _split_heads(value)
    scores = _scaled_scores(query_h, key_h)
    sent_self = tf.nn.dropout(tf.nn.softmax(scores), self.dropout)
    attended = _merge_heads(tf.matmul(sent_self, value_h))

    # sent target attention
    sent_query = tf.get_variable('sent_Q', (1, 1, attn_width), tf.float32,
                                 tf.orthogonal_initializer())
    query_h = _split_heads(sent_query)
    key_h = _split_heads(attended)
    value_h = _split_heads(attended)
    scores = _scaled_scores(query_h, key_h)
    sent_target = tf.nn.dropout(tf.nn.softmax(scores), self.dropout)
    attended = _merge_heads(tf.matmul(sent_target, value_h))

    # Drop the two leading singleton axes to leave the document vector.
    doc_embed = tf.nn.dropout(tf.squeeze(attended, [0]), self.dropout)
    return tf.squeeze(doc_embed, [0])
def bulid_train(self, network, value_network=None):
    """Create the training placeholders, losses and optimizer ops.

    Builds a clipped-ratio actor loss, a clipped critic loss, an intrinsic
    critic loss, an entropy bonus and a separate RND predictor loss, then
    the two Adam training ops ``self.train`` and ``self.train_rnd``.

    Args:
        network: features feeding the value head when ``value_network`` is
            not given.
        value_network: optional separate features for the value head.

    Reads attributes set elsewhere on the instance: ``value_in``,
    ``action``, ``actions``, ``epsilon``, ``beta``, ``beta2``,
    ``target_network``, ``predictor_network``, ``name``, ``max_grad_norm``
    and ``rnd_lr``.
    """
    self.advantage = tf.placeholder(tf.float32, [None], name="Advantage")
    self.old_value = tf.placeholder(tf.float32, [None], name="Old_value")
    self.returns = tf.placeholder(tf.float32, [None], name="Returns")
    self.returns_in = tf.placeholder(tf.float32, [None],
                                     name="Returns_intrinsic")
    self.prevneglogp = tf.placeholder(tf.float32, [None], name="Old_pi_a")
    self.lr = tf.placeholder(tf.float32, [], name="Learning_rate")

    # Identity test instead of ``== None``: the argument may be a tensor or
    # an array, for which ``==`` is not a reliable None check.
    value_input = network if value_network is None else value_network
    self.value = tf.layers.dense(
        value_input, 1,
        kernel_initializer=tf.orthogonal_initializer(),
        name="Value")
    # Keep only column 0 of each value head.
    self.value = self.value[:, 0]
    # NOTE(review): self.value_in is not created in this method; it must
    # already exist on the instance before this is called.
    self.value_in = self.value_in[:, 0]

    with tf.variable_scope('Actor_loss'):
        pi_a = self.action.neglogp(self.actions)
        ratio = tf.exp(self.prevneglogp - pi_a)
        actor_loss = ratio * -self.advantage
        clipped_loss = tf.clip_by_value(
            ratio, 1 - self.epsilon, 1 + self.epsilon) * -self.advantage
        # Element-wise maximum of the unclipped and clipped losses.
        self.actor_loss = tf.reduce_mean(
            tf.maximum(actor_loss, clipped_loss))
        # Fraction of samples whose ratio left the clipping interval.
        self.clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.epsilon)))

    with tf.variable_scope('Entropy'):
        self.entropy = tf.reduce_mean(self.action.entropy())

    with tf.variable_scope('Critic_loss'):
        critic_loss1 = tf.squared_difference(self.returns, self.value)
        critic_loss2 = tf.squared_difference(
            self.returns,
            self.old_value + tf.clip_by_value(
                self.value - self.old_value, -self.epsilon, self.epsilon))
        self.vclipfrac = tf.reduce_mean(
            tf.to_float(
                tf.greater(tf.abs(self.value - self.old_value),
                           self.epsilon)))
        self.critic_loss = tf.reduce_mean(
            tf.maximum(critic_loss1, critic_loss2)) * 0.5
        self.critic_in_loss = tf.reduce_mean(
            tf.squared_difference(self.returns_in, self.value_in)) * 0.5

    with tf.variable_scope('RND_loss'):
        # Gradients are blocked on the target side only.
        self.rnd_loss = tf.reduce_mean(
            tf.square(
                tf.stop_gradient(self.target_network)
                - self.predictor_network))

    with tf.variable_scope('Total_loss'):
        self.loss = self.actor_loss - self.entropy * self.beta2 + (
            self.critic_loss + self.critic_in_loss) * self.beta

    params = tf.trainable_variables(self.name)
    with tf.variable_scope('train'):
        trainer = tf.train.AdamOptimizer(learning_rate=self.lr,
                                         epsilon=1e-5)
        grads_and_var = trainer.compute_gradients(self.loss, params)
        grads, var = zip(*grads_and_var)
        if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        grads_and_var = list(zip(grads, var))
        self.train = trainer.apply_gradients(grads_and_var)

    with tf.variable_scope('train_rnd'):
        self.train_rnd = tf.train.AdamOptimizer(
            learning_rate=self.rnd_lr).minimize(self.rnd_loss)
def __init__(self, v_lr, pi_lr, model_dir, delta=1e-3):
    """Build the value/policy graph, open a session and load a checkpoint.

    Args:
        v_lr: learning rate of the Adam optimizer for the value network.
        pi_lr: learning rate of the Adam optimizer used for the policy.
        model_dir: directory handed to ``load`` (defined elsewhere).
        delta: constant used in the natural-gradient step-size factor
            ``sqrt(2 * delta / (grad . conj + EPSILON))``.
    """
    self.state = tf.placeholder(tf.float32, [None, 10], name='state')
    self.action = tf.placeholder(tf.float32, [None, 1], name='action')
    self.reward = tf.placeholder(tf.float32, [None, 1], name='reward')

    # Advantage function definition
    print(' [*] Building advantage function...')
    kwargs = {'kernel_initializer': tf.orthogonal_initializer()}
    with tf.variable_scope('value'):
        h1 = tf.layers.dense(self.state, 128, activation=tf.nn.relu, name='h1', **kwargs)
        self.value = tf.layers.dense(h1, 1, activation=None, name='value', **kwargs)
        # Advantage is the fed reward minus the predicted value; the value
        # loss is its mean square.
        self.advantage = self.reward - self.value
        self.v_loss = tf.reduce_mean(tf.square(self.advantage))
    v_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='value')
    self.v_train = tf.train.AdamOptimizer(v_lr).minimize(self.v_loss, var_list=v_vars)

    # Policy function definition
    print(' [*] Building policy function...')
    self.policy, pi_vars = build_gaussian_network(self.state, 1, scope='policy')
    # NOTE(review): the "old" policy is built in the same scope with
    # reuse=True, so it presumably shares its variables with self.policy
    # (the assign op below is commented out) -- confirm this is intended.
    old_policy, old_vars = build_gaussian_network(self.state, 1, scope='policy', trainable=False, reuse=True)
    with tf.name_scope('policy_ops'):
        # self.assign_op = [old.assign(new) for old, new in zip(old_vars, pi_vars)]
        self.sample_op = self.policy.sample(1)
    with tf.name_scope('surrogate_loss'):
        ratio = self.policy.prob(self.action) / old_policy.prob(
            self.action)
        surrogate = ratio * self.advantage
        self.pi_loss = -tf.reduce_mean(surrogate)

    # Convert Adam gradient to natural gradient
    print(' [*] Building natural gradient...')
    with tf.variable_scope('policy_optim'):
        kl = tf.distributions.kl_divergence(old_policy, self.policy)
        optim = tf.train.AdamOptimizer(pi_lr)
        # NOTE(review): gradients are taken of ``surrogate`` itself, not of
        # self.pi_loss (its negated mean) -- verify the intended sign/scale.
        pi_grads_and_vars = optim.compute_gradients(surrogate, var_list=pi_vars)
        pi_grads = [pair[0] for pair in pi_grads_and_vars]
        kl_grads = tf.gradients(kl, pi_vars)
        conj_grads = []
        for grad, kl_grad, var in zip(pi_grads, kl_grads, pi_vars):
            # build_conjugate_gradient and EPSILON are defined elsewhere.
            conj = build_conjugate_gradient(grad, kl_grad, var)
            # Scale the conjugate direction by the delta-based step size.
            nat_grad = tf.sqrt(
                (2.0 * delta) / (tf.reduce_sum(grad * conj) + EPSILON)) * conj
            conj_grads.append((nat_grad, var))
        self.pi_train = optim.apply_gradients(conj_grads)

    # Summaries definition
    print(' [*] Building summaries...')
    # Reads the distribution's private ``_scale`` attribute.
    model_variance = tf.reduce_mean(self.policy._scale)
    self.sums = tf.summary.merge([
        tf.summary.scalar('max_rewards', tf.reduce_max(self.reward)),
        tf.summary.scalar('mean_advantage', tf.reduce_mean(
            self.advantage)),
        tf.summary.scalar('pi_loss', self.pi_loss),
        tf.summary.scalar('v_loss', self.v_loss),
        tf.summary.scalar('model_variance', model_variance)
    ], name='summaries')

    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())
    print(' [*] Model built finished')
    # Only the second element returned by ``load`` is kept, as self.counter.
    _, self.counter = load(self.sess, model_dir)
def lstm_cell(self, reuse=False):
    """Return an LSTM cell of ``self.rnn_size`` units.

    The cell is given an orthogonal initializer.

    Args:
        reuse: forwarded to ``LSTMCell`` unchanged.
    """
    cell_options = {
        'initializer': tf.orthogonal_initializer(),
        'reuse': reuse,
    }
    return tf.nn.rnn_cell.LSTMCell(self.rnn_size, **cell_options)
def multi_hop_match(aware_repr, answer_repr, nb_hops, rnn_dim, attention_dim,
                    scope_name, ans_max_len, ans_lens, l2_reg):
    """Run ``nb_hops`` attention hops over the answer memory with a GRU-style update.

    In every hop each example is processed inside a ``tf.while_loop``: the
    attention scores over the example's memory rows are soft-maxed over the
    first ``ans_lens[i]`` positions (zero elsewhere), the attended memory is
    read out, and the example's state ``e`` is updated with reset/update
    gates.

    Args:
        aware_repr: [batch_size, feature_dim] (per the original author's notes).
        answer_repr: [batch_size, seq_length, answer_dim]; a column of ones
            is appended on the last axis to form the memory, so seq_length
            has to equal ``ans_max_len``.
        nb_hops: int, number of hops; must be positive.
        rnn_dim: int, size of the state ``e``.
        attention_dim: int; not used by this function.
        scope_name: variable scope holding all weights.
        ans_max_len: padded answer length.
        ans_lens: per-example answer lengths (cast to int32 when read).
        l2_reg: scale of the L2 regularizer attached to every weight.

    Returns:
        The final state ``e`` reshaped to ``[-1, rnn_dim]``.
    """
    with tf.variable_scope(scope_name):
        assert nb_hops > 0
        batch_size = tf.shape(answer_repr)[0]
        aware_dim = aware_repr.get_shape().as_list()[-1]
        answer_dim = answer_repr.get_shape().as_list()[-1]

        # init memory
        ones_temp = tf.to_float(
            tf.reshape(tf.ones([batch_size, ans_max_len]),
                       [batch_size, ans_max_len, 1]))
        memories = tf.concat([answer_repr, ones_temp], axis=-1)

        # One attention weight row / bias row per hop.
        attention_ws = tf.get_variable(
            name='W_al',
            shape=[nb_hops, 1, rnn_dim + answer_dim + aware_dim + 1],
            initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
            dtype=tf.float32)
        attention_bs = tf.get_variable(
            name='B_al',
            shape=[nb_hops, 1, ans_max_len],
            initializer=tf.zeros_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
            dtype=tf.float32)
        # GRU-style gate weights, shared across hops.
        gru_r = tf.get_variable(
            name='W_r',
            shape=[rnn_dim, answer_dim + 1],
            initializer=tf.orthogonal_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
            dtype=tf.float32)
        gru_z = tf.get_variable(
            name='W_z',
            shape=[rnn_dim, answer_dim + 1],
            initializer=tf.orthogonal_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
            dtype=tf.float32)
        gru_g = tf.get_variable(
            name='W_g',
            shape=[rnn_dim, rnn_dim],
            initializer=tf.orthogonal_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
            dtype=tf.float32)
        gru_x = tf.get_variable(
            name='W_x',
            shape=[rnn_dim, answer_dim + 1],
            initializer=tf.orthogonal_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
            dtype=tf.float32)
        gru_r_update = tf.get_variable(
            name='U_r',
            shape=[rnn_dim, rnn_dim],
            initializer=tf.orthogonal_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(l2_reg))
        gru_z_update = tf.get_variable(
            name='U_z',
            shape=[rnn_dim, rnn_dim],
            initializer=tf.orthogonal_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(l2_reg))

        e = tf.zeros([batch_size, rnn_dim])
        # Per-hop attention scores are collected here but not returned.
        scores_list = []
        aware_repr = tf.tile(tf.expand_dims(aware_repr, 1),
                             [1, ans_max_len, 1])

        for h in range(nb_hops):
            # Unstack every batched tensor so the loop can read one example
            # at a time.
            memories_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True,
                                           infer_shape=False)
            memories_iter = memories_iter.unstack(memories)
            e_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True,
                                    infer_shape=False)
            e_iter = e_iter.unstack(e)
            aware_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True,
                                        infer_shape=False)
            aware_iter = aware_iter.unstack(aware_repr)
            sentence_lens_iter = tf.TensorArray(tf.int32, 1,
                                                dynamic_size=True,
                                                infer_shape=False)
            sentence_lens_iter = sentence_lens_iter.unstack(ans_lens)
            newe = tf.TensorArray(size=batch_size, dtype=tf.float32)
            score = tf.TensorArray(size=batch_size, dtype=tf.float32)

            def body(i, newe, score):
                a = memories_iter.read(i)
                olde = e_iter.read(i)
                b = tf.tile(tf.expand_dims(olde, 0), [ans_max_len, 1])
                c = aware_iter.read(i)
                g = tf.matmul(
                    attention_ws[h],
                    tf.transpose(tf.concat([a, b, c], 1),
                                 perm=[1, 0])) + attention_bs[h]
                l = math_ops.to_int32(sentence_lens_iter.read(i))
                # Softmax over the first l positions only; the remaining
                # positions get an attention weight of exactly zero.
                score_temp = tf.concat([
                    tf.nn.softmax(tf.slice(g, [0, 0], [1, l])),
                    tf.zeros([1, ans_max_len - l])
                ], 1)
                # score_temp = tf.nn.softmax(g)
                score = score.write(i, score_temp)
                i_AL = tf.reshape(tf.matmul(score_temp, a), [-1, 1])
                olde = tf.reshape(olde, [-1, 1])
                r = tf.nn.sigmoid(
                    tf.matmul(gru_r, i_AL) + tf.matmul(gru_r_update, olde))
                z = tf.nn.sigmoid(
                    tf.matmul(gru_z, i_AL) + tf.matmul(gru_z_update, olde))
                e0 = tf.nn.tanh(
                    tf.matmul(gru_x, i_AL)
                    + tf.matmul(gru_g, tf.multiply(r, olde)))
                newe_temp = tf.multiply(1 - z, olde) + tf.multiply(z, e0)
                newe = newe.write(i, newe_temp)
                return (i + 1, newe, score)

            def condition(i, newe, score):
                return i < batch_size

            _, newe_final, score_final = tf.while_loop(
                cond=condition, body=body, loop_vars=(0, newe, score))
            e = tf.reshape(newe_final.stack(), [-1, rnn_dim])
            batch_score = tf.reshape(score_final.stack(), [-1, ans_max_len])
            scores_list.append(batch_score)
        return e
def add_lstm_cells(self):
    """Create the dropout-wrapped LSTM cell and store it on ``self.cell``.

    The LSTM has ``self.cell_size`` units and an orthogonal initializer;
    ``self.rnn_keep_prob`` is the second argument of ``DropoutWrapper``.
    """
    base_cell = tf.nn.rnn_cell.LSTMCell(
        self.cell_size, initializer=tf.orthogonal_initializer())
    self.cell = tf.nn.rnn_cell.DropoutWrapper(base_cell, self.rnn_keep_prob)
def gru_cell():
    """Build one GRUCell of ``n_hidden`` units with an orthogonal kernel initializer.

    ``n_hidden`` and ``GRUCell`` are resolved from the enclosing scope.
    """
    cell = GRUCell(n_hidden, kernel_initializer=tf.orthogonal_initializer())
    return cell
def initialize(sess=None):
    """Initialize data and model.

    Creates the training and checkpoint directories, seeds the random
    number generators, prepares either WMT or algorithmic data, builds the
    NeuralGPU model (plus an optional beam model) and restores or
    initializes its variables.

    Args:
        sess: an existing session to build into. When None, a
            ``tf.train.Supervisor`` is created and provides the session.

    Returns:
        A tuple ``(model, beam_model, min_length, max_length,
        checkpoint_dir, (global_train_set, dev_set, en_path, fr_path),
        sv, sess)``; ``beam_model`` and ``sv`` may be None.

    Side effects visible here: mutates ``data.bins``, ``data.log_filename``,
    ``data.vocab`` / ``data.rev_vocab``, ``global_train_set``,
    ``ngpu.CHOOSE_K`` and possibly the global ``MAXLEN_F``; starts
    background data-reading threads for the "wmt" problem.
    """
    global MAXLEN_F
    # Create training directory if it does not exist.
    if not tf.gfile.IsDirectory(FLAGS.train_dir):
        data.print_out("Creating training directory %s." % FLAGS.train_dir)
        tf.gfile.MkDir(FLAGS.train_dir)
    decode_suffix = "beam%dln%d" % (FLAGS.beam_size,
                                    int(100 * FLAGS.length_norm))
    if FLAGS.mode == 0:
        decode_suffix = ""
    if FLAGS.task >= 0:
        data.log_filename = os.path.join(FLAGS.train_dir,
                                         "log%d%s" % (FLAGS.task, decode_suffix))
    else:
        data.log_filename = os.path.join(FLAGS.train_dir, "neural_gpu/log")

    # Set random seed (offset by the task id so replicas differ).
    if FLAGS.random_seed > 0:
        seed = FLAGS.random_seed + max(0, FLAGS.task)
        tf.set_random_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

    # Check data sizes.
    assert data.bins
    max_length = min(FLAGS.max_length, data.bins[-1])
    # Drop trailing bins that are not needed for max_length + EXTRA_EVAL.
    while len(data.bins) > 1 and data.bins[-2] >= max_length + EXTRA_EVAL:
        data.bins = data.bins[:-1]
    if sess is None and FLAGS.task == 0 and FLAGS.num_replicas > 1:
        if max_length > 60:
            # NOTE(review): "/" is integer division only under Python 2
            # (this function also uses xrange); under Python 3 max_length
            # would become a float.
            max_length = max_length * 1 / 2  # Save memory on chief.
    min_length = min(14, max_length - 3) if FLAGS.problem == "wmt" else 3
    for p in FLAGS.problem.split("-"):
        if p in ["progeval", "progsynth"]:
            min_length = max(26, min_length)
    assert max_length + 1 > min_length
    # max_length may have been halved above, so trim the bins again.
    while len(data.bins) > 1 and data.bins[-2] >= max_length + EXTRA_EVAL:
        data.bins = data.bins[:-1]

    # Create checkpoint directory if it does not exist.
    if FLAGS.mode == 0 or FLAGS.task < 0:
        checkpoint_dir = os.path.join(FLAGS.train_dir, "neural_gpu%s"
                                      % ("" if FLAGS.task < 0 else str(FLAGS.task)))
    else:
        checkpoint_dir = FLAGS.train_dir
    if not tf.gfile.IsDirectory(checkpoint_dir):
        data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
        tf.gfile.MkDir(checkpoint_dir)

    # Prepare data.
    if FLAGS.problem == "wmt":
        # Prepare WMT data.
        data.print_out("Preparing WMT data in %s" % FLAGS.data_dir)
        if FLAGS.simple_tokenizer:
            MAXLEN_F = 3.5
            (en_train, fr_train, en_dev, fr_dev,
             en_path, fr_path) = wmt.prepare_wmt_data(
                 FLAGS.data_dir, FLAGS.vocab_size,
                 tokenizer=wmt.space_tokenizer,
                 normalize_digits=FLAGS.normalize_digits)
        else:
            (en_train, fr_train, en_dev, fr_dev,
             en_path, fr_path) = wmt.prepare_wmt_data(
                 FLAGS.data_dir, FLAGS.vocab_size)

        # Read data into buckets and compute their sizes.
        fr_vocab, rev_fr_vocab = wmt.initialize_vocabulary(fr_path)
        data.vocab = fr_vocab
        data.rev_vocab = rev_fr_vocab
        data.print_out("Reading development and training data (limit: %d)."
                       % FLAGS.max_train_data_size)
        dev_set = {}
        dev_set["wmt"] = read_data(en_dev, fr_dev, data.bins)
        def data_read(size, print_out):
            read_data_into_global(en_train, fr_train, data.bins, size, print_out)
        # Read a small slice synchronously, then start two background
        # threads that read progressively larger amounts.
        data_read(50000, False)
        read_thread_small = threading.Thread(
            name="reading-data-small", target=lambda: data_read(900000, False))
        read_thread_small.start()
        read_thread_full = threading.Thread(
            name="reading-data-full",
            target=lambda: data_read(FLAGS.max_train_data_size, True))
        read_thread_full.start()
        data.print_out("Data reading set up.")
    else:
        # Prepare algorithmic data.
        en_path, fr_path = None, None
        tasks = FLAGS.problem.split("-")
        data_size = FLAGS.train_data_size
        for t in tasks:
            data.print_out("Generating data for %s." % t)
            if t in ["progeval", "progsynth"]:
                data.init_data(t, data.bins[-1], 20 * data_size, FLAGS.vocab_size)
                if len(program_utils.prog_vocab) > FLAGS.vocab_size - 2:
                    raise ValueError("Increase vocab_size to %d for prog-tasks."
                                     % (len(program_utils.prog_vocab) + 2))
                data.rev_vocab = program_utils.prog_vocab
                data.vocab = program_utils.prog_rev_vocab
            else:
                for l in xrange(max_length + EXTRA_EVAL - 1):
                    data.init_data(t, l, data_size, FLAGS.vocab_size)
                data.init_data(t, data.bins[-2], data_size, FLAGS.vocab_size)
                data.init_data(t, data.bins[-1], data_size, FLAGS.vocab_size)
            if t not in global_train_set:
                global_train_set[t] = []
            global_train_set[t].append(data.train_set[t])
            calculate_buckets_scale(data.train_set[t], data.bins, t)
        dev_set = data.test_set

    # Grid-search parameters: each task id perturbs lr, init_weight and
    # max_grad_norm by a factor derived from job_id_factor.
    lr = FLAGS.lr
    init_weight = FLAGS.init_weight
    max_grad_norm = FLAGS.max_grad_norm
    if sess is not None and FLAGS.task > -1:
        def job_id_factor(step):
            """If jobid / step mod 3 is 0, 1, 2: say 0, 1, -1."""
            return ((((FLAGS.task / step) % 3) + 1) % 3) - 1
        lr *= math.pow(2, job_id_factor(1))
        init_weight *= math.pow(1.5, job_id_factor(3))
        max_grad_norm *= math.pow(2, job_id_factor(9))

    # Print out parameters.
    curriculum = FLAGS.curriculum_seq
    msg1 = ("layers %d kw %d h %d kh %d batch %d noise %.2f"
            % (FLAGS.nconvs, FLAGS.kw, FLAGS.height, FLAGS.kh,
               FLAGS.batch_size, FLAGS.grad_noise_scale))
    msg2 = ("cut %.2f lr %.3f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
            % (FLAGS.cutoff, lr, init_weight, curriculum, FLAGS.nmaps,
               FLAGS.dropout, max_grad_norm, msg1))
    data.print_out(msg2)

    # Create model and initialize it.
    # The orthogonal initializer becomes the default for the current scope.
    tf.get_variable_scope().set_initializer(
        tf.orthogonal_initializer(gain=1.8 * init_weight))
    max_sampling_rate = FLAGS.max_sampling_rate if FLAGS.mode == 0 else 0.0
    o = FLAGS.vocab_size if FLAGS.max_target_vocab < 1 else FLAGS.max_target_vocab
    ngpu.CHOOSE_K = FLAGS.soft_mem_size
    do_beam_model = FLAGS.train_beam_freq > 0.0001 and FLAGS.beam_size > 1
    beam_size = FLAGS.beam_size if FLAGS.mode > 0 and not do_beam_model else 1
    beam_size = min(beam_size, FLAGS.beam_size)
    beam_model = None
    def make_ngpu(cur_beam_size, back):
        return ngpu.NeuralGPU(
            FLAGS.nmaps, FLAGS.vec_size, FLAGS.vocab_size, o,
            FLAGS.dropout, max_grad_norm, FLAGS.cutoff, FLAGS.nconvs,
            FLAGS.kw, FLAGS.kh, FLAGS.height, FLAGS.mem_size,
            lr / math.sqrt(FLAGS.num_replicas), min_length + 3, FLAGS.num_gpus,
            FLAGS.num_replicas, FLAGS.grad_noise_scale, max_sampling_rate,
            atrous=FLAGS.atrous, do_rnn=FLAGS.rnn_baseline,
            do_layer_norm=FLAGS.layer_norm, beam_size=cur_beam_size, backward=back)
    if sess is None:
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            model = make_ngpu(beam_size, True)
            if do_beam_model:
                # The beam model reuses the variables of the main model.
                tf.get_variable_scope().reuse_variables()
                beam_model = make_ngpu(FLAGS.beam_size, False)
    else:
        model = make_ngpu(beam_size, True)
        if do_beam_model:
            tf.get_variable_scope().reuse_variables()
            beam_model = make_ngpu(FLAGS.beam_size, False)

    sv = None
    if sess is None:
        # The supervisor configuration has a few overridden options.
        sv = tf.train.Supervisor(logdir=checkpoint_dir,
                                 is_chief=(FLAGS.task < 1),
                                 saver=model.saver,
                                 summary_op=None,
                                 save_summaries_secs=60,
                                 save_model_secs=15 * 60,
                                 global_step=model.global_step)
        config = tf.ConfigProto(allow_soft_placement=True)
        sess = sv.PrepareSession(FLAGS.master, config=config)

    data.print_out("Created model. Checkpoint dir %s" % checkpoint_dir)

    # Load model from parameters if a checkpoint exists.
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    # The ".index" file is what is probed to decide whether the checkpoint
    # is actually present on disk.
    if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path + ".index"):
        data.print_out("Reading model parameters from %s"
                       % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    elif sv is None:
        sess.run(tf.global_variables_initializer())
        data.print_out("Initialized variables (no supervisor mode).")
    elif FLAGS.task < 1 and FLAGS.mem_size > 0:
        # The normalization op is commented out, so only the message remains.
        # sess.run(model.mem_norm_op)
        data.print_out("Created new model and normalized mem (on chief).")

    # Return the model and needed variables.
    return (model, beam_model, min_length, max_length, checkpoint_dir,
            (global_train_set, dev_set, en_path, fr_path), sv, sess)
def get_instance(args):
    """Create an orthogonal initializer from a config mapping.

    Args:
        args: mapping with an optional ``'gain'`` entry (default 1.0); the
            value is converted with ``float``.

    Returns:
        A ``tf.orthogonal_initializer`` seeded with the module-level ``SEED``.
    """
    raw_gain = args.get('gain', 1.0)
    return tf.orthogonal_initializer(float(raw_gain), seed=SEED)
def main(model, T, n_epochs, n_batch, n_hidden, learning_rate, decay, nb_v,
         norm, capacity, n_layers, clip_threshold, keep_prob, lr_decay,
         max_n_epoch, grid_name, is_gates, n_hyper_hidden, layer_norm,
         slow_size, fast_size):
    """Train a stacked recurrent character model and log its losses.

    Builds the graph (one-hot input -> ``MultiRNNCell`` -> linear read-out),
    trains it with Adam, runs validation + test every 500 training steps,
    writes one log line per validation to a text file under
    ``./output/character/`` and saves a checkpoint after every epoch.

    Args:
        model: cell type; one of "HyperDRUM", "RUM", "FSRUM", "LSTM", "EUNN",
            "GRU". Any other value raises ``ValueError``.
        T: second dimension of the integer input/target placeholders.
        n_epochs: forwarded to ``file_data`` for the training split and used
            in the output file name.
        n_batch: training batch size.
        n_hidden: width of the recurrent state and of the read-out input.
        learning_rate: base learning rate fed into the ``learning_rate``
            variable.
        decay: not used by this function.
        nb_v: batch size of the validation and test splits.
        norm: forwarded to ``HyperDRUMCell`` as ``normalization``; also part
            of the log file name when not None.
        capacity: forwarded to ``EUNNCell``.
        n_layers: number of stacked cells.
        clip_threshold: not used by this function.
        keep_prob: only affects the log file name (see review notes below).
        lr_decay: if None the learning rate is set once; otherwise it is
            re-assigned each epoch as
            ``learning_rate * lr_decay ** max(epoch + 1 - max_n_epoch, 0)``.
        max_n_epoch: see ``lr_decay``.
        grid_name: optional sub-directory inserted into the output path.
        is_gates: when true (and the model is a single-layer GRU/DRUM), the
            gate kernel and bias are dumped with ``np.save`` after each
            validation.
        n_hyper_hidden: hyper-network width for "HyperDRUM".
        layer_norm: forwarded to ``HyperDRUMCell`` as
            ``use_recurrent_dropout``.
        slow_size: slow-cell state width for "FSRUM".
        fast_size: fast-cell state width for "FSRUM".

    Raises:
        ValueError: if ``model`` is not one of the supported cell types.
    """
    max_len_data = 1000000000
    epoch_train, vocab_to_idx = file_data('train', n_batch, max_len_data, T,
                                          n_epochs, None)
    n_input = len(vocab_to_idx)
    epoch_val, _ = file_data('valid', nb_v, max_len_data, T, 10000,
                             vocab_to_idx)
    epoch_test, _ = file_data('test', nb_v, max_len_data, T, 10000,
                              vocab_to_idx)
    n_output = n_input

    x = tf.placeholder("int64", [None, T])
    y = tf.placeholder("int64", [None, T])
    new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    lr = tf.get_variable("learning_rate", shape=[], dtype=tf.float32,
                         trainable=False)
    update = tf.assign(lr, new_lr)

    def build_state(leaf):
        """Return the per-layer state structure with ``leaf(width)`` in each slot."""

        def one_layer():
            if model == "LSTM":
                return LSTMStateTuple(leaf(n_hidden), leaf(n_hidden))
            if model == "HyperDRUM":
                return LSTMStateTuple(leaf(n_hyper_hidden),
                                      leaf(n_hidden + n_hyper_hidden))
            if model == "FSRUM":
                return ((leaf(fast_size), leaf(fast_size)), leaf(slow_size))
            return leaf(n_hidden)

        return tuple(one_layer() for _ in range(n_layers))

    def zero_state(batch):
        """Return an all-zero numpy state matching ``i_s`` for ``batch`` rows."""
        return build_state(
            lambda width: np.zeros((batch, width), dtype=np.float64))

    # Placeholders through which the recurrent state is carried across batches.
    i_s = build_state(lambda width: tf.placeholder("float", [None, width]))

    input_data = tf.one_hot(x, n_input, dtype=tf.float32)
    if keep_prob is not None:
        # NOTE(review): the dropout result is discarded, so no dropout is
        # applied to the input. Left unchanged because assigning it would
        # also enable dropout at validation/test time -- confirm intent.
        tf.nn.dropout(input_data, keep_prob)

    if model == "HyperDRUM":
        cells = [
            HyperDRUMCell(n_hidden,
                          hyper_num_units=n_hyper_hidden,
                          use_recurrent_dropout=layer_norm,
                          normalization=norm) for _ in range(n_layers)
        ]
    elif model == "RUM":
        cells = [
            RUMCell(n_hidden, T_norm=1.0, use_zoneout=True,
                    use_layer_norm=True) for _ in range(n_layers)
        ]
    elif model == "FSRUM":

        def rum_cell():
            return RUMCell(slow_size,
                           T_norm=1.0,
                           use_zoneout=True,
                           use_layer_norm=True)

        def ln_lstm_cell():
            return LN_LSTMCell(fast_size,
                               use_zoneout=True,
                               is_training=True,
                               zoneout_keep_h=True,
                               zoneout_keep_c=True)

        # NOTE(review): the only definition of fs_rum_cell visible here is the
        # commented-out one below, so this branch raises NameError unless the
        # name is defined elsewhere in the module.
        # def fs_rum_cell():
        #     return FSRNNCell([ln_lstm_cell(), ln_lstm_cell()], rum_cell(), 0.65, training = True)
        cells = [fs_rum_cell() for _ in range(n_layers)]
    elif model == "LSTM":
        cells = [
            LSTMCell(n_hidden, initializer=tf.orthogonal_initializer())
            for _ in range(n_layers)
        ]
    elif model == "EUNN":
        cells = [
            EUNNCell(n_hidden, capacity=capacity, comp=False, name=str(idx))
            for idx in range(n_layers)
        ]
    elif model == "GRU":
        cells = [
            GRUCell(n_hidden, kernel_initializer=tf.orthogonal_initializer())
            for _ in range(n_layers)
        ]
    else:
        # Previously this fell through and failed later with a NameError on
        # `mcell`; fail early with an explicit message instead.
        raise ValueError("Unsupported model: %r" % (model,))
    mcell = MultiRNNCell(cells, state_is_tuple=True)

    hidden_out, states = tf.nn.dynamic_rnn(mcell,
                                           input_data,
                                           dtype=tf.float32,
                                           initial_state=i_s)

    # Linear read-out applied independently at every time step.
    V_init_val = np.sqrt(6.) / np.sqrt(n_output + n_input)
    V_weights = tf.get_variable(
        "V_weights",
        shape=[n_hidden, n_output],
        dtype=tf.float32,
        initializer=tf.orthogonal_initializer(gain=V_init_val))
    V_bias = tf.get_variable("V_bias",
                             shape=[n_output],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.01))
    hidden_out_list = tf.unstack(hidden_out, axis=1)
    temp_out = tf.stack(
        [tf.matmul(step_out, V_weights) for step_out in hidden_out_list])
    output_data = tf.nn.bias_add(tf.transpose(temp_out, [1, 0, 2]), V_bias)
    if keep_prob is not None:
        # NOTE(review): result discarded here as well -- see the note above.
        tf.nn.dropout(output_data, keep_prob)

    cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output_data,
                                                       labels=y))
    correct_pred = tf.equal(tf.argmax(output_data, 2), y)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_op = optimizer.minimize(cost)
    init = tf.global_variables_initializer()
    for variable in tf.global_variables():
        print(variable.name)

    # Output locations: <root>/[grid_name/]T=<T>/<run tag>...
    tmp_filename = "./output/character/"
    if grid_name is not None:
        tmp_filename += grid_name + "/"
    tmp_filename += "T=" + str(T) + "/"
    run_tag = (str(n_layers) + model + "_N=" + str(n_hidden) + "_B=" +
               str(n_batch) + "_nb_v=" + str(nb_v) + "_numEpochs=" +
               str(n_epochs) + "_lr=" + str(learning_rate))
    filename = tmp_filename + run_tag
    if norm is not None:
        filename += "_norm=" + str(norm)
    if keep_prob is not None:
        filename += "_keepProb=" + str(keep_prob)
    filename = filename + ".txt"
    research_filename = tmp_filename + "researchModels" + "/" + run_tag

    def ensure_dir(path):
        """Create ``path`` (and parents), tolerating a concurrent creation."""
        if not os.path.exists(path):
            try:
                os.makedirs(path)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise

    ensure_dir(os.path.dirname(filename))
    ensure_dir(os.path.dirname(research_filename))
    ensure_dir(os.path.dirname(research_filename + "/modelCheckpoint/"))

    def run_eval(batches, state):
        """Run accuracy/cost over ``batches``; return the list of batch losses."""
        batch_losses = []
        for batch_x, batch_y in batches:
            feed = {x: batch_x, y: batch_y, i_s: state}
            _, batch_loss, state = sess.run([accuracy, cost, states],
                                            feed_dict=feed)
            batch_losses.append(batch_loss)
        return batch_losses

    def do_test():
        """Evaluate the first element of ``epoch_test``; return its mean loss."""
        test_losses = []
        for j, test in enumerate(epoch_test, 1):
            if j >= 2:
                break
            print("Running test...")
            test_losses.extend(run_eval(test, zero_state(nb_v)))
        print("test:", )
        test_losses.append(sum(test_losses) / len(test_losses))
        print("test Loss= " + "{:.6f}".format(test_losses[-1]))
        return test_losses[-1]

    def do_validation(loss, curr_epoch):
        """Evaluate validation and test data and append one line to the log."""
        curr_epoch = int(curr_epoch)
        val_losses = []
        for j, val in enumerate(epoch_val, 1):
            if j >= 2:
                break
            print("Running validation...")
            val_losses.extend(run_eval(val, zero_state(nb_v)))
        print("Validations:", )
        validation_losses.append(sum(val_losses) / len(val_losses))
        print("Validation Loss= " + "{:.6f}".format(validation_losses[-1]))
        test_loss = do_test()
        lr_value = sess.run(lr)
        f.write(
            "Step: %d\t TrLoss: %f\t TestLoss: %f\t ValLoss: %f\t Epoch: %d\t Learning rate: %f\n"
            % (t, loss, test_loss, validation_losses[-1], curr_epoch,
               lr_value))
        f.flush()

    saver = tf.train.Saver()

    # The context manager guarantees the log file is closed even when
    # training is interrupted by an exception.
    with open(filename, 'w') as f:
        f.write("########\n\n")
        f.write("## \tModel: %s with N=%d" % (model, n_hidden))
        f.write("\n\n")
        f.write("########\n\n")

        session_config = tf.ConfigProto(log_device_placement=False,
                                        allow_soft_placement=False)
        with tf.Session(config=session_config) as sess:
            print("Session Created")
            validation_losses = []
            sess.run(init)
            if lr_decay is None:
                sess.run(update, feed_dict={new_lr: learning_rate})

            training_state = zero_state(n_batch)
            t = 0  # global training-step counter, also read by do_validation
            val_cnt = 0
            for i, epoch in enumerate(epoch_train):
                print("Epoch: ", i)
                if lr_decay is not None:
                    sess.run(update,
                             feed_dict={
                                 new_lr:
                                 learning_rate *
                                 (lr_decay**max(i + 1 - max_n_epoch, 0.0))
                             })
                for step, (batch_x, batch_y) in enumerate(epoch):
                    myfeed_dict = {
                        x: batch_x,
                        y: batch_y,
                        i_s: training_state
                    }
                    _, acc, loss, training_state = sess.run(
                        [train_op, accuracy, cost, states],
                        feed_dict=myfeed_dict)
                    lr_value = sess.run(lr)
                    print("Iter " + str(t) + ", Minibatch Loss= " +
                          "{:.6f}".format(loss) + ", Training Accuracy= " +
                          "{:.5f}".format(acc) + ", Epoch " + str(i) +
                          ", Learning rate= " + str(lr_value))
                    t += 1

                    # Validate on every 500th step of the epoch. The former
                    # condition `step % 499 == 500` could never be true.
                    if step % 500 == 499:
                        do_validation(loss, i)
                        if is_gates and model in ("GRU",
                                                  "DRUM") and n_layers == 1:
                            tmp = "gru" if model == "GRU" else "drum"
                            prefix = ("rnn/multi_rnn_cell/cell_0/" + tmp +
                                      "_cell/gates/")
                            kernel = [
                                v for v in tf.global_variables()
                                if v.name == prefix + "kernel:0"
                            ][0]
                            bias = [
                                v for v in tf.global_variables()
                                if v.name == prefix + "bias:0"
                            ][0]
                            k, b = sess.run([kernel, bias])
                            np.save(
                                research_filename + "/kernel_" + str(val_cnt),
                                k)
                            np.save(
                                research_filename + "/bias_" + str(val_cnt),
                                b)
                            val_cnt += 1
                saver.save(sess, research_filename + "/modelCheckpoint/model")

            print("Optimization Finished!")
            test_loss = do_test()
            # do_test() returns a scalar; indexing it with [-1] used to raise.
            f.write("Test result: %d (step) \t%f (loss)\n" % (t, test_loss))
def train_net(batch_size=100, t_steps=100, l_dim=8*[240], act=tf.nn.tanh,
              alpha=0.1, beta0=0., beta1=1., beta2=0., noise_str=0.5,
              learning_rate=0.01, learning_rate_inv=0.01, err_alg=1,
              mode='autoencoder', dataset='mnist', preprocess=False,
              return_sess=False):
    """Build a layer-wise target-propagation network and train it.

    Resets the default graph, builds one feedforward (W, b) and, depending on
    err_alg, one feedback (V, c) variable pair per layer, attaches per-layer
    losses and RMSProp optimizers, then runs t_steps training iterations while
    writing summaries to /tmp/targ-prop/<run>.

    NOTE(review): this function uses Python 2 print statements and relies on
    Python 2 integer division (``layers/2``).

    Args:
      batch_size: batch size
      t_steps: number of training steps
      l_dim: list of network architecture / dimension of 'hidden' layers, not
        including input and output layer.
      act: activation function used for all layers except the softmax output
        layer in 'classification' mode.
      alpha: in (0,1], scaling for top layer target;
        x_tar[-1] = x[-1] - alpha*(dL/dx[-1])
      beta0: regularization constant
      beta1: regularization constant
      beta2: regularization constant
      noise_str: value of standard dev of noise injected into neurons, but
        only for the L_inv loss functions, and for t_step=0 (decays through
        training)
      learning_rate: learning rate for optimization
      learning_rate_inv: learning rate of the optimizers that minimize the
        inverse losses L_inv.
      err_alg: error propagation method. 0 for difference target prop. 1 for
        regularized target prop. 2 for reg target prop with learnable
        inverses. 3 for backprop.
      mode: 'autoencoder' or 'classification'
      dataset: 'mnist' or 'cifar'
      preprocess: bool. PCA+whiten the data? Good for cifar but whatevs for
        mnist
      return_sess: should we return the tf session?

    Returns:
      sess: the tf session if return_sess is True; otherwise the session is
        closed and None is returned.
    """
    # Params from conti_dtp.py -- unclear if this is one hyperparam search or the optimal one
    # alpha, L learning rate, L_inv learning rate, noise_inj
    # 0.327736332653, 0.0148893490317, 0.00501149118237, 0.359829566008

    ### DATA ###
    # NOTE(review): any other `dataset` value leaves `data` unbound and fails
    # with a NameError at the first use below.
    if dataset == 'cifar':
        data = ds.cifar10_data()
        data_test = ds.cifar10_data_test()
    elif dataset == 'mnist':
        data = ds.mnist_data()
        data_test = ds.mnist_data_test()
    if preprocess:
        from sklearn.decomposition import PCA
        pca = PCA(n_components=1000, whiten=True)
        data.inputs = pca.fit_transform(data.inputs)
        data_test.inputs = pca.transform(data_test.inputs)
    if mode == 'autoencoder':
        # autoencoderify
        data.outputs = data.inputs
        data_test.outputs = data_test.inputs
    m_dim = data.inputs.shape[1] # input dimension
    p_dim = data.outputs.shape[1] # output dimension
    l_dim = [m_dim] + l_dim + [p_dim] # layer dimensions
    layers = len(l_dim)-1

    ### MODEL ###
    tf.reset_default_graph()
    tf.set_random_seed(1234)
    np.random.seed(1234)

    # placeholders
    x_in = tf.placeholder(tf.float32, shape=[None, m_dim], name='x_in') # Input
    y = tf.placeholder(tf.float32, shape=[None, p_dim], name='y') # Output
    epoch = tf.placeholder(tf.float32, shape=None, name='epoch') # training iteration

    # in dtp code, 0.5/(1 + epoch / 100)
    noise_inj = noise_str/(1.+epoch/100.) # std dev of noise in L_inv loss

    # initialize lists
    # Every list has layers+1 slots so that index l refers to layer l; slot 0
    # is only used by x (the network input).
    x = (layers+1)*[None] # activations
    W = (layers+1)*[None] # feedforward matrix
    b = (layers+1)*[None] # feedforward bias
    x_ = (layers+1)*[None] # targets
    V = (layers+1)*[None] # feedback matrix
    c = (layers+1)*[None] # feedback bias
    L = (layers+1)*[None] # local layer loss for training W and b
    L_inv = (layers+1)*[None] # local inverse loss for training V and c
    L_inv0 = (layers+1)*[None] # (testing)
    L_inv1 = (layers+1)*[None]
    L_inv2 = (layers+1)*[None]
    eps = (layers+1)*[None] # noise in L_inv term
    eps0 = (layers+1)*[None] # (testing)
    eps1 = (layers+1)*[None]
    vscope = (layers+1)*[None] # variable scopes
    train_op_L = (layers+1)*[None] # training op
    train_op_inv = (layers+1)*[None] # training op

    # init with numpy arrays
    # NOTE(review): every W[l]/V[l] array built in this section is overwritten
    # by the tf.get_variable calls further down (the lines that would have
    # consumed them as initializers are commented out). The section still
    # draws from np.random, so removing it would shift the numpy random
    # stream for anything that uses it afterwards.
    from scipy import linalg
    for l in range(1, layers+1):
        low = -np.sqrt(6.0/(l_dim[l-1] + l_dim[l]))
        high = np.sqrt(6.0/(l_dim[l-1] + l_dim[l]))
        W[l] = np.random.uniform(low=low, high=high, size=(l_dim[l-1], l_dim[l])).astype('float32')
        if l_dim[l-1] >= l_dim[l]:
            W[l] = 1.0*linalg.orth(W[l])
    # transpose for autoencoder
    if mode == 'autoencoder':
        # `layers/2` must be an int here (true under Python 2 division).
        for l in range(layers/2+1, layers+1):
            W[l] = W[layers+1-l].T
    for l in range(layers, 1, -1):
        if err_alg==0 or err_alg==1:
            #V[l] = np.linalg.pinv(W[l])
            low = -np.sqrt(6.0/(l_dim[l-1] + l_dim[l]))
            high = np.sqrt(6.0/(l_dim[l-1] + l_dim[l]))
            V[l] = np.random.uniform(low=low, high=high, size=(l_dim[l], l_dim[l-1])).astype('float32')
            if l_dim[l] >= l_dim[l-1]:
                V[l] = 1.0*linalg.orth(V[l])
        if err_alg==2:
            pinv = np.linalg.pinv(W[l])
            V[l] = np.concatenate((pinv, np.eye(l_dim[l-1]) - np.dot(W[l], pinv)), axis=0).astype('float32')

    # Variable creation
    # xavier:
    # tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True)
    # orth:
    # tf.orthogonal_initializer(0.5)

    # feedforward variables
    for l in range(1, layers+1):
        # The scope object is stored in vscope[l] so that f/g can re-enter it
        # with reuse=True.
        with tf.variable_scope('vars_Layer'+str(l)) as vscope[l]:
            b[l] = tf.get_variable( 'b', shape=[1, l_dim[l]], initializer=tf.constant_initializer(0.0))
            W[l] = tf.get_variable( 'W', shape=[l_dim[l-1], l_dim[l]], initializer=tf.orthogonal_initializer())
            #W[l] = tf.get_variable( 'W', initializer=W[l])

    # feedback variables
    # Only layers 2..layers get feedback variables; none are created when
    # err_alg == 3.
    for l in range(layers, 1, -1):
        with tf.variable_scope(vscope[l]):
            if err_alg==0 or err_alg==1:
                c[l] = tf.get_variable( 'c', shape=[1, l_dim[l-1]], initializer=tf.constant_initializer(0.0))
                V[l] = tf.get_variable( 'V', shape=[l_dim[l], l_dim[l-1]], initializer=tf.orthogonal_initializer())
                #V[l] = tf.get_variable( 'V', initializer=V[l])
            if err_alg==2:
                c[l] = tf.get_variable( 'c', shape=[1, l_dim[l-1]], initializer=tf.constant_initializer(0.0))
                V[l] = tf.get_variable( 'V', shape=[l_dim[l]+l_dim[l-1], l_dim[l-1]], initializer=tf.orthogonal_initializer())
                #V[l] = tf.get_variable( 'V', initializer=V[l])

    # feedforward functions
    def f(layer, x_in, act=tf.nn.tanh):
        """Apply layer `layer`: act(x_in . W + b)."""
        with tf.variable_scope(vscope[layer], reuse=True):
            # note: could also just use W[l] and b[l]
            W_ = tf.get_variable('W')
            b_ = tf.get_variable('b')
            return act(tf.add(tf.matmul(x_in, W_), b_), name='x')

    # Feedback functions
    def g(layer, x_target, act=tf.nn.tanh):
        """Apply the feedback mapping of layer `layer`: act(x_target . V + c)."""
        with tf.variable_scope(vscope[layer], reuse=True):
            V_ = tf.get_variable('V')
            c_ = tf.get_variable('c')
            return act(tf.add(tf.matmul(x_target, V_), c_), name='x_')

    def g_dtp(layer, x1_target, x1_activation, x0_activation, act=tf.nn.tanh):
        """Return x0_activation + g(x1_target) - g(x1_activation).

        NOTE(review): V_ and c_ are fetched but the expression reads
        V[layer]/c[layer] directly; this helper is not called anywhere in
        train_net.
        """
        with tf.variable_scope(vscope[layer], reuse=True):
            V_ = tf.get_variable('V')
            c_ = tf.get_variable('c')
            return tf.add(x0_activation, tf.sub(act(tf.add(tf.matmul(x1_target, V[layer], name='x3_'), c[layer], name='x2_'), name='x1_'), act(tf.add(tf.matmul(x1_activation, V[layer], name='x3_'), c[layer], name='x2_'), name='x1_')), name='x_target')

    def g_rinv(layer, x1_target, x0_activation):
        """Compute the previous-layer target via ops.relu().f_inv and ops.linear().f_inv.

        Both inverses run through tf.py_func. V_ and c_ are fetched but not
        used; the computation reads b[layer] and W[layer].
        """
        with tf.variable_scope(vscope[layer], reuse=True):
            V_ = tf.get_variable('V')
            c_ = tf.get_variable('c')
            relu_inv = tf.py_func(ops.relu().f_inv, [x1_target, x0_activation], [tf.float32], name='x3_')[0]
            add_inv = tf.sub(relu_inv, b[layer], name='x2_')
            return tf.py_func(ops.linear().f_inv, [add_inv, x0_activation, W[layer]], [tf.float32], name='x1_')[0]

    # TESTING
    # def g_full(layer, input1, input2, act=tf.nn.tanh):
    #   """ generalized g. g(x_[layer], x[layer-1]) -> x_[layer-1] """
    #   with tf.name_scope(scope[l]):
    #     V[layer] = tf.get_variable( 'V' )
    #     c[layer] = tf.get_variable( 'c' )
    #     return act(tf.matmul(tf.concat( 1, [input1, input2] ), V[layer]) + c[layer], name='g_full')
    # def g_full2(layer, input1, input2, input3, act=tf.nn.tanh):
    #   """ generalized g. g(x_[layer], x[layer-1]) -> x_[layer-1] """
    #   with tf.name_scope('Layer'):
    #     V[layer] = tf.get_variable( 'V' )
    #     c[layer] = tf.get_variable( 'c' )
    #     return act(tf.matmul(tf.concat( 1, [input1, input2, input3] ), V[layer]) + c[layer], name='g_full')
    # /TESTING

    # forward propagation
    x[0] = x_in
    for l in range(1, layers+1):
        with tf.name_scope('layer'+str(l)+'_ff'):
            if l==layers and mode=='classification': # last layer
                x[layers] = f(layers, x[layers-1], tf.nn.softmax)
            else: # other layers
                x[l] = f(l, x[l-1], act)

    # top layer loss / top layer target
    # L[-1] = tf.nn.softmax_cross_entropy_with_logits(x[-1], y)
    with tf.name_scope('top_layer'):
        if mode == 'classification':
            #L[-1] = tf.reduce_mean(-tf.reduce_sum(y*tf.log(x[-1] + 1e-10), reduction_indices=[1]), name='global_loss') # add 1e-10 so you don't get nan'd
            L[-1] = tf.reduce_mean((x[-1] - y)**2, name='global_loss')
        elif mode == 'autoencoder':
            L[-1] = tf.reduce_mean((x[-1] - y)**2, name='global_loss')
        # Top-layer target: move the output a fraction alpha towards y.
        x_[-1] = tf.sub(x[-1], alpha*(x[-1] - y), name='x_target_top')

    # feedback propagation
    # Targets x_[l-1] are only produced for err_alg 0 and 1.
    for l in range(layers, 1, -1):
        with tf.name_scope('layer'+str(l)+'_fb'):
            if err_alg==0:
                x_[l-1] = tf.add(x[l-1] - g(l, x[l], act), g(l, x_[l], act), name='x_target')
            if err_alg==1:
                x_[l-1] = g_rinv(l, x_[l], x[l-1])

    # noise terms for loss functions
    if err_alg==0 or err_alg==2:
        for l in range(1, layers+1):
            with tf.name_scope('layer'+str(l)+'_eps'):
                eps[l] = tf.random_normal(tf.shape(x[l]), mean=0, stddev=noise_inj, name='eps'+str(l-1))
                #eps0[l] = noise_inj*tf.random_normal(tf.shape(x[l]), mean=0, stddev=1., name='eps0'+str(l-1)) # uh, tf.shape(x[l-1]) right?
                #eps1[l] = noise_inj*tf.random_normal(tf.shape(x[l]), mean=0, stddev=1., name='eps1'+str(l-1)) # uh, tf.shape(x[l-1]) right?

    # loss functions
    for l in range(1, layers): # FOR NOW; LAYERS+1, BUT SHOULD BE LAYERS
        with tf.name_scope('layer'+str(l)+'_loss'):
            if err_alg!=3:
                L[l] = tf.reduce_mean((x[l] - tf.stop_gradient(x_[l]))**2, name='Loss') # note: stop_gradients not necessary
    for l in range(2, layers+1):
        with tf.name_scope('layer'+str(l)+'_loss_inv'):
            if err_alg==0:
                L_inv[l] = tf.reduce_mean((g(l, tf.stop_gradient(f(l, x[l-1]+eps[l-1], act)), act) - tf.stop_gradient(x[l-1]+eps[l-1]))**2, name='L_inv')
            if err_alg==1:
                pass
            if err_alg==2:
                # STILL TESTING
                # NOTE(review): g_full exists only as commented-out code
                # above, and eps0/eps1 entries are never assigned (they stay
                # None), so this branch cannot run as written unless g_full
                # is defined elsewhere in the module.
                # L_inv0 - g as left inverse of f; regardless of what x_0 is, g should send f(x) to x. just use, for now, the activation x+eps
                L_inv0[l] = tf.reduce_mean((g_full(l, f(l, x[l-1]+eps0[l-1], act), x[l-1], act) - (x[l-1]+eps0[l-1]))**2, name='L_inv0')
                # L_inv1 - g as the right inverse of f; regardless of what x_0 is, f should send g(y) to y; make sure to use x_targ as y because that's what matters
                L_inv1[l] = tf.reduce_mean((f(l, g_full(l, x_[l]+eps1[l], x[l-1], act), act) - (x_[l]+eps1[l]))**2, name='L_inv1')
                # L_inv2 - g should send y close to x_0
                L_inv2[l] = tf.reduce_mean((g_full(l, x_[l], x[l-1], act) - x[l-1])**2, name='L_inv2')
                L_inv[l] = beta0*L_inv0[l] + beta1*L_inv1[l] + beta2*L_inv2[l]
                # L_inv[l] = tf.add(L_inv1[l], beta*L_inv2[l], name='L_inv')
                # L_inv[l] = tf.add(tf.reduce_mean(0.5*(f(l, g_full(l, x_[l]+eps[l], x[l], x[l-1])) - x_[l]-eps[l])**2), beta*tf.reduce_mean(0.5*(g_full(l, x_[l], x[l], x[l-1]) - x[l-1])**2), name='L_inv') # triple check -- where to put beta, where to put reduce_means?

    # optimizers
    # err_alg != 3: one optimizer per layer on its local loss (L[layers] is
    # the global loss). err_alg == 3: a single optimizer on the global loss
    # over all W and b.
    if err_alg!=3:
        for l in range(1, layers+1):
            with tf.name_scope('layer'+str(l)+'_opts'):
                train_op_L[l] = tf.train.RMSPropOptimizer(learning_rate, name='Opt').minimize(L[l], var_list=[W[l], b[l]])
    if err_alg==0 or err_alg==2:
        for l in range(2, layers+1):
            with tf.name_scope('layer'+str(l)+'_opts_inv'):
                train_op_inv[l] = tf.train.RMSPropOptimizer(learning_rate_inv, name='Opt_inv').minimize(L_inv[l], var_list=[V[l], c[l]])
    if err_alg==3:
        train_op_L[-1] = tf.train.RMSPropOptimizer(learning_rate, name='Opt').minimize(L[-1], var_list=[i for i in W+b if i is not None])

    if mode == 'classification':
        correct_prediction = tf.equal(tf.argmax(x[-1], 1), tf.argmax(y,1)) # note: normally, tf.nn.softmax(x[-1]), but we already softmax'd
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    elif mode == 'autoencoder':
        accuracy = tf.constant(0) # :(

    # clean up
    # Drop the unused slots so the lists can be passed straight to sess.run;
    # train_op_inv ends up empty for err_alg 1 and 3.
    train_op_L = [i for i in train_op_L if i is not None]
    train_op_inv = [i for i in train_op_inv if i is not None]

    # tensorboard
    with tf.name_scope('key_summaries'):
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('global_loss', L[-1])
    with tf.name_scope('layer_losses'):
        for l in range(layers+1):
            if L[l] is not None:
                tf.summary.scalar('L'+str(l), L[l])
            if L_inv[l] is not None:
                tf.summary.scalar('L_inv'+str(l), L_inv[l])
    with tf.name_scope('weights'):
        # eval() is applied to fixed local-variable names only, to pair each
        # list with its name for the summary tag.
        for varlist in ['W', 'V', 'b', 'c']:
            for iv, var in enumerate(eval(varlist)):
                if var is not None:
                    tf.summary.histogram(varlist+str(iv), var)
    with tf.name_scope('grads'):
        for varlist in ['W', 'b']:
            for iv, var in enumerate(eval(varlist)):
                if var is not None and L[iv] is not None:
                    tf.summary.histogram('grad'+varlist+str(iv), tf.gradients(L[iv], [var])[0]) # does this actually recompute gradients? if so, whatevs
        for varlist in ['V', 'c']:
            for iv, var in enumerate(eval(varlist)):
                if var is not None and L_inv[iv] is not None:
                    tf.summary.histogram('grad'+varlist+str(iv), tf.gradients(L_inv[iv], [var])[0])
    merged_summary_op = tf.summary.merge_all()

    ### TRAIN ###
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    make_dir('/tmp/targ-prop/')
    # Run id = number of existing entries in the log directory + 1.
    run = str(len(os.listdir('/tmp/targ-prop'))+1)
    print 'Run: '+run
    summary_writer = tf.summary.FileWriter('/tmp/targ-prop/'+str(run), sess.graph)
    for i in range(t_steps):
        x_batch, y_batch = data.next_batch(batch_size)
        feed_dict = {x_in: x_batch, y: y_batch, epoch: i}
        # Inverse (feedback) ops run before the feedforward ops on each batch.
        sess.run(train_op_inv, feed_dict=feed_dict)
        sess.run(train_op_L, feed_dict=feed_dict)
        if i % 25 == 0:
            loss_val, summary_str, acc_val = sess.run([L[-1], merged_summary_op, accuracy], feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, i)
        if i % 200 == 0:
            # Evaluate on the whole test set in one feed; loss_val/acc_val
            # come from the i % 25 branch above, which also fires here.
            x_test, y_test = data_test.inputs, data_test.outputs
            feed_dict = {x_in: x_test, y: y_test, epoch: i}
            loss_val_test, acc_val_test = sess.run([L[-1], accuracy], feed_dict=feed_dict)
            print "iter:", "%04d" % (i), \
                  "| TRAINING ", \
                  "loss:", "{:.4f}".format(loss_val), \
                  "accuracy:", "{:.4f}".format(acc_val), \
                  "| TEST ", \
                  "loss:", "{:.4f}".format(loss_val_test), \
                  "accuracy:", "{:.4f}".format(acc_val_test)
    print "finished"
    if return_sess:
        return sess
    else:
        sess.close()
        return
def rnn_cell(self, reuse=False):
    """Return a new GRU cell of size ``self.rnn_size``.

    The cell's kernel uses an orthogonal initializer; ``reuse`` is forwarded
    unchanged to the ``GRUCell`` constructor.
    """
    num_units = self.rnn_size
    orthogonal_init = tf.orthogonal_initializer()
    return tf.nn.rnn_cell.GRUCell(
        num_units, kernel_initializer=orthogonal_init, reuse=reuse)
def __init__(self,seq_len=200,first_read=50,rnn_size=200):
    """Build the training and generation graphs of the character model.

    Training part: a single sequence of ``seq_len`` character indices is
    one-hot encoded and run through a GRU; every output after the first
    ``first_read`` steps is used to predict the following character.
    Generation part: starting from the final state and last input character,
    the same GRU cell and dense layer are unrolled for 100 greedy steps.

    Args:
        seq_len: length of the input sequence; ``self.input`` has shape
            ``[1, seq_len]``.
        first_read: number of leading steps whose outputs are excluded from
            the loss.
        rnn_size: size passed to ``GRUCell``.
    """
    self.seq_len = seq_len
    self.first_read = first_read

    # dictionary of possible characters
    self.chars = list("abcdefghijklmnopqrstuvwxyz1234567890-.,!?()'\" ")
    self.num_chars = len(self.chars)

    # dictionaries mapping characters to indices and back
    self.char2idx = {char: i for (i, char) in enumerate(self.chars)}
    self.idx2char = dict(enumerate(self.chars))

    # ---- training portion of language model ----

    # Input sequence of character indices. The assignment had been commented
    # out (and replaced by unrelated x/y/seqlen placeholders that referenced
    # names not defined in this method) although self.input is read right
    # below.
    self.input = tf.placeholder(tf.int32, [1, seq_len])

    # convert to one hot
    one_hot = tf.one_hot(self.input, self.num_chars)

    # rnn layer; the batch holds exactly one sequence, of full length
    self.gru = GRUCell(rnn_size)
    outputs, states = tf.nn.dynamic_rnn(
        self.gru, one_hot, sequence_length=[seq_len], dtype=tf.float32)
    outputs = tf.squeeze(outputs, [0])

    # ignore all outputs during first read steps
    outputs = outputs[first_read:-1]

    # softmax logit to predict next character (actual softmax is applied in
    # cross entropy function)
    logits = tf.layers.dense(outputs, self.num_chars, None, True,
                             tf.orthogonal_initializer(), name='dense')

    # target character at each step (after first read chars) is following
    # character
    targets = one_hot[0, first_read+1:]

    # loss and train functions
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets))
    self.optimizer = tf.train.AdamOptimizer(0.0002, 0.9, 0.999).minimize(self.loss)

    # ---- generation portion of language model ----

    # use output and state from last word in training sequence
    state = tf.expand_dims(states[-1], 0)
    output = one_hot[:, -1]

    # save predicted characters to list
    self.predictions = []

    # generate 100 new characters that come after input sequence
    for _ in range(100):
        # run GRU cell and softmax (dense layer weights are shared with the
        # training graph through reuse=True)
        output, state = self.gru(output, state)
        logits = tf.layers.dense(output, self.num_chars, None, True,
                                 tf.orthogonal_initializer(), name='dense',
                                 reuse=True)

        # get index of most probable character
        output = tf.argmax(tf.nn.softmax(logits), 1)

        # save predicted character to list
        self.predictions.append(output)

        # one hot and cast to float for GRU API
        output = tf.cast(tf.one_hot(output, self.num_chars), tf.float32)

    # init op
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
def build_inference(self, reuse=False):
    """Build the one-step inference graph that produces the next states.

    Placeholders are created for the current video feature, the sentence
    (plus its mask and, if ``options['bidirectional_lstm_sentence']`` is set,
    a second sentence input for the backward LSTM) and the previous c/h
    states of the video and interactor LSTMs. The sentence is encoded with
    ``dynamic_rnn``; the video LSTM is advanced one step; a two-layer
    attention network weights the encoded words; the interactor LSTM is
    advanced one step on the video state and attended words; a final
    fully-connected layer followed by a sigmoid scores
    ``options['num_anchors']`` proposals.

    Args:
        reuse: forwarded to the 'sentence_encoding', 'interactor' and
            'word_attention' variable scopes.

    Returns:
        ``(inputs, outputs)``: two dicts mapping names to the input
        placeholders and to the proposal scores / updated LSTM states.
    """
    opts = self.options
    rnn_size = opts['rnn_size']
    max_len = opts['max_sentence_len']
    bidirectional = opts['bidirectional_lstm_sentence']
    # Feature width of one encoded word: doubled when the backward encoding
    # is concatenated to the forward one.
    encoded_dim = (1 + int(bidirectional)) * rnn_size

    def new_lstm_cell():
        return tf.contrib.rnn.LSTMCell(
            num_units=rnn_size,
            state_is_tuple=True,
            initializer=tf.orthogonal_initializer()
        )

    def state_placeholder():
        return tf.placeholder(tf.float32, [None, rnn_size])

    inputs = {}
    outputs = {}

    # ---- placeholders (creation order kept as in the original graph) ----
    video_feat = tf.placeholder(
        tf.float32, [None, opts['video_feat_dim']], name='video_feat')
    sentence = tf.placeholder(
        tf.float32, [None, max_len, opts['word_embed_size']])
    sentence_mask = tf.placeholder(tf.float32, [None, None])
    if bidirectional:
        sentence_bw = tf.placeholder(
            tf.float32, [None, max_len, opts['word_embed_size']])
        inputs['sentence_bw'] = sentence_bw

    video_c_state = state_placeholder()
    video_h_state = state_placeholder()
    interactor_c_state = state_placeholder()
    interactor_h_state = state_placeholder()

    for key, tensor in (('video_feat', video_feat),
                        ('sentence', sentence),
                        ('sentence_mask', sentence_mask),
                        ('video_c_state', video_c_state),
                        ('video_h_state', video_h_state),
                        ('interactor_c_state', interactor_c_state),
                        ('interactor_h_state', interactor_h_state)):
        inputs[key] = tensor

    video_state = tf.nn.rnn_cell.LSTMStateTuple(video_c_state, video_h_state)
    interactor_state = tf.nn.rnn_cell.LSTMStateTuple(
        interactor_c_state, interactor_h_state)

    batch_size = tf.shape(video_feat)[0]

    rnn_cell_sentence = new_lstm_cell()
    rnn_cell_video = new_lstm_cell()
    rnn_cell_interator = new_lstm_cell()

    # ---- sentence encoding ----
    with tf.variable_scope('sentence_encoding', reuse=reuse):
        # sequence_length = tf.fill([batch_size, ], self.options['max_sentence_len'])
        sequence_length = tf.reduce_sum(sentence_mask, axis=-1)
        initial_state = rnn_cell_sentence.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        sentence_states, sentence_final_state = tf.nn.dynamic_rnn(
            cell=rnn_cell_sentence,
            inputs=sentence,
            sequence_length=sequence_length,
            initial_state=initial_state,
            dtype=tf.float32
        )

        if bidirectional:
            rnn_cell_sentence_bw = new_lstm_cell()
            with tf.variable_scope('sentence_bw'):
                sentence_states_bw, sentence_final_state_bw = tf.nn.dynamic_rnn(
                    cell=rnn_cell_sentence_bw,
                    inputs=sentence_bw,
                    sequence_length=sequence_length,
                    initial_state=initial_state,
                    dtype=tf.float32
                )
            # Reverse the backward encoding over each sequence's length, then
            # join it with the forward encoding along the feature axis.
            sentence_states_bw = tf.reverse_sequence(
                sentence_states_bw,
                seq_lengths=tf.to_int32(sequence_length),
                seq_axis=1)
            sentence_states = tf.concat(
                [sentence_states, sentence_states_bw], axis=-1)

    # ---- interactor ----
    with tf.variable_scope('interactor', reuse=reuse):
        sentence_states_reshape = tf.reshape(
            sentence_states, [-1, encoded_dim])

        # get video state
        with tf.variable_scope('video_rnn'):
            _, video_state = rnn_cell_video(
                inputs=video_feat, state=video_state)
        video_c_state, video_h_state = video_state

        # calculate attention over words with a two-layer network
        with tf.variable_scope('word_attention', reuse=reuse):
            joint_h = tf.concat([interactor_h_state, video_h_state], axis=-1)
            h_states = tf.tile(joint_h, [1, max_len])
            h_states = tf.reshape(h_states, [-1, 2 * rnn_size])

            attention_input = tf.concat(
                [h_states, sentence_states_reshape], axis=-1)
            attention_layer1 = tf.contrib.layers.fully_connected(
                inputs=attention_input,
                num_outputs=opts['attention_hidden_size'],
                activation_fn=tf.nn.tanh,
                weights_initializer=tf.contrib.layers.xavier_initializer()
            )
            attention_layer2 = tf.contrib.layers.fully_connected(
                inputs=attention_layer1,
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.contrib.layers.xavier_initializer()
            )

        # one score per word, normalised over the sentence
        attention_reshape = tf.reshape(attention_layer2, [-1, max_len])
        attention_score = tf.nn.softmax(attention_reshape, dim=-1)
        attention_score = tf.reshape(attention_score, [-1, 1, max_len])

        # attended word feature (batched matrix multiplication)
        attended_word_feature = tf.matmul(attention_score, sentence_states)
        attended_word_feature = tf.reshape(
            attended_word_feature, [-1, encoded_dim])

        # calculate next interactor state
        interactor_input = tf.concat(
            [video_h_state, attended_word_feature], axis=-1)
        with tf.variable_scope('interactor_rnn'):
            _, interactor_state = rnn_cell_interator(
                inputs=interactor_input, state=interactor_state)
        interactor_c_state, interactor_h_state = interactor_state

        with tf.variable_scope('predict_proposal'):
            logit_output = tf.contrib.layers.fully_connected(
                inputs=interactor_h_state,
                num_outputs=opts['num_anchors'],
                activation_fn=None
            )

        # score
        proposal_score = tf.sigmoid(logit_output, name='proposal_scores')

    for key, tensor in (('proposal_score', proposal_score),
                        ('video_c_state', video_c_state),
                        ('video_h_state', video_h_state),
                        ('interactor_c_state', interactor_c_state),
                        ('interactor_h_state', interactor_h_state)):
        outputs[key] = tensor

    return inputs, outputs
def build_caption_greedy_inference(self, reuse=False):
    """Build the greedy caption-decoding graph for a batch of proposals.

    For each of ``caption_seq_len - 1`` steps the graph attends over the
    proposal feature sequence, optionally combines the attended feature with
    the event context (controlled by the ``no_context`` and
    ``context_gating`` options), advances the multi-layer LSTM and feeds the
    arg-max word back in as the next input.

    Args:
        reuse: Forwarded to the ``caption_module`` variable scope and to the
            nested ``attention``, ``context_gating`` and ``logits`` scopes.

    Returns:
        ``(inputs, outputs)`` dicts. ``inputs`` holds the placeholders
        ``'event_hidden_feats'`` and ``'proposal_feats'``. ``outputs`` holds
        ``'word_ids'`` (the ``<START>`` id followed by the predicted id of
        every step) and ``'word_confidences'`` (log of the per-word softmax
        maximum clipped to ``[1e-20, 1]``; the ``<START>`` column is
        ``log(1) = 0``).
    """
    inputs = {}
    outputs = {}

    # Proposal feature sequences. Localized proposals/events can be of
    # different length; 'max_proposal_len' fixes the time dimension so the
    # batch can be processed at once.
    proposal_feats = tf.placeholder(tf.float32, [
        None, self.options['max_proposal_len'],
        self.options['video_feat_dim']
    ])

    # Combination of forward and backward hidden state, which encodes the
    # event context information.
    event_hidden_feats = tf.placeholder(
        tf.float32, [None, 2 * self.options['rnn_size']])

    inputs['event_hidden_feats'] = event_hidden_feats
    inputs['proposal_feats'] = proposal_feats

    # Batch size for inference depends on how many proposals are fed in.
    eval_batch_size = tf.shape(proposal_feats)[0]

    def get_rnn_cell():
        return tf.contrib.rnn.LSTMCell(
            num_units=self.options['rnn_size'],
            state_is_tuple=True,
            initializer=tf.orthogonal_initializer())

    # Multi-layer LSTM used as the caption decoder.
    multi_rnn_cell_caption = tf.contrib.rnn.MultiRNNCell(
        [get_rnn_cell() for _ in range(self.options['num_rnn_layers'])],
        state_is_tuple=True)

    # Start word.
    word_id = tf.fill([eval_batch_size], self.options['vocab']['<START>'])
    word_id = tf.to_int64(word_id)
    word_ids = tf.expand_dims(word_id, axis=-1)

    # Probability (confidence) for the predicted word; 1.0 for <START>.
    word_confidences = tf.expand_dims(
        tf.fill([eval_batch_size], 1.), axis=-1)

    with tf.variable_scope('caption_module', reuse=reuse) as caption_scope:

        # Initial memory cell and hidden output. The state is a tuple with
        # one (c, h) entry per cell of the MultiRNNCell.
        state = multi_rnn_cell_caption.zero_state(
            batch_size=eval_batch_size, dtype=tf.float32)

        proposal_feats_reshape = tf.reshape(
            proposal_feats, [-1, self.options['video_feat_dim']],
            name='video_feat_reshape')

        # Only the first iteration creates variables; later iterations
        # switch the scope to reuse mode so the same weights are shared.
        for i in range(self.options['caption_seq_len'] - 1):

            if i > 0:
                caption_scope.reuse_variables()

            # Word embedding of the previously emitted word.
            word_embed = self.build_caption_embedding(word_id)

            # s[1] is the hidden output of each layer's state tuple.
            h_state = tf.concat([s[1] for s in state], axis=-1)
            h_state_tile = tf.tile(
                h_state, [1, self.options['max_proposal_len']])
            h_state_reshape = tf.reshape(h_state_tile, [
                -1,
                self.options['num_rnn_layers'] * self.options['rnn_size']
            ])

            # Repeat to match each feature vector in the localized proposal.
            event_hidden_feats_tile = tf.tile(
                event_hidden_feats, [1, self.options['max_proposal_len']])
            event_hidden_feats_reshape = tf.reshape(
                event_hidden_feats_tile,
                [-1, 2 * self.options['rnn_size']])

            feat_state_concat = tf.concat([
                proposal_feats_reshape, h_state_reshape,
                event_hidden_feats_reshape
            ], axis=-1, name='feat_state_concat')

            # Two-layer network modelling temporal soft attention over the
            # proposal feature sequence when predicting the next word.
            with tf.variable_scope('attention', reuse=reuse):
                attention_layer1 = tf.contrib.layers.fully_connected(
                    inputs=feat_state_concat,
                    num_outputs=self.options['attention_hidden_size'],
                    activation_fn=tf.nn.tanh,
                    weights_initializer=tf.contrib.layers.
                    xavier_initializer())
                attention_layer2 = tf.contrib.layers.fully_connected(
                    inputs=attention_layer1,
                    num_outputs=1,
                    activation_fn=None,
                    weights_initializer=tf.contrib.layers.
                    xavier_initializer())

            # Reshape to one score per proposal time step.
            attention_reshape = tf.reshape(
                attention_layer2, [-1, self.options['max_proposal_len']],
                name='attention_reshape')
            attention_score = tf.nn.softmax(
                attention_reshape, dim=-1, name='attention_score')
            attention = tf.reshape(
                attention_score, [-1, 1, self.options['max_proposal_len']],
                name='attention')

            # Attended video feature.
            attended_proposal_feat = tf.matmul(
                attention, proposal_feats, name='attended_proposal_feat')
            attended_proposal_feat_reshape = tf.reshape(
                attended_proposal_feat,
                [-1, self.options['video_feat_dim']],
                name='attended_proposal_feat_reshape')

            # Whether to use proposal contexts to help generate the caption.
            if self.options['no_context']:
                proposal_feats_full = attended_proposal_feat_reshape
            else:
                # Whether to use a gating function to combine the contexts.
                if self.options['context_gating']:
                    # Model a gate weighting each element of context and
                    # feature.
                    attended_proposal_feat_reshape = tf.nn.tanh(
                        attended_proposal_feat_reshape)
                    with tf.variable_scope('context_gating', reuse=reuse):
                        # The context is used as-is. A disabled alternative
                        # projected event_hidden_feats to 'video_feat_dim'
                        # with a linear fully-connected layer first.
                        context_feats_transform = event_hidden_feats

                        proposal_feats_transform = \
                            tf.contrib.layers.fully_connected(
                                inputs=attended_proposal_feat_reshape,
                                num_outputs=2 * self.options['rnn_size'],
                                activation_fn=tf.nn.tanh,
                                weights_initializer=tf.contrib.layers.
                                xavier_initializer())

                        gate = tf.contrib.layers.fully_connected(
                            inputs=tf.concat([
                                word_embed, h_state,
                                context_feats_transform,
                                proposal_feats_transform
                            ], axis=-1),
                            num_outputs=2 * self.options['rnn_size'],
                            activation_fn=tf.nn.sigmoid,
                            weights_initializer=tf.contrib.layers.
                            xavier_initializer())

                        gated_context_feats = tf.multiply(
                            context_feats_transform, gate)
                        gated_proposal_feats = tf.multiply(
                            proposal_feats_transform, 1. - gate)
                        proposal_feats_full = tf.concat(
                            [gated_context_feats, gated_proposal_feats],
                            axis=-1)
                else:
                    proposal_feats_full = tf.concat([
                        event_hidden_feats, attended_proposal_feat_reshape
                    ], axis=-1)

            # Proposal feature embedded into word space.
            proposal_feat_embed = self.build_video_feat_embedding(
                proposal_feats_full)

            # Get next state.
            caption_output, state = multi_rnn_cell_caption(
                tf.concat([proposal_feat_embed, word_embed], axis=-1),
                state)

            # Predict next word.
            with tf.variable_scope('logits', reuse=reuse):
                logits = tf.contrib.layers.fully_connected(
                    inputs=caption_output,
                    num_outputs=self.options['vocab_size'],
                    activation_fn=None)

            softmax = tf.nn.softmax(logits, name='softmax')
            word_id = tf.argmax(softmax, axis=-1)
            word_confidence = tf.reduce_max(softmax, axis=-1)
            word_ids = tf.concat(
                [word_ids, tf.expand_dims(word_id, axis=-1)], axis=-1)
            word_confidences = tf.concat([
                word_confidences,
                tf.expand_dims(word_confidence, axis=-1)
            ], axis=-1)

    # Clip before the log so a zero probability cannot produce -inf.
    word_confidences = tf.log(tf.clip_by_value(word_confidences, 1e-20, 1.))

    outputs['word_ids'] = word_ids
    outputs['word_confidences'] = word_confidences

    return inputs, outputs
def build_train(self):
    """Build the training graph of the sentence-conditioned proposal model.

    The sentence is encoded with an LSTM (plus a backward LSTM when
    ``options['bidirectional_lstm_sentence']`` is set). For each of
    ``options['sample_len']`` video steps a video LSTM is advanced, word
    attention is computed from the interactor and video hidden states, the
    interactor LSTM is advanced, and per-anchor logits are predicted.

    Returns:
        ``(inputs, outputs)`` dicts. ``inputs`` holds the placeholders
        ``'video_feat'``, ``'video_feat_mask'``, ``'anchor_mask'``,
        ``'sentence'``, ``'sentence_mask'``, ``'proposal'``,
        ``'proposal_weight'``, ``'dropout'`` and, in the bidirectional
        case only, ``'sentence_bw'``. ``outputs`` holds ``'loss'`` (masked,
        positively-weighted sigmoid cross-entropy normalized by the sum of
        ``video_feat_mask``) and ``'reg_loss'`` (sum of ``tf.nn.l2_loss``
        over all trainable variables).
    """
    inputs = {}
    outputs = {}

    video_feat = tf.placeholder(
        tf.float32, [None, None, self.options['video_feat_dim']],
        name='video_feat')
    video_feat_mask = tf.placeholder(tf.float32, [None, None])
    anchor_mask = tf.placeholder(
        tf.float32, [None, None, self.options['num_anchors']])
    sentence = tf.placeholder(
        tf.float32, [None, None, self.options['word_embed_size']])
    sentence_mask = tf.placeholder(tf.float32, [None, None])
    if self.options['bidirectional_lstm_sentence']:
        sentence_bw = tf.placeholder(
            tf.float32,
            [None, self.options['max_sentence_len'],
             self.options['word_embed_size']])
        inputs['sentence_bw'] = sentence_bw

    inputs['video_feat'] = video_feat
    inputs['video_feat_mask'] = video_feat_mask
    inputs['anchor_mask'] = anchor_mask
    inputs['sentence'] = sentence
    inputs['sentence_mask'] = sentence_mask

    # Proposal labels, densely annotated.
    proposal = tf.placeholder(
        tf.int32, [None, None, self.options['num_anchors']],
        name='proposal')
    inputs['proposal'] = proposal

    # Weighting for positive/negative labels (data imbalance).
    proposal_weight = tf.placeholder(
        tf.float32, [self.options['num_anchors'], 2],
        name='proposal_weight')
    inputs['proposal_weight'] = proposal_weight

    # Dropout rate; the cells below keep with probability 1 - dropout.
    dropout = tf.placeholder(tf.float32)
    inputs['dropout'] = dropout

    # Batch size as a scalar tensor.
    batch_size = tf.shape(video_feat)[0]

    rnn_cell_sentence = tf.contrib.rnn.LSTMCell(
        num_units=self.options['rnn_size'],
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer()
    )
    rnn_cell_video = tf.contrib.rnn.LSTMCell(
        num_units=self.options['rnn_size'],
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer()
    )
    rnn_cell_interator = tf.contrib.rnn.LSTMCell(
        num_units=self.options['rnn_size'],
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer()
    )

    rnn_cell_sentence = tf.contrib.rnn.DropoutWrapper(
        rnn_cell_sentence,
        input_keep_prob=1.0 - dropout,
        output_keep_prob=1.0 - dropout
    )
    rnn_cell_video = tf.contrib.rnn.DropoutWrapper(
        rnn_cell_video,
        input_keep_prob=1.0 - dropout,
        output_keep_prob=1.0 - dropout
    )
    rnn_cell_interator = tf.contrib.rnn.DropoutWrapper(
        rnn_cell_interator,
        input_keep_prob=1.0 - dropout,
        output_keep_prob=1.0 - dropout
    )

    with tf.variable_scope('sentence_encoding'):
        # Per-example sentence length derived from the mask.
        sequence_length = tf.reduce_sum(sentence_mask, axis=-1)
        initial_state = rnn_cell_sentence.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        sentence_states, _ = tf.nn.dynamic_rnn(
            cell=rnn_cell_sentence,
            inputs=sentence,
            sequence_length=sequence_length,
            initial_state=initial_state,
            dtype=tf.float32
        )

        if self.options['bidirectional_lstm_sentence']:
            rnn_cell_sentence_bw = tf.contrib.rnn.LSTMCell(
                num_units=self.options['rnn_size'],
                state_is_tuple=True,
                initializer=tf.orthogonal_initializer()
            )
            with tf.variable_scope('sentence_bw'):
                sentence_states_bw, _ = tf.nn.dynamic_rnn(
                    cell=rnn_cell_sentence_bw,
                    inputs=sentence_bw,
                    sequence_length=sequence_length,
                    initial_state=initial_state,
                    dtype=tf.float32
                )
            # Flip the backward outputs back into forward word order.
            sentence_states_bw = tf.reverse_sequence(
                sentence_states_bw,
                seq_lengths=tf.to_int32(sequence_length),
                seq_axis=1)
            sentence_states = tf.concat(
                [sentence_states, sentence_states_bw], axis=-1)

    # Zero-length accumulator; one [batch, 1, num_anchors] slice is
    # appended per video step.
    logit_outputs = tf.fill(
        [batch_size, 0, self.options['num_anchors']], 0.)

    with tf.variable_scope('interactor') as interactor_scope:
        interactor_state = rnn_cell_interator.zero_state(
            batch_size=batch_size, dtype=tf.float32)
        video_state = rnn_cell_video.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        sentence_states_reshape = tf.reshape(
            sentence_states,
            [-1, (1 + int(self.options['bidirectional_lstm_sentence']))
             * self.options['rnn_size']])

        for i in range(self.options['sample_len']):
            # Variables are created on the first step and shared afterwards.
            if i > 0:
                interactor_scope.reuse_variables()

            # Get video state.
            with tf.variable_scope('video_rnn'):
                _, video_state = rnn_cell_video(
                    inputs=video_feat[:, i, :], state=video_state)

            # Attention over words from the current hidden states.
            with tf.variable_scope('word_attention'):
                h_states = tf.tile(
                    tf.concat([interactor_state[1], video_state[1]],
                              axis=-1),
                    [1, self.options['max_sentence_len']])
                h_states = tf.reshape(
                    h_states, [-1, 2 * self.options['rnn_size']])
                attention_input = tf.concat(
                    [h_states, sentence_states_reshape], axis=-1)

                attention_layer1 = tf.contrib.layers.fully_connected(
                    inputs=attention_input,
                    num_outputs=self.options['attention_hidden_size'],
                    activation_fn=tf.nn.tanh,
                    weights_initializer=tf.contrib.layers.xavier_initializer()
                )
                attention_layer2 = tf.contrib.layers.fully_connected(
                    inputs=attention_layer1,
                    num_outputs=1,
                    activation_fn=None,
                    weights_initializer=tf.contrib.layers.xavier_initializer()
                )

                # Reshape to one score per word.
                attention_reshape = tf.reshape(
                    attention_layer2,
                    [-1, self.options['max_sentence_len']])
                attention_score = tf.nn.softmax(attention_reshape, axis=-1)
                attention_score = tf.reshape(
                    attention_score,
                    [-1, 1, self.options['max_sentence_len']])

                # Attended word feature.
                attended_word_feature = tf.matmul(
                    attention_score, sentence_states)
                attended_word_feature = tf.reshape(
                    attended_word_feature,
                    [-1,
                     (1 + int(self.options['bidirectional_lstm_sentence']))
                     * self.options['rnn_size']])

            # Calculate next interactor state.
            interactor_input = tf.concat(
                [video_state[1], attended_word_feature], axis=-1)
            with tf.variable_scope('interactor_rnn'):
                _, interactor_state = rnn_cell_interator(
                    inputs=interactor_input, state=interactor_state)

            with tf.variable_scope('predict_proposal'):
                logit_output = tf.contrib.layers.fully_connected(
                    inputs=interactor_state[1],
                    num_outputs=self.options['num_anchors'],
                    activation_fn=None
                )

            logit_output = tf.expand_dims(logit_output, axis=1)
            logit_outputs = tf.concat([logit_outputs, logit_output], axis=1)

    logit_outputs = tf.reshape(
        logit_outputs, [-1, self.options['num_anchors']])

    # Column 0 of proposal_weight is the positive-sample weight. The
    # negative-sample weight (column 1) is not used by the loss below.
    proposal_weight0 = tf.reshape(
        proposal_weight[:, 0], [-1, self.options['num_anchors']])
    proposal_weight0 = tf.tile(
        proposal_weight0, [tf.shape(logit_outputs)[0], 1])

    # Weighted sigmoid cross-entropy via the TensorFlow built-in.
    proposal = tf.reshape(proposal, [-1, self.options['num_anchors']])
    proposal_loss_term = tf.nn.weighted_cross_entropy_with_logits(
        targets=tf.to_float(proposal),
        logits=logit_outputs,
        pos_weight=proposal_weight0)
    if self.options['anchor_mask']:
        proposal_loss_term = tf.reshape(
            anchor_mask,
            [-1, self.options['num_anchors']]) * proposal_loss_term

    proposal_loss_term = tf.reduce_sum(proposal_loss_term, axis=-1)
    proposal_loss_term = tf.reshape(proposal_loss_term, [-1])

    # Average over the time steps selected by video_feat_mask.
    video_feat_mask = tf.reshape(video_feat_mask, [-1])
    proposal_loss = tf.reduce_sum(
        (video_feat_mask * proposal_loss_term)) / tf.to_float(
            tf.reduce_sum(video_feat_mask))

    # Summary data, for visualization using Tensorboard.
    tf.summary.scalar('proposal_loss', proposal_loss)

    # Outputs from proposal module.
    outputs['loss'] = proposal_loss

    reg_loss = tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
    outputs['reg_loss'] = reg_loss

    return inputs, outputs
def build_proposal_inference(self, reuse=False):
    """Build the proposal-scoring inference graph for one video.

    A forward and a backward feature sequence are each run through their own
    LSTM encoder, and a linear layer turns every encoder output into one
    logit per anchor; sigmoid of those logits is the proposal score.

    Args:
        reuse: Forwarded to the ``proposal_module`` variable scope and to
            each nested encoder / predictor scope.

    Returns:
        ``(inputs, outputs)`` dicts. ``inputs`` holds the placeholders
        ``'video_feat_fw'`` and ``'video_feat_bw'``. ``outputs`` holds
        ``'proposal_score_fw'``, ``'proposal_score_bw'`` and the flattened
        encoder outputs ``'rnn_outputs_fw'`` / ``'rnn_outputs_bw'``.
    """
    feat_dim = self.options['video_feat_dim']
    hidden_size = self.options['rnn_size']

    # Not configurable: the graph is built for a batch size of exactly 1.
    batch_size = 1

    # Feature placeholders: dim1 batch, dim2 sequence length, dim3 feature.
    video_feat_fw = tf.placeholder(
        tf.float32, [None, None, feat_dim], name='video_feat_fw')
    video_feat_bw = tf.placeholder(
        tf.float32, [None, None, feat_dim], name='video_feat_bw')
    inputs = {
        'video_feat_fw': video_feat_fw,
        'video_feat_bw': video_feat_bw,
    }

    cell_fw = tf.contrib.rnn.LSTMCell(
        num_units=hidden_size,
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer())
    cell_bw = tf.contrib.rnn.LSTMCell(
        num_units=hidden_size,
        state_is_tuple=True,
        initializer=tf.orthogonal_initializer())

    def encode_and_score(feats, cell, encoder_scope, reshape_name,
                         predictor_scope):
        # Encode one direction, then predict anchor logits per time step.
        with tf.variable_scope(encoder_scope, reuse=reuse):
            seq_len = tf.expand_dims(tf.shape(feats)[1], axis=0)
            start_state = cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            encoded, _ = tf.nn.dynamic_rnn(
                cell=cell,
                inputs=feats,
                sequence_length=seq_len,
                initial_state=start_state,
                dtype=tf.float32)

        flat = tf.reshape(encoded, [-1, hidden_size], name=reshape_name)

        with tf.variable_scope(predictor_scope, reuse=reuse):
            anchor_logits = tf.contrib.layers.fully_connected(
                inputs=flat,
                num_outputs=self.options['num_anchors'],
                activation_fn=None)
        return flat, anchor_logits

    with tf.variable_scope('proposal_module', reuse=reuse):
        flat_fw, logits_fw = encode_and_score(
            video_feat_fw, cell_fw, 'video_encoder_fw',
            'rnn_outputs_fw_reshape', 'predict_proposal_fw')
        flat_bw, logits_bw = encode_and_score(
            video_feat_bw, cell_bw, 'video_encoder_bw',
            'rnn_outputs_bw_reshape', 'predict_proposal_bw')

    # Scores first (forward, then backward), then the raw encoder outputs.
    outputs = {
        'proposal_score_fw': tf.sigmoid(logits_fw, name='proposal_score_fw'),
        'proposal_score_bw': tf.sigmoid(logits_bw, name='proposal_score_bw'),
        'rnn_outputs_fw': flat_fw,
        'rnn_outputs_bw': flat_bw,
    }

    return inputs, outputs
def build_model(self):
    """Build the full TensorFlow graph and attach its tensors to ``self``.

    Creates the input placeholders, the attention / GRU-style update
    parameters, a bidirectional LSTM whose outputs are turned into a
    location-weighted memory, ``n_hop`` rounds of attention over that memory
    with a gated update of the episode vector ``e``, the softmax
    cross-entropy cost with an Adam training op, prediction metrics, and the
    summary ops / file writers.
    """
    with tf.name_scope('inputs'):
        self.sentences = tf.placeholder(tf.int32, [None, self.max_sentence_len])
        self.aspects = tf.placeholder(tf.int32, [None, self.max_aspect_len])
        self.sentence_lens = tf.placeholder(tf.int32, None)
        self.sentence_locs = tf.placeholder(tf.float32, [None, self.max_sentence_len])
        self.labels = tf.placeholder(tf.int32, [None, self.n_class])
        self.dropout_keep_prob = tf.placeholder(tf.float32)

        # Sentence embeddings get dropout; aspect embeddings are averaged
        # over the aspect tokens (axis 1).
        inputs = tf.nn.embedding_lookup(self.word2vec, self.sentences)
        inputs = tf.cast(inputs, tf.float32)
        inputs = tf.nn.dropout(inputs, keep_prob=self.dropout_keep_prob)
        aspect_inputs = tf.nn.embedding_lookup(self.word2vec, self.aspects)
        aspect_inputs = tf.cast(aspect_inputs, tf.float32)
        aspect_inputs = tf.reduce_mean(aspect_inputs, 1)

    # NOTE(review): every variable below is given an l2 regularizer, but
    # self.cost further down consists only of the cross-entropy term --
    # confirm whether the regularization losses are meant to be added.
    with tf.name_scope('weights'):
        weights = {
            # One attention row vector per hop. Its width matches the
            # concat [memory row (2*n_hidden + 1), episode (n_hidden),
            # aspect (embedding_dim)] built in the hop body below.
            'attention': tf.get_variable(
                name='W_al',
                shape=[self.n_hop, 1, self.n_hidden * 3 + self.embedding_dim + 1],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
            'gru_r': tf.get_variable(
                name='W_r',
                shape=[self.n_hidden, self.n_hidden * 2 + 1],
                initializer=tf.orthogonal_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
            'gru_z': tf.get_variable(
                name='W_z',
                shape=[self.n_hidden, self.n_hidden * 2 + 1],
                initializer=tf.orthogonal_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
            'gru_g': tf.get_variable(
                name='W_g',
                shape=[self.n_hidden, self.n_hidden],
                initializer=tf.orthogonal_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
            'gru_x': tf.get_variable(
                name='W_x',
                shape=[self.n_hidden, self.n_hidden * 2 + 1],
                initializer=tf.orthogonal_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
            'softmax': tf.get_variable(
                name='W_l',
                shape=[self.n_hidden, self.n_class],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
        }

    with tf.name_scope('biases'):
        biases = {
            'attention': tf.get_variable(
                name='B_al',
                shape=[self.n_hop, 1, self.max_sentence_len],
                initializer=tf.zeros_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
            'softmax': tf.get_variable(
                name='B_l',
                shape=[self.n_class],
                initializer=tf.zeros_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
        }

    with tf.name_scope('updates'):
        # Recurrent (episode-to-episode) matrices of the gated update.
        updates = {
            'gru_r': tf.get_variable(
                name='U_r',
                shape=[self.n_hidden, self.n_hidden],
                initializer=tf.orthogonal_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
            'gru_z': tf.get_variable(
                name='U_z',
                shape=[self.n_hidden, self.n_hidden],
                initializer=tf.orthogonal_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            ),
        }

    with tf.name_scope('dynamic_rnn'):
        lstm_cell_fw = tf.contrib.rnn.LSTMCell(
            self.n_hidden,
            initializer=tf.orthogonal_initializer(),
        )
        lstm_cell_bw = tf.contrib.rnn.LSTMCell(
            self.n_hidden,
            initializer=tf.orthogonal_initializer(),
        )

        # static_bidirectional_rnn takes a list of per-step tensors, hence
        # the transpose to time-major followed by unstack.
        outputs, state, _ = tf.nn.static_bidirectional_rnn(
            lstm_cell_fw,
            lstm_cell_bw,
            tf.unstack(tf.transpose(inputs, perm=[1, 0, 2])),
            sequence_length=self.sentence_lens,
            dtype=tf.float32,
            scope='BiLSTM'
        )
        outputs = tf.reshape(tf.concat(outputs, 1), [-1, self.max_sentence_len, self.n_hidden * 2])
        batch_size = tf.shape(outputs)[0]

        # Unstack along the batch axis so the while_loop below can process
        # one example per iteration.
        outputs_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
        outputs_iter = outputs_iter.unstack(outputs)
        sentence_locs_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
        sentence_locs_iter = sentence_locs_iter.unstack(self.sentence_locs)
        sentence_lens_iter = tf.TensorArray(tf.int32, 1, dynamic_size=True, infer_shape=False)
        sentence_lens_iter = sentence_lens_iter.unstack(self.sentence_lens)
        memory = tf.TensorArray(size=batch_size, dtype=tf.float32)

        def body(i, memory):
            # Scale each hidden state by (1 - location value) and append the
            # location value itself as one extra feature column.
            a = outputs_iter.read(i)
            b = sentence_locs_iter.read(i)
            # NOTE(review): c is read but never used in this body.
            c = sentence_lens_iter.read(i)
            weight = 1 - b
            memory = memory.write(i, tf.concat([tf.multiply(a, tf.tile(tf.expand_dims(weight, -1), [1, self.n_hidden * 2])), tf.reshape(b, [-1, 1])], 1))
            return (i + 1, memory)

        def condition(i, memory):
            return i < batch_size

        _, memory_final = tf.while_loop(cond=condition, body=body, loop_vars=(0, memory))
        self.memories = tf.reshape(memory_final.stack(), [-1, self.max_sentence_len, self.n_hidden * 2 + 1])

        # Episode vector, refined once per hop; starts at zero.
        e = tf.zeros([batch_size, self.n_hidden])
        scores_list = []
        # Repeat the averaged aspect vector for every sentence position.
        aspect_inputs = tf.tile(tf.expand_dims(aspect_inputs, 1), [1, self.max_sentence_len, 1])
        for h in range(self.n_hop):
            memories_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
            memories_iter = memories_iter.unstack(self.memories)
            e_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
            e_iter = e_iter.unstack(e)
            aspect_inputs_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
            aspect_inputs_iter = aspect_inputs_iter.unstack(aspect_inputs)
            sentence_lens_iter = tf.TensorArray(tf.int32, 1, dynamic_size=True, infer_shape=False)
            sentence_lens_iter = sentence_lens_iter.unstack(self.sentence_lens)
            newe = tf.TensorArray(size=batch_size, dtype=tf.float32)
            score = tf.TensorArray(size=batch_size, dtype=tf.float32)

            # body closes over the hop index h; this is safe because
            # tf.while_loop is invoked within the same iteration of the
            # Python for-loop.
            def body(i, newe, score):
                a = memories_iter.read(i)
                olde = e_iter.read(i)
                b = tf.tile(tf.expand_dims(olde, 0), [self.max_sentence_len, 1])
                c = aspect_inputs_iter.read(i)
                l = math_ops.to_int32(sentence_lens_iter.read(i))
                # Attention logits over positions from [memory, episode,
                # aspect] using this hop's weight row and bias.
                g = tf.matmul(weights['attention'][h], tf.transpose(tf.concat([a, b, c], 1), perm=[1, 0])) + biases['attention'][h]
                # Softmax over the first l positions only; the remaining
                # positions get a score of exactly zero.
                score_temp = tf.concat([tf.nn.softmax(tf.slice(g, [0, 0], [1, l])), tf.zeros([1, self.max_sentence_len - l])], 1)
                score = score.write(i, score_temp)
                # Attention-weighted sum of memory rows, as a column vector.
                i_AL = tf.reshape(tf.matmul(score_temp, a), [-1, 1])
                olde = tf.reshape(olde, [-1, 1])
                # Gated update: reset gate r, update gate z, candidate e0.
                r = tf.nn.sigmoid(tf.matmul(weights['gru_r'], i_AL) + tf.matmul(updates['gru_r'], olde))
                z = tf.nn.sigmoid(tf.matmul(weights['gru_z'], i_AL) + tf.matmul(updates['gru_z'], olde))
                e0 = tf.nn.tanh(tf.matmul(weights['gru_x'], i_AL) + tf.matmul(weights['gru_g'], tf.multiply(r, olde)))
                newe_temp = tf.multiply(1 - z, olde) + tf.multiply(z, e0)
                newe = newe.write(i, newe_temp)
                return (i + 1, newe, score)

            def condition(i, newe, score):
                return i < batch_size

            _, newe_final, score_final = tf.while_loop(cond=condition, body=body, loop_vars=(0, newe, score))
            e = tf.reshape(newe_final.stack(), [-1, self.n_hidden])
            batch_score = tf.reshape(score_final.stack(), [-1, self.max_sentence_len])
            scores_list.append(batch_score)

        # Reorder the stacked per-hop scores to (batch, hop, position).
        self.scores = tf.transpose(tf.reshape(tf.stack(scores_list), [self.n_hop, -1, self.max_sentence_len]), [1, 0, 2])
        self.predict = tf.matmul(e, weights['softmax']) + biases['softmax']

    with tf.name_scope('loss'):
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.predict, labels = self.labels))
        self.global_step = tf.Variable(0, name="tr_global_step", trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost, global_step=self.global_step)

    with tf.name_scope('predict'):
        self.predict_label = tf.argmax(self.predict, 1)
        self.correct_pred = tf.equal(self.predict_label, tf.argmax(self.labels, 1))
        # This is the number of correct predictions in the batch (a sum),
        # not a ratio.
        self.accuracy = tf.reduce_sum(tf.cast(self.correct_pred, tf.int32))

    summary_loss = tf.summary.scalar('loss', self.cost)
    summary_acc = tf.summary.scalar('acc', self.accuracy)
    self.train_summary_op = tf.summary.merge([summary_loss, summary_acc])
    self.test_summary_op = tf.summary.merge([summary_loss, summary_acc])

    # Log directory name encodes timestamp, learning rate, batch size and
    # l2 coefficient.
    _dir = 'logs/' + str(self.timestamp) + '_r' + str(self.learning_rate) + '_b' + str(self.batch_size) + '_l' + str(self.l2_reg)
    self.train_summary_writer = tf.summary.FileWriter(_dir + '/train', self.sess.graph)
    self.test_summary_writer = tf.summary.FileWriter(_dir + '/test', self.sess.graph)
def __init__(self, embedding_matrix, num_classes, max_sents, max_words,
             rnn_type="gru", rnn_units=50, attention_size=200,
             dropout_keep=1.0):
    '''
    Build a hierarchical attention classifier graph for one document.

    Words of each sentence go through a bidirectional RNN plus attention to
    form sentence vectors; the sentence vectors go through a second
    bidirectional RNN plus attention to form the document vector, which a
    dense layer maps to class logits. A session is created and all
    variables are initialized at the end.

    parameters:
      - embedding_matrix: numpy array
        word embeddings, one row per word index; index 0 is treated as
        padding when sentence and document lengths are computed
      - num_classes: int
        number of output classes
      - max_sents: int
        maximum number of sentences per document
      - max_words: int
        maximum number of words per sentence
      - rnn_type: string (default: "gru")
        rnn cells to use, can be "gru" or "lstm"
      - rnn_units: int (default: 50)
        number of units in each rnn cell
      - attention_size: int (default: 200)
        number of dimensions of the attention hidden layer
      - dropout_keep: float (default: 1.0)
        stored on the instance as dropout_keep; the graph itself reads the
        keep probability from the self.dropout placeholder
    '''
    self.rnn_units = rnn_units
    if rnn_type not in ("gru", "lstm"):
        raise Exception("rnn_type parameter must be set to gru or lstm")
    self.rnn_cell = GRUCell if rnn_type == "gru" else LSTMCell

    self.dropout_keep = dropout_keep
    self.dropout = tf.placeholder(tf.float32)
    self.ms = max_sents
    self.mw = max_words

    def dropout_cell():
        # A fresh rnn cell with dropout applied to its state.
        return tf.contrib.rnn.DropoutWrapper(
            self.rnn_cell(self.rnn_units), state_keep_prob=self.dropout)

    # Document input; trailing all-zero rows and columns are trimmed away.
    self.doc_input = tf.placeholder(tf.int32, shape=[max_sents, max_words])
    words_per_line = tf.reduce_sum(tf.sign(self.doc_input), 1)
    num_lines = tf.reduce_sum(tf.sign(words_per_line))
    longest_line = tf.reduce_max(words_per_line)
    trimmed_doc = self.doc_input[:num_lines, :longest_line]
    num_words = words_per_line[:num_lines]

    # Word-level bidirectional rnn.
    embedding_table = tf.get_variable(
        'embeddings',
        initializer=embedding_matrix.astype(np.float32),
        dtype=tf.float32)
    word_embeds = tf.gather(embedding_table, trimmed_doc)
    with tf.variable_scope('words'):
        (word_fw, word_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            dropout_cell(),
            dropout_cell(),
            word_embeds,
            sequence_length=num_words,
            dtype=tf.float32)
    word_outputs = tf.concat((word_fw, word_bw), axis=2)

    # Word attention; padded positions get a tiny constant weight.
    word_mask = tf.reshape(tf.sequence_mask(num_words, longest_line), [-1])
    word_hidden = tf.layers.dense(
        tf.reshape(word_outputs, [-1, self.rnn_units * 2]),
        units=attention_size,
        activation=tf.nn.tanh,
        kernel_initializer=tf.contrib.layers.xavier_initializer())
    word_weights = tf.layers.dense(
        word_hidden,
        units=1,
        activation=tf.exp,
        use_bias=False,
        kernel_initializer=tf.contrib.layers.xavier_initializer())
    word_weights = tf.where(
        word_mask, word_weights, tf.ones_like(word_weights) * 0.000000001)
    word_alpha = tf.reshape(word_weights, [-1, longest_line, 1])
    word_alpha = word_alpha / tf.reshape(
        tf.reduce_sum(word_alpha, 1), [-1, 1, 1])
    sent_embeds = tf.reduce_sum(word_outputs * word_alpha, 1)
    sent_embeds = tf.expand_dims(sent_embeds, 0)

    # Sentence-level bidirectional rnn over the single document.
    with tf.variable_scope('sentence'):
        (sent_fw, sent_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            dropout_cell(),
            dropout_cell(),
            sent_embeds,
            sequence_length=tf.expand_dims(num_lines, 0),
            dtype=tf.float32)
    sent_outputs = tf.concat(
        (tf.squeeze(sent_fw, [0]), tf.squeeze(sent_bw, [0])), axis=1)

    # Sentence attention.
    sent_hidden = tf.layers.dense(
        sent_outputs,
        units=attention_size,
        activation=tf.nn.tanh,
        kernel_initializer=tf.contrib.layers.xavier_initializer())
    sent_weights = tf.layers.dense(
        sent_hidden,
        units=1,
        activation=tf.exp,
        use_bias=False,
        kernel_initializer=tf.contrib.layers.xavier_initializer())
    sent_alpha = sent_weights / tf.reduce_sum(sent_weights)
    doc_embed = tf.transpose(
        tf.matmul(tf.transpose(sent_outputs), sent_alpha))

    # Classification layer.
    logits = tf.layers.dense(
        doc_embed, units=num_classes,
        kernel_initializer=tf.orthogonal_initializer())
    self.prediction = tf.nn.softmax(logits)

    # Loss and training op.
    self.labels = tf.placeholder(tf.float32, shape=[num_classes])
    batched_labels = tf.expand_dims(self.labels, 0)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=batched_labels)
    self.loss = tf.reduce_mean(cross_entropy)
    adam = tf.train.AdamOptimizer(
        learning_rate=0.00002, beta1=0.9, beta2=0.99)
    self.optimizer = adam.minimize(self.loss)

    # Saver, session and variable initialization.
    self.saver = tf.train.Saver()
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
def testInvalidShape(self):
    # The orthogonal initializer must reject a rank-1 shape with ValueError.
    initializer = tf.orthogonal_initializer()
    with self.test_session(graph=tf.Graph(), use_gpu=True):
        with self.assertRaises(ValueError):
            initializer(shape=[5])
def __init__(self, data, training=False):
    """Build the TF graph that scores five candidate answers for a question.

    The graph embeds question / answers / subtitles with a frozen embedding
    matrix, projects and encodes them, computes a temporal attention over
    the subtitles, and finally scores the answers with a matrix product
    (``self.output``).

    Args:
        data: object exposing the token id tensors ``ques``, ``ans``,
            ``subt`` and the length tensors ``ql``, ``al``, ``sl``.
        training: only referenced by the commented-out dropout calls below;
            it has no effect in the current code.
    """
    self.data = data
    # Stored on the instance but not used anywhere else in this method.
    self.initializer = tf.orthogonal_initializer()
    # 25 / 29 / 34 are the hard-coded maximum lengths for question,
    # subtitle and answer sequences (the same constants are passed to
    # tf.sequence_mask further down).
    q_mask = make_mask(self.data.ql, 25)  # (1, L_q, E)
    s_mask = make_mask(self.data.sl, 29)  # (N, L_s, E)
    a_mask = make_mask(self.data.al, 34)  # (5, L_a, E)

    # Only subt_shape is used below; ques_shape and ans_shape are unused.
    ques_shape = tf.shape(q_mask)
    subt_shape = tf.shape(s_mask)
    ans_shape = tf.shape(a_mask)

    with tf.variable_scope('Embedding'):
        # Embedding matrix is loaded from disk and kept frozen.
        self.embedding = tf.get_variable('embedding_matrix',
                                         initializer=np.load(
                                             _mp.embedding_file),
                                         trainable=False)

        self.ques = tf.nn.embedding_lookup(self.embedding, self.data.ques)  # (1, L_q, E)

        self.ans = tf.nn.embedding_lookup(self.embedding, self.data.ans)  # (5, L_a, E)

        self.subt = tf.nn.embedding_lookup(self.embedding, self.data.subt)  # (N, L_s, E)

    # self.ques = dropout(self.ques, training=training)  # (1, L_q, E)
    # self.ans = dropout(self.ans, training=training)  # (5, L_a, E)
    # self.subt = dropout(self.subt, training=training)  # (N, L_s, E)

    with tf.variable_scope('Embedding_Linear'):
        # The first call passes reuse=False explicitly; the later calls
        # rely on mask_dense's default.
        # presumably the three projections share weights -- verify against
        # mask_dense.
        # (1, L_q, E_t)
        self.ques_embedding = unit_norm(
            mask_dense(self.ques, q_mask, reuse=False))
        # (5, L_a, E_t)
        self.ans_embedding = unit_norm(mask_dense(self.ans, a_mask))
        # (N, L_s, E_t)
        self.subt_embedding = unit_norm(mask_dense(self.subt, s_mask))

    with tf.variable_scope('Language_Encode'):
        mask = tf.expand_dims(tf.sequence_mask(self.data.ql, 25), axis=-1)
        # (1, E_t)
        self.ques_enc = unit_norm(conv_encode(self.ques_embedding, mask, 'ques'), dim=1)
        mask = tf.expand_dims(tf.sequence_mask(self.data.al, 34), axis=-1)
        # (5, E_t)
        self.ans_enc = unit_norm(conv_encode(self.ans_embedding, mask, 'ans'), dim=1)
        mask = tf.expand_dims(tf.sequence_mask(self.data.sl, 29), axis=-1)
        # (N, E_t)
        self.subt_enc = unit_norm(conv_encode(self.subt_embedding, mask, 'subt'), dim=1)

    with tf.variable_scope('Temporal_Attention'):
        # Pair every subtitle encoding with the (tiled) question encoding.
        # (N, 2 * E_t)
        self.temp_attn = tf.concat(
            [self.subt_enc, tf.tile(self.ques_enc, [subt_shape[0], 1])], axis=-1)
        # (1, N, E_t)
        self.temp_attn = unit_norm(tf.expand_dims(self.temp_attn, axis=0))
        # (1, N, 1)
        self.temp_attn = tf.layers.conv1d(self.temp_attn, 1, 5, padding='same', activation=tf.nn.relu)
        # Softmax over the subtitle axis, then drop the leading axis.
        # (N, 1)
        self.temp_attn = tf.squeeze(tf.nn.softmax(self.temp_attn, axis=1), axis=0)

        # nth_element at index N/2 with reverse=True: a median-like
        # threshold over the attention weights.
        nth = nn.nth_element(tf.transpose(self.temp_attn), tf.cast(subt_shape[0] / 2, tf.int32), True)
        # Keep only the weights that are >= that threshold.
        # (N, 1)
        attn_mask = tf.greater_equal(self.temp_attn, nth)
        # NOTE(review): the subtitle encodings stored in self.subt_enc under
        # 'Language_Encode' are overwritten here by the masked attention
        # weights, so self.summarize below is built from attention weights
        # only. Possibly the first assignment was meant to target
        # self.temp_attn -- confirm before relying on this.
        self.subt_enc = self.temp_attn * tf.cast(attn_mask, tf.float32)
        self.subt_enc = self.subt_enc * self.temp_attn

    self.summarize = unit_norm(tf.reduce_sum(self.subt_enc, axis=0, keepdims=True), dim=1)  # (1, 4 * E_t)

    # gamma = tf.get_variable('gamma', [1, 1], initializer=tf.zeros_initializer)
    #
    # self.ans_vec = self.summarize * tf.nn.sigmoid(gamma) + \
    #                tf.squeeze(self.ques_enc, axis=0) * (1 - tf.nn.sigmoid(gamma))

    self.ans_vec = unit_norm(self.summarize + self.ques_enc, dim=1)  # (1, 4 * E_t)

    # Similarity between the answer vector and every encoded answer.
    self.output = tf.matmul(self.ans_vec, self.ans_enc, transpose_b=True)  # (1, 5)
def _build_net(self):
    """Build the actor (policy) graph and all ops derived from it.

    Creates, in this order: the input placeholders, a two-layer dense
    policy network, the policy-gradient loss with its Adam train op, the
    safety (constraint) loss, the KL divergence against an old policy, the
    clipped PPO loss with its own Adam train op, and finally the flat
    gradient / fisher-product / parameter-assignment ops.

    Reads ``self.suffix``, ``self.n_features``, ``self.n_actions``,
    ``self.lr`` and ``self.clip_eps``; all results are stored as attributes
    on ``self``.
    """
    with tf.variable_scope("Actor" + self.suffix):
        with tf.name_scope('inputs' + self.suffix):
            self.tf_obs = tf.placeholder(tf.float32,
                                         [None, self.n_features],
                                         name='observation' + self.suffix)
            self.tf_acts = tf.placeholder(tf.int32, [
                None,
            ], name='actions_num' + self.suffix)
            self.tf_vt = tf.placeholder(tf.float32, [
                None,
            ], name='actions_value' + self.suffix)
            self.tf_safe = tf.placeholder(tf.float32, [
                None,
            ], name='safety_value' + self.suffix)
            self.entropy_weight = tf.placeholder(
                tf.float32,
                shape=(),
                name='entropy_weight_clustering' + self.suffix)

            ##### PPO change #####
            # NOTE(review): this placeholder is replaced further down, where
            # self.ppo_ratio is reassigned to the computed probability ratio.
            self.ppo_ratio = tf.placeholder(tf.float32, [
                None,
            ], name='ppo_ratio' + self.suffix)
            ##### PPO change #####

        layer = tf.layers.dense(
            inputs=self.tf_obs,
            units=128,
            activation=tf.nn.tanh,
            # kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            kernel_initializer=tf.orthogonal_initializer(
                gain=np.sqrt(2.)),  # ppo default initialization
            bias_initializer=tf.constant_initializer(0.1),
            name='fc1' + self.suffix)

        # Unnormalised action scores (logits); softmax is applied below.
        all_act = tf.layers.dense(
            inputs=layer,
            units=self.n_actions,
            activation=None,
            # kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            kernel_initializer=tf.orthogonal_initializer(
                gain=np.sqrt(2.)),  # ppo default initialization
            bias_initializer=tf.constant_initializer(0.1),
            name='fc2' + self.suffix)

        # Collected right after the two dense layers, i.e. before any
        # optimizer is created.
        self.trainable_variables = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor' + self.suffix)
        self.trainable_variables_shapes = [
            var.get_shape().as_list() for var in self.trainable_variables
        ]

        # sampling
        self.all_act_prob = tf.nn.softmax(all_act,
                                          name='act_prob' + self.suffix)
        # Clip so that later tf.log calls never see an exact zero.
        self.all_act_prob = tf.clip_by_value(self.all_act_prob, 1e-20, 1.0)

        with tf.name_scope('loss' + self.suffix):
            # -log pi(a|s) for the action actually taken (one-hot select).
            neg_log_prob = tf.reduce_sum(
                -tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
                tf.one_hot(indices=self.tf_acts, depth=self.n_actions),
                axis=1)
            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)
            # sum(p * log p) is the negative entropy, so this term rewards
            # higher-entropy policies when entropy_weight is positive.
            loss += self.entropy_weight * tf.reduce_mean(
                tf.reduce_sum(
                    tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0))
                    * self.all_act_prob,
                    axis=1))
            # Same entropy term, kept separately so it can be reported.
            self.entro = self.entropy_weight * tf.reduce_mean(
                tf.reduce_sum(
                    tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0))
                    * self.all_act_prob,
                    axis=1))
            self.loss = loss
        with tf.name_scope('train' + self.suffix):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

        # safety loss
        """ * -1? """
        self.chosen_action_log_probs = tf.reduce_sum(
            tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
            tf.one_hot(indices=self.tf_acts, depth=self.n_actions),
            axis=1)
        ##### PPO CHANGE #####
        self.ppo_old_chosen_action_log_probs = tf.placeholder(
            tf.float32, [None])
        ##### PPO CHANGE #####
        # NOTE(review): the placeholder is wrapped in stop_gradient before
        # being stored, so the attribute is the stop_gradient output rather
        # than the placeholder itself -- callers feed this tensor.
        self.old_chosen_action_log_probs = tf.stop_gradient(
            tf.placeholder(tf.float32, [None]))
        # self.each_safety_loss = tf.exp(self.chosen_action_log_probs - self.old_chosen_action_log_probs) * self.tf_safe
        # Difference of new and old action probabilities, weighted by the
        # per-sample safety value.
        self.each_safety_loss = (
            tf.exp(self.chosen_action_log_probs) -
            tf.exp(self.old_chosen_action_log_probs)) * self.tf_safe
        self.average_safety_loss = tf.reduce_mean(
            self.each_safety_loss)  #/ self.n_episodes tf.reduce_sum
        # self.average_safety_loss +=self.entro

        # KL D
        self.old_all_act_prob = tf.stop_gradient(
            tf.placeholder(tf.float32, [None, self.n_actions]))

        def kl(x, y):
            # KL(X || Y) between two categorical distributions given as
            # probability tensors; near-zero entries are replaced by EPS
            # (and EPS is added again) to keep the divergence finite.
            EPS = 1e-10
            x = tf.where(tf.abs(x) < EPS, EPS * tf.ones_like(x), x)
            y = tf.where(tf.abs(y) < EPS, EPS * tf.ones_like(y), y)
            X = tf.distributions.Categorical(probs=x + EPS)
            Y = tf.distributions.Categorical(probs=y + EPS)
            return tf.distributions.kl_divergence(X,
                                                  Y,
                                                  allow_nan_stats=False)

        self.each_kl_divergence = kl(
            self.all_act_prob, self.old_all_act_prob
        )  # tf.reduce_sum(kl(self.all_act_prob, self.old_all_act_prob), axis=1)
        self.average_kl_divergence = tf.reduce_mean(
            self.each_kl_divergence)
        # self.kl_gradients = tf.gradients(self.average_kl_divergence, self.trainable_variables)  # useless

        # NOTE(review): desired_kl is not defined inside this method;
        # presumably a module-level name -- verify.
        self.desired_kl = desired_kl
        # self.metrics = [self.loss, self.average_kl_divergence, self.average_safety_loss, self.entro] # Luping
        # NOTE(review): self.loss appears twice; the second slot held the
        # average KL divergence in the commented-out version above.
        self.metrics = [
            self.loss, self.loss, self.average_safety_loss, self.entro
        ]  # Luping

        # FLat
        self.flat_params_op = get_flat_params(self.trainable_variables)
        """not use tensorflow default function, here we calculate the gradient by self: (1) loss: g (2) kl: directional_gradients (math, fisher) (3) safe: b """
        ##### PPO change #####
        #### PPO Suyi's Change ####
        with tf.name_scope('ppoloss' + self.suffix):
            # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log
            # space. This replaces the 'ppo_ratio' placeholder created in
            # the inputs scope.
            self.ppo_ratio = tf.exp(self.chosen_action_log_probs -
                                    self.ppo_old_chosen_action_log_probs)
            # self.ppo_ratio = tf.Print(self.ppo_ratio, [self.ppo_ratio], "self.ppo_ratio: ")
            surr = self.ppo_ratio * self.tf_vt
            # Clipped surrogate objective: minimum of the unclipped and the
            # clipped ratio times the advantage-like value tf_vt.
            self.ppoloss = -tf.reduce_mean(
                tf.minimum(
                    surr,
                    tf.clip_by_value(self.ppo_ratio, 1. - self.clip_eps,
                                     1. + self.clip_eps) * self.tf_vt))

            self.ppoloss += self.entropy_weight * tf.reduce_mean(
                tf.reduce_sum(
                    tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0))
                    * self.all_act_prob,
                    axis=1))
            # self.ppoloss += 0.01 * tf.reduce_mean(tf.reduce_sum(tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) * self.all_act_prob, axis=1))

        with tf.variable_scope('ppotrain'):
            # self.atrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.ppoloss)
            self.atrain_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.ppoloss)
        #### PPO Suyi's Change ####

        self.ppoloss_flat_gradients_op = get_flat_gradients(
            self.ppoloss, self.trainable_variables)
        ##### PPO change #####

        # Flattened gradients of each objective w.r.t. the policy weights.
        self.loss_flat_gradients_op = get_flat_gradients(
            self.loss, self.trainable_variables)
        self.kl_flat_gradients_op = get_flat_gradients(
            self.average_kl_divergence, self.trainable_variables)
        self.constraint_flat_gradients_op = get_flat_gradients(
            self.average_safety_loss, self.trainable_variables)

        # Flat vector fed when evaluating the fisher product op.
        self.vec = tf.placeholder(tf.float32, [None])
        self.fisher_product_op = self.get_fisher_product_op()

        # Flat parameter vector used to overwrite the network weights.
        self.new_params = tf.placeholder(tf.float32, [None])
        self.params_assign_op = assign_network_params_op(
            self.new_params, self.trainable_variables,
            self.trainable_variables_shapes)
def GRU(self, rnn_size=None, reuse=None):
    """Create a GRU cell whose kernels use orthogonal initialisation.

    Args:
        rnn_size: number of units; ``args.hidden_size`` is used when this
            is ``None``.
        reuse: forwarded unchanged to ``GRUCell``.

    Returns:
        A ``tf.nn.rnn_cell.GRUCell`` instance.
    """
    if rnn_size is None:
        rnn_size = args.hidden_size
    cell = tf.nn.rnn_cell.GRUCell(
        rnn_size,
        kernel_initializer=tf.orthogonal_initializer(),
        reuse=reuse)
    return cell
def __init__(self,embedding_matrix,num_classes,max_sents,max_words,attention_heads=8,
             attention_size=512,dropout_keep=0.9,activation=tf.nn.elu):
    '''
    hierarchical convolutional attention network for text classification

    Builds the complete graph (word level -> sentence level -> classifier),
    the Adam training op, and opens a session with initialized variables.

    parameters:
      - embedding_matrix: numpy array
        numpy array of word embeddings
        each row should represent a word embedding
        NOTE: the word index 0 is dropped, so the first row is ignored
      - num_classes: int
        number of output classes
      - max_sents: int
        maximum number of sentences per document
      - max_words: int
        maximum number of words per sentence
      - attention_heads: int (default: 8)
        number of attention heads to use in multihead attention
      - attention_size: int (default: 512)
        dimension size of output embeddings from attention
      - dropout_keep: float (default: 0.9)
        dropout keep rate for embeddings and attention softmax
      - activation: tensorflow activation function (default: tf.nn.elu)
        activation function to use for convolutional feature extraction

    methods:
      - train(data,labels,validation_data,epochs=30,savebest=False,filepath=None)
        train network on given data
      - predict(data)
        return the one-hot-encoded predicted labels for given data
      - score(data,labels)
        return the accuracy of predicted labels on given data
      - save(filepath)
        save the model weights to a file
      - load(filepath)
        load model weights from a file
    '''

    self.attention_heads = attention_heads
    self.attention_size = attention_size
    self.embedding_size = embedding_matrix.shape[1]
    self.embeddings = embedding_matrix.astype(np.float32)
    self.ms = max_sents
    self.mw = max_words
    # dropout_keep is only stored here; the graph reads the keep
    # probability from the self.dropout placeholder below.
    self.dropout_keep = dropout_keep
    self.dropout = tf.placeholder(tf.float32)

    #doc input and mask
    # A document is a fixed [max_sents, max_words] grid of word ids; id 0
    # contributes nothing to the tf.sign-based counts below, so it acts as
    # padding.
    self.doc_input = tf.placeholder(tf.int32, shape=[max_sents,max_words])
    self.words_per_line = tf.reduce_sum(tf.sign(self.doc_input),1)
    # Number of non-empty lines and the longest line; note self.max_words
    # is a tensor, unlike the integer self.mw.
    self.max_lines = tf.reduce_sum(tf.sign(self.words_per_line))
    self.max_words = tf.reduce_max(self.words_per_line)
    # Crop the grid to the part that actually holds words.
    self.doc_input_reduced = self.doc_input[:self.max_lines,:self.max_words]
    self.num_words = self.words_per_line[:self.max_lines]

    #word embeddings
    self.word_embeds = tf.gather(tf.get_variable('embeddings',initializer=self.embeddings,
                       dtype=tf.float32),self.doc_input_reduced)
    # Learned positional embedding for each word position, added to the
    # word embedding before dropout.
    positions = tf.expand_dims(tf.range(self.max_words),0)
    word_pos = tf.gather(tf.get_variable('word_pos',shape=(self.mw,self.embedding_size),
               dtype=tf.float32,initializer=tf.random_normal_initializer(0,0.1)),positions)
    self.word_embeds = tf.nn.dropout(self.word_embeds + word_pos,self.dropout)

    #for feature/parameter comparison
    # Debug output printed once at graph-construction time.
    print(self)
    print(f"attention heads: {attention_heads}")
    print(f"attention size: {attention_size}")
    print(f"self embedding size: {self.embedding_size}")
    print(f"self embeddings: {self.embeddings}")
    print(f"max sents (ms): {self.ms}")
    print(f"max words (mw): {self.mw}")
    print(f"dropout: {dropout_keep}")
    print(f"self doc_input: {self.doc_input}")
    print(f"self words_per_line: {self.words_per_line}")
    print(f"self max_lines {self.max_lines}")
    print(f"self max_words {self.max_words}")
    print(f"self doc_input_reduced: {self.doc_input_reduced}")
    print(f"self num_words: {self.num_words}")

    #masks to eliminate padding
    # mask  : tiled to attention_size on the last axis (for Q/K/V and outputs)
    # mask2 : tiled per head and per key position (for attention weights)
    mask_base = tf.cast(tf.sequence_mask(self.num_words,self.max_words),tf.float32)
    mask = tf.tile(tf.expand_dims(mask_base,2),[1,1,self.attention_size])
    mask2 = tf.tile(tf.expand_dims(mask_base,2),[self.attention_heads,1,self.max_words])

    print(f"mask_base: {mask_base}")
    print(f"mask: {mask}")
    print(f"mask2: {mask2}")

    #word self attention 1
    # Q/K/V come from width-3 1-D convolutions over the word embeddings.
    Q1 = tf.layers.conv1d(self.word_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())
    K1 = tf.layers.conv1d(self.word_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())
    V1 = tf.layers.conv1d(self.word_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())

    # Zero out padded positions.
    Q1 = tf.where(tf.equal(mask,0),tf.zeros_like(Q1),Q1)
    K1 = tf.where(tf.equal(mask,0),tf.zeros_like(K1),K1)
    V1 = tf.where(tf.equal(mask,0),tf.zeros_like(V1),V1)

    # Multi-head: split the channel axis into heads and stack the heads
    # along the first axis.
    Q1_ = tf.concat(tf.split(Q1,self.attention_heads,axis=2),axis=0)
    K1_ = tf.concat(tf.split(K1,self.attention_heads,axis=2),axis=0)
    V1_ = tf.concat(tf.split(V1,self.attention_heads,axis=2),axis=0)

    # Scaled dot-product attention (scaled by sqrt of the per-head size).
    outputs1 = tf.matmul(Q1_,tf.transpose(K1_,[0, 2, 1]))
    outputs1 = outputs1/(K1_.get_shape().as_list()[-1]**0.5)
    # Scores that are exactly 0 are pushed to -1000 before the softmax;
    # presumably these are the padded positions zeroed above.
    outputs1 = tf.where(tf.equal(outputs1,0),tf.ones_like(outputs1)*-1000,outputs1)
    outputs1 = tf.nn.dropout(tf.nn.softmax(outputs1),self.dropout)
    outputs1 = tf.where(tf.equal(mask2,0),tf.zeros_like(outputs1),outputs1)
    outputs1 = tf.matmul(outputs1,V1_)
    # Undo the head stacking: back to one tensor with all channels.
    outputs1 = tf.concat(tf.split(outputs1,self.attention_heads,axis=0),axis=2)
    outputs1 = tf.where(tf.equal(mask,0),tf.zeros_like(outputs1),outputs1)

    #word self attention 2
    # Same structure as attention 1, except V2 uses tanh instead of
    # `activation`.
    Q2 = tf.layers.conv1d(self.word_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())
    K2 = tf.layers.conv1d(self.word_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())
    V2 = tf.layers.conv1d(self.word_embeds,self.attention_size,3,padding='same',
        activation=tf.nn.tanh,kernel_initializer=tf.orthogonal_initializer())

    Q2 = tf.where(tf.equal(mask,0),tf.zeros_like(Q2),Q2)
    K2 = tf.where(tf.equal(mask,0),tf.zeros_like(K2),K2)
    V2 = tf.where(tf.equal(mask,0),tf.zeros_like(V2),V2)

    Q2_ = tf.concat(tf.split(Q2,self.attention_heads,axis=2),axis=0)
    K2_ = tf.concat(tf.split(K2,self.attention_heads,axis=2),axis=0)
    V2_ = tf.concat(tf.split(V2,self.attention_heads,axis=2),axis=0)

    outputs2 = tf.matmul(Q2_,tf.transpose(K2_,[0, 2, 1]))
    outputs2 = outputs2/(K2_.get_shape().as_list()[-1]**0.5)
    outputs2 = tf.where(tf.equal(outputs2,0),tf.ones_like(outputs2)*-1000,outputs2)
    outputs2 = tf.nn.dropout(tf.nn.softmax(outputs2),self.dropout)
    outputs2 = tf.where(tf.equal(mask2,0),tf.zeros_like(outputs2),outputs2)
    outputs2 = tf.matmul(outputs2,V2_)
    outputs2 = tf.concat(tf.split(outputs2,self.attention_heads,axis=0),axis=2)
    outputs2 = tf.where(tf.equal(mask,0),tf.zeros_like(outputs2),outputs2)

    # Combine both attention branches element-wise, then normalise.
    outputs = tf.multiply(outputs1,outputs2)
    outputs = layer_norm(outputs)

    #word target attention
    # A single learned query (tiled once per line) attends over the words
    # of each line.
    Q = tf.get_variable('word_Q',(1,1,self.attention_size),
        tf.float32,tf.orthogonal_initializer())
    K = tf.layers.conv1d(outputs,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())

    Q = tf.tile(Q,[self.max_lines,1,1])
    K = tf.where(tf.equal(mask,0),tf.zeros_like(K),K)

    Q_ = tf.concat(tf.split(Q,self.attention_heads,axis=2),axis=0)
    K_ = tf.concat(tf.split(K,self.attention_heads,axis=2),axis=0)
    V_ = tf.concat(tf.split(outputs,self.attention_heads,axis=2),axis=0)

    outputs = tf.matmul(Q_,tf.transpose(K_,[0, 2, 1]))
    outputs = outputs/(K_.get_shape().as_list()[-1]**0.5)
    outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs)
    outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout)
    outputs = tf.matmul(outputs,V_)
    outputs = tf.concat(tf.split(outputs,self.attention_heads,axis=0),axis=2)
    # Swap the first two axes so the lines become the sequence axis of a
    # single-document batch.
    self.sent_embeds = tf.transpose(outputs,[1, 0, 2])

    #sentence positional embeddings
    positions = tf.expand_dims(tf.range(self.max_lines),0)
    sent_pos = tf.gather(tf.get_variable('sent_pos',shape=(self.ms,self.attention_size),
               dtype=tf.float32,initializer=tf.random_normal_initializer(0,0.1)),positions)
    self.sent_embeds = tf.nn.dropout(self.sent_embeds + sent_pos,self.dropout)

    #sentence self attention 1
    # Same scheme as on the word level, but without padding masks.
    Q1 = tf.layers.conv1d(self.sent_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())
    K1 = tf.layers.conv1d(self.sent_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())
    V1 = tf.layers.conv1d(self.sent_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())

    Q1_ = tf.concat(tf.split(Q1,self.attention_heads,axis=2),axis=0)
    K1_ = tf.concat(tf.split(K1,self.attention_heads,axis=2),axis=0)
    V1_ = tf.concat(tf.split(V1,self.attention_heads,axis=2),axis=0)

    outputs1 = tf.matmul(Q1_,tf.transpose(K1_,[0, 2, 1]))
    outputs1 = outputs1/(K1_.get_shape().as_list()[-1]**0.5)
    outputs1 = tf.nn.dropout(tf.nn.softmax(outputs1),self.dropout)
    outputs1 = tf.matmul(outputs1,V1_)
    outputs1 = tf.concat(tf.split(outputs1,self.attention_heads,axis=0),axis=2)

    #sentence self attention 2
    Q2 = tf.layers.conv1d(self.sent_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())
    K2 = tf.layers.conv1d(self.sent_embeds,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())
    V2 = tf.layers.conv1d(self.sent_embeds,self.attention_size,3,padding='same',
        activation=tf.nn.tanh,kernel_initializer=tf.orthogonal_initializer())

    Q2_ = tf.concat(tf.split(Q2,self.attention_heads,axis=2),axis=0)
    K2_ = tf.concat(tf.split(K2,self.attention_heads,axis=2),axis=0)
    V2_ = tf.concat(tf.split(V2,self.attention_heads,axis=2),axis=0)

    outputs2 = tf.matmul(Q2_,tf.transpose(K2_,[0, 2, 1]))
    outputs2 = outputs2/(K2_.get_shape().as_list()[-1]**0.5)
    outputs2 = tf.nn.dropout(tf.nn.softmax(outputs2),self.dropout)
    outputs2 = tf.matmul(outputs2,V2_)
    outputs2 = tf.concat(tf.split(outputs2,self.attention_heads,axis=0),axis=2)

    outputs = tf.multiply(outputs1,outputs2)
    outputs = layer_norm(outputs)

    #sentence target attention
    # A single learned query attends over all sentences to form the
    # document embedding.
    Q = tf.get_variable('sent_Q',(1,1,self.attention_size),
        tf.float32,tf.orthogonal_initializer())
    K = tf.layers.conv1d(outputs,self.attention_size,3,padding='same',
        activation=activation,kernel_initializer=tf.orthogonal_initializer())

    Q_ = tf.concat(tf.split(Q,self.attention_heads,axis=2),axis=0)
    K_ = tf.concat(tf.split(K,self.attention_heads,axis=2),axis=0)
    V_ = tf.concat(tf.split(outputs,self.attention_heads,axis=2),axis=0)

    outputs = tf.matmul(Q_,tf.transpose(K_,[0, 2, 1]))
    outputs = outputs/(K_.get_shape().as_list()[-1]**0.5)
    outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout)
    outputs = tf.matmul(outputs,V_)
    outputs = tf.concat(tf.split(outputs,self.attention_heads,axis=0),axis=2)
    self.doc_embed = tf.nn.dropout(tf.squeeze(outputs,[0]),self.dropout)

    #classification functions
    self.output = tf.layers.dense(self.doc_embed,num_classes,
                  kernel_initializer=tf.orthogonal_initializer())
    self.prediction = tf.nn.softmax(self.output)

    #loss, accuracy, and training functions
    # One label vector per document; expanded to a batch of one to match
    # the logits.
    self.labels = tf.placeholder(tf.float32, shape=[num_classes])
    self.labels_rs = tf.expand_dims(self.labels,0)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits
                (logits=self.output,labels=self.labels_rs))
    self.optimizer = tf.train.AdamOptimizer(2e-5,0.9,0.99).minimize(self.loss)

    #init op
    # The session is opened and variables are initialized immediately.
    self.init_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver()
    self.sess = tf.Session()
    self.sess.run(self.init_op)
def train_net(batch_size=100,
              t_steps=200,
              l_dim=(100, 50, 5, 50, 100),
              activation='tanh',
              gamma=0.001,
              alpha_t=0.1,
              noise_str=0.1,
              err_alg=0,
              learning_rate=0.003,
              learning_rate_inv=0.003,
              learning_rate_rinv=0.1,
              num_steps_rinv=2,
              top_loss='sigmoid_ce',
              mode='autoencoder',
              dataset='mnist',
              SGD=True,
              preprocess=False,
              tb_path='/tmp/targprop/'):
    """Train a feed-forward net with layer-local targets and multipliers.

    Every layer ``l`` owns a weight matrix, a bias, a target variable
    ``tx[l]`` and a Lagrange-multiplier variable ``p[l]``. Each training step
    alternates between updating the targets and the weights, layer by layer;
    the multipliers are updated every fifth step.

    Args:
      batch_size (int, > 0): the number of examples in each training batch.
      t_steps (int, > 0): the number of training steps.
      l_dim (sequence of ints): the hidden layer dimensions. Input and output
        dimensions are taken from the data and added automatically.
      activation ('tanh', 'linear', 'sigmoid', 'relu'): activation function
        of the hidden layers. Any other value raises ``ValueError``.
      gamma (float, > 0): regularization parameter for regularized target
        prop. Also used as the step size of the multiplier update.
      alpha_t (float, (0, 1)): the 'learning rate' in target propagation,
        i.e. the top layer target is x - alpha_t * dL/dx.
      noise_str (float): not used by this implementation.
      err_alg (int, in [0, 1, 2, 3]): which error propagation algorithm to use
        0: backprop
        1: constrained least-squares target prop (essentially op-by-op
           difference target prop)
        2: regularized least-squares target prop (op-by-op)
        3: difference target prop using L_inv (close to a carbon copy of
           Lee et al)
      learning_rate (float, > 0): the learning rate of the RMSProp optimizer.
      learning_rate_inv (float, > 0): the learning rate for L_inv if
        err_alg == 3.
      learning_rate_rinv (float): not used by this implementation.
      num_steps_rinv (int): not used by this implementation.
      top_loss ('sigmoid_ce', 'softmax_ce', 'sigmoid_l2', 'l2'): the
        top-layer, defined by pre-loss nonlinearity and loss function.
      mode ('autoencoder', 'classification'):
        'autoencoder': outputs are set to inputs
        'classification': outputs are set to labels
      dataset ('mnist', 'cifar', or a data object): which dataset to use. Any
        value other than the two names is used directly as both the training
        and the test data.
      SGD (bool): stochastic gradient descent. Should be True. False can be
        useful for debugging and seeing if algorithms converge on a single
        batch.
      preprocess (bool): preprocess the data with PCA + whitening.
      tb_path (str): directory the TensorBoard summaries are written to.

    NOTE: ``alpha_t``, ``noise_str``, ``err_alg``, ``learning_rate_inv``,
    ``learning_rate_rinv``, ``num_steps_rinv`` and ``top_loss`` are accepted
    for interface compatibility but are not read by the code below; the top
    loss is always a mean squared error.

    Returns:
      output_dict: currently always an empty dict. The test-set evaluation
      that used to fill 'L_test', 'accuracy_test' and 'reconstruction' is
      disabled.

    Raises:
      ValueError: if ``activation`` is not one of the supported names.
    """
    # data
    if dataset == 'cifar':
        data = ds.cifar10_data()
        data_test = ds.cifar10_data_test()
    elif dataset == 'mnist':
        data = ds.mnist_data()
        data_test = ds.mnist_data_test()
    else:
        # set train and test the same. change later.
        data = dataset
        data_test = dataset

    if preprocess:
        from sklearn.decomposition import PCA
        pca = PCA(n_components=1000, whiten=True)
        data.inputs = pca.fit_transform(data.inputs)
        data_test.inputs = pca.transform(data_test.inputs)

    # autoencoderify
    if mode == 'autoencoder':
        data.outputs = data.inputs
        data_test.outputs = data_test.inputs

    # model parameters / architecture
    m_dim = data.inputs.shape[1]  # input dimension
    p_dim = data.outputs.shape[1]  # output dimension
    # list(...) so that any sequence (list or tuple) can be passed in.
    l_dim = [m_dim] + list(l_dim) + [p_dim]  # layer dimensions
    layers = len(l_dim) - 1

    # operations from operations.py (lin and add are not used below)
    lin = ops.linear()
    add = ops.addition()

    # set activation function
    if activation == 'tanh':
        tf_act = tf.nn.tanh
        op_act = ops.tanh()
    elif activation == 'linear':
        tf_act = tf.identity
        op_act = ops.identity()
    elif activation == 'sigmoid':
        tf_act = tf.nn.sigmoid
        op_act = ops.sigmoid()
    elif activation == 'relu':
        tf_act = tf.nn.relu
        op_act = ops.relu()
    else:
        # Fail early with a clear message instead of a NameError later on.
        raise ValueError(
            "unknown activation %r; expected 'tanh', 'linear', 'sigmoid' "
            "or 'relu'" % (activation,))

    # put activations in lists
    acts = (layers + 1) * [None]  # activation functions
    tf_acts = (layers + 1) * [None]  # activation functions
    for l in range(1, layers):
        acts[l] = op_act
        tf_acts[l] = tf_act
    # last activation function is just identity, so we can offload the
    # pre-loss nonlinearity to the 'loss' layer
    acts[-1] = ops.identity()
    tf_acts[-1] = tf.identity

    def nonlin_layer(x_in, W_in, b_in):
        # Hidden layer: affine map followed by the chosen activation.
        return tf_act(tf.matmul(x_in, W_in) + b_in)

    def affine_layer(x_in, W_in, b_in):
        # Top layer: affine map only.
        return tf.matmul(x_in, W_in) + b_in

    # put op functions in lists...
    f = (layers + 1) * [None]
    for l in range(1, layers):
        f[l] = nonlin_layer
    f[-1] = affine_layer

    # initialize variable lists
    W = (layers + 1) * [None]  # forward weights
    b = (layers + 1) * [None]  # biases
    train_op_W = (layers + 1) * [None]
    train_op_p = (layers + 1) * [None]
    train_op_tx = (layers + 1) * [None]
    summary_ops = (layers + 1) * [None]

    # initialize activation lists
    x = (layers + 1) * [None]
    tx = (layers + 1) * [None]
    p = (layers + 1) * [None]
    loss = (layers + 1) * [None]
    tloss = (layers + 1) * [None]
    ploss = (layers + 1) * [None]

    # create tensorflow graph with layer-local loss functions
    tf.reset_default_graph()

    # placeholders
    x[0] = tf.placeholder(tf.float32, shape=[None, l_dim[0]], name='input')
    tx[-1] = tf.placeholder(tf.float32, shape=[None, l_dim[-1]], name='output')

    # 0 layer stuff
    tx[0] = tf.get_variable('layer0_ffx_tar', shape=[batch_size, l_dim[0]],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer(0.))
    loss[0] = 0.
    tloss[0] = 0.
    ploss[0] = 0.

    opt = tf.train.RMSPropOptimizer(learning_rate)

    for l in range(1, layers + 1):
        with tf.name_scope('layer' + str(l) + '_ff') as scope:
            W[l] = tf.get_variable(scope + 'W', shape=[l_dim[l - 1], l_dim[l]],
                                   dtype=tf.float32,
                                   initializer=tf.orthogonal_initializer(0.95))
            b[l] = tf.get_variable(scope + 'b', shape=[1, l_dim[l]],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.))
            x[l] = f[l](x[l - 1], W[l], b[l])
            # NOTE(review): for l == layers this replaces the 'output'
            # placeholder stored in tx[-1] with a variable; the training
            # loop then feeds the batch targets into that variable.
            tx[l] = tf.get_variable(scope + 'x_tar',
                                    shape=[batch_size, l_dim[l]],
                                    dtype=tf.float32,
                                    initializer=tf.constant_initializer(0.))
            p[l] = tf.get_variable(scope + 'p', shape=[batch_size, l_dim[l]],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.))

            if l == layers:
                # loss[l] = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=x[l], labels=tx[l]))
                # correct_prediction = tf.equal(tf.argmax( tf.nn.softmax(x[l]), 1 ), tf.argmax( tx[l], 1 ))
                loss[l] = 0.5 * tf.reduce_mean((x[l] - tx[l]) ** 2.)
                correct_prediction = tf.equal(tf.argmax(x[l], 1),
                                              tf.argmax(tx[l], 1))
                accuracy = tf.reduce_mean(
                    tf.cast(correct_prediction, tf.float32))
                # Registered for tf.summary.merge_all() below.
                accuracy_summary = tf.summary.scalar('accuracy', accuracy)
            elif l < layers:
                loss[l] = 0.5 * tf.reduce_mean((x[l] - tx[l]) ** 2.)

            # NOTE(review): the source's indentation was lost; the log-loss
            # summary is assumed to be recorded for every layer.
            summary_ops[l] = tf.summary.scalar('log_loss' + str(l),
                                               tf.log(loss[l]))

            # target loss term
            tloss[l] = 0.5 * gamma * tf.nn.l2_loss(
                f[l](tx[l - 1], W[l], b[l]) - tx[l])

            # Lagrange multiplier term
            ploss[l] = tf.reduce_sum(
                tf.multiply(p[l], f[l](tx[l - 1], W[l], b[l]) - tx[l]))

            train_op_W[l] = opt.minimize(loss[l] + tloss[l] + ploss[l],
                                         var_list=[W[l], b[l]])
            # Gradient ascent on the multiplier (minimize the negated term).
            train_op_p[l] = tf.train.GradientDescentOptimizer(gamma).minimize(
                -ploss[l], var_list=[p[l]])

    # A target is coupled to the layer below (own loss) and the layer above.
    for l in range(0, layers):
        train_op_tx[l] = opt.minimize(
            loss[l] + tloss[l] + tloss[l + 1] + ploss[l] + ploss[l + 1],
            var_list=[tx[l]])

    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tb_path)

    try:
        # The context manager closes the session even if training fails.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            for t in range(t_steps + 1):
                if SGD:
                    x0, y = data.next_batch(batch_size)
                else:
                    x0 = data.inputs[:batch_size]
                    y = data.outputs[:batch_size]

                feed_dict = {x[0]: x0, tx[-1]: y}

                # Alternate target and weight updates from bottom to top.
                sess.run(train_op_tx[0], feed_dict=feed_dict)
                for l in range(1, layers):
                    sess.run(train_op_tx[l], feed_dict=feed_dict)
                    sess.run(train_op_W[l], feed_dict=feed_dict)
                sess.run(train_op_W[-1], feed_dict=feed_dict)

                if t % 5 == 0:
                    for l in range(1, layers + 1):
                        sess.run(train_op_p[l], feed_dict=feed_dict)

                # Summaries are written on every step.
                writer.add_summary(sess.run(merged, feed_dict=feed_dict), t)

                if t % 20 == 0:
                    progress = sess.run([loss[-1], accuracy],
                                        feed_dict=feed_dict)
                    # Single pre-formatted string: identical output under
                    # Python 2 and Python 3.
                    print('Iter:  %s Loss, accuracy:  %s' % (t, progress))

            # Test-set evaluation is currently disabled, so nothing is
            # stored in the result.
            output_dict = {}
    finally:
        # Flush and release the event file.
        writer.close()

    return output_dict
def self_cross_attention(
        config,
        context_embedded,
        context_len,
        candidate_embedded,
        candidate_len
):
    """Score candidates against a context with stacked self/cross attention.

    :param config: dict; ``config['stack_num']`` is the number of stacked
        attention levels
    :param context_embedded: shape = (batch_size, max_turn_num, max_turn_len, emb_size)
    :param context_len: shape = (batch_size, max_turn_num )
    :param candidate_embedded: shape = (batch_size, options_num, max_turn_len, emb_size)
    :param candidate_len: shape = (batch_size, options_num)
    :return: ``(probs, logits)``; logits are reshaped to
        (-1, options_num) and probs is their softmax
    """
    C_stack = [context_embedded]
    R_stack = [candidate_embedded]
    CR_stack = []
    RC_stack = []
    self_C = context_embedded
    self_R = candidate_embedded

    for i in range(config['stack_num']):
        with tf.variable_scope('self_stack_' + str(i), reuse=tf.AUTO_REUSE):
            # self_C.shape = (batch_size, max_turn_num, max_turn_len, emb_size)
            self_C = pab.self_block(Q=self_C, K=self_C, V=self_C,
                                    Q_lengths=context_len,
                                    K_lengths=context_len)
            # self_R.shape = (batch_size, options_num, max_turn_len, emb_size)
            self_R = pab.self_block(Q=self_R, K=self_R, V=self_R,
                                    Q_lengths=candidate_len,
                                    K_lengths=candidate_len)
        C_stack.append(self_C)
        R_stack.append(self_R)

        # Cross attention uses level i of the stacks, i.e. the
        # representation from before this iteration's self attention.
        # reuse must be passed by keyword: the second positional parameter
        # of tf.variable_scope is default_name, not reuse.
        with tf.variable_scope('C_at_R_stack_' + str(i), reuse=tf.AUTO_REUSE):
            # cross_CR.shape = (batch_size, max_turn_num, options_num, max_turn_len, emb_size)
            cross_CR = pab.cross_block(Q=C_stack[i], K=R_stack[i],
                                       V=R_stack[i],
                                       Q_lengths=context_len,
                                       K_lengths=candidate_len)
        # The level index is part of the scope name so that each level gets
        # its own weights, like the two scopes above.
        with tf.variable_scope('R_at_C_stack_' + str(i), reuse=tf.AUTO_REUSE):
            # cross_RC.shape = (batch_size, options_num, max_turn_num, max_turn_len, emb_size)
            cross_RC = pab.cross_block(Q=R_stack[i], K=C_stack[i],
                                       V=C_stack[i],
                                       Q_lengths=candidate_len,
                                       K_lengths=context_len)
        CR_stack.append(cross_CR)
        RC_stack.append(cross_RC)

    # Cross attention on the final self-attention level (built outside any
    # of the per-level variable scopes).
    CR_stack.append(pab.cross_block(Q=C_stack[-1], K=R_stack[-1],
                                    V=R_stack[-1],
                                    Q_lengths=context_len,
                                    K_lengths=candidate_len))
    RC_stack.append(pab.cross_block(Q=R_stack[-1], K=C_stack[-1],
                                    V=C_stack[-1],
                                    Q_lengths=candidate_len,
                                    K_lengths=context_len))

    # NOTE(review): the 200.0 scaling constant presumably equals emb_size --
    # confirm against the embedding configuration.
    # self_feature.shape = (batch_size, options_num, max_turn_num, max_turn_len, max_turn_len, stack_num)
    self_F = tf.einsum('bijks,bmnks->bimjns',
                       tf.stack(R_stack, axis=-1),
                       tf.stack(C_stack, axis=-1)) / tf.sqrt(200.0)
    # cross_feature.shape = (batch_size, options_num, max_turn_num, max_turn_len, max_turn_len, stack_num)
    cross_F = tf.einsum('bijkls,bjizls->bijkzs',
                        tf.stack(RC_stack, axis=-1),
                        tf.stack(CR_stack, axis=-1)) / tf.sqrt(200.0)

    # Fold the options axis into the batch axis and concatenate self and
    # cross features along the last (stack) axis.
    # feature.shape = (batch_size * options_num, max_turn_num, max_turn_len, max_turn_len, stack_num)
    feature = tf.reshape(
        tf.concat([self_F, cross_F], axis=-1),
        shape=[-1, self_F.shape[2], self_F.shape[3], self_F.shape[4],
               self_F.shape[5] + cross_F.shape[5]])

    with tf.variable_scope('cnn_aggregation'):
        final_info = pop.CNN_3d(feature, 32, 16)

    with tf.variable_scope('linear'):
        W = tf.get_variable(
            name='weights',
            shape=[final_info.shape[-1], 1],
            initializer=tf.orthogonal_initializer())
        bias = tf.get_variable(
            name='bias',
            shape=[1],
            initializer=tf.zeros_initializer())

    # One score per (example, option); unfold the options axis again.
    logits = tf.reshape(tf.matmul(final_info, W) + bias,
                        [-1, self_F.shape[1]])
    probs = tf.nn.softmax(logits)
    return probs, logits
def testDuplicatedInitializer(self):
    """Assert that ``duplicated_initializer`` reports False for the
    orthogonal initializer, called with the arguments ``1`` and
    ``(10, 10)``."""
    orthogonal_init = tf.orthogonal_initializer()
    is_duplicated = duplicated_initializer(self, orthogonal_init, 1, (10, 10))
    self.assertFalse(is_duplicated)
def lstm_cell(self):
    """Return a new ``LSTMCell`` with ``self.rnn_size`` units and an
    orthogonal weight initializer."""
    orthogonal_init = tf.orthogonal_initializer()
    cell = tf.nn.rnn_cell.LSTMCell(self.rnn_size, initializer=orthogonal_init)
    return cell
def __call__(self, inputs, state, scope=None):
    """Run one step of the simplified LSTM cell (no forget gate).

    Args:
        inputs: input tensor for this step; concatenated with ``h`` along
            axis 1.
        state: pair ``(c, h)`` holding the previous cell and hidden state.
        scope: optional variable scope name; defaults to
            ``"simple_lstm_cell"``.

    Returns:
        ``(new_h, (new_c, new_h))`` -- the step output followed by the next
        state, in the same ``(c, h)`` order the state is unpacked in.
    """
    with tf.variable_scope(scope or "simple_lstm_cell", reuse=self._reuse):
        c, h = state
        # The weights are created on the first call only and cached on the
        # instance; later calls reuse the same variable objects.
        if not hasattr(self, '_wi'):
            # All three kernels map the concatenated [inputs, h] vector to
            # num_units, so they share one shape.
            kernel_shape = [inputs.get_shape()[-1] + h.get_shape()[-1], self._num_units]
            bias_shape = [self._num_units]
            self._wi = tf.get_variable('simple_lstm_cell_wi', dtype=tf.float32,
                                       shape=kernel_shape,
                                       initializer=tf.orthogonal_initializer())
            self._bi = tf.get_variable('simple_lstm_cell_bi', dtype=tf.float32,
                                       shape=bias_shape,
                                       initializer=tf.constant_initializer(0.0))
            self._wo = tf.get_variable('simple_lstm_cell_wo', dtype=tf.float32,
                                       shape=kernel_shape,
                                       initializer=tf.orthogonal_initializer())
            self._bo = tf.get_variable('simple_lstm_cell_bo', dtype=tf.float32,
                                       shape=bias_shape,
                                       initializer=tf.constant_initializer(0.0))
            self._wc = tf.get_variable('simple_lstm_cell_wc', dtype=tf.float32,
                                       shape=kernel_shape,
                                       initializer=tf.orthogonal_initializer())
            self._bc = tf.get_variable('simple_lstm_cell_bc', dtype=tf.float32,
                                       shape=bias_shape,
                                       initializer=tf.constant_initializer(0.0))

        # Build the concatenated gate input once instead of once per gate.
        gate_input = tf.concat([inputs, h], 1)
        i = tf.nn.sigmoid(tf.matmul(gate_input, self._wi) + self._bi)
        o = tf.nn.sigmoid(tf.matmul(gate_input, self._wo) + self._bo)
        _c = self._activation(tf.matmul(gate_input, self._wc) + self._bc)

        # remove forget gate according to the paper
        new_c = c + i * _c
        new_h = o * self._activation(new_c)
        return new_h, (new_c, new_h)
# NOTE(review): the next three statements are the tail of a function whose
# `def` line (and any enclosing scope) lies outside this chunk; they are kept
# verbatim.  They feed x / 4.0 through n_layer ReLU fully-connected layers of
# width n_hidden, then a single linear unit, and return that unit's output.
h = slim.stack(tf.divide(x, 4.0), slim.fully_connected, [n_hidden] * n_layer, activation_fn=tf.nn.relu)
log_d = slim.fully_connected(h, 1, activation_fn=None)
return log_d

# Start from an empty default graph before building the model.
tf.reset_default_graph()

# Batch of real samples; sample_mog is defined elsewhere
# (presumably a mixture-of-Gaussians sampler -- confirm).
data = sample_mog(params['batch_size'])

# Generator input: batch_size draws of dimension z_dim from ds.Normal built
# with zeros and ones (assumes the arguments are (loc, scale) -- confirm).
noise = ds.Normal(tf.zeros(params['z_dim']), tf.ones(params['z_dim'])).sample(params['batch_size'])

# Construct generator and discriminator nets
# Inside this arg_scope every slim.fully_connected defaults to an orthogonal
# weight initializer with gain 0.8.
with slim.arg_scope([slim.fully_connected], weights_initializer=tf.orthogonal_initializer(gain=0.8)):
    samples = generator(noise, output_dim=params['x_dim'])
    real_score = discriminator(data)
    # Second call passes reuse=True, presumably so both calls share the
    # discriminator variables -- confirm against discriminator's definition.
    fake_score = discriminator(samples, reuse=True)

# D maximizes this, G minimizes this + a regularizer
# V is the negated mean of the sigmoid cross-entropy with real scores labelled
# 1 and fake scores labelled 0.
V = -tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=real_score, labels=tf.ones_like(real_score)) +
    tf.nn.sigmoid_cross_entropy_with_logits(logits=fake_score, labels=tf.zeros_like(fake_score)))

# Trainable variables collected separately under the "generator" and
# "discriminator" scopes.
gen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "generator")
disc_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "discriminator")
def testInitializerDifferent(self):
    """Compare orthogonal initializers built with seed 1 and seed 2.

    For float32 and float64, ``identicaltest`` (defined elsewhere in the test
    module) must return a falsy result for the two initializers on a
    (10, 10) shape.
    """
    for dtype in (tf.float32, tf.float64):
        first = tf.orthogonal_initializer(seed=1, dtype=dtype)
        second = tf.orthogonal_initializer(seed=2, dtype=dtype)
        identical = identicaltest(self, first, second, (10, 10))
        self.assertFalse(identical)
def smn_model(input_x, input_x_mask, input_y, input_y_mask, word_emb, keep_rate,
              conf, x_len=None, y_len=None):
    """Build the turn-by-turn context/response matching graph.

    Every context turn is matched against the response twice: once on the
    word embeddings and once on GRU outputs projected through ``A_matrix_v``.
    The two match matrices are stacked as channels and sent through a
    conv / max-pool / dense stack shared across turns.  A second GRU then
    runs over the per-turn vectors in turn order.

    Args:
        input_x: context ids looked up in ``word_emb``; the embedded result
            is unstacked along axis 1 into ``conf["max_turn_num"]`` turns.
        input_x_mask: not used by this function.
        input_y: response ids looked up in ``word_emb``.
        input_y_mask: not used by this function.
        word_emb: embedding table passed to ``tf.nn.embedding_lookup``.
        keep_rate: not used by this function.
        conf: mapping; only ``conf["max_turn_num"]`` is read.
        x_len: per-turn lengths, unstacked along axis 1.  It is unstacked
            unconditionally, so it is effectively required despite the
            ``None`` default.
        y_len: response lengths forwarded to ``tf.nn.dynamic_rnn``.

    Returns:
        The final state of the GRU that accumulates the per-turn vectors.
    """
    hidden_size = 200

    context_emb = tf.nn.embedding_lookup(word_emb, input_x)
    response_emb = tf.nn.embedding_lookup(word_emb, input_y)

    sentence_GRU = tf.nn.rnn_cell.GRUCell(
        hidden_size, kernel_initializer=tf.orthogonal_initializer())

    turn_embs = tf.unstack(context_emb, num=conf["max_turn_num"], axis=1)
    turn_lens = tf.unstack(x_len, num=conf["max_turn_num"], axis=1)

    bilinear = tf.get_variable(
        'A_matrix_v',
        shape=(hidden_size, hidden_size),
        initializer=tf.contrib.layers.xavier_initializer(),
        dtype=tf.float32)

    final_GRU = tf.nn.rnn_cell.GRUCell(
        hidden_size, kernel_initializer=tf.orthogonal_initializer())

    # Encode the response once; both views are then transposed with
    # perm=[0, 2, 1] so they can be the right operand of the matmuls below.
    response_gru_out, _ = tf.nn.dynamic_rnn(
        sentence_GRU,
        response_emb,
        sequence_length=y_len,
        dtype=tf.float32,
        scope='sentence_GRU')
    response_emb_t = tf.transpose(response_emb, perm=[0, 2, 1])
    response_gru_t = tf.transpose(response_gru_out, perm=[0, 2, 1])

    per_turn_vectors = []
    for turn_idx, (turn_emb, turn_len) in enumerate(zip(turn_embs, turn_lens)):
        # The first turn creates the conv / dense variables (reuse=None);
        # every later turn shares them (reuse=True).
        share = True if turn_idx else None

        # View 1: word-level similarity.
        word_match = tf.matmul(turn_emb, response_emb_t)

        # View 2: similarity of GRU outputs through the bilinear matrix.
        turn_gru_out, _ = tf.nn.dynamic_rnn(
            sentence_GRU,
            turn_emb,
            sequence_length=turn_len,
            dtype=tf.float32,
            scope='sentence_GRU')
        projected = tf.einsum('aij,jk->aik', turn_gru_out, bilinear)  # TODO: check this
        gru_match = tf.matmul(projected, response_gru_t)

        channels = tf.stack([word_match, gru_match], axis=3, name='matrix_stack')

        # TODO: check other params
        conv_out = tf.layers.conv2d(
            channels,
            filters=8,
            kernel_size=(3, 3),
            padding='VALID',
            kernel_initializer=tf.contrib.keras.initializers.he_normal(),
            activation=tf.nn.relu,
            reuse=share,
            name='conv')
        # TODO: check other params
        pooled = tf.layers.max_pooling2d(
            conv_out, (3, 3), strides=(3, 3), padding='VALID', name='max_pooling')

        # TODO: check whether this is correct
        flat = tf.contrib.layers.flatten(pooled)
        turn_vector = tf.layers.dense(
            flat,
            50,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            activation=tf.tanh,
            reuse=share,
            name='matching_v')
        per_turn_vectors.append(turn_vector)

    # Turns are stacked on axis 0, hence time_major=True.
    # TODO: check time_major
    stacked_turns = tf.stack(per_turn_vectors, axis=0, name='matching_stack')
    _, last_hidden = tf.nn.dynamic_rnn(
        final_GRU,
        stacked_turns,
        dtype=tf.float32,
        time_major=True,
        scope='final_GRU')

    # No classification head or loss is built here; only the final GRU state
    # is returned.
    return last_hidden
def cell():
    """Return a new LSTMCell with ``self.cell_size`` units and orthogonal weight init.

    ``self`` is taken from the enclosing scope.
    """
    return tf.nn.rnn_cell.LSTMCell(self.cell_size, initializer=tf.orthogonal_initializer())