def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            ru = core_rnn_cell._linear([inputs, state],
                                       2 * self._num_units, True, 1.0)
            ru = tf.nn.sigmoid(ru)
            r, u = tf.split(ru, 2, 1)
        with tf.variable_scope("Candidate"):
            # Mixture weights over the candidate weight matrices.
            lambdas = core_rnn_cell._linear([inputs, state],
                                            self._num_weights, True)
            lambdas = tf.split(tf.nn.softmax(lambdas), self._num_weights, 1)
            Ws = tf.get_variable(
                "Ws",
                shape=[self._num_weights,
                       inputs.get_shape()[1], self._num_units])
            # tf.split takes (value, num_or_size_splits, axis).
            Ws = [tf.squeeze(i) for i in tf.split(Ws, self._num_weights, 0)]
            candidate_inputs = []
            for idx, W in enumerate(Ws):
                candidate_inputs.append(tf.matmul(inputs, W) * lambdas[idx])
            Wx = tf.add_n(candidate_inputs)
            c = tf.nn.tanh(Wx + core_rnn_cell._linear(
                [r * state], self._num_units, True, scope="second"))
        new_h = u * state + (1 - u) * c
    return new_h, new_h
def call(self, inputs, state, att_score=None):
    if self._gate_linear is None:
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
        with vs.variable_scope("gates"):  # Reset gate and update gate.
            self._gate_linear = _linear(
                [inputs, state],
                2 * self._num_units,
                True,
                bias_initializer=bias_ones,
                kernel_initializer=self._kernel_initializer)
    value = math_ops.sigmoid(self._gate_linear([inputs, state]))
    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    r_state = r * state
    if self._candidate_linear is None:
        with vs.variable_scope("candidate"):
            self._candidate_linear = _linear(
                [inputs, r_state],
                self._num_units,
                True,
                bias_initializer=self._bias_initializer,
                kernel_initializer=self._kernel_initializer)
    c = self._activation(self._candidate_linear([inputs, r_state]))
    # Scale the update gate by the attention score before blending.
    u = (1.0 - att_score) * u
    new_h = u * state + (1 - u) * c
    return new_h, new_h
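# A minimal numpy sketch of the attention-scaled GRU update above, assuming
# a per-example attention score in [0, 1]; all names (gru_au_step, att, Wg,
# Wc) are illustrative, not from the original code.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_au_step(x, h, att, Wg, bg, Wc, bc):
    """One attention-scaled GRU step for a batch of inputs x and states h."""
    gates = sigmoid(np.concatenate([x, h], 1) @ Wg + bg)   # [B, 2H]
    r, u = np.split(gates, 2, axis=1)                      # reset, update
    c = np.tanh(np.concatenate([x, r * h], 1) @ Wc + bc)   # candidate
    u = (1.0 - att) * u               # attention damps the update gate
    return u * h + (1.0 - u) * c      # new hidden state

B, D, H = 2, 3, 4
rng = np.random.default_rng(0)
x, h = rng.normal(size=(B, D)), np.zeros((B, H))
att = rng.uniform(size=(B, 1))
Wg, bg = rng.normal(size=(D + H, 2 * H)), np.ones(2 * H)  # gate bias 1.0 as above
Wc, bc = rng.normal(size=(D + H, H)), np.zeros(H)
print(gru_au_step(x, h, att, Wg, bg, Wc, bc).shape)  # (2, 4)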
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM).""" with tf.variable_scope(scope or type(self).__name__): c, h = state # change bias argument to False since LN will add bias via shift concat = core_rnn_cell._linear([inputs, h, c], 2 * self._num_units, False) i, f = tf.split(concat, 2, 1) j = core_rnn_cell._linear([inputs, h], self._num_units, False) # add layer normalization to each gate i = ln(i, scope='i/') j = ln(j, scope='j/') f = ln(f, scope='f/') new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) * self._activation(j)) o = core_rnn_cell._linear([inputs, h, new_c], self._num_units, False) o = ln(o, scope='o/') # add layer_normalization in calculation of new hidden state new_h = self._activation(ln(new_c, scope='new_h/')) * tf.nn.sigmoid(o) new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h) return new_h, new_state
def get_next_input():
    # Compute Bahdanau-style attention.
    # Performing a convolution, or reshaping the input to (-1, 2*d) and then
    # doing a matmul, is essentially the same operation (see matrix_mult.py;
    # conv2d might be faster):
    # https://stackoverflow.com/questions/38235555/tensorflow-matmul-of-input-matrix-with-batch-data
    encoder_features = tf.nn.conv2d(
        encoder_output, W_att_enc, [1, 1, 1, 1],
        "SAME")  # shape (batch_size, max_enc_steps, 1, attention_vec_size)
    dec_portion = tf.matmul(previous_state.h, W_att_dec)
    decoder_features = tf.expand_dims(
        tf.expand_dims(dec_portion, 1),
        1)  # reshape to (batch_size, 1, 1, attention_vec_size)
    # Python broadcasting allows the two feature tensors to be added.
    e_not_masked = tf.reduce_sum(
        v_blend * tf.nn.tanh(encoder_features + decoder_features),
        [2, 3])  # calculate e, (batch_size, max_enc_steps)
    # The output of a softmax has the same shape as its input:
    # it just normalizes the values.
    attn_dist = tf.nn.softmax(e_not_masked)  # (batch_size, max_enc_steps)
    attn_dist = tf.Print(attn_dist, [tf.shape(attn_dist)],
                         message="attn_dist", first_n=5, summarize=200)
    # Multiply all the 2-D vectors by the same attn_dist values, and finally
    # keep one 2-D vector for every batch example.
    context_vector = tf.reduce_sum(
        tf.reshape(attn_dist, [N, -1, 1, 1]) * encoder_output,
        [1, 2])  # shape (batch_size, attn_size)
    context_vector = tf.reshape(context_vector, [-1, 2 * nodes])
    prediction = tf.cond(
        self.pointer_gen,
        lambda: execute_pointer_network(attn_dist),
        lambda: execute_normal_decoder(previous_output, W_dense, b_dense))
    with tf.variable_scope("modified_dec_inputs", reuse=tf.AUTO_REUSE):
        next_input = tf.cond(
            self.is_train,
            lambda: _linear(
                args=[context_vector] +
                [tf.reshape(decoder_emb_inp[:, time], (N, dw))],
                output_size=dw, bias=True),
            lambda: _linear(
                [context_vector] +
                [tf.nn.embedding_lookup(word_emb_mat, prediction)], dw, True))
    return next_input, attn_dist
def __call__(self, inputs, state):
    with vs.variable_scope('Gates'):
        value = _linear([inputs, state], 2 * self._num_units, True, 1.0)
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
        r = ln(r, scope='r/')
        u = ln(u, scope='u/')
        r, u = sigmoid(r), sigmoid(u)
    with vs.variable_scope('Candidate'):
        Cand = _linear([inputs, r * state], self._num_units, True)
        c_pre = ln(Cand, scope='new_h/')
        c = self._activation(c_pre)
    new_h = u * state + (1 - u) * c
    return new_h, new_h
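# Several of these cells call a helper `ln` that is not shown. A minimal
# numpy sketch of such a layer normalization, assuming per-feature gain and
# shift (fixed here to 1 and 0 for brevity):
import numpy as np

def ln(x, eps=1e-5, gain=1.0, shift=0.0):
    """Normalize each row of x to zero mean / unit variance, then scale."""
    mu = x.mean(axis=1, keepdims=True)
    sigma = x.std(axis=1, keepdims=True)
    return gain * (x - mu) / (sigma + eps) + shift

x = np.array([[1.0, 2.0, 3.0], [10.0, 10.0, 10.0]])
print(ln(x))  # rows normalized independently; a constant row maps to ~0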
def __build_encoder_state_computer(self, emb_encoder_inputs, encoder_mask):
    with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                       reuse=None):
        with variable_scope.variable_scope("seq2seq_Encoder"):
            encoder_cell_fw = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
            encoder_cell_bw = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
            encoder_cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                encoder_cell_fw, output_keep_prob=self.keep_prob)
            encoder_cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                encoder_cell_bw, output_keep_prob=self.keep_prob)
            (outputs, encoder_state_fw,
             encoder_state_bw) = rnn.static_bidirectional_rnn(
                 encoder_cell_fw, encoder_cell_bw, emb_encoder_inputs,
                 dtype=tf.float32)
            encoder_outputs = outputs
            encoder_state_c = encoder_state_bw[0]
            encoder_state_m = encoder_state_bw[1]
            with variable_scope.variable_scope("initial_transfor_c"):
                final_state_c = core_rnn_cell._linear(
                    encoder_state_c, self.hidden_size, True)
                final_state_c = tf.tanh(final_state_c)
            with variable_scope.variable_scope("initial_transfor_m"):
                final_state_m = core_rnn_cell._linear(
                    encoder_state_m, self.hidden_size, True)
                final_state_m = tf.tanh(final_state_m)
            final_state = tf.nn.rnn_cell.LSTMStateTuple(final_state_c,
                                                        final_state_m)
            # First calculate a concatenation of encoder outputs to put
            # attention on; cell.output_size is embedding_size.
            top_states = [
                array_ops.reshape(e, [-1, 1, encoder_cell_fw.output_size * 2])
                for e in encoder_outputs
            ]
            attention_states = array_ops.concat(top_states, 1)
            final_attention_states = tf.multiply(encoder_mask,
                                                 attention_states)
            return final_state, final_attention_states
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM). @param: inputs (batch,n) @param state: the states and hidden unit of the two cells """ with tf.variable_scope(scope or type(self).__name__): c1, c2, h1, h2 = state # change bias argument to False since LN will add bias via shift concat = _linear([inputs, h1, h2], 5 * self._num_units, False) i, j, f1, f2, o = tf.split(value=concat, num_or_size_splits=5, axis=1) # add layer normalization to each gate i = ln(i, scope='i/') j = ln(j, scope='j/') f1 = ln(f1, scope='f1/') f2 = ln(f2, scope='f2/') o = ln(o, scope='o/') new_c = (c1 * tf.nn.sigmoid(f1 + self._forget_bias) + c2 * tf.nn.sigmoid(f2 + self._forget_bias) + tf.nn.sigmoid(i) * self._activation(j)) # add layer_normalization in calculation of new hidden state new_h = self._activation(ln(new_c, scope='new_h/')) * tf.nn.sigmoid(o) new_state = LSTMStateTuple(new_c, new_h) return new_h, new_state
def linear(args, output_size, bias, bias_start=0.0, scope=None,
           squeeze=False, wd=0.0, input_keep_prob=1.0, is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]
    # Collapse all leading dimensions so _linear sees rank-2 inputs.
    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [
            tf.cond(is_train,
                    lambda: tf.nn.dropout(arg, input_keep_prob),
                    lambda: arg) for arg in flat_args
        ]
    with tf.variable_scope(scope or 'Linear'):
        flat_out = _linear(
            flat_args, output_size, bias,
            bias_initializer=tf.constant_initializer(bias_start))
    # Restore the leading dimensions of the first argument.
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    if wd:
        add_wd(wd)
    return out
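# The `flatten`/`reconstruct` helpers above collapse all but the last axis
# before the matmul and restore the leading axes afterwards. A rough numpy
# equivalent of that round trip (linear_nd is an illustrative name):
import numpy as np

def linear_nd(x, W, b):
    """Apply a dense layer to the last axis of an arbitrary-rank tensor."""
    flat = x.reshape(-1, x.shape[-1])              # flatten(arg, 1)
    out = flat @ W + b                             # _linear
    return out.reshape(*x.shape[:-1], W.shape[1])  # reconstruct(out, x, 1)

x = np.ones((2, 5, 3))           # e.g. [batch, time, features]
W, b = np.ones((3, 4)), np.zeros(4)
print(linear_nd(x, W, b).shape)  # (2, 5, 4)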
def __call__(self, inputs, state, scope=None): """MGU with nunits cells.""" with tf.variable_scope(scope or type(self).__name__): # "MGUCell" with tf.variable_scope("forget_gate"): arg = _linear([state, inputs], self._num_units, True) f = math_ops.sigmoid(arg) print(f) with tf.variable_scope("candidate"): h_tilde = tf.tanh( _linear([inputs, f * state], self._num_units, True)) h = (1 - f) * state + f * h_tilde return h, h
def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) with nunits cells.""" with _checked_scope(self, scope or "gru_cell", reuse=self._reuse): with vs.variable_scope("gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. # 一次计算出两个gate的值 value = sigmoid( _linear([inputs, state], 2 * self._num_units, True, 1.0)) r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) with vs.variable_scope("candidate"): c = self._activation( _linear([inputs, r * state], self._num_units, True)) new_h = u * state + (1 - u) * c # GRU里面输出和state都是一个h return new_h, new_h
def call(self, input, states):
    h = states.h
    c = states.c
    z = states.z
    ha, hb, z_b = tf.split(
        input, [self.h_above_size, self.h_below_size, 1], 1)
    s_rec = h
    s_td = z * ha
    s_bu = z_b * hb
    bias_init = tf.constant_initializer(0, dtype=tf.float32)
    # [B, 4d + 1], where d is the state size.
    concat = core_rnn_cell._linear(
        [s_rec, s_td, s_bu], 4 * self.hstate_size + 1,
        bias=True, bias_initializer=bias_init)
    pre_f, pre_i, pre_o, pre_g, pre_z_next = tf.split(
        concat,
        [self.hstate_size, self.hstate_size, self.hstate_size,
         self.hstate_size, 1], 1)
    i = tf.sigmoid(pre_i)  # [B, h_l]
    g = tf.tanh(pre_g)     # [B, h_l]
    f = tf.sigmoid(pre_f)  # [B, h_l]
    o = tf.sigmoid(pre_o)  # [B, h_l]
    z = tf.squeeze(z, axis=[1])
    z_b = tf.squeeze(z_b, axis=[1])
    c_next = tf.where(
        tf.equal(z, tf.constant(1, dtype=tf.float32)),
        tf.multiply(i, g),  # flush
        tf.where(
            tf.equal(z_b, tf.constant(1, dtype=tf.float32)),
            tf.add(tf.multiply(c, f), tf.multiply(i, g)),  # update
            tf.identity(c)))  # copy
    h_next = tf.where(
        tf.equal(z, tf.constant(1, dtype=tf.float32)),
        tf.multiply(o, tf.tanh(c_next)),  # flush
        tf.where(
            tf.equal(z_b, tf.constant(1, dtype=tf.float32)),
            tf.multiply(o, tf.tanh(c_next)),  # update
            tf.identity(h)))  # copy
    slope_multiplier = 1
    pre_z_next = tf.sigmoid(pre_z_next * slope_multiplier)
    # Straight-through estimator: round in the forward pass, identity
    # gradient in the backward pass. (Renamed the graph handle so it does
    # not shadow the gate tensor g.)
    graph = tf.get_default_graph()
    with graph.gradient_override_map({"Round": "Identity"}):
        z_next = tf.round(pre_z_next)
    out_state = HMLSTMStateTuple(c=c_next, h=h_next, z=z_next)
    h_next = tf.nn.dropout(h_next, keep_prob=self.keep_p)
    output = tf.concat([h_next, z_next], axis=1)
    return output, out_state, concat
def attention(query): """Point on hidden using hidden_features and query.""" with vs.variable_scope("Attention"): y = core_rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y), [2, 3]) return s
def call(self, inputs, state): """ Hierarchical multi-scale long short-term memory cell (HMLSTM) inputs: [B, hb_l + 1 + ha_l] state: (c=[B, h_l], h=[B, h_l], z=[B, 1]) output: [B, h_l + 1] new_state: (c=[B, h_l], h=[B, h_l], z=[B, 1]) """ c = state.c # [B, h_l] h = state.h # [B, h_l] z = state.z # [B, 1] in_splits = tf.constant([self._h_below_size, 1, self._h_above_size]) hb, zb, ha = array_ops.split( value=inputs, num_or_size_splits=in_splits, axis=1, name='split') # [B, hb_l], [B, 1], [B, ha_l] s_recurrent = h # [B, h_l] expanded_z = z # [B, 1] s_above = tf.multiply(expanded_z, ha) # [B, ha_l] s_below = tf.multiply(zb, hb) # [B, hb_l] length = 4 * self._num_units + 1 states = [s_recurrent, s_above, s_below] bias_init = tf.constant_initializer(-1e5, dtype=tf.float32) # [B, 4 * h_l + 1] concat = core_rnn_cell._linear(states, length, bias=False, bias_initializer=bias_init) gate_splits = tf.constant(([self._num_units] * 4) + [1], dtype=tf.int32) i, g, f, o, z_tilde = array_ops.split(value=concat, num_or_size_splits=gate_splits, axis=1) i = tf.sigmoid(i) # [B, h_l] g = tf.tanh(g) # [B, h_l] f = tf.sigmoid(f) # [B, h_l] o = tf.sigmoid(o) # [B, h_l] new_c = self.calculate_new_cell_state(c, g, i, f, z, zb) new_h = self.calculate_new_hidden_state(h, o, new_c, z, zb) new_z = tf.expand_dims(self.calculate_new_indicator(z_tilde), -1) output = array_ops.concat((new_h, new_z), axis=1) # [B, h_l + 1] new_state = HMLSTMState(c=new_c, h=new_h, z=new_z) return output, new_state
def __call__(self, inputs, state):
    """Gated recurrent unit (GRU) with num_units cells."""
    with vs.variable_scope('Gates'):
        # Start with bias of 1.0 so the gates neither reset nor update.
        value = _linear([inputs, state], 2 * self._num_units, True,
                        bias_initializer=tf.constant_initializer(1.0))
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
        r = ln(r, scope='r/')
        u = ln(u, scope='u/')
        r, u = sigmoid(r), sigmoid(u)
    with vs.variable_scope('Candidate'):
        Cand = _linear([inputs, r * state], self._num_units, True)
        c_pre = ln(Cand, scope='new_h/')
        c = self._activation(c_pre)
    new_h = u * state + (1 - u) * c
    return new_h, new_h
def call(self, inputs, state): """ Conditionl GRU operations inputs: [batch_size, num_units] state: (h=[batch_size, num_units], c=[batch_size, num_units]) output: [batch_size, num_units] new_state: (h=[batch_size, num_units], c=[batch_size, num_units]) """ h = state.h c = state.c bias_ones = self._bias_initializer if self._bias_initializer is None: bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype) with vs.variable_scope('gates'): val_concat = core_rnn_cell._linear( [inputs, h, c], 2 * self._num_units, bias=False, bias_initializer=self._bias_initializer, kernel_initializer=self._kernel_initializer) val = math_ops.sigmoid(val_concat) r, z = array_ops.split(value=val, num_or_size_splits=2, axis=1) r_state = r * h with vs.variable_scope('candidate'): hbar_out = core_rnn_cell._linear( [inputs, r_state, c], self._num_units, bias=False, bias_initializer=self._bias_initializer, kernel_initializer=self._kernel_initializer) hbar = self._activation(hbar_out) output = (1 - z) * h + z * hbar new_state = ConditionalGRUState(h=output, c=c) return output, new_state
def highway(input_, size, layer_size=1, bias=-2, f=tf.nn.relu):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).

    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
    """
    output = input_
    for idx in range(layer_size):
        with tf.variable_scope('output_lin_%d' % idx):
            output = f(core_rnn_cell._linear(output, size, False))
        with tf.variable_scope('transform_lin_%d' % idx):
            transform_gate = tf.sigmoid(
                core_rnn_cell._linear(input_, size, False) + bias)
        carry_gate = 1. - transform_gate
        output = transform_gate * output + carry_gate * input_
    return output
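# A minimal numpy sketch of one highway layer as described in the docstring
# above, with the transform-gate bias initialized negative so the carry gate
# dominates early in training; all weight names are illustrative.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def highway_layer(y, Wh, bh, Wt, bt, f=np.tanh):
    t = sigmoid(y @ Wt + bt)                      # transform gate
    return t * f(y @ Wh + bh) + (1.0 - t) * y     # carry gate is 1 - t

rng = np.random.default_rng(2)
y = rng.normal(size=(2, 4))
Wh, bh = rng.normal(size=(4, 4)), np.zeros(4)
Wt, bt = rng.normal(size=(4, 4)), np.full(4, -2.0)  # bias = -2 as above
print(highway_layer(y, Wh, bh, Wt, bt).shape)  # (2, 4)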
def __call__(self, inputs, state, scope=None): """Long short-term memory cell with attention (LSTMA).""" if self._state_is_tuple: # 这里把state分为三个部分,LSTM的state,attns(代表attention向量)和attn的state state, attns, attn_states = state else: # 如果不是元组,就按照长度切分 states = state state = array_ops.slice(states, [0, 0], [-1, self._cell.state_size]) attns = array_ops.slice( states, [0, self._cell.state_size], [-1, self._attn_size]) attn_states = array_ops.slice( states, [0, self._cell.state_size + self._attn_size], [-1, self._attn_size * self._attn_length]) # attention状态是[None x Attention向量长度 x Attention窗口长度] attn_states = array_ops.reshape(attn_states, [-1, self._attn_length, self._atten_size]) input_size = self._input_size if input_size is None: input_size = inputs.get_shape().as_list()[1] # 让input 和 attns 进行一个什么运算呢? inputs = _linear([inputs, attns], input_size, True) lstm_output, new_state = self._cell(inputs, state) if self._state_is_tuple: new_state_cat = array_ops.concat(nest.flatten(new_state), 1) else: new_state_cat = new_state # 利用attention机制计算出下一时刻需要的上下文向量c_t和attention状态(隐藏状态)h_j new_attns, new_attn_states = self._attention(new_state_cat, attn_states) with vs.variable_scope("attn_output_projection"): # 利用c_t和x_t(y_{t-1})计算出t时刻输出s_t output = _linear([lstm_output, new_state], self._atten_size,True) # 把当前时刻输出s_t增加到下一时刻attention状态去 new_attn_states = array_ops.concat([new_attn_states, array_ops.expand_dims(output,1)],1) new_attn_states = array_ops.reshape(new_attn_states, [-1, self._attn_length * self._attn_size]) new_state = (new_state, new_attns, new_attn_states) if not self._state_is_tuple: new_state = array_ops.concat(list(new_state), 1) return output, new_state
def __call__(self, lm_inputs, seq_len):
    """Runs the RNN and returns the logits."""
    params = self.params
    emb_inputs = self.prepare_decoder_input(lm_inputs[:-1, :])
    outputs, _ = tf.nn.dynamic_rnn(self.cell, emb_inputs,
                                   sequence_length=seq_len,
                                   dtype=tf.float32, time_major=True)
    # T x B x H => (T x B) x H
    outputs = tf.reshape(outputs, [-1, self.cell.output_size])
    with tf.variable_scope("rnn"):
        # Additional variable scope required to mimic the attention
        # decoder scope so that variable initialization is hassle free.
        if params.lm_hidden_size != params.proj_size:
            with tf.variable_scope("SimpleProjection"):
                outputs = _linear([outputs], params.proj_size, True)
        with tf.variable_scope("OutputProjection"):
            outputs = _linear([outputs], params.vocab_size, True)
    return outputs
def attention(query):
    """
    Point on hidden using hidden_features and query.

    :param query: shape [batch_size, attention_size]
    :return: attention logits, one per encoder position
    """
    with vs.variable_scope('Attention'):
        # y shape: [batch_size, attention_size];
        # equivalent to computing W2 * d_j on the decoder state.
        y = core_rnn_cell._linear(query, attention_vec_size, True)
        # y shape: [batch_size, 1, 1, attention_size]
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y),
                                [2, 3])
        return s
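# A numpy sketch of the additive (Bahdanau) score that the attention
# functions above compute: e_j = v^T tanh(W1 h_j + W2 q), followed by a
# softmax over encoder positions. All names are illustrative.
import numpy as np

def additive_attention(H, q, W1, W2, v):
    """H: [T, d] encoder states, q: [d] query; returns weights over T."""
    e = np.tanh(H @ W1 + q @ W2) @ v   # [T] scores
    e = e - e.max()                    # numerically stable softmax
    a = np.exp(e)
    return a / a.sum()

T, d, att = 5, 3, 4
rng = np.random.default_rng(3)
H, q = rng.normal(size=(T, d)), rng.normal(size=d)
W1, W2 = rng.normal(size=(d, att)), rng.normal(size=(d, att))
v = rng.normal(size=att)
print(additive_attention(H, q, W1, W2, v).sum())  # 1.0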
def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. if nest.is_sequence( query): # If the query is a tuple, flatten it. query_list = nest.flatten(query) for q in query_list: # Check that ndims == 2 if specified. ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(query_list, 1) for a in range(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = core_rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) #a = a + 1e-5 a1 = tf.multiply(a, encoder_mask) #print (mask_a.get_shape()) floor = math_ops.reduce_sum(a1, axis=1) floor = tf.stack([floor], axis=1) #print (floor.get_shape()) a2 = tf.truediv(a1, floor) nan_bool = tf.is_nan(a2) #mask_a = tf.select(nan_bool, a1+0.1, a2) mask_a = a2 #print (mask_a.get_shape()) #print ("_____________") # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(mask_a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape( d, [-1, attn_size])) #remember this size return ds, mask_a
def attention(query, prev_alpha):
    """Put attention masks on hidden using hidden_features and query."""
    with tf.variable_scope("Attention"):
        y = _linear(query, params.attention_vec_size, True)
        y = tf.reshape(y, [-1, 1, 1, params.attention_vec_size])
        s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
        # Mask the softmax, then renormalize so the weights sum to one.
        alpha = tf.nn.softmax(s) * attn_mask
        sum_vec = tf.reduce_sum(alpha, axis=[1], keepdims=True)
        norm_term = tf.tile(sum_vec, tf.stack([1, tf.shape(alpha)[1]]))
        alpha = alpha / norm_term
        alpha = tf.expand_dims(alpha, 2)
        alpha = tf.expand_dims(alpha, 3)
        context_vec = tf.reduce_sum(alpha * hidden, [1, 2])
    return tuple([context_vec, alpha])
def __call__(self, inputs, state, scope=None): """Run one step of minimal RNN. Args: inputs: input Tensor, 2D, batch x num_units. state: a state Tensor, `2-D, batch x state_size`. Returns: A tuple containing: - A `2-D, [batch x num_units]`, Tensor representing the output of the cell after reading `inputs` when previous state was `state`. - A `2-D, [batch x num_units]`, Tensor representing the new state of cell after reading `inputs` when the previous state was `state`. Same type and shape(s) as `state`. Raises: ValueError: - If input size cannot be inferred from inputs via static shape inference. - If state is not `2D`. """ # Phi projection to a latent space / candidate #z = inputs z = self._activation(inputs) """for i, layer_size in enumerate(self._num_units): with tf.variable_scope("phi_" + str(i)): z = self._activation(_linear( z, layer_size, True, bias_initializer=self._bias_initializer, kernel_initializer=self._kernel_initializer))""" # Update gate bias_ones = self._bias_initializer if self._bias_initializer is None: bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype) with tf.variable_scope("update_gate"): arg = _linear([state, z], self._num_units[-1], True, bias_initializer=bias_ones, kernel_initializer=self._kernel_initializer) u = math_ops.sigmoid(arg) # Activation step new_h = u * state + (1 - u) * z return new_h, new_h
def attention(query, attn_size, V, hidden_features, attn_length,
              attn_states, name, mask):
    if nest.is_sequence(query):
        query_list = nest.flatten(query)
        query = tf.concat(query_list, 1)
    with tf.variable_scope("Attention" + name):
        y = _linear(args=query, output_size=attn_size, bias=True,
                    bias_initializer=self.initializer,
                    kernel_initializer=self.initializer)
        y = tf.reshape(y, [-1, 1, 1, attn_size])
        s = tf.reduce_sum(V * tf.nn.tanh(hidden_features + y), [2, 3])
        a_masked = masked_attention(s, mask)
        c = tf.reduce_sum(
            tf.reshape(a_masked, [-1, attn_length, 1, 1]) * attn_states,
            [1, 2])
        c = tf.reshape(c, [-1, attn_size])
    return c, a_masked
if not remove_slot_attn:
    with tf.variable_scope('slot_attn'):
        attn_size = state_shape[2].value
        origin_shape = tf.shape(state_outputs)
        hidden = tf.expand_dims(state_outputs, 1)
        hidden_conv = tf.expand_dims(state_outputs, 2)
        # k: [filter_height, filter_width, in_channels, out_channels]
        k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
        # [bs, nstep, 1, embed size * 2]
        hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1], "SAME")
        # [bs, nstep, embed size * 2]
        hidden_features = tf.reshape(hidden_features, origin_shape)
        # [bs, 1, nstep, embed size * 2]
        hidden_features = tf.expand_dims(hidden_features, 1)
        v = tf.get_variable("AttnV", [attn_size])
        slot_inputs_shape = tf.shape(slot_inputs)
        # [bs * nstep, embed size * 2]
        slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
        # [bs * nstep, embed size * 2]
        y = core_rnn_cell._linear(slot_inputs, attn_size, True)
        # [bs, nstep, embed size * 2]
        y = tf.reshape(y, slot_inputs_shape)
        # [bs, nstep, 1, embed size * 2]
        y = tf.expand_dims(y, 2)
        # [bs, nstep, nstep] = [bs, 1, nstep, hidden size]
        #                    + [bs, nstep, 1, embed size * 2]
        s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
        a = tf.nn.softmax(s)
        # a shape = [bs, nstep, nstep, 1]
        a = tf.expand_dims(a, -1)
        # slot_d shape = [bs, nstep, embed size * 2]
        slot_d = tf.reduce_sum(a * hidden, [2])
        slot_output = tf.reshape(slot_d, [-1, attn_size])
else:
    attn_size = state_shape[2].value
    slot_d = state_outputs
def createModel(input_data, input_size, sequence_length, slot_size,
                intent_size, layer_size=128, isTraining=True):
    cell_fw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    cell_bw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(
            cell_fw, input_keep_prob=0.5, output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(
            cell_bw, input_keep_prob=0.5, output_keep_prob=0.5)
    embedding = tf.get_variable("embedding", [input_size, layer_size])
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    state_outputs, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, inputs, sequence_length=sequence_length,
        dtype=tf.float32)
    final_state = tf.concat(
        [final_state[0][0], final_state[0][1],
         final_state[1][0], final_state[1][1]], 1)
    state_outputs = tf.concat([state_outputs[0], state_outputs[1]], 2)
    state_shape = state_outputs.get_shape()

    with tf.variable_scope("attention"):
        slot_inputs = state_outputs
        if not remove_slot_attn:
            with tf.variable_scope("slot_attn"):
                attn_size = state_shape[2].value
                origin_shape = tf.shape(state_outputs)
                hidden = tf.expand_dims(state_outputs, 1)
                hidden_conv = tf.expand_dims(state_outputs, 2)
                # hidden shape = [batch, sentence length, 1, hidden size]
                k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
                hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1],
                                               "SAME")
                hidden_features = tf.reshape(hidden_features, origin_shape)
                hidden_features = tf.expand_dims(hidden_features, 1)
                v = tf.get_variable("AttnV", [attn_size])
                slot_inputs_shape = tf.shape(slot_inputs)
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                y = core_rnn_cell._linear(slot_inputs, attn_size, True)
                y = tf.reshape(y, slot_inputs_shape)
                y = tf.expand_dims(y, 2)
                s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
                a = tf.nn.softmax(s)
                # a shape = [batch, input size, sentence length, 1]
                a = tf.expand_dims(a, -1)
                slot_d = tf.reduce_sum(a * hidden, [2])
        else:
            attn_size = state_shape[2].value
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        intent_input = final_state
        with tf.variable_scope("intent_attn"):
            attn_size = state_shape[2].value
            hidden = tf.expand_dims(state_outputs, 2)
            k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
            hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = tf.get_variable("AttnV", [attn_size])
            y = core_rnn_cell._linear(intent_input, attn_size, True)
            y = tf.reshape(y, [-1, 1, 1, attn_size])
            s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
            a = tf.nn.softmax(s)
            a = tf.expand_dims(a, -1)
            a = tf.expand_dims(a, -1)
            d = tf.reduce_sum(a * hidden, [1, 2])
            if add_final_state_to_intent:
                intent_output = tf.concat([d, intent_input], 1)
            else:
                intent_output = d

        with tf.variable_scope("slot_gated"):
            intent_gate = core_rnn_cell._linear(intent_output, attn_size,
                                                True)
            intent_gate = tf.reshape(
                intent_gate, [-1, 1, intent_gate.get_shape()[1].value])
            v1 = tf.get_variable("gateV", [attn_size])
            if not remove_slot_attn:
                slot_gate = v1 * tf.tanh(slot_d + intent_gate)
            else:
                slot_gate = v1 * tf.tanh(state_outputs + intent_gate)
            slot_gate = tf.reduce_sum(slot_gate, [2])
            slot_gate = tf.expand_dims(slot_gate, -1)
            if not remove_slot_attn:
                slot_gate = slot_d * slot_gate
            else:
                slot_gate = state_outputs * slot_gate
            slot_gate = tf.reshape(slot_gate, [-1, attn_size])
            slot_output = tf.concat([slot_gate, slot_inputs], 1)

    with tf.variable_scope("intent_proj"):
        intent = core_rnn_cell._linear(intent_output, intent_size, True)
    with tf.variable_scope("slot_proj"):
        slot = core_rnn_cell._linear(slot_output, slot_size, True)
    outputs = [slot, intent]
    return outputs
def call(self, inputs, state, att_score=None):
    # The last two input columns carry the time features.
    time_now_score = tf.expand_dims(inputs[:, -1], -1)
    time_last_score = tf.expand_dims(inputs[:, -2], -1)
    inputs = inputs[:, :-2]
    inputs = inputs * att_score
    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid
    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")
    if self._time_kernel_w1 is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope,
                               initializer=self._initializer) as unit_scope:
            with vs.variable_scope(unit_scope):
                self._time_input_w1 = vs.get_variable(
                    "_time_input_w1", shape=[self._num_units], dtype=dtype)
                self._time_input_bias1 = vs.get_variable(
                    "_time_input_bias1", shape=[self._num_units], dtype=dtype)
                self._time_input_w2 = vs.get_variable(
                    "_time_input_w2", shape=[self._num_units], dtype=dtype)
                self._time_input_bias2 = vs.get_variable(
                    "_time_input_bias2", shape=[self._num_units], dtype=dtype)
                self._time_kernel_w1 = vs.get_variable(
                    "_time_kernel_w1", shape=[input_size, self._num_units],
                    dtype=dtype)
                self._time_kernel_t1 = vs.get_variable(
                    "_time_kernel_t1",
                    shape=[self._num_units, self._num_units], dtype=dtype)
                self._time_bias1 = vs.get_variable(
                    "_time_bias1", shape=[self._num_units], dtype=dtype)
                self._time_kernel_w2 = vs.get_variable(
                    "_time_kernel_w2", shape=[input_size, self._num_units],
                    dtype=dtype)
                self._time_kernel_t2 = vs.get_variable(
                    "_time_kernel_t2",
                    shape=[self._num_units, self._num_units], dtype=dtype)
                self._time_bias2 = vs.get_variable(
                    "_time_bias2", shape=[self._num_units], dtype=dtype)
                self._o_kernel_t1 = vs.get_variable(
                    "_o_kernel_t1",
                    shape=[self._num_units, self._num_units], dtype=dtype)
                self._o_kernel_t2 = vs.get_variable(
                    "_o_kernel_t2",
                    shape=[self._num_units, self._num_units], dtype=dtype)
    time_now_input = tf.nn.tanh(
        time_now_score * self._time_input_w1 + self._time_input_bias1)
    time_last_input = tf.nn.tanh(
        time_last_score * self._time_input_w2 + self._time_input_bias2)
    time_now_state = (math_ops.matmul(inputs, self._time_kernel_w1) +
                      math_ops.matmul(time_now_input, self._time_kernel_t1) +
                      self._time_bias1)
    time_last_state = (math_ops.matmul(inputs, self._time_kernel_w2) +
                       math_ops.matmul(time_last_input, self._time_kernel_t2) +
                       self._time_bias2)
    if self._linear1 is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope,
                               initializer=self._initializer) as unit_scope:
            if self._num_unit_shards is not None:
                unit_scope.set_partitioner(
                    partitioned_variables.fixed_size_partitioner(
                        self._num_unit_shards))
            self._linear1 = _linear([inputs, m_prev], 4 * self._num_units,
                                    True)
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    lstm_matrix = self._linear1([inputs, m_prev])
    i, j, f, o = array_ops.split(
        value=lstm_matrix, num_or_size_splits=4, axis=1)
    o = (o + math_ops.matmul(time_now_input, self._o_kernel_t1) +
         math_ops.matmul(time_last_input, self._o_kernel_t2))
    # Diagonal connections.
    if self._use_peepholes and not self._w_f_diag:
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope,
                               initializer=self._initializer) as unit_scope:
            with vs.variable_scope(unit_scope):
                self._w_f_diag = vs.get_variable(
                    "w_f_diag", shape=[self._num_units], dtype=dtype)
                self._w_i_diag = vs.get_variable(
                    "w_i_diag", shape=[self._num_units], dtype=dtype)
                self._w_o_diag = vs.get_variable(
                    "w_o_diag", shape=[self._num_units], dtype=dtype)
    if self._use_peepholes:
        c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) *
             sigmoid(time_last_state) * c_prev +
             sigmoid(i + self._w_i_diag * c_prev) *
             sigmoid(time_now_state) * self._activation(j))
    else:
        c = (sigmoid(f + self._forget_bias) * sigmoid(time_last_state) *
             c_prev +
             sigmoid(i) * sigmoid(time_now_state) * self._activation(j))
    if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type
    if self._use_peepholes:
        m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
    else:
        m = sigmoid(o) * self._activation(c)
    if self._num_proj is not None:
        if self._linear2 is None:
            scope = vs.get_variable_scope()
            with vs.variable_scope(scope, initializer=self._initializer):
                with vs.variable_scope("projection") as proj_scope:
                    if self._num_proj_shards is not None:
                        proj_scope.set_partitioner(
                            partitioned_variables.fixed_size_partitioner(
                                self._num_proj_shards))
                    self._linear2 = _linear(m, self._num_proj, False)
        m = self._linear2(m)
        if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type
    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple
                 else array_ops.concat([c, m], 1))
    return m, new_state
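# A numpy sketch of the time-gated cell update in the non-peephole branch
# above: the previous cell is damped by a gate built from the time since the
# last event, and the new content by a gate built from the current time
# feature. All names here are illustrative.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def time_aware_cell(c_prev, i, f, j, t_last_state, t_now_state,
                    forget_bias=1.0):
    return (sigmoid(f + forget_bias) * sigmoid(t_last_state) * c_prev +
            sigmoid(i) * sigmoid(t_now_state) * np.tanh(j))

shape = (2, 4)
rng = np.random.default_rng(5)
args = [rng.normal(size=shape) for _ in range(6)]
print(time_aware_cell(*args).shape)  # (2, 4)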
def step(self, time, inputs, state, name=None):
    """Perform a decoding step.

    Args:
      time: scalar `int32` tensor.
      inputs: A (structure of) input tensors.
      state: A (structure of) state tensors and TensorArrays.
      name: Name scope for any created operations.

    Returns:
      `(outputs, next_state, next_inputs, finished)`.
    """
    with ops.name_scope(name, 'PGDecoderStep', (time, inputs, state)):
        cell_outputs, cell_state = self._cell(inputs, state)
        # The first cell state contains attention, which is the context.
        attention = cell_state[0].attention
        att_cell_state = cell_state[0].cell_state
        alignments = cell_state[0].alignments
        with tf.variable_scope('calculate_pgen'):
            p_gen = _linear([attention, inputs, att_cell_state], 1, True)
            p_gen = tf.sigmoid(p_gen)
        if self._output_layer is not None:
            cell_outputs = self._output_layer(cell_outputs)
        vocab_dist = tf.nn.softmax(cell_outputs) * p_gen
        alignments = alignments * (1 - p_gen)
        # Since we have OOV words, we need to expand the vocab dist.
        vocab_size = tf.shape(vocab_dist)[-1]
        extended_vsize = vocab_size + self.source_oov_words
        batch_size = tf.shape(vocab_dist)[0]
        extra_zeros = tf.zeros((batch_size, self.source_oov_words))
        # batch * extended vocab size
        vocab_dists_extended = tf.concat(axis=-1,
                                         values=[vocab_dist, extra_zeros])
        batch_nums = tf.range(0, limit=batch_size)  # shape (batch_size)
        batch_nums = tf.expand_dims(batch_nums, 1)  # shape (batch_size, 1)
        # Number of states we attend over.
        attn_len = tf.shape(self.source_extend_tokens)[1]
        batch_nums = tf.tile(batch_nums,
                             [1, attn_len])  # shape (batch_size, attn_len)
        indices = tf.stack((batch_nums, self.source_extend_tokens),
                           axis=2)  # shape (batch_size, enc_t, 2)
        shape = [batch_size, extended_vsize]
        attn_dists_projected = tf.scatter_nd(indices, alignments, shape)
        final_dists = attn_dists_projected + vocab_dists_extended
        # Note: sample_ids may contain OOV words.
        sample_ids = self._helper.sample(time=time, outputs=final_dists,
                                         state=cell_state)
        (finished, next_inputs, next_state) = self._helper.next_inputs(
            time=time, outputs=cell_outputs, state=cell_state,
            sample_ids=sample_ids)
        outputs = tf.contrib.seq2seq.BasicDecoderOutput(final_dists,
                                                        sample_ids)
    return (outputs, next_state, next_inputs, finished)
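# A numpy sketch of how the step above mixes the generator and copy
# distributions over an OOV-extended vocabulary: scale the vocab dist by
# p_gen, pad it with zeros for source OOVs, and scatter-add the attention
# mass at the source token ids. Names are illustrative.
import numpy as np

def final_dist(vocab_dist, attn, src_ids, p_gen, n_oov):
    """vocab_dist: [V], attn: [T], src_ids: [T] ids in [0, V + n_oov)."""
    extended = np.concatenate([p_gen * vocab_dist, np.zeros(n_oov)])
    np.add.at(extended, src_ids, (1.0 - p_gen) * attn)  # like tf.scatter_nd
    return extended

vocab_dist = np.array([0.5, 0.3, 0.2])
attn = np.array([0.6, 0.4])
src_ids = np.array([1, 3])  # id 3 is the first source-side OOV
print(final_dist(vocab_dist, attn, src_ids, p_gen=0.8, n_oov=1))
# sums to 1.0: generator mass 0.8 plus copied mass 0.2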
def createModel(input_data, input_size, sequence_length, slot_size,
                intent_size, layer_size=128, isTraining=True):
    cell_fw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    cell_bw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(
            cell_fw, input_keep_prob=0.5, output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(
            cell_bw, input_keep_prob=0.5, output_keep_prob=0.5)
    # Embedding layer, [word size, embed size] = 724, 64.
    if arg.embedding_path:
        embedding_weight = np.load(arg.embedding_path)
        embedding = tf.Variable(embedding_weight, name='embedding',
                                dtype=tf.float32)
    else:
        embedding = tf.get_variable('embedding', [input_size, layer_size])
    # [bs, nstep, embed size]
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    # state_outputs: [bs, nstep, embed size]; final_state: [4, bs, embed
    # size], covering cell state * 2 and hidden state * 2.
    state_outputs, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, inputs, sequence_length=sequence_length,
        dtype=tf.float32)
    # [bs, embed size * 4]
    final_state = tf.concat([final_state[0][0], final_state[0][1],
                             final_state[1][0], final_state[1][1]], 1)
    # [bs, nstep, embed size * 2]
    state_outputs = tf.concat([state_outputs[0], state_outputs[1]], 2)
    state_shape = state_outputs.get_shape()

    with tf.variable_scope('attention'):
        # [bs, nstep, embed size * 2]
        slot_inputs = state_outputs
        if not remove_slot_attn:
            with tf.variable_scope('slot_attn'):
                # embed size * 2
                attn_size = state_shape[2].value
                origin_shape = tf.shape(state_outputs)
                # [bs, 1, nstep, embed size * 2]
                hidden = tf.expand_dims(state_outputs, 1)
                # [bs, nstep, 1, embed size * 2]
                hidden_conv = tf.expand_dims(state_outputs, 2)
                # k: [filter_height, filter_width, in_channels, out_channels]
                k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
                # [bs, nstep, 1, embed size * 2]
                hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1],
                                               "SAME")
                # [bs, nstep, embed size * 2]
                hidden_features = tf.reshape(hidden_features, origin_shape)
                # [bs, 1, nstep, embed size * 2]
                hidden_features = tf.expand_dims(hidden_features, 1)
                v = tf.get_variable("AttnV", [attn_size])
                slot_inputs_shape = tf.shape(slot_inputs)
                # [bs * nstep, embed size * 2]
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                # [bs * nstep, embed size * 2]
                y = core_rnn_cell._linear(slot_inputs, attn_size, True)
                # [bs, nstep, embed size * 2]
                y = tf.reshape(y, slot_inputs_shape)
                # [bs, nstep, 1, embed size * 2]
                y = tf.expand_dims(y, 2)
                # [bs, nstep, nstep] = [bs, 1, nstep, hidden size]
                #                    + [bs, nstep, 1, embed size * 2]
                s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
                a = tf.nn.softmax(s)
                # a shape = [bs, nstep, nstep, 1]
                a = tf.expand_dims(a, -1)
                # slot_d shape = [bs, nstep, embed size * 2]
                slot_d = tf.reduce_sum(a * hidden, [2])
        else:
            attn_size = state_shape[2].value
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        intent_input = final_state
        with tf.variable_scope('intent_attn'):
            attn_size = state_shape[2].value
            # [bs, nstep, 1, embed size * 2]
            hidden = tf.expand_dims(state_outputs, 2)
            k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
            # [bs, nstep, 1, embed size * 2]
            hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = tf.get_variable("AttnV", [attn_size])
            # [bs, embed size * 4]
            y = core_rnn_cell._linear(intent_input, attn_size, True)
            # [bs, 1, 1, embed size * 4]
            y = tf.reshape(y, [-1, 1, 1, attn_size])
            # [bs, nstep]
            s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
            a = tf.nn.softmax(s)
            # [bs, nstep, 1]
            a = tf.expand_dims(a, -1)
            # [bs, nstep, 1, 1]
            a = tf.expand_dims(a, -1)
            # [bs, embed size * 2]
            d = tf.reduce_sum(a * hidden, [1, 2])
            if add_final_state_to_intent:
                # [bs, embed size * 2 + embed size * 4]
                intent_output = tf.concat([d, intent_input], 1)
            else:
                intent_output = d

        with tf.variable_scope('slot_gated'):
            # [bs, embed size * 2]
            intent_gate = core_rnn_cell._linear(intent_output, attn_size,
                                                True)
            # [bs, 1, embed size * 2]
            intent_gate = tf.reshape(
                intent_gate, [-1, 1, intent_gate.get_shape()[1].value])
            v1 = tf.get_variable("gateV", [attn_size])
            if not remove_slot_attn:
                # [bs, nstep, embed size * 2]
                slot_gate = v1 * tf.tanh(slot_d + intent_gate)
            else:
                # [bs, nstep, embed size * 2]
                slot_gate = v1 * tf.tanh(state_outputs + intent_gate)
            # [bs, nstep]
            slot_gate = tf.reduce_sum(slot_gate, [2])
            # [bs, nstep, 1]
            slot_gate = tf.expand_dims(slot_gate, -1)
            if not remove_slot_attn:
                # [bs, nstep, embed size * 2]
                slot_gate = slot_d * slot_gate
            else:
                slot_gate = state_outputs * slot_gate
            # [bs * nstep, embed size * 2]
            slot_gate = tf.reshape(slot_gate, [-1, attn_size])
            # [bs * nstep, embed size * 4]
            slot_output = tf.concat([slot_gate, slot_inputs], 1)

    with tf.variable_scope('intent_proj'):
        # [bs, intent_size]
        intent = core_rnn_cell._linear(intent_output, intent_size, True)
    with tf.variable_scope('slot_proj'):
        # [bs * nstep, slot_size]
        slot = core_rnn_cell._linear(slot_output, slot_size, True)
    if arg.use_crf:
        nstep = tf.shape(state_outputs)[1]
        slot = tf.reshape(slot, [-1, nstep, slot_size])
    outputs = [slot, intent]
    return outputs
def createModel(input_data, input_size, sequence_length, slots, slot_size,
                intent_size, layer_size=128, isTraining=True):
    cell_fw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    cell_bw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(
            cell_fw, input_keep_prob=0.5, output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(
            cell_bw, input_keep_prob=0.5, output_keep_prob=0.5)
    if arg.embedding_path:
        print("Loading embedding with numpy!")
        embedding_weight = np.load(arg.embedding_path)
        embedding = tf.Variable(embedding_weight, name='embedding',
                                dtype=tf.float32)
    else:
        embedding = tf.get_variable('embedding', [input_size, layer_size])
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    state_outputs, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, inputs, sequence_length=sequence_length,
        dtype=tf.float32)
    final_state = tf.concat([final_state[0][0], final_state[0][1],
                             final_state[1][0], final_state[1][1]], 1)
    state_outputs = tf.concat([state_outputs[0], state_outputs[1]], 2)
    state_shape = state_outputs.get_shape()

    with tf.variable_scope('attention'):
        slot_inputs = state_outputs
        if not remove_slot_attn:
            with tf.variable_scope('slot_attn'):
                attn_size = state_shape[2].value
                origin_shape = tf.shape(state_outputs)
                hidden = tf.expand_dims(state_outputs, 1)
                hidden_conv = tf.expand_dims(state_outputs, 2)
                k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
                hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1],
                                               "SAME")
                hidden_features = tf.reshape(hidden_features, origin_shape)
                hidden_features = tf.expand_dims(hidden_features, 1)
                v = tf.get_variable("AttnV", [attn_size])
                slot_inputs_shape = tf.shape(slot_inputs)
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                y = core_rnn_cell._linear(slot_inputs, attn_size, True)
                y = tf.reshape(y, slot_inputs_shape)
                y = tf.expand_dims(y, 2)
                s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
                a = tf.nn.softmax(s)
                a = tf.expand_dims(a, -1)
                slot_d = tf.reduce_sum(a * hidden, [2])
                slot_reinforce_state = tf.expand_dims(slot_d, 2)
        else:
            attn_size = state_shape[2].value
            slot_d = slot_inputs
            slot_reinforce_state = tf.expand_dims(slot_inputs, 2)
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        intent_input = final_state
        with tf.variable_scope('intent_attn'):
            attn_size = state_shape[2].value
            hidden = tf.expand_dims(state_outputs, 2)
            k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
            hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = tf.get_variable("AttnV", [attn_size])
            y = core_rnn_cell._linear(intent_input, attn_size, True)
            y = tf.reshape(y, [-1, 1, 1, attn_size])
            s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
            a = tf.nn.softmax(s)
            a = tf.expand_dims(a, -1)
            a = tf.expand_dims(a, -1)
            d = tf.reduce_sum(a * hidden, [1, 2])
            r_intent = d
            intent_context_states = d

        if arg.priority_order == 'intent_first':
            for n in range(arg.iteration_num):
                with tf.variable_scope('intent_subnet' + str(n - 1)):
                    attn_size = state_shape[2].value
                    hidden = tf.expand_dims(state_outputs, 2)
                    k1 = tf.get_variable("W1", [1, 1, attn_size, attn_size])
                    k2 = tf.get_variable('W2', [1, 1, attn_size, attn_size])
                    slot_reinforce_features = tf.nn.conv2d(
                        slot_reinforce_state, k1, [1, 1, 1, 1], "SAME")
                    hidden_features = tf.nn.conv2d(hidden, k2, [1, 1, 1, 1],
                                                   "SAME")
                    v1 = tf.get_variable("AttnV", [attn_size])
                    bias = tf.get_variable("Bias", [attn_size])
                    s = tf.reduce_sum(
                        v1 * tf.tanh(hidden_features +
                                     slot_reinforce_features + bias), [2, 3])
                    a = tf.nn.softmax(s)
                    a = tf.expand_dims(a, -1)
                    a = tf.expand_dims(a, -1)
                    r = tf.reduce_sum(a * slot_reinforce_state, [1, 2])
                    r_intent = r + intent_context_states
                    intent_output = tf.concat([r_intent, intent_input], 1)
                with tf.variable_scope('slot_subnet' + str(n - 1)):
                    intent_gate = core_rnn_cell._linear(r_intent, attn_size,
                                                        True)
                    intent_gate = tf.reshape(
                        intent_gate,
                        [-1, 1, intent_gate.get_shape()[1].value])
                    v1 = tf.get_variable("gateV", [attn_size])
                    relation_factor = v1 * tf.tanh(slot_d + intent_gate)
                    relation_factor = tf.reduce_sum(relation_factor, [2])
                    relation_factor = tf.expand_dims(relation_factor, -1)
                    slot_reinforce_state1 = slot_d * relation_factor
                    slot_reinforce_state = tf.expand_dims(
                        slot_reinforce_state1, 2)
                    slot_reinforce_vector = tf.reshape(slot_reinforce_state1,
                                                       [-1, attn_size])
                    slot_output = tf.concat(
                        [slot_reinforce_vector, slot_inputs], 1)
        else:
            for n in range(arg.iteration_num):
                with tf.variable_scope('slot_subnet' + str(n - 1)):
                    intent_gate = core_rnn_cell._linear(r_intent, attn_size,
                                                        True)
                    intent_gate = tf.reshape(
                        intent_gate,
                        [-1, 1, intent_gate.get_shape()[1].value])
                    v1 = tf.get_variable("gateV", [attn_size])
                    relation_factor = v1 * tf.tanh(slot_d + intent_gate)
                    relation_factor = tf.reduce_sum(relation_factor, [2])
                    relation_factor = tf.expand_dims(relation_factor, -1)
                    slot_reinforce_state = slot_d * relation_factor
                    slot_reinforce_vector = tf.reshape(slot_reinforce_state,
                                                       [-1, attn_size])
                    slot_output = tf.concat(
                        [slot_reinforce_vector, slot_inputs], 1)
                with tf.variable_scope('intent_subnet' + str(n - 1)):
                    attn_size = state_shape[2].value
                    hidden = tf.expand_dims(state_outputs, 2)
                    slot_reinforce_output = tf.expand_dims(
                        slot_reinforce_state, 2)
                    k1 = tf.get_variable("W1", [1, 1, attn_size, attn_size])
                    k2 = tf.get_variable('W2', [1, 1, attn_size, attn_size])
                    slot_features = tf.nn.conv2d(slot_reinforce_output, k1,
                                                 [1, 1, 1, 1], "SAME")
                    hidden_features = tf.nn.conv2d(hidden, k2, [1, 1, 1, 1],
                                                   "SAME")
                    v1 = tf.get_variable("AttnV", [attn_size])
                    bias = tf.get_variable("Bias", [attn_size])
                    s = tf.reduce_sum(
                        v1 * tf.tanh(hidden_features + slot_features + bias),
                        [2, 3])
                    a = tf.nn.softmax(s)
                    a = tf.expand_dims(a, -1)
                    a = tf.expand_dims(a, -1)
                    r = tf.reduce_sum(a * slot_reinforce_output, [1, 2])
                    r_intent = r + intent_context_states
                    intent_output = tf.concat([r_intent, intent_input], 1)

    with tf.variable_scope('intent_proj'):
        intent = core_rnn_cell._linear(intent_output, intent_size, True)
    with tf.variable_scope('slot_proj'):
        slot = core_rnn_cell._linear(slot_output, slot_size, True)
    if arg.use_crf:
        nstep = tf.shape(state_outputs)[1]
        slot = tf.reshape(slot, [-1, nstep, slot_size])
    outputs = [slot, intent]
    return outputs
def createModel(input_data, input_size, sequence_length, slot_size,
                intent_size, remove_slot_attn, add_final_state_to_intent,
                use_crf, layer_size=128, isTraining=True,
                embedding_path=None, use_batch_crossent=True):
    cell_fw = tf.nn.rnn_cell.LSTMCell(layer_size)
    cell_bw = tf.nn.rnn_cell.LSTMCell(layer_size)
    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(
            cell_fw, input_keep_prob=0.5, output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(
            cell_bw, input_keep_prob=0.5, output_keep_prob=0.5)
    # Embedding layer, [vocab size, embed size].
    if embedding_path:
        embedding_weight = np.load(embedding_path)
        embedding = tf.Variable(embedding_weight, name='embedding',
                                dtype=tf.float32)
    else:
        embedding = tf.get_variable('embedding', [input_size, layer_size])
    # input_data: [batch, input_sequence_length]
    # inputs: [batch, input_sequence_length, embedding_size]
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    # output_fw / output_bw: [batch, input_sequence_length, num_units],
    # carrying the hidden states; state_fw = (cell_state_fw,
    # hidden_state_fw), each [batch, num_units].
    (output_fw, output_bw), (state_fw, state_bw) = \
        tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw, cell_bw=cell_bw, inputs=inputs,
            sequence_length=sequence_length, dtype=tf.float32)
    (cell_state_fw, hidden_state_fw) = state_fw
    (cell_state_bw, hidden_state_bw) = state_bw
    # [batch, hidden_size * 4]
    final_state = tf.concat(
        [cell_state_fw, hidden_state_fw, cell_state_bw, hidden_state_bw],
        axis=1)
    # sequence_outputs: [batch, input_sequence_length, hidden_size * 2]
    sequence_outputs = tf.concat([output_fw, output_bw], axis=2)
    # Note: tensor.get_shape() returns a TensorShape tuple, not a tensor.
    sequence_output_shape = sequence_outputs.get_shape()

    # The sequence output feeds the attention that computes context_i,
    # i.e. C_i^S in the paper.
    with tf.variable_scope('attention'):
        slot_inputs = sequence_outputs
        if not remove_slot_attn:  # slot attention enabled
            with tf.variable_scope('slot_attn'):
                # e_{i,k} = V^T tanh(W_he h_k + W_ie h_i)
                # alpha_{i,j} = softmax(e_{i,j}); c_i = sum_j alpha_{i,j} h_j
                # y_i = softmax(W_hy (h_i + c_i))
                # W_he h_k is implemented with a convolution, and W_ie h_i
                # with a linear map.
                # attn_size = hidden_size * 2
                attn_size = sequence_output_shape[2].value
                # [batch, input_sequence_length, 1, hidden_size * 2]
                hidden_input_conv = tf.expand_dims(sequence_outputs, axis=2)
                # W_he: [1, 1, hidden*2, hidden*2]; note the 1x1 kernel, so
                # the convolution only mixes channels, which is equivalent
                # to W_he h_k (why a convolution is used here is unclear).
                W_he = tf.get_variable("slot_AttnW",
                                       shape=[1, 1, attn_size, attn_size])
                # [batch, input_sequence_length, 1, hidden_size * 2]
                hidden_features = tf.nn.conv2d(input=hidden_input_conv,
                                               filter=W_he,
                                               strides=[1, 1, 1, 1],
                                               padding="SAME")
                # [batch, 1, input_sequence_length, hidden_size * 2]
                hidden_features = tf.transpose(hidden_features,
                                               perm=[0, 2, 1, 3])
                # slot_inputs: [batch * input_sequence_length,
                # hidden_size * 2]
                slot_inputs = tf.reshape(sequence_outputs, [-1, attn_size])
                # y = W_ie h_i; W_ie is not declared explicitly, it is
                # created inside the dense layer. (The original version
                # computed y with core_rnn_cell._linear on the flattened
                # inputs and reshaped back; tf.layers.dense is equivalent
                # and more direct.)
                # y: [batch, input_sequence_length, hidden_size * 2]
                y = tf.layers.dense(inputs=sequence_outputs, units=attn_size,
                                    activation=None, use_bias=True)
                # [batch, input_sequence_length, 1, hidden_size * 2]
                y = tf.expand_dims(y, 2)
                # Note: in seq2seq attention e_{i,k} = g(s_{i-1}, h_k)
                # combines encoder and decoder hiddens; here both h_k and
                # h_i come from the encoder, so this resembles how the
                # query is formed in transformer-style query-key-value
                # attention.
                # Size-1 dimensions broadcast automatically:
                # [batch, input_seq_len, input_seq_len, hidden_size * 2]
                bahdanau_activate = tf.tanh(hidden_features + y)
                # V: [attn_size = hidden_size * 2]
                V = tf.get_variable("slot_AttnV", [attn_size])
                # Elementwise product of a rank-1 tensor with a rank-4
                # tensor, not a matmul.
                v_bahdanau = V * bahdanau_activate
                # Together with the previous step this is
                # e(i,k) = v^T tanh(W1 h_k + W2 h_i); the (n x 1)^T (n x 1)
                # product maps each vector to a score.
                # logit_i_k: [batch, input_seq_len, input_seq_len]
                logit_i_k = tf.reduce_sum(v_bahdanau, axis=[3])
                # alpha_{i,j} = softmax(e_{i,j});
                # c_i = sum_j alpha_{i,j} h_j
                # score_i_k: [batch, input_seq_len, input_seq_len]
                score_i_k = tf.nn.softmax(logit_i_k, axis=-1)
                # [batch, input_seq_len (i), input_seq_len (k), 1]
                score_i_k = tf.expand_dims(score_i_k, axis=-1)
                # [batch, 1, input_sequence_length, hidden_size * 2]
                hidden = tf.expand_dims(sequence_outputs, axis=1)
                # slot_context_hidden is C_i^S in the original paper:
                # [batch, input_sequence_length, hidden_size * 2]
                slot_context_hidden = tf.reduce_sum(score_i_k * hidden,
                                                    axis=[2])
        else:
            # No attention: feed the sequence output directly into slot
            # prediction.
            attn_size = sequence_output_shape[2].value
            # [batch * input_sequence_length, hidden_size * 2]
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        # =============== intent attention ===============
        # Compute c_I; intent attention is driven by the final hidden state.
        # intent_input: [batch, hidden_size * 4]
        intent_input = final_state
        with tf.variable_scope('intent_attn'):
            # attn_size: hidden_size * 2
            attn_size = sequence_output_shape[2].value
            # hidden: [batch, input_sequence_length, 1, hidden_size * 2]
            hidden = tf.expand_dims(sequence_outputs, 2)
            # Same attention form as in slot_attn, but a different
            # variable_scope, so these are not the same variables!
            # hidden_features = W_he h_k, again via a 1x1 convolution.
            W_he = tf.get_variable("intent_AttnW",
                                   shape=[1, 1, attn_size, attn_size])
            # [batch, input_sequence_length, 1, hidden_size * 2]
            hidden_features = tf.nn.conv2d(input=hidden, filter=W_he,
                                           strides=[1, 1, 1, 1],
                                           padding="SAME")
            # y = W_ie h_i via _linear (W_ie created inside _linear).
            # intent_input: [batch, hidden_size * 4];
            # y: [batch, attn_size = hidden_size * 2]
            y = core_rnn_cell._linear(intent_input, output_size=attn_size,
                                      bias=True)
            # [batch, 1, 1, hidden_size * 2]
            y = tf.reshape(y, shape=[-1, 1, 1, attn_size])
            # e_{i,k} = V^T tanh(W_he h_k + W_ie h_i)
            # bahdanau_activate: [batch, input_sequence_length, 1,
            # hidden_size * 2]
            V = tf.get_variable("intent_AttnV", shape=[attn_size])
            bahdanau_activate = V * tf.tanh(hidden_features + y)
            # logit_i_k: [batch, input_sequence_length]
            logit_i_k = tf.reduce_sum(bahdanau_activate, axis=[2, 3])
            # alpha_{i,j} = softmax(e_{i,j}); c_i = sum_j alpha_{i,j} h_j
            # [batch, input_sequence_length]
            score_i_k = tf.nn.softmax(logit_i_k)
            # [batch, input_sequence_length, 1]
            score_i_k = tf.expand_dims(score_i_k, axis=-1)
            # [batch, input_sequence_length, 1, 1]
            score_i_k = tf.expand_dims(score_i_k, axis=-1)
            # c_intent is a weighted average of hidden over time:
            # intent_context_hidden: [batch, hidden_size * 2]
            intent_context_hidden = tf.reduce_sum(score_i_k * hidden,
                                                  axis=[1, 2])
            if add_final_state_to_intent:
                # c_I = c_i + h_T, where T is the final encoder step:
                # [batch, hidden_size * 2 + hidden_size * 4]
                intent_output = tf.concat(
                    [intent_context_hidden, intent_input], 1)
            else:
                # c_I = c_i: [batch, hidden_size * 2]
                intent_output = intent_context_hidden

        # Compute slot_gate = v * tanh(c_i^S + W * c_I).
        with tf.variable_scope('slot_gated'):
            # W * c_I: [batch, hidden_size * 2]
            intent_gate = core_rnn_cell._linear(intent_output,
                                                output_size=attn_size,
                                                bias=True)
            embed_size = intent_gate.get_shape()[1].value
            # [batch, 1, hidden_size * 2]
            intent_gate = tf.reshape(intent_gate, [-1, 1, embed_size])
            # V_gate: [hidden_size * 2]
            V_gate = tf.get_variable("gateV", [attn_size])
            if not remove_slot_attn:
                # With slot attention; Eq. (6) in the paper:
                # g = sum(v * tanh(c_i + W c^I)).
                # slot_gate: [batch, input_sequence_length, hidden_size * 2]
                slot_gate = V_gate * tf.tanh(slot_context_hidden +
                                             intent_gate)
            else:
                # Without slot attention, use the raw hidden input;
                # Eq. (8): g = sum(v * tanh(h_i + W c^I)).
                slot_gate = V_gate * tf.tanh(sequence_outputs + intent_gate)
            # slot_gate: [batch, input_sequence_length, 1]
            slot_gate = tf.reduce_sum(slot_gate, axis=[2], keep_dims=True)
            if not remove_slot_attn:
                # The c_i^S * slot_gate part of Eq. (7):
                # y_i^slot = softmax(W_hy (h_i + c_i^S * slot_gate)).
                context_slot_gate = slot_context_hidden * slot_gate
            else:
                # The h_i * slot_gate part of Eq. (9):
                # y_i^slot = softmax(W_hy (h_i + h_i * slot_gate)).
                context_slot_gate = sequence_outputs * slot_gate
            # [batch * input_sequence_length, hidden_size * 2]
            context_slot_gate = tf.reshape(context_slot_gate,
                                           [-1, attn_size])
            # Note: unlike the paper, which adds h_i and
            # c_i^S * slot_gate, this implementation concatenates them:
            # slot_output: [batch * input_sequence_length, hidden_size * 4]
            slot_output = tf.concat([context_slot_gate, slot_inputs], axis=1)

    with tf.variable_scope('slot_proj'):
        # The matrix inside _linear is W^S_hy in the paper.
        # slot_logits: [batch * input_sequence_length, slot_size]
        slot_logits = core_rnn_cell._linear(slot_output,
                                            output_size=slot_size, bias=True)
        if use_crf or use_batch_crossent:
            nstep = tf.shape(sequence_outputs)[1]
            # [batch, input_sequence_length, slot_size]
            slot_logits = tf.reshape(slot_logits, [-1, nstep, slot_size])

    # y^intent = softmax(W_hy (c_I + h_T)).
    with tf.variable_scope('intent_proj'):
        # The matrix inside _linear is W^I_hy in the paper.
        # intent_logits: [batch, intent_size]
        intent_logits = core_rnn_cell._linear(intent_output,
                                              output_size=intent_size,
                                              bias=True)
    return [slot_logits, intent_logits]