def encode(self, inputs, sequence_length):
    with tf.variable_scope(self.shared_scope or "RNNEncoder") as scope:
        # TODO: flatten the tensor with rank >= 4 to rank 3 tensor.
        sequence_length, _ = flatten(sequence_length, 1)
        inputs, prev_shape = flatten(inputs, 3)  # [*, max_sequence_length, hidden_size]
        output_shape = prev_shape[:-1] + [self.output_size]
        state_shape = prev_shape[:-2] + [self.output_size]
        outputs, state = tf.nn.bidirectional_dynamic_rnn(
            self.cell_fw, self.cell_bw, inputs,
            sequence_length=sequence_length, dtype=tf.float32, scope=scope)
        with tf.variable_scope("outputs"):
            outputs = tf.concat(outputs, -1)
            outputs = linear(outputs, self.output_size)
            outputs = tf.nn.dropout(outputs, self.keep_prob)
        with tf.variable_scope("state"):
            state = merge_state(state)
            state = linear(state, self.output_size)
        outputs = tf.reshape(outputs, output_shape)
        state = tf.reshape(state, state_shape)
    return outputs, state
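# Hedged sketch of the flatten() helper used above (a hypothetical
# reconstruction, not the codebase's definition): collapse every leading
# dimension so the result has rank `rank`, and also return the original
# dynamic shape so the caller can tf.reshape back to it afterwards.
def flatten(tensor, rank):
    shape = tf.unstack(tf.shape(tensor))    # dynamic shape as a Python list
    kept = shape[len(shape) - rank + 1:]    # the last (rank - 1) dims survive
    flat = tf.reshape(tensor, [-1] + kept)  # merge all leading dims into one
    return flat, shape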
def attention(decoder_state, response_last_hidden):
    # Note: v, encoder_features, encoder_states, attention_vec_size and
    # attn_size are closed over from the enclosing scope.
    with tf.variable_scope("Attention"):
        with tf.variable_scope("decoder_features"):
            decoder_features = tf_utils.linear(
                decoder_state, attention_vec_size, True)  # shape (batch_size, attention_vec_size)
            decoder_features = tf.expand_dims(
                tf.expand_dims(decoder_features, 1), 1
            )  # reshape to (batch_size, 1, 1, attention_vec_size)
        with tf.variable_scope("response_features"):
            response_features = tf_utils.linear(
                response_last_hidden, attention_vec_size, True)
            response_features = tf.expand_dims(
                tf.expand_dims(response_features, 1), 1)

        def masked_attention(e):
            attn_dist = tf.nn.softmax(e)  # take softmax. shape (batch_size, attn_length)
            masked_sums = tf.reduce_sum(attn_dist, axis=1)  # shape (batch_size)
            return attn_dist / tf.reshape(masked_sums, [-1, 1])  # re-normalize

        e = tf.reduce_sum(
            v * tf.tanh(encoder_features + decoder_features + response_features),
            [2, 3])  # calculate e
        attn_dist = masked_attention(e)
        context_vector = tf.reduce_sum(
            tf.reshape(attn_dist, [-1, self.config.max_utter_len, 1, 1]) * encoder_states,
            [1, 2])  # shape (batch_size, attn_size).
        context_vector = tf.reshape(context_vector, [-1, attn_size])
        return context_vector, attn_dist
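# Hedged note: masked_attention() above re-normalizes the softmax output but
# never applies a padding mask, so padded encoder positions still receive
# probability mass. A minimal sketch of the usual masked variant, assuming a
# hypothetical enc_padding_mask tensor of shape (batch_size, attn_length) with
# 1.0 at real tokens and 0.0 at padding (not defined in the snippet above):
def masked_attention(e, enc_padding_mask):
    attn_dist = tf.nn.softmax(e)                          # (batch_size, attn_length)
    attn_dist *= enc_padding_mask                         # zero out padded positions
    masked_sums = tf.reduce_sum(attn_dist, axis=1)        # (batch_size,)
    return attn_dist / tf.reshape(masked_sums, [-1, 1])   # re-normalize to sum to 1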
def encode(self, input_embeddings, sequence_length):
    with tf.variable_scope(self.shared_scope or "SentenceEncoder") as scope:
        if self.cell_bw is not None:
            outputs, state = tf.nn.bidirectional_dynamic_rnn(
                self.cell_fw, self.cell_bw, input_embeddings,
                sequence_length=sequence_length, dtype=tf.float32, scope=scope)
            with tf.variable_scope("outputs"):
                outputs = tf.concat(outputs, 2)
                outputs = linear(outputs, self.rnn_size)
                outputs = tf.nn.dropout(outputs, self.keep_prob)
            with tf.variable_scope("state"):
                state = merge_state(state)
                state = linear(state, self.rnn_size)
        else:
            outputs, state = tf.nn.dynamic_rnn(
                self.cell_fw, input_embeddings,
                sequence_length=sequence_length, dtype=tf.float32, scope=scope)
    return outputs, state
def _get_distribution(state, output_size):
    h = state
    num_layers = 1
    for i in range(num_layers):
        with tf.variable_scope('linear%d' % i) as scope:
            h = linear(h, output_size, scope=scope)
    with tf.variable_scope('Mean'):
        mean = linear(h, output_size, activation=None)
    with tf.variable_scope('Var'):
        var = linear(h, output_size, activation=tf.nn.softplus)
    return tfd.MultivariateNormalDiag(mean, var)
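# Minimal usage sketch (assumptions: tfd aliases tensorflow_probability's
# `distributions` module, and the softplus output above is consumed as
# MultivariateNormalDiag's scale_diag, i.e. a per-dimension standard deviation
# rather than a variance, despite the 'Var' scope name).
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

dist = tfd.MultivariateNormalDiag(loc=tf.zeros([8, 16]),
                                  scale_diag=tf.ones([8, 16]))
z = dist.sample()         # (8, 16) sample, reparameterized and differentiable
log_p = dist.log_prob(z)  # (8,) log-density, usable e.g. in an ELBO / KL term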
def encode(self, input_embeddings, sequence_length):
    with tf.variable_scope(self.shared_scope or "MultiEncoderWrapper") as scope:
        outputs, state = zip(*[
            e.encode(input_embeddings, sequence_length) for e in self.encoders
        ])
        with tf.variable_scope('outputs'):
            outputs = tf.concat(outputs, 2)
            outputs = linear(outputs, self.rnn_size)
        with tf.variable_scope('state'):
            state = merge_state(state, self.rnn_size)
    return outputs, state
def merge_state(state, rnn_size, activation=tf.nn.tanh):
    """
    This function assumes that the state is an output from
    'tf.nn.bidirectional_dynamic_rnn', i.e. state = (fw_state, bw_state).
    The state can also be a nested tuple such as
    state = ((fw_state_0, fw_state_1, ...), (bw_state_0, bw_state_1, ...))
    if our RNN has multiple layers.
    """
    if not isinstance(state, tuple):
        raise ValueError('state must be a (fw_state, bw_state) tuple.')
    if isinstance(state[0], LSTMStateTuple):
        raise NotImplementedError

    # In linear(), the forward and backward states (each of shape
    # [batch_size, rnn_size]) are concatenated and projected back to
    # [batch_size, rnn_size] so the encoder's and the decoder's state sizes match.
    if type(state[0]) == tuple:  # num_layers >= 2
        new_state = []
        for fs, bs in zip(*state):
            ns = tf.concat([fs, bs], axis=-1)
            if rnn_size is not None:
                ns = linear(ns, rnn_size, activation=activation)
            new_state.append(ns)
        new_state = tuple(new_state)
    else:
        new_state = tf.concat(state, 1)
        new_state = linear(new_state, rnn_size, activation=activation)
    return new_state
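# Minimal usage sketch for merge_state (assumes GRU cells, since LSTM states
# raise NotImplementedError above, and that linear() from this module is in
# scope; the placeholders below are illustrative only).
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, None, 300])  # [batch, time, embed]
lengths = tf.placeholder(tf.int32, [None])               # [batch]
cell_fw = tf.nn.rnn_cell.GRUCell(128)
cell_bw = tf.nn.rnn_cell.GRUCell(128)
outputs, state = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, inputs, sequence_length=lengths, dtype=tf.float32)
# state == (fw_state, bw_state), each [batch, 128]; merge_state concatenates
# them to [batch, 256] and projects back to [batch, 128], so a decoder can be
# initialized with a state of the same size as a unidirectional encoder's.
initial_decoder_state = merge_state(state, rnn_size=128)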
def build_model(self):
    # Build the Computation Graph
    inputs = tf.nn.embedding_lookup(
        self.data.embed, self.input_x)  # [batch_size, sent_len, emd_size]
    avg_pooling = tf_utils.AvgPooling(inputs, self.input_x_len, self.seq_len)
    logits = tf_utils.linear(avg_pooling, self.num_class, bias=True, scope='softmax')

    # Obtain the Predict, Loss, Train_op
    predict_prob = tf.nn.softmax(logits, name='predict_prob')
    predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)

    with tf.name_scope("loss"):
        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=self.input_y)
        loss = tf.reduce_mean(loss)
        l2_loss = tf.add_n([
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if v.get_shape().ndims > 1
        ])
        reg_loss = loss + self.config.lambda_l2 * l2_loss

    # Build the train op
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    # optimizer = tf.train.AdagradOptimizer(self.learning_rate)
    if self.config.clipper:
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                          self.config.clipper)
        train_step = optimizer.apply_gradients(list(zip(grads, tvars)))
    else:
        train_step = optimizer.minimize(loss, global_step=self.global_step_tensor)

    self.predict_prob = predict_prob
    self.predict_label = predict_label
    self.logits = logits
    self.loss = loss
    self.reg_loss = reg_loss
    self.train_step = train_step
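# Hedged sketch (an assumption about intent, not the original behavior): both
# branches above differentiate `loss`, so the L2 term in `reg_loss` is computed
# but never optimized, and apply_gradients() is not given the global step. If
# regularization and step counting are both intended, the clipping branch
# inside build_model could read as follows (same local names as above):
if self.config.clipper:
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(reg_loss, tvars),
                                      self.config.clipper)
    train_step = optimizer.apply_gradients(
        list(zip(grads, tvars)), global_step=self.global_step_tensor)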
def build_model(self):
    # Build the Computation Graph
    self.layers = self.config.layers
    self.lstm_size = self.config.lstm_size
    inputs = tf.nn.embedding_lookup(
        self.data.embed, self.input_x)  # [batch_size, sent_len, emd_size]

    def BiLSTM(input_x, input_x_len, hidden_size, num_layers=1,
               dropout_keep_rate=None, return_sequence=True):

        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(hidden_size)

        def gru_cell():
            return tf.contrib.rnn.GRUCell(hidden_size)

        cell_fw = lstm_cell()
        cell_bw = lstm_cell()
        if num_layers > 1:
            cell_fw = tf.contrib.rnn.MultiRNNCell(
                [lstm_cell() for _ in range(num_layers)])
            cell_bw = tf.contrib.rnn.MultiRNNCell(
                [lstm_cell() for _ in range(num_layers)])
        if dropout_keep_rate is not None:
            cell_fw = tf.contrib.rnn.DropoutWrapper(
                cell_fw, output_keep_prob=dropout_keep_rate)
            cell_bw = tf.contrib.rnn.DropoutWrapper(
                cell_bw, output_keep_prob=dropout_keep_rate)

        b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, input_x,
            sequence_length=input_x_len, dtype=tf.float32)

        if return_sequence:
            outputs = tf.concat(b_outputs, axis=2)
        else:
            # states: [c, h]
            outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
        return outputs

    with tf.variable_scope("bilstm") as s:
        lstm_x = BiLSTM(inputs, self.input_x_len, self.lstm_size,
                        num_layers=self.layers,
                        dropout_keep_rate=self.drop_keep_rate,
                        return_sequence=True)

    avg_pooling = tf_utils.AvgPooling(inputs, self.input_x_len, self.seq_len)
    max_pooling = tf_utils.MaxPooling(lstm_x, self.input_x_len)
    logits = tf_utils.linear([max_pooling, avg_pooling], self.num_class,
                             bias=True, scope='softmax')

    # Obtain the Predict, Loss, Train_op
    predict_prob = tf.nn.softmax(logits, name='predict_prob')
    predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)

    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                   labels=self.input_y)
    loss = tf.reduce_mean(loss)
    l2_loss = tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if v.get_shape().ndims > 1
    ])
    reg_loss = loss + self.config.lambda_l2 * l2_loss

    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    if self.config.clipper:
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                          self.config.clipper)
        train_step = optimizer.apply_gradients(list(zip(grads, tvars)))
    else:
        train_step = optimizer.minimize(loss, global_step=self.global_step_tensor)

    self.predict_prob = predict_prob
    self.predict_label = predict_label
    self.loss = loss
    self.reg_loss = reg_loss
    self.train_step = train_step
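# Hedged note on the return_sequence=False branch of BiLSTM above: with a
# single layer, b_states[i] is an LSTMStateTuple, so b_states[i][1] is h.
# With MultiRNNCell (num_layers > 1), b_states[i] is a tuple of per-layer
# LSTMStateTuples, and b_states[i][1] selects layer 1's (c, h) pair instead.
# A sketch of a layer-aware variant of that branch, using the same local
# names (an assumption about intent, not the original behavior):
fw_state, bw_state = b_states
if num_layers > 1:
    fw_h, bw_h = fw_state[-1].h, bw_state[-1].h  # top layer's hidden states
else:
    fw_h, bw_h = fw_state.h, bw_state.h
outputs = tf.concat([fw_h, bw_h], axis=-1)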
def build_model(self):
    # Build the Computation Graph
    self.filter_sizes = self.config.filter_sizes
    self.num_filters = self.config.num_filters
    self.initializer = tf.random_normal_initializer(stddev=0.1)

    inputs = tf.nn.embedding_lookup(self.data.embed, self.input_x)
    inputs_ = tf.expand_dims(inputs, -1)

    pooled_outputs = []
    for i, filter_size in enumerate(self.filter_sizes):
        with tf.name_scope("convolution-pooling-%s" % filter_size):
            filter = tf.get_variable(
                "filter-%s" % filter_size,
                [filter_size, self.embed_size, 1, self.num_filters],
                initializer=self.initializer)
            conv = tf.nn.conv2d(inputs_, filter, strides=[1, 1, 1, 1],
                                padding="VALID", name="conv")
            b = tf.get_variable("b-%s" % filter_size, [self.num_filters])
            h = tf.nn.relu(tf.nn.bias_add(conv, b), "relu")
            pooled = tf.nn.max_pool(
                h, ksize=[1, self.seq_len - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1], padding='VALID', name="pool")
            pooled_outputs.append(pooled)

    h_pool = tf.concat(pooled_outputs, -1)
    num_filters_total = self.num_filters * len(self.filter_sizes)
    outputs = tf.reshape(h_pool, [-1, num_filters_total])
    if self.drop_keep_rate is not None:
        outputs = tf.nn.dropout(outputs, keep_prob=self.drop_keep_rate)

    logits = tf_utils.linear(outputs, self.num_class, bias=True, scope='softmax')

    # Obtain the Predict, Loss, Train_op
    predict_prob = tf.nn.softmax(logits, name='predict_prob')
    predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)

    with tf.name_scope("loss"):
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=self.input_y))
        l2_loss = tf.add_n([
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if v.get_shape().ndims > 1
        ])
        reg_loss = loss + self.config.lambda_l2 * l2_loss

    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    if self.config.clipper:
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                          self.config.clipper)
        train_step = optimizer.apply_gradients(list(zip(grads, tvars)))
    else:
        train_step = optimizer.minimize(loss, global_step=self.global_step_tensor)

    self.predict_prob = predict_prob
    self.predict_label = predict_label
    self.logits = logits
    self.loss = loss
    self.reg_loss = reg_loss
    self.train_step = train_step
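# Shape walkthrough for one branch of the convolution-pooling loop above, with
# assumed example values (seq_len=50, embed_size=300, filter_size=3,
# num_filters=100), which are illustrative and not taken from the config:
#   inputs_ : [batch, 50, 300, 1]
#   conv    : [batch, 50 - 3 + 1, 1, 100]   (VALID padding in both spatial dims)
#   pooled  : [batch, 1, 1, 100]            (max_pool ksize spans all 48 rows)
# Concatenating the pooled maps over all filter sizes and reshaping yields
#   outputs : [batch, num_filters * len(filter_sizes)]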