def attention_layer(self, x, y, hidden_size, bias, name, is_train, cache=None):
    """Multi-head scaled dot-product attention between x (query) and y (key/value)."""
    # Query / Key / Value projections
    q = dense_layer(x, hidden_size // 4, name='q')
    k = dense_layer(y, hidden_size // 4, name='k')
    v = dense_layer(y, hidden_size, name='v')

    if cache is not None:
        # Combine cached keys and values with new keys and values.
        k = tf.concat([cache["k"], k], axis=1)
        v = tf.concat([cache["v"], v], axis=1)

        # Update cache
        cache["k"] = k
        cache["v"] = v

    # Split heads (for multi-head attention)
    q = self.split_heads(q, hidden_size // 4)
    k = self.split_heads(k, hidden_size // 4)
    v = self.split_heads(v, hidden_size)

    # Scale q to prevent the dot product
    # between q and k from growing too large.
    depth = (hidden_size // self.num_heads)
    q *= depth**-0.5

    # Calculate dot-product attention
    logits = tf.matmul(q, k, transpose_b=True)
    logits += bias
    w = tf.nn.softmax(logits, name="attention_weights")

    if is_train:
        w = tf.nn.dropout(w, self.dropout_rate)

    attention_output = tf.matmul(w, v)

    # Recombine heads --> [batch_size, length, hidden_size]
    attention_output = self.combine_heads(attention_output, hidden_size)

    # Run the combined outputs through another linear projection layer.
    attention_output = dense_layer(attention_output,
                                   hidden_size,
                                   name='att_out')

    return attention_output, w

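# attention_layer above relies on split_heads / combine_heads helpers that are
# not part of this excerpt. The sketch below shows what they are assumed to do:
# reshape [batch, length, size] into [batch, num_heads, length, size / num_heads]
# and back. Only the helper names and self.num_heads come from the calls above;
# the function names here are illustrative, not the repo's implementation.
import tensorflow as tf  # assumes TensorFlow 1.x


def split_heads_sketch(x, size, num_heads):
    """Reshape [B, L, size] -> [B, num_heads, L, size // num_heads]."""
    batch_size = tf.shape(x)[0]
    length = tf.shape(x)[1]
    depth = size // num_heads
    x = tf.reshape(x, [batch_size, length, num_heads, depth])

    return tf.transpose(x, [0, 2, 1, 3])


def combine_heads_sketch(x, size):
    """Reshape [B, num_heads, L, depth] -> [B, L, size]."""
    batch_size = tf.shape(x)[0]
    length = tf.shape(x)[2]
    x = tf.transpose(x, [0, 2, 1, 3])

    return tf.reshape(x, [batch_size, length, size])
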
def get_logits(self, image, is_train, **kwargs):
    widths = tf.ones(tf.shape(image)[0],
                     dtype=tf.int32) * tf.shape(image)[2]
    features, sequence_length = self._convnet_layers(image, widths, is_train)
    features = rnn_layers(features,
                          sequence_length,
                          self.rnn_size,
                          use_projection=True)
    logits = dense_layer(features, len(self.out_charset) + 1, name='logits')

    return logits, sequence_length

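# How the pair returned by this get_logits might be consumed: the extra "+ 1"
# output class together with the per-example sequence_length suggests a
# CTC-style head. This is a hedged sketch, not code from the repo; it assumes
# logits come back batch-major as [batch, time, num_classes + 1] and that
# `labels` is a tf.SparseTensor of character indices.
import tensorflow as tf  # assumes TensorFlow 1.x


def ctc_loss_sketch(logits, sequence_length, labels):
    """Mean CTC loss over a batch; the last class index acts as the blank."""
    losses = tf.nn.ctc_loss(labels=labels,
                            inputs=logits,
                            sequence_length=sequence_length,
                            time_major=False)

    return tf.reduce_mean(losses)
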
def get_logits(self, image, is_train, **kwargs):
    """Compute character logits with a ResNet backbone, an LSTM encoder,
    and a 2D-attention LSTM decoder."""
    # ResNet
    widths = tf.ones(tf.shape(image)[0],
                     dtype=tf.int32) * tf.shape(image)[2]
    features, sequence_length = self._convnet_layers(image, widths, is_train)

    # LSTM encoder
    with tf.variable_scope("rnn"):
        rnn_inputs = tf.nn.max_pool(features, (1, 8, 1, 1), (1, 1, 1, 1),
                                    'VALID',
                                    data_format='NHWC')
        rnn_inputs = tf.squeeze(rnn_inputs, axis=[1])
        rnn_inputs = tf.transpose(rnn_inputs,
                                  perm=[1, 0, 2],
                                  name='time_major')
        holistic_features = rnn_layer(rnn_inputs,
                                      sequence_length,
                                      self.rnn_size,
                                      scope='holistic')
        holistic_feature = dense_layer(holistic_features[-1],
                                       self.FLAGS.rnn_size,
                                       name='holistic_projection')

    # 2D LSTM decoder
    logits, weights = self.twodim_attention_decoder(
        holistic_feature, features, kwargs['label'], len(self.out_charset),
        self.FLAGS.rnn_size, is_train, self.FLAGS.label_maxlen)
    logits = tf.reshape(
        logits, [-1, self.FLAGS.label_maxlen, len(self.out_charset) + 1])
    sequence_length = None
    self.attention_weights = tf.expand_dims(weights, axis=1)

    return logits, sequence_length

def get_logits(self, image, is_train, **kwargs):
    """Compute character logits with a CNN + RNN encoder and a GRU attention decoder."""
    widths = tf.ones(tf.shape(image)[0],
                     dtype=tf.int32) * tf.shape(image)[2]
    features, sequence_length = self._convnet_layers(image, widths, is_train)
    attention_states = rnn_layers(features,
                                  sequence_length,
                                  self.rnn_size,
                                  use_projection=True)
    attention_states = dense_layer(attention_states,
                                   self.rnn_size,
                                   name='att_state_dense')
    logits, weights = attention_decoder(attention_states, kwargs['label'],
                                        len(self.out_charset), self.rnn_size,
                                        is_train, self.FLAGS.label_maxlen,
                                        cell_type='gru')

    return logits, sequence_length

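# A hedged sketch of how fixed-length decoder logits like the ones above
# (shape [batch, label_maxlen, num_classes + 1]) could be trained. The dense
# label construction mirrors the sparse_to_dense call in
# twodim_attention_decoder below, with padding positions defaulting to
# num_classes. None of the names in this sketch come from the repo.
import tensorflow as tf  # assumes TensorFlow 1.x


def decoder_cross_entropy_sketch(logits, sparse_label, num_classes,
                                 label_maxlen):
    """Mean per-step softmax cross-entropy against padded dense labels."""
    batch_size = tf.shape(logits)[0]
    dense_label = tf.sparse_to_dense(
        sparse_indices=sparse_label.indices,
        sparse_values=sparse_label.values,
        output_shape=tf.to_int64(tf.stack([batch_size, label_maxlen])),
        default_value=num_classes)
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=dense_label, logits=logits)

    return tf.reduce_mean(losses)
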
def twodim_attention_decoder(self,
                             holistic_feature,
                             attention_states,
                             label,
                             num_classes,
                             rnn_size,
                             is_train,
                             label_maxlen=25):
    """LSTM decoder that attends over a 2D feature map at every decoding step."""
    with tf.variable_scope('attention_layer'):
        batch_size = tf.shape(attention_states)[0]
        cell = tf.contrib.rnn.LSTMCell(rnn_size)
        dummy_label = tf.concat(
            [tf.zeros([batch_size, num_classes]),
             tf.ones([batch_size, 1])],
            axis=-1)
        decoder_inputs = [dummy_label] + [None] * (label_maxlen - 1)

        if label is not None:
            output_shape = tf.to_int64(
                tf.stack([batch_size, label_maxlen], axis=0))
            label = tf.sparse_to_dense(sparse_indices=label.indices,
                                       sparse_values=label.values,
                                       output_shape=output_shape,
                                       default_value=num_classes)
            label_one_hot = tf.one_hot(label, num_classes + 1)
        else:
            label_one_hot = tf.zeros([batch_size, label_maxlen])

        softmax_w = tf.get_variable(
            'softmax_w', [rnn_size, num_classes + 1],
            initializer=tf.contrib.layers.xavier_initializer())
        softmax_b = tf.get_variable(
            'softmax_b', [num_classes + 1],
            initializer=tf.constant_initializer(value=0.0))

        def get_train_input(prev, i):
            # Teacher forcing: feed the ground-truth previous character.
            if i == 0:
                return dummy_label
            else:
                return label_one_hot[:, i - 1, :]

        def get_eval_input(prev, i):
            # Greedy decoding: feed the previous prediction.
            if i == 0:
                return dummy_label
            else:
                _logit = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
                _prediction = tf.argmax(_logit, axis=-1)

                return tf.one_hot(_prediction, num_classes + 1)

        def get_input(prev, i):
            if is_train:
                return get_train_input(prev, i)
            else:
                return get_eval_input(prev, i)

        # attention_states: [B, 8, 25, 512]
        height = tf.shape(attention_states)[1]
        width = tf.shape(attention_states)[2]
        attn_size = rnn_size
        q = tf.get_variable("AttnQ", [1, attn_size * 2, attn_size],
                            dtype=tf.float32)
        k = tf.get_variable("AttnK", [3, 3, attn_size, attn_size],
                            dtype=tf.float32)
        v = tf.get_variable("AttnV", [1, 1, attn_size, 1], dtype=tf.float32)
        key = tf.nn.conv2d(attention_states, k, [1, 1, 1, 1], "SAME")

        def attention(query):
            with tf.variable_scope("Attention"):
                query = tf.reshape(query, [batch_size, 1, attn_size * 2])
                y = tf.nn.conv1d(query, q, 1, "SAME", data_format="NWC")
                y = tf.reshape(y, [-1, 1, 1, attn_size])
                s = tf.nn.conv2d(tf.nn.tanh(key + y), v, [1, 1, 1, 1], "SAME")
                s = tf.reshape(s, [-1, height * width, 1])
                a = tf.nn.softmax(s, axis=1)
                a = tf.reshape(a, [-1, height, width, 1])
                d = tf.reduce_sum(a * attention_states, [1, 2])

                return d, tf.reshape(a, [-1, height, width])

        attn_weights = []
        features = []
        prev = None
        state = (holistic_feature, holistic_feature)
        _state = tf.concat(state, axis=-1)
        attns, ats = attention(_state)
        attn_weights.append(ats)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            if prev is not None:
                with tf.variable_scope("loop_function", reuse=True):
                    inp = get_input(prev, i)

            input_size = inp.get_shape().with_rank(2)[1]
            inputs = tf.concat([inp, attns], axis=-1)
            x = dense_layer(inputs,
                            input_size,
                            name="input_projection",
                            activation=None)

            # Run the RNN.
            cell_output, state = cell(x, state)

            # Run the attention mechanism.
            _state = tf.concat(state, axis=-1)
            attns, ats = attention(_state)
            attn_weights.append(ats)

            with tf.variable_scope("AttnOutputProjection"):
                inputs = tf.concat([cell_output, attns], axis=-1)
                output = dense_layer(inputs,
                                     rnn_size,
                                     name="output_projection",
                                     activation=tf.nn.relu)

            prev = output
            features.append(output)

        features = tf.stack(features, axis=1)
        features = tf.reshape(features, (-1, rnn_size))
        rnn_logits = tf.nn.xw_plus_b(features, softmax_w, softmax_b)
        rnn_logits = tf.reshape(
            rnn_logits, (batch_size, label_maxlen, num_classes + 1))
        attn_weights = tf.stack(attn_weights, axis=1)

        return rnn_logits, attn_weights

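# A small, hypothetical helper for inspecting the maps returned by
# twodim_attention_decoder. Per the code above, attn_weights is stacked to
# [batch, label_maxlen + 1, height, width] (one map per decoding step plus the
# initial one). The summary name and function are illustrative only.
import tensorflow as tf  # assumes TensorFlow 1.x


def attention_map_summary_sketch(attn_weights, max_steps=4):
    """Log the first few per-step attention maps as image summaries."""
    # [B, steps, H, W] -> [B * steps, H, W, 1] so each map becomes one image.
    shape = tf.shape(attn_weights)
    maps = tf.reshape(attn_weights[:, :max_steps],
                      [-1, shape[2], shape[3], 1])

    return tf.summary.image('decoder_attention', maps, max_outputs=max_steps)
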
def decoder_stack(self,
                  decoder_inputs,
                  encoder_outputs,
                  self_attention_bias,
                  attention_bias,
                  is_train,
                  cache=None):
    """Transformer decoder stack: self-attention, encoder-decoder attention,
    and a feed-forward network in each layer."""
    ws = []

    # Decoder stack
    for n in range(self.dec_layers):
        with tf.variable_scope("decoder_layer_%d" % n):
            layer_name = "layer_%d" % n
            layer_cache = cache[layer_name] if cache is not None else None

            with tf.variable_scope("self_attention"):
                # layer norm
                y = self.layer_norm(decoder_inputs, self.hidden_size)

                # self attention
                y, _ = self.attention_layer(y, y, self.hidden_size,
                                            self_attention_bias, 'self_att',
                                            is_train, layer_cache)

                # dropout
                if is_train:
                    y = tf.nn.dropout(y, self.dropout_rate)

                # skip connection
                decoder_inputs = y + decoder_inputs

            with tf.variable_scope("encdec_attention"):
                # layer norm
                y = self.layer_norm(decoder_inputs, self.hidden_size)

                # encoder-decoder attention
                y, w = self.attention_layer(y, encoder_outputs,
                                            self.hidden_size, attention_bias,
                                            'encdec_att', is_train)
                ws.append(w)

                # dropout
                if is_train:
                    y = tf.nn.dropout(y, self.dropout_rate)

                # skip connection
                decoder_inputs = y + decoder_inputs

            with tf.variable_scope("ffn"):
                # layer norm
                y = self.layer_norm(decoder_inputs, self.hidden_size)

                # feed-forward network
                y = dense_layer(y,
                                self.filter_size,
                                name='filter_layer',
                                activation=tf.nn.relu)

                # dropout
                if is_train:
                    y = tf.nn.dropout(y, self.dropout_rate)

                y = dense_layer(y,
                                self.hidden_size,
                                name='output_layer',
                                activation=tf.nn.relu)

                # dropout
                if is_train:
                    y = tf.nn.dropout(y, self.dropout_rate)

                # skip connection
                decoder_inputs = y + decoder_inputs

    # Output normalization
    decoder_outputs = self.layer_norm(decoder_inputs, self.hidden_size)
    ws = tf.stack(ws, axis=1)

    return decoder_outputs, ws

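# The encoder and decoder stacks call self.layer_norm(x, hidden_size), which
# is not included in this excerpt. Below is a minimal sketch of standard layer
# normalisation over the last dimension with a learned scale and bias of size
# hidden_size; the variable names are assumptions, not the repo's.
import tensorflow as tf  # assumes TensorFlow 1.x


def layer_norm_sketch(x, hidden_size, epsilon=1e-6):
    """Normalise the last dimension of x, then apply a learned scale and bias."""
    scale = tf.get_variable('layer_norm_scale', [hidden_size],
                            initializer=tf.ones_initializer())
    bias = tf.get_variable('layer_norm_bias', [hidden_size],
                           initializer=tf.zeros_initializer())
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    normalized = (x - mean) * tf.rsqrt(variance + epsilon)

    return normalized * scale + bias
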
def transformer_encoder(self, features, num_layers, hidden_size, is_train):
    """Transformer encoder with adaptive 2D positional encoding and
    convolutional feed-forward blocks."""
    with tf.variable_scope('transformer_enc'):
        attention_bias = 0

        # Position encoding
        batch_size = tf.shape(features)[0]
        height = tf.shape(features)[1]
        width = tf.shape(features)[2]
        const_h = self.FLAGS.resize_hw.height // 4
        const_w = self.FLAGS.resize_hw.width // 4
        h_encoding = self.get_position_encoding(height, hidden_size,
                                                'h_encoding')
        w_encoding = self.get_position_encoding(width, hidden_size,
                                                'w_encoding')
        h_encoding = tf.expand_dims(h_encoding, axis=1)
        w_encoding = tf.expand_dims(w_encoding, axis=0)
        h_encoding = tf.tile(tf.expand_dims(h_encoding, axis=0),
                             [batch_size, 1, 1, 1])
        w_encoding = tf.tile(tf.expand_dims(w_encoding, axis=0),
                             [batch_size, 1, 1, 1])

        # Adaptive 2D positional encoding
        inter = tf.reduce_mean(features, axis=[1, 2])  # [B, hidden]
        inter = dense_layer(inter,
                            hidden_size // 2,
                            name='intermediate',
                            activation=tf.nn.relu)

        if is_train:
            inter = tf.nn.dropout(inter, self.dropout_rate)

        alpha = dense_layer(inter,
                            2 * hidden_size,
                            name='alpha',
                            activation=tf.nn.sigmoid)
        alpha = tf.reshape(alpha, [-1, 2, 1, hidden_size])
        pos_encoding = alpha[:, 0:1, :, :] * h_encoding \
            + alpha[:, 1:2, :, :] * w_encoding
        features += pos_encoding
        self.hw = tf.reduce_sum(alpha, axis=[2, 3])

        # Save shape
        shape = (-1, height, width, hidden_size)
        features = tf.reshape(features, (-1, height * width, hidden_size))

        # Dropout
        if is_train:
            features = tf.nn.dropout(features, self.dropout_rate)

        # Encoder stack
        ws = []

        for n in range(num_layers):
            with tf.variable_scope("encoder_layer_%d" % n):
                with tf.variable_scope("self_attention"):
                    # layer norm
                    y = self.layer_norm(features, hidden_size)

                    # self attention
                    y, w = self.attention_layer(y, y, hidden_size,
                                                attention_bias, 'self_att',
                                                is_train)
                    ws.append(w)

                    # dropout
                    if is_train:
                        y = tf.nn.dropout(y, self.dropout_rate)

                    # skip connection
                    features = y + features

                with tf.variable_scope("ffn"):
                    # layer norm
                    y = self.layer_norm(features, hidden_size)

                    # convolutional feed-forward block
                    y = tf.reshape(y, shape)
                    conv_params = [
                        ConvParams(self.filter_size, 1, (1, 1), 'same', False,
                                   True, 'expand'),
                        ConvParams(self.filter_size, 3, (1, 1), 'same', False,
                                   True, 'dwconv'),
                        ConvParams(self.hidden_size, 1, (1, 1), 'same', False,
                                   True, 'reduce')
                    ]
                    y = conv_layer(y, conv_params[0], is_train)
                    y = depthwise_conv_layer(y, conv_params[1], is_train)
                    y = conv_layer(y, conv_params[2], is_train)
                    y = tf.reshape(y, (-1, height * width, hidden_size))

                    # skip connection
                    features = y + features

        # Output normalization
        features = self.layer_norm(features, hidden_size)
        ws = tf.stack(ws, axis=1)

        return features, shape, ws

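# transformer_encoder calls self.get_position_encoding(length, hidden_size,
# name), which is not shown in this excerpt. The adaptive 2D positional
# encoding only needs a [length, hidden_size] table per axis; the sketch below
# uses the standard sinusoidal formulation as an assumption about what that
# helper produces.
import math

import tensorflow as tf  # assumes TensorFlow 1.x


def get_position_encoding_sketch(length,
                                 hidden_size,
                                 name,
                                 min_timescale=1.0,
                                 max_timescale=1.0e4):
    """Return a [length, hidden_size] table of sinusoidal position encodings."""
    with tf.name_scope(name):
        position = tf.to_float(tf.range(length))
        num_timescales = hidden_size // 2
        log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            (tf.to_float(num_timescales) - 1))
        inv_timescales = min_timescale * tf.exp(
            tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
            inv_timescales, 0)

        return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
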