def __init__(self, hparams, is_train):
    super(DecoderStack, self).__init__()
    self.my_layers = []
    self.hparams = hparams

    # Single shared decoder block reused at every ACT step: self-attention,
    # encoder-decoder attention, and a feed-forward network, each wrapped with
    # layer normalization and dropout.
    self_attention_layer = SelfAttention(
        hparams['num_units'], hparams['num_heads'], hparams['dropout_rate'], is_train)
    enc_dec_attention_layer = MultiheadAttention(
        hparams['num_units'], hparams['num_heads'], hparams['dropout_rate'], is_train)
    ffn_layer = FeedForwardNetwork(
        hparams['num_units'], hparams['num_filter_units'], hparams['dropout_rate'], is_train)

    self.self_attention_wrapper = LayerWrapper(
        self_attention_layer, hparams['num_units'], hparams['dropout_rate'], is_train)
    self.enc_dec_attention_wrapper = LayerWrapper(
        enc_dec_attention_layer, hparams['num_units'], hparams['dropout_rate'], is_train)
    self.ffn_wrapper = LayerWrapper(
        ffn_layer, hparams['num_units'], hparams['dropout_rate'], is_train)
    self.output_norm = LayerNormalization(hparams['num_units'])

    # Pondering network: one sigmoid unit per position that produces the ACT
    # halting probability; the bias is initialized to 1.0 to favor early halting.
    self.pondering_layer = tf.keras.layers.Dense(
        1, activation=tf.nn.sigmoid, use_bias=True,
        bias_initializer=tf.constant_initializer(1.0))
def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias, attention_bias):
    batch_size, length, hidden_size = tf.unstack(tf.shape(decoder_inputs))
    act = ACT(batch_size, length, hidden_size)
    halt_threshold = 1.0 - self.hparams['act_epsilon']

    state = decoder_inputs
    new_state = state  # keeps new_state defined even if no ACT step runs
    previous_state = tf.zeros_like(state, name='previous_state')
    for step in range(self.hparams['act_max_step']):
        # Stop once every position has accumulated enough halting probability.
        if not act.should_continue(halt_threshold):
            break
        # Re-inject position and timestep encodings at every ACT step.
        state += model_utils.get_position_encoding(self.hparams['max_length'], hidden_size)
        state += model_utils.get_timestep_encoding(
            step, self.hparams['act_max_step'], hidden_size)
        # Per-position halting probability for this step.
        pondering = self.pondering_layer(state)
        pondering = tf.squeeze(pondering, axis=-1)
        # Advance the ACT state and get the per-position update weights.
        update_weights = act(pondering, halt_threshold)
        # Shared decoder block: self-attention, encoder-decoder attention, FFN.
        state = self.self_attention_wrapper(state, decoder_self_attention_bias)
        state = self.enc_dec_attention_wrapper(state, encoder_outputs, attention_bias)
        state = self.ffn_wrapper(state)
        # Blend the new state with the previous one according to the update weights.
        new_state = (state * update_weights) + (previous_state * (1 - update_weights))
        previous_state = new_state
    return self.output_norm(new_state), act.n_updates, act.remainders
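# --- Usage sketch (illustrative, not part of the model code) ---
# A minimal example of driving the ACT decoder stack above with dummy tensors.
# Assumptions: DecoderStack is a tf.keras layer whose call() matches the method
# above, and the hparams keys mirror the ones it reads; the zero bias tensors
# stand in for the real padding / look-ahead biases computed by the full model.
import tensorflow as tf

hparams = {
    'num_units': 64, 'num_heads': 4, 'num_filter_units': 256,
    'dropout_rate': 0.1, 'max_length': 16,
    'act_max_step': 8, 'act_epsilon': 0.01,
}
decoder_stack = DecoderStack(hparams, is_train=False)

decoder_inputs = tf.random.normal([2, hparams['max_length'], hparams['num_units']])
encoder_outputs = tf.random.normal([2, hparams['max_length'], hparams['num_units']])
self_attention_bias = tf.zeros([1, 1, hparams['max_length'], hparams['max_length']])
attention_bias = tf.zeros([1, 1, 1, hparams['max_length']])

outputs, n_updates, remainders = decoder_stack(
    decoder_inputs, encoder_outputs, self_attention_bias, attention_bias)
# n_updates and remainders typically feed an ACT ponder-cost term added to the loss.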
def __init__(self, hparams, is_train):
    super(DecoderStack, self).__init__()
    self.my_layers = []
    for _ in range(hparams['num_layers']):
        self_attention_layer = SelfAttention(
            hparams['num_units'], hparams['num_heads'], hparams['dropout_rate'], is_train)
        enc_dec_attention_layer = MultiheadAttention(
            hparams['num_units'], hparams['num_heads'], hparams['dropout_rate'], is_train)
        ffn_layer = FeedForwardNetwork(
            hparams['num_units'], hparams['num_filter_units'], hparams['dropout_rate'], is_train)
        self.my_layers.append([
            LayerWrapper(self_attention_layer, hparams['num_units'],
                         hparams['dropout_rate'], is_train),
            LayerWrapper(enc_dec_attention_layer, hparams['num_units'],
                         hparams['dropout_rate'], is_train),
            LayerWrapper(ffn_layer, hparams['num_units'],
                         hparams['dropout_rate'], is_train),
        ])
    self.output_norm = LayerNormalization(hparams['num_units'])
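# --- Sketch of the matching call() for the stacked (non-ACT) decoder above ---
# The call method for this variant is not shown in this snippet; the loop below
# is a sketch of how the wrappers stored in self.my_layers would typically be
# applied, using the same wrapper call signatures as the ACT decoder above.
def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias, attention_bias):
    state = decoder_inputs
    for self_attention_wrapper, enc_dec_attention_wrapper, ffn_wrapper in self.my_layers:
        # Each decoder layer: masked self-attention, attention over the encoder
        # outputs, then the position-wise feed-forward network.
        state = self_attention_wrapper(state, decoder_self_attention_bias)
        state = enc_dec_attention_wrapper(state, encoder_outputs, attention_bias)
        state = ffn_wrapper(state)
    # Final layer normalization over the stack output.
    return self.output_norm(state)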
def __init__(self, hparams, is_train):
    super(EncoderStack, self).__init__()
    self.hparams = hparams

    # Single shared encoder block reused at every ACT step: self-attention and a
    # feed-forward network, each wrapped with layer normalization and dropout.
    self_attention_layer = SelfAttention(
        hparams.num_units, hparams.num_heads, hparams.dropout_rate, is_train)
    ffn_layer = FeedForwardNetwork(
        hparams.num_units, hparams.num_filter_units, hparams.dropout_rate, is_train)

    self.self_attention_wrapper = LayerWrapper(
        self_attention_layer, hparams.num_units, hparams.dropout_rate, is_train)
    self.ffn_wrapper = LayerWrapper(
        ffn_layer, hparams.num_units, hparams.dropout_rate, is_train)
    self.output_norm = LayerNormalization(hparams.num_units)

    # Pondering network: one sigmoid unit per position that produces the ACT
    # halting probability; the bias is initialized to 1.0 to favor early halting.
    self.pondering_layer = tf.keras.layers.Dense(
        1, activation=tf.nn.sigmoid, use_bias=True,
        bias_initializer=tf.constant_initializer(1.0))
    # Head-selection network: same structure as the pondering layer, presumably
    # used to choose the number of attention heads.
    self.num_head_layer = tf.keras.layers.Dense(
        1, activation=tf.nn.sigmoid, use_bias=True,
        bias_initializer=tf.constant_initializer(1.0))
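# --- Sketch of an ACT-style call() for this encoder (not shown above) ---
# This mirrors the decoder's ACT loop: re-inject position/timestep encodings,
# compute halting probabilities with self.pondering_layer, run the shared block,
# and blend states with the ACT update weights. How self.num_head_layer is
# consumed is not visible in this snippet, so it is omitted here.
def call(self, encoder_inputs, attention_bias):
    batch_size, length, hidden_size = tf.unstack(tf.shape(encoder_inputs))
    act = ACT(batch_size, length, hidden_size)
    halt_threshold = 1.0 - self.hparams.act_epsilon

    state = encoder_inputs
    new_state = state
    previous_state = tf.zeros_like(state, name='previous_state')
    for step in range(self.hparams.act_max_step):
        if not act.should_continue(halt_threshold):
            break
        state += model_utils.get_position_encoding(self.hparams.max_length, hidden_size)
        state += model_utils.get_timestep_encoding(
            step, self.hparams.act_max_step, hidden_size)
        pondering = tf.squeeze(self.pondering_layer(state), axis=-1)
        update_weights = act(pondering, halt_threshold)
        # Shared encoder block: self-attention then the feed-forward network.
        state = self.self_attention_wrapper(state, attention_bias)
        state = self.ffn_wrapper(state)
        new_state = (state * update_weights) + (previous_state * (1 - update_weights))
        previous_state = new_state
    return self.output_norm(new_state), act.n_updates, act.remainders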