def __init__(self, params, train):
    super(EncoderOutputLayer, self).__init__()
    input_hidden_size = 2 * params["hidden_size"]
    output_hidden_size = params["hidden_size"]
    self.feed_foward_layer = ffn_layer.FeedFowardNetwork(
        #input_hidden_size, params["hidden_size"],
        #input_hidden_size, params["filter_size"],
        input_hidden_size, output_hidden_size,
        params["relu_dropout"], train, params["allow_ffn_pad"],
        output_size=output_hidden_size, activation=tf.nn.relu)
        #activation=tf.nn.tanh)
    self.feed_foward_layer = PrePostProcessingWrapper(
        self.feed_foward_layer, params, train,
        input_hidden_size=input_hidden_size,
        output_hidden_size=output_hidden_size)
    self.output_norm_layer = LayerNormalization(output_hidden_size)

def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    for _ in range(params["num_hidden_layers"]):
        self_attention_layer = attention_layer.SelfAttention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        enc_dec_attention_layer = attention_layer.Attention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            #params["hidden_size"], params["filter_size"],
            #params["hidden_size"] * 2, params["filter_size"],
            params["hidden_size"] + params["latent_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"],
            output_size=params["hidden_size"])
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
            PrePostProcessingWrapper(
                feed_forward_network, params, train,
                input_hidden_size=params["hidden_size"] + params["latent_size"],
                output_hidden_size=params["hidden_size"])
        ])
    self.output_normalization = LayerNormalization(params["hidden_size"])

def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.return_attention_scores = params['return_attention_scores']
    self.layers = []
    no_scores_params = copy.deepcopy(params)
    no_scores_params.update({'return_attention_scores': False})
    for _ in range(params["num_hidden_layers"]):
        # Create sublayers for each layer.
        self_attention_layer = attention_layer.SelfAttention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], params["return_attention_scores"], train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, no_scores_params, train)
        ])
    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params["hidden_size"])

def compute_bow_loss(latent_sample, targets, params, train):
    """Computes the bag-of-words prediction loss from the latent sample.

    Args:
      latent_sample: float tensor of size [batch_size, hidden_size]
      targets: int tensor of size [batch_size, length]
    """
    with tf.variable_scope("bow_decoder"):
        # feed forward
        bow_ffn_layer = ffn_layer.FeedFowardNetwork(
            params["latent_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"],
            output_size=params["vocab_size"], activation=tf.nn.relu)
        expd_lv = tf.expand_dims(latent_sample, axis=1)  # [batch_size, 1, hidden_size]
        bow_logits = bow_ffn_layer(expd_lv, padding=None)  # [batch_size, 1, vocab_size]
        length = tf.shape(targets)[1]
        tile_bow_logits = tf.tile(bow_logits, [1, length, 1])  # [batch_size, length, vocab_size]
        # compute loss
        xentropy, weights = metrics.padded_cross_entropy_loss(
            tile_bow_logits, targets, params["label_smoothing"], params["vocab_size"])
        # sum the per-token cross entropy over tokens and over the batch
        bow_predict_loss = tf.reduce_sum(xentropy)
        return bow_predict_loss

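# --- Hedged sketch (not from the original source): how a bag-of-words loss is
# --- typically combined with the reconstruction and KL terms in a CVAE-style
# --- objective. The names rec_loss, kl_loss, kl_anneal_steps, and
# --- bow_loss_weight are illustrative assumptions, not identifiers from this
# --- code base.
def combine_cvae_losses(rec_loss, kl_loss, bow_loss, global_step,
                        kl_anneal_steps=10000, bow_loss_weight=1.0):
    # Linearly anneal the KL weight from 0 to 1, a common heuristic against
    # posterior collapse; the original training loop may differ.
    kl_weight = tf.minimum(
        1.0, tf.cast(global_step, tf.float32) / float(kl_anneal_steps))
    return rec_loss + kl_weight * kl_loss + bow_loss_weight * bow_loss
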
def __init__(self, params, train):
    super(LatentVariableLayer, self).__init__()
    self.train = train
    # debug
    if self.train:
        input_hidden_size = 2 * params["hidden_size"]
        output_hidden_size = 2 * params["hidden_size"]
        self.feed_foward_layer = ffn_layer.FeedFowardNetwork(
            #input_hidden_size, params["hidden_size"],
            input_hidden_size, params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"],
            output_size=output_hidden_size, activation=tf.nn.relu)
            #activation=tf.nn.tanh)
        self.feed_foward_layer = PrePostProcessingWrapper(
            self.feed_foward_layer, params, train,
            input_hidden_size=input_hidden_size,
            output_hidden_size=output_hidden_size)

def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    # N decoder layers
    for _ in range(params["num_hidden_layers"]):
        # decoder-side self-attention
        self_attention_layer = attention_layer.SelfAttention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        # source-target attention
        enc_dec_attention_layer = attention_layer.Attention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        # ffn
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        # wrap each sublayer with PrePostProcessingWrapper for layer norm and dropout
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)
        ])
    self.output_normalization = LayerNormalization(params["hidden_size"])

def __init__(self, params, train):
    super(LatentVariableLayer, self).__init__()
    self.train = train
    #output_hidden_size = 2 * params["hidden_size"]  # use hidden_size as latent_size
    output_hidden_size = 2 * params["latent_size"]
    self.norm = True
    self.drop = True
    self.residual = False
    if params["use_std"]:
        self.residual = True
    self.params = params
    self.prior_ffl = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"],
        output_size=output_hidden_size, activation=tf.nn.relu)
    self.prior_ffl = PrePostProcessingWrapper(
        self.prior_ffl, params, train,
        input_hidden_size=params["hidden_size"],
        output_hidden_size=output_hidden_size,
        norm=self.norm, drop=self.drop, residual=self.residual)
        #norm=True, drop=False, residual=False)
    if self.params["use_std"]:
        self.prior_mu_layer = tf.layers.Dense(
            params["latent_size"], use_bias=False, activation=tf.tanh,
            name="mu_layer")
        self.prior_std_layer = tf.layers.Dense(
            params["latent_size"], use_bias=False, activation=tf.sigmoid,
            name="std_layer")
    if self.train:
        input_hidden_size = 2 * params["hidden_size"]
        self.recog_ffl = ffn_layer.FeedFowardNetwork(
            input_hidden_size, params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"],
            output_size=output_hidden_size, activation=tf.nn.relu)
        self.recog_ffl = PrePostProcessingWrapper(
            self.recog_ffl, params, train,
            input_hidden_size=input_hidden_size,
            output_hidden_size=output_hidden_size,
            norm=self.norm, drop=self.drop, residual=self.residual)
            #norm=True, drop=False, residual=False)
        if self.params["use_std"]:
            self.recog_mu_layer = tf.layers.Dense(
                params["latent_size"], use_bias=False, activation=tf.tanh,
                name="mu_layer")
            self.recog_std_layer = tf.layers.Dense(
                params["latent_size"], use_bias=False, activation=tf.sigmoid,
                name="std_layer")

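# --- Hedged sketch (assumption, not part of the original file): how the
# --- prior/recognition mu and std heads above are typically used in a
# --- CVAE-style latent layer. During training, z is drawn from the
# --- recognition network with the reparameterization trick; at inference,
# --- z comes from the prior. The helper names sample_gaussian and
# --- gaussian_kl are illustrative.
def sample_gaussian(mu, std):
    # z = mu + std * eps, with eps ~ N(0, I); keeps sampling differentiable.
    eps = tf.random_normal(tf.shape(std))
    return mu + std * eps

def gaussian_kl(recog_mu, recog_std, prior_mu, prior_std, eps=1e-8):
    # KL(N(recog_mu, recog_std^2) || N(prior_mu, prior_std^2)),
    # summed over the latent dimension.
    kl = (tf.log(prior_std + eps) - tf.log(recog_std + eps)
          + (tf.square(recog_std) + tf.square(recog_mu - prior_mu))
          / (2.0 * tf.square(prior_std) + eps) - 0.5)
    return tf.reduce_sum(kl, axis=-1)
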
def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    for i in range(params["num_hidden_layers"]):
        # Flag based calling of Self Attention
        if 'dec-self' in params["concrete_heads"]:
            print("*** Decoder Concrete ***")
            self_attention_layer = attention_layer.SelfAttentionConcrete(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train,
                {'l0_penalty': 1.0}, concrete_coef=params["concrete_coef"])
        elif not params["alive_heads_dec_self"]:
            print("*** Decoder Plain ***")
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
        else:
            print("*** Decoder Fixed Alive ***")
            print("The fixed gates used for decoder self attention are : {}".format(
                params['alive_heads_dec_self']))
            self_attention_layer = attention_layer.SelfAttentionFixedAliveHeads(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train,
                head_gate=params["alive_heads_dec_self"][i])
        # Flag based calling of encoder-decoder Attention
        if 'enc-dec' in params["concrete_heads"]:
            print("*** Enc-Dec Concrete ***")
            enc_dec_attention_layer = attention_layer.AttentionConcrete(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train,
                {'l0_penalty': 1.0}, concrete_coef=params["concrete_coef"])
        elif not params["alive_heads_enc_dec"]:
            print("*** Enc-Dec Plain ***")
            enc_dec_attention_layer = attention_layer.Attention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
        else:
            print("*** Enc-Dec Fixed Alive ***")
            print("The fixed gates used for encoder decoder attention are : {}".format(
                params['alive_heads_enc_dec']))
            enc_dec_attention_layer = attention_layer.AttentionFixedAliveHeads(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train,
                head_gate=params["alive_heads_enc_dec"][i])
        # Feed Forward layer
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)])
    self.output_normalization = LayerNormalization(params["hidden_size"])

def __init__(self, params, train):
    super(SentenceEmbeddingLayer, self).__init__()
    # Two sub-layers: a feed-forward layer with an activation followed by a linear projection.
    self.sent_attention_layer = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["hidden_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"],
        output_size=1, activation=tf.nn.relu,
        #use_bias_output=True)
        use_bias_output=False)
        #output_size=1, activation=tf.nn.tanh)
    self.sent_attention_layer = PrePostProcessingWrapper(
        self.sent_attention_layer, params, train,
        input_hidden_size=params["hidden_size"], output_hidden_size=1,
        norm=False)  # the encoder stack already normalizes, so do not re-normalize here

def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    for _ in range(params.num_hidden_layers):
        # Create sublayers for each layer.
        self_attention_layer = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads,
            params.attention_dropout, train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size,
            params.relu_dropout, train)
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)])
    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params.hidden_size)

def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    for _ in range(params["num_hidden_layers"]):
        # Create sublayers for each layer.
        self_attention_layer = attention_layer.SelfAttention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)
        ])

def __init__(self, params, train):
    super(SentenceEmbeddingLayer, self).__init__()
    # Two sub-layers: a feed-forward layer with an activation followed by a linear projection.
    self.sent_attention_layer = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["hidden_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"],
        output_size=1, activation=tf.nn.relu)
        #output_size=1, activation=tf.nn.tanh)
    self.sent_attention_layer = PrePostProcessingWrapper(
        self.sent_attention_layer, params, train,
        input_hidden_size=params["hidden_size"], output_hidden_size=1)

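# --- Hedged sketch (assumption, not part of the original file): how the
# --- per-token scores produced by SentenceEmbeddingLayer (output_size=1) are
# --- typically turned into a single sentence embedding via attention pooling.
# --- encoder_outputs is [batch, length, hidden_size] and padding is 1.0 at
# --- padded positions; both names are illustrative.
def attention_pool(scores, encoder_outputs, padding):
    # scores: [batch, length, 1] -> mask out padded positions before softmax.
    scores = tf.squeeze(scores, axis=-1)
    scores += padding * -1e9
    weights = tf.nn.softmax(scores, axis=-1)           # [batch, length]
    weights = tf.expand_dims(weights, axis=-1)         # [batch, length, 1]
    return tf.reduce_sum(weights * encoder_outputs, axis=1)  # [batch, hidden_size]
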
def __init__(self, params, train, input_size, as_standard_norm=False):
    # helping vars
    self.train = train
    self.params = params
    self.as_standard_norm = as_standard_norm
    if as_standard_norm:
        return
    self.input_size = input_size
    self.output_size = 2 * params["latent_size"]
    assert params["num_latent_layers"] >= 1
    self.layers = []
    for i in range(params["num_latent_layers"]):
        temp_input_size = self.input_size
        if i > 0:
            temp_input_size = self.output_size
        ffl = ffn_layer.FeedFowardNetwork(
            temp_input_size, params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"],
            output_size=self.output_size, activation=tf.tanh)
            #output_size=self.output_size, activation=tf.nn.relu)
        ffl = PrePostProcessingWrapper(
            ffl, params, train,
            input_hidden_size=temp_input_size,
            output_hidden_size=self.output_size,
            norm=True, drop=True, residual=(i > 0))
            #norm=True, drop=True, residual=(i > 0 and i < params["num_latent_layers"] - 1))
        self.layers.append(ffl)
    # if use_std, do another feed forward.
    if self.params["use_std"]:
        self.mu_layer = tf.layers.Dense(
            params["latent_size"], use_bias=False, activation=tf.tanh,
            name="mu_layer")
        self.std_layer = tf.layers.Dense(
            params["latent_size"], use_bias=False, activation=tf.sigmoid,
            name="std_layer")

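# --- Hedged sketch (assumption, not part of the original file): when use_std
# --- is disabled, the 2 * latent_size output of the stacked feed-forward
# --- layers above is commonly split in half into a mean and a log-variance,
# --- from which a standard deviation is recovered. The helper name
# --- split_mu_logvar is illustrative.
def split_mu_logvar(ffl_output, latent_size):
    # ffl_output: [..., 2 * latent_size]
    mu = ffl_output[..., :latent_size]
    logvar = ffl_output[..., latent_size:]
    std = tf.exp(0.5 * logvar)
    return mu, std
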
def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    assert not ('enc-self' in params["concrete_heads"] and params["alive_heads_enc_self"]), \
        "enc-self is passed as both with trainable concrete gates heads and fixed gates"
    assert not ('dec-self' in params["concrete_heads"] and params["alive_heads_dec_self"]), \
        "dec-self is passed as both with trainable concrete gates heads and fixed gates"
    assert not ('dec-enc' in params["concrete_heads"] and params["alive_heads_dec_enc"]), \
        "dec-enc is passed as both with trainable concrete gates heads and fixed gates"
    for i in range(params["num_hidden_layers"]):
        # Create sublayers for each layer.
        if 'enc-self' in params["concrete_heads"]:
            print("*** Encoder Concrete ***")
            self_attention_layer = attention_layer.SelfAttentionConcrete(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train,
                {'l0_penalty': 1.0}, concrete_coef=params["concrete_coef"])
        elif not params["alive_heads_enc_self"]:
            print("*** Encoder Plain ***")
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
        else:
            print("*** Encoder Fixed Alive ***")
            print("The fixed gates used for encoder self attention are : {}".format(
                params['alive_heads_enc_self']))
            self_attention_layer = attention_layer.SelfAttentionFixedAliveHeads(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train,
                head_gate=params["alive_heads_enc_self"][i])
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)])
    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params["hidden_size"])

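# --- Hedged sketch (assumption, not from this repository): a minimal
# --- hard-concrete gate of the kind SelfAttentionConcrete is expected to use
# --- for pruning attention heads with an L0 penalty (Louizos et al., 2017;
# --- Voita et al., 2019). The hyperparameters beta, gamma, zeta are the usual
# --- defaults, not values read from this code base.
def hard_concrete_gates(num_heads, train, beta=2.0 / 3.0, gamma=-0.1, zeta=1.1):
    log_alpha = tf.get_variable("head_log_alpha", [num_heads],
                                initializer=tf.zeros_initializer())
    if train:
        # Sample stretched concrete noise during training.
        u = tf.random_uniform([num_heads], minval=1e-6, maxval=1.0 - 1e-6)
        s = tf.sigmoid((tf.log(u) - tf.log(1.0 - u) + log_alpha) / beta)
    else:
        s = tf.sigmoid(log_alpha)
    # Stretch to (gamma, zeta) and clip to [0, 1] so gates can be exactly 0 or 1.
    gates = tf.clip_by_value(s * (zeta - gamma) + gamma, 0.0, 1.0)
    # Expected L0 penalty: probability that each gate is non-zero.
    l0_penalty = tf.reduce_sum(
        tf.sigmoid(log_alpha - beta * tf.log(-gamma / zeta)))
    return gates, l0_penalty
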
def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    # Define N layers with identical structure but separate parameters,
    # each a self-attention sublayer followed by a feed-forward sublayer.
    for _ in range(params["num_hidden_layers"]):
        # Create sublayers for each layer: one self-attention and one ffn per iteration.
        self_attention_layer = attention_layer.SelfAttention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)
        ])
    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params["hidden_size"])

def __init__(self, params, train):
    super(EncoderStack, self).__init__()  # call the parent Layer constructor
    # Store the layers in a list; each element is a (self-attention, ffn) pair.
    self.layers = []
    # Build N independent layers in a loop; num_hidden_layers is set to 6 here.
    for _ in range(params["num_hidden_layers"]):
        # Create sublayers for each layer.
        self_attention_layer = attention_layer.SelfAttention(  # self-attention layer
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(  # feed-forward layer
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        self.layers.append([
            # Every sublayer is wrapped so it gets layer normalization and dropout.
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)
        ])
    # Create final layer normalization layer, applied to the encoder stack's output.
    self.output_normalization = LayerNormalization(params["hidden_size"])

def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    for _ in range(params["num_hidden_layers"]):
        # Create sublayers for each layer.
        self_attention_layer_one = attention2_layer.SelfAttentionOne(
            params["hidden_size"], params["num_heads"],
            params["num_vir_entities"], params["attention_dropout"], train)
        self_attention_layer_two = attention2_layer.SelfAttentionTwo(
            params["hidden_size"], params["d2_model"], params["num_heads"],
            params["num_vir_entities"], params["attention_dropout"], train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        self.layers.append([
            self_attention_layer_one, self_attention_layer_two,
            feed_forward_network
        ])
    self.train = train
    self.postprocess_dropout = params["layer_postprocess_dropout"]
    self.d2_model = params["d2_model"]
    self.hidden_size = params["hidden_size"]
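
# --- Hedged sketch (assumption, not part of the original file): because this
# --- stack stores the raw sublayers instead of wrapping them in
# --- PrePostProcessingWrapper, its call method presumably applies the usual
# --- pre-norm / post-dropout / residual pattern itself, roughly as below.
# --- The names layer_norm and sublayer_kwargs are illustrative.
def apply_sublayer(x, sublayer, layer_norm, postprocess_dropout, train, **sublayer_kwargs):
    y = sublayer(layer_norm(x), **sublayer_kwargs)  # pre-normalization
    if train:
        y = tf.nn.dropout(y, keep_prob=1.0 - postprocess_dropout)  # post-dropout (TF1 signature)
    return x + y  # residual connection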