def __init__(self,
             attention_type,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             num_rand_blocks=3,
             block_size=64,
             use_bias=True,
             seed=None,
             name=None):
  """Constructor of an encoder layer of a transformer in Pegasus style.

  Args:
    attention_type: Type of attention, needs to be one of ['original_full',
      'simulated_sparse', 'block_sparse'].
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for intermediate
      layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the
      attention.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    num_rand_blocks: (optional) int. Number of random chunks per row.
    block_size: (optional) int. Size of a block in the sequence.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    seed: (optional) int. Random seed for generating the random mask.
    name: The name scope of this layer.
  """
  super(PrenormEncoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Attention layer
  attention_head_size = hidden_size // num_attention_heads
  self.attn_layer = attention.MultiHeadedAttentionLayer(
      attention_type, num_attention_heads, num_rand_blocks,
      attention_head_size, initializer_range, block_size, block_size,
      attention_probs_dropout_prob, use_bias, seed, name="self")

  # Dense layers
  self.projection_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
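# A minimal usage sketch (illustrative, not part of the library source): it
# assumes this module's `attention` and `utils` helpers are importable and that
# the layer is later applied to hidden states inside a transformer stack. The
# argument values shown are examples, not mandated settings.
#
#   encoder_layer = PrenormEncoderLayer(
#       attention_type="block_sparse",
#       hidden_size=768,
#       intermediate_size=3072,
#       num_attention_heads=12,
#       num_rand_blocks=3,
#       block_size=64,
#       seed=0,
#       name="layer_0")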
def __init__(self,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             use_bias=True,
             name=None):
  """Constructor of a decoder layer of a transformer in Pegasus style.

  Args:
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for intermediate
      layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the
      attention.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    name: The name scope of this layer.
  """
  super(PrenormDecoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Attention layers
  attention_head_size = hidden_size // num_attention_heads
  self.self_attn_layer = attention.MultiHeadedAttentionLayer(
      "original_full", use_bias=use_bias, name="self",
      num_attention_heads=num_attention_heads,
      size_per_head=attention_head_size,
      initializer_range=initializer_range,
      attention_probs_dropout_prob=attention_probs_dropout_prob)
  self.cross_attn_layer = attention.MultiHeadedAttentionLayer(
      "original_full", use_bias=use_bias, name="encdec",
      num_attention_heads=num_attention_heads,
      size_per_head=attention_head_size,
      initializer_range=initializer_range,
      attention_probs_dropout_prob=attention_probs_dropout_prob)

  # Dense layers
  self.self_proj_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.cross_proj_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
  self.third_layer_norm = utils.NormLayer()
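# A minimal usage sketch (illustrative, not part of the library source): the
# pre-norm decoder layer always builds 'original_full' attention for both its
# self-attention and cross-attention, so no sparse-attention arguments are
# exposed here. Values are examples only.
#
#   decoder_layer = PrenormDecoderLayer(
#       hidden_size=768,
#       intermediate_size=3072,
#       num_attention_heads=12,
#       use_bias=True,
#       name="layer_0")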
def __init__(self,
             attention_type,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             num_rand_blocks=3,
             block_size=64,
             use_bias=True,
             seed=None,
             name=None):
  """Constructor of an encoder layer of a transformer in BERT style.

  Args:
    attention_type: Type of attention, needs to be one of ['original_full',
      'simulated_sparse', 'block_sparse'].
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for intermediate
      layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the
      attention.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    num_rand_blocks: (optional) int. Number of random chunks per row.
    block_size: (optional) int. Size of a block in the sequence.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    seed: (optional) int. Random seed for generating the random mask.
    name: The name scope of this layer.
  """
  super(PostnormEncoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Attention layer definition.
  # Per-head size for multi-head attention (e.g. 768 // 12 = 64).
  attention_head_size = hidden_size // num_attention_heads
  self.attn_layer = attention.MultiHeadedAttentionLayer(
      attention_type, num_attention_heads, num_rand_blocks,  # e.g. block_sparse, 12, 3
      attention_head_size, initializer_range, block_size, block_size,  # e.g. 64, 0.01, 16, 16
      attention_probs_dropout_prob, use_bias, seed,  # e.g. 0.01, True; seed grows with the encoder layer index (0-11)
      name="self")

  # Dense layers: refine the attention output by 1) projecting,
  # 2) expanding, then 3) contracting the features.
  # 1) Layer that projects the attention output.
  self.projection_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,  # e.g. 12, 64
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  # 2) Expansion layer.
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  # 3) Contraction layer; the last layer, producing the final features at hidden size.
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
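# A minimal usage sketch (illustrative values, loosely following the example
# annotations above rather than the constructor defaults): the post-norm
# (BERT-style) encoder layer shares the PrenormEncoderLayer signature and, per
# its name, differs in where layer normalization is applied. `layer_index` is a
# hypothetical variable standing in for the per-layer seed noted above.
#
#   encoder_layer = PostnormEncoderLayer(
#       attention_type="block_sparse",
#       hidden_size=768,
#       num_attention_heads=12,
#       num_rand_blocks=3,
#       block_size=16,
#       seed=layer_index,  # hypothetical per-layer seed
#       name="layer_0")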