Example #1
  def __init__(self,
               attention_type,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               num_rand_blocks=3,
               block_size=64,
               use_bias=True,
               seed=None,
               name=None):
    """Constructor of an encoder layer of a transformer in Pegasus style.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for the intermediate
        layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the output
        of the attention and feed-forward sublayers.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      block_size: (optional) int. Size of a block in the sequence.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      seed: (optional) int. Random seed for generating the random mask.
      name: The name scope of this layer.
    """
    super(PrenormEncoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layer
    attention_head_size = hidden_size // num_attention_heads
    self.attn_layer = attention.MultiHeadedAttentionLayer(
        attention_type, num_attention_heads, num_rand_blocks,
        attention_head_size, initializer_range, block_size, block_size,
        attention_probs_dropout_prob, use_bias, seed, name="self")

    # Dense layers
    self.projection_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layer
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
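
A minimal instantiation sketch for the layer above, assuming the snippet lives in the BigBird codebase where the attention and utils modules are importable; the module path bigbird.core.encoder is an assumption, not something shown in the snippet.

# Hypothetical usage sketch: construct one pre-norm (Pegasus-style) encoder layer.
from bigbird.core import encoder  # assumed module path

enc_layer = encoder.PrenormEncoderLayer(
    attention_type="block_sparse",  # one of the documented attention types
    hidden_size=768,
    num_attention_heads=12,
    num_rand_blocks=3,
    block_size=64,
    seed=0)  # per-layer seed for the random attention mask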
Example #2
    def __init__(self,
                 hidden_size,
                 vocab_size,
                 embeder,
                 initializer=None,
                 activation_fn=None,
                 name="cls/predictions"):
        super(MaskedLMLayer, self).__init__(name=name)
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embeder = embeder

        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        self.extra_layer = utils.Dense2dLayer(hidden_size, hidden_size,
                                              initializer, activation_fn,
                                              "transform")
        self.norm_layer = utils.NormLayer(hidden_size, name="transform")

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.output_bias = tf.compat.v1.get_variable(
            name + "/output_bias",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())
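
A hedged construction sketch for this version of MaskedLMLayer. It assumes the class definition above is in scope and that `embeder` is an embedding object exposing a .linear() projection back to vocabulary logits (as used in Example #5); the EmbeddingLayer name and its arguments below are placeholders, not a confirmed API.

# Hypothetical usage sketch for the masked-LM head.
embeder = utils.EmbeddingLayer(vocab_size=50358, emb_dim=768)  # placeholder args; exact signature not shown here
lm_head = MaskedLMLayer(
    hidden_size=768,
    vocab_size=50358,
    embeder=embeder,
    initializer=utils.create_initializer(0.02),
    activation_fn=utils.gelu)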
Example #3
  def __init__(self,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               use_bias=True,
               name=None):
    """Constructor of a decoder layer of a transformer in Pegasus style.

    Args:
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for the intermediate
        layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the output
        of the attention and feed-forward sublayers.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      name: The name scope of this layer.
    """
    super(PrenormDecoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layers
    attention_head_size = hidden_size // num_attention_heads
    self.self_attn_layer = attention.MultiHeadedAttentionLayer(
        "original_full", use_bias=use_bias, name="self",
        num_attention_heads=num_attention_heads,
        size_per_head=attention_head_size,
        initializer_range=initializer_range,
        attention_probs_dropout_prob=attention_probs_dropout_prob)
    self.cross_attn_layer = attention.MultiHeadedAttentionLayer(
        "original_full", use_bias=use_bias, name="encdec",
        num_attention_heads=num_attention_heads,
        size_per_head=attention_head_size,
        initializer_range=initializer_range,
        attention_probs_dropout_prob=attention_probs_dropout_prob)

    # Dense layers
    self.self_proj_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.cross_proj_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layer
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
    self.third_layer_norm = utils.NormLayer()
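
A minimal instantiation sketch for the decoder layer above; the module path bigbird.core.decoder is an assumption.

# Hypothetical usage sketch: construct one pre-norm (Pegasus-style) decoder layer.
from bigbird.core import decoder  # assumed module path

dec_layer = decoder.PrenormDecoderLayer(
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    hidden_dropout_prob=0.1)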
Example #4
  def __init__(self,
               attention_type,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               num_rand_blocks=3,
               block_size=64,
               use_bias=True,
               seed=None,
               name=None):
    """Constructor of an encoder layer of a transformer in BERT style.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for the intermediate
        layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the output
        of the attention and feed-forward sublayers.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      block_size: (optional) int. Size of a block in the sequence.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      seed: (optional) int. Random seed for generating the random mask.
      name: The name scope of this layer.
    """
    super(PostnormEncoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layer
    attention_head_size = hidden_size // num_attention_heads  # per-head size for multi-head attention (e.g. 768 // 12 = 64)
    self.attn_layer = attention.MultiHeadedAttentionLayer(
        attention_type, num_attention_heads, num_rand_blocks,  # e.g. block_sparse, 12, 3
        attention_head_size, initializer_range, block_size, block_size,  # e.g. 64, 0.01, 16, 16
        attention_probs_dropout_prob, use_bias, seed, name="self")  # e.g. 0.01, True, seed (increases 0~11 with the encoder layer index)

    # Dense layers: refine the attention output by 1) projecting, 2) expanding,
    # and 3) contracting the features.
    # 1) layer that projects the attention output
    self.projection_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,  # e.g. 12, 64
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    # 2) expansion layer (hidden_size -> intermediate_size)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    # 3) contraction layer (intermediate_size -> hidden_size)
    self.contract_layer = utils.Dense2dLayer(  # produces the final features of this layer
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layer
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
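
A sketch that mirrors the values mentioned in the inline comments above (block_sparse attention, 12 heads, block size 16, initializer range 0.01, and a seed that grows with the layer index); the module path bigbird.core.encoder is an assumption.

# Hypothetical usage sketch: a stack of post-norm (BERT-style) encoder layers.
from bigbird.core import encoder  # assumed module path

encoder_layers = [
    encoder.PostnormEncoderLayer(
        attention_type="block_sparse",
        hidden_size=768,
        num_attention_heads=12,
        num_rand_blocks=3,
        block_size=16,
        initializer_range=0.01,
        attention_probs_dropout_prob=0.01,
        seed=layer_idx)  # seed increases with the layer index, as noted above
    for layer_idx in range(12)
]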
Example #5
    def __init__(self,
                 hidden_size,
                 vocab_size,
                 embeder,
                 input_tensor,
                 initializer=None,
                 activation_fn=None,
                 name="cls/predictions",
                 label_ids=None,
                 label_weights=None,
                 masked_lm_positions=None):
        super(MaskedLMLayer, self).__init__(name=name)
        self.hidden_size = hidden_size  # hidden size, e.g. 768
        self.vocab_size = vocab_size  # vocabulary size, e.g. 50358
        self.embeder = embeder  # embedding layer

        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        self.extra_layer = utils.Dense2dLayer(  # non-linear transform (e.g. gelu activation)
            hidden_size, initializer, activation_fn, "transform")
        self.norm_layer = utils.NormLayer("transform")  # normalization layer

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        # create the output bias variable
        self.output_bias = tf.compat.v1.get_variable(
            name + "/output_bias",
            shape=[vocab_size],  # e.g. vocab size 50358
            initializer=tf.zeros_initializer())

        if masked_lm_positions is not None:  # gather the input tensor at the masked LM positions
            input_tensor = tf.gather(input_tensor,
                                     masked_lm_positions,
                                     batch_dims=1)

        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.compat.v1.variable_scope("transform") as sc:
            input_tensor = self.extra_layer(
                input_tensor, scope=sc
            )  # linear transform followed by gelu activation, e.g. shape (4, 75, 768)
            input_tensor = self.norm_layer(input_tensor,
                                           scope=sc)  # apply layer normalization

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        logits = self.embeder.linear(
            input_tensor
        )  # output projection shares the embedding weights: (4, 75, 768) -> (4, 75, 50358)
        logits = tf.nn.bias_add(logits, self.output_bias)  # add the output bias
        self.log_probs = tf.nn.log_softmax(logits, axis=-1)  # apply log softmax

        if label_ids is not None:
            one_hot_labels = tf.one_hot(
                label_ids, depth=self.vocab_size,
                dtype=tf.float32)  # build one-hot labels of depth vocab_size

            # The `positions` tensor might be zero-padded (if the sequence is too
            # short to have the maximum number of predictions). The `label_weights`
            # tensor has a value of 1.0 for every real prediction and 0.0 for the
            # padding predictions.
            per_example_loss = -tf.reduce_sum(self.log_probs * one_hot_labels,
                                              axis=-1)  # cross-entropy per prediction
            numerator = tf.reduce_sum(
                label_weights * per_example_loss)  # count only real (non-padding) predictions
            denominator = tf.reduce_sum(label_weights) + 1e-5  # sum of weights
            self.loss = numerator / denominator  # weighted average loss
        else:
            self.loss = tf.constant(0.0)
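
A hedged end-to-end sketch for this variant, which computes the masked-LM log-probabilities and loss directly in the constructor. The tensor names (sequence_output, masked_lm_ids, masked_lm_weights, masked_lm_positions) are placeholders for the pre-training pipeline outputs and are not part of the snippet above.

# Hypothetical usage sketch for the masked-LM head with loss computation.
lm_layer = MaskedLMLayer(
    hidden_size=768,
    vocab_size=50358,
    embeder=embeder,                          # embedding layer exposing .linear()
    input_tensor=sequence_output,             # (batch, seq_len, hidden) encoder output
    initializer=utils.create_initializer(0.02),
    activation_fn=utils.gelu,
    label_ids=masked_lm_ids,                  # (batch, max_predictions)
    label_weights=masked_lm_weights,          # 1.0 for real predictions, 0.0 for padding
    masked_lm_positions=masked_lm_positions)  # positions of the masked tokens

masked_lm_loss = lm_layer.loss                # weighted average over real predictions
log_probs = lm_layer.log_probs                # (batch, max_predictions, vocab_size)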