Exemplo n.º 1
0
  def __init__(self,
               attention_type,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               num_rand_blocks=3,
               block_size=64,
               use_bias=True,
               seed=None,
               name=None):
    """Builds one pre-norm (Pegasus-style) transformer encoder layer.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      hidden_size: (optional) int. Size of the hidden dimension.
      intermediate_size: (optional) int. Size of the intermediate dimension.
      intermediate_act_fn: (optional) Activation function for the
        intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the
        hidden states.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      block_size: (optional) int. Size of one block in the sequence.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      seed: (optional) int. Random seed for generating the random mask.
      name: The name scope of this layer.
    """
    super(PrenormEncoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Multi-headed attention sub-layer.
    size_per_head = hidden_size // num_attention_heads
    self.attn_layer = attention.MultiHeadedAttentionLayer(
        attention_type, num_attention_heads, num_rand_blocks, size_per_head,
        initializer_range, block_size, block_size,
        attention_probs_dropout_prob, use_bias, seed, name="self")

    # Projection back to the model dimension, followed by the two-layer FFN.
    self.projection_layer = utils.Dense3dProjLayer(
        num_attention_heads, size_per_head,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range), None,
        "dense")

    # Norms applied before attention and before the FFN (pre-norm style).
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
Exemplo n.º 2
0
    def __init__(self,
                 hidden_size,
                 vocab_size,
                 embeder,
                 initializer=None,
                 activation_fn=None,
                 name="cls/predictions"):
        """Builds the masked-LM prediction head.

        Args:
          hidden_size: int. Size of the hidden dimension.
          vocab_size: int. Vocabulary size.
          embeder: Embedding layer whose weights are tied to the output
            projection.
          initializer: (optional) Weight initializer for the transform layer.
          activation_fn: (optional) Activation for the transform layer.
          name: Variable scope name for this head.
        """
        super(MaskedLMLayer, self).__init__(name=name)
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embeder = embeder

        # One extra non-linear transform before the output projection; this
        # matrix is only used during pre-training.
        self.extra_layer = utils.Dense2dLayer(
            hidden_size, hidden_size, initializer, activation_fn, "transform")
        self.norm_layer = utils.NormLayer(hidden_size, name="transform")

        # Output weights are shared with the input embeddings; only a
        # per-token output bias is learned here.
        self.output_bias = tf.compat.v1.get_variable(
            name + "/output_bias",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())
Exemplo n.º 3
0
  def __init__(self, params):
    """Builds the stack of transformer encoder layers.

    Args:
      params: dict of model hyperparameters (attention settings, sizes,
        dropout probabilities, layer count, etc.).
    """
    name = "encoder"
    super(EncoderStack, self).__init__(name=name)
    self.params = params

    # Select the encoder layer class from the normalization style.
    norm_type = params["norm_type"]
    if norm_type == "prenorm":
      encoder_class = PrenormEncoderLayer
    elif norm_type == "postnorm":
      encoder_class = PostnormEncoderLayer
    else:
      raise NotImplementedError(
          "Norm type {} is not implemented".format(norm_type))

    # One encoder layer per hidden layer; each layer gets its own seed
    # (for the random attention mask) and a distinct name scope.
    self.encoder_layers = [
        encoder_class(  # pylint: disable=g-complex-comprehension
            self.params["attention_type"],
            self.params["hidden_size"],
            self.params["intermediate_size"],
            utils.get_activation(self.params["hidden_act"]),
            self.params["attention_probs_dropout_prob"],
            self.params["hidden_dropout_prob"],
            self.params["initializer_range"],
            self.params["num_attention_heads"],
            self.params["num_rand_blocks"],
            self.params["block_size"],
            self.params["use_bias"],
            seed=layer_idx,
            name="layer_%d" % layer_idx)
        for layer_idx in range(self.params["num_hidden_layers"])
    ]

    # Final normalization over the stack output.
    self.layer_norm = utils.NormLayer()
Exemplo n.º 4
0
    def __init__(self, params):
        """Builds the stack of transformer encoder layers.

        Args:
          params: dict of model hyperparameters (attention settings, sizes,
            dropout probabilities, layer count, etc.).
        """
        name = "encoder"
        super(EncoderStack, self).__init__(name=name)
        self.params = params

        # Select the encoder layer class from the normalization style.
        norm_type = params["norm_type"]
        if norm_type == "prenorm":
            encoder_class = PrenormEncoderLayer
        elif norm_type == "postnorm":
            encoder_class = PostnormEncoderLayer
        else:
            raise NotImplementedError(
                "Norm type {} is not implemented".format(norm_type))

        # One encoder layer per hidden layer; each layer gets its own seed
        # (for the random attention mask) and a distinct name scope.
        self.encoder_layers = [
            encoder_class(  # pylint: disable=g-complex-comprehension
                self.params["attention_type"],
                self.params["hidden_size"],
                self.params["intermediate_size"],
                utils.get_activation(self.params["hidden_act"]),
                self.params["attention_probs_dropout_prob"],
                self.params["hidden_dropout_prob"],
                self.params["initializer_range"],
                self.params["num_attention_heads"],
                self.params["num_rand_blocks"],
                self.params["block_size"],
                self.params["use_bias"],
                seed=layer_idx,
                name="layer_%d" % layer_idx)
            for layer_idx in range(self.params["num_hidden_layers"])
        ]

        # Final normalization over the stack output.
        self.layer_norm = utils.NormLayer()
Exemplo n.º 5
0
    def __init__(self, params):
        """Builds the stack of transformer decoder layers.

        Args:
          params: dict of model hyperparameters (sizes, dropout
            probabilities, layer count, coupling/checkpointing flags, etc.).
        """
        # When encoder and decoder are coupled, reuse the encoder's scope
        # name so variables are shared.
        name = "encoder" if params["couple_encoder_decoder"] else "decoder"
        super(DecoderStack, self).__init__(name=name)

        self.params = params

        # Select the decoder layer class from the normalization style.
        if params["norm_type"] == "prenorm":
            decoder_class = PrenormDecoderLayer
        elif params["norm_type"] == "postnorm":
            decoder_class = PostnormDecoderLayer
        else:
            raise NotImplementedError("Norm type {} is not implemented".format(
                params["norm_type"]))

        # Optionally trade compute for memory during backprop.
        if params["use_gradient_checkpointing"]:
            decoder_class = add_gradient_recomputation(decoder_class)

        # The decoder depth may be configured independently of the encoder's.
        num_hidden_layers = self.params.get("num_decoder_layers")
        if num_hidden_layers is None:
            num_hidden_layers = self.params["num_hidden_layers"]

        with tf.compat.v1.variable_scope(name):
            # One decoder layer per hidden layer.
            self.decoder_layers = [
                decoder_class(  # pylint: disable=g-complex-comprehension
                    self.params["hidden_size"],
                    self.params["intermediate_size"],
                    utils.get_activation(self.params["hidden_act"]),
                    self.params["attention_probs_dropout_prob"],
                    self.params["hidden_dropout_prob"],
                    self.params["initializer_range"],
                    self.params["num_attention_heads"],
                    self.params["use_bias"],
                    name="layer_%d" % layer_idx)
                for layer_idx in range(num_hidden_layers)
            ]

            # Final normalization over the stack output.
            self.layer_norm = utils.NormLayer(self.params["hidden_size"])
Exemplo n.º 6
0
  def __init__(self, params):
    """Builds the stack of transformer decoder layers.

    Args:
      params: dict of model hyperparameters (sizes, dropout probabilities,
        layer count, coupling flag, etc.).
    """
    if params["couple_encoder_decoder"]:
      # Share variables with the encoder by reusing its scope.
      name = "encoder"
      with tf.compat.v1.variable_scope(
          name, reuse=tf.compat.v1.AUTO_REUSE) as scope:
        super(DecoderStack, self).__init__(name=name, _scope=scope)
    else:
      name = "decoder"
      super(DecoderStack, self).__init__(name=name)

    self.params = params

    # Select the decoder layer class from the normalization style.
    if params["norm_type"] == "prenorm":
      decoder_class = PrenormDecoderLayer
    elif params["norm_type"] == "postnorm":
      decoder_class = PostnormDecoderLayer
    else:
      raise NotImplementedError(
          "Norm type {} is not implemented".format(params["norm_type"]))

    # The decoder depth may be configured independently of the encoder's;
    # otherwise fall back to the shared hidden-layer count.
    num_hidden_layers = self.params.get("num_decoder_layers")
    if num_hidden_layers is None:
      num_hidden_layers = self.params["num_hidden_layers"]

    # One decoder layer per hidden layer.
    self.decoder_layers = [
        decoder_class(  # pylint: disable=g-complex-comprehension
            self.params["hidden_size"],
            self.params["intermediate_size"],
            utils.get_activation(self.params["hidden_act"]),
            self.params["attention_probs_dropout_prob"],
            self.params["hidden_dropout_prob"],
            self.params["initializer_range"],
            self.params["num_attention_heads"],
            self.params["use_bias"],
            name="layer_%d" % layer_idx)
        for layer_idx in range(num_hidden_layers)
    ]

    # Final normalization over the stack output.
    self.layer_norm = utils.NormLayer()
Exemplo n.º 7
0
  def __init__(self,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               use_bias=True,
               name=None):
    """Builds one pre-norm (Pegasus-style) transformer decoder layer.

    Args:
      hidden_size: (optional) int. Size of the hidden dimension.
      intermediate_size: (optional) int. Size of the intermediate dimension.
      intermediate_act_fn: (optional) Activation function for the
        intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the
        hidden states.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      name: The name scope of this layer.
    """
    super(PrenormDecoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Self-attention and cross-attention share everything except the scope
    # name; both always use full attention in the decoder.
    size_per_head = hidden_size // num_attention_heads
    common_attn_kwargs = dict(
        num_attention_heads=num_attention_heads,
        size_per_head=size_per_head,
        initializer_range=initializer_range,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        use_bias=use_bias)
    self.self_attn_layer = attention.MultiHeadedAttentionLayer(
        "original_full", name="self", **common_attn_kwargs)
    self.cross_attn_layer = attention.MultiHeadedAttentionLayer(
        "original_full", name="encdec", **common_attn_kwargs)

    # Output projections for each attention block, then the two-layer FFN.
    self.self_proj_layer = utils.Dense3dProjLayer(
        num_attention_heads, size_per_head,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.cross_proj_layer = utils.Dense3dProjLayer(
        num_attention_heads, size_per_head,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range), None,
        "dense")

    # Norms before self-attention, cross-attention, and the FFN (pre-norm).
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
    self.third_layer_norm = utils.NormLayer()
Exemplo n.º 8
0
  def __init__(self,
               attention_type,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               num_rand_blocks=3,
               block_size=64,
               use_bias=True,
               seed=None,
               name=None):
    """Builds one post-norm (BERT-style) transformer encoder layer.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      hidden_size: (optional) int. Size of the hidden dimension.
      intermediate_size: (optional) int. Size of the intermediate dimension.
      intermediate_act_fn: (optional) Activation function for the
        intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the
        hidden states.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      block_size: (optional) int. Size of one block in the sequence.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      seed: (optional) int. Random seed for generating the random mask.
      name: The name scope of this layer.
    """
    super(PostnormEncoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Multi-headed attention sub-layer; per-head size follows from the
    # hidden size and head count.
    size_per_head = hidden_size // num_attention_heads
    self.attn_layer = attention.MultiHeadedAttentionLayer(
        attention_type, num_attention_heads, num_rand_blocks, size_per_head,
        initializer_range, block_size, block_size,
        attention_probs_dropout_prob, use_bias, seed, name="self")

    # Dense layers: project the attention output back to the model
    # dimension, then expand -> contract through the feed-forward network.
    self.projection_layer = utils.Dense3dProjLayer(
        num_attention_heads, size_per_head,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range), None,
        "dense")

    # Norms applied after attention and after the FFN (post-norm style).
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
Exemplo n.º 9
0
    def __init__(self,
                 hidden_size,
                 vocab_size,
                 embeder,
                 input_tensor,
                 initializer=None,
                 activation_fn=None,
                 name="cls/predictions",
                 label_ids=None,
                 label_weights=None,
                 masked_lm_positions=None):
        """Masked-LM head: builds log-probs and (optionally) the MLM loss.

        Args:
          hidden_size: int. Size of the hidden dimension (e.g. 768).
          vocab_size: int. Vocabulary size (e.g. 50358).
          embeder: Embedding layer; its `linear` method projects hidden
            states onto the vocabulary (weights tied to input embeddings).
          input_tensor: float Tensor of hidden states to predict from.
          initializer: (optional) Weight initializer for the transform layer.
          activation_fn: (optional) Activation for the transform layer.
          name: Variable scope name for this head.
          label_ids: (optional) int Tensor of target token ids; if None, the
            loss is a constant 0.0.
          label_weights: (optional) float Tensor with 1.0 for real
            predictions and 0.0 for padding positions.
          masked_lm_positions: (optional) int Tensor of positions to gather
            from `input_tensor` before prediction.
        """
        super(MaskedLMLayer, self).__init__(name=name)
        self.hidden_size = hidden_size  # hidden dimension (e.g. 768)
        self.vocab_size = vocab_size  # vocabulary size (e.g. 50358)
        self.embeder = embeder  # embedding layer used for weight tying

        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        self.extra_layer = utils.Dense2dLayer(  # non-linear transform (e.g. gelu)
            hidden_size, initializer, activation_fn, "transform")
        self.norm_layer = utils.NormLayer("transform")  # normalization layer

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.output_bias = tf.compat.v1.get_variable(
            name + "/output_bias",
            shape=[vocab_size],  # one bias per vocabulary entry
            initializer=tf.zeros_initializer())

        if masked_lm_positions is not None:  # gather only the masked positions
            input_tensor = tf.gather(input_tensor,
                                     masked_lm_positions,
                                     batch_dims=1)

        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.compat.v1.variable_scope("transform") as sc:
            input_tensor = self.extra_layer(
                input_tensor, scope=sc
            )  # linear transform followed by the activation, e.g. (4, 75, 768)
            input_tensor = self.norm_layer(input_tensor,
                                           scope=sc)  # apply normalization

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        logits = self.embeder.linear(
            input_tensor
        )  # project onto the vocabulary, e.g. (4, 75, 768) -> (4, 75, 50358)
        logits = tf.nn.bias_add(logits, self.output_bias)  # add output bias
        self.log_probs = tf.nn.log_softmax(logits, axis=-1)  # log-probabilities

        if label_ids is not None:
            one_hot_labels = tf.one_hot(
                label_ids, depth=self.vocab_size,
                dtype=tf.float32)  # one-hot targets over the vocabulary

            # The `positions` tensor might be zero-padded (if the sequence is too
            # short to have the maximum number of predictions). The `label_weights`
            # tensor has a value of 1.0 for every real prediction and 0.0 for the
            # padding predictions.
            per_example_loss = -tf.reduce_sum(self.log_probs * one_hot_labels,
                                              axis=-1)  # cross-entropy per position
            numerator = tf.reduce_sum(
                label_weights * per_example_loss)  # sum over real predictions only
            denominator = tf.reduce_sum(label_weights) + 1e-5  # guard against /0
            self.loss = numerator / denominator  # mean loss over real predictions
        else:
            self.loss = tf.constant(0.0)