Example #1
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="gelu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=True,
                 weight_attr=None,
                 bias_attr=None,
                 topo=None):
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerDecoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)

        self.self_attn = MultiHeadAttention(d_model,
                                            nhead,
                                            dropout=attn_dropout,
                                            weight_attr=weight_attrs[0],
                                            bias_attr=bias_attrs[0],
                                            topo=topo)
        if topo is None or topo.mp_info.size == 1:
            self.linear1 = nn.Linear(d_model,
                                     dim_feedforward,
                                     weight_attrs[2],
                                     bias_attr=bias_attrs[2])
            self.linear2 = nn.Linear(dim_feedforward,
                                     d_model,
                                     weight_attrs[2],
                                     bias_attr=bias_attrs[2])
        else:
            self.linear1 = paddlenlp.ops.ColumnParallelLiner(
                (d_model, dim_feedforward),
                topo.mp_info.size,
                gather_out=False,
                param_attr=weight_attrs[2],
                bias_attr=bias_attrs[2])
            self.linear2 = paddlenlp.ops.RowParallelLiner(
                (dim_feedforward, d_model),
                topo.mp_info.size,
                input_is_parallel=True,
                param_attr=weight_attrs[2],
                bias_attr=bias_attrs[2])

        self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
        self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
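
As a quick orientation, below is a minimal instantiation sketch for the layer above. The import path is hypothetical (the class is defined in a GPT modeling module alongside MultiHeadAttention and _convert_param_attr_to_list), the sizes are illustrative, and passing topo=None selects the plain nn.Linear feed-forward branch.

    # Hypothetical usage sketch; the import path and sizes are assumptions.
    from modeling import TransformerDecoderLayer  # hypothetical module path

    decoder_layer = TransformerDecoderLayer(
        d_model=1024,
        nhead=16,
        dim_feedforward=4096,
        dropout=0.1,
        topo=None)  # topo=None (or mp size 1) keeps the plain nn.Linear FFN
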
Example #2
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="gelu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=True,
                 weight_attr=None,
                 bias_attr=None,
                 num_partitions=1):
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerDecoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)

        self.self_attn = MultiHeadAttention(
            d_model,
            nhead,
            dropout=attn_dropout,
            weight_attr=weight_attrs[0],
            bias_attr=bias_attrs[0],
            num_partitions=num_partitions)

        self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
            d_model,
            dim_feedforward,
            weight_attr=weight_attrs[2],
            gather_output=False,
            has_bias=True)

        self.linear2 = fleet.meta_parallel.RowParallelLinear(
            dim_feedforward,
            d_model,
            weight_attr=weight_attrs[2],
            input_is_parallel=True,
            has_bias=True)

        self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
        self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
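
The ColumnParallelLinear/RowParallelLinear pair used above only works inside an initialized hybrid-parallel environment whose model-parallel degree matches num_partitions. A hedged setup sketch follows; the decoder-layer import is hypothetical, while the fleet calls are standard Paddle hybrid-parallel initialization.

    # Hedged sketch: launch with `python -m paddle.distributed.launch` on 2 GPUs;
    # the TransformerDecoderLayer import path is an assumption.
    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {"dp_degree": 1, "mp_degree": 2, "pp_degree": 1}
    fleet.init(is_collective=True, strategy=strategy)

    layer = TransformerDecoderLayer(d_model=1024,
                                    nhead=16,
                                    dim_feedforward=4096,
                                    num_partitions=2)  # must equal mp_degree
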
Example #3
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout_rate=0.1,
                 activation="relu",
                 attn_dropout_rate=None,
                 act_dropout_rate=None,
                 normalize_before=False,
                 weight_attr=None,
                 bias_attr=None):
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(FusedTransformerEncoderLayer, self).__init__()
        assert d_model > 0, ("Expected d_model to be greater than 0, "
                             "but received {}".format(d_model))
        assert nhead > 0, ("Expected nhead to be greater than 0, "
                           "but received {}".format(nhead))
        assert dim_feedforward > 0, (
            "Expected dim_feedforward to be greater than 0, "
            "but received {}".format(dim_feedforward))
        attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate
        act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)

        self.fused_attn = FusedMultiHeadAttention(
            d_model,
            nhead,
            dropout_rate=dropout_rate,
            attn_dropout_rate=attn_dropout_rate,
            normalize_before=self.normalize_before,
            weight_attr=weight_attrs[0],
            bias_attr=bias_attrs[0])

        self.ffn = FusedFeedForward(d_model,
                                    dim_feedforward,
                                    dropout_rate=dropout_rate,
                                    activation=activation,
                                    act_dropout_rate=act_dropout_rate,
                                    normalize_before=self.normalize_before,
                                    weight_attr=weight_attrs[1],
                                    bias_attr=bias_attrs[1])
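
This constructor signature matches paddle.incubate.nn.FusedTransformerEncoderLayer. Assuming that is the class shown, a short usage sketch follows (shapes are illustrative; the fused kernels require a GPU build of Paddle).

    import paddle
    from paddle.incubate.nn import FusedTransformerEncoderLayer

    # encoder input: [batch_size, src_len, d_model]
    enc_input = paddle.rand((2, 4, 128))
    # attention mask broadcastable to [batch_size, nhead, src_len, src_len]
    attn_mask = paddle.rand((2, 2, 4, 4))
    encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
    enc_output = encoder_layer(enc_input, attn_mask)  # shape [2, 4, 128]
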
Example #4
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="gelu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=True,
                 weight_attr=None,
                 bias_attr=None,
                 mesh_idx=None):
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3
        self.mesh_idx = mesh_idx
        super(TransformerDecoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before
        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
        self.self_attn = MultiHeadAttention(d_model,
                                            nhead,
                                            dropout=attn_dropout,
                                            weight_attr=weight_attrs[0],
                                            bias_attr=bias_attrs[0],
                                            mesh_idx=self.mesh_idx)
        self.linear1 = nn.Linear(d_model,
                                 dim_feedforward,
                                 weight_attrs[2],
                                 bias_attr=bias_attrs[2])
        self.linear2 = nn.Linear(dim_feedforward,
                                 d_model,
                                 weight_attrs[2],
                                 bias_attr=bias_attrs[2])
        self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
        self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
Example #5
    def setUp(self):
        paddle.disable_static()
        self.__class__.op_type = "fused_feedforward"
        # check grad in test_out_and_grad()
        self.__class__.no_need_check_grad = True
        self.getDtype()
        self.getShape()
        self.getDiff()
        self.getActivation()
        self.getNormalizeBefore()
        paddle.set_default_dtype(self.dtype)
        self.weight_attr = None
        self.bias_attr = None

        self.weight_attrs = transformer._convert_param_attr_to_list(
            self.weight_attr, 2)
        self.bias_attrs = transformer._convert_param_attr_to_list(
            self.bias_attr, 2)
        self.linear1 = Linear(self.d_model,
                              self.dim_feedforward,
                              self.weight_attrs[1],
                              bias_attr=self.bias_attrs[1])
        self.linear2 = Linear(self.dim_feedforward,
                              self.d_model,
                              self.weight_attrs[1],
                              bias_attr=self.bias_attrs[1])

        paddle.set_default_dtype(self.layer_norm_dtype)
        self.norm1 = LayerNorm(self.d_model)
        self.norm2 = LayerNorm(self.d_model)
        self.dropout = Dropout(0.0, mode="upscale_in_train")
        self.dropout1 = Dropout(0.0, mode="upscale_in_train")
        self.dropout2 = Dropout(0.0, mode="upscale_in_train")
        self.activation = getattr(F, self.act_method)

        self.src = np.random.random((self.batch_size, self.query_length,
                                     self.d_model)).astype(self.dtype)
        self.dout = np.random.random((self.batch_size, self.query_length,
                                      self.d_model)).astype(self.dtype)
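
For context, here is a hedged sketch of the unfused reference path such a test typically builds from the layers created in setUp; the method name and the pre_layer_norm flag (presumably set by getNormalizeBefore()) are assumptions, and the computation mirrors what the fused_feedforward op performs.

    def BaseReference(self):
        # Hypothetical unfused reference; LayerNorm placement follows
        # self.pre_layer_norm (assumed to be set in getNormalizeBefore()).
        src = paddle.to_tensor(self.src, stop_gradient=False)
        residual = src
        if self.pre_layer_norm:
            src = self.norm1(src)
        out = self.linear2(self.dropout(self.activation(self.linear1(src))))
        out = residual + self.dropout2(out)
        if not self.pre_layer_norm:
            out = self.norm2(out)
        return out
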
Example #6
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="gelu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=True,
                 weight_attr=None,
                 bias_attr=None,
                 topo=None,
                 **kwargs):
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerDecoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)

        self._fuse = kwargs.get('fuse', False)
        if self._fuse:
            nranks, ring_id = 1, -1
            if topo is not None and topo.mp_info.size > 1:
                nranks = topo.mp_info.size
                ring_id = 0
            self.self_attn = incubate.nn.FusedMultiHeadAttention(
                d_model,
                nhead,
                dropout_rate=dropout,
                attn_dropout_rate=attn_dropout,
                normalize_before=normalize_before,
                qkv_weight_attr=weight_attrs[0],
                qkv_bias_attr=bias_attrs[0],
                linear_weight_attr=weight_attrs[0],
                linear_bias_attr=bias_attrs[0],
                epsilon=1e-5,
                nranks=nranks,
                ring_id=ring_id)
            self.ffn = incubate.nn.FusedFeedForward(
                d_model,
                dim_feedforward,
                dropout_rate=act_dropout,
                epsilon=1e-5,
                activation=activation,
                normalize_before=normalize_before,
                act_dropout_rate=0.0,
                linear1_weight_attr=weight_attrs[2],
                linear1_bias_attr=bias_attrs[2],
                linear2_weight_attr=weight_attrs[2],
                linear2_bias_attr=bias_attrs[2],
                nranks=nranks,
                ring_id=ring_id)
        else:
            self.self_attn = MultiHeadAttention(d_model,
                                                nhead,
                                                dropout=attn_dropout,
                                                weight_attr=weight_attrs[0],
                                                bias_attr=bias_attrs[0],
                                                topo=topo)
            if topo is None or topo.mp_info.size == 1:
                self.linear1 = nn.Linear(d_model,
                                         dim_feedforward,
                                         weight_attrs[2],
                                         bias_attr=bias_attrs[2])
                self.linear2 = nn.Linear(dim_feedforward,
                                         d_model,
                                         weight_attrs[2],
                                         bias_attr=bias_attrs[2])
            else:
                self.linear1 = paddlenlp.ops.ColumnParallelLiner(
                    (d_model, dim_feedforward),
                    topo.mp_info.size,
                    gather_out=False,
                    param_attr=weight_attrs[2],
                    bias_attr=bias_attrs[2])
                self.linear2 = paddlenlp.ops.RowParallelLiner(
                    (dim_feedforward, d_model),
                    topo.mp_info.size,
                    input_is_parallel=True,
                    param_attr=weight_attrs[2],
                    bias_attr=bias_attrs[2])

            self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
            self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
            self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
            self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
            self.activation = getattr(F, activation)
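
The fuse switch is read from **kwargs; when it is set, the layer relies on paddle.incubate.nn.FusedMultiHeadAttention and FusedFeedForward, which fold layer norm and dropout into the fused kernels, which is why the separate norm/dropout modules are only created in the unfused branch. A hedged instantiation sketch (hypothetical import path, illustrative sizes, GPU build required):

    # Hypothetical usage sketch; the import path and sizes are assumptions.
    layer = TransformerDecoderLayer(d_model=1024,
                                    nhead=16,
                                    dim_feedforward=4096,
                                    topo=None,   # single card: nranks=1, ring_id=-1
                                    fuse=True)   # picked up via kwargs.get('fuse')
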
Example #7
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="gelu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=True,
                 weight_attr=None,
                 bias_attr=None,
                 num_partitions=1,
                 expert_mode=False,
                 num_experts=1,
                 top_k=2,
                 hcg=None,
                 gate=None,
                 recompute_interval=0):
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerDecoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before
        self.recompute_interval = recompute_interval

        # moe config
        self.top_k = top_k
        self.num_experts = num_experts
        self.expert_mode = expert_mode
        self.hcg = hcg

        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)

        self.self_attn = MultiHeadAttention(d_model,
                                            nhead,
                                            dropout=attn_dropout,
                                            weight_attr=weight_attrs[0],
                                            bias_attr=bias_attrs[0],
                                            num_partitions=num_partitions)

        if expert_mode:
            import os
            experts_list = nn.LayerList()
            for expi in range(num_experts):
                exp_layer = ExpertLayer(d_model,
                                        dim_feedforward // top_k,
                                        windex=expi,
                                        num_expert=num_experts)
                experts_list.append(exp_layer)

            moe_group_size = hcg.get_expert_parallel_world_size()
            moe_group = hcg.get_expert_parallel_group()
            mp_group = hcg.get_model_parallel_group()
            gate_config = {
                "type": "gshard",
                "top_k": top_k,
            }
            self.moe_mlp = MoeLayer(d_model=d_model,
                                    experts=experts_list,
                                    gate_config=gate_config,
                                    moe_group=moe_group,
                                    mp_group=mp_group,
                                    recompute_interval=self.recompute_interval)
        else:
            self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
                d_model,
                dim_feedforward,
                weight_attr=weight_attrs[2],
                gather_output=False,
                has_bias=True)

            self.linear2 = fleet.meta_parallel.RowParallelLinear(
                dim_feedforward,
                d_model,
                weight_attr=weight_attrs[2],
                input_is_parallel=True,
                has_bias=True)

        self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
        self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
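
Note the expert sizing in the branch above: each ExpertLayer is created with hidden size dim_feedforward // top_k, so the top_k experts routed to a token together match the dense FFN width. A small illustrative check (the concrete values are assumptions):

    dim_feedforward, top_k = 4096, 2
    per_expert_hidden = dim_feedforward // top_k   # 2048, as in ExpertLayer above
    assert per_expert_hidden * top_k == dim_feedforward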