def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, bias_attr=None, topo=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self.self_attn = MultiHeadAttention(d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], topo=topo) if topo is None or topo.mp_info.size == 1: self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) else: self.linear1 = paddlenlp.ops.ColumnParallelLiner( (d_model, dim_feedforward), topo.mp_info.size, gather_out=False, param_attr=weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = paddlenlp.ops.RowParallelLiner( (dim_feedforward, d_model), topo.mp_info.size, input_is_parallel=True, param_attr=weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation)
def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, bias_attr=None, num_partitions=1): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], num_partitions=num_partitions) self.linear1 = fleet.meta_parallel.ColumnParallelLinear( d_model, dim_feedforward, weight_attr=weight_attrs[2], gather_output=False, has_bias=True) self.linear2 = fleet.meta_parallel.RowParallelLinear( dim_feedforward, d_model, weight_attr=weight_attrs[2], input_is_parallel=True, has_bias=True) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation)
def __init__(self, d_model, nhead, dim_feedforward, dropout_rate=0.1, activation="relu", attn_dropout_rate=None, act_dropout_rate=None, normalize_before=False, weight_attr=None, bias_attr=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(FusedTransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " "but recieved {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " "but recieved {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " "but recieved {}".format(dim_feedforward)) attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.fused_attn = FusedMultiHeadAttention( d_model, nhead, dropout_rate=dropout_rate, attn_dropout_rate=attn_dropout_rate, normalize_before=self.normalize_before, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) self.ffn = FusedFeedForward(d_model, dim_feedforward, dropout_rate=dropout_rate, activation=activation, act_dropout_rate=act_dropout_rate, normalize_before=self.normalize_before, weight_attr=weight_attrs[1], bias_attr=bias_attrs[1])
def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, bias_attr=None, mesh_idx=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 self.mesh_idx = mesh_idx super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self.self_attn = MultiHeadAttention(d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], mesh_idx=self.mesh_idx) self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation)
def setUp(self):
    paddle.disable_static()
    self.__class__.op_type = "fused_feedforward"
    # check grad in test_out_and_grad()
    self.__class__.no_need_check_grad = True
    self.getDtype()
    self.getShape()
    self.getDiff()
    self.getActivation()
    self.getNormalizeBefore()
    paddle.set_default_dtype(self.dtype)
    self.weight_attr = None
    self.bias_attr = None

    self.weight_attrs = transformer._convert_param_attr_to_list(
        self.weight_attr, 2)
    self.bias_attrs = transformer._convert_param_attr_to_list(
        self.bias_attr, 2)
    self.linear1 = Linear(
        self.d_model,
        self.dim_feedforward,
        self.weight_attrs[1],
        bias_attr=self.bias_attrs[1])
    self.linear2 = Linear(
        self.dim_feedforward,
        self.d_model,
        self.weight_attrs[1],
        bias_attr=self.bias_attrs[1])

    paddle.set_default_dtype(self.layer_norm_dtype)
    self.norm1 = LayerNorm(self.d_model)
    self.norm2 = LayerNorm(self.d_model)
    self.dropout = Dropout(0.0, mode="upscale_in_train")
    self.dropout1 = Dropout(0.0, mode="upscale_in_train")
    self.dropout2 = Dropout(0.0, mode="upscale_in_train")
    self.activation = getattr(F, self.act_method)

    self.src = np.random.random((self.batch_size, self.query_length,
                                 self.d_model)).astype(self.dtype)
    self.dout = np.random.random((self.batch_size, self.query_length,
                                  self.d_model)).astype(self.dtype)
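# Hedged sketch of the reference (unfused) feed-forward path that a setUp like the
# one above is typically checked against. Attribute names follow the fields defined
# in setUp; pre_layer_norm is assumed to be set by getNormalizeBefore(), and the
# method name Base is an assumption, not the source.
def Base(self):
    src = paddle.to_tensor(self.src, stop_gradient=False)
    residual = src
    if self.pre_layer_norm:
        src = self.norm1(src)
    # linear1 -> activation -> dropout -> linear2, then residual add + dropout.
    out = self.linear2(self.dropout(self.activation(self.linear1(src))))
    out = residual + self.dropout2(out)
    if not self.pre_layer_norm:
        out = self.norm2(out)
    return out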
def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, bias_attr=None, topo=None, **kwargs): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self._fuse = kwargs.get('fuse', False) if self._fuse: nranks, ring_id = 1, -1 if topo is not None and topo.mp_info.size > 1: nranks = topo.mp_info.size ring_id = 0 self.self_attn = incubate.nn.FusedMultiHeadAttention( d_model, nhead, dropout_rate=dropout, attn_dropout_rate=attn_dropout, normalize_before=normalize_before, qkv_weight_attr=weight_attrs[0], qkv_bias_attr=bias_attrs[0], linear_weight_attr=weight_attrs[0], linear_bias_attr=bias_attrs[0], epsilon=1e-5, nranks=nranks, ring_id=ring_id) self.ffn = incubate.nn.FusedFeedForward( d_model, dim_feedforward, dropout_rate=act_dropout, epsilon=1e-5, activation=activation, normalize_before=normalize_before, act_dropout_rate=0.0, linear1_weight_attr=weight_attrs[2], linear1_bias_attr=bias_attrs[2], linear2_weight_attr=weight_attrs[2], linear2_bias_attr=bias_attrs[2], nranks=nranks, ring_id=ring_id) else: self.self_attn = MultiHeadAttention(d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], topo=topo) if topo is None or topo.mp_info.size == 1: self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) else: self.linear1 = paddlenlp.ops.ColumnParallelLiner( (d_model, dim_feedforward), topo.mp_info.size, gather_out=False, param_attr=weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = paddlenlp.ops.RowParallelLiner( (dim_feedforward, d_model), topo.mp_info.size, input_is_parallel=True, param_attr=weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation)
def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, bias_attr=None, num_partitions=1, expert_mode=False, num_experts=1, top_k=2, hcg=None, gate=None, recompute_interval=0): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.recompute_interval = recompute_interval # moe config self.top_k = top_k self.num_experts = num_experts self.expert_mode = expert_mode self.hcg = hcg weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self.self_attn = MultiHeadAttention(d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], num_partitions=num_partitions) if expert_mode: import os experts_list = nn.LayerList() for expi in range(num_experts): exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts) experts_list.append(exp_layer) moe_group_size = hcg.get_expert_parallel_world_size() moe_group = hcg.get_expert_parallel_group() mp_group = hcg.get_model_parallel_group() gate_config = { "type": "gshard", "top_k": top_k, } self.moe_mlp = MoeLayer(d_model=d_model, experts=experts_list, gate_config=gate_config, moe_group=moe_group, mp_group=mp_group, recompute_interval=self.recompute_interval) else: self.linear1 = fleet.meta_parallel.ColumnParallelLinear( d_model, dim_feedforward, weight_attr=weight_attrs[2], gather_output=False, has_bias=True) self.linear2 = fleet.meta_parallel.RowParallelLinear( dim_feedforward, d_model, weight_attr=weight_attrs[2], input_is_parallel=True, has_bias=True) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation)