def setUp(self):
    self.config()
    self.generate_input_data()
    paddle.set_default_dtype(self.x_type)
    self.__class__.op_type = "fused_attention"
    # use autograd to check grad in this unittest.
    self.__class__.no_need_check_grad = True
    self.q_proj = Linear(
        self.embed_dim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.k_proj = Linear(
        self.kdim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.v_proj = Linear(
        self.vdim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.out_proj = Linear(
        self.embed_dim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    paddle.set_default_dtype(np.float32)
    self.norm1 = LayerNorm(self.embed_dim)
    self.norm2 = LayerNorm(self.embed_dim)
    paddle.set_default_dtype(self.x_type)
    self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
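# Illustrative sketch (not part of the original test): a plain, non-fused
# attention forward pass built from the layers created in setUp() above,
# i.e. the kind of baseline a fused_attention result is compared against.
# It assumes config()/generate_input_data() provide self.query,
# self.num_heads and self.head_dim; the pre-LayerNorm path is shown and the
# attention mask is omitted for brevity.
def GetBaselineOut_sketch(self):
    tensor_query = paddle.to_tensor(self.query, stop_gradient=False)
    residual = tensor_query

    ln_out = self.norm1(tensor_query)
    q, k, v = self.q_proj(ln_out), self.k_proj(ln_out), self.v_proj(ln_out)

    # Split heads: [batch, seq, embed_dim] -> [batch, num_heads, seq, head_dim].
    q = paddle.transpose(
        paddle.reshape(q, [0, 0, self.num_heads, self.head_dim]), [0, 2, 1, 3])
    k = paddle.transpose(
        paddle.reshape(k, [0, 0, self.num_heads, self.head_dim]), [0, 2, 1, 3])
    v = paddle.transpose(
        paddle.reshape(v, [0, 0, self.num_heads, self.head_dim]), [0, 2, 1, 3])

    # Scaled dot-product attention with dropout on the attention weights.
    product = paddle.matmul(q, k, transpose_y=True) * (self.head_dim**-0.5)
    weights = self.dropout(paddle.nn.functional.softmax(product))
    out = paddle.matmul(weights, v)

    # Merge heads, project back to embed_dim and add the residual.
    out = paddle.reshape(
        paddle.transpose(out, [0, 2, 1, 3]), [0, 0, self.embed_dim])
    return residual + self.out_proj(out)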
def __init__(self,
             d_model,
             nhead,
             dim_feedforward,
             dropout=0.1,
             activation="relu",
             normalize_before=False,
             weight_attr=None,
             bias_attr=None):
    self._config = locals()
    self._config.pop("self")
    self._config.pop("__class__", None)  # py3

    super(TransformerDecoderLayer, self).__init__()
    self.normalize_before = normalize_before
    weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
    bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
    self.nhead = nhead
    self.self_attn = MultiHeadAttention(
        d_model, nhead, dropout=dropout,
        weight_attr=weight_attrs[0], bias_attr=bias_attrs[0])
    self.multihead_attn = MultiHeadAttention(
        d_model, nhead, dropout=dropout,
        weight_attr=weight_attrs[1], bias_attr=bias_attrs[1])
    self.linear1 = Linear(
        d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])
    self.dropout = Dropout(dropout, mode="upscale_in_train")
    self.linear2 = Linear(
        dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout, mode="upscale_in_train")
    self.dropout2 = Dropout(dropout, mode="upscale_in_train")
    self.dropout3 = Dropout(dropout, mode="upscale_in_train")
    self.activation = getattr(F, activation)
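# Illustrative usage sketch (assumed shapes, not part of the layer code):
# the decoder layer consumes a target sequence and the encoder output, each
# of shape [batch_size, seq_len, d_model], plus optional attention masks of
# shape [batch_size, nhead, tgt_len, tgt_len] and
# [batch_size, nhead, tgt_len, src_len].
#
#     dec_input = paddle.rand((2, 4, 128))        # [batch, tgt_len, d_model]
#     enc_output = paddle.rand((2, 6, 128))       # [batch, src_len, d_model]
#     self_attn_mask = paddle.rand((2, 2, 4, 4))  # [batch, nhead, tgt_len, tgt_len]
#     cross_attn_mask = paddle.rand((2, 2, 4, 6)) # [batch, nhead, tgt_len, src_len]
#     decoder_layer = TransformerDecoderLayer(128, 2, 512)
#     output = decoder_layer(dec_input, enc_output,
#                            self_attn_mask, cross_attn_mask)  # [2, 4, 128]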
def setUp(self):
    self.config()
    self.generate_input_data()

    self.rtol = 1e-5
    # FIXME(limin29): Because there is a problem with the test precision
    # on A100, atol is temporarily set to 1e-2; it will be changed back
    # once the precision problem is solved.
    self.atol = 1e-2
    # Keep the stricter tolerance used during local development on V100.
    if "V100" in paddle.device.cuda.get_device_name():
        self.atol = 1e-4
    if self.x_type is np.float16:
        self.atol = 1e-1

    paddle.set_default_dtype(self.x_type)
    self.__class__.op_type = "fused_attention"
    # use autograd to check grad in this unittest.
    self.__class__.no_need_check_grad = True
    self.q_proj = Linear(
        self.embed_dim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.k_proj = Linear(
        self.kdim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.v_proj = Linear(
        self.vdim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.out_proj = Linear(
        self.embed_dim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    paddle.set_default_dtype(np.float32)
    self.norm1 = LayerNorm(self.embed_dim)
    self.norm2 = LayerNorm(self.embed_dim)
    paddle.set_default_dtype(self.x_type)
    self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
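# Illustrative sketch (hypothetical helpers, not the original test body):
# how the rtol/atol set in setUp() above are typically applied when the
# baseline output is compared with the fused_attention output.
# compute_reference() and compute_fused() stand in for the test's real
# baseline and fused-op methods and are assumed to return numpy arrays.
def test_output_sketch(self):
    ref_out = self.compute_reference()
    fused_out = self.compute_fused()
    np.testing.assert_allclose(ref_out, fused_out,
                               rtol=self.rtol, atol=self.atol)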
def setUp(self):
    paddle.disable_static()
    self.__class__.op_type = "fused_feedforward"
    # check grad in test_out_and_grad()
    self.__class__.no_need_check_grad = True
    self.getDtype()
    self.getShape()
    self.getDiff()
    self.getActivation()
    self.getNormalizeBefore()
    paddle.set_default_dtype(self.dtype)
    self.weight_attr = None
    self.bias_attr = None
    self.weight_attrs = transformer._convert_param_attr_to_list(
        self.weight_attr, 2)
    self.bias_attrs = transformer._convert_param_attr_to_list(
        self.bias_attr, 2)
    self.linear1 = Linear(
        self.d_model, self.dim_feedforward, self.weight_attrs[1],
        bias_attr=self.bias_attrs[1])
    self.linear2 = Linear(
        self.dim_feedforward, self.d_model, self.weight_attrs[1],
        bias_attr=self.bias_attrs[1])
    paddle.set_default_dtype(self.layer_norm_dtype)
    self.norm1 = LayerNorm(self.d_model)
    self.norm2 = LayerNorm(self.d_model)
    self.dropout = Dropout(0.0, mode="upscale_in_train")
    self.dropout1 = Dropout(0.0, mode="upscale_in_train")
    self.dropout2 = Dropout(0.0, mode="upscale_in_train")
    self.activation = getattr(F, self.act_method)
    self.src = np.random.random((self.batch_size, self.query_length,
                                 self.d_model)).astype(self.dtype)
    self.dout = np.random.random((self.batch_size, self.query_length,
                                  self.d_model)).astype(self.dtype)
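# Illustrative sketch (not part of the original test): the plain feed-forward
# reference path assembled from the layers created in setUp() above, which a
# fused_feedforward result can be checked against. self.pre_layer_norm is an
# assumed flag set by getNormalizeBefore(); both orderings are shown.
def Base_sketch(self):
    src = paddle.to_tensor(self.src, stop_gradient=False)
    residual = src
    if self.pre_layer_norm:
        # Pre-LayerNorm: normalize, then linear1 -> activation -> dropout -> linear2.
        src = self.norm1(src)
    out = self.linear2(self.dropout(self.activation(self.linear1(src))))
    out = residual + self.dropout2(out)
    if not self.pre_layer_norm:
        # Post-LayerNorm: normalize after the residual addition.
        out = self.norm2(out)
    return out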
def setUp(self):
    self.config()
    self.generate_input_data()

    self.rtol = 1e-5
    # FIXME(wangxi): Because there is a problem with the test precision
    # on A100, atol is temporarily set to 1e-2; it will be changed back
    # once the precision problem is solved.
    self.atol = 1e-2
    # Keep the stricter tolerance used during local development on V100.
    if "V100" in paddle.device.cuda.get_device_name():
        self.atol = 1e-4
    if self.x_type is np.float16:
        self.atol = 1e-1

    paddle.set_default_dtype(self.x_type)
    self.__class__.op_type = "fused_multi_transformer"
    # use autograd to check grad in this unittest.
    self.__class__.no_need_check_grad = False

    bias_attr = paddle.fluid.ParamAttr(
        initializer=paddle.fluid.initializer.Constant(value=0.0005))
    self.q_proj = Linear(
        self.embed_dim, self.embed_dim, self.weight_attr,
        bias_attr=bias_attr)  # originally bias_attr=self.bias_attr
    self.k_proj = Linear(
        self.kdim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.v_proj = Linear(
        self.vdim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.out_proj = Linear(
        self.embed_dim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)

    self.ffn1_proj = Linear(
        self.embed_dim, 4 * self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)
    self.ffn2_proj = Linear(
        4 * self.embed_dim, self.embed_dim, self.weight_attr,
        bias_attr=self.bias_attr)

    paddle.set_default_dtype(np.float32)
    self.norm = LayerNorm(self.embed_dim)
    self.ffn_norm = LayerNorm(self.embed_dim)
    paddle.set_default_dtype(self.x_type)
    self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
    self.activation = getattr(F, self.act_method)
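# Illustrative sketch (not part of the original test): the per-layer reference
# computation a fused_multi_transformer output can be compared against, here
# for a single pre-LayerNorm layer. The attention math mirrors the baseline in
# the fused_attention test above; self.query, self.num_heads and self.head_dim
# are assumed to come from config()/generate_input_data(), and the attention
# mask and cache are omitted for brevity.
def GetBaselineOut_sketch(self):
    def split_heads(t):
        # [batch, seq, embed_dim] -> [batch, num_heads, seq, head_dim]
        return paddle.transpose(
            paddle.reshape(t, [0, 0, self.num_heads, self.head_dim]),
            [0, 2, 1, 3])

    x = paddle.to_tensor(self.query, stop_gradient=False)

    # Self-attention block (pre-LN, residual connection).
    residual = x
    ln_out = self.norm(x)
    q, k, v = (split_heads(p(ln_out))
               for p in (self.q_proj, self.k_proj, self.v_proj))
    product = paddle.matmul(q, k, transpose_y=True) * (self.head_dim**-0.5)
    weights = self.dropout(paddle.nn.functional.softmax(product))
    attn = paddle.matmul(weights, v)
    attn = paddle.reshape(
        paddle.transpose(attn, [0, 2, 1, 3]), [0, 0, self.embed_dim])
    x = residual + self.out_proj(attn)

    # Feed-forward block (pre-LN, residual connection).
    residual = x
    ffn_out = self.ffn2_proj(self.activation(self.ffn1_proj(self.ffn_norm(x))))
    return residual + ffn_out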