def forward(self, inp):
    if self.div_val == 1:
        embed = self.emb_layers[0](inp)
        if self.d_proj != self.d_embed:
            embed = F.linear(embed, self.emb_projs[0])
    else:
        inp_flat = paddle.reshape(inp, shape=[-1])
        emb_flat = paddle.zeros(
            [inp_flat.shape[0], self.d_proj], dtype=global_dtype)
        for i in range(len(self.cutoffs)):
            l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
            mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
            indices_i = paddle.nonzero(mask_i).squeeze([1])
            if indices_i.numel() == 0:
                continue
            inp_i = paddle.gather(inp_flat, indices_i, axis=0) - l_idx
            emb_i = self.emb_layers[i](inp_i)
            emb_i = F.linear(emb_i, self.emb_projs[i])
            emb_flat = paddle.scatter(emb_flat, indices_i, emb_i)
        # list.append returns None, so concatenate the target shape instead
        embed = paddle.reshape(
            emb_flat, shape=inp.shape + [self.d_proj])

    embed = embed * self.emb_scale

    return embed
def _compute_logits(self, hidden, weight, bias, proj=None):
    if proj is None:
        logit = F.linear(hidden, weight.t(), bias=bias)
    else:
        proj_hid = F.linear(hidden, proj)
        logit = F.linear(proj_hid, weight.t(), bias=bias)
    return logit
def f(w1, w2, x1, x2):
    Z1 = F.linear(x1, w1)
    Z2 = F.linear(x2, w2)
    S1 = safe_divide(R, Z1)
    S2 = safe_divide(R, Z2)
    C1 = x1 * self.gradprop(Z1, x1, S1)[0]
    C2 = x2 * self.gradprop(Z2, x2, S2)[0]
    return C1 + C2
def forward(self, input):
    if self.activation:
        out = F.linear(input, self.weight * self.scale)
        out = fused_leaky_relu(out, self.bias * self.lr_mul)
    else:
        out = F.linear(input, self.weight * self.scale,
                       bias=self.bias * self.lr_mul)
    return out
def f(R, w1, w2, x1, x2):
    R_nonzero = R.not_equal(ZERO_TENSOR).astype(R.dtype)
    Za1 = F.linear(x1, w1) * R_nonzero
    Za2 = -F.linear(x1, w2) * R_nonzero
    Zb1 = -F.linear(x2, w1) * R_nonzero
    Zb2 = F.linear(x2, w2) * R_nonzero

    C1 = pos_prop(R, Za1, Za2, x1)
    C2 = pos_prop(R, Zb1, Zb2, x2)
    return C1 + C2
def first_prop(pd, px, nx, pw, nw):
    Rpp = F.linear(px, pw) * pd
    Rpn = F.linear(px, nw) * pd
    Rnp = F.linear(nx, pw) * pd
    Rnn = F.linear(nx, nw) * pd
    Pos = (Rpp + Rnn).sum(dim=-1, keepdim=True)
    Neg = (Rpn + Rnp).sum(dim=-1, keepdim=True)

    Z1 = F.linear(px, pw)
    Z2 = F.linear(px, nw)
    Z3 = F.linear(nx, pw)
    Z4 = F.linear(nx, nw)

    S1 = safe_divide(Rpp, Z1)
    S2 = safe_divide(Rpn, Z2)
    S3 = safe_divide(Rnp, Z3)
    S4 = safe_divide(Rnn, Z4)

    C1 = px * self.gradprop(Z1, px, S1)[0]
    C2 = px * self.gradprop(Z2, px, S2)[0]
    C3 = nx * self.gradprop(Z3, nx, S3)[0]
    C4 = nx * self.gradprop(Z4, nx, S4)[0]

    bp = self.bias * pd * safe_divide(Pos, Pos + Neg)
    bn = self.bias * pd * safe_divide(Neg, Pos + Neg)

    Sb1 = safe_divide(bp, Z1)
    Sb2 = safe_divide(bn, Z2)

    Cb1 = px * self.gradprop(Z1, px, Sb1)[0]
    Cb2 = px * self.gradprop(Z2, px, Sb2)[0]

    return C1 + C4 + Cb1 + C2 + C3 + Cb2
def functional(self, place):
    paddle.disable_static(place)
    input = paddle.to_tensor(self.input)
    weight = paddle.to_tensor(self.weight)
    bias = paddle.to_tensor(self.bias)
    out = F.linear(input, weight, bias)
    return out.numpy()
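# A hedged, self-contained example of the same functional call (the shapes here are
# illustrative assumptions): paddle's F.linear expects weight shaped
# [in_features, out_features] and computes x @ weight + bias.
import paddle
import paddle.nn.functional as F

x = paddle.randn([4, 16])   # [batch, in_features]
w = paddle.randn([16, 32])  # [in_features, out_features]
b = paddle.zeros([32])      # [out_features]
y = F.linear(x, w, b)       # -> shape [4, 32]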
def forward(self, inputs):
    with paddle.no_grad():
        eps_in = self._scale_noise(self.epsilon_input.shape)
        eps_out = self._scale_noise(self.epsilon_output.shape)
        noise_v = paddle.multiply(eps_in, eps_out).detach()
    return F.linear(inputs,
                    self.weight + self.sigma_weight * noise_v.t(),
                    self.bias + self.sigma_bias * eps_out.squeeze().t())
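# A minimal sketch of the noise helper assumed by the NoisyLinear forward above.
# The name _scale_noise and its call signature come from the snippet; the body is an
# assumption based on the standard NoisyNet factorised-Gaussian transform
# f(x) = sign(x) * sqrt(|x|).
def _scale_noise(self, shape):
    noise = paddle.randn(shape)
    return paddle.sign(noise) * paddle.sqrt(paddle.abs(noise))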
def forward(self, input):
    if self._act_preprocess is not None:
        input = self._act_preprocess(input)
    quant_input = self._fake_quant_input(input)

    weight = self.weight
    if self._weight_preprocess is not None:
        weight = self._weight_preprocess(self.weight)
    quant_weight = self._fake_quant_weight(weight)

    out = F.linear(
        x=quant_input, weight=quant_weight, bias=self.bias, name=self.name)
    return out
def forward(self, input, label):
    cosine = F.linear(F.normalize(input), F.normalize(self.weight))
    sine = paddle.sqrt(
        paddle.clip(1.0 - paddle.pow(cosine, 2), min=0, max=1))
    phi = cosine * self.cos_m - sine * self.sin_m
    if self.easy_margin:
        phi = paddle.where(cosine > 0, phi, cosine)
    else:
        phi = paddle.where(cosine > self.th, phi, cosine - self.mm)
    one_hot = paddle.nn.functional.one_hot(label, self.class_dim)
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
    output *= self.s
    return output
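# For reference, a sketch of how the margin constants used in the ArcFace-style forward
# above are typically precomputed; the margin m and scale s values are illustrative
# assumptions, not taken from the snippet.
import math

m, s = 0.5, 64.0
cos_m, sin_m = math.cos(m), math.sin(m)  # terms of cos(theta + m)
th = math.cos(math.pi - m)               # threshold for the hard-margin branch
mm = math.sin(math.pi - m) * m           # penalty applied beyond the threshold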
def forward(self, x): """Forward feature from the regression head to get integral result of bounding box location. Args: x (Tensor): Features of the regression head, shape (N, 4*(n+1)), n is self.reg_max. Returns: x (Tensor): Integral result of box locations, i.e., distance offsets from the box center in four directions, shape (N, 4). """ x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1) x = F.linear(x, self.project).reshape([-1, 4]) return x
def forward(self, x):
    # use inner api to process identity
    if self.is_mp:
        input_parallel = paddle.distributed.collective._c_identity(
            x, group=self.model_parallel_group)
    else:
        input_parallel = x

    output_parallel = F.linear(
        input_parallel, self.weight, self.bias, name=self._name)

    if self.gather_output and self.is_mp:
        output = paddle.distributed.collective._c_concat(
            output_parallel, group=self.model_parallel_group)
    else:
        output = output_parallel
    return output
def forward(self, input, expand_ratio=None, channel=None):
    self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel}
    ### weight: (Cin, Cout)
    in_nc = int(input.shape[-1])
    assert (
        expand_ratio is None or channel is None
    ), "expand_ratio and channel CANNOT both be set at the same time."
    if expand_ratio is not None:
        out_nc = int(expand_ratio * self.base_output_dim)
    elif channel is not None:
        out_nc = int(channel)
    else:
        out_nc = self._out_features

    weight = self.weight[:in_nc, :out_nc]
    if self._bias_attr != False:
        bias = self.bias[:out_nc]
    else:
        bias = self.bias

    out = F.linear(x=input, weight=weight, bias=bias, name=self.name)
    return out
def forward(self, x):
    if self.input_is_parallel or (not self.is_mp):
        input_parallel = x
    else:
        # split last dim
        input_parallel = paddle.distributed.collective._c_split(
            x, group=self.model_parallel_group)

    output_parallel = F.linear(input_parallel, self.weight, name=self._name)

    if self.is_mp:
        output_ = paddle.distributed.collective._mp_allreduce(
            output_parallel,
            group=self.model_parallel_group,
            use_calc_stream=True,
            use_model_parallel=True)
    else:
        output_ = output_parallel

    output = output_ + self.bias if self.bias is not None else output_
    return output
def forward(self, total_features, norm_weight):
    logits = linear(total_features, paddle.t(norm_weight))
    return logits
def forward(self, x, params=None, bn_training=True):
    """
    :param x: input image
    :param params:
    :param bn_training: set False to not update
    :return: classification output
    """
    if params is None:
        params = self.vars

    weight, bias = params[0], params[1]  # 1st conv layer
    x = F.conv2d(x, weight, bias, stride=1, padding=1)

    weight, bias = params[2], params[3]  # 1st BN layer
    running_mean, running_var = self.vars_bn[0], self.vars_bn[1]
    x = F.batch_norm(x, running_mean, running_var, weight=weight, bias=bias,
                     training=bn_training)
    x = F.relu(x)  # 1st relu
    x = F.max_pool2d(x, kernel_size=2)  # 1st max-pool layer

    weight, bias = params[4], params[5]  # 2nd conv layer
    x = F.conv2d(x, weight, bias, stride=1, padding=1)

    weight, bias = params[6], params[7]  # 2nd BN layer
    running_mean, running_var = self.vars_bn[2], self.vars_bn[3]
    x = F.batch_norm(x, running_mean, running_var, weight=weight, bias=bias,
                     training=bn_training)
    x = F.relu(x)  # 2nd relu
    x = F.max_pool2d(x, kernel_size=2)  # 2nd max-pool layer

    weight, bias = params[8], params[9]  # 3rd conv layer
    x = F.conv2d(x, weight, bias, stride=1, padding=1)

    weight, bias = params[10], params[11]  # 3rd BN layer
    running_mean, running_var = self.vars_bn[4], self.vars_bn[5]
    x = F.batch_norm(x, running_mean, running_var, weight=weight, bias=bias,
                     training=bn_training)
    x = F.relu(x)  # 3rd relu
    x = F.max_pool2d(x, kernel_size=2)  # 3rd max-pool layer

    weight, bias = params[12], params[13]  # 4th conv layer
    x = F.conv2d(x, weight, bias, stride=1, padding=1)

    weight, bias = params[14], params[15]  # 4th BN layer
    running_mean, running_var = self.vars_bn[6], self.vars_bn[7]
    x = F.batch_norm(x, running_mean, running_var, weight=weight, bias=bias,
                     training=bn_training)
    x = F.relu(x)  # 4th relu
    x = F.max_pool2d(x, kernel_size=2)  # 4th max-pool layer

    x = paddle.reshape(x, [x.shape[0], -1])  # flatten
    weight, bias = params[-2], params[-1]  # final linear layer
    x = F.linear(x, weight, bias)

    output = x
    return output
def test_static(self):
    paddle.enable_static()
    default_main_program().random_seed = 42
    dtype = "float32"
    layer_norm_dtype = "float32"
    batch_size = 1
    d_model = 8
    dim_feedforward = 8

    x = paddle.static.data(
        name='x', shape=[batch_size, d_model, dim_feedforward], dtype=dtype)
    linear1_weight = paddle.static.data(
        name='linear1_weight', shape=[d_model, dim_feedforward], dtype=dtype)
    linear1_bias = paddle.static.data(
        name='linear1_bias', shape=[dim_feedforward])
    linear2_weight = paddle.static.data(
        name='linear2_weight', shape=[dim_feedforward, d_model], dtype=dtype)
    linear2_bias = paddle.static.data(name='linear2_bias', shape=[d_model])
    ln1_scale = paddle.static.data(name='ln1_scale', shape=[d_model])
    ln1_bias = paddle.static.data(name='ln1_bias', shape=[d_model])
    ln2_scale = paddle.static.data(name='ln2_scale', shape=[d_model])
    ln2_bias = paddle.static.data(name='ln2_bias', shape=[d_model])

    fused_out = incubate_f.fused_feedforward(
        x,
        linear1_weight,
        linear2_weight,
        linear1_bias,
        linear2_bias,
        ln1_scale,
        ln1_bias,
        ln2_scale,
        ln2_bias,
        0.0,
        0.0,
        activation="relu",
        pre_layer_norm=False)

    ######base ffn######
    linear1_out = F.linear(x, linear1_weight, linear1_bias)
    act_out = F.relu(linear1_out)
    dropout1_out = F.dropout(x=act_out, p=0.0, training=False)
    linear2_out = F.linear(dropout1_out, linear2_weight, linear2_bias)
    dropout2_out = x + F.dropout(x=linear2_out, p=0.0, training=False)
    ln_out = F.layer_norm(
        dropout2_out,
        normalized_shape=list([d_model]),
        weight=ln2_scale,
        bias=ln2_bias)
    ######base ffn######

    exe = paddle.static.Executor(paddle.CUDAPlace(0))

    x_data = np.random.random(
        (batch_size, d_model, dim_feedforward)).astype(dtype)
    linear1_weight_data = np.random.random(
        (d_model, dim_feedforward)).astype(dtype)
    linear1_bias_data = np.zeros((dim_feedforward)).astype(dtype)
    linear2_weight_data = np.random.random(
        (dim_feedforward, d_model)).astype(dtype)
    linear2_bias_data = np.zeros((d_model)).astype(dtype)
    ln1_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
    ln1_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)
    ln2_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
    ln2_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)

    res_list = [fused_out, ln_out]
    real_res = []

    for res in res_list:
        fetch = exe.run(
            feed={
                'x': x_data,
                'linear1_weight': linear1_weight_data,
                'linear1_bias': linear1_bias_data,
                'linear2_weight': linear2_weight_data,
                'linear2_bias': linear2_bias_data,
                'ln1_scale': ln1_scale_data,
                'ln1_bias': ln1_bias_data,
                'ln2_scale': ln2_scale_data,
                'ln2_bias': ln2_bias_data
            },
            fetch_list=[res])
        real_res.append(fetch)
    self.assertTrue(
        np.allclose(real_res[0], real_res[1], atol=1e-3),
        "fused_feedforward output does not match the reference FFN output")