Example No. 1
    def magnitude(self, x):
        """Compute the magnitude spectrogram.

        Args:
            x (tuple of Variable): the spectrogram as a (real, imag) pair.
            real (Variable): shape(B, C, 1, T), dtype float32, the real part of the spectrogram.
            imag (Variable): shape(B, C, 1, T), dtype float32, the imaginary part of the spectrogram.

        Returns:
            Variable: shape(B, C, 1, T), dtype float32, the magnitude spectrogram. It is the square root of the power spectrogram.
        """
        power = self.power(x)
        magnitude = F.sqrt(power)
        return magnitude
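For reference, a minimal NumPy sketch of the same computation, assuming self.power(x) returns real**2 + imag**2 for the (real, imag) pair (that helper is not shown above):

import numpy as np

def magnitude_np(real, imag):
    # power spectrogram: squared magnitude of the complex spectrogram
    power = real ** 2 + imag ** 2
    # magnitude spectrogram: element-wise square root of the power
    return np.sqrt(power)

real = np.random.randn(2, 1, 1, 8).astype("float32")   # (B, C, 1, T)
imag = np.random.randn(2, 1, 1, 8).astype("float32")
mag = magnitude_np(real, imag)                          # same shape (B, C, 1, T)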
Example No. 2
    def forward(self, pred, target):
        target = 1 - target[:, 0]
        batch_size, vector_size = pred.shape[0], pred.shape[1]

        pred = L.l2_normalize(pred, axis=1, epsilon=1e-10)

        square_norm = L.reduce_sum(L.square(pred), dim=1)
        dist = L.elementwise_add(-2.0 * L.matmul(pred, pred, transpose_y=True),
                                 square_norm,
                                 axis=0)
        dist = L.elementwise_add(dist, square_norm, axis=1)
        dist = L.elementwise_max(dist, L.zeros_like(dist))
        dist = L.sqrt(dist)

        ap_dist = L.reshape(dist, (0, 0, 1))
        an_dist = L.reshape(dist, (0, 1, -1))

        loss = L.expand(ap_dist, (1, 1, batch_size)) - L.expand(
            an_dist, (1, batch_size, 1)) + self.magin

        indice_equal = L.diag(
            L.fill_constant((batch_size, ), dtype='float32', value=1.0))
        indice_not_equal = 1.0 - indice_equal

        broad_matrix = L.expand(L.reshape(target, (-1, 1)),
                                (1, batch_size)) + L.expand(
                                    L.reshape(target, (1, -1)),
                                    (batch_size, 1))

        pp = L.cast(L.equal(broad_matrix, L.zeros_like(broad_matrix)),
                    dtype='float32')
        pp = L.reshape(indice_not_equal * pp, (0, 0, 1))

        pn = L.cast(L.equal(broad_matrix,
                            L.zeros_like(broad_matrix) + 1),
                    dtype='float32')
        pn = L.reshape(indice_not_equal * pn, (1, 0, -1))

        apn = L.expand(pp,
                       (1, 1, batch_size)) * L.expand(pn, (batch_size, 1, 1))

        loss = loss * L.cast(apn, dtype='float32')
        loss = L.elementwise_max(loss, L.zeros_like(loss))

        num_tri = L.reduce_sum(
            L.cast(L.greater_than(loss, L.zeros_like(loss)), dtype='float32'))

        loss = L.reduce_sum(loss) * self.loss_weight / (num_tri + 1e-16)

        return loss
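The pairwise-distance matrix above uses the identity ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2 before clamping and taking the square root. A NumPy sketch of that step alone (names are illustrative, not taken from the class):

import numpy as np

def pairwise_l2_distances(pred):
    # pred: (batch_size, vector_size), rows assumed L2-normalized
    square_norm = np.sum(pred ** 2, axis=1)                        # (B,)
    # ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2, broadcast over rows and columns
    dist_sq = square_norm[:, None] - 2.0 * (pred @ pred.T) + square_norm[None, :]
    dist_sq = np.maximum(dist_sq, 0.0)                             # clamp tiny negatives
    return np.sqrt(dist_sq)                                        # (B, B)

pred = np.random.randn(4, 8).astype("float32")
pred /= np.linalg.norm(pred, axis=1, keepdims=True) + 1e-10
print(pairwise_l2_distances(pred).shape)  # (4, 4)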
Example No. 3
    def func(self, place):
        shape = [2, 3, 7, 9]
        eps = 0.0001
        dtype = np.float64

        x = layers.data('x', shape, False, dtype)
        x.persistable = True

        y = layers.sqrt(x)
        x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)

        gradient_checker.double_grad_check(
            [x], y, x_init=x_arr, place=place, eps=eps)
        gradient_checker.double_grad_check_for_dygraph(
            self.sqrt_wrapper, [x], y, x_init=x_arr, place=place)
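As a rough illustration of what the gradient checker automates, a first-order finite-difference check of y = sqrt(x) against its analytic derivative 1/(2*sqrt(x)) looks like this (the test above additionally checks second-order gradients):

import numpy as np

x = np.random.uniform(0.1, 1, (2, 3)).astype(np.float64)
eps = 1e-4

analytic = 1.0 / (2.0 * np.sqrt(x))                          # d sqrt(x) / dx
numeric = (np.sqrt(x + eps) - np.sqrt(x - eps)) / (2 * eps)  # central difference
print(np.max(np.abs(analytic - numeric)))                    # small, e.g. ~1e-7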
Example No. 4
def norm_except_dim(p, dim):
    shape = p.shape
    ndims = len(shape)
    if dim is None:
        return F.sqrt(F.reduce_sum(F.square(p)))
    elif dim == 0:
        p_matrix = F.reshape(p, (shape[0], -1))
        return l2_norm(p_matrix, axis=1)
    elif dim == -1 or dim == ndims - 1:
        p_matrix = F.reshape(p, (-1, shape[-1]))
        return l2_norm(p_matrix, axis=0)
    else:
        perm = list(range(ndims))
        perm[0] = dim
        perm[dim] = 0
        p_transposed = F.transpose(p, perm)
        return norm_except_dim(p_transposed, 0)
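A NumPy sketch of the same idea, the L2 norm of p reduced over every axis except dim, as used in weight normalization (l2_norm above is assumed to return per-row or per-column norms of the reshaped matrix):

import numpy as np

def norm_except_dim_np(p, dim=None):
    # L2 norm over all axes except dim; dim=None reduces over everything
    if dim is None:
        return np.sqrt(np.sum(np.square(p)))
    axes = tuple(i for i in range(p.ndim) if i != dim)
    return np.sqrt(np.sum(np.square(p), axis=axes))

w = np.random.randn(16, 3, 3, 3)
print(norm_except_dim_np(w, dim=0).shape)  # (16,) -- one norm per output channel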
Example No. 5
 def communicate_avg_loss():
     communicate()
     self._generate_avg_loss(main_block, loss, avg_loss)
     next_local_steps = layers.cast(layers.ceil(
         layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
                     float(init_k_steps))),
                                    dtype='int64')
     max_local_steps = layers.fill_constant(shape=[1],
                                            dtype='int64',
                                            value=16)
     min_local_steps = layers.fill_constant(shape=[1],
                                            dtype='int64',
                                            value=1)
     next_local_steps = layers.elementwise_min(
         next_local_steps, max_local_steps)
     next_local_steps = layers.elementwise_max(
         next_local_steps, min_local_steps)
     layers.assign(next_local_steps, k_steps)
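The local-step schedule above follows k_next = ceil(sqrt(lr_0 * avg_loss / (global_lr * loss_0) * init_k_steps)), clipped to [1, 16]. A plain-Python sketch of just that arithmetic, with the distributed plumbing omitted and the sample values chosen only for illustration:

import math

def next_local_steps(lr_0, loss_0, global_lr, avg_loss, init_k_steps,
                     min_steps=1, max_steps=16):
    # adaptive local-SGD step count, clipped to [min_steps, max_steps]
    k = math.ceil(math.sqrt(lr_0 * avg_loss / (global_lr * loss_0) * init_k_steps))
    return max(min_steps, min(max_steps, k))

print(next_local_steps(lr_0=0.1, loss_0=2.0, global_lr=0.05, avg_loss=1.5,
                       init_k_steps=4))  # 3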
Example No. 6
    def build_program(self, dtype):
        with fluid.program_guard(self.main_program, self.startup_program):
            self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 3)
            self.feed_vars.append(
                fluid.data(name="data3", shape=[128, 128], dtype=dtype))

            # subgraph with 2 op nodes
            tmp_0 = layers.sum(
                [self.feed_vars[0], self.feed_vars[1], self.feed_vars[2]])
            tmp_1 = layers.sqrt(tmp_0)
            tmp_2 = layers.mul(tmp_0, self.feed_vars[3])
            # subgraph with 2 op nodes
            tmp_3 = layers.square(layers.sum([tmp_1, tmp_2]))

        self.append_gradients(tmp_3)

        self.num_fused_ops = 4
        self.fetch_list = [tmp_3, self.grad(tmp_0)]
Example No. 7
 def forward(self, output1, output2, label):
     """
     :param output1: [n, 128]
     :param output2: [n, 128]
     :param label: [n, 1]
     :return: [1]
     """
     distance = layers.elementwise_sub(output1, output2)
     distance = layers.square(distance)
     euclidean_distance = layers.reduce_sum(distance, dim=1, keep_dim=True)
     euclidean_distance = layers.sqrt(euclidean_distance)
     loss_contrastive = layers.elementwise_mul(
         1 - label, layers.square(euclidean_distance),
         axis=0) + layers.elementwise_mul(
             label,
             layers.square(
                 layers.clamp(self.margin - euclidean_distance, min=0.0)),
             axis=0)
     return loss_contrastive, euclidean_distance.numpy(), label.numpy()
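This is the standard contrastive loss, (1 - y) * d^2 + y * max(margin - d, 0)^2 with d the Euclidean distance. A NumPy sketch under that reading (the margin value is illustrative, and the result is reduced with a mean here rather than returned per sample):

import numpy as np

def contrastive_loss(out1, out2, label, margin=1.0):
    # out1, out2: (n, 128); label: (n, 1), 1 = dissimilar pair, 0 = similar pair
    d = np.sqrt(np.sum((out1 - out2) ** 2, axis=1, keepdims=True))   # (n, 1)
    similar_term = (1.0 - label) * d ** 2
    dissimilar_term = label * np.clip(margin - d, 0.0, None) ** 2
    return np.mean(similar_term + dissimilar_term)

a = np.random.randn(4, 128).astype("float32")
b = np.random.randn(4, 128).astype("float32")
y = np.random.randint(0, 2, size=(4, 1)).astype("float32")
print(contrastive_loss(a, b, y))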
Example No. 8
    def _dygraph_clip_by_global_norm(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)
            sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = layers.concat(sum_square_list)
        global_norm_var = layers.reduce_sum(global_norm_var)
        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(shape=[1],
                                               dtype='float32',
                                               value=self.clip_norm)
        clip_var = layers.elementwise_div(x=max_global_norm,
                                          y=layers.elementwise_max(
                                              x=global_norm_var,
                                              y=max_global_norm))
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

        return params_and_grads
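The clipping factor is clip_norm / max(global_norm, clip_norm), so gradients are only ever scaled down. A NumPy sketch of the same rule, without the selected-rows handling or the need_clip filter:

import numpy as np

def clip_by_global_norm(grads, clip_norm):
    # grads: list of ndarrays; returns rescaled copies
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)   # always <= 1.0
    return [g * scale for g in grads]

grads = [np.random.randn(3, 3), np.random.randn(5)]
clipped = clip_by_global_norm(grads, clip_norm=1.0)
print(np.sqrt(sum(np.sum(g ** 2) for g in clipped)))  # <= 1.0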
Example No. 9
def graph_norm(gw, feature):
    """Implementation of graph normalization
   
    Reference Paper: BENCHMARKING GRAPH NEURAL NETWORKS
   
    Each node's features are divided by sqrt(num_nodes) of its graph.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, hidden_size)

    Return:
        A tensor with shape (num_nodes, hidden_size)
    """
    nodes = L.fill_constant([gw.num_nodes, 1], dtype="float32", value=1.0)
    norm = graph_pooling(gw, nodes, pool_type="sum")
    norm = L.sqrt(norm)
    feature_lod = op.nested_lod_reset(feature, gw.graph_lod)
    norm = L.sequence_expand_as(norm, feature_lod)
    norm.stop_gradient = True
    return feature_lod / norm
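A NumPy sketch of the same normalization for a batch of graphs described by an offset array in the spirit of gw.graph_lod (the exact LoD layout is an assumption here): each node's features are divided by the square root of its graph's node count.

import numpy as np

def graph_norm_np(feature, graph_lod):
    # feature: (num_nodes, hidden_size); graph_lod: node offsets, e.g. [0, 3, 7]
    out = feature.copy()
    for start, end in zip(graph_lod[:-1], graph_lod[1:]):
        out[start:end] /= np.sqrt(end - start)
    return out

feat = np.ones((7, 4), dtype="float32")
normed = graph_norm_np(feat, [0, 3, 7])
print(normed[0, 0], normed[3, 0])  # 1/sqrt(3) for the first graph, 1/sqrt(4) for the second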
Example No. 10
    def forward(self, tenFirst, tenSecond, tenFeaturesFirst, tenFeaturesSecond, tenFlow):
        b, _, h, w = tenFlow.shape
        tenDifference = tenFirst - backwarp(tenInput=tenSecond, tenFlow=tenFlow * self.fltBackward)
        tenDifference = L.pow(tenDifference, 2)
        tenDifference = L.reduce_sum(tenDifference, 1, True) # [b, 1, h, w]
        tenDifference = L.sqrt(tenDifference).detach()

        tenFeaturesFirst = self.moduleFeat(tenFeaturesFirst)

        tenMean = L.reshape(tenFlow, (b, 2, -1))    # [b, 2, h * w]
        tenMean = L.reduce_mean(tenMean, 2, True)   # [b, 2, 1]
        tenMean = L.reshape(tenMean, (b, 2, 1, 1))  # [b, 2, 1, 1]
        tenMean = L.expand(tenMean, (1, 1, h, w))   # [b, 2, h, w]
        delta = tenFlow - tenMean

        diff = L.concat([tenDifference, delta, tenFeaturesFirst], 1)
        tenDist = self.moduleDist(self.moduleMain(diff))
        tenDist = L.pow(tenDist, 2.0) * -1.0
        tenDist = tenDist - L.reduce_max(tenDist, 1, True)
        tenDist = L.exp(tenDist)

        tenDivisor = L.reduce_sum(tenDist, 1, True)
        tenDivisor = L.reciprocal(tenDivisor)

        tenScaleX = L.unfold(x=tenFlow[:, 0:1, :, :], 
                             kernel_sizes=self.intUnfold, 
                             strides=1, 
                             paddings=int((self.intUnfold - 1) / 2)) # [b, c, h * w]
        tenScaleX = L.reshape(tenScaleX, (b, -1, h, w))          # [b, c, h, w]
        tenScaleX = self.moduleScaleX(tenDist * tenScaleX) * tenDivisor

        tenScaleY = L.unfold(x=tenFlow[:, 1:2, :, :], 
                             kernel_sizes=self.intUnfold, 
                             strides=1, 
                             paddings=int((self.intUnfold - 1) / 2)) # [b, c, h * w]
        tenScaleY = L.reshape(tenScaleY, (b, -1, h, w))          # [b, c, h, w]
        tenScaleY = self.moduleScaleY(tenDist * tenScaleY) * tenDivisor

        return L.concat([tenScaleX, tenScaleY], 1)
Example No. 11
import paddle.fluid as fluid
import numpy as np
import paddle.fluid.layers as L
def gen_data():
    return {
        "x": np.random.randint(1, 5, size=[8, 10]).astype('float32'),
        "y": np.random.randint(1, 5, size=[10]).astype('float32'),
    }
x = fluid.layers.data(name="x", shape=[8,10], dtype='float32')
y = fluid.layers.data(name="y", shape=[10], dtype='float32')
mm = L.sqrt(L.reduce_sum(L.elementwise_mul(x,x), dim=0))
kk = L.ones_like(y)
z = fluid.layers.elementwise_div(x, mm, axis=1)
# z = x / y
place = fluid.CPUPlace()
exe = fluid.Executor(place)
z_value = exe.run(feed=gen_data(),
                    fetch_list=[z.name])
print(z_value) #
Example No. 12
epsilon = 1e-5
N = 2
C = 2
H = 2
W = 2
HW = H * W

x = paddle.randn((N, C, H, W))
x.stop_gradient = False

U = fluid.layers.reduce_mean(x, dim=[2, 3], keep_dim=True)  # [N, C, 1, 1]
V = fluid.layers.reduce_mean(fluid.layers.square(x - U),
                             dim=[2, 3],
                             keep_dim=True)  # [N, C, 1, 1]
normX = (x - U) / L.sqrt(V + epsilon)
Var1 = (x - U)
Var2 = 1.0 / L.sqrt(V + epsilon)
Var3 = (x - U) * 1.0 / L.sqrt(V + epsilon)

dUdx = paddle.grad(outputs=[U],
                   inputs=[x],
                   create_graph=True,
                   retain_graph=True)[0]
dVdx = paddle.grad(outputs=[V],
                   inputs=[x],
                   create_graph=True,
                   retain_graph=True)[0]
dnormXdx = paddle.grad(outputs=[normX],
                       inputs=[x],
                       create_graph=True,
                       retain_graph=True)[0]
Example No. 13
def norm(inputs, dim):
    tp = [1,0]
    mm = L.sqrt(L.reduce_sum(L.elementwise_mul(inputs, inputs), dim=-dim))
    h = L.elementwise_div(inputs, mm, axis=tp[dim])
    return h
Example No. 14
    def forward(self, x):
        if self.training:
            N, C, H, W = x.shape
            NHW = N * H * W

            # Option 1: use plain multiplication
            # U = fluid.layers.reduce_mean(x, dim=[0, 2, 3], keep_dim=True)  # [1, C, 1, 1]
            # V = fluid.layers.reduce_mean(fluid.layers.square(x - U), dim=[0, 2, 3], keep_dim=True)  # [1, C, 1, 1]
            # normX = (x - U) / L.sqrt(V + self.epsilon)  # [N, C, H, W]
            # scale = L.unsqueeze(self.weight, [0, 2, 3])
            # bias = L.unsqueeze(self.bias, [0, 2, 3])
            # out = normX * scale + bias
            # U = L.reshape(U, (-1, ))
            # V = L.reshape(V, (-1, ))

            # Option 2: replace the multiplication with a grouped convolution
            # out = W*(x - U)/s + B     = (W/s) * x + B - (W/s)*U
            U = fluid.layers.reduce_mean(x, dim=[0, 2, 3],
                                         keep_dim=False)  # [C, ]
            if self.special_kernel is None:  # used to compute (x - U) quickly
                special_kernel = np.ones((self.num_features, 1, 1, 1),
                                         np.float32)
                self.special_kernel = paddle.to_tensor(special_kernel)
                self.special_kernel.stop_gradient = True
            V = F.conv2d(x, self.special_kernel, -U,
                         groups=self.num_features)  # to compute (x - U) quickly
            V = fluid.layers.reduce_mean(fluid.layers.square(V),
                                         dim=[0, 2, 3],
                                         keep_dim=False)  # [C, ]
            std = L.sqrt(V + self.epsilon)  # [C, ]
            A = self.weight / std  # [C, ]
            B = self.bias - U * A  # [C, ]
            A = L.unsqueeze(A, [1, 2, 3])  # [C, 1, 1, 1]
            out = F.conv2d(x, A, B, groups=self.num_features)

            curr_U = U.numpy()
            curr_V = V.numpy()
            state_dict = self.state_dict()
            momentum = self.momentum
            _mean = self._mean.numpy() * momentum + curr_U * (1. - momentum)
            _variance = self._variance.numpy() * momentum + curr_V * (1. -
                                                                      momentum)
            state_dict['_mean'] = _mean.astype(np.float32)
            state_dict['_variance'] = _variance.astype(np.float32)
            self.set_state_dict(state_dict)
            self.A = None
            self.B = None
        else:
            # Option 1: use plain multiplication
            # U = L.unsqueeze(self._mean, [0, 2, 3])  # [1, C, 1, 1]
            # V = L.unsqueeze(self._variance, [0, 2, 3])  # [1, C, 1, 1]
            # normX = (x - U) / L.sqrt(V + self.epsilon)  # [N, C, H, W]
            # scale = L.unsqueeze(self.weight, [0, 2, 3])
            # bias = L.unsqueeze(self.bias, [0, 2, 3])
            # out = normX * scale + bias

            # Option 2: replace the multiplication with a grouped convolution
            # out = W*(x - U)/s + B     = (W/s) * x + B - (W/s)*U
            if self.A is None:
                std = L.sqrt(self._variance + self.epsilon)  # [C, ]
                A = self.weight / std  # [C, ]
                B = self.bias - self._mean * A  # [C, ]
                A = L.unsqueeze(A, [1, 2, 3])  # [C, 1, 1, 1]
                self.A = A
                self.B = B
            out = F.conv2d(x, self.A, self.B, groups=self.num_features)
        return out
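Option 2 above folds the normalization into one affine map per channel, out = A*x + B with A = weight/std and B = bias - mean*A, which is exactly what the 1x1 grouped convolution applies. A NumPy sketch of that identity:

import numpy as np

def bn_as_affine(x, mean, var, weight, bias, eps=1e-5):
    # x: (N, C, H, W); per-channel statistics and affine parameters of shape (C,)
    A = weight / np.sqrt(var + eps)                 # (C,)
    B = bias - mean * A                             # (C,)
    return x * A[None, :, None, None] + B[None, :, None, None]

x = np.random.randn(2, 3, 4, 4).astype("float32")
mean = x.mean(axis=(0, 2, 3))
var = x.var(axis=(0, 2, 3))
w = np.ones(3, dtype="float32")
b = np.zeros(3, dtype="float32")
print(bn_as_affine(x, mean, var, w, b).mean(axis=(0, 2, 3)))  # ~0 per channel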
Example No. 15
    def forward(self, input, conv, conv_g):
        # deal with the weight and grad of self.pre_dxdw!
        self._check_input_dim(input)
        N, C, H, W = input.shape
        NHW = N * H * W
        y = input  # [N, C, H, W]
        weight = conv.weight

        # burnin
        if self.training and self.burnin > 0:
            self.iter_count += 1
            self._update_buffer_num()

        if self.buffer_num > 0 and self.training and (
                not input.stop_gradient):  # some layers are frozen!
            # cal current batch mu and sigma
            cur_mu = L.reduce_mean(y, dim=[0, 2, 3], keep_dim=False)  # [C, ]
            if self.special_kernel is None:  # used to compute (x - cur_mu) quickly
                special_kernel = np.ones((self.num_features, 1, 1, 1),
                                         np.float32)
                self.special_kernel = paddle.to_tensor(special_kernel)
                self.special_kernel.stop_gradient = True
            cur_sigma2 = F.conv2d(
                y, self.special_kernel, -cur_mu,
                groups=self.num_features)  # to compute (x - cur_mu) quickly
            cur_sigma2 = L.reduce_sum(
                L.square(cur_sigma2), dim=[0, 2, 3], keep_dim=False) / (
                    NHW - 1)  # [C, ]  the author's original implementation uses the sample variance, hence the -1 in the denominator

            y2 = L.square(y)
            cur_meanx2 = L.reduce_mean(y2, dim=[0, 2, 3],
                                       keep_dim=False)  # [C, ]

            # cal dmu/dw dsigma2/dw
            # dmudw = paddle.grad(outputs=[cur_mu], inputs=[weight], create_graph=False, retain_graph=True)[0]
            # dmeanx2dw = paddle.grad(outputs=[cur_meanx2], inputs=[weight], create_graph=False, retain_graph=True)[0]

            # manual computation instead of paddle.grad
            dmudinput = np.zeros(input.shape, np.float32) + 1.0 / NHW
            dmudinput = paddle.to_tensor(dmudinput)
            dmeanx2dinput = input.numpy()
            dmeanx2dinput = paddle.to_tensor(dmeanx2dinput)
            dmeanx2dinput *= 2.0 / NHW
            dmudw = conv_g.get_grad_w(conv.weight, conv.bias, dmudinput)
            dmeanx2dw = conv_g.get_grad_w(conv.weight, conv.bias,
                                          dmeanx2dinput)

            # update cur_mu and cur_sigma2 with pres
            weight_data = weight.numpy()
            weight_data = paddle.to_tensor(weight_data)
            weight_data.stop_gradient = True
            # L.stack() raises an error here, so L.concat() is used instead.
            mu_all = [
                cur_mu,
            ] + [
                tmp_mu + L.reduce_sum(self.rho * tmp_d * (weight_data - tmp_w),
                                      dim=[1, 2, 3]) for tmp_mu, tmp_d, tmp_w
                in zip(self.pre_mu, self.pre_dmudw, self.pre_weight)
            ]
            meanx2_all = [
                cur_meanx2,
            ] + [
                tmp_meanx2 + L.reduce_sum(
                    self.rho * tmp_d * (weight_data - tmp_w), dim=[1, 2, 3])
                for tmp_meanx2, tmp_d, tmp_w in zip(
                    self.pre_meanx2, self.pre_dmeanx2dw, self.pre_weight)
            ]
            mu_all = [L.unsqueeze(mu_, 0) for mu_ in mu_all]
            meanx2_all = [L.unsqueeze(meanx2_, 0) for meanx2_ in meanx2_all]
            mu_all = L.concat(mu_all, 0)
            meanx2_all = L.concat(meanx2_all, 0)

            sigma2_all = meanx2_all - L.square(mu_all)

            # with considering count
            re_mu_all = mu_all.clone()
            re_meanx2_all = meanx2_all.clone()
            mask1 = L.cast(sigma2_all >= 0., dtype="float32")
            mask1.stop_gradient = True
            re_mu_all *= mask1
            re_meanx2_all *= mask1
            count = L.reduce_sum(L.cast(sigma2_all >= 0., dtype="float32"),
                                 dim=[
                                     0,
                                 ])
            mu = L.reduce_sum(re_mu_all, dim=[
                0,
            ]) / count
            sigma2 = L.reduce_sum(re_meanx2_all, dim=[
                0,
            ]) / count - L.square(mu)

            cur_mu_ = cur_mu.numpy()
            cur_mu_ = paddle.to_tensor(cur_mu_)
            cur_mu_.stop_gradient = True
            self.pre_mu = [
                cur_mu_,
            ] + self.pre_mu[:(self.buffer_num - 1)]
            cur_meanx2_ = cur_meanx2.numpy()
            cur_meanx2_ = paddle.to_tensor(cur_meanx2_)
            cur_meanx2_.stop_gradient = True
            self.pre_meanx2 = [
                cur_meanx2_,
            ] + self.pre_meanx2[:(self.buffer_num - 1)]
            dmudw_ = dmudw.numpy()
            dmudw_ = paddle.to_tensor(dmudw_)
            dmudw_.stop_gradient = True
            self.pre_dmudw = [
                dmudw_,
            ] + self.pre_dmudw[:(self.buffer_num - 1)]
            dmeanx2dw_ = dmeanx2dw.numpy()
            dmeanx2dw_ = paddle.to_tensor(dmeanx2dw_)
            dmeanx2dw_.stop_gradient = True
            self.pre_dmeanx2dw = [
                dmeanx2dw_,
            ] + self.pre_dmeanx2dw[:(self.buffer_num - 1)]

            tmp_weight = weight.numpy()
            tmp_weight = paddle.to_tensor(tmp_weight)
            tmp_weight.stop_gradient = True
            self.pre_weight = [
                tmp_weight,
            ] + self.pre_weight[:(self.buffer_num - 1)]

        else:
            mu = L.reduce_mean(y, dim=[0, 2, 3], keep_dim=False)  # [C, ]
            if self.special_kernel is None:  # used to compute (x - mu) quickly
                special_kernel = np.ones((self.num_features, 1, 1, 1),
                                         np.float32)
                self.special_kernel = paddle.to_tensor(special_kernel)
                self.special_kernel.stop_gradient = True
            sigma2 = F.conv2d(y,
                              self.special_kernel,
                              -mu,
                              groups=self.num_features)  # to compute (x - mu) quickly
            sigma2 = L.reduce_sum(L.square(sigma2),
                                  dim=[0, 2, 3],
                                  keep_dim=False) / (NHW - 1)  # [C, ]
            cur_mu = mu
            cur_sigma2 = sigma2

        if not self.training or self.FROZEN:  # eval() mode
            U = self._mean
            # TODO: outside **0.5?
            if self.out_p:
                std = L.sqrt(self._variance + self.eps)
            else:
                std = L.sqrt(self._variance) + self.eps

        else:  # train() mode
            if self.track_running_stats is True:
                state_dict = self.state_dict()
                momentum = self.momentum
                _mean = self._mean.numpy() * momentum + cur_mu.numpy() * (
                    1. - momentum)
                _variance = self._variance.numpy(
                ) * momentum + cur_sigma2.numpy() * (1. - momentum)
                state_dict['_mean'] = _mean.astype(np.float32)
                state_dict['_variance'] = _variance.astype(np.float32)
                self.set_state_dict(state_dict)
            U = mu
            # TODO: outside **0.5?
            if self.out_p:
                std = L.sqrt(sigma2 + self.eps)
            else:
                std = L.sqrt(sigma2) + self.eps

        A = self.weight / std  # [C, ]
        B = self.bias - U * A  # [C, ]
        A = L.unsqueeze(A, [1, 2, 3])  # [C, 1, 1, 1]
        y = F.conv2d(y, A, B, groups=self.num_features)
        return y
Example No. 16
    def _dygraph_clip(self, params_grads):
        normal_params_grads = []
        moe_params_grads = []

        # separate moe params from normal params
        if self.moe_group is not None and self.moe_group.nranks > 1:
            for p, g in params_grads:
                if self.is_expert_param_func(p):
                    moe_params_grads.append((p, g))
                else:
                    normal_params_grads.append((p, g))
        else:
            normal_params_grads = params_grads

        # Why return sum_dtype?
        # we will call `get_l2_norm_pow` twice and the precisions may be different.
        # For convenience and simplification, we use sum_dtype directly instead of global_norm_var_normal.dtype
        global_norm_var_normal, sum_dtype \
            = self.get_l2_norm_pow(normal_params_grads)
        global_norm_var_moe = None
        if len(moe_params_grads) > 0:
            global_norm_var_moe, _ \
                = self.get_l2_norm_pow(moe_params_grads, sum_dtype)
            if global_norm_var_moe is not None:
                collective.all_reduce(global_norm_var_moe,
                                      op=collective.ReduceOp.SUM,
                                      group=self.moe_group)

        if global_norm_var_normal is None and global_norm_var_moe is None:
            return params_grads
        elif global_norm_var_normal is None:
            global_norm_var = global_norm_var_moe
        elif global_norm_var_moe is None:
            global_norm_var = global_norm_var_normal
        else:
            if global_norm_var_normal.dtype != global_norm_var_moe.dtype:
                # compared with the normal norm, the moe norm is computed later,
                # so its precision is no lower than the normal norm's
                global_norm_var_normal = \
                    global_norm_var_normal.astype(global_norm_var_moe.dtype)
            global_norm_var = global_norm_var_normal + global_norm_var_moe

        params_and_grads = []
        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(shape=[1],
                                               dtype=global_norm_var.dtype,
                                               value=self.clip_norm)
        clip_var = layers.elementwise_div(x=max_global_norm,
                                          y=layers.elementwise_max(
                                              x=global_norm_var,
                                              y=max_global_norm))
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            # TODO(wangxi): use inplace elementwise_mul
            clip_input = (clip_var.astype('float16') if g.dtype
                          == core.VarDesc.VarType.FP16 else clip_var)
            new_grad = layers.elementwise_mul(x=g, y=clip_input)
            params_and_grads.append((p, new_grad))
        return params_and_grads
Example No. 17
            def communicate():
                sub_block = default_main_program().current_block()
                ring_id = -1
                for param, snapshot in p2s:
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='c_sync_calc_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    ring_id = (ring_id + 1) % self.nrings
                    sub_block.append_op(type='c_allreduce_sum',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for ring_id in range(self.nrings):
                    sub_block.append_op(type='c_sync_comm_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for param, snapshot in p2s:
                    sub_block.append_op(type='scale',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'scale':
                                            1.0 / self.role_maker.worker_num(),
                                            OP_ROLE_KEY:
                                            OpRole.Optimize
                                        })
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='assign',
                                        inputs={'X': [param]},
                                        outputs={'Out': [snapshot]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})

                if auto_steps:
                    next_local_steps = layers.cast(layers.ceil(
                        layers.sqrt(lr_0 * loss / (global_lr * loss_0) *
                                    float(init_k_steps))),
                                                   dtype='int64')
                    max_local_steps = layers.fill_constant(shape=[1],
                                                           dtype='int64',
                                                           value=16)
                    next_local_steps = layers.elementwise_min(
                        next_local_steps, max_local_steps)
                    layers.assign(next_local_steps, k_steps)
                layers.assign(step, last_step)
Example No. 18
    def forward(self,
                q,
                k,
                v,
                lengths,
                speaker_embed,
                start_index,
                force_monotonic=False,
                prev_coeffs=None,
                window=None):
        # add position encoding as an inductive bias
        if self.has_bias:  # multi-speaker model
            omega_q = 2 * F.sigmoid(
                F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
            omega_k = 2 * self.omega_initial * F.sigmoid(
                F.squeeze(self.k_pos_affine(speaker_embed), axes=[-1]))
        else:  # single-speaker case
            batch_size = q.shape[0]
            omega_q = F.ones((batch_size, ), dtype="float32")
            omega_k = F.ones(
                (batch_size, ), dtype="float32") * self.omega_default
        q += self.position_encoding_weight * positional_encoding(
            q, start_index, omega_q)
        k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

        q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
        activations = F.matmul(q, k, transpose_y=True)
        activations /= np.sqrt(self.attention_dim)

        if self.training:
            # mask the <pad> parts from the encoder
            mask = F.sequence_mask(lengths, dtype="float32")
            attn_bias = F.scale(1. - mask, -1000)
            activations += F.unsqueeze(attn_bias, [1])
        elif force_monotonic:
            assert window is not None
            backward_step, forward_step = window
            T_enc = k.shape[1]
            batch_size, T_dec, _ = q.shape

            # actually T_dec = 1 here
            alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
                   if prev_coeffs is None \
                   else F.argmax(prev_coeffs, axis=-1)
            backward = F.sequence_mask(alpha - backward_step,
                                       maxlen=T_enc,
                                       dtype="bool")
            forward = F.sequence_mask(alpha + forward_step,
                                      maxlen=T_enc,
                                      dtype="bool")
            mask = F.cast(F.logical_xor(backward, forward), "float32")
            # print("mask's shape:", mask.shape)
            attn_bias = F.scale(1. - mask, -1000)
            activations += attn_bias

        # softmax
        coefficients = F.softmax(activations, axis=-1)
        # context vector
        coefficients = F.dropout(coefficients,
                                 1. - self.keep_prob,
                                 dropout_implementation='upscale_in_train')
        contexts = F.matmul(coefficients, v)
        # context normalization
        enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
        contexts *= F.sqrt(enc_lengths)
        # out affine
        contexts = self.out_affine(contexts)
        return contexts, coefficients
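At its core this block is scaled dot-product attention, softmax(Q K^T / sqrt(d)) V, with the position encoding, masking and context normalization layered on top. A minimal NumPy sketch of just the attention step:

import numpy as np

def scaled_dot_product_attention(q, k, v):
    # q: (B, T_dec, d); k, v: (B, T_enc, d)
    d = q.shape[-1]
    scores = q @ k.transpose(0, 2, 1) / np.sqrt(d)     # (B, T_dec, T_enc)
    scores -= scores.max(axis=-1, keepdims=True)       # numerical stability
    coeffs = np.exp(scores)
    coeffs /= coeffs.sum(axis=-1, keepdims=True)       # softmax over T_enc
    return coeffs @ v, coeffs

q = np.random.randn(2, 5, 16)
k = np.random.randn(2, 7, 16)
v = np.random.randn(2, 7, 16)
ctx, attn = scaled_dot_product_attention(q, k, v)
print(ctx.shape, attn.shape)   # (2, 5, 16) (2, 5, 7)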
Example No. 19
 def _sqrt(x):
     if isinstance(x, PTensor):
         return layers.sqrt(x)
     else:
         return np.sqrt(x)
Example No. 20
    def forward(self, x, y):
        # x and y are offset by one frame
        u1 = zeros_like(x)
        u2 = zeros_like(x)
        l_t = self.l * self.t
        taut = self.a / self.t

        grad2_x = self.conv_img_grad(y)
        # grad2_x[:, :, :, 0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
        # grad2_x[:, :, :, -1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])

        grad2_y = self.conv_img_grad2(y)
        # grad2_y[:, :, 0, :] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
        # grad2_y[:, :, -1, :] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])

        p11 = zeros_like(x)
        p12 = zeros_like(x)
        p21 = zeros_like(x)
        p22 = zeros_like(x)

        gsqx = grad2_x**2
        gsqy = grad2_y**2
        grad = gsqx + gsqy + 1e-12

        rho_c = y - grad2_x * u1 - grad2_y * u2 - x

        for i in range(self.n_iter):
            rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

            v1 = zeros_like(x)
            v2 = zeros_like(x)
            mask1 = rho < -l_t * grad
            mask2 = rho > l_t * grad
            mask3 = logical_and(logical_not(logical_or(mask1, mask2)),
                                (grad > 1e-12))
            mask1 = cast(mask1, dtype='float32')
            mask2 = cast(mask2, dtype='float32')
            mask3 = cast(mask3, dtype='float32')
            mask1.stop_gradient = True
            mask2.stop_gradient = True
            mask3.stop_gradient = True

            # v1 = v1 + l_t * grad2_x * mask1 - l_t * grad2_x * mask2 - (rho / grad) * grad2_x * mask3
            # v2 = v2 + l_t * grad2_y * mask1 - l_t * grad2_y * mask2 - (rho / grad) * grad2_y * mask3
            v1 = elementwise_add(
                u1,
                elementwise_add(
                    elementwise_mul(l_t * grad2_x, mask1),
                    elementwise_add(
                        elementwise_mul(-l_t * grad2_x, mask2),
                        elementwise_mul(-elementwise_div(rho, grad),
                                        elementwise_mul(grad2_x, mask3)))))
            v2 = elementwise_add(
                u2,
                elementwise_add(
                    elementwise_mul(l_t * grad2_y, mask1),
                    elementwise_add(
                        elementwise_mul(-l_t * grad2_y, mask2),
                        elementwise_mul(-elementwise_div(rho, grad),
                                        elementwise_mul(grad2_y, mask3)))))

            del rho
            del mask1
            del mask2
            del mask3

            v1 += u1
            v2 += u2

            u1 = v1 + self.t * self.divergence(p11, p12)
            u2 = v2 + self.t * self.divergence(p21, p22)
            del v1
            del v2
            u1 = u1
            u2 = u2

            u1x, u1y = self.forward_grad(u1)
            u2x, u2y = self.forward_grad(u2)

            p11 = (p11 + taut * u1x) / (1. +
                                        taut * sqrt(u1x**2 + u1y**2 + 1e-12))
            p12 = (p12 + taut * u1y) / (1. +
                                        taut * sqrt(u1x**2 + u1y**2 + 1e-12))
            p21 = (p21 + taut * u2x) / (1. +
                                        taut * sqrt(u2x**2 + u2y**2 + 1e-12))
            p22 = (p22 + taut * u2y) / (1. +
                                        taut * sqrt(u2x**2 + u2y**2 + 1e-12))
            del u1x
            del u1y
            del u2x
            del u2y

        return u1, u2
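The p-updates above are the standard TV-L1 dual step, p <- (p + taut * grad(u)) / (1 + taut * |grad(u)|). A NumPy sketch of one such update, with simple forward differences standing in for forward_grad:

import numpy as np

def dual_update(p1, p2, ux, uy, taut, eps=1e-12):
    # ux, uy: forward differences of one flow component u
    mag = np.sqrt(ux ** 2 + uy ** 2 + eps)
    p1 = (p1 + taut * ux) / (1.0 + taut * mag)
    p2 = (p2 + taut * uy) / (1.0 + taut * mag)
    return p1, p2

u = np.random.randn(8, 8)
ux = np.diff(u, axis=1, append=u[:, -1:])   # forward difference along x
uy = np.diff(u, axis=0, append=u[-1:, :])   # forward difference along y
p1, p2 = dual_update(np.zeros_like(u), np.zeros_like(u), ux, uy, taut=0.25)
print(p1.shape, p2.shape)   # (8, 8) (8, 8)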
Example No. 21
    def forward(self, x):
        '''
        bt,c,w,h=x.shape
        tmp=layers.reshape(x,shape=[48,-1,c,w,h])
        res=layers.reshape(tmp[:,:-1],shape=[-1,c,w,h])'''
        x = self.bottleneck(x)
        inp = self.norm_img(x)
        bt, c, w, h = inp.shape
        inp = layers.reshape(inp, shape=[self.batch_size, -1, c, w, h])

        x = inp[:, :-1]
        y = inp[:, 1:]

        x = layers.reshape(layers.transpose(x, perm=[0, 2, 1, 3, 4]),
                           shape=[-1, c, h, w])
        y = layers.reshape(layers.transpose(y, perm=[0, 2, 1, 3, 4]),
                           shape=[-1, c, h, w])
        u1 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
        u2 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')

        l_t = self.lamda * self.theta
        taut = self.tau / (self.theta + 1e-12)

        grad2_x = self.conv4Ix(layers.pad(y, (0, 0, 0, 0, 0, 0, 1, 1)))

        tmp = layers.unstack(grad2_x, axis=3)
        tmp[-1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])
        tmp[0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
        grad2_x = layers.stack(tmp, axis=3)

        grad2_y = self.conv4Iy(layers.pad(y, (0, 0, 0, 0, 1, 1, 0, 0)))
        tmp = layers.unstack(grad2_y, axis=2)
        tmp[-1] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])
        tmp[0] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
        grad2_y = layers.stack(tmp, axis=2)

        p11 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
        p12 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
        p21 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
        p22 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')

        gsqx = grad2_x**2
        gsqy = grad2_y**2

        grad = gsqx + gsqy + 1e-12

        rho_c = y - grad2_x * u1 - grad2_y * u2 - x
        for i in range(self.n_iter):
            rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

            mask1 = (rho < -l_t * grad).detach().astype('float32')
            mask1.stop_gradient = True
            tmp1 = l_t * grad2_x
            tmp2 = l_t * grad2_y
            v1 = tmp1 * mask1
            v2 = tmp2 * mask1

            mask2 = (rho > l_t * grad).detach().astype('float32')
            mask2.stop_gradient = True
            v1 = -tmp1 * mask2 + v1
            v2 = -tmp2 * mask2 + v2

            mask3 = fluid.layers.ones(
                x.shape, dtype='float32') - (mask1 + mask2 - mask1 * mask2)
            mask3.stop_gradient = True
            tmp1 = (-rho / grad) * grad2_x
            tmp2 = (-rho / grad) * grad2_y

            v1 = tmp1 * mask3 + v1
            v2 = tmp2 * mask3 + v2

            del rho
            del mask1
            del mask2
            del mask3

            v1 += u1
            v2 += u2

            u1 = v1 + self.theta * self.divergence(p11, p12)
            u2 = v2 + self.theta * self.divergence(p21, p22)

            del v1
            del v2
            u1 = u1
            u2 = u2

            u1x, u1y = self.forward_grad(u1)
            u2x, u2y = self.forward_grad(u2)

            p11 = (p11 + taut * u1x) / (
                1. + taut * layers.sqrt(u1x**2 + u1y**2 + 1e-12))
            p12 = (p12 + taut * u1y) / (
                1. + taut * layers.sqrt(u1x**2 + u1y**2 + 1e-12))
            p21 = (p21 + taut * u2x) / (
                1. + taut * layers.sqrt(u2x**2 + u2y**2 + 1e-12))
            p22 = (p22 + taut * u2y) / (
                1. + taut * layers.sqrt(u2x**2 + u2y**2 + 1e-12))
            del u1x
            del u1y
            del u2x
            del u2y

        flow = layers.concat([u1, u2], axis=1)

        #  flow = layers.transpose(layers.reshape(flow,shape=[b,t,c*2,h,w]),perm=[0,2,1,3,4])
        flow = self.unbottleneck(flow)
        flow = self.bn(flow) if self.bn else flow
        return flow
Example No. 22
    def _dygraph_clip(self, params_grads):
        sum_square_fp32, sum_square_fp16 = [], []
        unslice_params_fp32, unslice_params_fp16 = [], []

        for p, g in params_grads:
            p_slice = True  # marks parameters sliced by sharding stage 3
            if g is None or getattr(p, 'need_clip', True) is False:
                continue
            if hasattr(p, "unslice"):
                p_slice = False

            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.get_tensor_from_selected_rows(
                    layers.merge_selected_rows(g))
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)

            if p.dtype == paddle.float16:
                if p_slice: sum_square_fp16.append(sum_square)
                else: unslice_params_fp16.append(sum_square)
            elif p.dtype == paddle.float32:
                if p_slice: sum_square_fp32.append(sum_square)
                else: unslice_params_fp32.append(sum_square)

        # global norm of non-distributed FP16 params_and_grads
        if len(sum_square_fp16) == 0:
            global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
        else:
            global_norm_fp16 = layers.concat(sum_square_fp16)
            global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
            global_norm_fp16 = paddle.cast(
                global_norm_fp16, dtype=paddle.float32)

        # global norm of non-distributed FP16 params_and_grads for unslice parameters
        if len(unslice_params_fp16) == 0:
            global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
        else:
            global_unslice_fp16 = layers.concat(unslice_params_fp16)
            global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16)
            global_unslice_fp16 = paddle.cast(
                global_unslice_fp16, dtype=paddle.float32)

        # global norm of non-distributed FP32 params_and_grads
        global_norm_fp32 = layers.concat(sum_square_fp32) if len(
            sum_square_fp32) != 0 else paddle.to_tensor(
                [0.], dtype=paddle.float32)
        global_norm_fp32 = layers.reduce_sum(global_norm_fp32)

        # global norm of non-distributed FP32 params_and_grads for unslice parameters
        global_unslice_fp32 = layers.concat(unslice_params_fp32) if len(
            unslice_params_fp32) != 0 else paddle.to_tensor(
                [0.], dtype=paddle.float32)
        global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32)
        global_unslice_var = global_unslice_fp16 + global_unslice_fp32

        global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var

        # add all reduce to get global norm of distributed params_and_grads
        dev_id = int(self._device.split(":")[1])
        if paddle.device.get_device() == "cpu":
            global_norm_var = global_norm_var.cuda(dev_id)

        with device_guard(dev_id, "gpu"):
            paddle.distributed.all_reduce(global_norm_var, group=self._group)

        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)

        clip_var = layers.elementwise_div(
            x=max_global_norm,
            y=layers.elementwise_max(
                x=global_norm_var, y=max_global_norm))
        clip_var_fp16 = paddle.cast(clip_var, paddle.float16)

        for p, g in params_grads:
            if getattr(p, 'need_clip', True) is False or g is None:
                continue
            origin_state = g.stop_gradient
            g.stop_gradient = True
            if p.dtype == paddle.float16:
                g.scale_(clip_var_fp16.item())
            else:
                g.scale_(clip_var.item())
            g.stop_gradient = origin_state
            # p._reset_grad_inplace_version(True)

        return params_grads
Example No. 23
    def forward(self, x):
        tmp = layers.elementwise_mul(x, x)  # or x ** 2
        tmp1 = layers.sqrt(
            layers.reduce_mean(tmp, dim=1, keep_dim=True) + self.epsilon)

        return x * tmp1
Example No. 24
    def _dygraph_clip(self, params_grads):
        params_and_grads = []

        sum_square_dist_fp16 = []
        sum_square_dist_fp32 = []
        sum_square_not_dist_fp16 = []
        sum_square_not_dist_fp32 = []

        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)

            not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
                hasattr(p, 'is_firstly_shared')
                and getattr(p, 'is_firstly_shared', True))

            if not_shared_enable:
                if p.is_distributed:
                    if p.dtype == paddle.float16:
                        sum_square_dist_fp16.append(sum_square)
                    elif p.dtype == paddle.float32:
                        sum_square_dist_fp32.append(sum_square)
                else:
                    if p.dtype == paddle.float16:
                        sum_square_not_dist_fp16.append(sum_square)
                    elif p.dtype == paddle.float32:
                        sum_square_not_dist_fp32.append(sum_square)

        # global norm of distributed FP16 params_and_grads
        if len(sum_square_dist_fp16) == 0:
            global_norm_dist_fp16 = paddle.to_tensor([0.],
                                                     dtype=paddle.float32)
        else:
            global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16)
            global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16)
            global_norm_dist_fp16 = paddle.cast(global_norm_dist_fp16,
                                                dtype=paddle.float32)

        # global norm of non-distributed FP16 params_and_grads
        if len(sum_square_not_dist_fp16) == 0:
            global_norm_not_dist_fp16 = paddle.to_tensor([0.],
                                                         dtype=paddle.float32)
        else:
            global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16)
            global_norm_not_dist_fp16 = layers.reduce_sum(
                global_norm_not_dist_fp16)
            global_norm_not_dist_fp16 = paddle.cast(global_norm_not_dist_fp16,
                                                    dtype=paddle.float32)

        # global norm of distributed FP32 params_and_grads
        global_norm_dist_fp32 = layers.concat(sum_square_dist_fp32) if len(
            sum_square_dist_fp32) != 0 else paddle.to_tensor(
                [0.], dtype=paddle.float32)
        global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32)

        # global norm of non-distributed FP32 params_and_grads
        global_norm_not_dist_fp32 = layers.concat(
            sum_square_not_dist_fp32
        ) if len(sum_square_not_dist_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
        global_norm_not_dist_fp32 = layers.reduce_sum(
            global_norm_not_dist_fp32)

        global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32
        global_norm_var_not_dist = global_norm_not_dist_fp16 + global_norm_not_dist_fp32

        # add all reduce to get global norm of distributed params_and_grads
        if self._hcg.get_model_parallel_world_size() > 1:
            paddle.distributed.all_reduce(
                global_norm_var_dist,
                group=self._hcg.get_check_parallel_group())

        # add all reduce to get global norm of non-distributed params_and_grads in groups of pp
        if self._hcg.get_pipe_parallel_world_size() > 1:
            paddle.distributed.all_reduce(
                global_norm_var_not_dist,
                group=self._hcg.get_pipe_parallel_group())

        # In sharding mode, params and grads are mapped to different ranks in the optimizer.
        # ClipGradByGlobalNorm needs an allreduce to get the global norm.
        if self._hcg.get_sharding_parallel_world_size() > 1:
            paddle.distributed.all_reduce(
                global_norm_var_not_dist,
                group=self._hcg.get_sharding_parallel_group())

        global_norm_var_fp32 = layers.sqrt(global_norm_var_dist +
                                           global_norm_var_not_dist)

        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var_fp32.dtype, value=self.clip_norm)
        clip_var = layers.elementwise_div(x=max_global_norm,
                                          y=layers.elementwise_max(
                                              x=global_norm_var_fp32,
                                              y=max_global_norm))
        clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            if p.dtype == paddle.float16:
                new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
            else:
                new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

        return params_and_grads