Example #1
import mindspore.nn as nn
from mindspore import Parameter
from mindspore.ops import operations as P
from mindspore.ops.operations import _inner_ops as inner


class TestScatterAddDynamicNet(nn.Cell):
    def __init__(self, inputx, indices, updates):
        super(TestScatterAddDynamicNet, self).__init__()
        self.scatter_add = P.ScatterAdd()
        self.test_dynamic = inner.GpuConvertToDynamicShape()
        self.inputx = Parameter(inputx, name="inputx")
        self.indices = Parameter(indices, name="indices")
        self.updates = Parameter(updates, name="updates")
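The listing stops at the constructor; below is a minimal sketch of a matching construct, assuming the test converts the stored indices and updates to dynamic shapes before the scatter-add (the original test body may differ):

    def construct(self):
        # Sketch only: convert to dynamic shapes, then accumulate the updates into inputx in place.
        indices = self.test_dynamic(self.indices)
        updates = self.test_dynamic(self.updates)
        return self.scatter_add(self.inputx, indices, updates)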
Example #2
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
                         beta2_power, beta1, beta2, eps, lr, gradient, param, m, v, ps_parameter, cache_enable):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        # Parameter-server path: push the sparse gradient and hyper-parameters to the
        # server and pull the updated weight back into param.
        op_shape = P.Shape()
        shapes = (op_shape(param), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices), shapes), param))
        return success

    if not target:
        # Delegate the whole sparse update to the dedicated sparse optimizer kernel.
        success = F.depend(success, sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices))
    else:
        # Otherwise compose the sparse Adam update from elementwise and scatter primitives.
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)

        # Decay the running moments in place: m <- beta1 * m, v <- beta2 * v.
        success = F.depend(success, F.assign(m, op_mul(beta1, m)))
        success = F.depend(success, F.assign(v, op_mul(beta2, v)))

        grad_indices = gradient.indices
        grad_value = gradient.values

        # Accumulate (1 - beta1) * grad into the rows of m selected by the sparse indices.
        next_m = scatter_add(m,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))

        # Accumulate (1 - beta2) * grad^2 into the corresponding rows of v.
        next_v = scatter_add(v,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))

        if use_nesterov:
            # Nesterov variant: temporarily overwrite m with the look-ahead moment
            # beta1 * next_m + (1 - beta1) * grad, use it for the update, then restore
            # m from m_temp (_scaler_one and _scaler_ten are module-level scalar Tensors).
            m_temp = next_m * _scaler_ten
            F.assign(m, op_mul(beta1, next_m))
            div_value = scatter_add(m,
                                    op_mul(grad_indices, _scaler_one),
                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
            param_update = div_value / (op_sqrt(next_v) + eps)
            F.assign(m, m_temp / _scaler_ten)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        # Bias-corrected learning rate and the resulting parameter update.
        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
        next_param = param - lr_t * param_update

        success = F.depend(success, F.assign(param, next_param))
        success = F.depend(success, F.assign(m, next_m))
        success = F.depend(success, F.assign(v, next_v))

    return success
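Examples #2 and #3 lean on the in-place semantics of P.ScatterAdd: the first input must be a Parameter, the selected rows are accumulated in place, and duplicate indices are summed. The following is a minimal, self-contained sketch of that behaviour (the class and variable names are illustrative, not taken from the source above):

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.ops import operations as P

class ScatterAddDemo(nn.Cell):
    def __init__(self):
        super(ScatterAddDemo, self).__init__()
        self.scatter_add = P.ScatterAdd()
        # ScatterAdd writes into its first input, so it has to be a Parameter.
        self.var = Parameter(Tensor(np.zeros((3, 2), np.float32)), name="var")

    def construct(self, indices, updates):
        return self.scatter_add(self.var, indices, updates)

net = ScatterAddDemo()
indices = Tensor(np.array([0, 1, 0], np.int32))   # row 0 is addressed twice
updates = Tensor(np.ones((3, 2), np.float32))
print(net(indices, updates))
# row 0 -> 2.0 (the two contributions are summed), row 1 -> 1.0, row 2 stays 0.0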
Example #3
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking,
                         use_nesterov, target, beta1_power, beta2_power, beta1,
                         beta2, eps, lr, gradient, params, m, v, ps_parameter,
                         cache_enable):
    """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        shapes = (op_shape(params), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr),
                  op_shape(beta1), op_shape(beta2), op_shape(eps),
                  op_shape(values), op_shape(indices))
        success = F.depend(
            success,
            pull(
                push((beta1_power, beta2_power, lr, beta1, beta2, eps, values,
                      indices), shapes), params))
        return success

    if not target:
        success = F.depend(
            success,
            sparse_opt(params, m, v, beta1_power, beta2_power, lr, beta1,
                       beta2, eps, values, indices))
    else:
        op_gather = P.Gather()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)
        scatter_update = P.ScatterUpdate(use_locking)

        # Lazy update: gather only the moment rows addressed by the sparse indices.
        m_slice = op_gather(m, indices, 0)
        v_slice = op_gather(v, indices, 0)

        # Adam moment updates computed on the gathered rows only.
        next_m = m_slice * beta1 + values * (1 - beta1)
        next_v = v_slice * beta2 + values * values * (1 - beta2)

        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)

        if use_nesterov:
            m_temp = beta1 * next_m + values * (1 - beta1)
            param_update = m_temp / (op_sqrt(next_v) + eps)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        # Scatter the results back: the touched parameter rows get -lr_t * param_update
        # added in place, while the corresponding moment rows are overwritten.
        success = F.depend(success,
                           scatter_add(params, indices, -lr_t * param_update))
        success = F.depend(success, scatter_update(m, indices, next_m))
        success = F.depend(success, scatter_update(v, indices, next_v))

    return success
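Example #3 uses both scatter primitives: ScatterAdd to accumulate -lr_t * param_update into the touched parameter rows, and ScatterUpdate to overwrite the moment rows. A small sketch (names are illustrative, not from the source) showing the difference:

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.ops import operations as P

class ScatterCompare(nn.Cell):
    def __init__(self):
        super(ScatterCompare, self).__init__()
        self.scatter_add = P.ScatterAdd()
        self.scatter_update = P.ScatterUpdate()
        self.added = Parameter(Tensor(np.ones((3, 2), np.float32)), name="added")
        self.replaced = Parameter(Tensor(np.ones((3, 2), np.float32)), name="replaced")

    def construct(self, indices, updates):
        a = self.scatter_add(self.added, indices, updates)        # selected rows accumulate
        r = self.scatter_update(self.replaced, indices, updates)  # selected rows are overwritten
        return a, r

indices = Tensor(np.array([0, 2], np.int32))
updates = Tensor(np.full((2, 2), 5.0, np.float32))
add_out, update_out = ScatterCompare()(indices, updates)
# add_out rows 0 and 2 become 6.0 (1 + 5); update_out rows 0 and 2 become exactly 5.0.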
Example #4
class TestScatterAddNet(nn.Cell):
    def __init__(self, lock, inputx, indices, updates):
        super(TestScatterAddNet, self).__init__()
        self.scatter_add = P.ScatterAdd(use_locking=lock)
        self.inputx = Parameter(inputx, name="inputx")
        self.indices = Parameter(indices, name="indices")
        self.updates = Parameter(updates, name="updates")
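Here use_locking=lock maps to ScatterAdd's use_locking attribute, which protects the in-place assignment with a lock when the Parameter may be written concurrently. A sketch of the missing construct (the original test body is not shown):

    def construct(self):
        # Sketch only: apply the (optionally locked) in-place scatter-add.
        return self.scatter_add(self.inputx, self.indices, self.updates)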
Example #5
class TestScatterAddDynamicNet2(nn.Cell):
    def __init__(self):
        super(TestScatterAddDynamicNet2, self).__init__()
        self.scatter_add = P.ScatterAdd()
        self.test_dynamic = inner.GpuConvertToDynamicShape()
Example #6
class ScatterAddNet(nn.Cell):
    def __init__(self, input_x):
        super(ScatterAddNet, self).__init__()
        self.input_x = Parameter(input_x, name="para")
        self.scatter_add = P.ScatterAdd()
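Unlike examples #1 and #4, this network stores only the target Parameter, so a plausible construct (a sketch; not shown in the listing) takes the indices and updates as call-time inputs:

    def construct(self, indices, updates):
        # Sketch only: accumulate the given updates into the stored Parameter.
        return self.scatter_add(self.input_x, indices, updates)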
Example #7
class TestScatterAddDynamicNet2(nn.Cell):
    def __init__(self, inputx):
        super(TestScatterAddDynamicNet2, self).__init__()
        self.scatter_add = P.ScatterAdd()
        self.test_dynamic = inner.GpuConvertToDynamicShape()
        self.inputx = Parameter(inputx, name="inputx")