def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce_filter, grad, allreduce):
    """
    Apply allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce_filter (bool): When it is True, allgather is applied to the gradient.
        grad (IndexedSlices): The gradient before operation.
        allreduce (Primitive): The communication operator for gradients.

    Returns:
        IndexedSlices, the gradient after operation.
    """
    if allreduce_filter:
        indices = allgather(grad.indices())
        dout = allgather(grad.values())
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad.values()))
            cast_op = P.Cast()
            mul_op = P.Mul()
            dout = mul_op(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = IndexedSlices(indices, dout, grad.dense_shape())
    return grad

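# The sketch below is an illustrative, framework-free rendering of what the sparse
# reduction above computes; "_ExampleIndexedSlices", "_example_allgather", and the
# two-device numbers are assumptions for demonstration, not MindSpore APIs.
def _example_sparse_allreduce_semantics():
    """AllGather concatenates per-device rows; mean divides the values by degree."""
    from collections import namedtuple
    import numpy as np

    _ExampleIndexedSlices = namedtuple("_ExampleIndexedSlices", ["indices", "values", "dense_shape"])

    def _example_allgather(chunks):
        # Stand-in for the AllGather primitive: concatenate along axis 0.
        return np.concatenate(chunks, axis=0)

    # Two devices each hold a sparse gradient touching some rows of a 4 x 3 weight.
    dev0 = _ExampleIndexedSlices(np.array([0, 2]), np.ones((2, 3)), (4, 3))
    dev1 = _ExampleIndexedSlices(np.array([1, 2]), np.ones((2, 3)), (4, 3))

    degree = 2  # number of devices
    indices = _example_allgather([dev0.indices, dev1.indices])        # [0, 2, 1, 2]
    values = _example_allgather([dev0.values, dev1.values]) / degree  # the mean=True path
    # Duplicate row indices are kept; they are merged later when the update is applied.
    return _ExampleIndexedSlices(indices, values, (4, 3))
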
def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient):
    """Apply weight decay to the sparse gradient."""
    if if_apply:
        indices = gradient.indices()
        values = op_add((op_gather(weight, indices, 0) * weight_decay, gradient.values()))
        shape = gradient.dense_shape()
        return IndexedSlices(indices, values, shape)
    return gradient

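# A minimal NumPy sketch (assumed values, not MindSpore code) of the sparse weight
# decay above: only the rows referenced by the gradient's indices receive the
# weight * weight_decay term, mirroring op_gather(weight, indices, 0) plus op_add.
def _example_sparse_weight_decay_semantics():
    """Decay touches gathered rows only; untouched rows contribute no gradient."""
    import numpy as np

    weight = np.arange(12, dtype=np.float32).reshape(4, 3)
    indices = np.array([0, 2])
    grad_values = np.ones((2, 3), dtype=np.float32)
    weight_decay = 0.01

    decayed_values = weight[indices] * weight_decay + grad_values  # rows 0 and 2 only
    return indices, decayed_values, weight.shape
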
def _tensors_cast_datatype_with_sparse(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): The destination datatype of the gradient.
        grad (IndexedSlices): The gradient before operation.

    Returns:
        IndexedSlices, the gradient after operation.
    """
    dout = F.cast(grad.values(), datatype)
    return IndexedSlices(grad.indices(), dout, grad.dense_shape())

def tensor_grad_scale_with_sparse(scale, grad):
    """Scale the values of the sparse gradient by `scale`."""
    if scale == 1.0:
        return grad
    return IndexedSlices(grad.indices(), grad.values() * scale, grad.dense_shape())

def bprop(x, indices, axis, out, dout):
    """Bprop of gather for a sparse feature: wrap dout as IndexedSlices instead of a dense gradient."""
    return IndexedSlices(indices, dout, x), axis, out

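# Illustrative NumPy check (an assumption, not MindSpore code): the dense equivalent
# of the IndexedSlices returned by this bprop is a scatter-add of dout into the rows
# of x that the forward gather selected along axis 0.
def _example_gather_bprop_semantics():
    """Duplicate indices accumulate, which the sparse representation defers."""
    import numpy as np

    x = np.zeros((4, 3), dtype=np.float32)
    indices = np.array([1, 3, 1])
    dout = np.ones((3, 3), dtype=np.float32)  # gradient w.r.t. x[indices]

    dense_grad = np.zeros_like(x)
    np.add.at(dense_grad, indices, dout)      # row 1 accumulates twice
    return dense_grad                         # column 0 is [0., 2., 0., 1.]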