def bprop(x, out, dout):
    if mean_flag:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            dx = all_reduce(dout)
            float_one = F.scalar_cast(1.0, F.dtype(dx))
            num = F.scalar_cast(dev_num, F.dtype(dx))
            dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
        else:
            indices = all_gather(dout.indices)
            grad = all_gather(dout.values)
            float_one = F.scalar_cast(1.0, F.dtype(grad))
            num = F.scalar_cast(dev_num, F.dtype(grad))
            grad = mul(grad, cast(F.scalar_to_array(float_one / num), F.dtype(grad)))
            dx = RowTensor(indices, grad, dout.dense_shape)
    else:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            dx = all_reduce(dout)
        else:
            indices = all_gather(dout.indices)
            grad = all_gather(dout.values)
            dx = RowTensor(indices, grad, dout.dense_shape)
    return (dx,)
def bprop(x, z, out, dout):
    if mean_flag:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                z = F.depend(z, F.assign_add(z, dout))
                real_grad = all_reduce(z)
                dx = real_grad
            else:
                dx = dout
            float_one = F.scalar_cast(1.0, F.dtype(dx))
            num = F.scalar_cast(dev_num, F.dtype(dx))
            dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
        else:
            # Gradient accumulation does not support row tensors yet.
            dx = zeros_like(x)
    else:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                z = F.depend(z, F.assign_add(z, dout))
                real_grad = all_reduce(z)
                dx = real_grad
            else:
                dx = dout
        else:
            # Gradient accumulation does not support row tensors yet.
            dx = zeros_like(x)
    return (dx, zeros_like(z))
def bprop(x, out, dout):
    if mean_flag:
        dx = all_reduce(dout)
        float_one = F.scalar_cast(1.0, F.dtype(dx))
        num = F.scalar_cast(dev_num, F.dtype(dx))
        dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
    else:
        dx = all_reduce(dout)
    return (dx,)
def _gauss_kernel_helper(filter_size):
    """gauss kernel helper"""
    filter_size = F.scalar_cast(filter_size, mstype.int32)
    coords = ()
    for i in range(filter_size):
        i_cast = F.scalar_cast(i, mstype.float32)
        offset = F.scalar_cast(filter_size - 1, mstype.float32) / 2.0
        element = i_cast - offset
        coords = coords + (element,)
    g = np.square(coords).astype(np.float32)
    g = Tensor(g)
    return filter_size, g
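# Illustrative, plain-NumPy sketch (not part of the original source; the helper
# name below is hypothetical) of what _gauss_kernel_helper computes: the squared,
# centered offsets for a window of the given size. For filter_size = 3 the
# offsets are (-1.0, 0.0, 1.0) and g becomes [1., 0., 1.].
import numpy as np

def _gauss_kernel_coords_sketch(filter_size=3):
    offsets = [i - (filter_size - 1) / 2.0 for i in range(filter_size)]
    return np.square(np.array(offsets, dtype=np.float32))

# _gauss_kernel_coords_sketch(3) -> array([1., 0., 1.], dtype=float32)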
def construct(self, img1, img2): _check_input_4d(F.shape(img1), "img1", self.cls_name) _check_input_4d(F.shape(img2), "img2", self.cls_name) _check_input_dtype(F.dtype(img1), 'img1', mstype.number_type, self.cls_name) P.SameTypeShape()(img1, img2) dtype_max_val = _get_dtype_max(F.dtype(img1)) max_val = F.scalar_cast(self.max_val, F.dtype(img1)) max_val = _convert_img_dtype_to_float32(max_val, dtype_max_val) img1 = _convert_img_dtype_to_float32(img1, dtype_max_val) img2 = _convert_img_dtype_to_float32(img2, dtype_max_val) c1 = (self.k1 * max_val)**2 c2 = (self.k2 * max_val)**2 sim = () mcs = () for i in range(self.level): sim, cs = _compute_multi_channel_loss(c1, c2, img1, img2, self.multi_convs_list[i], self.concat, self.reduce_mean) mcs += (self.relu(cs), ) img1, img2 = _downsample(img1, img2, self.avg_pool) mcs = mcs[0:-1:1] mcs_and_ssim = self.pack(mcs + (self.relu(sim), )) mcs_and_ssim = self.pow(mcs_and_ssim, self.weight_tensor) ms_ssim = self.prod(mcs_and_ssim, -1) loss = self.reduce_mean(ms_ssim, -1) return loss
def _tensors_allreduce_ps(degree, mean, allgather, allreduce, allreduce_filter, grad, ps_parameter):
    """
    Apply allreduce on gradient.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is True, allreduce is applied.
        grad (Tensor): The gradient tensor before operation.
        ps_parameter (bool): Use parameter server or not.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if ps_parameter:
        return grad

    if allreduce_filter:
        grad = allreduce(grad)
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad))
            cast_op = P.Cast()
            mul_op = P.Mul()
            grad = mul_op(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
        return grad
    return grad
def _tensors_allreduce_with_sparse_ps(degree, mean, allgather, allreduce, allreduce_filter, grad, ps_parameter):
    """
    Apply allgather on gradient instead of allreduce for sparse features.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is True, allgather is applied.
        grad (tuple): The indices, gradient tensor and tensor_shape before operation.
        ps_parameter (bool): Use parameter server or not.

    Returns:
        RowTensor, the gradient after operation.
    """
    if ps_parameter:
        return grad

    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad.values))
            cast_op = P.Cast()
            mul_op = P.Mul()
            dout = mul_op(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad
def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce_filter, grad, allreduce):
    """
    Apply allgather on gradient instead of allreduce for sparse features.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce_filter (bool): When it is True, allgather is applied.
        grad (IndexedSlices): The gradient before operation.
        allreduce (Primitive): The communication operator for gradients.

    Returns:
        IndexedSlices, the gradient after operation.
    """
    if allreduce_filter:
        indices = allgather(grad.indices())
        dout = allgather(grad.values())
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad.values()))
            cast_op = P.Cast()
            mul_op = P.Mul()
            dout = mul_op(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = IndexedSlices(indices, dout, grad.dense_shape())
    return grad
def bprop(x, y, z, out, dout):
    do_mirror = equal(y, grad_accumulation_step)
    do_mirror = reshape(do_mirror, (()))
    if mean_flag:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                tmp = z + dout
                real_grad = all_reduce(tmp)
                dx = real_grad - z
            else:
                dx = dout
            float_one = F.scalar_cast(1.0, F.dtype(dx))
            num = F.scalar_cast(dev_num, F.dtype(dx))
            dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
        else:
            if do_mirror:
                indices = all_gather(dout.indices)
                grad = all_gather(dout.values)
            else:
                indices = dout.indices
                grad = dout.values
            float_one = F.scalar_cast(1.0, F.dtype(grad))
            num = F.scalar_cast(dev_num, F.dtype(grad))
            grad = mul(grad, cast(F.scalar_to_array(float_one / num), F.dtype(grad)))
            dx = RowTensor(indices, grad, dout.dense_shape)
    else:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                tmp = z + dout
                real_grad = all_reduce(tmp)
                dx = real_grad - z
            else:
                dx = dout
        else:
            if do_mirror:
                indices = all_gather(dout.indices)
                grad = all_gather(dout.values)
            else:
                indices = dout.indices
                grad = dout.values
            dx = RowTensor(indices, grad, dout.dense_shape)
    return (dx, zeros_like(y), zeros_like(z))
def _convert_img_dtype_to_float32(img, max_val):
    """convert img dtype to float32"""
    # Usually max_val is 1.0 or 255; we scale the pixel values if max_val > 1.
    # and just cast otherwise.
    ret = F.cast(img, mstype.float32)
    max_val = F.scalar_cast(max_val, mstype.float32)
    if max_val > 1.:
        scale = 1. / max_val
        ret = ret * scale
    return ret
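# Minimal NumPy sketch (assumed example, not from the original source; the
# helper name is hypothetical) mirroring _convert_img_dtype_to_float32: a uint8
# image with max_val = 255 is cast to float32 and rescaled into [0, 1], while a
# float image with max_val = 1.0 is only cast.
import numpy as np

def _convert_sketch(img, max_val):
    ret = img.astype(np.float32)
    if float(max_val) > 1.0:
        ret = ret * (1.0 / float(max_val))
    return ret

# _convert_sketch(np.array([0, 128, 255], dtype=np.uint8), 255)
# -> approximately [0.0, 0.502, 1.0]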
def construct(self, img1, img2): _check_input_4d(F.shape(img1), "img1", self.cls_name) _check_input_4d(F.shape(img2), "img2", self.cls_name) P.SameTypeShape()(img1, img2) dtype_max_val = _get_dtype_max(F.dtype(img1)) max_val = F.scalar_cast(self.max_val, F.dtype(img1)) max_val = _convert_img_dtype_to_float32(max_val, dtype_max_val) img1 = _convert_img_dtype_to_float32(img1, dtype_max_val) img2 = _convert_img_dtype_to_float32(img2, dtype_max_val) mse = P.ReduceMean()(F.square(img1 - img2), (-3, -2, -1)) psnr = 10 * P.Log()(F.square(max_val) / mse) / F.scalar_log(10.0) return psnr
def construct(self, img1, img2):
    _check_input_dtype(F.dtype(img1), "img1", [mstype.float32, mstype.float16], self.cls_name)
    _check_input_filter_size(F.shape(img1), "img1", self.filter_size, self.cls_name)
    P.SameTypeShape()(img1, img2)
    dtype_max_val = _get_dtype_max(F.dtype(img1))
    max_val = F.scalar_cast(self.max_val, F.dtype(img1))
    max_val = _convert_img_dtype_to_float32(max_val, dtype_max_val)
    img1 = _convert_img_dtype_to_float32(img1, dtype_max_val)
    img2 = _convert_img_dtype_to_float32(img2, dtype_max_val)

    c1 = (self.k1 * max_val) ** 2
    c2 = (self.k2 * max_val) ** 2

    ssim_ave_channel, _ = _compute_multi_channel_loss(c1, c2, img1, img2,
                                                      self.conv, self.concat, self.reduce_mean)
    loss = self.reduce_mean(ssim_ave_channel, -1)
    return loss
def _tensors_allreduce_mean(mul, degree, allreduce, parameters):
    """
    Apply allreduce on parameters.

    Args:
        mul (Primitive): The mul operator for parameters.
        degree (int): The mean coefficient.
        allreduce (Primitive): The communication operator for parameters.
        parameters (Tensor): The parameters before operation.

    Returns:
        Tensor, the parameters after operation.
    """
    degree = F.scalar_cast(degree, F.dtype(parameters))
    parameters = allreduce(parameters)
    cast_op = P.Cast()
    return mul(parameters, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(parameters)))
def _tensors_allreduce_mean(mul, degree, allreduce_filter, grad):
    """
    Apply mean and allreduce on gradient. Allreduce is a communication operation
    used for distributed deep learning.

    Args:
        mul (Primitive): The multiplication operator used to scale the gradient by 1/degree.
        degree (int): The mean coefficient.
        allreduce_filter (bool): When it is True, allreduce is applied.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if allreduce_filter:
        degree = F.scalar_cast(degree, F.dtype(grad))
        grad = _all_reduce(grad)
        cast_op = P.Cast()
        return mul(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
    return grad
def _tensors_allreduce_mean_with_sparse(mul, degree, allreduce_filter, grad):
    """
    Apply mean and allgather on gradient instead of allreduce for sparse features.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        mul (Primitive): The multiplication operator used to scale the gradient by 1/degree.
        degree (int): The mean coefficient.
        allreduce_filter (bool): When it is True, allgather is applied.
        grad (Tuple): The indices, gradient tensor and tensor_shape before operation.

    Returns:
        Tuple, containing the indices, the gradient tensor and tensor_shape after operation.
    """
    if allreduce_filter:
        indices = _all_gather(grad[0])
        degree = F.scalar_cast(degree, F.dtype(grad[1]))
        dout = _all_gather(grad[1])
        cast_op = P.Cast()
        dout = mul(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = (indices, dout, grad[2])
    return grad
def _tensors_allreduce(degree, mean, allgather, allreduce, allreduce_filter, grad):
    """
    Apply allreduce on gradient.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is True, allreduce is applied.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if allreduce_filter:
        grad = allreduce(grad)
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad))
            grad = F.tensor_mul(grad, F.cast(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
        return grad
    return grad
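# Hedged NumPy sketch (assumed example, not from the original source; the helper
# name is hypothetical) of the mean step used by the allreduce helpers above:
# allreduce sums the gradients from all devices, and multiplying the sum by
# 1.0 / degree turns it into the average gradient.
import numpy as np

def _allreduce_mean_sketch(per_device_grads):
    degree = len(per_device_grads)              # number of devices
    summed = np.sum(per_device_grads, axis=0)   # what a sum-allreduce produces
    return summed * (1.0 / degree)              # the mean scaling applied above

# _allreduce_mean_sketch([np.array([2.0, 4.0]), np.array([4.0, 8.0])])
# -> array([3., 6.])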
def fx_cast(x):
    output = F.scalar_cast(x, input_t)
    return output
def _tensors_allreduce_mean(mul, degree, grad):
    """Apply allreduce on the gradient and scale it by 1/degree."""
    degree = F.scalar_cast(degree, F.dtype(grad))
    grad = _all_reduce_A(grad)
    cast_op = P.Cast()
    return mul(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
def fn_cast(x, t):
    output = F.scalar_cast(x, t)
    return output