def backward_and_update_half(self, loss, threshold=2097152, clipping=False, clip_Value=100):
    """Backward pass plus parameter update, communicating gradients in FP16.

    THIS IS AN EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSE:
    Gradients are converted to 16-bit half precision format before the
    all-reduce.  Gradients whose size is below ``threshold`` are batched
    together and reduced in one fused call; larger ones are reduced on
    their own.  Optionally clips every gradient to
    ``[-clip_Value, clip_Value]`` to assist training.

    Args:
        loss(Tensor): objective function output to differentiate.
        threshold(int): size cut-off (in elements) controlling when
            gradients are fused versus reduced directly.
        clipping(bool): whether to clip each gradient value.
        clip_Value(float): clip bound used when ``clipping`` is True.
    """
    param_grad_pairs = []
    pending = []       # small gradients waiting to be fused
    pending_size = 0   # total element count of the pending gradients
    for param, grad in autograd.backward(loss):
        if clipping:
            grad = autograd.clip(grad, -clip_Value, clip_Value)
        if grad.size() > threshold:
            # large gradient: reduce it directly, no fusion
            self.all_reduce_half(grad.data)
        else:
            # small gradient: batch it and flush once the batch is big enough
            pending.append(grad.data)
            pending_size += grad.size()
            if pending_size > threshold:
                self.fused_all_reduce_half(pending)
                pending, pending_size = [], 0
        param_grad_pairs.append((param, grad))
    # flush whatever small gradients remain
    if pending:
        self.fused_all_reduce_half(pending)
    self.wait()
    for param, grad in param_grad_pairs:
        self.update(param, grad)
def backward_and_update_half(self, loss, threshold=2097152, clipping=False, clip_Value=100):
    """Performs backward propagation and parameter update, with FP16 precision communication.

    THIS IS AN EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSE:
    From the loss, it performs backward propagation to get the gradients and
    does the parameter update. For gradient communication, it fuses all the
    tensors smaller than the threshold value to reduce network latency, as
    well as converting them to FP16 half precision format before sending them
    out. To assist training, this function provides an option to perform
    gradient clipping.

    Args:
        loss(Tensor): loss is the objective function of the deep learning model
            optimization, e.g. for classification problem it can be the output
            of the softmax_cross_entropy function.
        threshold(int): threshold is a parameter to control performance in
            fusing the tensors. For the tensors of sizes smaller than
            threshold, they are to be accumulated and fused before the all
            reduce operation. For the tensors of its size larger than the
            threshold value, they are to be reduced directly without fusion.
        clipping(bool): a boolean flag to choose whether to clip the gradient
            value.
        clip_Value(float): the clip value to be used when clipping is True.
    """
    plist = []
    acc = 0
    glist = []
    for p, g in autograd.backward(loss):
        # FP16 communication only makes sense for FP32 inputs that get
        # down-converted before transmission.
        assert p.dtype == tensor.float32, (
            'This function is only available for input tensor precision 32 bit, '
            'which are converted into 16 bits before transmit')
        if clipping:
            g = autograd.clip(g, -clip_Value, clip_Value)
        if g.size() > threshold:
            # larger than threshold -> reduced directly
            self.all_reduce_half(g.data)
        else:
            # smaller than threshold -> accumulate, then fuse in one call
            # (BUGFIX: previously each small gradient was also sent
            # individually via fused_all_reduce_half([g.data], send=False),
            # reducing it twice and defeating the fusion described above)
            glist.append(g.data)
            acc += g.size()
            if acc > threshold:
                self.fused_all_reduce_half(glist)
                acc = 0
                glist = []
        plist.append((p, g))
    # flush the remaining accumulated gradients
    if glist:
        self.fused_all_reduce_half(glist)
    self.wait()
    for p, g in plist:
        self.update(p, g)
    # BUGFIX: removed trailing self.opt.step() — every parameter has already
    # been updated by self.update(p, g) above, so stepping again applied a
    # duplicate update (and self.opt is not defined by this optimizer).
def test_clip(self):
    """Round-trips autograd.clip through ONNX and checks both paths agree.

    Builds a small FP32 tensor, clips it with autograd, exports the graph to
    an ONNX model (frontend), re-imports and re-runs it (backend), and asserts
    the two results match to 5 decimal places.
    """
    x = np.array([-0.9, -0.3, -0.1, 0.1, 0.5, 0.9]).reshape(3, 2).astype(np.float32)
    x = tensor.from_numpy(x)
    # renamed from `min`/`max` to avoid shadowing the Python builtins
    min_val = -0.5
    max_val = 0.5
    x.to_device(gpu_dev)
    y = autograd.clip(x, min_val, max_val)

    # frontend: export the clip computation to an ONNX model
    model = sonnx.to_onnx([x, min_val, max_val], [y])
    # print('The model is:\n{}'.format(model))

    # backend: rebuild the graph from the model and execute it
    sg_ir = sonnx.prepare(model, device=gpu_dev)
    y_t = sg_ir.run([x, min_val, max_val])

    np.testing.assert_array_almost_equal(tensor.to_numpy(y),
                                         tensor.to_numpy(y_t[0]),
                                         decimal=5)