def bbox_ciou(self, boxes1_x0y0x1y1, boxes2_x0y0x1y1):
    '''
    Compute CIoU = iou - p2/c2 - av
    :param boxes1: (batch_size, num_priors, 4)   pred_x0y0x1y1
    :param boxes2: (batch_size, num_priors, 4)   label_x0y0x1y1
    :return:
    '''
    # Convert to center coordinates and width/height
    boxes1 = P.concat([(boxes1_x0y0x1y1[:, :, :2] + boxes1_x0y0x1y1[:, :, 2:]) * 0.5,
                       boxes1_x0y0x1y1[:, :, 2:] - boxes1_x0y0x1y1[:, :, :2]], axis=-1)
    boxes2 = P.concat([(boxes2_x0y0x1y1[:, :, :2] + boxes2_x0y0x1y1[:, :, 2:]) * 0.5,
                       boxes2_x0y0x1y1[:, :, 2:] - boxes2_x0y0x1y1[:, :, :2]], axis=-1)

    # Areas of the two boxes
    boxes1_area = (boxes1_x0y0x1y1[:, :, 2] - boxes1_x0y0x1y1[:, :, 0]) * (
        boxes1_x0y0x1y1[:, :, 3] - boxes1_x0y0x1y1[:, :, 1])
    boxes2_area = (boxes2_x0y0x1y1[:, :, 2] - boxes2_x0y0x1y1[:, :, 0]) * (
        boxes2_x0y0x1y1[:, :, 3] - boxes2_x0y0x1y1[:, :, 1])

    # Top-left and bottom-right corners of the intersection
    left_up = P.elementwise_max(boxes1_x0y0x1y1[:, :, :2], boxes2_x0y0x1y1[:, :, :2])
    right_down = P.elementwise_min(boxes1_x0y0x1y1[:, :, 2:], boxes2_x0y0x1y1[:, :, 2:])

    # Intersection area inter_area, then iou
    inter_section = P.relu(right_down - left_up)
    inter_area = inter_section[:, :, 0] * inter_section[:, :, 1]
    union_area = boxes1_area + boxes2_area - inter_area
    iou = inter_area / union_area

    # Top-left and bottom-right corners of the smallest enclosing box
    enclose_left_up = P.elementwise_min(boxes1_x0y0x1y1[:, :, :2], boxes2_x0y0x1y1[:, :, :2])
    enclose_right_down = P.elementwise_max(boxes1_x0y0x1y1[:, :, 2:], boxes2_x0y0x1y1[:, :, 2:])

    # Squared diagonal of the enclosing box
    enclose_wh = enclose_right_down - enclose_left_up
    enclose_c2 = P.pow(enclose_wh[:, :, 0], 2) + P.pow(enclose_wh[:, :, 1], 2)

    # Squared distance between the two box centers
    p2 = P.pow(boxes1[:, :, 0] - boxes2[:, :, 0], 2) + P.pow(
        boxes1[:, :, 1] - boxes2[:, :, 1], 2)

    # Add the av term. The denominator boxes2[:, :, 3] may be 0,
    # so a tiny constant is added to avoid nan.
    atan1 = P.atan(boxes1[:, :, 2] / (boxes1[:, :, 3] + 1e-9))
    atan2 = P.atan(boxes2[:, :, 2] / (boxes2[:, :, 3] + 1e-9))
    v = 4.0 * P.pow(atan1 - atan2, 2) / (math.pi ** 2)
    a = v / (1 - iou + v)

    ciou = iou - 1.0 * p2 / enclose_c2 - 1.0 * a * v
    return ciou
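# Illustrative only: a minimal NumPy sanity check of the CIoU computation above
# for a single pair of corner-format boxes. It mirrors the Paddle graph term by
# term (IoU, enclosing-box diagonal, center distance, aspect-ratio penalty);
# the helper name ciou_numpy is made up for this sketch, it is not model code.
import math
import numpy as np

def ciou_numpy(b1, b2, eps=1e-9):
    """CIoU for two [x0, y0, x1, y1] boxes; mirrors bbox_ciou above."""
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    iw = max(0.0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
    ih = max(0.0, min(b1[3], b2[3]) - max(b1[1], b2[1]))
    inter = iw * ih
    iou = inter / (area1 + area2 - inter)
    # squared diagonal of the enclosing box
    cw = max(b1[2], b2[2]) - min(b1[0], b2[0])
    ch = max(b1[3], b2[3]) - min(b1[1], b2[1])
    c2 = cw ** 2 + ch ** 2
    # squared distance between box centers
    p2 = ((b1[0] + b1[2]) / 2 - (b2[0] + b2[2]) / 2) ** 2 \
       + ((b1[1] + b1[3]) / 2 - (b2[1] + b2[3]) / 2) ** 2
    # aspect-ratio term
    w1, h1 = b1[2] - b1[0], b1[3] - b1[1]
    w2, h2 = b2[2] - b2[0], b2[3] - b2[1]
    v = 4.0 * (math.atan(w1 / (h1 + eps)) - math.atan(w2 / (h2 + eps))) ** 2 / math.pi ** 2
    a = v / (1 - iou + v)
    return iou - p2 / c2 - a * v

print(ciou_numpy(np.array([0., 0., 2., 2.]), np.array([1., 1., 3., 3.])))  # ~0.0317 = 1/7 - 2/18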
def forward(self, pred, target):
    '''
    Boxes are given in cx cy w h format.
    '''
    assert pred.shape[0] == target.shape[0]

    pred = L.reshape(pred, [-1, 4])
    target = L.reshape(target, [-1, 4])
    pred = L.cast(pred, 'float32')
    target = L.cast(target, 'float32')

    # Top-left corner of the intersection
    tl = L.elementwise_max((pred[:, :2] - pred[:, 2:] / 2),
                           (target[:, :2] - target[:, 2:] / 2))
    # Bottom-right corner of the intersection
    br = L.elementwise_min((pred[:, :2] + pred[:, 2:] / 2),
                           (target[:, :2] + target[:, 2:] / 2))

    area_p = paddle.prod(pred[:, 2:], 1)    # area of the predicted box
    area_g = paddle.prod(target[:, 2:], 1)  # area of the gt box

    # Does the intersection exist?
    # en = (tl < br).type(tl.type()).prod(dim=1)
    en = L.cast(tl < br, 'float32')
    en = paddle.prod(en, 1)  # 1.0 only where the boxes actually overlap
    area_i = paddle.prod(br - tl, 1) * en
    area_u = area_p + area_g - area_i
    iou = (area_i) / (area_u + 1e-16)

    if self.loss_type == "iou":
        loss = 1 - iou ** 2
    elif self.loss_type == "giou":
        c_tl = L.elementwise_min((pred[:, :2] - pred[:, 2:] / 2),
                                 (target[:, :2] - target[:, 2:] / 2))
        c_br = L.elementwise_max((pred[:, :2] + pred[:, 2:] / 2),
                                 (target[:, :2] + target[:, 2:] / 2))
        area_c = paddle.prod(c_br - c_tl, 1)
        # Clamp area_c to [1e-16, np.inf]
        area_c = L.clip(area_c, 1e-16, np.inf)
        giou = iou - (area_c - area_u) / area_c
        # Clamp giou to [-1.0, 1.0]
        giou = L.clip(giou, -1.0, 1.0)
        loss = 1 - giou

    if self.reduction == "mean":
        loss = loss.mean()
    elif self.reduction == "sum":
        loss = loss.sum()

    return loss
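# The `en` mask above zeroes out pairs whose "intersection" corners are
# inverted, i.e. disjoint boxes. Without it, (br - tl) is negative on both
# axes and the product can come out positive, giving a bogus area. A small
# NumPy illustration with made-up cx cy w h boxes (the second pair is disjoint):
import numpy as np

pred   = np.array([[1.0, 1.0, 2.0, 2.0], [0.0, 0.0, 1.0, 1.0]])
target = np.array([[1.5, 1.5, 2.0, 2.0], [5.0, 5.0, 1.0, 1.0]])

tl = np.maximum(pred[:, :2] - pred[:, 2:] / 2, target[:, :2] - target[:, 2:] / 2)
br = np.minimum(pred[:, :2] + pred[:, 2:] / 2, target[:, :2] + target[:, 2:] / 2)

en = (tl < br).all(axis=1).astype('float32')  # overlap-exists mask
area_i = np.prod(br - tl, axis=1) * en
print(area_i)  # [2.25 0.  ] -- without en, the disjoint pair would report 16.0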
def forward(self, pred, target):
    target = 1 - target[:, 0]
    batch_size, vector_size = pred.shape[0], pred.shape[1]
    pred = L.l2_normalize(pred, axis=1, epsilon=1e-10)

    # Pairwise squared distances via ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2
    square_norm = L.reduce_sum(L.square(pred), dim=1)
    dist = L.elementwise_add(-2.0 * L.matmul(pred, pred, transpose_y=True),
                             square_norm, axis=0)
    dist = L.elementwise_add(dist, square_norm, axis=1)
    dist = L.elementwise_max(dist, L.zeros_like(dist))
    dist = L.sqrt(dist)

    ap_dist = L.reshape(dist, (0, 0, 1))
    an_dist = L.reshape(dist, (0, 1, -1))
    loss = L.expand(ap_dist, (1, 1, batch_size)) - L.expand(
        an_dist, (1, batch_size, 1)) + self.margin

    indice_equal = L.diag(
        L.fill_constant((batch_size, ), dtype='float32', value=1.0))
    indice_not_equal = 1.0 - indice_equal

    broad_matrix = L.expand(L.reshape(target, (-1, 1)),
                            (1, batch_size)) + L.expand(
                                L.reshape(target, (1, -1)), (batch_size, 1))

    pp = L.cast(L.equal(broad_matrix, L.zeros_like(broad_matrix)),
                dtype='float32')
    pp = L.reshape(indice_not_equal * pp, (0, 0, 1))

    pn = L.cast(L.equal(broad_matrix, L.zeros_like(broad_matrix) + 1),
                dtype='float32')
    pn = L.reshape(indice_not_equal * pn, (1, 0, -1))

    apn = L.expand(pp, (1, 1, batch_size)) * L.expand(pn, (batch_size, 1, 1))
    loss = loss * L.cast(apn, dtype='float32')
    loss = L.elementwise_max(loss, L.zeros_like(loss))

    num_tri = L.reduce_sum(
        L.cast(L.greater_than(loss, L.zeros_like(loss)), dtype='float32'))
    loss = L.reduce_sum(loss) * self.loss_weight / (num_tri + 1e-16)
    return loss
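# The `dist` computation above gets all pairwise distances from one Gram
# matrix using the identity ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2, instead
# of an O(n^2) loop. A quick NumPy check of that identity (illustrative only):
import numpy as np

x = np.random.rand(5, 8).astype('float32')

sq = (x ** 2).sum(axis=1)
d2 = -2.0 * x @ x.T + sq[:, None] + sq[None, :]
d2 = np.maximum(d2, 0.0)  # clamp tiny negatives caused by rounding

# brute-force reference
ref = ((x[:, None, :] - x[None, :, :]) ** 2).sum(axis=-1)
print(np.allclose(d2, ref, atol=1e-5))  # True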
def intersect(box_a, box_b):  # intersection area
    """ We resize both tensors to [A,B,2] without new malloc:
    [A,2] -> [A,1,2] -> [A,B,2]
    [B,2] -> [1,B,2] -> [A,B,2]
    Then we compute the area of intersect between box_a and box_b.
    Args:
      box_a: (tensor) bounding boxes, Shape: [n,A,4].
      box_b: (tensor) bounding boxes, Shape: [n,B,4].
    Return:
      (tensor) intersection area, Shape: [n,A,B].
    """
    n = P.shape(box_a)[0]
    A = P.shape(box_a)[1]
    B = P.shape(box_b)[1]
    box_a = P.reshape(box_a, (n, A, 1, 4))
    box_b = P.reshape(box_b, (n, 1, B, 4))
    expand_box_a = P.expand(box_a, [1, 1, B, 1])
    expand_box_b = P.expand(box_b, [1, A, 1, 1])
    # Top-left and bottom-right corners of the intersection
    left_up = P.elementwise_max(expand_box_a[:, :, :, :2],
                                expand_box_b[:, :, :, :2])
    right_down = P.elementwise_min(expand_box_a[:, :, :, 2:],
                                   expand_box_b[:, :, :, 2:])
    inter_section = P.relu(right_down - left_up)
    return inter_section[:, :, :, 0] * inter_section[:, :, :, 1]
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    The boxes should be in [x0, y0, x1, y1] format

    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2)
    """
    # degenerate boxes give inf / nan results, so do an early check
    assert L.reduce_all(boxes1[:, 2:] >= boxes1[:, :2])
    assert L.reduce_all(boxes2[:, 2:] >= boxes2[:, :2])
    iou, union = box_iou(boxes1, boxes2)

    N, M = boxes1.shape[0], boxes2.shape[0]
    boxes1 = L.unsqueeze(boxes1, axes=[1])   # [N, 1, 4]
    boxes1 = L.expand(boxes1, [1, M, 1])     # [N, M, 4]
    boxes2 = L.unsqueeze(boxes2, axes=[0])   # [1, M, 4]
    boxes2 = L.expand(boxes2, [N, 1, 1])     # [N, M, 4]

    lt = L.elementwise_min(boxes1[:, :, :2], boxes2[:, :, :2])  # [N, M, 2]
    rb = L.elementwise_max(boxes1[:, :, 2:], boxes2[:, :, 2:])  # [N, M, 2]

    wh = L.clip(rb - lt, min=0, max=1e8)  # [N, M, 2]
    area = wh[:, :, 0] * wh[:, :, 1] + 1e-4  # prevent division by zero
    return iou - (area - union) / area
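# As a quick numeric illustration of the GIoU penalty (made-up boxes, plain
# NumPy, not part of the model code): for disjoint boxes the IoU term is 0,
# but the enclosing-area term still provides a gradient signal and pushes
# GIoU below zero.
import numpy as np

def giou_pair(b1, b2):
    """GIoU for two [x0, y0, x1, y1] boxes."""
    iw = max(0.0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
    ih = max(0.0, min(b1[3], b2[3]) - max(b1[1], b2[1]))
    inter = iw * ih
    union = ((b1[2] - b1[0]) * (b1[3] - b1[1])
             + (b2[2] - b2[0]) * (b2[3] - b2[1]) - inter)
    iou = inter / union
    # smallest enclosing box
    cw = max(b1[2], b2[2]) - min(b1[0], b2[0])
    ch = max(b1[3], b2[3]) - min(b1[1], b2[1])
    area_c = cw * ch
    return iou - (area_c - union) / area_c

print(giou_pair(np.array([0., 0., 1., 1.]), np.array([0., 0., 1., 1.])))  # 1.0  (identical)
print(giou_pair(np.array([0., 0., 1., 1.]), np.array([3., 0., 4., 1.])))  # -0.5 (disjoint)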
def bbox_iou(boxes1, boxes2):
    '''
    Predicted boxes       boxes1 (?, grid_h, grid_w, 3, 1, 4),  the (bx, by, bw, bh) obtained by
                          post-processing the network output (tx, ty, tw, th)
    All gt in the image   boxes2 (?, 1, 1, 1, 150, 4)
    Paddle does not support the ellipsis, e.g. boxes1_area = boxes1[..., 2] * boxes1[..., 3];
    every colon has to be written out.
    '''
    boxes1_area = boxes1[:, :, :, :, :, 2] * boxes1[:, :, :, :, :, 3]  # areas of the 3 predicted boxes of every grid cell
    boxes2_area = boxes2[:, :, :, :, :, 2] * boxes2[:, :, :, :, :, 3]  # areas of all ground truths

    # (x, y, w, h) -> (x0, y0, x1, y1)
    boxes1 = P.concat([boxes1[:, :, :, :, :, :2] - boxes1[:, :, :, :, :, 2:] * 0.5,
                       boxes1[:, :, :, :, :, :2] + boxes1[:, :, :, :, :, 2:] * 0.5], axis=-1)
    boxes2 = P.concat([boxes2[:, :, :, :, :, :2] - boxes2[:, :, :, :, :, 2:] * 0.5,
                       boxes2[:, :, :, :, :, :2] + boxes2[:, :, :, :, :, 2:] * 0.5], axis=-1)

    # The 3 predicted boxes of every grid cell are each matched against the 150 ground
    # truths, so left_up and right_down have shape (?, grid_h, grid_w, 3, 150, 2).
    # Unlike pytorch and tf, boxes1 and boxes2 both have to be expanded to the same shape.
    expand_boxes1 = P.expand(boxes1, [1, 1, 1, 1, P.shape(boxes2)[4], 1])
    expand_boxes2 = P.expand(boxes2, [1, P.shape(boxes1)[1], P.shape(boxes1)[2],
                                      P.shape(boxes1)[3], 1, 1])
    left_up = P.elementwise_max(expand_boxes1[:, :, :, :, :, :2],
                                expand_boxes2[:, :, :, :, :, :2])     # top-left corner of the intersection
    right_down = P.elementwise_min(expand_boxes1[:, :, :, :, :, 2:],
                                   expand_boxes2[:, :, :, :, :, 2:])  # bottom-right corner of the intersection

    inter_section = P.relu(right_down - left_up)  # w and h of the intersection, clipped to 0 when negative; (?, grid_h, grid_w, 3, 150, 2)
    inter_area = inter_section[:, :, :, :, :, 0] * inter_section[:, :, :, :, :, 1]  # intersection area (?, grid_h, grid_w, 3, 150)
    expand_boxes1_area = P.expand(boxes1_area, [1, 1, 1, 1, P.shape(boxes2)[4]])
    expand_boxes2_area = P.expand(boxes2_area, [1, P.shape(expand_boxes1_area)[1],
                                                P.shape(expand_boxes1_area)[2],
                                                P.shape(expand_boxes1_area)[3], 1])
    union_area = expand_boxes1_area + expand_boxes2_area - inter_area  # union_area (?, grid_h, grid_w, 3, 150)
    iou = 1.0 * inter_area / union_area  # iou (?, grid_h, grid_w, 3, 150)

    return iou
def _dygraph_clip(self, params_grads):
    params_and_grads = []

    # clip by value first
    for p, g in params_grads:
        if g is None:
            continue
        if self._need_clip_func is not None and not self._need_clip_func(p):
            params_and_grads.append((p, g))
            continue
        new_grad = layers.clip(x=g, min=-self.clip_value, max=self.clip_value)
        params_and_grads.append((p, new_grad))
    params_grads = params_and_grads

    # clip by global norm
    params_and_grads = []
    sum_square_list = []
    for p, g in params_grads:
        if g is None:
            continue
        if self._need_clip_func is not None and not self._need_clip_func(p):
            continue
        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.merge_selected_rows(g)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)
        sum_square_list.append(sum_square)

    # all parameters have been filtered out
    if len(sum_square_list) == 0:
        return params_grads

    global_norm_var = layers.concat(sum_square_list)
    global_norm_var = layers.reduce_sum(global_norm_var)
    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=self.clip_norm)
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var,
                                          y=max_global_norm))
    for p, g in params_grads:
        if g is None:
            continue
        if self._need_clip_func is not None and not self._need_clip_func(p):
            params_and_grads.append((p, g))
            continue
        new_grad = layers.elementwise_mul(x=g, y=clip_var)
        params_and_grads.append((p, new_grad))

    return params_and_grads
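# The key line above is the `clip_var` ratio: every gradient is scaled by
# clip_norm / max(global_norm, clip_norm), which is a no-op whenever the
# global norm is already under the threshold. A small NumPy sketch of that
# scaling rule (the helper name global_norm_scale is made up for this sketch):
import numpy as np

def global_norm_scale(grads, clip_norm):
    """Return the factor every gradient is multiplied by."""
    global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    return clip_norm / max(global_norm, clip_norm)

grads = [np.full((2, 2), 3.0), np.full((4,), 4.0)]
# global norm = sqrt(4*9 + 4*16) = 10
print(global_norm_scale(grads, clip_norm=5.0))   # 0.5 -> norm shrunk to 5
print(global_norm_scale(grads, clip_norm=20.0))  # 1.0 -> left untouched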
def _iou_hw(box_a, box_b, eps=1e-9):
    """Compute the pairwise iou plus an aspect-ratio term between two sets of boxes
    Args:
        box_a: (tensor) bounding boxes, Shape: [A, 4].
        box_b: (tensor) bounding boxes, Shape: [B, 4].
    Return:
        (tensor) iou, Shape: [A, B].
    """
    A = box_a.shape[0]
    B = box_b.shape[0]

    box_a_rb = L.reshape(box_a[:, 2:], (A, 1, 2))
    box_a_rb = L.expand(box_a_rb, [1, B, 1])
    box_b_rb = L.reshape(box_b[:, 2:], (1, B, 2))
    box_b_rb = L.expand(box_b_rb, [A, 1, 1])
    max_xy = L.elementwise_min(box_a_rb, box_b_rb)

    box_a_lu = L.reshape(box_a[:, :2], (A, 1, 2))
    box_a_lu = L.expand(box_a_lu, [1, B, 1])
    box_b_lu = L.reshape(box_b[:, :2], (1, B, 2))
    box_b_lu = L.expand(box_b_lu, [A, 1, 1])
    min_xy = L.elementwise_max(box_a_lu, box_b_lu)

    inter = L.relu(max_xy - min_xy)
    inter = inter[:, :, 0] * inter[:, :, 1]

    box_a_w = box_a[:, 2] - box_a[:, 0]
    box_a_h = box_a[:, 3] - box_a[:, 1]
    area_a = box_a_h * box_a_w
    area_a = L.reshape(area_a, (A, 1))
    area_a = L.expand(area_a, [1, B])  # [A, B]

    box_b_w = box_b[:, 2] - box_b[:, 0]
    box_b_h = box_b[:, 3] - box_b[:, 1]
    area_b = box_b_h * box_b_w
    area_b = L.reshape(area_b, (1, B))
    area_b = L.expand(area_b, [A, 1])  # [A, B]

    union = area_a + area_b - inter
    iou = inter / union  # [A, B]  iou lies in [0, 1]; the larger it is, the more the box should be suppressed

    # aspect-ratio term
    atan1 = L.atan(box_a_h / (box_a_w + eps))
    atan2 = L.atan(box_b_h / (box_b_w + eps))
    atan1 = L.reshape(atan1, (A, 1))
    atan1 = L.expand(atan1, [1, B])  # [A, B]
    atan2 = L.reshape(atan2, (1, B))
    atan2 = L.expand(atan2, [A, 1])  # [A, B]
    v = 4.0 * L.pow(atan1 - atan2, 2) / (math.pi ** 2)  # [A, B]  v lies in [0, 1]; the smaller it is, the more the box should be suppressed

    factor = 0.4
    overlap = L.pow(iou, (1 - factor)) * L.pow(1.0 - v, factor)

    return overlap
def forward(self, *items):
    """Forward network"""
    if self.training and self.p > 0:
        masks = [
            layers.uniform_random(shape=x.shape[:2], min=0, max=1) >= self.p
            for x in items
        ]
        masks = [layers.cast(x, 'float32') for x in masks]
        # elementwise_add takes two operands, so this path assumes two inputs
        total = layers.elementwise_add(*masks)
        scale = len(items) / layers.elementwise_max(
            total, layers.ones_like(total))
        masks = [mask * scale for mask in masks]
        items = [
            item * layers.unsqueeze(mask, axes=[-1])
            for item, mask in zip(items, masks)
        ]
    return items
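# The `scale` factor keeps the expected sum of the streams constant: wherever
# at least one mask survives, the survivors are rescaled by len(items)/total.
# An illustrative NumPy sketch of the same rescaling, assuming two input
# streams as `elementwise_add(*masks)` does:
import numpy as np

p = 0.5
m1 = (np.random.rand(4) >= p).astype('float32')
m2 = (np.random.rand(4) >= p).astype('float32')

total = m1 + m2                       # 0, 1 or 2 survivors per position
scale = 2.0 / np.maximum(total, 1.0)  # 2x when only one survives, 1x when both do
m1, m2 = m1 * scale, m2 * scale

print(m1 + m2)  # each entry is 2.0 (some stream kept) or 0.0 (both dropped)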
def _iou(box_a, box_b):
    '''
    :param box_a:    [c, A, 4]
    :param box_b:    [c, B, 4]
    :return:   [c, A, B]  pairwise iou
    '''
    # Convert to top-left and bottom-right corners
    boxes1 = P.concat([box_a[:, :, :2] - box_a[:, :, 2:] * 0.5,
                       box_a[:, :, :2] + box_a[:, :, 2:] * 0.5], axis=-1)
    boxes2 = P.concat([box_b[:, :, :2] - box_b[:, :, 2:] * 0.5,
                       box_b[:, :, :2] + box_b[:, :, 2:] * 0.5], axis=-1)

    c = P.shape(boxes1)[0]
    A = P.shape(boxes1)[1]
    B = P.shape(boxes2)[1]
    box_a = P.reshape(boxes1, (c, A, 1, 4))
    box_b = P.reshape(boxes2, (c, 1, B, 4))
    expand_box_a = P.expand(box_a, [1, 1, B, 1])
    expand_box_b = P.expand(box_b, [1, A, 1, 1])

    # Areas of the two boxes
    boxes1_area = (expand_box_a[:, :, :, 2] - expand_box_a[:, :, :, 0]) * \
                  (expand_box_a[:, :, :, 3] - expand_box_a[:, :, :, 1])
    boxes2_area = (expand_box_b[:, :, :, 2] - expand_box_b[:, :, :, 0]) * \
                  (expand_box_b[:, :, :, 3] - expand_box_b[:, :, :, 1])

    # Top-left and bottom-right corners of the intersection
    left_up = P.elementwise_max(expand_box_a[:, :, :, :2],
                                expand_box_b[:, :, :, :2])
    right_down = P.elementwise_min(expand_box_a[:, :, :, 2:],
                                   expand_box_b[:, :, :, 2:])

    # Intersection area inter_area, then iou
    # inter_section = P.elementwise_max(right_down - left_up, 0.0)
    inter_section = P.relu(right_down - left_up)
    inter_area = inter_section[:, :, :, 0] * inter_section[:, :, :, 1]
    union_area = boxes1_area + boxes2_area - inter_area
    iou = inter_area / (union_area + 1e-9)

    return iou
def communicate_avg_loss():
    communicate()
    self._generate_avg_loss(main_block, loss, avg_loss)
    next_local_steps = layers.cast(layers.ceil(
        layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
                    float(init_k_steps))),
                                   dtype='int64')
    max_local_steps = layers.fill_constant(shape=[1],
                                           dtype='int64',
                                           value=16)
    min_local_steps = layers.fill_constant(shape=[1],
                                           dtype='int64',
                                           value=1)
    next_local_steps = layers.elementwise_min(next_local_steps,
                                              max_local_steps)
    next_local_steps = layers.elementwise_max(next_local_steps,
                                              min_local_steps)
    layers.assign(next_local_steps, k_steps)
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)  # [N]
    area2 = box_area(boxes2)  # [M]

    N, M = boxes1.shape[0], boxes2.shape[0]
    boxes1 = L.unsqueeze(boxes1, axes=[1])   # [N, 1, 4]
    boxes1 = L.expand(boxes1, [1, M, 1])     # [N, M, 4]
    boxes2 = L.unsqueeze(boxes2, axes=[0])   # [1, M, 4]
    boxes2 = L.expand(boxes2, [N, 1, 1])     # [N, M, 4]

    lt = L.elementwise_max(boxes1[:, :, :2], boxes2[:, :, :2])  # [N, M, 2]
    rb = L.elementwise_min(boxes1[:, :, 2:], boxes2[:, :, 2:])  # [N, M, 2]

    wh = L.clip(rb - lt, min=0, max=1e8)  # [N, M, 2]
    inter = wh[:, :, 0] * wh[:, :, 1]     # [N, M]

    area1 = L.expand(L.unsqueeze(area1, [1]), [1, M])  # [N, M]
    area2 = L.expand(L.unsqueeze(area2, [0]), [N, 1])  # [N, M]
    union = area1 + area2 - inter

    iou = inter / union
    return iou, union
def _iou(box_a, box_b):
    """Compute the pairwise iou between two sets of boxes
    Args:
        box_a: (tensor) bounding boxes, Shape: [A, 4].
        box_b: (tensor) bounding boxes, Shape: [B, 4].
    Return:
        (tensor) iou, Shape: [A, B].
    """
    A = box_a.shape[0]
    B = box_b.shape[0]

    box_a_rb = L.reshape(box_a[:, 2:], (A, 1, 2))
    box_a_rb = L.expand(box_a_rb, [1, B, 1])
    box_b_rb = L.reshape(box_b[:, 2:], (1, B, 2))
    box_b_rb = L.expand(box_b_rb, [A, 1, 1])
    max_xy = L.elementwise_min(box_a_rb, box_b_rb)

    box_a_lu = L.reshape(box_a[:, :2], (A, 1, 2))
    box_a_lu = L.expand(box_a_lu, [1, B, 1])
    box_b_lu = L.reshape(box_b[:, :2], (1, B, 2))
    box_b_lu = L.expand(box_b_lu, [A, 1, 1])
    min_xy = L.elementwise_max(box_a_lu, box_b_lu)

    inter = L.relu(max_xy - min_xy)
    inter = inter[:, :, 0] * inter[:, :, 1]

    box_a_w = box_a[:, 2] - box_a[:, 0]
    box_a_h = box_a[:, 3] - box_a[:, 1]
    area_a = box_a_h * box_a_w
    area_a = L.reshape(area_a, (A, 1))
    area_a = L.expand(area_a, [1, B])  # [A, B]

    box_b_w = box_b[:, 2] - box_b[:, 0]
    box_b_h = box_b[:, 3] - box_b[:, 1]
    area_b = box_b_h * box_b_w
    area_b = L.reshape(area_b, (1, B))
    area_b = L.expand(area_b, [A, 1])  # [A, B]

    union = area_a + area_b - inter
    return inter / union  # [A, B]
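# _iou (like intersect, the [c, A, 4] _iou, and box_iou above) materializes
# both operands at [A, B, 2] because the old fluid ops did not broadcast.
# In NumPy the same pairwise IoU falls out of broadcasting directly; a minimal
# reference version for corner-format boxes (illustrative only):
import numpy as np

def iou_pairwise(box_a, box_b):
    """Pairwise IoU of two [*, 4] corner-format box sets -> [A, B]."""
    lt = np.maximum(box_a[:, None, :2], box_b[None, :, :2])  # [A, B, 2]
    rb = np.minimum(box_a[:, None, 2:], box_b[None, :, 2:])  # [A, B, 2]
    wh = np.clip(rb - lt, 0.0, None)
    inter = wh[..., 0] * wh[..., 1]
    area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])
    area_b = (box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])
    return inter / (area_a[:, None] + area_b[None, :] - inter)

a = np.array([[0., 0., 2., 2.]])
b = np.array([[1., 1., 3., 3.], [4., 4., 5., 5.]])
print(iou_pairwise(a, b))  # [[0.14285714 0.        ]]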
def sanitize_coordinates(_x1, _x2, img_size, padding: int = 0, cast: bool = True):
    """
    Sanitizes the input coordinates so that x1 < x2, x1 != x2, x1 >= 0, and x2 <= image_size.
    Also converts from relative to absolute coordinates and casts the results to integer tensors.

    If cast is false, the result won't be cast to integers.

    Warning: this does things in-place behind the scenes so copy if necessary.
    """
    _x1 = _x1 * img_size
    _x2 = _x2 * img_size
    x1 = P.elementwise_min(_x1, _x2)
    x2 = P.elementwise_max(_x1, _x2)
    x1 = P.relu(x1 - padding)  # lower bound is 0
    img_size2 = P.expand(img_size, (P.shape(x2)[0], ))
    img_size2 = P.cast(img_size2, 'float32')
    x2 = img_size2 - P.relu(img_size2 - (x2 + padding))  # upper bound is img_size

    if cast:
        x1 = P.cast(x1, 'int32')
        x2 = P.cast(x2, 'int32')
    return x1, x2
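# The two relu calls above implement clamping without a dedicated clamp op:
# relu(x) floors at 0, and img_size - relu(img_size - x) ceilings at img_size.
# A quick NumPy check of both identities (illustrative only):
import numpy as np

relu = lambda v: np.maximum(v, 0.0)

x = np.array([-3.0, 5.0, 12.0])
img_size = 10.0

print(relu(x))                        # [ 0.  5. 12.] -> floored at 0
print(img_size - relu(img_size - x))  # [-3.  5. 10.] -> capped at img_size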
def _dygraph_clip(self, params_grads):
    sum_square_fp32, sum_square_fp16 = [], []
    unslice_params_fp32, unslice_params_fp16 = [], []

    for p, g in params_grads:
        p_slice = True  # marks sliced parameters in sharding stage3
        if g is None or getattr(p, 'need_clip', True) is False:
            continue
        if hasattr(p, "unslice"):
            p_slice = False

        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.get_tensor_from_selected_rows(
                layers.merge_selected_rows(g))
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)
        if p.dtype == paddle.float16:
            if p_slice:
                sum_square_fp16.append(sum_square)
            else:
                unslice_params_fp16.append(sum_square)
        elif p.dtype == paddle.float32:
            if p_slice:
                sum_square_fp32.append(sum_square)
            else:
                unslice_params_fp32.append(sum_square)

    # global norm of non-distributed FP16 params_and_grads
    if len(sum_square_fp16) == 0:
        global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
    else:
        global_norm_fp16 = layers.concat(sum_square_fp16)
        global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
        global_norm_fp16 = paddle.cast(global_norm_fp16,
                                       dtype=paddle.float32)

    # global norm of non-distributed FP16 params_and_grads for unslice parameters
    if len(unslice_params_fp16) == 0:
        global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
    else:
        global_unslice_fp16 = layers.concat(unslice_params_fp16)
        global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16)
        global_unslice_fp16 = paddle.cast(global_unslice_fp16,
                                          dtype=paddle.float32)

    # global norm of non-distributed FP32 params_and_grads
    global_norm_fp32 = layers.concat(sum_square_fp32) if len(
        sum_square_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
    global_norm_fp32 = layers.reduce_sum(global_norm_fp32)

    # global norm of non-distributed FP32 params_and_grads for unslice parameters
    global_unslice_fp32 = layers.concat(unslice_params_fp32) if len(
        unslice_params_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
    global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32)
    global_unslice_var = global_unslice_fp16 + global_unslice_fp32

    global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var

    # add all reduce to get global norm of distributed params_and_grads
    dev_id = int(self._device.split(":")[1])
    if paddle.device.get_device() == "cpu":
        global_norm_var = global_norm_var.cuda(dev_id)

    with device_guard(dev_id, "gpu"):
        paddle.distributed.all_reduce(global_norm_var, group=self._group)

    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype=global_norm_var.dtype,
                                           value=self.clip_norm)
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var,
                                          y=max_global_norm))
    clip_var_fp16 = paddle.cast(clip_var, paddle.float16)

    for p, g in params_grads:
        if getattr(p, 'need_clip', True) is False or g is None:
            continue
        origin_state = g.stop_gradient
        g.stop_gradient = True
        if p.dtype == paddle.float16:
            g.scale_(clip_var_fp16.item())
        else:
            g.scale_(clip_var.item())
        g.stop_gradient = origin_state
        # p._reset_grad_inplace_version(True)

    return params_grads
def _dygraph_clip(self, params_grads):
    normal_params_grads = []
    moe_params_grads = []

    # separate moe params from normal params
    if self.moe_group is not None and self.moe_group.nranks > 1:
        for p, g in params_grads:
            if self.is_expert_param_func(p):
                moe_params_grads.append((p, g))
            else:
                normal_params_grads.append((p, g))
    else:
        normal_params_grads = params_grads

    # why return sum_dtype?
    # we will call `get_l2_norm_pow` twice and the precisions may be different.
    # For convenience and simplification, we use sum_dtype directly instead of
    # global_norm_var_normal.dtype
    global_norm_var_normal, sum_dtype \
        = self.get_l2_norm_pow(normal_params_grads)
    global_norm_var_moe = None
    if len(moe_params_grads) > 0:
        global_norm_var_moe, _ \
            = self.get_l2_norm_pow(moe_params_grads, sum_dtype)
        if global_norm_var_moe is not None:
            collective.all_reduce(global_norm_var_moe,
                                  op=collective.ReduceOp.SUM,
                                  group=self.moe_group)

    if global_norm_var_normal is None and global_norm_var_moe is None:
        return params_grads
    elif global_norm_var_normal is None:
        global_norm_var = global_norm_var_moe
    elif global_norm_var_moe is None:
        global_norm_var = global_norm_var_normal
    else:
        if global_norm_var_normal.dtype != global_norm_var_moe.dtype:
            # compared with normal norm, moe norm is the later one,
            # so its precision is no lower than normal norm
            global_norm_var_normal = \
                global_norm_var_normal.astype(global_norm_var_moe.dtype)
        global_norm_var = global_norm_var_normal + global_norm_var_moe

    params_and_grads = []
    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype=global_norm_var.dtype,
                                           value=self.clip_norm)
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var,
                                          y=max_global_norm))
    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            params_and_grads.append((p, g))
            continue
        # TODO(wangxi): use inplace elementwise_mul
        clip_input = (clip_var.astype('float16')
                      if g.dtype == core.VarDesc.VarType.FP16 else clip_var)
        new_grad = layers.elementwise_mul(x=g, y=clip_input)
        params_and_grads.append((p, new_grad))

    return params_and_grads
def __call__(self, kernel_preds, cls_preds, mask_protos,
             batch_gt_objs_tensors, batch_gt_clss_tensors,
             batch_gt_masks_tensors, batch_gt_pos_idx_tensors):
    '''
    :param kernel_preds:  each element has shape [N, 256, seg_num_grid, seg_num_grid],
                          the predicted convolution kernel of every grid cell. Ordered from small to large receptive field.
    :param cls_preds:     each element has shape [N, 80, seg_num_grid, seg_num_grid],
                          the predicted class scores of every grid cell, before sigmoid(). Ordered from small to large receptive field.
    :param mask_protos:   [bs, 256, s4, s4]   mask prototypes
    :param batch_gt_objs_tensors:     each element has shape [N, seg_num_grid, seg_num_grid, 1],
                                      the objectness of every grid cell. Ordered from small to large receptive field.
    :param batch_gt_clss_tensors:     each element has shape [N, seg_num_grid, seg_num_grid, 80],
                                      the one-hot ground-truth class of every grid cell. Ordered from small to large receptive field.
    :param batch_gt_masks_tensors:    each element has shape [N, -1, s4, s4], the ground-truth masks.
                                      Ordered from small to large receptive field.
    :param batch_gt_pos_idx_tensors:  each element has shape [N, -1, 3], the indices of positive samples.
                                      Ordered from small to large receptive field.
    :return:
    '''
    batch_size = self.batch_size
    num_layers = len(kernel_preds)

    # ================= compute the losses =================
    num_ins = 0.  # number of positive samples in this batch of images
    loss_clss, loss_masks = [], []
    for bid in range(batch_size):
        for lid in range(num_layers):
            # ================ mask loss ======================
            mask_proto = mask_protos[bid]         # [256, s4, s4]  mask prototypes produced for this image
            kernel_pred = kernel_preds[lid][bid]  # [256, seg_num_grid, seg_num_grid]  kernels predicted by the grid cells (the "mask coefficients" of yolact)
            kernel_pred = L.transpose(kernel_pred, perm=[1, 2, 0])  # [seg_num_grid, seg_num_grid, 256]

            gt_objs = batch_gt_objs_tensors[lid][bid]    # [seg_num_grid, seg_num_grid, 1]
            gt_masks = batch_gt_masks_tensors[lid][bid]  # [-1, s4, s4]
            pmidx = batch_gt_pos_idx_tensors[lid][bid]   # [-1, 3]
            gt_objs.stop_gradient = True
            gt_masks.stop_gradient = True
            pmidx.stop_gradient = True

            idx_sum = L.reduce_sum(pmidx, dim=1)
            keep = L.where(idx_sum > -1)
            keep = L.reshape(keep, (-1, ))
            keep.stop_gradient = True

            pmidx = L.gather(pmidx, keep)  # [M, 3]
            yx_idx = pmidx[:, :2]  # [M, 2]
            m_idx = pmidx[:, 2]    # [M, ]
            yx_idx.stop_gradient = True
            m_idx.stop_gradient = True

            # gather the positives
            gt_obj = L.gather_nd(gt_objs, yx_idx)       # [M, 1]    whether this is a real positive sample
            pos_krn = L.gather_nd(kernel_pred, yx_idx)  # [M, 256]  kernels (mask coefficients) of the positives
            gt_mask = L.gather(gt_masks, m_idx)         # [M, s4, s4]  ground-truth masks

            # number of positives
            num_ins += L.reduce_sum(gt_obj)

            # build the predicted masks
            mask_proto = L.transpose(mask_proto, perm=[1, 2, 0])      # [s4, s4, 256]
            masks = L.matmul(mask_proto, pos_krn, transpose_y=True)  # [s4, s4, M]
            masks = L.sigmoid(masks)                   # [s4, s4, M]
            masks = L.transpose(masks, perm=[2, 0, 1])  # [M, s4, s4]
            loss_mask = self.dice_loss(masks, gt_mask, gt_obj)
            loss_masks.append(loss_mask)

            # ================ classification loss: sigmoid_focal_loss() ======================
            gamma = self.loss_gamma
            alpha = self.loss_alpha
            pred_conf = cls_preds[lid][bid]  # [80, seg_num_grid, seg_num_grid]  before sigmoid()
            pred_conf = L.transpose(pred_conf, perm=[1, 2, 0])  # [seg_num_grid, seg_num_grid, 80]  before sigmoid()
            pred_conf = L.sigmoid(pred_conf)  # [seg_num_grid, seg_num_grid, 80]  after sigmoid()
            gt_clss = batch_gt_clss_tensors[lid][bid]  # [seg_num_grid, seg_num_grid, 80]  one-hot ground-truth classes
            gt_clss.stop_gradient = True
            pos_loss = gt_clss * (0 - L.log(pred_conf + 1e-9)) * L.pow(
                1 - pred_conf, gamma) * alpha
            neg_loss = (1.0 - gt_clss) * (0 - L.log(1 - pred_conf + 1e-9)) * L.pow(
                pred_conf, gamma) * (1 - alpha)
            focal_loss = pos_loss + neg_loss
            focal_loss = L.reduce_sum(focal_loss, dim=[0, 1])
            loss_clss.append(focal_loss)

    loss_masks = L.concat(loss_masks, axis=0)
    loss_masks = L.reduce_sum(loss_masks) * self.ins_loss_weight
    loss_masks = loss_masks / L.elementwise_max(
        L.ones((1, ), dtype='float32'), num_ins)

    loss_clss = L.concat(loss_clss, axis=0)
    loss_clss = L.reduce_sum(loss_clss) * self.clss_loss_weight
    loss_clss = loss_clss / L.elementwise_max(
        L.ones((1, ), dtype='float32'), num_ins)

    loss_all = {"loss_masks": loss_masks, "loss_clss": loss_clss}
    return loss_all
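# The classification branch above is a hand-rolled sigmoid focal loss. A
# sanity sketch of the same pos_loss/neg_loss formula in plain NumPy, with
# made-up probabilities (illustrative only):
import numpy as np

def sigmoid_focal_loss(p, y, gamma=2.0, alpha=0.25):
    """Elementwise focal loss on already-sigmoided scores p and one-hot targets y."""
    pos = y * -np.log(p + 1e-9) * (1 - p) ** gamma * alpha
    neg = (1 - y) * -np.log(1 - p + 1e-9) * p ** gamma * (1 - alpha)
    return pos + neg

p = np.array([0.9, 0.6, 0.1])  # predicted probabilities for one class
y = np.array([1.0, 1.0, 0.0])  # one-hot targets
# the well-classified positive (0.9) is down-weighted far below the hard one (0.6)
print(sigmoid_focal_loss(p, y))  # ~[0.00026 0.0204  0.00079]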
def decrement(self):
    new_scale = self.scale / self.factor
    one = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
    layers.assign(layers.elementwise_max(new_scale, one), self.scale)
    layers.assign(layers.zeros_like(self.good_steps), self.good_steps)
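# This is the shrink half of dynamic loss scaling: on overflow the scale is
# divided by `factor` but floored at 1.0, and the good-step counter restarts.
# A tiny plain-Python sketch of the update rule, assuming factor == 2:
scale, good_steps, factor = 8.0, 5, 2.0

scale = max(scale / factor, 1.0)  # shrink, but never below 1
good_steps = 0                    # reset the streak of overflow-free steps
print(scale, good_steps)  # 4.0 0

# repeated overflows can never push the scale below 1
for _ in range(10):
    scale = max(scale / factor, 1.0)
print(scale)  # 1.0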
def bbox_ciou(boxes1, boxes2):
    '''
    Compute CIoU = iou - p2/c2 - av
    :param boxes1: (8, 13, 13, 3, 4)   pred_xywh
    :param boxes2: (8, 13, 13, 3, 4)   label_xywh
    :return:
    '''
    # Convert to top-left and bottom-right corners
    boxes1_x0y0x1y1 = P.concat([boxes1[:, :, :, :, :2] - boxes1[:, :, :, :, 2:] * 0.5,
                                boxes1[:, :, :, :, :2] + boxes1[:, :, :, :, 2:] * 0.5], axis=-1)
    boxes2_x0y0x1y1 = P.concat([boxes2[:, :, :, :, :2] - boxes2[:, :, :, :, 2:] * 0.5,
                                boxes2[:, :, :, :, :2] + boxes2[:, :, :, :, 2:] * 0.5], axis=-1)
    '''
    Compare boxes1_x0y0x1y1[..., :2] and boxes1_x0y0x1y1[..., 2:] position by position,
    i.e. compare [x0, y0] with [x1, y1] and keep the smaller, say [x0, y0].
    This guards against w or h being negative early in training, which would turn
    x0y0 into the bottom-right corner and x1y1 into the top-left corner.
    '''
    boxes1_x0y0x1y1 = P.concat([P.elementwise_min(boxes1_x0y0x1y1[:, :, :, :, :2],
                                                  boxes1_x0y0x1y1[:, :, :, :, 2:]),
                                P.elementwise_max(boxes1_x0y0x1y1[:, :, :, :, :2],
                                                  boxes1_x0y0x1y1[:, :, :, :, 2:])], axis=-1)
    boxes2_x0y0x1y1 = P.concat([P.elementwise_min(boxes2_x0y0x1y1[:, :, :, :, :2],
                                                  boxes2_x0y0x1y1[:, :, :, :, 2:]),
                                P.elementwise_max(boxes2_x0y0x1y1[:, :, :, :, :2],
                                                  boxes2_x0y0x1y1[:, :, :, :, 2:])], axis=-1)

    # Areas of the two boxes
    boxes1_area = (boxes1_x0y0x1y1[:, :, :, :, 2] - boxes1_x0y0x1y1[:, :, :, :, 0]) * (
        boxes1_x0y0x1y1[:, :, :, :, 3] - boxes1_x0y0x1y1[:, :, :, :, 1])
    boxes2_area = (boxes2_x0y0x1y1[:, :, :, :, 2] - boxes2_x0y0x1y1[:, :, :, :, 0]) * (
        boxes2_x0y0x1y1[:, :, :, :, 3] - boxes2_x0y0x1y1[:, :, :, :, 1])

    # Top-left and bottom-right corners of the intersection; both have shape (8, 13, 13, 3, 2)
    left_up = P.elementwise_max(boxes1_x0y0x1y1[:, :, :, :, :2],
                                boxes2_x0y0x1y1[:, :, :, :, :2])
    right_down = P.elementwise_min(boxes1_x0y0x1y1[:, :, :, :, 2:],
                                   boxes2_x0y0x1y1[:, :, :, :, 2:])

    # Intersection area inter_area, then iou
    inter_section = P.relu(right_down - left_up)
    inter_area = inter_section[:, :, :, :, 0] * inter_section[:, :, :, :, 1]
    union_area = boxes1_area + boxes2_area - inter_area
    iou = inter_area / (union_area + 1e-9)

    # Top-left and bottom-right corners of the enclosing box; both have shape (8, 13, 13, 3, 2)
    enclose_left_up = P.elementwise_min(boxes1_x0y0x1y1[:, :, :, :, :2],
                                        boxes2_x0y0x1y1[:, :, :, :, :2])
    enclose_right_down = P.elementwise_max(boxes1_x0y0x1y1[:, :, :, :, 2:],
                                           boxes2_x0y0x1y1[:, :, :, :, 2:])

    # Squared diagonal of the enclosing box
    enclose_wh = enclose_right_down - enclose_left_up
    enclose_c2 = P.pow(enclose_wh[:, :, :, :, 0], 2) + P.pow(
        enclose_wh[:, :, :, :, 1], 2)

    # Squared distance between the two box centers
    p2 = P.pow(boxes1[:, :, :, :, 0] - boxes2[:, :, :, :, 0], 2) + P.pow(
        boxes1[:, :, :, :, 1] - boxes2[:, :, :, :, 1], 2)

    # Add the av term.
    atan1 = P.atan(boxes1[:, :, :, :, 2] / (boxes1[:, :, :, :, 3] + 1e-9))
    atan2 = P.atan(boxes2[:, :, :, :, 2] / (boxes2[:, :, :, :, 3] + 1e-9))
    v = 4.0 * P.pow(atan1 - atan2, 2) / (math.pi ** 2)
    a = v / (1 - iou + v)

    ciou = iou - 1.0 * p2 / enclose_c2 - 1.0 * a * v
    return ciou
def __iou_loss(self, pred, targets, positive_mask, weights=None):
    """
    Calculate the loss for location prediction
    Args:
        pred (Variables): bounding boxes prediction
        targets (Variables): targets for positive samples
        positive_mask (Variables): mask of positive samples
        weights (Variables): weights for each positive samples
    Return:
        loss (Variables): location loss
    """
    positive_mask = fluid.layers.reshape(positive_mask, (-1, ))  # [batch_size*num_cells, ]
    plw = pred[:, 0] * positive_mask  # [batch_size*num_cells, ], predicted l
    pth = pred[:, 1] * positive_mask  # [batch_size*num_cells, ], predicted t
    prw = pred[:, 2] * positive_mask  # [batch_size*num_cells, ], predicted r
    pbh = pred[:, 3] * positive_mask  # [batch_size*num_cells, ], predicted b

    tlw = targets[:, 0] * positive_mask  # [batch_size*num_cells, ], ground-truth l
    tth = targets[:, 1] * positive_mask  # [batch_size*num_cells, ], ground-truth t
    trw = targets[:, 2] * positive_mask  # [batch_size*num_cells, ], ground-truth r
    tbh = targets[:, 3] * positive_mask  # [batch_size*num_cells, ], ground-truth b
    tlw.stop_gradient = True
    trw.stop_gradient = True
    tth.stop_gradient = True
    tbh.stop_gradient = True

    area_target = (tlw + trw) * (tth + tbh)    # [batch_size*num_cells, ], ground-truth area
    area_predict = (plw + prw) * (pth + pbh)   # [batch_size*num_cells, ], predicted area

    ilw = fluid.layers.elementwise_min(plw, tlw)  # [batch_size*num_cells, ], l of the intersection
    irw = fluid.layers.elementwise_min(prw, trw)  # [batch_size*num_cells, ], r of the intersection
    ith = fluid.layers.elementwise_min(pth, tth)  # [batch_size*num_cells, ], t of the intersection
    ibh = fluid.layers.elementwise_min(pbh, tbh)  # [batch_size*num_cells, ], b of the intersection

    clw = fluid.layers.elementwise_max(plw, tlw)  # [batch_size*num_cells, ], l of the enclosing box
    crw = fluid.layers.elementwise_max(prw, trw)  # [batch_size*num_cells, ], r of the enclosing box
    cth = fluid.layers.elementwise_max(pth, tth)  # [batch_size*num_cells, ], t of the enclosing box
    cbh = fluid.layers.elementwise_max(pbh, tbh)  # [batch_size*num_cells, ], b of the enclosing box

    area_inter = (ilw + irw) * (ith + ibh)  # [batch_size*num_cells, ], intersection area
    ious = (area_inter + 1.0) / (area_predict + area_target - area_inter + 1.0)
    ious = ious * positive_mask

    if self.iou_loss_type.lower() == "linear_iou":
        loss = 1.0 - ious
    elif self.iou_loss_type.lower() == "giou":
        area_uniou = area_predict + area_target - area_inter
        area_circum = (clw + crw) * (cth + cbh) + 1e-7
        giou = ious - (area_circum - area_uniou) / area_circum
        loss = 1.0 - giou
    elif self.iou_loss_type.lower() == "iou":
        loss = 0.0 - fluid.layers.log(ious)
    elif self.iou_loss_type.lower() == "ciou":
        # Predicted box in cx_cy_w_h format, with the grid-cell center as origin.
        pred_cx = (prw - plw) * 0.5
        pred_cy = (pbh - pth) * 0.5
        pred_w = (plw + prw)
        pred_h = (pth + pbh)
        pred_cx = L.reshape(pred_cx, (-1, 1))
        pred_cy = L.reshape(pred_cy, (-1, 1))
        pred_w = L.reshape(pred_w, (-1, 1))
        pred_h = L.reshape(pred_h, (-1, 1))
        pred_cx_cy_w_h = L.concat([pred_cx, pred_cy, pred_w, pred_h], -1)  # [batch_size*num_cells, 4]

        # Ground-truth box in cx_cy_w_h format, with the grid-cell center as origin.
        true_cx = (trw - tlw) * 0.5
        true_cy = (tbh - tth) * 0.5
        true_w = (tlw + trw)
        true_h = (tth + tbh)
        true_cx = L.reshape(true_cx, (-1, 1))
        true_cy = L.reshape(true_cy, (-1, 1))
        true_w = L.reshape(true_w, (-1, 1))
        true_h = L.reshape(true_h, (-1, 1))
        true_cx_cy_w_h = L.concat([true_cx, true_cy, true_w, true_h], -1)  # [batch_size*num_cells, 4]

        # Predicted box in x0y0x1y1 format, with the grid-cell center as origin.
        boxes1_x0y0x1y1 = L.concat([pred_cx_cy_w_h[:, :2] - pred_cx_cy_w_h[:, 2:] * 0.5,
                                    pred_cx_cy_w_h[:, :2] + pred_cx_cy_w_h[:, 2:] * 0.5], axis=-1)
        # Ground-truth box in x0y0x1y1 format, with the grid-cell center as origin.
        boxes2_x0y0x1y1 = L.concat([true_cx_cy_w_h[:, :2] - true_cx_cy_w_h[:, 2:] * 0.5,
                                    true_cx_cy_w_h[:, :2] + true_cx_cy_w_h[:, 2:] * 0.5], axis=-1)

        # Top-left and bottom-right corners of the enclosing box; both have shape (batch_size*num_cells, 2)
        enclose_left_up = L.elementwise_min(boxes1_x0y0x1y1[:, :2],
                                            boxes2_x0y0x1y1[:, :2])
        enclose_right_down = L.elementwise_max(boxes1_x0y0x1y1[:, 2:],
                                               boxes2_x0y0x1y1[:, 2:])

        # Squared diagonal of the enclosing box
        enclose_wh = enclose_right_down - enclose_left_up
        enclose_c2 = L.pow(enclose_wh[:, 0], 2) + L.pow(enclose_wh[:, 1], 2)

        # Squared distance between the two box centers
        p2 = L.pow(pred_cx_cy_w_h[:, 0] - true_cx_cy_w_h[:, 0], 2) \
           + L.pow(pred_cx_cy_w_h[:, 1] - true_cx_cy_w_h[:, 1], 2)

        # Add the av term, with a division-by-zero guard against nan.
        atan1 = L.atan(pred_cx_cy_w_h[:, 2] / (pred_cx_cy_w_h[:, 3] + 1e-9))
        atan2 = L.atan(true_cx_cy_w_h[:, 2] / (true_cx_cy_w_h[:, 3] + 1e-9))
        v = 4.0 * L.pow(atan1 - atan2, 2) / (math.pi ** 2)
        a = v / (1 - ious + v)
        ciou = ious - 1.0 * p2 / (enclose_c2 + 1e-9) - 1.0 * a * v
        loss = 1.0 - ciou
    else:
        raise KeyError

    loss = fluid.layers.reshape(loss, (-1, 1))  # [batch_size*num_cells, 1]
    if weights is not None:
        loss = loss * weights
    return loss
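# In the FCOS-style (l, t, r, b) encoding above, a box is described by the
# distances from a cell center to its four sides, so the per-axis intersection
# is min(pl, tl) + min(pr, tr) rather than a corner subtraction. A quick NumPy
# check against corner-format IoU, with made-up distances (illustrative only):
import numpy as np

# (l, t, r, b) distances from the same anchor point: one prediction, one target
pl, pt, pr, pb = 1.0, 1.0, 2.0, 2.0
tl, tt, tr, tb = 1.5, 0.5, 1.5, 2.5

area_p = (pl + pr) * (pt + pb)
area_t = (tl + tr) * (tt + tb)
inter = (min(pl, tl) + min(pr, tr)) * (min(pt, tt) + min(pb, tb))
iou_ltrb = inter / (area_p + area_t - inter)

# the same boxes in corner format around an anchor at the origin
b1 = np.array([-pl, -pt, pr, pb])
b2 = np.array([-tl, -tt, tr, tb])
lt = np.maximum(b1[:2], b2[:2])
rb = np.minimum(b1[2:], b2[2:])
wh = np.clip(rb - lt, 0.0, None)
iou_corner = wh.prod() / (area_p + area_t - wh.prod())

print(np.isclose(iou_ltrb, iou_corner))  # True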
def _dygraph_clip(self, params_grads):
    params_and_grads = []

    sum_square_dist_fp16 = []
    sum_square_dist_fp32 = []
    sum_square_not_dist_fp16 = []
    sum_square_not_dist_fp32 = []

    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            continue
        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.merge_selected_rows(g)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)

        not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
            hasattr(p, 'is_firstly_shared')
            and getattr(p, 'is_firstly_shared', True))

        if not_shared_enable:
            if p.is_distributed:
                if p.dtype == paddle.float16:
                    sum_square_dist_fp16.append(sum_square)
                elif p.dtype == paddle.float32:
                    sum_square_dist_fp32.append(sum_square)
            else:
                if p.dtype == paddle.float16:
                    sum_square_not_dist_fp16.append(sum_square)
                elif p.dtype == paddle.float32:
                    sum_square_not_dist_fp32.append(sum_square)

    # global norm of distributed FP16 params_and_grads
    if len(sum_square_dist_fp16) == 0:
        global_norm_dist_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
    else:
        global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16)
        global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16)
        global_norm_dist_fp16 = paddle.cast(global_norm_dist_fp16,
                                            dtype=paddle.float32)

    # global norm of non-distributed FP16 params_and_grads
    if len(sum_square_not_dist_fp16) == 0:
        global_norm_not_dist_fp16 = paddle.to_tensor([0.],
                                                     dtype=paddle.float32)
    else:
        global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16)
        global_norm_not_dist_fp16 = layers.reduce_sum(
            global_norm_not_dist_fp16)
        global_norm_not_dist_fp16 = paddle.cast(global_norm_not_dist_fp16,
                                                dtype=paddle.float32)

    # global norm of distributed FP32 params_and_grads
    global_norm_dist_fp32 = layers.concat(sum_square_dist_fp32) if len(
        sum_square_dist_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
    global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32)

    # global norm of non-distributed FP32 params_and_grads
    global_norm_not_dist_fp32 = layers.concat(
        sum_square_not_dist_fp32
    ) if len(sum_square_not_dist_fp32) != 0 else paddle.to_tensor(
        [0.], dtype=paddle.float32)
    global_norm_not_dist_fp32 = layers.reduce_sum(global_norm_not_dist_fp32)

    global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32
    global_norm_var_not_dist = global_norm_not_dist_fp16 + global_norm_not_dist_fp32

    # add all reduce to get global norm of distributed params_and_grads
    if self._hcg.get_model_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_dist,
            group=self._hcg.get_check_parallel_group())

    # add all reduce to get global norm of non-distributed params_and_grads in groups of pp
    if self._hcg.get_pipe_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_not_dist,
            group=self._hcg.get_pipe_parallel_group())

    # In Sharding mode, param and grad map to different ranks in the optimizer.
    # ClipGradByGlobalNorm needs an allreduce to get the global norm.
    if self._hcg.get_sharding_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_not_dist,
            group=self._hcg.get_sharding_parallel_group())

    global_norm_var_fp32 = layers.sqrt(global_norm_var_dist +
                                       global_norm_var_not_dist)

    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype=global_norm_var_fp32.dtype,
                                           value=self.clip_norm)
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var_fp32,
                                          y=max_global_norm))
    clip_var_fp16 = paddle.cast(clip_var, paddle.float16)

    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            params_and_grads.append((p, g))
            continue
        if p.dtype == paddle.float16:
            new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
        else:
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
        params_and_grads.append((p, new_grad))

    return params_and_grads