def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False):
    """Total YOLO training loss: CIoU box loss + objectness loss + class loss.

    args is [*model_outputs, *ground_truth]: the first num_layers entries are
    the raw network feature maps (e.g. (m,13,13,255), (m,26,26,255), (m,52,52,255)),
    the remaining entries the matching targets (e.g. (m,13,13,3,85), ...).
    anchors holds 3 anchor (w,h) pairs per output layer; ignore_thresh is the
    IoU above which an unassigned prediction is dropped from the negative
    objectness loss; label_smoothing smooths the class targets when non-zero.
    Returns a scalar tensor: the batch-averaged loss summed over all layers.
    """
    num_layers = len(anchors) // 3

    # Split predictions from targets.
    ground_truth = args[num_layers:]
    feature_maps = args[:num_layers]

    # Which anchor indices belong to which output layer (coarsest layer first).
    layer_anchor_ids = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                        ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # Network input size in pixels, recovered from the stride-32 feature map.
    input_shape = K.cast(K.shape(feature_maps[0])[1:3] * 32, K.dtype(ground_truth[0]))

    batch_size = K.shape(feature_maps[0])[0]
    batch_size_f = K.cast(batch_size, K.dtype(feature_maps[0]))

    loss = 0
    for layer in range(num_layers):
        truth = ground_truth[layer]
        # Objectness target: 1 where a ground-truth box was assigned. (m,h,w,3,1)
        object_mask = truth[..., 4:5]
        # One-hot class targets, optionally smoothed. (m,h,w,3,num_classes)
        class_targets = truth[..., 5:]
        if label_smoothing:
            class_targets = _smooth_labels(class_targets, label_smoothing)

        # Decode this feature map: grid coords, raw logits, decoded xy and wh.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            feature_maps[layer], anchors[layer_anchor_ids[layer]],
            num_classes, input_shape, calc_loss=True)
        # Decoded predicted boxes. (m,h,w,3,4)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Build, one image at a time, a mask that drops well-matched but
        # unassigned predictions from the negative objectness loss.
        ignore_mask = tf.TensorArray(K.dtype(ground_truth[0]), size=1, dynamic_size=True)
        has_object = K.cast(object_mask, 'bool')

        def per_image(image_idx, ignore_mask):
            # All real boxes in this image: (n,4)
            real_boxes = tf.boolean_mask(truth[image_idx, ..., 0:4],
                                         has_object[image_idx, ..., 0])
            # IoU of every predicted box against every real box: (h,w,3,n)
            iou = box_iou(pred_box[image_idx], real_boxes)
            # Best match per predicted box: (h,w,3)
            best_iou = K.max(iou, axis=-1)
            # Keep as negatives only predictions far from every real box.
            ignore_mask = ignore_mask.write(
                image_idx, K.cast(best_iou < ignore_thresh, K.dtype(real_boxes)))
            return image_idx + 1, ignore_mask

        _, ignore_mask = tf.while_loop(
            lambda image_idx, *unused: image_idx < batch_size, per_image, [0, ignore_mask])
        # (m,h,w,3,1)
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        # Small boxes get a larger weight: 2 - w*h with w,h normalized to [0,1].
        box_weight = 2 - truth[..., 2:3] * truth[..., 3:4]

        # Location loss: CIoU between decoded predictions and target boxes.
        ciou = box_ciou(pred_box, truth[..., 0:4])
        location_loss = K.sum(object_mask * box_weight * (1 - ciou)) / batch_size_f

        # Objectness loss: all positives, plus the negatives ignore_mask keeps.
        objectness_bce = K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
                                               from_logits=True)
        confidence_loss = K.sum(object_mask * objectness_bce
                                + (1 - object_mask) * objectness_bce * ignore_mask) / batch_size_f

        # Class loss, only where an object exists.
        class_loss = K.sum(object_mask * K.binary_crossentropy(
            class_targets, raw_pred[..., 5:], from_logits=True)) / batch_size_f

        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        #     loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')
    return loss
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False):
    """Compute the YOLO training loss (CIoU location + objectness + class loss).

    args: [*model_outputs, *y_true] where, per layer i,
        model_outputs[i] = (None, h/stride, w/stride, num_anchors*(5+num_classes))
        y_true[i]        = (None, h/stride, w/stride, num_anchors, 5+num_classes)
    anchors: (N, 2) array of anchor (w, h) pairs, 3 per output layer.
    num_classes: number of object classes.
    ignore_thresh: IoU above which an unassigned prediction is excluded from
        the negative objectness loss.
    label_smoothing: class-target smoothing factor (0 disables).
    print_loss: unused; the debug print is commented out.

    Returns a scalar tensor: batch-averaged loss summed over all layers.
    """
    # num_anchors per layer = 3
    num_layers = len(anchors) // 3

    # yolo_outputs = [shape = (None, h//32, w//32, num_anchors*(5+num_classes)),
    #                 shape = (None, h//16, w//16, num_anchors*(5+num_classes)),
    #                 shape = (None, h//8,  w//8,  num_anchors*(5+num_classes))]
    yolo_outputs = args[:num_layers]
    # y_true = [shape = (None, h//32, w//32, num_anchors, 5+num_classes),
    #           shape = (None, h//16, w//16, num_anchors, 5+num_classes),
    #           shape = (None, h//8,  w//8,  num_anchors, 5+num_classes)]
    y_true = args[num_layers:]

    # Anchor indices used by each output layer (coarsest layer first).
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # input_shape = (h, w) in pixels, recovered from the stride-32 layer.
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0
    bs = K.shape(yolo_outputs[0])[0]
    batch_size = K.cast(bs, K.dtype(yolo_outputs[0]))

    for i in range(num_layers):
        # Where a ground-truth box was assigned: (bs,h,w,3,1).
        object_mask = y_true[i][..., 4:5]
        # One-hot class targets: (bs,h,w,3,num_classes).
        true_class_probabilities = y_true[i][..., 5:]
        if label_smoothing:
            true_class_probabilities = _smooth_labels(true_class_probabilities, label_smoothing)

        # Decode the raw layer output:
        #   grid (h,w,1,2) cell coordinates, raw_pred (bs,h,w,3,5+C) raw logits,
        #   pred_xy / pred_wh (bs,h,w,3,2) decoded centers and sizes.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[i],
                                                     anchors[anchor_mask[i]], num_classes, input_shape, calc_loss=True)

        # Decoded predicted boxes: (bs,h,w,3,4).
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Per-image mask excluding well-matched unassigned predictions from
        # the negative objectness loss.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # All real boxes in image b: (n,4).
            true_box = tf.boolean_mask(y_true[i][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # IoU of each predicted box against every real box: (h,w,3,n).
            iou = box_iou(pred_box[b], true_box)
            # Best overlap per predicted box: (h,w,3).
            best_iou = K.max(iou, axis=-1)
            # Predictions overlapping a real box above ignore_thresh are ignored.
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # FIX: K.control_flow_ops is not a public Keras backend API and was
        # removed in later Keras versions; tf.while_loop is the supported call.
        _, ignore_mask = tf.while_loop(lambda b, *args: b < bs, loop_body, [0, ignore_mask])

        # Stack per-image results: (bs,h,w,3) -> (bs,h,w,3,1).
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # Weight small boxes more: 2 - w*h (w, h normalized to [0,1]).
        box_loss_scale = 2 - y_true[i][..., 2:3] * y_true[i][..., 3:4]

        # Calculate ciou loss as location loss.
        raw_true_box = y_true[i][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)
        ciou_loss = K.sum(ciou_loss) / batch_size
        location_loss = ciou_loss

        # Objectness: cross-entropy against 1 where a box exists, against 0
        # elsewhere, with ignore_mask limiting the negative samples.
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
            (1 - object_mask) * K.binary_crossentropy(object_mask,
                                                      raw_pred[..., 4:5], from_logits=True) * ignore_mask

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probabilities, raw_pred[..., 5:], from_logits=True)

        confidence_loss = K.sum(confidence_loss) / batch_size
        class_loss = K.sum(class_loss) / batch_size
        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        #     loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')
    return loss
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False, normalize=True):
    """Compute the YOLO training loss (CIoU location + objectness + class loss).

    args: [*model_outputs, *y_true]; per layer l, y_true[l] has shape
        (m, h, w, 3, 5+num_classes) and model_outputs[l] the matching raw map.
    anchors: (N, 2) anchor (w, h) pairs, 3 per output layer.
    num_classes: number of object classes.
    ignore_thresh: IoU above which an unassigned prediction is excluded from
        the negative objectness loss.
    label_smoothing: class-target smoothing factor (0 disables).
    print_loss: unused; the debug print is commented out.
    normalize: if True divide the total loss by the positive-sample count,
        otherwise by the batch size.

    Returns a scalar loss tensor. Per-layer NaN terms are zeroed before summing.
    """
    num_layers = len(anchors) // 3
    #---------------------------------------------------------------------------#
    # Split predictions from ground truth; args is [*model_body.output, *y_true].
    # y_true:       e.g. (m,13,13,3,85), (m,26,26,3,85)
    # yolo_outputs: the matching raw feature maps
    #---------------------------------------------------------------------------#
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    #-----------------------------------------------------------#
    # Anchor indices used by each output layer
    # (coarsest layer takes the largest anchors).
    #-----------------------------------------------------------#
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # Network input size (h, w) in pixels, from the stride-32 layer.
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0
    num_pos = 0
    #-----------------------------------------------------------#
    # m is the batch size (tensor); mf is its float cast.
    #-----------------------------------------------------------#
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for l in range(num_layers):
        #-----------------------------------------------------------#
        # Where a ground-truth box was assigned: (m,h,w,3,1).
        #-----------------------------------------------------------#
        object_mask = y_true[l][..., 4:5]
        #-----------------------------------------------------------#
        # One-hot class targets: (m,h,w,3,num_classes).
        #-----------------------------------------------------------#
        true_class_probs = y_true[l][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs, label_smoothing)

        #-----------------------------------------------------------#
        # Decode the raw layer output:
        #   grid     (h,w,1,2)     cell coordinates
        #   raw_pred (m,h,w,3,5+C) raw logits
        #   pred_xy  (m,h,w,3,2)   decoded centers
        #   pred_wh  (m,h,w,3,2)   decoded sizes
        #-----------------------------------------------------------#
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
                                                     anchors[anchor_mask[l]], num_classes, input_shape, calc_loss=True)

        #-----------------------------------------------------------#
        # Decoded predicted boxes: (m,h,w,3,4).
        #-----------------------------------------------------------#
        pred_box = K.concatenate([pred_xy, pred_wh])

        #-----------------------------------------------------------#
        # Per-image mask that excludes well-matched but unassigned
        # predictions from the negative objectness loss.
        #-----------------------------------------------------------#
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            #-----------------------------------------------------------#
            # The n real boxes in image b: (n,4).
            #-----------------------------------------------------------#
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            #-----------------------------------------------------------#
            # IoU of each predicted box against every real box: (h,w,3,n).
            #-----------------------------------------------------------#
            iou = box_iou(pred_box[b], true_box)
            #-----------------------------------------------------------#
            # Best overlap per predicted box: (h,w,3).
            #-----------------------------------------------------------#
            best_iou = K.max(iou, axis=-1)
            #-----------------------------------------------------------#
            # Predictions already close to a real box are too accurate to
            # serve as negatives, so they are ignored.
            #-----------------------------------------------------------#
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        #-----------------------------------------------------------#
        # Loop over every image in the batch.
        # FIX: K.control_flow_ops is not a public Keras backend API and was
        # removed in later Keras versions; tf.while_loop is the supported call.
        #-----------------------------------------------------------#
        _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body, [0, ignore_mask])

        #-----------------------------------------------------------#
        # Stack per-image results: (m,h,w,3) -> (m,h,w,3,1).
        #-----------------------------------------------------------#
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        #-----------------------------------------------------------#
        # Weight small boxes more: 2 - w*h (w, h normalized to [0,1]).
        #-----------------------------------------------------------#
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        #-----------------------------------------------------------#
        # CIoU location loss.
        #-----------------------------------------------------------#
        raw_true_box = y_true[l][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)

        #------------------------------------------------------------------#
        # Objectness: cross-entropy against 1 where a box exists, against 0
        # elsewhere, with ignore_mask limiting the negative samples.
        #------------------------------------------------------------------#
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
            (1 - object_mask) * K.binary_crossentropy(object_mask,
                                                      raw_pred[..., 4:5], from_logits=True) * ignore_mask

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        # Zero out NaN terms before summing so one bad box cannot poison the
        # whole loss.
        # FIX: tf.is_nan was removed in TF2; tf.math.is_nan works on TF1 and TF2.
        location_loss = K.sum(
            tf.where(tf.math.is_nan(ciou_loss), tf.zeros_like(ciou_loss), ciou_loss))
        confidence_loss = K.sum(
            tf.where(tf.math.is_nan(confidence_loss), tf.zeros_like(confidence_loss), confidence_loss))
        class_loss = K.sum(
            tf.where(tf.math.is_nan(class_loss), tf.zeros_like(class_loss), class_loss))

        #-----------------------------------------------------------#
        # Count positive samples (at least 1 to avoid division by zero).
        #-----------------------------------------------------------#
        num_pos += tf.maximum(K.sum(K.cast(object_mask, tf.float32)), 1)
        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        #     loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')

    if normalize:
        loss = loss / num_pos
    else:
        loss = loss / mf
    return loss
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False):
    """Compute the YOLO training loss (CIoU location + objectness + class loss).

    args: [*model_outputs, *y_true]; per layer, y_true has shape
        (m, h, w, 3, 5+num_classes) and the model output is the matching raw map.
    anchors: (N, 2) anchor (w, h) pairs, 3 per output layer.
    num_classes: number of object classes.
    ignore_thresh: IoU above which an unassigned prediction is excluded from
        the negative objectness loss.
    label_smoothing: class-target smoothing factor (0 disables).
    print_loss: unused; the debug print is commented out.

    Returns a scalar tensor: batch-averaged loss summed over all layers.
    """
    # three layers
    num_layers = len(anchors) // 3
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    # prior boxes
    # 678: 142,110  192,243  459,401
    # 345: 36,75    76,55    72,146
    # 012: 12,16    19,36    40,28
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # input_shape e.g. 608,608 — recovered from the stride-32 layer.
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0
    # m is the batch size (tensor); mf is its float cast.
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for e in range(num_layers):
        # Extract the position of the points with a target in this layer.
        object_mask = y_true[e][..., 4:5]
        # Take out the corresponding class targets.
        true_class_probs = y_true[e][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs, label_smoothing)

        # Decode the raw layer output into grid coords, raw logits, and xy/wh.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[e],
                                                     anchors[anchor_mask[e]], num_classes, input_shape, calc_loss=True)

        # Decoded predicted box positions.
        # (m,13,13,3,4)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Per-image mask that excludes well-matched but unassigned predictions
        # from the negative objectness loss.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        # Calculate ignore_mask for each image.
        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(y_true[e][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # IoU of the predictions against the real boxes.
            # 13,13,3,n
            iou = box_iou(pred_box[b], true_box)
            # 13,13,3
            best_iou = K.max(iou, axis=-1)
            # If a prediction overlaps a real box above ignore_thresh, ignore it.
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # Walk through all the images.
        # FIX: K.control_flow_ops is not a public Keras backend API and was
        # removed in later Keras versions; tf.while_loop is the supported call.
        _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body, [0, ignore_mask])

        # Stack per-image results for processing: (m,13,13,3) -> (m,13,13,3,1).
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # Weight small boxes more: 2 - w*h (w, h normalized to [0,1]).
        box_loss_scale = 2 - y_true[e][..., 2:3] * y_true[e][..., 3:4]

        # Calculate ciou loss as location loss.
        raw_true_box = y_true[e][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)
        ciou_loss = K.sum(ciou_loss) / mf
        location_loss = ciou_loss

        # Cross-entropy of 1 vs confidence where a box exists; where no box
        # exists and best_iou < ignore_thresh the point counts as a negative
        # sample (ignore_mask limits the number of negatives).
        confidence_loss = object_mask * \
            K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
            (1 - object_mask) * K.binary_crossentropy(object_mask,
                                                      raw_pred[..., 4:5], from_logits=True) * ignore_mask

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        #     loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss:')
    return loss